shithub: libvpx

ref: 4b2c2b9aa4a273a23d90ddb3bbf6dfb3482e0b8f
parent: 6c280c2299f078a475dc87e7615fdf1a4998cd31
author: Ronald S. Bultje <rbultje@google.com>
date: Thu Nov 1 07:09:58 EDT 2012

Rename vp8/ codec directory to vp9/.

Change-Id: Ic084c475844b24092a433ab88138cf58af3abbe4
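
The change is almost entirely mechanical: the vp8/ source tree moves to vp9/, and the build references in configure, docs.mk, examples.mk, libs.mk, the MSVS batch script, and the unit tests are rewritten from vp8/VP8_ to vp9/VP9_, while the installed vpx/vp8*.h headers keep their old names. For illustration only, a rename of this shape could be scripted roughly as below; this is a minimal sketch assuming GNU sed and a git checkout, not the tool actually used to produce this commit.

    # Hypothetical sketch of the bulk of this rename; not the script
    # used to generate the patch below.
    git mv vp8 vp9
    git mv vp9/vp8cx.mk vp9/vp9cx.mk
    git mv vp9/vp8dx.mk vp9/vp9dx.mk
    # Rewrite build-system and test references. The public vpx/vp8*.h
    # header names are deliberately left untouched.
    sed -i -e 's/VP8_PREFIX/VP9_PREFIX/g' \
           -e 's/CONFIG_VP8/CONFIG_VP9/g' \
           configure docs.mk examples.mk libs.mk example_xma.c test/*.cc
    sed -i 's,"vp8/,"vp9/,g' test/*.cc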

--- a/build/x86-msvs/obj_int_extract.bat
+++ b/build/x86-msvs/obj_int_extract.bat
@@ -7,9 +7,9 @@
 REM   be found in the AUTHORS file in the root of the source tree.
 echo on
 
-cl /I "./" /I "%1" /nologo /c "%1/vp8/common/asm_com_offsets.c"
-cl /I "./" /I "%1" /nologo /c "%1/vp8/decoder/asm_dec_offsets.c"
-cl /I "./" /I "%1" /nologo /c "%1/vp8/encoder/asm_enc_offsets.c"
+cl /I "./" /I "%1" /nologo /c "%1/vp9/common/asm_com_offsets.c"
+cl /I "./" /I "%1" /nologo /c "%1/vp9/decoder/asm_dec_offsets.c"
+cl /I "./" /I "%1" /nologo /c "%1/vp9/encoder/asm_enc_offsets.c"
 obj_int_extract.exe rvds "asm_com_offsets.obj" > "asm_com_offsets.asm"
 obj_int_extract.exe rvds "asm_dec_offsets.obj" > "asm_dec_offsets.asm"
 obj_int_extract.exe rvds "asm_enc_offsets.obj" > "asm_enc_offsets.asm"
--- a/configure
+++ b/configure
@@ -31,7 +31,7 @@
   ${toggle_debug_libs}            in/exclude debug version of libraries
   ${toggle_md5}                   support for output of checksum data
   ${toggle_static_msvcrt}         use static MSVCRT (VS builds only)
-  ${toggle_vp8}                   VP8 codec support
+  ${toggle_vp9}                   VP9 codec support
   ${toggle_internal_stats}        output of encoder internal stats for debug, if supported (encoders)
   ${toggle_mem_tracker}           track memory usage
   ${toggle_postproc}              postprocessing
@@ -161,17 +161,17 @@
 enable os_support
 
 [ -d ${source_path}/../include ] && enable alt_tree_layout
-for d in vp8; do
+for d in vp9; do
     [ -d ${source_path}/${d} ] && disable alt_tree_layout;
 done
 
 if ! enabled alt_tree_layout; then
 # development environment
-[ -d ${source_path}/vp8 ] && CODECS="${CODECS} vp8_encoder vp8_decoder"
+[ -d ${source_path}/vp9 ] && CODECS="${CODECS} vp9_encoder vp9_decoder"
 else
 # customer environment
-[ -f ${source_path}/../include/vpx/vp8cx.h ] && CODECS="${CODECS} vp8_encoder"
-[ -f ${source_path}/../include/vpx/vp8dx.h ] && CODECS="${CODECS} vp8_decoder"
+[ -f ${source_path}/../include/vpx/vp8cx.h ] && CODECS="${CODECS} vp9_encoder"
+[ -f ${source_path}/../include/vpx/vp8dx.h ] && CODECS="${CODECS} vp9_decoder"
 
 [ -f ${source_path}/../lib/*/*mt.lib ] && soft_enable static_msvcrt
 fi
--- a/docs.mk
+++ b/docs.mk
@@ -21,7 +21,7 @@
 		usage_dx.dox \
 
 # Other doxy files sourced in Markdown
-TXT_DOX-$(CONFIG_VP8)          += vp8_api1_migration.dox
+TXT_DOX-$(CONFIG_VP9)          += vp8_api1_migration.dox
 vp8_api1_migration.dox.DESC     = VP8 API 1.x Migration
 
 TXT_DOX = $(call enabled,TXT_DOX)
--- a/example_xma.c
+++ b/example_xma.c
@@ -18,7 +18,7 @@
 #include "vpx_config.h"
 #include "vpx/vpx_decoder.h"
 #include "vpx/vpx_integer.h"
-#if CONFIG_VP8_DECODER
+#if CONFIG_VP9_DECODER
 #include "vpx/vp8dx.h"
 #endif
 
@@ -29,8 +29,8 @@
   const char *name;
   const vpx_codec_iface_t *iface;
 } ifaces[] = {
-#if CONFIG_VP8_DECODER
-  {"vp8",  &vpx_codec_vp8_dx_algo},
+#if CONFIG_VP9_DECODER
+  {"vp9",  &vpx_codec_vp8_dx_algo},
 #endif
 };
 
--- a/examples.mk
+++ b/examples.mk
@@ -81,13 +81,13 @@
 error_resilient.GUID             = DF5837B9-4145-4F92-A031-44E4F832E00C
 error_resilient.DESCRIPTION      = Error Resiliency Feature
 
-GEN_EXAMPLES-$(CONFIG_VP8_ENCODER) += vp8_scalable_patterns.c
+GEN_EXAMPLES-$(CONFIG_VP9_ENCODER) += vp8_scalable_patterns.c
 vp8_scalable_patterns.GUID          = 0D6A210B-F482-4D6F-8570-4A9C01ACC88C
 vp8_scalable_patterns.DESCRIPTION   = VP8 Scalable Bitstream Patterns
-GEN_EXAMPLES-$(CONFIG_VP8_ENCODER) += vp8_set_maps.c
+GEN_EXAMPLES-$(CONFIG_VP9_ENCODER) += vp8_set_maps.c
 vp8_set_maps.GUID                   = ECB2D24D-98B8-4015-A465-A4AF3DCC145F
 vp8_set_maps.DESCRIPTION            = VP8 set active and ROI maps
-GEN_EXAMPLES-$(CONFIG_VP8_ENCODER) += vp8cx_set_ref.c
+GEN_EXAMPLES-$(CONFIG_VP9_ENCODER) += vp8cx_set_ref.c
 vp8cx_set_ref.GUID                  = C5E31F7F-96F6-48BD-BD3E-10EBF6E8057A
 vp8cx_set_ref.DESCRIPTION           = VP8 set encoder reference frame
 
@@ -97,10 +97,10 @@
 # We should not link to math library (libm) on RVCT
 # when building for bare-metal targets
 ifeq ($(CONFIG_OS_SUPPORT), yes)
-CODEC_EXTRA_LIBS-$(CONFIG_VP8)         += m
+CODEC_EXTRA_LIBS-$(CONFIG_VP9)         += m
 else
     ifeq ($(CONFIG_GCC), yes)
-    CODEC_EXTRA_LIBS-$(CONFIG_VP8)         += m
+    CODEC_EXTRA_LIBS-$(CONFIG_VP9)         += m
     endif
 endif
 #
@@ -117,8 +117,8 @@
     INC_PATH := $(SRC_PATH_BARE)/../include
 else
     LIB_PATH-yes                     += $(if $(BUILD_PFX),$(BUILD_PFX),.)
-    INC_PATH-$(CONFIG_VP8_DECODER)   += $(SRC_PATH_BARE)/vp8
-    INC_PATH-$(CONFIG_VP8_ENCODER)   += $(SRC_PATH_BARE)/vp8
+    INC_PATH-$(CONFIG_VP9_DECODER)   += $(SRC_PATH_BARE)/vp9
+    INC_PATH-$(CONFIG_VP9_ENCODER)   += $(SRC_PATH_BARE)/vp9
     LIB_PATH := $(call enabled,LIB_PATH)
     INC_PATH := $(call enabled,INC_PATH)
 endif
--- a/examples/decoder_tmpl.txt
+++ b/examples/decoder_tmpl.txt
@@ -1,7 +1,7 @@
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DEC_INCLUDES
 #define VPX_CODEC_DISABLE_COMPAT 1
 #include "vpx/vpx_decoder.h"
-#include "vpx/vp8dx.h"
+#include "vpx/vp9dx.h"
 #define interface (vpx_codec_vp8_dx())
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DEC_INCLUDES
 
--- a/examples/encoder_tmpl.txt
+++ b/examples/encoder_tmpl.txt
@@ -1,7 +1,7 @@
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ENC_INCLUDES
 #define VPX_CODEC_DISABLE_COMPAT 1
 #include "vpx/vpx_encoder.h"
-#include "vpx/vp8cx.h"
+#include "vpx/vp9cx.h"
 #define interface (vpx_codec_vp8_cx())
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ENC_INCLUDES
 
--- a/examples/postproc.txt
+++ b/examples/postproc.txt
@@ -51,7 +51,7 @@
 postprocessors. VP8 is one example. The following sample code toggles
 postprocessing on and off every 15 frames.
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PRE_DECODE
-#if CONFIG_VP8_DECODER
+#if CONFIG_VP9_DECODER
 if(frame_cnt%30 == 1) {
     vp8_postproc_cfg_t  pp = {0, 0, 0};
 
--- a/libs.mk
+++ b/libs.mk
@@ -30,29 +30,29 @@
 CODEC_SRCS-yes += $(addprefix vpx_scale/,$(call enabled,SCALE_SRCS))
 
 
-ifeq ($(CONFIG_VP8_ENCODER),yes)
-  VP8_PREFIX=vp8/
-  include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8cx.mk
-  CODEC_SRCS-yes += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_CX_SRCS))
-  CODEC_EXPORTS-yes += $(addprefix $(VP8_PREFIX),$(VP8_CX_EXPORTS))
-  CODEC_SRCS-yes += $(VP8_PREFIX)vp8cx.mk vpx/vp8.h vpx/vp8cx.h vpx/vp8e.h
-  CODEC_SRCS-$(ARCH_ARM) += $(VP8_PREFIX)vp8cx_arm.mk
+ifeq ($(CONFIG_VP9_ENCODER),yes)
+  VP9_PREFIX=vp9/
+  include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9cx.mk
+  CODEC_SRCS-yes += $(addprefix $(VP9_PREFIX),$(call enabled,VP9_CX_SRCS))
+  CODEC_EXPORTS-yes += $(addprefix $(VP9_PREFIX),$(VP9_CX_EXPORTS))
+  CODEC_SRCS-yes += $(VP9_PREFIX)vp9cx.mk vpx/vp8.h vpx/vp8cx.h vpx/vp8e.h
+  CODEC_SRCS-$(ARCH_ARM) += $(VP9_PREFIX)vp98cx_arm.mk
   INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8e.h include/vpx/vp8cx.h
-  INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP8_PREFIX)/%
+  INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP9_PREFIX)/%
   CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h
-  CODEC_DOC_SECTIONS += vp8 vp8_encoder
+  CODEC_DOC_SECTIONS += vp9 vp9_encoder
 endif
 
-ifeq ($(CONFIG_VP8_DECODER),yes)
-  VP8_PREFIX=vp8/
-  include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8dx.mk
-  CODEC_SRCS-yes += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_DX_SRCS))
-  CODEC_EXPORTS-yes += $(addprefix $(VP8_PREFIX),$(VP8_DX_EXPORTS))
-  CODEC_SRCS-yes += $(VP8_PREFIX)vp8dx.mk vpx/vp8.h vpx/vp8dx.h
+ifeq ($(CONFIG_VP9_DECODER),yes)
+  VP9_PREFIX=vp9/
+  include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9dx.mk
+  CODEC_SRCS-yes += $(addprefix $(VP9_PREFIX),$(call enabled,VP9_DX_SRCS))
+  CODEC_EXPORTS-yes += $(addprefix $(VP9_PREFIX),$(VP9_DX_EXPORTS))
+  CODEC_SRCS-yes += $(VP9_PREFIX)vp9dx.mk vpx/vp8.h vpx/vp8dx.h
   INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8dx.h
-  INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP8_PREFIX)/%
+  INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP9_PREFIX)/%
   CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8dx.h
-  CODEC_DOC_SECTIONS += vp8 vp8_decoder
+  CODEC_DOC_SECTIONS += vp9 vp9_decoder
 endif
 
 
@@ -305,46 +305,46 @@
 OFFSET_PATTERN:='^[a-zA-Z0-9_]* EQU'
 
 ifeq ($(filter icc gcc,$(TGT_CC)), $(TGT_CC))
-    $(BUILD_PFX)asm_com_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S
+    $(BUILD_PFX)asm_com_offsets.asm: $(BUILD_PFX)$(VP9_PREFIX)common/asm_com_offsets.c.S
 	@echo "    [CREATE] $@"
 	$(qexec)LC_ALL=C grep $(OFFSET_PATTERN) $< | tr -d '$$\#' $(ADS2GAS) > $@
-    $(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S: $(VP8_PREFIX)common/asm_com_offsets.c
-    CLEAN-OBJS += $(BUILD_PFX)asm_com_offsets.asm $(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S
+    $(BUILD_PFX)$(VP9_PREFIX)common/asm_com_offsets.c.S: $(VP9_PREFIX)common/asm_com_offsets.c
+    CLEAN-OBJS += $(BUILD_PFX)asm_com_offsets.asm $(BUILD_PFX)$(VP9_PREFIX)common/asm_com_offsets.c.S
 
-    $(BUILD_PFX)asm_enc_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S
+    $(BUILD_PFX)asm_enc_offsets.asm: $(BUILD_PFX)$(VP9_PREFIX)encoder/asm_enc_offsets.c.S
 	@echo "    [CREATE] $@"
 	$(qexec)LC_ALL=C grep $(OFFSET_PATTERN) $< | tr -d '$$\#' $(ADS2GAS) > $@
-    $(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S: $(VP8_PREFIX)encoder/asm_enc_offsets.c
-    CLEAN-OBJS += $(BUILD_PFX)asm_enc_offsets.asm $(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S
+    $(BUILD_PFX)$(VP9_PREFIX)encoder/asm_enc_offsets.c.S: $(VP9_PREFIX)encoder/asm_enc_offsets.c
+    CLEAN-OBJS += $(BUILD_PFX)asm_enc_offsets.asm $(BUILD_PFX)$(VP9_PREFIX)encoder/asm_enc_offsets.c.S
 
-    $(BUILD_PFX)asm_dec_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S
+    $(BUILD_PFX)asm_dec_offsets.asm: $(BUILD_PFX)$(VP9_PREFIX)decoder/asm_dec_offsets.c.S
 	@echo "    [CREATE] $@"
 	$(qexec)LC_ALL=C grep $(OFFSET_PATTERN) $< | tr -d '$$\#' $(ADS2GAS) > $@
-    $(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S: $(VP8_PREFIX)decoder/asm_dec_offsets.c
-    CLEAN-OBJS += $(BUILD_PFX)asm_dec_offsets.asm $(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S
+    $(BUILD_PFX)$(VP9_PREFIX)decoder/asm_dec_offsets.c.S: $(VP9_PREFIX)decoder/asm_dec_offsets.c
+    CLEAN-OBJS += $(BUILD_PFX)asm_dec_offsets.asm $(BUILD_PFX)$(VP9_PREFIX)decoder/asm_dec_offsets.c.S
 else
   ifeq ($(filter rvct,$(TGT_CC)), $(TGT_CC))
     asm_com_offsets.asm: obj_int_extract
-    asm_com_offsets.asm: $(VP8_PREFIX)common/asm_com_offsets.c.o
+    asm_com_offsets.asm: $(VP9_PREFIX)common/asm_com_offsets.c.o
 	@echo "    [CREATE] $@"
 	$(qexec)./obj_int_extract rvds $< $(ADS2GAS) > $@
-    OBJS-yes += $(VP8_PREFIX)common/asm_com_offsets.c.o
+    OBJS-yes += $(VP9_PREFIX)common/asm_com_offsets.c.o
     CLEAN-OBJS += asm_com_offsets.asm
     $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_com_offsets.asm
 
     asm_enc_offsets.asm: obj_int_extract
-    asm_enc_offsets.asm: $(VP8_PREFIX)encoder/asm_enc_offsets.c.o
+    asm_enc_offsets.asm: $(VP9_PREFIX)encoder/asm_enc_offsets.c.o
 	@echo "    [CREATE] $@"
 	$(qexec)./obj_int_extract rvds $< $(ADS2GAS) > $@
-    OBJS-yes += $(VP8_PREFIX)encoder/asm_enc_offsets.c.o
+    OBJS-yes += $(VP9_PREFIX)encoder/asm_enc_offsets.c.o
     CLEAN-OBJS += asm_enc_offsets.asm
     $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_enc_offsets.asm
 
     asm_dec_offsets.asm: obj_int_extract
-    asm_dec_offsets.asm: $(VP8_PREFIX)decoder/asm_dec_offsets.c.o
+    asm_dec_offsets.asm: $(VP9_PREFIX)decoder/asm_dec_offsets.c.o
 	@echo "    [CREATE] $@"
 	$(qexec)./obj_int_extract rvds $< $(ADS2GAS) > $@
-    OBJS-yes += $(VP8_PREFIX)decoder/asm_dec_offsets.c.o
+    OBJS-yes += $(VP9_PREFIX)decoder/asm_dec_offsets.c.o
     CLEAN-OBJS += asm_dec_offsets.asm
     $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_dec_offsets.asm
   endif
--- a/test/boolcoder_test.cc
+++ b/test/boolcoder_test.cc
@@ -15,8 +15,8 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
 extern "C" {
-#include "vp8/encoder/boolhuff.h"
-#include "vp8/decoder/dboolhuff.h"
+#include "vp9/encoder/boolhuff.h"
+#include "vp9/decoder/dboolhuff.h"
 }
 
 #include "acm_random.h"
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -15,9 +15,9 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
 extern "C" {
-#include "vp8/common/entropy.h"
-#include "vp8/common/idct.h"
-#include "vp8/encoder/dct.h"
+#include "vp9/common/entropy.h"
+#include "vp9/common/idct.h"
+#include "vp9/encoder/dct.h"
 }
 
 #include "acm_random.h"
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -15,8 +15,8 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
 extern "C" {
-#include "vp8/common/idct.h"
-#include "vp8/encoder/dct.h"
+#include "vp9/common/idct.h"
+#include "vp9/encoder/dct.h"
 }
 
 #include "acm_random.h"
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -15,8 +15,8 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
 extern "C" {
-#include "vp8/encoder/dct.h"
-#include "vp8/common/idct.h"
+#include "vp9/encoder/dct.h"
+#include "vp9/common/idct.h"
 }
 
 #include "acm_random.h"
--- a/test/idct8x8_test.cc
+++ b/test/idct8x8_test.cc
@@ -15,8 +15,8 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
 extern "C" {
-#include "vp8/encoder/dct.h"
-#include "vp8/common/idct.h"
+#include "vp9/encoder/dct.h"
+#include "vp9/common/idct.h"
 }
 
 #include "acm_random.h"
--- a/vp8/common/alloccommon.c
+++ /dev/null
@@ -1,220 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "blockd.h"
-#include "vpx_mem/vpx_mem.h"
-#include "onyxc_int.h"
-#include "findnearmv.h"
-#include "entropymode.h"
-#include "entropymv.h"
-#include "systemdependent.h"
-
-
-void vp9_update_mode_info_border(VP9_COMMON *cpi, MODE_INFO *mi_base) {
-  int stride = cpi->mode_info_stride;
-  int i;
-
-  // Clear down top border row
-  vpx_memset(mi_base, 0, sizeof(MODE_INFO) * cpi->mode_info_stride);
-
-  // Clear left border column
-  for (i = 1; i < cpi->mb_rows + 1; i++) {
-    vpx_memset(&mi_base[i * stride], 0, sizeof(MODE_INFO));
-  }
-}
-
-void vp9_update_mode_info_in_image(VP9_COMMON *cpi, MODE_INFO *mi) {
-  int i, j;
-
-  // For each in image mode_info element set the in image flag to 1
-  for (i = 0; i < cpi->mb_rows; i++) {
-    for (j = 0; j < cpi->mb_cols; j++) {
-      mi->mbmi.mb_in_image = 1;
-      mi++;   // Next element in the row
-    }
-
-    mi++;       // Step over border element at start of next row
-  }
-}
-
-void vp9_de_alloc_frame_buffers(VP9_COMMON *oci) {
-  int i;
-
-  for (i = 0; i < NUM_YV12_BUFFERS; i++)
-    vp8_yv12_de_alloc_frame_buffer(&oci->yv12_fb[i]);
-
-  vp8_yv12_de_alloc_frame_buffer(&oci->temp_scale_frame);
-  vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer);
-
-  vpx_free(oci->above_context);
-  vpx_free(oci->mip);
-  vpx_free(oci->prev_mip);
-
-  oci->above_context = 0;
-  oci->mip = 0;
-  oci->prev_mip = 0;
-
-}
-
-int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) {
-  int i;
-
-  vp9_de_alloc_frame_buffers(oci);
-
-  /* our internal buffers are always multiples of 16 */
-  if ((width & 0xf) != 0)
-    width += 16 - (width & 0xf);
-
-  if ((height & 0xf) != 0)
-    height += 16 - (height & 0xf);
-
-
-  for (i = 0; i < NUM_YV12_BUFFERS; i++) {
-    oci->fb_idx_ref_cnt[i] = 0;
-    oci->yv12_fb[i].flags = 0;
-    if (vp8_yv12_alloc_frame_buffer(&oci->yv12_fb[i], width, height, VP8BORDERINPIXELS) < 0) {
-      vp9_de_alloc_frame_buffers(oci);
-      return 1;
-    }
-  }
-
-  oci->new_fb_idx = 0;
-  oci->lst_fb_idx = 1;
-  oci->gld_fb_idx = 2;
-  oci->alt_fb_idx = 3;
-
-  oci->fb_idx_ref_cnt[0] = 1;
-  oci->fb_idx_ref_cnt[1] = 1;
-  oci->fb_idx_ref_cnt[2] = 1;
-  oci->fb_idx_ref_cnt[3] = 1;
-
-  if (vp8_yv12_alloc_frame_buffer(&oci->temp_scale_frame,   width, 16, VP8BORDERINPIXELS) < 0) {
-    vp9_de_alloc_frame_buffers(oci);
-    return 1;
-  }
-
-  if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer, width, height, VP8BORDERINPIXELS) < 0) {
-    vp9_de_alloc_frame_buffers(oci);
-    return 1;
-  }
-
-  oci->mb_rows = height >> 4;
-  oci->mb_cols = width >> 4;
-  oci->MBs = oci->mb_rows * oci->mb_cols;
-  oci->mode_info_stride = oci->mb_cols + 1;
-  oci->mip = vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO));
-
-  if (!oci->mip) {
-    vp9_de_alloc_frame_buffers(oci);
-    return 1;
-  }
-
-  oci->mi = oci->mip + oci->mode_info_stride + 1;
-
-  /* allocate memory for last frame MODE_INFO array */
-
-  oci->prev_mip = vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO));
-
-  if (!oci->prev_mip) {
-    vp9_de_alloc_frame_buffers(oci);
-    return 1;
-  }
-
-  oci->prev_mi = oci->prev_mip + oci->mode_info_stride + 1;
-
-  oci->above_context = vpx_calloc(sizeof(ENTROPY_CONTEXT_PLANES) * oci->mb_cols, 1);
-
-  if (!oci->above_context) {
-    vp9_de_alloc_frame_buffers(oci);
-    return 1;
-  }
-
-  vp9_update_mode_info_border(oci, oci->mip);
-  vp9_update_mode_info_in_image(oci, oci->mi);
-
-  return 0;
-}
-void vp9_setup_version(VP9_COMMON *cm) {
-  if (cm->version & 0x4) {
-    if (!CONFIG_EXPERIMENTAL)
-      vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
-                         "Bitstream was created by an experimental "
-                         "encoder");
-    cm->experimental = 1;
-  }
-
-  switch (cm->version & 0x3) {
-    case 0:
-      cm->no_lpf = 0;
-      cm->filter_type = NORMAL_LOOPFILTER;
-      cm->use_bilinear_mc_filter = 0;
-      cm->full_pixel = 0;
-      break;
-    case 1:
-      cm->no_lpf = 0;
-      cm->filter_type = SIMPLE_LOOPFILTER;
-      cm->use_bilinear_mc_filter = 1;
-      cm->full_pixel = 0;
-      break;
-    case 2:
-    case 3:
-      cm->no_lpf = 1;
-      cm->filter_type = NORMAL_LOOPFILTER;
-      cm->use_bilinear_mc_filter = 1;
-      cm->full_pixel = 0;
-      break;
-      // Full pel only code deprecated in experimental code base
-      // case 3:
-      //    cm->no_lpf = 1;
-      //    cm->filter_type = SIMPLE_LOOPFILTER;
-      //    cm->use_bilinear_mc_filter = 1;
-      //    cm->full_pixel = 1;
-      //    break;
-  }
-}
-void vp9_create_common(VP9_COMMON *oci) {
-  vp9_machine_specific_config(oci);
-
-  vp9_init_mbmode_probs(oci);
-
-  vp9_default_bmode_probs(oci->fc.bmode_prob);
-
-  oci->txfm_mode = ONLY_4X4;
-  oci->mb_no_coeff_skip = 1;
-  oci->comp_pred_mode = HYBRID_PREDICTION;
-  oci->no_lpf = 0;
-  oci->filter_type = NORMAL_LOOPFILTER;
-  oci->use_bilinear_mc_filter = 0;
-  oci->full_pixel = 0;
-  oci->clr_type = REG_YUV;
-  oci->clamp_type = RECON_CLAMP_REQUIRED;
-
-  /* Initialise reference frame sign bias structure to defaults */
-  vpx_memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias));
-
-  /* Default disable buffer to buffer copying */
-  oci->copy_buffer_to_gf = 0;
-  oci->copy_buffer_to_arf = 0;
-  oci->kf_ymode_probs_update = 0;
-}
-
-void vp9_remove_common(VP9_COMMON *oci) {
-  vp9_de_alloc_frame_buffers(oci);
-}
-
-void vp9_initialize_common() {
-  vp9_coef_tree_initialize();
-
-  vp9_entropy_mode_init();
-
-  vp9_entropy_mv_init();
-}
--- a/vp8/common/alloccommon.h
+++ /dev/null
@@ -1,26 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_ALLOCCOMMON_H
-#define __INC_ALLOCCOMMON_H
-
-#include "onyxc_int.h"
-
-void vp9_create_common(VP9_COMMON *oci);
-void vp9_remove_common(VP9_COMMON *oci);
-void vp9_de_alloc_frame_buffers(VP9_COMMON *oci);
-int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height);
-void vp9_setup_version(VP9_COMMON *oci);
-
-void vp9_update_mode_info_border(VP9_COMMON *cpi, MODE_INFO *mi_base);
-void vp9_update_mode_info_in_image(VP9_COMMON *cpi, MODE_INFO *mi);
-
-#endif
--- a/vp8/common/arm/arm_systemdependent.c
+++ /dev/null
@@ -1,92 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "vpx_ports/arm.h"
-#include "vp8/common/pragmas.h"
-#include "vp8/common/subpixel.h"
-#include "vp8/common/loopfilter.h"
-#include "vp8/common/recon.h"
-#include "vp8/common/idct.h"
-#include "vp8/common/onyxc_int.h"
-
-void vp9_arch_arm_common_init(VP9_COMMON *ctx) {
-#if CONFIG_RUNTIME_CPU_DETECT
-  VP9_COMMON_RTCD *rtcd = &ctx->rtcd;
-  int flags = arm_cpu_caps();
-  rtcd->flags = flags;
-
-  /* Override default functions with fastest ones for this CPU. */
-#if HAVE_ARMV5TE
-  if (flags & HAS_EDSP) {
-  }
-#endif
-
-// The commented functions need to be re-written for vpx.
-#if HAVE_ARMV6
-  if (flags & HAS_MEDIA) {
-    rtcd->subpix.sixtap16x16   = vp9_sixtap_predict16x16_armv6;
-    rtcd->subpix.sixtap8x8     = vp9_sixtap_predict8x8_armv6;
-    rtcd->subpix.sixtap8x4     = vp9_sixtap_predict8x4_armv6;
-    rtcd->subpix.sixtap4x4     = vp9_sixtap_predict_armv6;
-
-    rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_armv6;
-    rtcd->subpix.bilinear8x8   = vp9_bilinear_predict8x8_armv6;
-    rtcd->subpix.bilinear8x4   = vp9_bilinear_predict8x4_armv6;
-    rtcd->subpix.bilinear4x4   = vp9_bilinear_predict4x4_armv6;
-
-    // rtcd->idct.idct1        = vp9_short_idct4x4llm_1_v6;
-    // rtcd->idct.idct16       = vp9_short_idct4x4llm_v6_dual;
-    // rtcd->idct.iwalsh1      = vp9_short_inv_walsh4x4_1_v6;
-    // rtcd->idct.iwalsh16     = vp9_short_inv_walsh4x4_v6;
-
-    rtcd->recon.copy16x16   = vp9_copy_mem16x16_v6;
-    rtcd->recon.copy8x8     = vp9_copy_mem8x8_v6;
-    rtcd->recon.copy8x4     = vp9_copy_mem8x4_v6;
-    rtcd->recon.recon       = vp9_recon_b_armv6;
-    rtcd->recon.recon2      = vp9_recon2b_armv6;
-    rtcd->recon.recon4      = vp9_recon4b_armv6;
-  }
-#endif
-
-#if HAVE_ARMV7
-  if (flags & HAS_NEON) {
-    rtcd->subpix.sixtap16x16   = vp9_sixtap_predict16x16_neon;
-    rtcd->subpix.sixtap8x8     = vp9_sixtap_predict8x8_neon;
-    rtcd->subpix.sixtap8x4     = vp9_sixtap_predict8x4_neon;
-    rtcd->subpix.sixtap4x4     = vp9_sixtap_predict_neon;
-
-    rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_neon;
-    rtcd->subpix.bilinear8x8   = vp9_bilinear_predict8x8_neon;
-    rtcd->subpix.bilinear8x4   = vp9_bilinear_predict8x4_neon;
-    rtcd->subpix.bilinear4x4   = vp9_bilinear_predict4x4_neon;
-
-    // rtcd->idct.idct1        = vp9_short_idct4x4llm_1_neon;
-    // rtcd->idct.idct16       = vp9_short_idct4x4llm_neon;
-    // rtcd->idct.iwalsh1      = vp9_short_inv_walsh4x4_1_neon;
-    // rtcd->idct.iwalsh16     = vp9_short_inv_walsh4x4_neon;
-
-    rtcd->recon.copy16x16   = vp9_copy_mem16x16_neon;
-    rtcd->recon.copy8x8     = vp9_copy_mem8x8_neon;
-    rtcd->recon.copy8x4     = vp9_copy_mem8x4_neon;
-    rtcd->recon.recon       = vp9_recon_b_neon;
-    rtcd->recon.recon2      = vp9_recon2b_neon;
-    rtcd->recon.recon4      = vp9_recon4b_neon;
-    rtcd->recon.recon_mb    = vp9_recon_mb_neon;
-    rtcd->recon.build_intra_predictors_mby =
-      vp9_build_intra_predictors_mby_neon;
-    rtcd->recon.build_intra_predictors_mby_s =
-      vp9_build_intra_predictors_mby_s_neon;
-  }
-#endif
-
-#endif
-}
--- a/vp8/common/arm/armv6/bilinearfilter_v6.asm
+++ /dev/null
@@ -1,237 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_filter_block2d_bil_first_pass_armv6|
-    EXPORT  |vp9_filter_block2d_bil_second_pass_armv6|
-
-    AREA    |.text|, CODE, READONLY  ; name this block of code
-
-;-------------------------------------
-; r0    unsigned char  *src_ptr,
-; r1    unsigned short *dst_ptr,
-; r2    unsigned int    src_pitch,
-; r3    unsigned int    height,
-; stack unsigned int    width,
-; stack const short    *vp9_filter
-;-------------------------------------
-; The output is transposed and stored in the output array to make second pass filtering easier.
-|vp9_filter_block2d_bil_first_pass_armv6| PROC
-    stmdb   sp!, {r4 - r11, lr}
-
-    ldr     r11, [sp, #40]                  ; vp9_filter address
-    ldr     r4, [sp, #36]                   ; width
-
-    mov     r12, r3                         ; outer-loop counter
-
-    add     r7, r2, r4                      ; preload next row
-    pld     [r0, r7]
-
-    sub     r2, r2, r4                      ; src increment for height loop
-
-    ldr     r5, [r11]                       ; load up filter coefficients
-
-    mov     r3, r3, lsl #1                  ; height*2
-    add     r3, r3, #2                      ; plus 2 to make output buffer 4-bit aligned since height is actually (height+1)
-
-    mov     r11, r1                         ; save dst_ptr for each row
-
-    cmp     r5, #128                        ; if filter coef = 128, then skip the filter
-    beq     bil_null_1st_filter
-
-|bil_height_loop_1st_v6|
-    ldrb    r6, [r0]                        ; load source data
-    ldrb    r7, [r0, #1]
-    ldrb    r8, [r0, #2]
-    mov     lr, r4, lsr #2                  ; 4-in-parallel loop counter
-
-|bil_width_loop_1st_v6|
-    ldrb    r9, [r0, #3]
-    ldrb    r10, [r0, #4]
-
-    pkhbt   r6, r6, r7, lsl #16             ; src[1] | src[0]
-    pkhbt   r7, r7, r8, lsl #16             ; src[2] | src[1]
-
-    smuad   r6, r6, r5                      ; apply the filter
-    pkhbt   r8, r8, r9, lsl #16             ; src[3] | src[2]
-    smuad   r7, r7, r5
-    pkhbt   r9, r9, r10, lsl #16            ; src[4] | src[3]
-
-    smuad   r8, r8, r5
-    smuad   r9, r9, r5
-
-    add     r0, r0, #4
-    subs    lr, lr, #1
-
-    add     r6, r6, #0x40                   ; round_shift_and_clamp
-    add     r7, r7, #0x40
-    usat    r6, #16, r6, asr #7
-    usat    r7, #16, r7, asr #7
-
-    strh    r6, [r1], r3                    ; result is transposed and stored
-
-    add     r8, r8, #0x40                   ; round_shift_and_clamp
-    strh    r7, [r1], r3
-    add     r9, r9, #0x40
-    usat    r8, #16, r8, asr #7
-    usat    r9, #16, r9, asr #7
-
-    strh    r8, [r1], r3                    ; result is transposed and stored
-
-    ldrneb  r6, [r0]                        ; load source data
-    strh    r9, [r1], r3
-
-    ldrneb  r7, [r0, #1]
-    ldrneb  r8, [r0, #2]
-
-    bne     bil_width_loop_1st_v6
-
-    add     r0, r0, r2                      ; move to next input row
-    subs    r12, r12, #1
-
-    add     r9, r2, r4, lsl #1              ; adding back block width
-    pld     [r0, r9]                        ; preload next row
-
-    add     r11, r11, #2                    ; move over to next column
-    mov     r1, r11
-
-    bne     bil_height_loop_1st_v6
-
-    ldmia   sp!, {r4 - r11, pc}
-
-|bil_null_1st_filter|
-|bil_height_loop_null_1st|
-    mov     lr, r4, lsr #2                  ; loop counter
-
-|bil_width_loop_null_1st|
-    ldrb    r6, [r0]                        ; load data
-    ldrb    r7, [r0, #1]
-    ldrb    r8, [r0, #2]
-    ldrb    r9, [r0, #3]
-
-    strh    r6, [r1], r3                    ; store it to immediate buffer
-    add     r0, r0, #4
-    strh    r7, [r1], r3
-    subs    lr, lr, #1
-    strh    r8, [r1], r3
-    strh    r9, [r1], r3
-
-    bne     bil_width_loop_null_1st
-
-    subs    r12, r12, #1
-    add     r0, r0, r2                      ; move to next input line
-    add     r11, r11, #2                    ; move over to next column
-    mov     r1, r11
-
-    bne     bil_height_loop_null_1st
-
-    ldmia   sp!, {r4 - r11, pc}
-
-    ENDP  ; |vp9_filter_block2d_bil_first_pass_armv6|
-
-
-;---------------------------------
-; r0    unsigned short *src_ptr,
-; r1    unsigned char  *dst_ptr,
-; r2    int             dst_pitch,
-; r3    unsigned int    height,
-; stack unsigned int    width,
-; stack const short    *vp9_filter
-;---------------------------------
-|vp9_filter_block2d_bil_second_pass_armv6| PROC
-    stmdb   sp!, {r4 - r11, lr}
-
-    ldr     r11, [sp, #40]                  ; vp9_filter address
-    ldr     r4, [sp, #36]                   ; width
-
-    ldr     r5, [r11]                       ; load up filter coefficients
-    mov     r12, r4                         ; outer-loop counter = width, since we work on transposed data matrix
-    mov     r11, r1
-
-    cmp     r5, #128                        ; if filter coef = 128, then skip the filter
-    beq     bil_null_2nd_filter
-
-|bil_height_loop_2nd|
-    ldr     r6, [r0]                        ; load the data
-    ldr     r8, [r0, #4]
-    ldrh    r10, [r0, #8]
-    mov     lr, r3, lsr #2                  ; loop counter
-
-|bil_width_loop_2nd|
-    pkhtb   r7, r6, r8                      ; src[1] | src[2]
-    pkhtb   r9, r8, r10                     ; src[3] | src[4]
-
-    smuad   r6, r6, r5                      ; apply filter
-    smuad   r8, r8, r5                      ; apply filter
-
-    subs    lr, lr, #1
-
-    smuadx  r7, r7, r5                      ; apply filter
-    smuadx  r9, r9, r5                      ; apply filter
-
-    add     r0, r0, #8
-
-    add     r6, r6, #0x40                   ; round_shift_and_clamp
-    add     r7, r7, #0x40
-    usat    r6, #8, r6, asr #7
-    usat    r7, #8, r7, asr #7
-    strb    r6, [r1], r2                    ; the result is transposed back and stored
-
-    add     r8, r8, #0x40                   ; round_shift_and_clamp
-    strb    r7, [r1], r2
-    add     r9, r9, #0x40
-    usat    r8, #8, r8, asr #7
-    usat    r9, #8, r9, asr #7
-    strb    r8, [r1], r2                    ; the result is transposed back and stored
-
-    ldrne   r6, [r0]                        ; load data
-    strb    r9, [r1], r2
-    ldrne   r8, [r0, #4]
-    ldrneh  r10, [r0, #8]
-
-    bne     bil_width_loop_2nd
-
-    subs    r12, r12, #1
-    add     r0, r0, #4                      ; update src for next row
-    add     r11, r11, #1
-    mov     r1, r11
-
-    bne     bil_height_loop_2nd
-    ldmia   sp!, {r4 - r11, pc}
-
-|bil_null_2nd_filter|
-|bil_height_loop_null_2nd|
-    mov     lr, r3, lsr #2
-
-|bil_width_loop_null_2nd|
-    ldr     r6, [r0], #4                    ; load data
-    subs    lr, lr, #1
-    ldr     r8, [r0], #4
-
-    strb    r6, [r1], r2                    ; store data
-    mov     r7, r6, lsr #16
-    strb    r7, [r1], r2
-    mov     r9, r8, lsr #16
-    strb    r8, [r1], r2
-    strb    r9, [r1], r2
-
-    bne     bil_width_loop_null_2nd
-
-    subs    r12, r12, #1
-    add     r0, r0, #4
-    add     r11, r11, #1
-    mov     r1, r11
-
-    bne     bil_height_loop_null_2nd
-
-    ldmia   sp!, {r4 - r11, pc}
-    ENDP  ; |vp9_filter_block2d_second_pass_armv6|
-
-    END
--- a/vp8/common/arm/armv6/copymem16x16_v6.asm
+++ /dev/null
@@ -1,186 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_copy_mem16x16_v6|
-    ; ARM
-    ; REQUIRE8
-    ; PRESERVE8
-
-    AREA    Block, CODE, READONLY ; name this block of code
-;void copy_mem16x16_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-|vp9_copy_mem16x16_v6| PROC
-    stmdb       sp!, {r4 - r7}
-    ;push   {r4-r7}
-
-    ;preload
-    pld     [r0, #31]                ; preload for next 16x16 block
-
-    ands    r4, r0, #15
-    beq     copy_mem16x16_fast
-
-    ands    r4, r0, #7
-    beq     copy_mem16x16_8
-
-    ands    r4, r0, #3
-    beq     copy_mem16x16_4
-
-    ;copy one byte each time
-    ldrb    r4, [r0]
-    ldrb    r5, [r0, #1]
-    ldrb    r6, [r0, #2]
-    ldrb    r7, [r0, #3]
-
-    mov     r12, #16
-
-copy_mem16x16_1_loop
-    strb    r4, [r2]
-    strb    r5, [r2, #1]
-    strb    r6, [r2, #2]
-    strb    r7, [r2, #3]
-
-    ldrb    r4, [r0, #4]
-    ldrb    r5, [r0, #5]
-    ldrb    r6, [r0, #6]
-    ldrb    r7, [r0, #7]
-
-    subs    r12, r12, #1
-
-    strb    r4, [r2, #4]
-    strb    r5, [r2, #5]
-    strb    r6, [r2, #6]
-    strb    r7, [r2, #7]
-
-    ldrb    r4, [r0, #8]
-    ldrb    r5, [r0, #9]
-    ldrb    r6, [r0, #10]
-    ldrb    r7, [r0, #11]
-
-    strb    r4, [r2, #8]
-    strb    r5, [r2, #9]
-    strb    r6, [r2, #10]
-    strb    r7, [r2, #11]
-
-    ldrb    r4, [r0, #12]
-    ldrb    r5, [r0, #13]
-    ldrb    r6, [r0, #14]
-    ldrb    r7, [r0, #15]
-
-    add     r0, r0, r1
-
-    strb    r4, [r2, #12]
-    strb    r5, [r2, #13]
-    strb    r6, [r2, #14]
-    strb    r7, [r2, #15]
-
-    add     r2, r2, r3
-
-    ldrneb  r4, [r0]
-    ldrneb  r5, [r0, #1]
-    ldrneb  r6, [r0, #2]
-    ldrneb  r7, [r0, #3]
-
-    pld     [r0, #31]               ; preload for next 16x16 block
-
-    bne     copy_mem16x16_1_loop
-
-    ldmia       sp!, {r4 - r7}
-    ;pop        {r4-r7}
-    mov     pc, lr
-
-;copy 4 bytes each time
-copy_mem16x16_4
-    ldr     r4, [r0]
-    ldr     r5, [r0, #4]
-    ldr     r6, [r0, #8]
-    ldr     r7, [r0, #12]
-
-    mov     r12, #16
-
-copy_mem16x16_4_loop
-    subs    r12, r12, #1
-    add     r0, r0, r1
-
-    str     r4, [r2]
-    str     r5, [r2, #4]
-    str     r6, [r2, #8]
-    str     r7, [r2, #12]
-
-    add     r2, r2, r3
-
-    ldrne   r4, [r0]
-    ldrne   r5, [r0, #4]
-    ldrne   r6, [r0, #8]
-    ldrne   r7, [r0, #12]
-
-    pld     [r0, #31]               ; preload for next 16x16 block
-
-    bne     copy_mem16x16_4_loop
-
-    ldmia       sp!, {r4 - r7}
-    ;pop        {r4-r7}
-    mov     pc, lr
-
-;copy 8 bytes each time
-copy_mem16x16_8
-    sub     r1, r1, #16
-    sub     r3, r3, #16
-
-    mov     r12, #16
-
-copy_mem16x16_8_loop
-    ldmia   r0!, {r4-r5}
-    ;ldm        r0, {r4-r5}
-    ldmia   r0!, {r6-r7}
-
-    add     r0, r0, r1
-
-    stmia   r2!, {r4-r5}
-    subs    r12, r12, #1
-    ;stm        r2, {r4-r5}
-    stmia   r2!, {r6-r7}
-
-    add     r2, r2, r3
-
-    pld     [r0, #31]               ; preload for next 16x16 block
-    bne     copy_mem16x16_8_loop
-
-    ldmia       sp!, {r4 - r7}
-    ;pop        {r4-r7}
-    mov     pc, lr
-
-;copy 16 bytes each time
-copy_mem16x16_fast
-    ;sub        r1, r1, #16
-    ;sub        r3, r3, #16
-
-    mov     r12, #16
-
-copy_mem16x16_fast_loop
-    ldmia   r0, {r4-r7}
-    ;ldm        r0, {r4-r7}
-    add     r0, r0, r1
-
-    subs    r12, r12, #1
-    stmia   r2, {r4-r7}
-    ;stm        r2, {r4-r7}
-    add     r2, r2, r3
-
-    pld     [r0, #31]               ; preload for next 16x16 block
-    bne     copy_mem16x16_fast_loop
-
-    ldmia       sp!, {r4 - r7}
-    ;pop        {r4-r7}
-    mov     pc, lr
-
-    ENDP  ; |vp9_copy_mem16x16_v6|
-
-    END
--- a/vp8/common/arm/armv6/copymem8x4_v6.asm
+++ /dev/null
@@ -1,128 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_copy_mem8x4_v6|
-    ; ARM
-    ; REQUIRE8
-    ; PRESERVE8
-
-    AREA    Block, CODE, READONLY ; name this block of code
-;void vp9_copy_mem8x4_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-|vp9_copy_mem8x4_v6| PROC
-    ;push   {r4-r5}
-    stmdb  sp!, {r4-r5}
-
-    ;preload
-    pld     [r0]
-    pld     [r0, r1]
-    pld     [r0, r1, lsl #1]
-
-    ands    r4, r0, #7
-    beq     copy_mem8x4_fast
-
-    ands    r4, r0, #3
-    beq     copy_mem8x4_4
-
-    ;copy 1 byte each time
-    ldrb    r4, [r0]
-    ldrb    r5, [r0, #1]
-
-    mov     r12, #4
-
-copy_mem8x4_1_loop
-    strb    r4, [r2]
-    strb    r5, [r2, #1]
-
-    ldrb    r4, [r0, #2]
-    ldrb    r5, [r0, #3]
-
-    subs    r12, r12, #1
-
-    strb    r4, [r2, #2]
-    strb    r5, [r2, #3]
-
-    ldrb    r4, [r0, #4]
-    ldrb    r5, [r0, #5]
-
-    strb    r4, [r2, #4]
-    strb    r5, [r2, #5]
-
-    ldrb    r4, [r0, #6]
-    ldrb    r5, [r0, #7]
-
-    add     r0, r0, r1
-
-    strb    r4, [r2, #6]
-    strb    r5, [r2, #7]
-
-    add     r2, r2, r3
-
-    ldrneb  r4, [r0]
-    ldrneb  r5, [r0, #1]
-
-    bne     copy_mem8x4_1_loop
-
-    ldmia       sp!, {r4 - r5}
-    ;pop        {r4-r5}
-    mov     pc, lr
-
-;copy 4 bytes each time
-copy_mem8x4_4
-    ldr     r4, [r0]
-    ldr     r5, [r0, #4]
-
-    mov     r12, #4
-
-copy_mem8x4_4_loop
-    subs    r12, r12, #1
-    add     r0, r0, r1
-
-    str     r4, [r2]
-    str     r5, [r2, #4]
-
-    add     r2, r2, r3
-
-    ldrne   r4, [r0]
-    ldrne   r5, [r0, #4]
-
-    bne     copy_mem8x4_4_loop
-
-    ldmia  sp!, {r4-r5}
-    ;pop        {r4-r5}
-    mov     pc, lr
-
-;copy 8 bytes each time
-copy_mem8x4_fast
-    ;sub        r1, r1, #8
-    ;sub        r3, r3, #8
-
-    mov     r12, #4
-
-copy_mem8x4_fast_loop
-    ldmia   r0, {r4-r5}
-    ;ldm        r0, {r4-r5}
-    add     r0, r0, r1
-
-    subs    r12, r12, #1
-    stmia   r2, {r4-r5}
-    ;stm        r2, {r4-r5}
-    add     r2, r2, r3
-
-    bne     copy_mem8x4_fast_loop
-
-    ldmia  sp!, {r4-r5}
-    ;pop        {r4-r5}
-    mov     pc, lr
-
-    ENDP  ; |vp9_copy_mem8x4_v6|
-
-    END
--- a/vp8/common/arm/armv6/copymem8x8_v6.asm
+++ /dev/null
@@ -1,128 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_copy_mem8x8_v6|
-    ; ARM
-    ; REQUIRE8
-    ; PRESERVE8
-
-    AREA    Block, CODE, READONLY ; name this block of code
-;void copy_mem8x8_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-|vp9_copy_mem8x8_v6| PROC
-    ;push   {r4-r5}
-    stmdb  sp!, {r4-r5}
-
-    ;preload
-    pld     [r0]
-    pld     [r0, r1]
-    pld     [r0, r1, lsl #1]
-
-    ands    r4, r0, #7
-    beq     copy_mem8x8_fast
-
-    ands    r4, r0, #3
-    beq     copy_mem8x8_4
-
-    ;copy 1 byte each time
-    ldrb    r4, [r0]
-    ldrb    r5, [r0, #1]
-
-    mov     r12, #8
-
-copy_mem8x8_1_loop
-    strb    r4, [r2]
-    strb    r5, [r2, #1]
-
-    ldrb    r4, [r0, #2]
-    ldrb    r5, [r0, #3]
-
-    subs    r12, r12, #1
-
-    strb    r4, [r2, #2]
-    strb    r5, [r2, #3]
-
-    ldrb    r4, [r0, #4]
-    ldrb    r5, [r0, #5]
-
-    strb    r4, [r2, #4]
-    strb    r5, [r2, #5]
-
-    ldrb    r4, [r0, #6]
-    ldrb    r5, [r0, #7]
-
-    add     r0, r0, r1
-
-    strb    r4, [r2, #6]
-    strb    r5, [r2, #7]
-
-    add     r2, r2, r3
-
-    ldrneb  r4, [r0]
-    ldrneb  r5, [r0, #1]
-
-    bne     copy_mem8x8_1_loop
-
-    ldmia       sp!, {r4 - r5}
-    ;pop        {r4-r5}
-    mov     pc, lr
-
-;copy 4 bytes each time
-copy_mem8x8_4
-    ldr     r4, [r0]
-    ldr     r5, [r0, #4]
-
-    mov     r12, #8
-
-copy_mem8x8_4_loop
-    subs    r12, r12, #1
-    add     r0, r0, r1
-
-    str     r4, [r2]
-    str     r5, [r2, #4]
-
-    add     r2, r2, r3
-
-    ldrne   r4, [r0]
-    ldrne   r5, [r0, #4]
-
-    bne     copy_mem8x8_4_loop
-
-    ldmia       sp!, {r4 - r5}
-    ;pop        {r4-r5}
-    mov     pc, lr
-
-;copy 8 bytes each time
-copy_mem8x8_fast
-    ;sub        r1, r1, #8
-    ;sub        r3, r3, #8
-
-    mov     r12, #8
-
-copy_mem8x8_fast_loop
-    ldmia   r0, {r4-r5}
-    ;ldm        r0, {r4-r5}
-    add     r0, r0, r1
-
-    subs    r12, r12, #1
-    stmia   r2, {r4-r5}
-    ;stm        r2, {r4-r5}
-    add     r2, r2, r3
-
-    bne     copy_mem8x8_fast_loop
-
-    ldmia  sp!, {r4-r5}
-    ;pop        {r4-r5}
-    mov     pc, lr
-
-    ENDP  ; |vp9_copy_mem8x8_v6|
-
-    END
--- a/vp8/common/arm/armv6/dc_only_idct_add_v6.asm
+++ /dev/null
@@ -1,67 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-    EXPORT  |vp8_dc_only_idct_add_v6|
-
-    AREA    |.text|, CODE, READONLY
-
-;void vp8_dc_only_idct_add_v6(short input_dc, unsigned char *pred_ptr,
-;                             unsigned char *dst_ptr, int pitch, int stride)
-; r0  input_dc
-; r1  pred_ptr
-; r2  dest_ptr
-; r3  pitch
-; sp  stride
-
-|vp8_dc_only_idct_add_v6| PROC
-    stmdb       sp!, {r4 - r7, lr}
-
-    add         r0, r0, #4                ; input_dc += 4
-    ldr         r12, c0x0000FFFF
-    ldr         r4, [r1], r3
-    ldr         r6, [r1], r3
-    and         r0, r12, r0, asr #3       ; input_dc >> 3 + mask
-    ldr         lr, [sp, #20]
-    orr         r0, r0, r0, lsl #16       ; a1 | a1
-
-    uxtab16     r5, r0, r4                ; a1+2 | a1+0
-    uxtab16     r4, r0, r4, ror #8        ; a1+3 | a1+1
-    uxtab16     r7, r0, r6
-    uxtab16     r6, r0, r6, ror #8
-    usat16      r5, #8, r5
-    usat16      r4, #8, r4
-    usat16      r7, #8, r7
-    usat16      r6, #8, r6
-    orr         r5, r5, r4, lsl #8
-    orr         r7, r7, r6, lsl #8
-    ldr         r4, [r1], r3
-    ldr         r6, [r1]
-    str         r5, [r2], lr
-    str         r7, [r2], lr
-
-    uxtab16     r5, r0, r4
-    uxtab16     r4, r0, r4, ror #8
-    uxtab16     r7, r0, r6
-    uxtab16     r6, r0, r6, ror #8
-    usat16      r5, #8, r5
-    usat16      r4, #8, r4
-    usat16      r7, #8, r7
-    usat16      r6, #8, r6
-    orr         r5, r5, r4, lsl #8
-    orr         r7, r7, r6, lsl #8
-    str         r5, [r2], lr
-    str         r7, [r2]
-
-    ldmia       sp!, {r4 - r7, pc}
-
-    ENDP  ; |vp8_dc_only_idct_add_v6|
-
-; Constant Pool
-c0x0000FFFF DCD 0x0000FFFF
-    END
--- a/vp8/common/arm/armv6/filter_v6.asm
+++ /dev/null
@@ -1,624 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_filter_block2d_first_pass_armv6|
-    EXPORT  |vp9_filter_block2d_first_pass_16x16_armv6|
-    EXPORT  |vp9_filter_block2d_first_pass_8x8_armv6|
-    EXPORT  |vp9_filter_block2d_second_pass_armv6|
-    EXPORT  |vp9_filter4_block2d_second_pass_armv6|
-    EXPORT  |vp9_filter_block2d_first_pass_only_armv6|
-    EXPORT  |vp9_filter_block2d_second_pass_only_armv6|
-
-    AREA    |.text|, CODE, READONLY  ; name this block of code
-;-------------------------------------
-; r0    unsigned char *src_ptr
-; r1    short         *output_ptr
-; r2    unsigned int src_pixels_per_line
-; r3    unsigned int output_width
-; stack unsigned int output_height
-; stack const short *vp9_filter
-;-------------------------------------
-; vp9_filter the input and put in the output array.  Apply the 6 tap FIR filter with
-; the output being a 2 byte value and the input being a 1 byte value.
-|vp9_filter_block2d_first_pass_armv6| PROC
-    stmdb   sp!, {r4 - r11, lr}
-
-    ldr     r11, [sp, #40]                  ; vp9_filter address
-    ldr     r7, [sp, #36]                   ; output height
-
-    sub     r2, r2, r3                      ; inside loop increments input array,
-                                            ; so the height loop only needs to add
-                                            ; r2 - width to the input pointer
-
-    mov     r3, r3, lsl #1                  ; multiply width by 2 because using shorts
-    add     r12, r3, #16                    ; square off the output
-    sub     sp, sp, #4
-
-    ldr     r4, [r11]                       ; load up packed filter coefficients
-    ldr     r5, [r11, #4]
-    ldr     r6, [r11, #8]
-
-    str     r1, [sp]                        ; push destination to stack
-    mov     r7, r7, lsl #16                 ; height is top part of counter
-
-; six tap filter
-|height_loop_1st_6|
-    ldrb    r8, [r0, #-2]                   ; load source data
-    ldrb    r9, [r0, #-1]
-    ldrb    r10, [r0], #2
-    orr     r7, r7, r3, lsr #2              ; construct loop counter
-
-|width_loop_1st_6|
-    ldrb    r11, [r0, #-1]
-
-    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
-    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
-
-    ldrb    r9, [r0]
-
-    smuad   lr, lr, r4                      ; apply the filter
-    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
-    smuad   r8, r8, r4
-    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
-
-    smlad   lr, r10, r5, lr
-    ldrb    r10, [r0, #1]
-    smlad   r8, r11, r5, r8
-    ldrb    r11, [r0, #2]
-
-    sub     r7, r7, #1
-
-    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
-    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
-
-    smlad   lr, r9, r6, lr
-    smlad   r11, r10, r6, r8
-
-    ands    r10, r7, #0xff                  ; test loop counter
-
-    add     lr, lr, #0x40                   ; round_shift_and_clamp
-    ldrneb  r8, [r0, #-2]                   ; load data for next loop
-    usat    lr, #8, lr, asr #7
-    add     r11, r11, #0x40
-    ldrneb  r9, [r0, #-1]
-    usat    r11, #8, r11, asr #7
-
-    strh    lr, [r1], r12                   ; result is transposed and stored, which
-                                            ; will make second pass filtering easier.
-    ldrneb  r10, [r0], #2
-    strh    r11, [r1], r12
-
-    bne     width_loop_1st_6
-
-    ldr     r1, [sp]                        ; load and update dst address
-    subs    r7, r7, #0x10000
-    add     r0, r0, r2                      ; move to next input line
-
-    add     r1, r1, #2                      ; move over to next column
-    str     r1, [sp]
-
-    bne     height_loop_1st_6
-
-    add     sp, sp, #4
-    ldmia   sp!, {r4 - r11, pc}
-
-    ENDP
-
-; --------------------------
-; 16x16 version
-; -----------------------------
-|vp9_filter_block2d_first_pass_16x16_armv6| PROC
-    stmdb   sp!, {r4 - r11, lr}
-
-    ldr     r11, [sp, #40]                  ; vp9_filter address
-    ldr     r7, [sp, #36]                   ; output height
-
-    add     r4, r2, #18                     ; preload next row
-    pld     [r0, r4]
-
-    sub     r2, r2, r3                      ; inside loop increments input array,
-                                            ; so the height loop only needs to add
-                                            ; r2 - width to the input pointer
-
-    mov     r3, r3, lsl #1                  ; multiply width by 2 because using shorts
-    add     r12, r3, #16                    ; square off the output
-    sub     sp, sp, #4
-
-    ldr     r4, [r11]                       ; load up packed filter coefficients
-    ldr     r5, [r11, #4]
-    ldr     r6, [r11, #8]
-
-    str     r1, [sp]                        ; push destination to stack
-    mov     r7, r7, lsl #16                 ; height is top part of counter
-
-; six tap filter
-|height_loop_1st_16_6|
-    ldrb    r8, [r0, #-2]                   ; load source data
-    ldrb    r9, [r0, #-1]
-    ldrb    r10, [r0], #2
-    orr     r7, r7, r3, lsr #2              ; construct loop counter
-
-|width_loop_1st_16_6|
-    ldrb    r11, [r0, #-1]
-
-    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
-    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
-
-    ldrb    r9, [r0]
-
-    smuad   lr, lr, r4                      ; apply the filter
-    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
-    smuad   r8, r8, r4
-    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
-
-    smlad   lr, r10, r5, lr
-    ldrb    r10, [r0, #1]
-    smlad   r8, r11, r5, r8
-    ldrb    r11, [r0, #2]
-
-    sub     r7, r7, #1
-
-    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
-    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
-
-    smlad   lr, r9, r6, lr
-    smlad   r11, r10, r6, r8
-
-    ands    r10, r7, #0xff                  ; test loop counter
-
-    add     lr, lr, #0x40                   ; round_shift_and_clamp
-    ldrneb  r8, [r0, #-2]                   ; load data for next loop
-    usat    lr, #8, lr, asr #7
-    add     r11, r11, #0x40
-    ldrneb  r9, [r0, #-1]
-    usat    r11, #8, r11, asr #7
-
-    strh    lr, [r1], r12                   ; result is transposed and stored, which
-                                            ; will make second pass filtering easier.
-    ldrneb  r10, [r0], #2
-    strh    r11, [r1], r12
-
-    bne     width_loop_1st_16_6
-
-    ldr     r1, [sp]                        ; load and update dst address
-    subs    r7, r7, #0x10000
-    add     r0, r0, r2                      ; move to next input line
-
-    add     r11, r2, #34                    ; adding back block width(=16)
-    pld     [r0, r11]                       ; preload next row
-
-    add     r1, r1, #2                      ; move over to next column
-    str     r1, [sp]
-
-    bne     height_loop_1st_16_6
-
-    add     sp, sp, #4
-    ldmia   sp!, {r4 - r11, pc}
-
-    ENDP
-
-; --------------------------
-; 8x8 version
-; -----------------------------
-|vp9_filter_block2d_first_pass_8x8_armv6| PROC
-    stmdb   sp!, {r4 - r11, lr}
-
-    ldr     r11, [sp, #40]                  ; vp9_filter address
-    ldr     r7, [sp, #36]                   ; output height
-
-    add     r4, r2, #10                     ; preload next row
-    pld     [r0, r4]
-
-    sub     r2, r2, r3                      ; inside loop increments input array,
-                                            ; so the height loop only needs to add
-                                            ; r2 - width to the input pointer
-
-    mov     r3, r3, lsl #1                  ; multiply width by 2 because using shorts
-    add     r12, r3, #16                    ; square off the output
-    sub     sp, sp, #4
-
-    ldr     r4, [r11]                       ; load up packed filter coefficients
-    ldr     r5, [r11, #4]
-    ldr     r6, [r11, #8]
-
-    str     r1, [sp]                        ; push destination to stack
-    mov     r7, r7, lsl #16                 ; height is top part of counter
-
-; six tap filter
-|height_loop_1st_8_6|
-    ldrb    r8, [r0, #-2]                   ; load source data
-    ldrb    r9, [r0, #-1]
-    ldrb    r10, [r0], #2
-    orr     r7, r7, r3, lsr #2              ; construct loop counter
-
-|width_loop_1st_8_6|
-    ldrb    r11, [r0, #-1]
-
-    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
-    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
-
-    ldrb    r9, [r0]
-
-    smuad   lr, lr, r4                      ; apply the filter
-    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
-    smuad   r8, r8, r4
-    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
-
-    smlad   lr, r10, r5, lr
-    ldrb    r10, [r0, #1]
-    smlad   r8, r11, r5, r8
-    ldrb    r11, [r0, #2]
-
-    sub     r7, r7, #1
-
-    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
-    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
-
-    smlad   lr, r9, r6, lr
-    smlad   r11, r10, r6, r8
-
-    ands    r10, r7, #0xff                  ; test loop counter
-
-    add     lr, lr, #0x40                   ; round_shift_and_clamp
-    ldrneb  r8, [r0, #-2]                   ; load data for next loop
-    usat    lr, #8, lr, asr #7
-    add     r11, r11, #0x40
-    ldrneb  r9, [r0, #-1]
-    usat    r11, #8, r11, asr #7
-
-    strh    lr, [r1], r12                   ; result is transposed and stored, which
-                                            ; will make second pass filtering easier.
-    ldrneb  r10, [r0], #2
-    strh    r11, [r1], r12
-
-    bne     width_loop_1st_8_6
-
-    ldr     r1, [sp]                        ; load and update dst address
-    subs    r7, r7, #0x10000
-    add     r0, r0, r2                      ; move to next input line
-
-    add     r11, r2, #18                    ; adding back block width (=8)
-    pld     [r0, r11]                       ; preload next row
-
-    add     r1, r1, #2                      ; move over to next column
-    str     r1, [sp]
-
-    bne     height_loop_1st_8_6
-
-    add     sp, sp, #4
-    ldmia   sp!, {r4 - r11, pc}
-
-    ENDP
-
-;---------------------------------
-; r0    short         *src_ptr,
-; r1    unsigned char *output_ptr,
-; r2    unsigned int output_pitch,
-; r3    unsigned int cnt,
-; stack const short *vp9_filter
-;---------------------------------
-|vp9_filter_block2d_second_pass_armv6| PROC
-    stmdb   sp!, {r4 - r11, lr}
-
-    ldr     r11, [sp, #36]                  ; vp9_filter address
-    sub     sp, sp, #4
-    mov     r7, r3, lsl #16                 ; height is top part of counter
-    str     r1, [sp]                        ; push destination to stack
-
-    ldr     r4, [r11]                       ; load up packed filter coefficients
-    ldr     r5, [r11, #4]
-    ldr     r6, [r11, #8]
-
-    pkhbt   r12, r5, r4                     ; pack the filter differently
-    pkhbt   r11, r6, r5
-
-    sub     r0, r0, #4                      ; offset input buffer
-
-|height_loop_2nd|
-    ldr     r8, [r0]                        ; load the data
-    ldr     r9, [r0, #4]
-    orr     r7, r7, r3, lsr #1              ; loop counter
-
-|width_loop_2nd|
-    smuad   lr, r4, r8                      ; apply filter
-    sub     r7, r7, #1
-    smulbt  r8, r4, r8
-
-    ldr     r10, [r0, #8]
-
-    smlad   lr, r5, r9, lr
-    smladx  r8, r12, r9, r8
-
-    ldrh    r9, [r0, #12]
-
-    smlad   lr, r6, r10, lr
-    smladx  r8, r11, r10, r8
-
-    add     r0, r0, #4
-    smlatb  r10, r6, r9, r8
-
-    add     lr, lr, #0x40                   ; round_shift_and_clamp
-    ands    r8, r7, #0xff
-    usat    lr, #8, lr, asr #7
-    add     r10, r10, #0x40
-    strb    lr, [r1], r2                    ; the result is transposed back and stored
-    usat    r10, #8, r10, asr #7
-
-    ldrne   r8, [r0]                        ; load data for next loop
-    ldrne   r9, [r0, #4]
-    strb    r10, [r1], r2
-
-    bne     width_loop_2nd
-
-    ldr     r1, [sp]                        ; update dst for next loop
-    subs    r7, r7, #0x10000
-    add     r0, r0, #16                     ; update src for next loop
-    add     r1, r1, #1
-    str     r1, [sp]
-
-    bne     height_loop_2nd
-
-    add     sp, sp, #4
-    ldmia   sp!, {r4 - r11, pc}
-
-    ENDP
-
-;---------------------------------
-; r0    short         *src_ptr,
-; r1    unsigned char *output_ptr,
-; r2    unsigned int output_pitch,
-; r3    unsigned int cnt,
-; stack const short *vp9_filter
-;---------------------------------
-|vp9_filter4_block2d_second_pass_armv6| PROC
-    stmdb   sp!, {r4 - r11, lr}
-
-    ldr     r11, [sp, #36]                  ; vp9_filter address
-    mov     r7, r3, lsl #16                 ; height is top part of counter
-
-    ldr     r4, [r11]                       ; load up packed filter coefficients
-    add     lr, r1, r3                      ; save final destination pointer
-    ldr     r5, [r11, #4]
-    ldr     r6, [r11, #8]
-
-    pkhbt   r12, r5, r4                     ; pack the filter differently
-    pkhbt   r11, r6, r5
-    mov     r4, #0x40                       ; rounding factor (for smlad{x})
-
-|height_loop_2nd_4|
-    ldrd    r8, [r0, #-4]                   ; load the data
-    orr     r7, r7, r3, lsr #1              ; loop counter
-
-|width_loop_2nd_4|
-    ldr     r10, [r0, #4]!
-    smladx  r6, r9, r12, r4                 ; apply filter
-    pkhbt   r8, r9, r8
-    smlad   r5, r8, r12, r4
-    pkhbt   r8, r10, r9
-    smladx  r6, r10, r11, r6
-    sub     r7, r7, #1
-    smlad   r5, r8, r11, r5
-
-    mov     r8, r9                          ; shift the data for the next loop
-    mov     r9, r10
-
-    usat    r6, #8, r6, asr #7              ; shift and clamp
-    usat    r5, #8, r5, asr #7
-
-    strb    r5, [r1], r2                    ; the result is transposed back and stored
-    tst     r7, #0xff
-    strb    r6, [r1], r2
-
-    bne     width_loop_2nd_4
-
-    subs    r7, r7, #0x10000
-    add     r0, r0, #16                     ; update src for next loop
-    sub     r1, lr, r7, lsr #16             ; update dst for next loop
-
-    bne     height_loop_2nd_4
-
-    ldmia   sp!, {r4 - r11, pc}
-
-    ENDP
-
-;------------------------------------
-; r0    unsigned char *src_ptr
-; r1    unsigned char *output_ptr,
-; r2    unsigned int src_pixels_per_line
-; r3    unsigned int cnt,
-; stack unsigned int output_pitch,
-; stack const short *vp9_filter
-;------------------------------------
-|vp9_filter_block2d_first_pass_only_armv6| PROC
-    stmdb   sp!, {r4 - r11, lr}
-
-    add     r7, r2, r3                      ; preload next row
-    add     r7, r7, #2
-    pld     [r0, r7]
-
-    ldr     r4, [sp, #36]                   ; output pitch
-    ldr     r11, [sp, #40]                  ; HFilter address
-    sub     sp, sp, #8
-
-    mov     r7, r3
-    sub     r2, r2, r3                      ; inside loop increments input array,
-                                            ; so the height loop only needs to add
-                                            ; r2 - width to the input pointer
-
-    sub     r4, r4, r3
-    str     r4, [sp]                        ; save modified output pitch
-    str     r2, [sp, #4]
-
-    mov     r2, #0x40
-
-    ldr     r4, [r11]                       ; load up packed filter coefficients
-    ldr     r5, [r11, #4]
-    ldr     r6, [r11, #8]
-
-; six tap filter
-|height_loop_1st_only_6|
-    ldrb    r8, [r0, #-2]                   ; load data
-    ldrb    r9, [r0, #-1]
-    ldrb    r10, [r0], #2
-
-    mov     r12, r3, lsr #1                 ; loop counter
-
-|width_loop_1st_only_6|
-    ldrb    r11, [r0, #-1]
-
-    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
-    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
-
-    ldrb    r9, [r0]
-
-;;  smuad   lr, lr, r4
-    smlad   lr, lr, r4, r2
-    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
-;;  smuad   r8, r8, r4
-    smlad   r8, r8, r4, r2
-    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
-
-    smlad   lr, r10, r5, lr
-    ldrb    r10, [r0, #1]
-    smlad   r8, r11, r5, r8
-    ldrb    r11, [r0, #2]
-
-    subs    r12, r12, #1
-
-    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
-    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
-
-    smlad   lr, r9, r6, lr
-    smlad   r10, r10, r6, r8
-
-;;  add     lr, lr, #0x40                   ; round_shift_and_clamp
-    ldrneb  r8, [r0, #-2]                   ; load data for next loop
-    usat    lr, #8, lr, asr #7
-;;  add     r10, r10, #0x40
-    strb    lr, [r1], #1                    ; store the result
-    usat    r10, #8, r10, asr #7
-
-    ldrneb  r9, [r0, #-1]
-    strb    r10, [r1], #1
-    ldrneb  r10, [r0], #2
-
-    bne     width_loop_1st_only_6
-
-    ldr     lr, [sp]                        ; load back output pitch
-    ldr     r12, [sp, #4]                   ; load back modified src stride
-    subs    r7, r7, #1
-    add     r0, r0, r12                     ; update src for next loop
-
-    add     r11, r12, r3                    ; preload next row
-    add     r11, r11, #2
-    pld     [r0, r11]
-
-    add     r1, r1, lr                      ; update dst for next loop
-
-    bne     height_loop_1st_only_6
-
-    add     sp, sp, #8
-    ldmia   sp!, {r4 - r11, pc}
-    ENDP  ; |vp9_filter_block2d_first_pass_only_armv6|
-
-
-;------------------------------------
-; r0    unsigned char *src_ptr,
-; r1    unsigned char *output_ptr,
-; r2    unsigned int src_pixels_per_line
-; r3    unsigned int cnt,
-; stack unsigned int output_pitch,
-; stack const short *vp9_filter
-;------------------------------------
-|vp9_filter_block2d_second_pass_only_armv6| PROC
-    stmdb   sp!, {r4 - r11, lr}
-
-    ldr     r11, [sp, #40]                  ; VFilter address
-    ldr     r12, [sp, #36]                  ; output pitch
-
-    mov     r7, r3, lsl #16                 ; height is top part of counter
-    sub     r0, r0, r2, lsl #1              ; need 6 elements for filtering, 2 before, 3 after
-
-    sub     sp, sp, #8
-
-    ldr     r4, [r11]                       ; load up packed filter coefficients
-    ldr     r5, [r11, #4]
-    ldr     r6, [r11, #8]
-
-    str     r0, [sp]                        ; save r0 to stack
-    str     r1, [sp, #4]                    ; save dst to stack
-
-; six tap filter
-|width_loop_2nd_only_6|
-    ldrb    r8, [r0], r2                    ; load data
-    orr     r7, r7, r3                      ; loop counter
-    ldrb    r9, [r0], r2
-    ldrb    r10, [r0], r2
-
-|height_loop_2nd_only_6|
-    ; filter first column in this inner loop, then move to next column.
-    ldrb    r11, [r0], r2
-
-    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
-    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
-
-    ldrb    r9, [r0], r2
-
-    smuad   lr, lr, r4
-    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
-    smuad   r8, r8, r4
-    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
-
-    smlad   lr, r10, r5, lr
-    ldrb    r10, [r0], r2
-    smlad   r8, r11, r5, r8
-    ldrb    r11, [r0]
-
-    sub     r7, r7, #2
-    sub     r0, r0, r2, lsl #2
-
-    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
-    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
-
-    smlad   lr, r9, r6, lr
-    smlad   r10, r10, r6, r8
-
-    ands    r9, r7, #0xff
-
-    add     lr, lr, #0x40                   ; round_shift_and_clamp
-    ldrneb  r8, [r0], r2                    ; load data for next loop
-    usat    lr, #8, lr, asr #7
-    add     r10, r10, #0x40
-    strb    lr, [r1], r12                   ; store the result for the column
-    usat    r10, #8, r10, asr #7
-
-    ldrneb  r9, [r0], r2
-    strb    r10, [r1], r12
-    ldrneb  r10, [r0], r2
-
-    bne     height_loop_2nd_only_6
-
-    ldr     r0, [sp]
-    ldr     r1, [sp, #4]
-    subs    r7, r7, #0x10000
-    add     r0, r0, #1                      ; move to filter next column
-    str     r0, [sp]
-    add     r1, r1, #1
-    str     r1, [sp, #4]
-
-    bne     width_loop_2nd_only_6
-
-    add     sp, sp, #8
-
-    ldmia   sp!, {r4 - r11, pc}
-    ENDP  ; |vp9_filter_block2d_second_pass_only_armv6|
-
-    END
--- a/vp8/common/arm/armv6/idct_v6.asm
+++ /dev/null
@@ -1,345 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-;                   r0  r1  r2  r3  r4  r5  r6  r7  r8  r9  r10 r11 r12     r14
-    EXPORT  |vp8_short_idct4x4llm_1_v6|
-    EXPORT  |vp8_short_idct4x4llm_v6|
-    EXPORT  |vp8_short_idct4x4llm_v6_scott|
-    EXPORT  |vp8_short_idct4x4llm_v6_dual|
-
-    AREA    |.text|, CODE, READONLY
-
-;********************************************************************************
-;*  void short_idct4x4llm_1_v6(INT16 * input, INT16 * output, INT32 pitch)
-;*      r0  INT16 * input
-;*      r1  INT16 * output
-;*      r2  INT32 pitch
-;*  bench:  3/5
-;********************************************************************************
-
-|vp8_short_idct4x4llm_1_v6| PROC         ;   cycles  in  out pit
-            ;
-    ldrsh   r0, [r0]    ; load input[0] 1, r0 un 2
-    add r0, r0, #4  ;   1   +4
-    stmdb   sp!, {r4, r5, lr}   ; make room for wide writes 1                   backup
-    mov r0, r0, asr #3  ; (input[0] + 4) >> 3   1, r0 req`d ^1  >> 3
-    pkhbt   r4, r0, r0, lsl #16 ; pack r0 into r4   1, r0 req`d ^1                  pack
-    mov r5, r4  ; expand                        expand
-
-    strd    r4, [r1], r2    ; *output = r0, post inc    1
-    strd    r4, [r1], r2    ;   1
-    strd    r4, [r1], r2    ;   1
-    strd    r4, [r1]    ;   1
-            ;
-    ldmia   sp!, {r4, r5, pc}   ; replace vars, return                      restore
-    ENDP        ; |vp8_short_idct4x4llm_1_v6|
-;********************************************************************************
-;********************************************************************************
-;********************************************************************************
-
-;********************************************************************************
-;*  void short_idct4x4llm_v6(INT16 * input, INT16 * output, INT32 pitch)
-;*      r0  INT16 * input
-;*      r1  INT16 * output
-;*      r2  INT32 pitch
-;*  bench:
-;********************************************************************************
-
-|vp8_short_idct4x4llm_v6| PROC           ;   cycles  in  out pit
-            ;
-    stmdb   sp!, {r4-r11, lr}   ; backup registers  1                   backup
-            ;
-    mov r4, #0x00004E00 ;   1                   cst
-    orr r4, r4, #0x0000007B ; cospi8sqrt2minus1
-    mov r5, #0x00008A00 ;   1                       cst
-    orr r5, r5, #0x0000008C ; sinpi8sqrt2
-            ;
-    mov r6, #4  ; i=4   1                           i
-loop1           ;
-    ldrsh   r12, [r0, #8]   ; input[4]  1, r12 unavail 2                                                    [4]
-    ldrsh   r3, [r0, #24]   ; input[12] 1, r3 unavail 2             [12]
-    ldrsh   r8, [r0, #16]   ; input[8]  1, r8 unavail 2                                 [8]
-    ldrsh   r7, [r0], #0x2  ; input[0]  1, r7 unavail 2 ++                          [0]
-    smulwb  r10, r5, r12    ; ([4] * sinpi8sqrt2) >> 16 1, r10 un 2, r12/r5 ^1                                          t1
-    smulwb  r11, r4, r3 ; ([12] * cospi8sqrt2minus1) >> 16  1, r11 un 2, r3/r4 ^1                                               t2
-    add r9, r7, r8  ; a1 = [0] + [8]    1                                       a1
-    sub r7, r7, r8  ; b1 = [0] - [8]    1                               b1
-    add r11, r3, r11    ; temp2 1
-    rsb r11, r11, r10   ; c1 = temp1 - temp2    1                                               c1
-    smulwb  r3, r5, r3  ; ([12] * sinpi8sqrt2) >> 16    1, r3 un 2, r3/r5 ^ 1               t2
-    smulwb  r10, r4, r12    ; ([4] * cospi8sqrt2minus1) >> 16   1, r10 un 2, r12/r4 ^1                                          t1
-    add r8, r7, r11 ; b1 + c1   1                                   b+c
-    strh    r8, [r1, r2]    ; out[pitch] = b1+c1    1
-    sub r7, r7, r11 ; b1 - c1   1                               b-c
-    add r10, r12, r10   ; temp1 1
-    add r3, r10, r3 ; d1 = temp1 + temp2    1               d1
-    add r10, r9, r3 ; a1 + d1   1                                           a+d
-    sub r3, r9, r3  ; a1 - d1   1               a-d
-    add r8, r2, r2  ; pitch * 2 1                                   p*2
-    strh    r7, [r1, r8]    ; out[pitch*2] = b1-c1  1
-    add r7, r2, r2, lsl #1  ; pitch * 3 1                               p*3
-    strh    r3, [r1, r7]    ; out[pitch*3] = a1-d1  1
-    subs    r6, r6, #1  ; i--   1                           --
-    strh    r10, [r1], #0x2 ; out[0] = a1+d1    1       ++
-    bne loop1   ; if i>0, continue
-            ;
-    sub r1, r1, #8  ; set up out for next loop  1       -4
-            ; for this iteration, input=prev output
-    mov r6, #4  ; i=4   1                           i
-;   b   returnfull
-loop2           ;
-    ldrsh   r11, [r1, #2]   ; input[1]  1, r11 un 2                                             [1]
-    ldrsh   r8, [r1, #6]    ; input[3]  1, r8 un 2                                  [3]
-    ldrsh   r3, [r1, #4]    ; input[2]  1, r3 un 2              [2]
-    ldrsh   r0, [r1]    ; input[0]  1, r0 un 2  [0]
-    smulwb  r9, r5, r11 ; ([1] * sinpi8sqrt2) >> 16 1, r9 un 2, r5/r11 ^1                                       t1
-    smulwb  r10, r4, r8 ; ([3] * cospi8sqrt2minus1) >> 16   1, r10 un 2, r4/r8 ^1                                           t2
-    add r7, r0, r3  ; a1 = [0] + [2]    1                               a1
-    sub r0, r0, r3  ; b1 = [0] - [2]    1   b1
-    add r10, r8, r10    ; temp2 1
-    rsb r9, r10, r9 ; c1 = temp1 - temp2    1                                       c1
-    smulwb  r8, r5, r8  ; ([3] * sinpi8sqrt2) >> 16 1, r8 un 2, r5/r8 ^1                                    t2
-    smulwb  r10, r4, r11    ; ([1] * cospi8sqrt2minus1) >> 16   1, r10 un 2, r4/r11 ^1                                          t1
-    add r3, r0, r9  ; b1+c1 1               b+c
-    add r3, r3, #4  ; b1+c1+4   1               +4
-    add r10, r11, r10   ; temp1 1
-    mov r3, r3, asr #3  ; b1+c1+4 >> 3  1, r3 ^1                >>3
-    strh    r3, [r1, #2]    ; out[1] = b1+c1    1
-    add r10, r10, r8    ; d1 = temp1 + temp2    1                                           d1
-    add r3, r7, r10 ; a1+d1 1               a+d
-    add r3, r3, #4  ; a1+d1+4   1               +4
-    sub r7, r7, r10 ; a1-d1 1                               a-d
-    add r7, r7, #4  ; a1-d1+4   1                               +4
-    mov r3, r3, asr #3  ; a1+d1+4 >> 3  1, r3 ^1                >>3
-    mov r7, r7, asr #3  ; a1-d1+4 >> 3  1, r7 ^1                                >>3
-    strh    r7, [r1, #6]    ; out[3] = a1-d1    1
-    sub r0, r0, r9  ; b1-c1 1   b-c
-    add r0, r0, #4  ; b1-c1+4   1   +4
-    subs    r6, r6, #1  ; i--   1                           --
-    mov r0, r0, asr #3  ; b1-c1+4 >> 3  1, r0 ^1    >>3
-    strh    r0, [r1, #4]    ; out[2] = b1-c1    1
-    strh    r3, [r1], r2    ; out[0] = a1+d1    1
-;   add r1, r1, r2  ; out += pitch  1       ++
-    bne loop2   ; if i>0, continue
-returnfull          ;
-    ldmia   sp!, {r4 - r11, pc} ; replace vars, return                      restore
-    ENDP
-
-;********************************************************************************
-;********************************************************************************
-;********************************************************************************
-
-;********************************************************************************
-;*  void short_idct4x4llm_v6_scott(INT16 * input, INT16 * output, INT32 pitch)
-;*      r0  INT16 * input
-;*      r1  INT16 * output
-;*      r2  INT32 pitch
-;*  bench:
-;********************************************************************************
-
-|vp8_short_idct4x4llm_v6_scott| PROC         ;   cycles  in  out pit
-;   mov r0, #0  ;
-;   ldr r0, [r0]    ;
-    stmdb   sp!, {r4 - r11, lr} ; backup registers  1                   backup
-            ;
-    mov r3, #0x00004E00 ;                   cos
-    orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
-    mov r4, #0x00008A00 ;                       sin
-    orr r4, r4, #0x0000008C ; sinpi8sqrt2
-            ;
-    mov r5, #0x2    ; i                         i
-            ;
-short_idct4x4llm_v6_scott_loop1          ;
-    ldr r10, [r0, #(4*2)]   ; i5 | i4                                               5,4
-    ldr r11, [r0, #(12*2)]  ; i13 | i12                                                 13,12
-            ;
-    smulwb  r6, r4, r10 ; ((ip[4] * sinpi8sqrt2) >> 16)                             lt1
-    smulwb  r7, r3, r11 ; ((ip[12] * cospi8sqrt2minus1) >> 16)                                  lt2
-            ;
-    smulwb  r12, r3, r10    ; ((ip[4] * cospi8sqrt2minus1) >> 16)                                                      l2t2
-    smulwb  r14, r4, r11    ; ((ip[12] * sinpi8sqrt2) >> 16)                                                                l2t1
-            ;
-    add r6, r6, r7  ; partial c1                                lt1-lt2
-    add r12, r12, r14   ; partial d1                                                        l2t2+l2t1
-            ;
-    smulwt  r14, r4, r10    ; ((ip[5] * sinpi8sqrt2) >> 16)                                                             ht1
-    smulwt  r7, r3, r11 ; ((ip[13] * cospi8sqrt2minus1) >> 16)                                  ht2
-            ;
-    smulwt  r8, r3, r10 ; ((ip[5] * cospi8sqrt2minus1) >> 16)                                       h2t1
-    smulwt  r9, r4, r11 ; ((ip[13] * sinpi8sqrt2) >> 16)                                            h2t2
-            ;
-    add r7, r14, r7 ; partial c1_2                                  ht1+ht2
-    sub r8, r8, r9  ; partial d1_2                                      h2t1-h2t2
-            ;
-    pkhbt   r6, r6, r7, lsl #16 ; partial c1_2 | partial c1_1                               pack
-    pkhbt   r12, r12, r8, lsl #16   ; partial d1_2 | partial d1_1                                                       pack
-            ;
-    usub16  r6, r6, r10 ; c1_2 | c1_1                               c
-    uadd16  r12, r12, r11   ; d1_2 | d1_1                                                       d
-            ;
-    ldr r10, [r0, #0]   ; i1 | i0                                               1,0
-    ldr r11, [r0, #(8*2)]   ; i9 | i8                                                   9,8
-            ;
-;;;;;;  add r0, r0, #0x4    ;       +4
-;;;;;;  add r1, r1, #0x4    ;           +4
-            ;
-    uadd16  r8, r10, r11    ; i1 + i9 | i0 + i8 aka a1                                      a
-    usub16  r9, r10, r11    ; i1 - i9 | i0 - i8 aka b1                                          b
-            ;
-    uadd16  r7, r8, r12 ; a1 + d1 pair                                  a+d
-    usub16  r14, r8, r12    ; a1 - d1 pair                                                              a-d
-            ;
-    str r7, [r1]    ; op[0] = a1 + d1
-    str r14, [r1, r2]   ; op[pitch*3] = a1 - d1
-            ;
-    add r0, r0, #0x4    ; op[pitch] = b1 + c1       ++
-    add r1, r1, #0x4    ; op[pitch*2] = b1 - c1         ++
-            ;
-    subs    r5, r5, #0x1    ;                           --
-    bne short_idct4x4llm_v6_scott_loop1  ;
-            ;
-    sub r1, r1, #16 ; reset output ptr
-    mov r5, #0x4    ;
-    mov r0, r1  ; input = output
-            ;
-short_idct4x4llm_v6_scott_loop2          ;
-            ;
-    subs    r5, r5, #0x1    ;
-    bne short_idct4x4llm_v6_scott_loop2  ;
-            ;
-    ldmia   sp!, {r4 - r11, pc} ;
-    ENDP        ;
-            ;
-;********************************************************************************
-;********************************************************************************
-;********************************************************************************
-
-;********************************************************************************
-;*  void short_idct4x4llm_v6_dual(INT16 * input, INT16 * output, INT32 pitch)
-;*      r0  INT16 * input
-;*      r1  INT16 * output
-;*      r2  INT32 pitch
-;*  bench:
-;********************************************************************************
-
-|vp8_short_idct4x4llm_v6_dual| PROC          ;   cycles  in  out pit
-            ;
-    stmdb   sp!, {r4-r11, lr}   ; backup registers  1                   backup
-    mov r3, #0x00004E00 ;                   cos
-    orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
-    mov r4, #0x00008A00 ;                       sin
-    orr r4, r4, #0x0000008C ; sinpi8sqrt2
-    mov r5, #0x2    ; i=2                           i
-loop1_dual
-    ldr r6, [r0, #(4*2)]    ; i5 | i4                               5|4
-    ldr r12, [r0, #(12*2)]  ; i13 | i12                                                     13|12
-    ldr r14, [r0, #(8*2)]   ; i9 | i8                                                               9|8
-
-    smulwt  r9, r3, r6  ; (ip[5] * cospi8sqrt2minus1) >> 16                                         5c
-    smulwb  r7, r3, r6  ; (ip[4] * cospi8sqrt2minus1) >> 16                                 4c
-    smulwt  r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16                                               5s
-    smulwb  r8, r4, r6  ; (ip[4] * sinpi8sqrt2) >> 16                                       4s
-    pkhbt   r7, r7, r9, lsl #16 ; 5c | 4c
-    smulwt  r11, r3, r12    ; (ip[13] * cospi8sqrt2minus1) >> 16                                                    13c
-    pkhbt   r8, r8, r10, lsl #16    ; 5s | 4s
-    uadd16  r6, r6, r7  ; 5c+5 | 4c+4
-    smulwt  r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16                                  13s
-    smulwb  r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16                                            12c
-    smulwb  r10, r4, r12    ; (ip[12] * sinpi8sqrt2) >> 16                                              12s
-    subs    r5, r5, #0x1    ; i--                           --
-    pkhbt   r9, r9, r11, lsl #16    ; 13c | 12c
-    ldr r11, [r0], #0x4 ; i1 | i0       ++                                          1|0
-    pkhbt   r10, r10, r7, lsl #16   ; 13s | 12s
-    uadd16  r7, r12, r9 ; 13c+13 | 12c+12
-    usub16  r7, r8, r7  ; c                                 c
-    uadd16  r6, r6, r10 ; d                             d
-    uadd16  r10, r11, r14   ; a                                             a
-    usub16  r8, r11, r14    ; b                                     b
-    uadd16  r9, r10, r6 ; a+d                                           a+d
-    usub16  r10, r10, r6    ; a-d                                               a-d
-    uadd16  r6, r8, r7  ; b+c                               b+c
-    usub16  r7, r8, r7  ; b-c                                   b-c
-    str r6, [r1, r2]    ; o5 | o4
-    add r6, r2, r2  ; pitch * 2                             p2
-    str r7, [r1, r6]    ; o9 | o8
-    add r6,  r6, r2 ; pitch * 3                             p3
-    str r10, [r1, r6]   ; o13 | o12
-    str r9, [r1], #0x4  ; o1 | o0           ++
-    bne loop1_dual  ;
-    mov r5, #0x2    ; i=2                           i
-    sub r0, r1, #8  ; reset input/output        i/o
-loop2_dual
-    ldr r6, [r0, r2]    ; i5 | i4                               5|4
-    ldr r1, [r0]    ; i1 | i0           1|0
-    ldr r12, [r0, #0x4] ; i3 | i2                                                       3|2
-    add r14, r2, #0x4   ; pitch + 2                                                             p+2
-    ldr r14, [r0, r14]  ; i7 | i6                                                               7|6
-    smulwt  r9, r3, r6  ; (ip[5] * cospi8sqrt2minus1) >> 16                                         5c
-    smulwt  r7, r3, r1  ; (ip[1] * cospi8sqrt2minus1) >> 16                                 1c
-    smulwt  r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16                                               5s
-    smulwt  r8, r4, r1  ; (ip[1] * sinpi8sqrt2) >> 16                                       1s
-    pkhbt   r11, r6, r1, lsl #16    ; i0 | i4                                                   0|4
-    pkhbt   r7, r9, r7, lsl #16 ; 1c | 5c
-    pkhbt   r8, r10, r8, lsl #16    ; 1s | 5s = temp1 (c)                                       tc1
-    pkhtb   r1, r1, r6, asr #16 ; i1 | i5           1|5
-    uadd16  r1, r7, r1  ; 1c+1 | 5c+5 = temp2 (d)           td2
-    pkhbt   r9, r14, r12, lsl #16   ; i2 | i6                                           2|6
-    uadd16  r10, r11, r9    ; a                                             a
-    usub16  r9, r11, r9 ; b                                         b
-    pkhtb   r6, r12, r14, asr #16   ; i3 | i7                               3|7
-    subs    r5, r5, #0x1    ; i--                           --
-    smulwt  r7, r3, r6  ; (ip[3] * cospi8sqrt2minus1) >> 16                                 3c
-    smulwt  r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16                                                   3s
-    smulwb  r12, r3, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16                                                     7c
-    smulwb  r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16                                                               7s
-
-    pkhbt   r7, r12, r7, lsl #16    ; 3c | 7c
-    pkhbt   r11, r14, r11, lsl #16  ; 3s | 7s = temp1 (d)                                                   td1
-    uadd16  r6, r7, r6  ; 3c+3 | 7c+7 = temp2  (c)                              tc2
-    usub16  r12, r8, r6 ; c (o1 | o5)                                                       c
-    uadd16  r6, r11, r1 ; d (o3 | o7)                               d
-    uadd16  r7, r10, r6 ; a+d                                   a+d
-    mov r8, #0x4    ; set up 4's                                        4
-    orr r8, r8, #0x40000    ;                                       4|4
-    usub16  r6, r10, r6 ; a-d                               a-d
-    uadd16  r6, r6, r8  ; a-d+4                             3|7
-    uadd16  r7, r7, r8  ; a+d+4                                 0|4
-    uadd16  r10, r9, r12    ; b+c                                               b+c
-    usub16  r1, r9, r12 ; b-c           b-c
-    uadd16  r10, r10, r8    ; b+c+4                                             1|5
-    uadd16  r1, r1, r8  ; b-c+4         2|6
-    mov r8, r10, asr #19    ; o1 >> 3
-    strh    r8, [r0, #2]    ; o1
-    mov r8, r1, asr #19 ; o2 >> 3
-    strh    r8, [r0, #4]    ; o2
-    mov r8, r6, asr #19 ; o3 >> 3
-    strh    r8, [r0, #6]    ; o3
-    mov r8, r7, asr #19 ; o0 >> 3
-    strh    r8, [r0], r2    ; o0        +p
-    sxth    r10, r10    ;
-    mov r8, r10, asr #3 ; o5 >> 3
-    strh    r8, [r0, #2]    ; o5
-    sxth    r1, r1  ;
-    mov r8, r1, asr #3  ; o6 >> 3
-    strh    r8, [r0, #4]    ; o6
-    sxth    r6, r6  ;
-    mov r8, r6, asr #3  ; o7 >> 3
-    strh    r8, [r0, #6]    ; o7
-    sxth    r7, r7  ;
-    mov r8, r7, asr #3  ; o4 >> 3
-    strh    r8, [r0], r2    ; o4        +p
-;;;;;   subs    r5, r5, #0x1    ; i--                           --
-    bne loop2_dual  ;
-            ;
-    ldmia   sp!, {r4 - r11, pc} ; replace vars, return                      restore
-    ENDP
-
-    END
--- a/vp8/common/arm/armv6/iwalsh_v6.asm
+++ /dev/null
@@ -1,152 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-    EXPORT |vp8_short_inv_walsh4x4_v6|
-    EXPORT |vp8_short_inv_walsh4x4_1_v6|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA    |.text|, CODE, READONLY  ; name this block of code
-
-;short vp8_short_inv_walsh4x4_v6(short *input, short *output)
-|vp8_short_inv_walsh4x4_v6| PROC
-
-    stmdb       sp!, {r4 - r11, lr}
-
-    ldr         r2, [r0], #4         ; [1  |  0]
-    ldr         r3, [r0], #4         ; [3  |  2]
-    ldr         r4, [r0], #4         ; [5  |  4]
-    ldr         r5, [r0], #4         ; [7  |  6]
-    ldr         r6, [r0], #4         ; [9  |  8]
-    ldr         r7, [r0], #4         ; [11 | 10]
-    ldr         r8, [r0], #4         ; [13 | 12]
-    ldr         r9, [r0]             ; [15 | 14]
-
-    qadd16      r10, r2, r8          ; a1 [1+13  |  0+12]
-    qadd16      r11, r4, r6          ; b1 [5+9   |  4+8]
-    qsub16      r12, r4, r6          ; c1 [5-9   |  4-8]
-    qsub16      lr, r2, r8           ; d1 [1-13  |  0-12]
-
-    qadd16      r2, r10, r11         ; a1 + b1 [1  |  0]
-    qadd16      r4, r12, lr          ; c1 + d1 [5  |  4]
-    qsub16      r6, r10, r11         ; a1 - b1 [9  |  8]
-    qsub16      r8, lr, r12          ; d1 - c1 [13 | 12]
-
-    qadd16      r10, r3, r9          ; a1 [3+15  |  2+14]
-    qadd16      r11, r5, r7          ; b1 [7+11  |  6+10]
-    qsub16      r12, r5, r7          ; c1 [7-11  |  6-10]
-    qsub16      lr, r3, r9           ; d1 [3-15  |  2-14]
-
-    qadd16      r3, r10, r11         ; a1 + b1 [3  |  2]
-    qadd16      r5, r12, lr          ; c1 + d1 [7  |  6]
-    qsub16      r7, r10, r11         ; a1 - b1 [11 | 10]
-    qsub16      r9, lr, r12          ; d1 - c1 [15 | 14]
-
-    ; first transform complete
-
-    qsubaddx    r10, r2, r3          ; [c1|a1] [1-2   |   0+3]
-    qaddsubx    r11, r2, r3          ; [b1|d1] [1+2   |   0-3]
-    qsubaddx    r12, r4, r5          ; [c1|a1] [5-6   |   4+7]
-    qaddsubx    lr, r4, r5           ; [b1|d1] [5+6   |   4-7]
-
-    qaddsubx    r2, r10, r11         ; [b2|c2] [c1+d1 | a1-b1]
-    qaddsubx    r3, r11, r10         ; [a2|d2] [b1+a1 | d1-c1]
-    ldr         r10, c0x00030003
-    qaddsubx    r4, r12, lr          ; [b2|c2] [c1+d1 | a1-b1]
-    qaddsubx    r5, lr, r12          ; [a2|d2] [b1+a1 | d1-c1]
-
-    qadd16      r2, r2, r10          ; [b2+3|c2+3]
-    qadd16      r3, r3, r10          ; [a2+3|d2+3]
-    qadd16      r4, r4, r10          ; [b2+3|c2+3]
-    qadd16      r5, r5, r10          ; [a2+3|d2+3]
-
-    asr         r12, r2, #3          ; [1  |  x]
-    pkhtb       r12, r12, r3, asr #19; [1  |  0]
-    lsl         lr, r3, #16          ; [~3 |  x]
-    lsl         r2, r2, #16          ; [~2 |  x]
-    asr         lr, lr, #3           ; [3  |  x]
-    pkhtb       lr, lr, r2, asr #19  ; [3  |  2]
-
-    asr         r2, r4, #3           ; [5  |  x]
-    pkhtb       r2, r2, r5, asr #19  ; [5  |  4]
-    lsl         r3, r5, #16          ; [~7 |  x]
-    lsl         r4, r4, #16          ; [~6 |  x]
-    asr         r3, r3, #3           ; [7  |  x]
-    pkhtb       r3, r3, r4, asr #19  ; [7  |  6]
-
-    str         r12, [r1], #4
-    str         lr, [r1], #4
-    str         r2, [r1], #4
-    str         r3, [r1], #4
-
-    qsubaddx    r2, r6, r7           ; [c1|a1] [9-10  |  8+11]
-    qaddsubx    r3, r6, r7           ; [b1|d1] [9+10  |  8-11]
-    qsubaddx    r4, r8, r9           ; [c1|a1] [13-14 | 12+15]
-    qaddsubx    r5, r8, r9           ; [b1|d1] [13+14 | 12-15]
-
-    qaddsubx    r6, r2, r3           ; [b2|c2] [c1+d1 | a1-b1]
-    qaddsubx    r7, r3, r2           ; [a2|d2] [b1+a1 | d1-c1]
-    qaddsubx    r8, r4, r5           ; [b2|c2] [c1+d1 | a1-b1]
-    qaddsubx    r9, r5, r4           ; [a2|d2] [b1+a1 | d1-c1]
-
-    qadd16      r6, r6, r10          ; [b2+3|c2+3]
-    qadd16      r7, r7, r10          ; [a2+3|d2+3]
-    qadd16      r8, r8, r10          ; [b2+3|c2+3]
-    qadd16      r9, r9, r10          ; [a2+3|d2+3]
-
-    asr         r2, r6, #3           ; [9  |  x]
-    pkhtb       r2, r2, r7, asr #19  ; [9  |  8]
-    lsl         r3, r7, #16          ; [~11|  x]
-    lsl         r4, r6, #16          ; [~10|  x]
-    asr         r3, r3, #3           ; [11 |  x]
-    pkhtb       r3, r3, r4, asr #19  ; [11 | 10]
-
-    asr         r4, r8, #3           ; [13 |  x]
-    pkhtb       r4, r4, r9, asr #19  ; [13 | 12]
-    lsl         r5, r9, #16          ; [~15|  x]
-    lsl         r6, r8, #16          ; [~14|  x]
-    asr         r5, r5, #3           ; [15 |  x]
-    pkhtb       r5, r5, r6, asr #19  ; [15 | 14]
-
-    str         r2, [r1], #4
-    str         r3, [r1], #4
-    str         r4, [r1], #4
-    str         r5, [r1]
-
-    ldmia       sp!, {r4 - r11, pc}
-    ENDP        ; |vp8_short_inv_walsh4x4_v6|
-
-
-;short vp8_short_inv_walsh4x4_1_v6(short *input, short *output)
-|vp8_short_inv_walsh4x4_1_v6| PROC
-
-    ldrsh       r2, [r0]             ; [0]
-    add         r2, r2, #3           ; [0] + 3
-    asr         r2, r2, #3           ; a1 ([0]+3) >> 3
-    lsl         r2, r2, #16          ; [a1 |  x]
-    orr         r2, r2, r2, lsr #16  ; [a1 | a1]
-
-    str         r2, [r1], #4
-    str         r2, [r1], #4
-    str         r2, [r1], #4
-    str         r2, [r1], #4
-    str         r2, [r1], #4
-    str         r2, [r1], #4
-    str         r2, [r1], #4
-    str         r2, [r1]
-
-    bx          lr
-    ENDP        ; |vp8_short_inv_walsh4x4_1_v6|
-
-; Constant Pool
-c0x00030003 DCD 0x00030003
-    END
--- a/vp8/common/arm/armv6/loopfilter_v6.asm
+++ /dev/null
@@ -1,1282 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT |vp9_loop_filter_horizontal_edge_armv6|
-    EXPORT |vp9_mbloop_filter_horizontal_edge_armv6|
-    EXPORT |vp9_loop_filter_vertical_edge_armv6|
-    EXPORT |vp9_mbloop_filter_vertical_edge_armv6|
-
-    AREA    |.text|, CODE, READONLY  ; name this block of code
-
-    MACRO
-    TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
-    ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3
-    ; a0: 03 02 01 00
-    ; a1: 13 12 11 10
-    ; a2: 23 22 21 20
-    ; a3: 33 32 31 30
-    ;     b3 b2 b1 b0
-
-    uxtb16      $b1, $a1                    ; xx 12 xx 10
-    uxtb16      $b0, $a0                    ; xx 02 xx 00
-    uxtb16      $b3, $a3                    ; xx 32 xx 30
-    uxtb16      $b2, $a2                    ; xx 22 xx 20
-    orr         $b1, $b0, $b1, lsl #8       ; 12 02 10 00
-    orr         $b3, $b2, $b3, lsl #8       ; 32 22 30 20
-
-    uxtb16      $a1, $a1, ror #8            ; xx 13 xx 11
-    uxtb16      $a3, $a3, ror #8            ; xx 33 xx 31
-    uxtb16      $a0, $a0, ror #8            ; xx 03 xx 01
-    uxtb16      $a2, $a2, ror #8            ; xx 23 xx 21
-    orr         $a0, $a0, $a1, lsl #8       ; 13 03 11 01
-    orr         $a2, $a2, $a3, lsl #8       ; 33 23 31 21
-
-    pkhtb       $b2, $b3, $b1, asr #16      ; 32 22 12 02   -- p1
-    pkhbt       $b0, $b1, $b3, lsl #16      ; 30 20 10 00   -- p3
-
-    pkhtb       $b3, $a2, $a0, asr #16      ; 33 23 13 03   -- p0
-    pkhbt       $b1, $a0, $a2, lsl #16      ; 31 21 11 01   -- p2
-    MEND
-
-
-src         RN  r0
-pstep       RN  r1
-count       RN  r5
-
-;r0     unsigned char *src_ptr,
-;r1     int src_pixel_step,
-;r2     const char *blimit,
-;r3     const char *limit,
-;stack  const char *thresh,
-;stack  int  count
-
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-|vp9_loop_filter_horizontal_edge_armv6| PROC
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-    stmdb       sp!, {r4 - r11, lr}
-
-    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
-    ldr         count, [sp, #40]            ; count for 8-in-parallel
-    ldr         r6, [sp, #36]               ; load thresh address
-    sub         sp, sp, #16                 ; create temp buffer
-
-    ldr         r9, [src], pstep            ; p3
-    ldrb        r4, [r2]                    ; blimit
-    ldr         r10, [src], pstep           ; p2
-    ldrb        r2, [r3]                    ; limit
-    ldr         r11, [src], pstep           ; p1
-    orr         r4, r4, r4, lsl #8
-    ldrb        r3, [r6]                    ; thresh
-    orr         r2, r2, r2, lsl #8
-    mov         count, count, lsl #1        ; 4-in-parallel
-    orr         r4, r4, r4, lsl #16
-    orr         r3, r3, r3, lsl #8
-    orr         r2, r2, r2, lsl #16
-    orr         r3, r3, r3, lsl #16
-
-|Hnext8|
-    ; vp9_filter_mask() function
-    ; calculate breakout conditions
-    ldr         r12, [src], pstep           ; p0
-
-    uqsub8      r6, r9, r10                 ; p3 - p2
-    uqsub8      r7, r10, r9                 ; p2 - p3
-    uqsub8      r8, r10, r11                ; p2 - p1
-    uqsub8      r10, r11, r10               ; p1 - p2
-
-    orr         r6, r6, r7                  ; abs (p3-p2)
-    orr         r8, r8, r10                 ; abs (p2-p1)
-    uqsub8      lr, r6, r2                  ; compare to limit. lr: vp9_filter_mask
-    uqsub8      r8, r8, r2                  ; compare to limit
-    uqsub8      r6, r11, r12                ; p1 - p0
-    orr         lr, lr, r8
-    uqsub8      r7, r12, r11                ; p0 - p1
-    ldr         r9, [src], pstep            ; q0
-    ldr         r10, [src], pstep           ; q1
-    orr         r6, r6, r7                  ; abs (p1-p0)
-    uqsub8      r7, r6, r2                  ; compare to limit
-    uqsub8      r8, r6, r3                  ; compare to thresh  -- save r8 for later
-    orr         lr, lr, r7
-
-    uqsub8      r6, r11, r10                ; p1 - q1
-    uqsub8      r7, r10, r11                ; q1 - p1
-    uqsub8      r11, r12, r9                ; p0 - q0
-    uqsub8      r12, r9, r12                ; q0 - p0
-    orr         r6, r6, r7                  ; abs (p1-q1)
-    ldr         r7, c0x7F7F7F7F
-    orr         r12, r11, r12               ; abs (p0-q0)
-    ldr         r11, [src], pstep           ; q2
-    uqadd8      r12, r12, r12               ; abs (p0-q0) * 2
-    and         r6, r7, r6, lsr #1          ; abs (p1-q1) / 2
-    uqsub8      r7, r9, r10                 ; q0 - q1
-    uqadd8      r12, r12, r6                ; abs (p0-q0)*2 + abs (p1-q1)/2
-    uqsub8      r6, r10, r9                 ; q1 - q0
-    uqsub8      r12, r12, r4                ; compare to blimit
-    uqsub8      r9, r11, r10                ; q2 - q1
-
-    orr         lr, lr, r12
-
-    ldr         r12, [src], pstep           ; q3
-    uqsub8      r10, r10, r11               ; q1 - q2
-    orr         r6, r7, r6                  ; abs (q1-q0)
-    orr         r10, r9, r10                ; abs (q2-q1)
-    uqsub8      r7, r6, r2                  ; compare to limit
-    uqsub8      r10, r10, r2                ; compare to limit
-    uqsub8      r6, r6, r3                  ; compare to thresh -- save r6 for later
-    orr         lr, lr, r7
-    orr         lr, lr, r10
-
-    uqsub8      r10, r12, r11               ; q3 - q2
-    uqsub8      r9, r11, r12                ; q2 - q3
-
-    mvn         r11, #0                     ; r11 == -1
-
-    orr         r10, r10, r9                ; abs (q3-q2)
-    uqsub8      r10, r10, r2                ; compare to limit
-
-    mov         r12, #0
-    orr         lr, lr, r10
-    sub         src, src, pstep, lsl #2
-
-    usub8       lr, r12, lr                 ; use usub8 instead of ssub8
-    sel         lr, r11, r12                ; filter mask: lr
-
-    cmp         lr, #0
-    beq         hskip_filter                 ; skip filtering
-
-    sub         src, src, pstep, lsl #1     ; move src pointer down by 6 lines
-
-    ;vp8_hevmask() function
-    ;calculate high edge variance
-    orr         r10, r6, r8                 ; calculate vp8_hevmask
-
-    ldr         r7, [src], pstep            ; p1
-
-    usub8       r10, r12, r10               ; use usub8 instead of ssub8
-    sel         r6, r12, r11                ; obtain vp8_hevmask: r6
-
-    ;vp9_filter() function
-    ldr         r8, [src], pstep            ; p0
-    ldr         r12, c0x80808080
-    ldr         r9, [src], pstep            ; q0
-    ldr         r10, [src], pstep           ; q1
-
-    eor         r7, r7, r12                 ; p1 offset to convert to a signed value
-    eor         r8, r8, r12                 ; p0 offset to convert to a signed value
-    eor         r9, r9, r12                 ; q0 offset to convert to a signed value
-    eor         r10, r10, r12               ; q1 offset to convert to a signed value
-
-    str         r9, [sp]                    ; store qs0 temporarily
-    str         r8, [sp, #4]                ; store ps0 temporarily
-    str         r10, [sp, #8]               ; store qs1 temporarily
-    str         r7, [sp, #12]               ; store ps1 temporarily
-
-    qsub8       r7, r7, r10                 ; vp9_signed_char_clamp(ps1-qs1)
-    qsub8       r8, r9, r8                  ; vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0))
-
-    and         r7, r7, r6                  ; vp9_filter (r7) &= hev
-
-    qadd8       r7, r7, r8
-    ldr         r9, c0x03030303             ; r9 = 3 --modified for vp8
-
-    qadd8       r7, r7, r8
-    ldr         r10, c0x04040404
-
-    qadd8       r7, r7, r8
-    and         r7, r7, lr                  ; vp9_filter &= mask;
-
-    ;modify code for vp8 -- Filter1 = vp9_filter (r7)
-    qadd8       r8 , r7 , r9                ; Filter2 (r8) = vp9_signed_char_clamp(vp9_filter+3)
-    qadd8       r7 , r7 , r10               ; vp9_filter = vp9_signed_char_clamp(vp9_filter+4)
-
-    mov         r9, #0
-    shadd8      r8 , r8 , r9                ; Filter2 >>= 3
-    shadd8      r7 , r7 , r9                ; vp9_filter >>= 3
-    shadd8      r8 , r8 , r9
-    shadd8      r7 , r7 , r9
-    shadd8      lr , r8 , r9                ; lr: Filter2
-    shadd8      r7 , r7 , r9                ; r7: filter
-
-    ;usub8      lr, r8, r10                 ; s = (s==4)*-1
-    ;sel        lr, r11, r9
-    ;usub8      r8, r10, r8
-    ;sel        r8, r11, r9
-    ;and        r8, r8, lr                  ; -1 for each element that equals 4
-
-    ;calculate output
-    ;qadd8      lr, r8, r7                  ; u = vp9_signed_char_clamp(s + vp9_filter)
-
-    ldr         r8, [sp]                    ; load qs0
-    ldr         r9, [sp, #4]                ; load ps0
-
-    ldr         r10, c0x01010101
-
-    qsub8       r8 ,r8, r7                  ; u = vp9_signed_char_clamp(qs0 - vp9_filter)
-    qadd8       r9, r9, lr                  ; u = vp9_signed_char_clamp(ps0 + Filter2)
-
-    ;end of modification for vp8
-
-    mov         lr, #0
-    sadd8       r7, r7 , r10                ; vp9_filter += 1
-    shadd8      r7, r7, lr                  ; vp9_filter >>= 1
-
-    ldr         r11, [sp, #12]              ; load ps1
-    ldr         r10, [sp, #8]               ; load qs1
-
-    bic         r7, r7, r6                  ; vp9_filter &= ~hev
-    sub         src, src, pstep, lsl #2
-
-    qadd8       r11, r11, r7                ; u = vp9_signed_char_clamp(ps1 + vp9_filter)
-    qsub8       r10, r10,r7                 ; u = vp9_signed_char_clamp(qs1 - vp9_filter)
-
-    eor         r11, r11, r12               ; *op1 = u^0x80
-    str         r11, [src], pstep           ; store op1
-    eor         r9, r9, r12                 ; *op0 = u^0x80
-    str         r9, [src], pstep            ; store op0 result
-    eor         r8, r8, r12                 ; *oq0 = u^0x80
-    str         r8, [src], pstep            ; store oq0 result
-    eor         r10, r10, r12               ; *oq1 = u^0x80
-    str         r10, [src], pstep           ; store oq1
-
-    sub         src, src, pstep, lsl #1
-
-|hskip_filter|
-    add         src, src, #4
-    sub         src, src, pstep, lsl #2
-
-    subs        count, count, #1
-
-    ldrne       r9, [src], pstep            ; p3
-    ldrne       r10, [src], pstep           ; p2
-    ldrne       r11, [src], pstep           ; p1
-
-    bne         Hnext8
-
-    add         sp, sp, #16
-    ldmia       sp!, {r4 - r11, pc}
-    ENDP        ; |vp9_loop_filter_horizontal_edge_armv6|
-
-
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-|vp9_mbloop_filter_horizontal_edge_armv6| PROC
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-    stmdb       sp!, {r4 - r11, lr}
-
-    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
-    ldr         count, [sp, #40]            ; count for 8-in-parallel
-    ldr         r6, [sp, #36]               ; load thresh address
-    sub         sp, sp, #16                 ; create temp buffer
-
-    ldr         r9, [src], pstep            ; p3
-    ldrb        r4, [r2]                    ; blimit
-    ldr         r10, [src], pstep           ; p2
-    ldrb        r2, [r3]                    ; limit
-    ldr         r11, [src], pstep           ; p1
-    orr         r4, r4, r4, lsl #8
-    ldrb        r3, [r6]                    ; thresh
-    orr         r2, r2, r2, lsl #8
-    mov         count, count, lsl #1        ; 4-in-parallel
-    orr         r4, r4, r4, lsl #16
-    orr         r3, r3, r3, lsl #8
-    orr         r2, r2, r2, lsl #16
-    orr         r3, r3, r3, lsl #16
-
-|MBHnext8|
-
-    ; vp9_filter_mask() function
-    ; calculate breakout conditions
-    ldr         r12, [src], pstep           ; p0
-
-    uqsub8      r6, r9, r10                 ; p3 - p2
-    uqsub8      r7, r10, r9                 ; p2 - p3
-    uqsub8      r8, r10, r11                ; p2 - p1
-    uqsub8      r10, r11, r10               ; p1 - p2
-
-    orr         r6, r6, r7                  ; abs (p3-p2)
-    orr         r8, r8, r10                 ; abs (p2-p1)
-    uqsub8      lr, r6, r2                  ; compare to limit. lr: vp9_filter_mask
-    uqsub8      r8, r8, r2                  ; compare to limit
-
-    uqsub8      r6, r11, r12                ; p1 - p0
-    orr         lr, lr, r8
-    uqsub8      r7, r12, r11                ; p0 - p1
-    ldr         r9, [src], pstep            ; q0
-    ldr         r10, [src], pstep           ; q1
-    orr         r6, r6, r7                  ; abs (p1-p0)
-    uqsub8      r7, r6, r2                  ; compare to limit
-    uqsub8      r8, r6, r3                  ; compare to thresh  -- save r8 for later
-    orr         lr, lr, r7
-
-    uqsub8      r6, r11, r10                ; p1 - q1
-    uqsub8      r7, r10, r11                ; q1 - p1
-    uqsub8      r11, r12, r9                ; p0 - q0
-    uqsub8      r12, r9, r12                ; q0 - p0
-    orr         r6, r6, r7                  ; abs (p1-q1)
-    ldr         r7, c0x7F7F7F7F
-    orr         r12, r11, r12               ; abs (p0-q0)
-    ldr         r11, [src], pstep           ; q2
-    uqadd8      r12, r12, r12               ; abs (p0-q0) * 2
-    and         r6, r7, r6, lsr #1          ; abs (p1-q1) / 2
-    uqsub8      r7, r9, r10                 ; q0 - q1
-    uqadd8      r12, r12, r6                ; abs (p0-q0)*2 + abs (p1-q1)/2
-    uqsub8      r6, r10, r9                 ; q1 - q0
-    uqsub8      r12, r12, r4                ; compare to blimit
-    uqsub8      r9, r11, r10                ; q2 - q1
-
-    orr         lr, lr, r12
-
-    ldr         r12, [src], pstep           ; q3
-
-    uqsub8      r10, r10, r11               ; q1 - q2
-    orr         r6, r7, r6                  ; abs (q1-q0)
-    orr         r10, r9, r10                ; abs (q2-q1)
-    uqsub8      r7, r6, r2                  ; compare to limit
-    uqsub8      r10, r10, r2                ; compare to limit
-    uqsub8      r6, r6, r3                  ; compare to thresh -- save r6 for later
-    orr         lr, lr, r7
-    orr         lr, lr, r10
-
-    uqsub8      r10, r12, r11               ; q3 - q2
-    uqsub8      r9, r11, r12                ; q2 - q3
-
-    mvn         r11, #0                     ; r11 == -1
-
-    orr         r10, r10, r9                ; abs (q3-q2)
-    uqsub8      r10, r10, r2                ; compare to limit
-
-    mov         r12, #0
-
-    orr         lr, lr, r10
-
-    usub8       lr, r12, lr                 ; use usub8 instead of ssub8
-    sel         lr, r11, r12                ; filter mask: lr
-
-    cmp         lr, #0
-    beq         mbhskip_filter               ; skip filtering
-
-    ;vp8_hevmask() function
-    ;calculate high edge variance
-    sub         src, src, pstep, lsl #2     ; move src pointer down by 6 lines
-    sub         src, src, pstep, lsl #1
-
-    orr         r10, r6, r8
-    ldr         r7, [src], pstep            ; p1
-
-    usub8       r10, r12, r10
-    sel         r6, r12, r11                ; hev mask: r6
-
-    ;vp8_mbfilter() function
-    ;p2, q2 are only needed at the end. Don't need to load them in now.
-    ldr         r8, [src], pstep            ; p0
-    ldr         r12, c0x80808080
-    ldr         r9, [src], pstep            ; q0
-    ldr         r10, [src]                  ; q1
-
-    eor         r7, r7, r12                 ; ps1
-    eor         r8, r8, r12                 ; ps0
-    eor         r9, r9, r12                 ; qs0
-    eor         r10, r10, r12               ; qs1
-
-    qsub8       r12, r9, r8                 ; vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0))
-    str         r7, [sp, #12]               ; store ps1 temporarily
-    qsub8       r7, r7, r10                 ; vp9_signed_char_clamp(ps1-qs1)
-    str         r10, [sp, #8]               ; store qs1 temporarily
-    qadd8       r7, r7, r12
-    str         r9, [sp]                    ; store qs0 temporarily
-    qadd8       r7, r7, r12
-    str         r8, [sp, #4]                ; store ps0 temporarily
-    qadd8       r7, r7, r12                 ; vp9_filter: r7
-
-    ldr         r10, c0x03030303            ; r10 = 3 --modified for vp8
-    ldr         r9, c0x04040404
-
-    and         r7, r7, lr                  ; vp9_filter &= mask (lr is free)
-
-    mov         r12, r7                     ; Filter2: r12
-    and         r12, r12, r6                ; Filter2 &= hev
-
-    ;modify code for vp8
-    ;save bottom 3 bits so that we round one side +4 and the other +3
-    qadd8       r8 , r12 , r9               ; Filter1 (r8) = vp9_signed_char_clamp(Filter2+4)
-    qadd8       r12 , r12 , r10             ; Filter2 (r12) = vp9_signed_char_clamp(Filter2+3)
-
-    mov         r10, #0
-    shadd8      r8 , r8 , r10               ; Filter1 >>= 3
-    shadd8      r12 , r12 , r10             ; Filter2 >>= 3
-    shadd8      r8 , r8 , r10
-    shadd8      r12 , r12 , r10
-    shadd8      r8 , r8 , r10               ; r8: Filter1
-    shadd8      r12 , r12 , r10             ; r12: Filter2
-
-    ldr         r9, [sp]                    ; load qs0
-    ldr         r11, [sp, #4]               ; load ps0
-
-    qsub8       r9 , r9, r8                 ; qs0 = vp9_signed_char_clamp(qs0 - Filter1)
-    qadd8       r11, r11, r12               ; ps0 = vp9_signed_char_clamp(ps0 + Filter2)
-
-    ;save bottom 3 bits so that we round one side +4 and the other +3
-    ;and            r8, r12, r10                ; s = Filter2 & 7 (s: r8)
-    ;qadd8      r12 , r12 , r9              ; Filter2 = vp9_signed_char_clamp(Filter2+4)
-    ;mov            r10, #0
-    ;shadd8     r12 , r12 , r10             ; Filter2 >>= 3
-    ;usub8      lr, r8, r9                  ; s = (s==4)*-1
-    ;sel            lr, r11, r10
-    ;shadd8     r12 , r12 , r10
-    ;usub8      r8, r9, r8
-    ;sel            r8, r11, r10
-    ;ldr            r9, [sp]                    ; load qs0
-    ;ldr            r11, [sp, #4]               ; load ps0
-    ;shadd8     r12 , r12 , r10
-    ;and            r8, r8, lr                  ; -1 for each element that equals 4
-    ;qadd8      r10, r8, r12                ; u = vp9_signed_char_clamp(s + Filter2)
-    ;qsub8      r9 , r9, r12                ; qs0 = vp9_signed_char_clamp(qs0 - Filter2)
-    ;qadd8      r11, r11, r10               ; ps0 = vp9_signed_char_clamp(ps0 + u)
-
-    ;end of modification for vp8
-
-    bic         r12, r7, r6                 ; vp9_filter &= ~hev    ( r6 is free)
-    ;mov        r12, r7
-
-    ;roughly 3/7th difference across boundary
-    mov         lr, #0x1b                   ; 27
-    mov         r7, #0x3f                   ; 63
-
-    sxtb16      r6, r12
-    sxtb16      r10, r12, ror #8
-    smlabb      r8, r6, lr, r7
-    smlatb      r6, r6, lr, r7
-    smlabb      r7, r10, lr, r7
-    smultb      r10, r10, lr
-    ssat        r8, #8, r8, asr #7
-    ssat        r6, #8, r6, asr #7
-    add         r10, r10, #63
-    ssat        r7, #8, r7, asr #7
-    ssat        r10, #8, r10, asr #7
-
-    ldr         lr, c0x80808080
-
-    pkhbt       r6, r8, r6, lsl #16
-    pkhbt       r10, r7, r10, lsl #16
-    uxtb16      r6, r6
-    uxtb16      r10, r10
-
-    sub         src, src, pstep
-
-    orr         r10, r6, r10, lsl #8        ; u = vp9_signed_char_clamp((63 + Filter2 * 27)>>7)
-
-    qsub8       r8, r9, r10                 ; s = vp9_signed_char_clamp(qs0 - u)
-    qadd8       r10, r11, r10               ; s = vp9_signed_char_clamp(ps0 + u)
-    eor         r8, r8, lr                  ; *oq0 = s^0x80
-    str         r8, [src]                   ; store *oq0
-    sub         src, src, pstep
-    eor         r10, r10, lr                ; *op0 = s^0x80
-    str         r10, [src]                  ; store *op0
-
-    ;roughly 2/7th difference across boundary
-    mov         lr, #0x12                   ; 18
-    mov         r7, #0x3f                   ; 63
-
-    sxtb16      r6, r12
-    sxtb16      r10, r12, ror #8
-    smlabb      r8, r6, lr, r7
-    smlatb      r6, r6, lr, r7
-    smlabb      r9, r10, lr, r7
-    smlatb      r10, r10, lr, r7
-    ssat        r8, #8, r8, asr #7
-    ssat        r6, #8, r6, asr #7
-    ssat        r9, #8, r9, asr #7
-    ssat        r10, #8, r10, asr #7
-
-    ldr         lr, c0x80808080
-
-    pkhbt       r6, r8, r6, lsl #16
-    pkhbt       r10, r9, r10, lsl #16
-
-    ldr         r9, [sp, #8]                ; load qs1
-    ldr         r11, [sp, #12]              ; load ps1
-
-    uxtb16      r6, r6
-    uxtb16      r10, r10
-
-    sub         src, src, pstep
-
-    orr         r10, r6, r10, lsl #8        ; u = vp9_signed_char_clamp((63 + Filter2 * 18)>>7)
-
-    qadd8       r11, r11, r10               ; s = vp9_signed_char_clamp(ps1 + u)
-    qsub8       r8, r9, r10                 ; s = vp9_signed_char_clamp(qs1 - u)
-    eor         r11, r11, lr                ; *op1 = s^0x80
-    str         r11, [src], pstep           ; store *op1
-    eor         r8, r8, lr                  ; *oq1 = s^0x80
-    add         src, src, pstep, lsl #1
-
-    mov         r7, #0x3f                   ; 63
-
-    str         r8, [src], pstep            ; store *oq1
-
-    ;roughly 1/7th difference across boundary
-    mov         lr, #0x9                    ; 9
-    ldr         r9, [src]                   ; load q2
-
-    sxtb16      r6, r12
-    sxtb16      r10, r12, ror #8
-    smlabb      r8, r6, lr, r7
-    smlatb      r6, r6, lr, r7
-    smlabb      r12, r10, lr, r7
-    smlatb      r10, r10, lr, r7
-    ssat        r8, #8, r8, asr #7
-    ssat        r6, #8, r6, asr #7
-    ssat        r12, #8, r12, asr #7
-    ssat        r10, #8, r10, asr #7
-
-    sub         src, src, pstep, lsl #2
-
-    pkhbt       r6, r8, r6, lsl #16
-    pkhbt       r10, r12, r10, lsl #16
-
-    sub         src, src, pstep
-    ldr         lr, c0x80808080
-
-    ldr         r11, [src]                  ; load p2
-
-    uxtb16      r6, r6
-    uxtb16      r10, r10
-
-    eor         r9, r9, lr
-    eor         r11, r11, lr
-
-    orr         r10, r6, r10, lsl #8        ; u = vp9_signed_char_clamp((63 + Filter2 * 9)>>7)
-
-    qadd8       r8, r11, r10                ; s = vp9_signed_char_clamp(ps2 + u)
-    qsub8       r10, r9, r10                ; s = vp9_signed_char_clamp(qs2 - u)
-    eor         r8, r8, lr                  ; *op2 = s^0x80
-    str         r8, [src], pstep, lsl #2    ; store *op2
-    add         src, src, pstep
-    eor         r10, r10, lr                ; *oq2 = s^0x80
-    str         r10, [src], pstep, lsl #1   ; store *oq2
-
-|mbhskip_filter|
-    add         src, src, #4
-    sub         src, src, pstep, lsl #3
-    subs        count, count, #1
-
-    ldrne       r9, [src], pstep            ; p3
-    ldrne       r10, [src], pstep           ; p2
-    ldrne       r11, [src], pstep           ; p1
-
-    bne         MBHnext8
-
-    add         sp, sp, #16
-    ldmia       sp!, {r4 - r11, pc}
-    ENDP        ; |vp8_mbloop_filter_horizontal_edge_armv6|
-
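For reference while reading the qadd8/shadd8 runs above: the filter value is rounded by adding 4 on one side of the edge and 3 on the other before an arithmetic shift right by 3, and the shift is performed as three shadd8 halving adds with zero so each byte lane shifts without borrowing from its neighbours. A scalar C sketch of one lane (clamp8 stands in for vp9_signed_char_clamp; names are illustrative, not project API):

#include <stdint.h>

static int8_t clamp8(int t) {
    return (int8_t)(t < -128 ? -128 : (t > 127 ? 127 : t));
}

/* One byte lane of the Filter1/Filter2 update applied to (ps0, qs0). */
static void filter_update(int8_t f, int8_t *ps0, int8_t *qs0) {
    int8_t Filter1 = clamp8(f + 4) >> 3;   /* one side rounds with +4 */
    int8_t Filter2 = clamp8(f + 3) >> 3;   /* the other with +3 */
    *qs0 = clamp8(*qs0 - Filter1);
    *ps0 = clamp8(*ps0 + Filter2);
}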
-
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-|vp9_loop_filter_vertical_edge_armv6| PROC
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-    stmdb       sp!, {r4 - r11, lr}
-
-    sub         src, src, #4                ; move src pointer down by 4
-    ldr         count, [sp, #40]            ; count for 8-in-parallel
-    ldr         r12, [sp, #36]              ; load thresh address
-    sub         sp, sp, #16                 ; create temp buffer
-
-    ldr         r6, [src], pstep            ; load source data
-    ldrb        r4, [r2]                    ; blimit
-    ldr         r7, [src], pstep
-    ldrb        r2, [r3]                    ; limit
-    ldr         r8, [src], pstep
-    orr         r4, r4, r4, lsl #8
-    ldrb        r3, [r12]                   ; thresh
-    orr         r2, r2, r2, lsl #8
-    ldr         lr, [src], pstep
-    mov         count, count, lsl #1        ; 4-in-parallel
-    orr         r4, r4, r4, lsl #16
-    orr         r3, r3, r3, lsl #8
-    orr         r2, r2, r2, lsl #16
-    orr         r3, r3, r3, lsl #16
-
-|Vnext8|
-
-    ; vp9_filter_mask() function
-    ; calculate breakout conditions
-    ; transpose the source data for 4-in-parallel operation
-    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
-
-    uqsub8      r7, r9, r10                 ; p3 - p2
-    uqsub8      r8, r10, r9                 ; p2 - p3
-    uqsub8      r9, r10, r11                ; p2 - p1
-    uqsub8      r10, r11, r10               ; p1 - p2
-    orr         r7, r7, r8                  ; abs (p3-p2)
-    orr         r10, r9, r10                ; abs (p2-p1)
-    uqsub8      lr, r7, r2                  ; compare to limit. lr: vp9_filter_mask
-    uqsub8      r10, r10, r2                ; compare to limit
-
-    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
-
-    orr         lr, lr, r10
-
-    uqsub8      r6, r11, r12                ; p1 - p0
-    uqsub8      r7, r12, r11                ; p0 - p1
-    add         src, src, #4                ; move src pointer up by 4
-    orr         r6, r6, r7                  ; abs (p1-p0)
-    str         r11, [sp, #12]              ; save p1
-    uqsub8      r10, r6, r2                 ; compare to limit
-    uqsub8      r11, r6, r3                 ; compare to thresh
-    orr         lr, lr, r10
-
-    ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now
-    ; transpose the source data for 4-in-parallel operation
-    ldr         r6, [src], pstep            ; load source data
-    str         r11, [sp]                   ; push r11 to stack
-    ldr         r7, [src], pstep
-    str         r12, [sp, #4]               ; save current reg before load q0 - q3 data
-    ldr         r8, [src], pstep
-    str         lr, [sp, #8]
-    ldr         lr, [src], pstep
-
-    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
-
-    ldr         lr, [sp, #8]                ; load back (f)limit accumulator
-
-    uqsub8      r6, r12, r11                ; q3 - q2
-    uqsub8      r7, r11, r12                ; q2 - q3
-    uqsub8      r12, r11, r10               ; q2 - q1
-    uqsub8      r11, r10, r11               ; q1 - q2
-    orr         r6, r6, r7                  ; abs (q3-q2)
-    orr         r7, r12, r11                ; abs (q2-q1)
-    uqsub8      r6, r6, r2                  ; compare to limit
-    uqsub8      r7, r7, r2                  ; compare to limit
-    ldr         r11, [sp, #4]               ; load back p0
-    ldr         r12, [sp, #12]              ; load back p1
-    orr         lr, lr, r6
-    orr         lr, lr, r7
-
-    uqsub8      r6, r11, r9                 ; p0 - q0
-    uqsub8      r7, r9, r11                 ; q0 - p0
-    uqsub8      r8, r12, r10                ; p1 - q1
-    uqsub8      r11, r10, r12               ; q1 - p1
-    orr         r6, r6, r7                  ; abs (p0-q0)
-    ldr         r7, c0x7F7F7F7F
-    orr         r8, r8, r11                 ; abs (p1-q1)
-    uqadd8      r6, r6, r6                  ; abs (p0-q0) * 2
-    and         r8, r7, r8, lsr #1          ; abs (p1-q1) / 2
-    uqsub8      r11, r10, r9                ; q1 - q0
-    uqadd8      r6, r8, r6                  ; abs (p0-q0)*2 + abs (p1-q1)/2
-    uqsub8      r12, r9, r10                ; q0 - q1
-    uqsub8      r6, r6, r4                  ; compare to flimit
-
-    orr         r9, r11, r12                ; abs (q1-q0)
-    uqsub8      r8, r9, r2                  ; compare to limit
-    uqsub8      r10, r9, r3                 ; compare to thresh
-    orr         lr, lr, r6
-    orr         lr, lr, r8
-
-    mvn         r11, #0                     ; r11 == -1
-    mov         r12, #0
-
-    usub8       lr, r12, lr
-    ldr         r9, [sp]                    ; load the compared result
-    sel         lr, r11, r12                ; filter mask: lr
-
-    cmp         lr, #0
-    beq         vskip_filter                 ; skip filtering
-
-    ;vp8_hevmask() function
-    ;calculate high edge variance
-
-    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
-
-    orr         r9, r9, r10
-
-    ldrh        r7, [src, #-2]
-    ldrh        r8, [src], pstep
-
-    usub8       r9, r12, r9
-    sel         r6, r12, r11                ; hev mask: r6
-
-    ;vp9_filter() function
-    ; load source data to r6, r11, r12, lr
-    ldrh        r9, [src, #-2]
-    ldrh        r10, [src], pstep
-
-    pkhbt       r12, r7, r8, lsl #16
-
-    ldrh        r7, [src, #-2]
-    ldrh        r8, [src], pstep
-
-    pkhbt       r11, r9, r10, lsl #16
-
-    ldrh        r9, [src, #-2]
-    ldrh        r10, [src], pstep
-
-    ; Transpose needs 8 regs(r6 - r12, and lr). Save r6 and lr first
-    str         r6, [sp]
-    str         lr, [sp, #4]
-
-    pkhbt       r6, r7, r8, lsl #16
-    pkhbt       lr, r9, r10, lsl #16
-
-    ;transpose r12, r11, r6, lr to r7, r8, r9, r10
-    TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10
-
-    ;load back hev_mask r6 and filter_mask lr
-    ldr         r12, c0x80808080
-    ldr         r6, [sp]
-    ldr         lr, [sp, #4]
-
-    eor         r7, r7, r12                 ; p1 offset to convert to a signed value
-    eor         r8, r8, r12                 ; p0 offset to convert to a signed value
-    eor         r9, r9, r12                 ; q0 offset to convert to a signed value
-    eor         r10, r10, r12               ; q1 offset to convert to a signed value
-
-    str         r9, [sp]                    ; store qs0 temporarily
-    str         r8, [sp, #4]                ; store ps0 temporarily
-    str         r10, [sp, #8]               ; store qs1 temporarily
-    str         r7, [sp, #12]               ; store ps1 temporarily
-
-    qsub8       r7, r7, r10                 ; vp9_signed_char_clamp(ps1-qs1)
-    qsub8       r8, r9, r8                  ; vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0))
-
-    and         r7, r7, r6                  ;  vp9_filter (r7) &= hev (r7 : filter)
-
-    qadd8       r7, r7, r8
-    ldr         r9, c0x03030303             ; r9 = 3 --modified for vp8
-
-    qadd8       r7, r7, r8
-    ldr         r10, c0x04040404
-
-    qadd8       r7, r7, r8
-    ;mvn         r11, #0                     ; r11 == -1
-
-    and         r7, r7, lr                  ; vp9_filter &= mask
-
-    ;modify code for vp8 -- Filter1 = vp9_filter (r7)
-    qadd8       r8 , r7 , r9                ; Filter2 (r8) = vp9_signed_char_clamp(vp9_filter+3)
-    qadd8       r7 , r7 , r10               ; vp9_filter = vp9_signed_char_clamp(vp9_filter+4)
-
-    mov         r9, #0
-    shadd8      r8 , r8 , r9                ; Filter2 >>= 3
-    shadd8      r7 , r7 , r9                ; vp9_filter >>= 3
-    shadd8      r8 , r8 , r9
-    shadd8      r7 , r7 , r9
-    shadd8      lr , r8 , r9                ; lr: filter2
-    shadd8      r7 , r7 , r9                ; r7: filter
-
-    ;usub8      lr, r8, r10                 ; s = (s==4)*-1
-    ;sel            lr, r11, r9
-    ;usub8      r8, r10, r8
-    ;sel            r8, r11, r9
-    ;and            r8, r8, lr                  ; -1 for each element that equals 4 -- r8: s
-
-    ;calculate output
-    ;qadd8      lr, r8, r7                  ; u = vp9_signed_char_clamp(s + vp9_filter)
-
-    ldr         r8, [sp]                    ; load qs0
-    ldr         r9, [sp, #4]                ; load ps0
-
-    ldr         r10, c0x01010101
-
-    qsub8       r8, r8, r7                  ; u = vp9_signed_char_clamp(qs0 - vp9_filter)
-    qadd8       r9, r9, lr                  ; u = vp9_signed_char_clamp(ps0 + Filter2)
-    ;end of modification for vp8
-
-    eor         r8, r8, r12
-    eor         r9, r9, r12
-
-    mov         lr, #0
-
-    sadd8       r7, r7, r10
-    shadd8      r7, r7, lr
-
-    ldr         r10, [sp, #8]               ; load qs1
-    ldr         r11, [sp, #12]              ; load ps1
-
-    bic         r7, r7, r6                  ; r7: vp9_filter
-
-    qsub8       r10 , r10, r7               ; u = vp9_signed_char_clamp(qs1 - vp9_filter)
-    qadd8       r11, r11, r7                ; u = vp9_signed_char_clamp(ps1 + vp9_filter)
-    eor         r10, r10, r12
-    eor         r11, r11, r12
-
-    sub         src, src, pstep, lsl #2
-
-    ;we can use TRANSPOSE_MATRIX macro to transpose output - input: q1, q0, p0, p1
-    ;output is b0, b1, b2, b3
-    ;b0: 03 02 01 00
-    ;b1: 13 12 11 10
-    ;b2: 23 22 21 20
-    ;b3: 33 32 31 30
-    ;    p1 p0 q0 q1
-    ;   (a3 a2 a1 a0)
-    TRANSPOSE_MATRIX r11, r9, r8, r10, r6, r7, r12, lr
-
-    strh        r6, [src, #-2]              ; store the result
-    mov         r6, r6, lsr #16
-    strh        r6, [src], pstep
-
-    strh        r7, [src, #-2]
-    mov         r7, r7, lsr #16
-    strh        r7, [src], pstep
-
-    strh        r12, [src, #-2]
-    mov         r12, r12, lsr #16
-    strh        r12, [src], pstep
-
-    strh        lr, [src, #-2]
-    mov         lr, lr, lsr #16
-    strh        lr, [src], pstep
-
-|vskip_filter|
-    sub         src, src, #4
-    subs        count, count, #1
-
-    ldrne       r6, [src], pstep            ; load source data
-    ldrne       r7, [src], pstep
-    ldrne       r8, [src], pstep
-    ldrne       lr, [src], pstep
-
-    bne         Vnext8
-
-    add         sp, sp, #16
-
-    ldmia       sp!, {r4 - r11, pc}
-    ENDP        ; |vp9_loop_filter_vertical_edge_armv6|
-
-
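The uqsub8/orr chains above accumulate the breakout test spelled out in the comments. Per pixel column, the mask and high-edge-variance checks reduce to the scalar logic below (a sketch with illustrative names; the assembly evaluates four columns per word):

#include <stdlib.h>

/* All neighbouring differences must stay within `limit`, and the
 * combined edge difference within `blimit`, for filtering to run. */
static int filter_mask(int limit, int blimit,
                       int p3, int p2, int p1, int p0,
                       int q0, int q1, int q2, int q3) {
    return abs(p3 - p2) <= limit && abs(p2 - p1) <= limit &&
           abs(p1 - p0) <= limit && abs(q1 - q0) <= limit &&
           abs(q2 - q1) <= limit && abs(q3 - q2) <= limit &&
           abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit;
}

/* High edge variance: chooses between the sharp two-pixel update and
 * the wider adjustment in the filters above. */
static int hev_mask(int thresh, int p1, int p0, int q0, int q1) {
    return abs(p1 - p0) > thresh || abs(q1 - q0) > thresh;
}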
-
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-|vp8_mbloop_filter_vertical_edge_armv6| PROC
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-    stmdb       sp!, {r4 - r11, lr}
-
-    sub         src, src, #4                ; move src pointer down by 4
-    ldr         count, [sp, #40]            ; count for 8-in-parallel
-    ldr         r12, [sp, #36]              ; load thresh address
-    pld         [src, #23]                  ; preload for next block
-    sub         sp, sp, #16                 ; create temp buffer
-
-    ldr         r6, [src], pstep            ; load source data
-    ldrb        r4, [r2]                    ; blimit
-    pld         [src, #23]
-    ldr         r7, [src], pstep
-    ldrb        r2, [r3]                    ; limit
-    pld         [src, #23]
-    ldr         r8, [src], pstep
-    orr         r4, r4, r4, lsl #8
-    ldrb        r3, [r12]                   ; thresh
-    orr         r2, r2, r2, lsl #8
-    pld         [src, #23]
-    ldr         lr, [src], pstep
-    mov         count, count, lsl #1        ; 4-in-parallel
-    orr         r4, r4, r4, lsl #16
-    orr         r3, r3, r3, lsl #8
-    orr         r2, r2, r2, lsl #16
-    orr         r3, r3, r3, lsl #16
-
-|MBVnext8|
-    ; vp9_filter_mask() function
-    ; calculate breakout conditions
-    ; transpose the source data for 4-in-parallel operation
-    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
-
-    uqsub8      r7, r9, r10                 ; p3 - p2
-    uqsub8      r8, r10, r9                 ; p2 - p3
-    uqsub8      r9, r10, r11                ; p2 - p1
-    uqsub8      r10, r11, r10               ; p1 - p2
-    orr         r7, r7, r8                  ; abs (p3-p2)
-    orr         r10, r9, r10                ; abs (p2-p1)
-    uqsub8      lr, r7, r2                  ; compare to limit. lr: vp9_filter_mask
-    uqsub8      r10, r10, r2                ; compare to limit
-
-    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
-
-    orr         lr, lr, r10
-
-    uqsub8      r6, r11, r12                ; p1 - p0
-    uqsub8      r7, r12, r11                ; p0 - p1
-    add         src, src, #4                ; move src pointer up by 4
-    orr         r6, r6, r7                  ; abs (p1-p0)
-    str         r11, [sp, #12]              ; save p1
-    uqsub8      r10, r6, r2                 ; compare to limit
-    uqsub8      r11, r6, r3                 ; compare to thresh
-    orr         lr, lr, r10
-
-    ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now
-    ; transpose the source data for 4-in-parallel operation
-    ldr         r6, [src], pstep            ; load source data
-    str         r11, [sp]                   ; push r11 to stack
-    ldr         r7, [src], pstep
-    str         r12, [sp, #4]               ; save current reg before load q0 - q3 data
-    ldr         r8, [src], pstep
-    str         lr, [sp, #8]
-    ldr         lr, [src], pstep
-
-
-    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
-
-    ldr         lr, [sp, #8]                ; load back (f)limit accumulator
-
-    uqsub8      r6, r12, r11                ; q3 - q2
-    uqsub8      r7, r11, r12                ; q2 - q3
-    uqsub8      r12, r11, r10               ; q2 - q1
-    uqsub8      r11, r10, r11               ; q1 - q2
-    orr         r6, r6, r7                  ; abs (q3-q2)
-    orr         r7, r12, r11                ; abs (q2-q1)
-    uqsub8      r6, r6, r2                  ; compare to limit
-    uqsub8      r7, r7, r2                  ; compare to limit
-    ldr         r11, [sp, #4]               ; load back p0
-    ldr         r12, [sp, #12]              ; load back p1
-    orr         lr, lr, r6
-    orr         lr, lr, r7
-
-    uqsub8      r6, r11, r9                 ; p0 - q0
-    uqsub8      r7, r9, r11                 ; q0 - p0
-    uqsub8      r8, r12, r10                ; p1 - q1
-    uqsub8      r11, r10, r12               ; q1 - p1
-    orr         r6, r6, r7                  ; abs (p0-q0)
-    ldr         r7, c0x7F7F7F7F
-    orr         r8, r8, r11                 ; abs (p1-q1)
-    uqadd8      r6, r6, r6                  ; abs (p0-q0) * 2
-    and         r8, r7, r8, lsr #1          ; abs (p1-q1) / 2
-    uqsub8      r11, r10, r9                ; q1 - q0
-    uqadd8      r6, r8, r6                  ; abs (p0-q0)*2 + abs (p1-q1)/2
-    uqsub8      r12, r9, r10                ; q0 - q1
-    uqsub8      r6, r6, r4                  ; compare to flimit
-
-    orr         r9, r11, r12                ; abs (q1-q0)
-    uqsub8      r8, r9, r2                  ; compare to limit
-    uqsub8      r10, r9, r3                 ; compare to thresh
-    orr         lr, lr, r6
-    orr         lr, lr, r8
-
-    mvn         r11, #0                     ; r11 == -1
-    mov         r12, #0
-
-    usub8       lr, r12, lr
-    ldr         r9, [sp]                    ; load the compared result
-    sel         lr, r11, r12                ; filter mask: lr
-
-    cmp         lr, #0
-    beq         mbvskip_filter               ; skip filtering
-
-
-
-    ;vp8_hevmask() function
-    ;calculate high edge variance
-
-    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
-
-    orr         r9, r9, r10
-
-    ldrh        r7, [src, #-2]
-    ldrh        r8, [src], pstep
-
-    usub8       r9, r12, r9
-    sel         r6, r12, r11                ; hev mask: r6
-
-
-    ; vp8_mbfilter() function
-    ; p2, q2 are only needed at the end. Don't need to load them in now.
-    ; Transpose needs 8 regs(r6 - r12, and lr). Save r6 and lr first
-    ; load source data to r6, r11, r12, lr
-    ldrh        r9, [src, #-2]
-    ldrh        r10, [src], pstep
-
-    pkhbt       r12, r7, r8, lsl #16
-
-    ldrh        r7, [src, #-2]
-    ldrh        r8, [src], pstep
-
-    pkhbt       r11, r9, r10, lsl #16
-
-    ldrh        r9, [src, #-2]
-    ldrh        r10, [src], pstep
-
-    str         r6, [sp]                    ; save r6
-    str         lr, [sp, #4]                ; save lr
-
-    pkhbt       r6, r7, r8, lsl #16
-    pkhbt       lr, r9, r10, lsl #16
-
-    ;transpose r12, r11, r6, lr to p1, p0, q0, q1
-    TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10
-
-    ;load back hev_mask r6 and filter_mask lr
-    ldr         r12, c0x80808080
-    ldr         r6, [sp]
-    ldr         lr, [sp, #4]
-
-    eor         r7, r7, r12                 ; ps1
-    eor         r8, r8, r12                 ; ps0
-    eor         r9, r9, r12                 ; qs0
-    eor         r10, r10, r12               ; qs1
-
-    qsub8       r12, r9, r8                 ; vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0))
-    str         r7, [sp, #12]               ; store ps1 temporarily
-    qsub8       r7, r7, r10                 ; vp9_signed_char_clamp(ps1-qs1)
-    str         r10, [sp, #8]               ; store qs1 temporarily
-    qadd8       r7, r7, r12
-    str         r9, [sp]                    ; store qs0 temporarily
-    qadd8       r7, r7, r12
-    str         r8, [sp, #4]                ; store ps0 temporarily
-    qadd8       r7, r7, r12                 ; vp9_filter: r7
-
-    ldr         r10, c0x03030303            ; r10 = 3 --modified for vp8
-    ldr         r9, c0x04040404
-    ;mvn         r11, #0                     ; r11 == -1
-
-    and         r7, r7, lr                  ; vp9_filter &= mask (lr is free)
-
-    mov         r12, r7                     ; Filter2: r12
-    and         r12, r12, r6                ; Filter2 &= hev
-
-    ;modify code for vp8
-    ;save bottom 3 bits so that we round one side +4 and the other +3
-    qadd8       r8 , r12 , r9               ; Filter1 (r8) = vp9_signed_char_clamp(Filter2+4)
-    qadd8       r12 , r12 , r10             ; Filter2 (r12) = vp9_signed_char_clamp(Filter2+3)
-
-    mov         r10, #0
-    shadd8      r8 , r8 , r10               ; Filter1 >>= 3
-    shadd8      r12 , r12 , r10             ; Filter2 >>= 3
-    shadd8      r8 , r8 , r10
-    shadd8      r12 , r12 , r10
-    shadd8      r8 , r8 , r10               ; r8: Filter1
-    shadd8      r12 , r12 , r10             ; r12: Filter2
-
-    ldr         r9, [sp]                    ; load qs0
-    ldr         r11, [sp, #4]               ; load ps0
-
-    qsub8       r9 , r9, r8                 ; qs0 = vp9_signed_char_clamp(qs0 - Filter1)
-    qadd8       r11, r11, r12               ; ps0 = vp9_signed_char_clamp(ps0 + Filter2)
-
-    ;save bottom 3 bits so that we round one side +4 and the other +3
-    ;and            r8, r12, r10                ; s = Filter2 & 7 (s: r8)
-    ;qadd8      r12 , r12 , r9              ; Filter2 = vp9_signed_char_clamp(Filter2+4)
-    ;mov            r10, #0
-    ;shadd8     r12 , r12 , r10             ; Filter2 >>= 3
-    ;usub8      lr, r8, r9                  ; s = (s==4)*-1
-    ;sel            lr, r11, r10
-    ;shadd8     r12 , r12 , r10
-    ;usub8      r8, r9, r8
-    ;sel            r8, r11, r10
-    ;ldr            r9, [sp]                    ; load qs0
-    ;ldr            r11, [sp, #4]               ; load ps0
-    ;shadd8     r12 , r12 , r10
-    ;and            r8, r8, lr                  ; -1 for each element that equals 4
-    ;qadd8      r10, r8, r12                ; u = vp9_signed_char_clamp(s + Filter2)
-    ;qsub8      r9 , r9, r12                ; qs0 = vp9_signed_char_clamp(qs0 - Filter2)
-    ;qadd8      r11, r11, r10               ; ps0 = vp9_signed_char_clamp(ps0 + u)
-
-    ;end of modification for vp8
-
-    bic         r12, r7, r6                 ;vp9_filter &= ~hev    ( r6 is free)
-    ;mov            r12, r7
-
-    ;roughly 3/7th difference across boundary
-    mov         lr, #0x1b                   ; 27
-    mov         r7, #0x3f                   ; 63
-
-    sxtb16      r6, r12
-    sxtb16      r10, r12, ror #8
-    smlabb      r8, r6, lr, r7
-    smlatb      r6, r6, lr, r7
-    smlabb      r7, r10, lr, r7
-    smultb      r10, r10, lr
-    ssat        r8, #8, r8, asr #7
-    ssat        r6, #8, r6, asr #7
-    add         r10, r10, #63
-    ssat        r7, #8, r7, asr #7
-    ssat        r10, #8, r10, asr #7
-
-    ldr         lr, c0x80808080
-
-    pkhbt       r6, r8, r6, lsl #16
-    pkhbt       r10, r7, r10, lsl #16
-    uxtb16      r6, r6
-    uxtb16      r10, r10
-
-    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
-
-    orr         r10, r6, r10, lsl #8        ; u = vp9_signed_char_clamp((63 + Filter2 * 27)>>7)
-
-    qsub8       r8, r9, r10                 ; s = vp9_signed_char_clamp(qs0 - u)
-    qadd8       r10, r11, r10               ; s = vp9_signed_char_clamp(ps0 + u)
-    eor         r8, r8, lr                  ; *oq0 = s^0x80
-    eor         r10, r10, lr                ; *op0 = s^0x80
-
-    strb        r10, [src, #-1]             ; store op0 result
-    strb        r8, [src], pstep            ; store oq0 result
-    mov         r10, r10, lsr #8
-    mov         r8, r8, lsr #8
-    strb        r10, [src, #-1]
-    strb        r8, [src], pstep
-    mov         r10, r10, lsr #8
-    mov         r8, r8, lsr #8
-    strb        r10, [src, #-1]
-    strb        r8, [src], pstep
-    mov         r10, r10, lsr #8
-    mov         r8, r8, lsr #8
-    strb        r10, [src, #-1]
-    strb        r8, [src], pstep
-
-    ;roughly 2/7th difference across boundary
-    mov         lr, #0x12                   ; 18
-    mov         r7, #0x3f                   ; 63
-
-    sxtb16      r6, r12
-    sxtb16      r10, r12, ror #8
-    smlabb      r8, r6, lr, r7
-    smlatb      r6, r6, lr, r7
-    smlabb      r9, r10, lr, r7
-
-    smlatb      r10, r10, lr, r7
-    ssat        r8, #8, r8, asr #7
-    ssat        r6, #8, r6, asr #7
-    ssat        r9, #8, r9, asr #7
-    ssat        r10, #8, r10, asr #7
-
-    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
-
-    pkhbt       r6, r8, r6, lsl #16
-    pkhbt       r10, r9, r10, lsl #16
-
-    ldr         r9, [sp, #8]                ; load qs1
-    ldr         r11, [sp, #12]              ; load ps1
-    ldr         lr, c0x80808080
-
-    uxtb16      r6, r6
-    uxtb16      r10, r10
-
-    add         src, src, #2
-
-    orr         r10, r6, r10, lsl #8        ; u = vp9_signed_char_clamp((63 + Filter2 * 18)>>7)
-
-    qsub8       r8, r9, r10                 ; s = vp9_signed_char_clamp(qs1 - u)
-    qadd8       r10, r11, r10               ; s = vp9_signed_char_clamp(ps1 + u)
-    eor         r8, r8, lr                  ; *oq1 = s^0x80
-    eor         r10, r10, lr                ; *op1 = s^0x80
-
-    ldrb        r11, [src, #-5]             ; load p2 for 1/7th difference across boundary
-    strb        r10, [src, #-4]             ; store op1
-    strb        r8, [src, #-1]              ; store oq1
-    ldrb        r9, [src], pstep            ; load q2 for 1/7th difference across boundary
-
-    mov         r10, r10, lsr #8
-    mov         r8, r8, lsr #8
-
-    ldrb        r6, [src, #-5]
-    strb        r10, [src, #-4]
-    strb        r8, [src, #-1]
-    ldrb        r7, [src], pstep
-
-    mov         r10, r10, lsr #8
-    mov         r8, r8, lsr #8
-    orr         r11, r11, r6, lsl #8
-    orr         r9, r9, r7, lsl #8
-
-    ldrb        r6, [src, #-5]
-    strb        r10, [src, #-4]
-    strb        r8, [src, #-1]
-    ldrb        r7, [src], pstep
-
-    mov         r10, r10, lsr #8
-    mov         r8, r8, lsr #8
-    orr         r11, r11, r6, lsl #16
-    orr         r9, r9, r7, lsl #16
-
-    ldrb        r6, [src, #-5]
-    strb        r10, [src, #-4]
-    strb        r8, [src, #-1]
-    ldrb        r7, [src], pstep
-    orr         r11, r11, r6, lsl #24
-    orr         r9, r9, r7, lsl #24
-
-    ;roughly 1/7th difference across boundary
-    eor         r9, r9, lr
-    eor         r11, r11, lr
-
-    mov         lr, #0x9                    ; 9
-    mov         r7, #0x3f                   ; 63
-
-    sxtb16      r6, r12
-    sxtb16      r10, r12, ror #8
-    smlabb      r8, r6, lr, r7
-    smlatb      r6, r6, lr, r7
-    smlabb      r12, r10, lr, r7
-    smlatb      r10, r10, lr, r7
-    ssat        r8, #8, r8, asr #7
-    ssat        r6, #8, r6, asr #7
-    ssat        r12, #8, r12, asr #7
-    ssat        r10, #8, r10, asr #7
-
-    sub         src, src, pstep, lsl #2
-
-    pkhbt       r6, r8, r6, lsl #16
-    pkhbt       r10, r12, r10, lsl #16
-
-    uxtb16      r6, r6
-    uxtb16      r10, r10
-
-    ldr         lr, c0x80808080
-
-    orr         r10, r6, r10, lsl #8        ; u = vp9_signed_char_clamp((63 + Filter2 * 9)>>7)
-
-    qadd8       r8, r11, r10                ; s = vp9_signed_char_clamp(ps2 + u)
-    qsub8       r10, r9, r10                ; s = vp9_signed_char_clamp(qs2 - u)
-    eor         r8, r8, lr                  ; *op2 = s^0x80
-    eor         r10, r10, lr                ; *oq2 = s^0x80
-
-    strb        r8, [src, #-5]              ; store *op2
-    strb        r10, [src], pstep           ; store *oq2
-    mov         r8, r8, lsr #8
-    mov         r10, r10, lsr #8
-    strb        r8, [src, #-5]
-    strb        r10, [src], pstep
-    mov         r8, r8, lsr #8
-    mov         r10, r10, lsr #8
-    strb        r8, [src, #-5]
-    strb        r10, [src], pstep
-    mov         r8, r8, lsr #8
-    mov         r10, r10, lsr #8
-    strb        r8, [src, #-5]
-    strb        r10, [src], pstep
-
-    ;adjust src pointer for next loop
-    sub         src, src, #2
-
-|mbvskip_filter|
-    sub         src, src, #4
-    subs        count, count, #1
-
-    pld         [src, #23]                  ; preload for next block
-    ldrne       r6, [src], pstep            ; load source data
-    pld         [src, #23]
-    ldrne       r7, [src], pstep
-    pld         [src, #23]
-    ldrne       r8, [src], pstep
-    pld         [src, #23]
-    ldrne       lr, [src], pstep
-
-    bne         MBVnext8
-
-    add         sp, sp, #16
-
-    ldmia       sp!, {r4 - r11, pc}
-    ENDP        ; |vp8_mbloop_filter_vertical_edge_armv6|
-
-; Constant Pool
-c0x80808080 DCD     0x80808080
-c0x03030303 DCD     0x03030303
-c0x04040404 DCD     0x04040404
-c0x01010101 DCD     0x01010101
-c0x7F7F7F7F DCD     0x7F7F7F7F
-
-    END
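The three weighted passes above implement the taper the comments describe: u = vp9_signed_char_clamp((63 + Filter2 * w) >> 7) for w = 27, 18 and 9 (roughly 3/7, 2/7 and 1/7 of the filter value), applied symmetrically outward from the edge. A scalar sketch of one byte lane (helper names illustrative):

#include <stdint.h>

static int8_t clamp8(int t) {
    return (int8_t)(t < -128 ? -128 : (t > 127 ? 127 : t));
}

/* ps[i]/qs[i] are the i-th pixels on either side of the edge; f is the
 * masked filter value (vp9_filter & ~hev above). */
static void mb_taper(int8_t f, int8_t ps[3], int8_t qs[3]) {
    static const int w[3] = { 27, 18, 9 };
    for (int i = 0; i < 3; i++) {
        int8_t u = clamp8((63 + f * w[i]) >> 7);
        ps[i] = clamp8(ps[i] + u);
        qs[i] = clamp8(qs[i] - u);
    }
}

The smlabb/smlatb plus ssat pairs above compute exactly this multiply, round and saturate, two 16-bit lanes at a time.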
--- a/vp8/common/arm/armv6/recon_v6.asm
+++ /dev/null
@@ -1,281 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_recon_b_armv6|
-    EXPORT  |vp8_recon2b_armv6|
-    EXPORT  |vp8_recon4b_armv6|
-
-    AREA    |.text|, CODE, READONLY  ; name this block of code
-prd     RN  r0
-dif     RN  r1
-dst     RN  r2
-stride      RN  r3
-
-;void recon_b(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride)
-; R0 char* pred_ptr
-; R1 short * dif_ptr
-; R2 char * dst_ptr
-; R3 int stride
-
-; Description:
-; Loop through the block adding the Pred and Diff together.  Clamp and then
-; store back into the Dst.
-
-; Restrictions :
-; all buffers are expected to be 4 byte aligned coming in and
-; going out.
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-;
-;
-;
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-|vp8_recon_b_armv6| PROC
-    stmdb   sp!, {r4 - r9, lr}
-
-    ;0, 1, 2, 3
-    ldr     r4, [prd], #16          ; 3 | 2 | 1 | 0
-    ldr     r6, [dif, #0]           ;     1 |     0
-    ldr     r7, [dif, #4]           ;     3 |     2
-
-    pkhbt   r8, r6, r7, lsl #16     ;     2 |     0
-    pkhtb   r9, r7, r6, asr #16     ;     3 |     1
-
-    uxtab16 r8, r8, r4              ;     2 |     0  +  3 | 2 | 2 | 0
-    uxtab16 r9, r9, r4, ror #8      ;     3 |     1  +  0 | 3 | 2 | 1
-
-    usat16  r8, #8, r8
-    usat16  r9, #8, r9
-    add     dif, dif, #32
-    orr     r8, r8, r9, lsl #8
-
-    str     r8, [dst], stride
-
-    ;0, 1, 2, 3
-    ldr     r4, [prd], #16          ; 3 | 2 | 1 | 0
-;;  ldr     r6, [dif, #8]           ;     1 |     0
-;;  ldr     r7, [dif, #12]          ;     3 |     2
-    ldr     r6, [dif, #0]           ;     1 |     0
-    ldr     r7, [dif, #4]           ;     3 |     2
-
-    pkhbt   r8, r6, r7, lsl #16     ;     2 |     0
-    pkhtb   r9, r7, r6, asr #16     ;     3 |     1
-
-    uxtab16 r8, r8, r4              ;     2 |     0  +  3 | 2 | 2 | 0
-    uxtab16 r9, r9, r4, ror #8      ;     3 |     1  +  0 | 3 | 2 | 1
-
-    usat16  r8, #8, r8
-    usat16  r9, #8, r9
-    add     dif, dif, #32
-    orr     r8, r8, r9, lsl #8
-
-    str     r8, [dst], stride
-
-    ;0, 1, 2, 3
-    ldr     r4, [prd], #16          ; 3 | 2 | 1 | 0
-;;  ldr     r6, [dif, #16]          ;     1 |     0
-;;  ldr     r7, [dif, #20]          ;     3 |     2
-    ldr     r6, [dif, #0]           ;     1 |     0
-    ldr     r7, [dif, #4]           ;     3 |     2
-
-    pkhbt   r8, r6, r7, lsl #16     ;     2 |     0
-    pkhtb   r9, r7, r6, asr #16     ;     3 |     1
-
-    uxtab16 r8, r8, r4              ;     2 |     0  +  3 | 2 | 2 | 0
-    uxtab16 r9, r9, r4, ror #8      ;     3 |     1  +  0 | 3 | 2 | 1
-
-    usat16  r8, #8, r8
-    usat16  r9, #8, r9
-    add     dif, dif, #32
-    orr     r8, r8, r9, lsl #8
-
-    str     r8, [dst], stride
-
-    ;0, 1, 2, 3
-    ldr     r4, [prd], #16          ; 3 | 2 | 1 | 0
-;;  ldr     r6, [dif, #24]          ;     1 |     0
-;;  ldr     r7, [dif, #28]          ;     3 |     2
-    ldr     r6, [dif, #0]           ;     1 |     0
-    ldr     r7, [dif, #4]           ;     3 |     2
-
-    pkhbt   r8, r6, r7, lsl #16     ;     2 |     0
-    pkhtb   r9, r7, r6, asr #16     ;     3 |     1
-
-    uxtab16 r8, r8, r4              ;     2 |     0  +  3 | 2 | 2 | 0
-    uxtab16 r9, r9, r4, ror #8      ;     3 |     1  +  0 | 3 | 2 | 1
-
-    usat16  r8, #8, r8
-    usat16  r9, #8, r9
-    orr     r8, r8, r9, lsl #8
-
-    str     r8, [dst], stride
-
-    ldmia   sp!, {r4 - r9, pc}
-
-    ENDP    ; |recon_b|
-
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-;
-;
-;
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-; R0 char  *pred_ptr
-; R1 short *dif_ptr
-; R2 char  *dst_ptr
-; R3 int stride
-|vp8_recon4b_armv6| PROC
-    stmdb   sp!, {r4 - r9, lr}
-
-    mov     lr, #4
-
-recon4b_loop
-    ;0, 1, 2, 3
-    ldr     r4, [prd], #4           ; 3 | 2 | 1 | 0
-    ldr     r6, [dif, #0]           ;     1 |     0
-    ldr     r7, [dif, #4]           ;     3 |     2
-
-    pkhbt   r8, r6, r7, lsl #16     ;     2 |     0
-    pkhtb   r9, r7, r6, asr #16     ;     3 |     1
-
-    uxtab16 r8, r8, r4              ;     2 |     0  +  3 | 2 | 2 | 0
-    uxtab16 r9, r9, r4, ror #8      ;     3 |     1  +  0 | 3 | 2 | 1
-
-    usat16  r8, #8, r8
-    usat16  r9, #8, r9
-    orr     r8, r8, r9, lsl #8
-
-    str     r8, [dst]
-
-    ;4, 5, 6, 7
-    ldr     r4, [prd], #4
-;;  ldr     r6, [dif, #32]
-;;  ldr     r7, [dif, #36]
-    ldr     r6, [dif, #8]
-    ldr     r7, [dif, #12]
-
-    pkhbt   r8, r6, r7, lsl #16
-    pkhtb   r9, r7, r6, asr #16
-
-    uxtab16 r8, r8, r4
-    uxtab16 r9, r9, r4, ror #8
-    usat16  r8, #8, r8
-    usat16  r9, #8, r9
-    orr     r8, r8, r9, lsl #8
-
-    str     r8, [dst, #4]
-
-    ;8, 9, 10, 11
-    ldr     r4, [prd], #4
-;;  ldr     r6, [dif, #64]
-;;  ldr     r7, [dif, #68]
-    ldr     r6, [dif, #16]
-    ldr     r7, [dif, #20]
-
-    pkhbt   r8, r6, r7, lsl #16
-    pkhtb   r9, r7, r6, asr #16
-
-    uxtab16 r8, r8, r4
-    uxtab16 r9, r9, r4, ror #8
-    usat16  r8, #8, r8
-    usat16  r9, #8, r9
-    orr     r8, r8, r9, lsl #8
-
-    str     r8, [dst, #8]
-
-    ;12, 13, 14, 15
-    ldr     r4, [prd], #4
-;;  ldr     r6, [dif, #96]
-;;  ldr     r7, [dif, #100]
-    ldr     r6, [dif, #24]
-    ldr     r7, [dif, #28]
-
-    pkhbt   r8, r6, r7, lsl #16
-    pkhtb   r9, r7, r6, asr #16
-
-    uxtab16 r8, r8, r4
-    uxtab16 r9, r9, r4, ror #8
-    usat16  r8, #8, r8
-    usat16  r9, #8, r9
-    orr     r8, r8, r9, lsl #8
-
-    str     r8, [dst, #12]
-
-    add     dst, dst, stride
-;;  add     dif, dif, #8
-    add     dif, dif, #32
-
-    subs    lr, lr, #1
-    bne     recon4b_loop
-
-    ldmia   sp!, {r4 - r9, pc}
-
-    ENDP    ; |Recon4B|
-
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-;
-;
-;
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-; R0 char  *pred_ptr
-; R1 short *dif_ptr
-; R2 char  *dst_ptr
-; R3 int stride
-|vp8_recon2b_armv6| PROC
-    stmdb   sp!, {r4 - r9, lr}
-
-    mov     lr, #4
-
-recon2b_loop
-    ;0, 1, 2, 3
-    ldr     r4, [prd], #4
-    ldr     r6, [dif, #0]
-    ldr     r7, [dif, #4]
-
-    pkhbt   r8, r6, r7, lsl #16
-    pkhtb   r9, r7, r6, asr #16
-
-    uxtab16 r8, r8, r4
-    uxtab16 r9, r9, r4, ror #8
-    usat16  r8, #8, r8
-    usat16  r9, #8, r9
-    orr     r8, r8, r9, lsl #8
-
-    str     r8, [dst]
-
-    ;4, 5, 6, 7
-    ldr     r4, [prd], #4
-;;  ldr     r6, [dif, #32]
-;;  ldr     r7, [dif, #36]
-    ldr     r6, [dif, #8]
-    ldr     r7, [dif, #12]
-
-    pkhbt   r8, r6, r7, lsl #16
-    pkhtb   r9, r7, r6, asr #16
-
-    uxtab16 r8, r8, r4
-    uxtab16 r9, r9, r4, ror #8
-    usat16  r8, #8, r8
-    usat16  r9, #8, r9
-    orr     r8, r8, r9, lsl #8
-
-    str     r8, [dst, #4]
-
-    add     dst, dst, stride
-;;  add     dif, dif, #8
-    add     dif, dif, #16
-
-    subs    lr, lr, #1
-    bne     recon2b_loop
-
-    ldmia   sp!, {r4 - r9, pc}
-
-    ENDP    ; |Recon2B|
-
-    END
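As the header comment of the file above says, the recon kernels loop through the block adding Pred and Diff, clamping, and storing to Dst; the pkhbt/uxtab16/usat16 triplets do this four pixels per word. A scalar equivalent of the 4x4 case (a sketch; the pitches mirror the #16 and #32 post-increments in the assembly):

#include <stdint.h>

static void recon_b_ref(const uint8_t *pred, const int16_t *diff,
                        uint8_t *dst, int stride) {
    for (int r = 0; r < 4; r++) {
        for (int c = 0; c < 4; c++) {
            int v = pred[c] + diff[c];                         /* uxtab16 */
            dst[c] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); /* usat16 */
        }
        pred += 16;  /* predictor rows are 16 bytes apart */
        diff += 16;  /* 16 shorts, the "add dif, dif, #32" step */
        dst  += stride;
    }
}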
--- a/vp8/common/arm/armv6/simpleloopfilter_v6.asm
+++ /dev/null
@@ -1,286 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT |vp9_loop_filter_simple_horizontal_edge_armv6|
-    EXPORT |vp9_loop_filter_simple_vertical_edge_armv6|
-
-    AREA    |.text|, CODE, READONLY  ; name this block of code
-
-    MACRO
-    TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
-    ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3
-    ; a0: 03 02 01 00
-    ; a1: 13 12 11 10
-    ; a2: 23 22 21 20
-    ; a3: 33 32 31 30
-    ;     b3 b2 b1 b0
-
-    uxtb16      $b1, $a1                    ; xx 12 xx 10
-    uxtb16      $b0, $a0                    ; xx 02 xx 00
-    uxtb16      $b3, $a3                    ; xx 32 xx 30
-    uxtb16      $b2, $a2                    ; xx 22 xx 20
-    orr         $b1, $b0, $b1, lsl #8       ; 12 02 10 00
-    orr         $b3, $b2, $b3, lsl #8       ; 32 22 30 20
-
-    uxtb16      $a1, $a1, ror #8            ; xx 13 xx 11
-    uxtb16      $a3, $a3, ror #8            ; xx 33 xx 31
-    uxtb16      $a0, $a0, ror #8            ; xx 03 xx 01
-    uxtb16      $a2, $a2, ror #8            ; xx 23 xx 21
-    orr         $a0, $a0, $a1, lsl #8       ; 13 03 11 01
-    orr         $a2, $a2, $a3, lsl #8       ; 33 23 31 21
-
-    pkhtb       $b2, $b3, $b1, asr #16      ; 32 22 12 02   -- p1
-    pkhbt       $b0, $b1, $b3, lsl #16      ; 30 20 10 00   -- p3
-
-    pkhtb       $b3, $a2, $a0, asr #16      ; 33 23 13 03   -- p0
-    pkhbt       $b1, $a0, $a2, lsl #16      ; 31 21 11 01   -- p2
-    MEND
-
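TRANSPOSE_MATRIX turns four row words into four column words using only uxtb16/orr/pkhbt/pkhtb; functionally it is a 4x4 byte transpose, which lets the vertical-edge filter reuse the same four-pixels-per-word arithmetic as the horizontal one. A C sketch of the effect (not of the instruction sequence):

#include <stdint.h>

/* a[i] holds four pixels of row i (a0 = 03 02 01 00, low byte first);
 * b[j] receives column j across the four rows (b0 = 30 20 10 00). */
static void transpose4x4(const uint32_t a[4], uint32_t b[4]) {
    for (int col = 0; col < 4; col++) {
        b[col] = ((a[0] >> (8 * col)) & 0xff)
               | (((a[1] >> (8 * col)) & 0xff) << 8)
               | (((a[2] >> (8 * col)) & 0xff) << 16)
               | (((a[3] >> (8 * col)) & 0xff) << 24);
    }
}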
-
-
-src         RN  r0
-pstep       RN  r1
-
-;r0     unsigned char *src_ptr,
-;r1     int src_pixel_step,
-;r2     const char *blimit
-
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-|vp9_loop_filter_simple_horizontal_edge_armv6| PROC
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-    stmdb       sp!, {r4 - r11, lr}
-
-    ldrb        r12, [r2]                   ; blimit
-    ldr         r3, [src, -pstep, lsl #1]   ; p1
-    ldr         r4, [src, -pstep]           ; p0
-    ldr         r5, [src]                   ; q0
-    ldr         r6, [src, pstep]            ; q1
-    orr         r12, r12, r12, lsl #8       ; blimit
-    ldr         r2, c0x80808080
-    orr         r12, r12, r12, lsl #16      ; blimit
-    mov         r9, #4                      ; 4 passes, 4 pixels at a time
-    mov         lr, #0                      ; need 0 in a couple places
-
-|simple_hnext8|
-    ; vp8_simple_filter_mask()
-
-    uqsub8      r7, r3, r6                  ; p1 - q1
-    uqsub8      r8, r6, r3                  ; q1 - p1
-    uqsub8      r10, r4, r5                 ; p0 - q0
-    uqsub8      r11, r5, r4                 ; q0 - p0
-    orr         r8, r8, r7                  ; abs(p1 - q1)
-    orr         r10, r10, r11               ; abs(p0 - q0)
-    uqadd8      r10, r10, r10               ; abs(p0 - q0) * 2
-    uhadd8      r8, r8, lr                  ; abs(p1 - q1) >> 1
-    uqadd8      r10, r10, r8                ; abs(p0 - q0)*2 + abs(p1 - q1)/2
-    mvn         r8, #0
-    usub8       r10, r12, r10               ; compare to flimit. usub8 sets GE flags
-    sel         r10, r8, lr                 ; filter mask: F or 0
-    cmp         r10, #0
-    beq         simple_hskip_filter         ; skip filtering if all masks are 0x00
-
-    ;vp8_simple_filter()
-
-    eor         r3, r3, r2                  ; p1 offset to convert to a signed value
-    eor         r6, r6, r2                  ; q1 offset to convert to a signed value
-    eor         r4, r4, r2                  ; p0 offset to convert to a signed value
-    eor         r5, r5, r2                  ; q0 offset to convert to a signed value
-
-    qsub8       r3, r3, r6                  ; vp9_filter = p1 - q1
-    qsub8       r6, r5, r4                  ; q0 - p0
-    qadd8       r3, r3, r6                  ; += q0 - p0
-    ldr         r7, c0x04040404
-    qadd8       r3, r3, r6                  ; += q0 - p0
-    ldr         r8, c0x03030303
-    qadd8       r3, r3, r6                  ; vp9_filter = p1-q1 + 3*(q0-p0)
-    ;STALL
-    and         r3, r3, r10                 ; vp9_filter &= mask
-
-    qadd8       r7 , r3 , r7                ; Filter1 = vp9_filter + 4
-    qadd8       r8 , r3 , r8                ; Filter2 = vp9_filter + 3
-
-    shadd8      r7 , r7 , lr
-    shadd8      r8 , r8 , lr
-    shadd8      r7 , r7 , lr
-    shadd8      r8 , r8 , lr
-    shadd8      r7 , r7 , lr                ; Filter1 >>= 3
-    shadd8      r8 , r8 , lr                ; Filter2 >>= 3
-
-    qsub8       r5 ,r5, r7                  ; u = q0 - Filter1
-    qadd8       r4, r4, r8                  ; u = p0 + Filter2
-    eor         r5, r5, r2                  ; *oq0 = u^0x80
-    str         r5, [src]                   ; store oq0 result
-    eor         r4, r4, r2                  ; *op0 = u^0x80
-    str         r4, [src, -pstep]           ; store op0 result
-
-|simple_hskip_filter|
-    subs        r9, r9, #1
-    addne       src, src, #4                ; next row
-
-    ldrne       r3, [src, -pstep, lsl #1]   ; p1
-    ldrne       r4, [src, -pstep]           ; p0
-    ldrne       r5, [src]                   ; q0
-    ldrne       r6, [src, pstep]            ; q1
-
-    bne         simple_hnext8
-
-    ldmia       sp!, {r4 - r11, pc}
-    ENDP        ; |vp9_loop_filter_simple_horizontal_edge_armv6|
-
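The usub8/sel pair above is the branchless core of the breakout: usub8 sets a GE flag per byte from a byte-wise subtraction, and sel then materialises 0xFF or 0x00 per lane, so the filter value can be ANDed with the mask instead of branching per pixel. A C emulation of one common shape of this idiom in these files (illustrative only):

#include <stdint.h>

/* Given a word whose bytes are zero in lanes that passed the breakout
 * test (e.g. |p0-q0|*2 + |p1-q1|/2 within blimit), produce 0xFF in
 * passing lanes and 0x00 elsewhere. */
static uint32_t lane_mask(uint32_t exceeded) {
    uint32_t mask = 0;
    for (int i = 0; i < 4; i++)
        if (((exceeded >> (8 * i)) & 0xff) == 0)
            mask |= 0xffu << (8 * i);
    return mask;  /* "vp9_filter &= mask" then zeroes skipped lanes */
}

When the whole word comes back zero, every lane failed the test and the cmp/beq skips the filter body entirely.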
-
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-|vp9_loop_filter_simple_vertical_edge_armv6| PROC
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-    stmdb       sp!, {r4 - r11, lr}
-
-    ldrb        r12, [r2]                   ; r12: blimit
-    ldr         r2, c0x80808080
-    orr         r12, r12, r12, lsl #8
-
-    ; load source data to r7, r8, r9, r10
-    ldrh        r3, [src, #-2]
-    pld         [src, #23]                  ; preload for next block
-    ldrh        r4, [src], pstep
-    orr         r12, r12, r12, lsl #16
-
-    ldrh        r5, [src, #-2]
-    pld         [src, #23]
-    ldrh        r6, [src], pstep
-
-    pkhbt       r7, r3, r4, lsl #16
-
-    ldrh        r3, [src, #-2]
-    pld         [src, #23]
-    ldrh        r4, [src], pstep
-
-    pkhbt       r8, r5, r6, lsl #16
-
-    ldrh        r5, [src, #-2]
-    pld         [src, #23]
-    ldrh        r6, [src], pstep
-    mov         r11, #4                     ; 4 passes, 4 pixels at a time
-
-|simple_vnext8|
-    ; vp8_simple_filter_mask() function
-    pkhbt       r9, r3, r4, lsl #16
-    pkhbt       r10, r5, r6, lsl #16
-
-    ;transpose r7, r8, r9, r10 to r3, r4, r5, r6
-    TRANSPOSE_MATRIX r7, r8, r9, r10, r3, r4, r5, r6
-
-    uqsub8      r7, r3, r6                  ; p1 - q1
-    uqsub8      r8, r6, r3                  ; q1 - p1
-    uqsub8      r9, r4, r5                  ; p0 - q0
-    uqsub8      r10, r5, r4                 ; q0 - p0
-    orr         r7, r7, r8                  ; abs(p1 - q1)
-    orr         r9, r9, r10                 ; abs(p0 - q0)
-    mov         r8, #0
-    uqadd8      r9, r9, r9                  ; abs(p0 - q0) * 2
-    uhadd8      r7, r7, r8                  ; abs(p1 - q1) / 2
-    uqadd8      r7, r7, r9                  ; abs(p0 - q0)*2 + abs(p1 - q1)/2
-    mvn         r10, #0                     ; r10 == -1
-
-    usub8       r7, r12, r7                 ; compare to flimit
-    sel         lr, r10, r8                 ; filter mask
-
-    cmp         lr, #0
-    beq         simple_vskip_filter         ; skip filtering
-
-    ;vp8_simple_filter() function
-    eor         r3, r3, r2                  ; p1 offset to convert to a signed value
-    eor         r6, r6, r2                  ; q1 offset to convert to a signed value
-    eor         r4, r4, r2                  ; p0 offset to convert to a signed value
-    eor         r5, r5, r2                  ; q0 offset to convert to a signed value
-
-    qsub8       r3, r3, r6                  ; vp9_filter = p1 - q1
-    qsub8       r6, r5, r4                  ; q0 - p0
-
-    qadd8       r3, r3, r6                  ; vp9_filter += q0 - p0
-    ldr         r9, c0x03030303             ; r9 = 3
-
-    qadd8       r3, r3, r6                  ; vp9_filter += q0 - p0
-    ldr         r7, c0x04040404
-
-    qadd8       r3, r3, r6                  ; vp9_filter = p1-q1 + 3*(q0-p0)
-    ;STALL
-    and         r3, r3, lr                  ; vp9_filter &= mask
-
-    qadd8       r9 , r3 , r9                ; Filter2 = vp9_filter + 3
-    qadd8       r3 , r3 , r7                ; Filter1 = vp9_filter + 4
-
-    shadd8      r9 , r9 , r8
-    shadd8      r3 , r3 , r8
-    shadd8      r9 , r9 , r8
-    shadd8      r3 , r3 , r8
-    shadd8      r9 , r9 , r8                ; Filter2 >>= 3
-    shadd8      r3 , r3 , r8                ; Filter1 >>= 3
-
-    ;calculate output
-    sub         src, src, pstep, lsl #2
-
-    qadd8       r4, r4, r9                  ; u = p0 + Filter2
-    qsub8       r5, r5, r3                  ; u = q0 - Filter1
-    eor         r4, r4, r2                  ; *op0 = u^0x80
-    eor         r5, r5, r2                  ; *oq0 = u^0x80
-
-    strb        r4, [src, #-1]              ; store the result
-    mov         r4, r4, lsr #8
-    strb        r5, [src], pstep
-    mov         r5, r5, lsr #8
-
-    strb        r4, [src, #-1]
-    mov         r4, r4, lsr #8
-    strb        r5, [src], pstep
-    mov         r5, r5, lsr #8
-
-    strb        r4, [src, #-1]
-    mov         r4, r4, lsr #8
-    strb        r5, [src], pstep
-    mov         r5, r5, lsr #8
-
-    strb        r4, [src, #-1]
-    strb        r5, [src], pstep
-
-|simple_vskip_filter|
-    subs        r11, r11, #1
-
-    ; load source data to r7, r8, r9, r10
-    ldrneh      r3, [src, #-2]
-    pld         [src, #23]                  ; preload for next block
-    ldrneh      r4, [src], pstep
-
-    ldrneh      r5, [src, #-2]
-    pld         [src, #23]
-    ldrneh      r6, [src], pstep
-
-    pkhbt       r7, r3, r4, lsl #16
-
-    ldrneh      r3, [src, #-2]
-    pld         [src, #23]
-    ldrneh      r4, [src], pstep
-
-    pkhbt       r8, r5, r6, lsl #16
-
-    ldrneh      r5, [src, #-2]
-    pld         [src, #23]
-    ldrneh      r6, [src], pstep
-
-    bne         simple_vnext8
-
-    ldmia       sp!, {r4 - r11, pc}
-    ENDP        ; |vp9_loop_filter_simple_vertical_edge_armv6|
-
-; Constant Pool
-c0x80808080 DCD     0x80808080
-c0x03030303 DCD     0x03030303
-c0x04040404 DCD     0x04040404
-
-    END
--- a/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
+++ /dev/null
@@ -1,273 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_sixtap_predict8x4_armv6|
-
-    AREA    |.text|, CODE, READONLY  ; name this block of code
-;-------------------------------------
-; r0    unsigned char *src_ptr,
-; r1    int  src_pixels_per_line,
-; r2    int  xoffset,
-; r3    int  yoffset,
-; stack unsigned char *dst_ptr,
-; stack int  dst_pitch
-;-------------------------------------
-;note: In the first pass, the result is stored transposed (8 lines x 9 columns)
-;on the stack; temporary stack size is 184. Line width is 20 bytes, i.e. 9
-;shorts plus 2 bytes of padding for 4-byte alignment. In the second pass, data
-;is loaded from the stack and the result is stored transposed back.
-|vp8_sixtap_predict8x4_armv6| PROC
-    stmdb       sp!, {r4 - r11, lr}
-    str         r3, [sp, #-184]!            ;reserve space on stack for temporary storage, store yoffset
-
-    cmp         r2, #0                      ;skip first_pass filter if xoffset=0
-    add         lr, sp, #4                  ;point to temporary buffer
-    beq         skip_firstpass_filter
-
-;first-pass filter
-    adr         r12, filter8_coeff
-    sub         r0, r0, r1, lsl #1
-
-    add         r3, r1, #10                 ; preload next row
-    pld         [r0, r3]
-
-    add         r2, r12, r2, lsl #4         ;calculate filter location
-    add         r0, r0, #3                  ;adjust src only for loading convenience
-
-    ldr         r3, [r2]                    ; load up packed filter coefficients
-    ldr         r4, [r2, #4]
-    ldr         r5, [r2, #8]
-
-    mov         r2, #0x90000                ; height=9 is top part of counter
-
-    sub         r1, r1, #8
-
-|first_pass_hloop_v6|
-    ldrb        r6, [r0, #-5]               ; load source data
-    ldrb        r7, [r0, #-4]
-    ldrb        r8, [r0, #-3]
-    ldrb        r9, [r0, #-2]
-    ldrb        r10, [r0, #-1]
-
-    orr         r2, r2, #0x4                ; construct loop counter. width=8=4x2
-
-    pkhbt       r6, r6, r7, lsl #16         ; r7 | r6
-    pkhbt       r7, r7, r8, lsl #16         ; r8 | r7
-
-    pkhbt       r8, r8, r9, lsl #16         ; r9 | r8
-    pkhbt       r9, r9, r10, lsl #16        ; r10 | r9
-
-|first_pass_wloop_v6|
-    smuad       r11, r6, r3                 ; vp9_filter[0], vp9_filter[1]
-    smuad       r12, r7, r3
-
-    ldrb        r6, [r0], #1
-
-    smlad       r11, r8, r4, r11            ; vp9_filter[2], vp9_filter[3]
-    ldrb        r7, [r0], #1
-    smlad       r12, r9, r4, r12
-
-    pkhbt       r10, r10, r6, lsl #16       ; r10 | r9
-    pkhbt       r6, r6, r7, lsl #16         ; r11 | r10
-    smlad       r11, r10, r5, r11           ; vp9_filter[4], vp9_filter[5]
-    smlad       r12, r6, r5, r12
-
-    sub         r2, r2, #1
-
-    add         r11, r11, #0x40             ; round_shift_and_clamp
-    tst         r2, #0xff                   ; test loop counter
-    usat        r11, #8, r11, asr #7
-    add         r12, r12, #0x40
-    strh        r11, [lr], #20              ; result is transposed and stored
-    usat        r12, #8, r12, asr #7
-
-    strh        r12, [lr], #20
-
-    movne       r11, r6
-    movne       r12, r7
-
-    movne       r6, r8
-    movne       r7, r9
-    movne       r8, r10
-    movne       r9, r11
-    movne       r10, r12
-
-    bne         first_pass_wloop_v6
-
-    ;;add       r9, ppl, #30                ; attempt to load 2 adjacent cache lines
-    ;;IF ARCHITECTURE=6
-    ;pld        [src, ppl]
-    ;;pld       [src, r9]
-    ;;ENDIF
-
-    subs        r2, r2, #0x10000
-
-    sub         lr, lr, #158
-
-    add         r0, r0, r1                  ; move to next input line
-
-    add         r11, r1, #18                ; preload next row, adding back the block width (=8) subtracted earlier
-    pld         [r0, r11]
-
-    bne         first_pass_hloop_v6
-
-;second pass filter
-secondpass_filter
-    ldr         r3, [sp], #4                ; load back yoffset
-    ldr         r0, [sp, #216]              ; load dst address from stack 180+36
-    ldr         r1, [sp, #220]              ; load dst stride from stack 180+40
-
-    cmp         r3, #0
-    beq         skip_secondpass_filter
-
-    adr         r12, filter8_coeff
-    add         lr, r12, r3, lsl #4         ;calculate filter location
-
-    mov         r2, #0x00080000
-
-    ldr         r3, [lr]                    ; load up packed filter coefficients
-    ldr         r4, [lr, #4]
-    ldr         r5, [lr, #8]
-
-    pkhbt       r12, r4, r3                 ; pack the filter differently
-    pkhbt       r11, r5, r4
-
-second_pass_hloop_v6
-    ldr         r6, [sp]                    ; load the data
-    ldr         r7, [sp, #4]
-
-    orr         r2, r2, #2                  ; loop counter
-
-second_pass_wloop_v6
-    smuad       lr, r3, r6                  ; apply filter
-    smulbt      r10, r3, r6
-
-    ldr         r8, [sp, #8]
-
-    smlad       lr, r4, r7, lr
-    smladx      r10, r12, r7, r10
-
-    ldrh        r9, [sp, #12]
-
-    smlad       lr, r5, r8, lr
-    smladx      r10, r11, r8, r10
-
-    add         sp, sp, #4
-    smlatb      r10, r5, r9, r10
-
-    sub         r2, r2, #1
-
-    add         lr, lr, #0x40               ; round_shift_and_clamp
-    tst         r2, #0xff
-    usat        lr, #8, lr, asr #7
-    add         r10, r10, #0x40
-    strb        lr, [r0], r1                ; the result is transposed back and stored
-    usat        r10, #8, r10, asr #7
-
-    strb        r10, [r0],r1
-
-    movne       r6, r7
-    movne       r7, r8
-
-    bne         second_pass_wloop_v6
-
-    subs        r2, r2, #0x10000
-    add         sp, sp, #12                 ; update src for next loop (20-8)
-    sub         r0, r0, r1, lsl #2
-    add         r0, r0, #1
-
-    bne         second_pass_hloop_v6
-
-    add         sp, sp, #20
-    ldmia       sp!, {r4 - r11, pc}
-
-;--------------------
-skip_firstpass_filter
-    sub         r0, r0, r1, lsl #1
-    sub         r1, r1, #8
-    mov         r2, #9
-
-skip_firstpass_hloop
-    ldrb        r4, [r0], #1                ; load data
-    subs        r2, r2, #1
-    ldrb        r5, [r0], #1
-    strh        r4, [lr], #20               ; store it to immediate buffer
-    ldrb        r6, [r0], #1                ; load data
-    strh        r5, [lr], #20
-    ldrb        r7, [r0], #1
-    strh        r6, [lr], #20
-    ldrb        r8, [r0], #1
-    strh        r7, [lr], #20
-    ldrb        r9, [r0], #1
-    strh        r8, [lr], #20
-    ldrb        r10, [r0], #1
-    strh        r9, [lr], #20
-    ldrb        r11, [r0], #1
-    strh        r10, [lr], #20
-    add         r0, r0, r1                  ; move to next input line
-    strh        r11, [lr], #20
-
-    sub         lr, lr, #158                ; move over to next column
-    bne         skip_firstpass_hloop
-
-    b           secondpass_filter
-
-;--------------------
-skip_secondpass_filter
-    mov         r2, #8
-    add         sp, sp, #4                  ;start from src[0] instead of src[-2]
-
-skip_secondpass_hloop
-    ldr         r6, [sp], #4
-    subs        r2, r2, #1
-    ldr         r8, [sp], #4
-
-    mov         r7, r6, lsr #16             ; unpack
-    strb        r6, [r0], r1
-    mov         r9, r8, lsr #16
-    strb        r7, [r0], r1
-    add         sp, sp, #12                 ; 20-8
-    strb        r8, [r0], r1
-    strb        r9, [r0], r1
-
-    sub         r0, r0, r1, lsl #2
-    add         r0, r0, #1
-
-    bne         skip_secondpass_hloop
-
-    add         sp, sp, #16                 ; 180 - (160 +4)
-
-    ldmia       sp!, {r4 - r11, pc}
-
-    ENDP
-
-;-----------------
-;One word each is reserved. Label filter8_coeff can be used to access the data.
-;Data address: filter8_coeff, filter8_coeff+4, filter8_coeff+8 ...
-filter8_coeff
-    DCD     0x00000000,     0x00000080,     0x00000000,     0x00000000
-    DCD     0xfffa0000,     0x000c007b,     0x0000ffff,     0x00000000
-    DCD     0xfff50002,     0x0024006c,     0x0001fff8,     0x00000000
-    DCD     0xfff70000,     0x0032005d,     0x0000fffa,     0x00000000
-    DCD     0xfff00003,     0x004d004d,     0x0003fff0,     0x00000000
-    DCD     0xfffa0000,     0x005d0032,     0x0000fff7,     0x00000000
-    DCD     0xfff80001,     0x006c0024,     0x0002fff5,     0x00000000
-    DCD     0xffff0000,     0x007b000c,     0x0000fffa,     0x00000000
-
-    ;DCD        0,  0,  128,    0,   0,  0
-    ;DCD        0, -6,  123,   12,  -1,  0
-    ;DCD        2, -11, 108,   36,  -8,  1
-    ;DCD        0, -9,   93,   50,  -6,  0
-    ;DCD        3, -16,  77,   77, -16,  3
-    ;DCD        0, -6,   50,   93,  -9,  0
-    ;DCD        1, -8,   36,  108, -11,  2
-    ;DCD        0, -1,   12,  123,  -6,  0
-
-    END
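
The packed words in filter8_coeff encode the commented-out tap rows as signed
16-bit halfword pairs so that SMUAD/SMLAD/SMLADX can perform two
multiply-accumulates per instruction. A sketch of the packing for the row
{0, -6, 123, 12, -1, 0}, using a hypothetical helper:

    #include <stdint.h>

    /* Pack two signed 16-bit taps into one word, low halfword first.
     * Illustrative only; the table above is the authoritative data. */
    static uint32_t pack_taps(int16_t lo, int16_t hi) {
      return (uint16_t)lo | ((uint32_t)(uint16_t)hi << 16);
    }

    /* pack_taps(0, -6)   == 0xfffa0000
     * pack_taps(123, 12) == 0x000c007b
     * pack_taps(-1, 0)   == 0x0000ffff */
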
--- a/vp8/common/arm/bilinearfilter_arm.c
+++ /dev/null
@@ -1,108 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <math.h>
-#include "vp8/common/filter.h"
-#include "vp8/common/subpixel.h"
-#include "bilinearfilter_arm.h"
-
-void vp9_filter_block2d_bil_armv6
-(
-  unsigned char *src_ptr,
-  unsigned char *dst_ptr,
-  unsigned int   src_pitch,
-  unsigned int   dst_pitch,
-  const short   *HFilter,
-  const short   *VFilter,
-  int            Width,
-  int            Height
-) {
-  unsigned short FData[36 * 16]; /* Temp data buffer used in filtering */
-
-  /* First filter 1-D horizontally... */
-  vp9_filter_block2d_bil_first_pass_armv6(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
-
-  /* then 1-D vertically... */
-  vp9_filter_block2d_bil_second_pass_armv6(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
-}
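
The wrapper runs the horizontal pass over Height + 1 rows because the 2-tap
vertical pass reads one extra intermediate row per block. A scalar model of
the first pass follows (the second pass repeats the same two-tap combine
vertically on the 16-bit buffer); this is a sketch of the generic behavior,
not of the assembly internals:

    /* Horizontal bilinear pass, scalar sketch. f[0] + f[1] == 128, as in
     * vp8_bilinear_filters, so adding 64 and shifting by 7 restores unit
     * gain. */
    static void bil_first_pass_sketch(const unsigned char *src,
                                      unsigned short *dst,
                                      unsigned int src_pitch,
                                      unsigned int height,
                                      unsigned int width, const short *f) {
      unsigned int i, j;
      for (i = 0; i < height; i++)
        for (j = 0; j < width; j++)
          dst[i * width + j] = (unsigned short)
              ((src[i * src_pitch + j] * f[0] +
                src[i * src_pitch + j + 1] * f[1] + 64) >> 7);
    }
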
-
-
-void vp9_bilinear_predict4x4_armv6
-(
-  unsigned char  *src_ptr,
-  int   src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  unsigned char *dst_ptr,
-  int dst_pitch
-) {
-  const short  *HFilter;
-  const short  *VFilter;
-
-  HFilter = vp8_bilinear_filters[xoffset];
-  VFilter = vp8_bilinear_filters[yoffset];
-
-  vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
-}
-
-void vp9_bilinear_predict8x8_armv6
-(
-  unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  unsigned char *dst_ptr,
-  int  dst_pitch
-) {
-  const short  *HFilter;
-  const short  *VFilter;
-
-  HFilter = vp8_bilinear_filters[xoffset];
-  VFilter = vp8_bilinear_filters[yoffset];
-
-  vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
-}
-
-void vp9_bilinear_predict8x4_armv6
-(
-  unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  unsigned char *dst_ptr,
-  int  dst_pitch
-) {
-  const short  *HFilter;
-  const short  *VFilter;
-
-  HFilter = vp8_bilinear_filters[xoffset];
-  VFilter = vp8_bilinear_filters[yoffset];
-
-  vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
-}
-
-void vp9_bilinear_predict16x16_armv6
-(
-  unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  unsigned char *dst_ptr,
-  int  dst_pitch
-) {
-  const short  *HFilter;
-  const short  *VFilter;
-
-  HFilter = vp8_bilinear_filters[xoffset];
-  VFilter = vp8_bilinear_filters[yoffset];
-
-  vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
-}
--- a/vp8/common/arm/bilinearfilter_arm.h
+++ /dev/null
@@ -1,35 +1,0 @@
-/*
- *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef BILINEARFILTER_ARM_H
-#define BILINEARFILTER_ARM_H
-
-extern void vp9_filter_block2d_bil_first_pass_armv6
-(
-  const unsigned char  *src_ptr,
-  unsigned short       *dst_ptr,
-  unsigned int          src_pitch,
-  unsigned int          height,
-  unsigned int          width,
-  const short          *vp9_filter
-);
-
-extern void vp9_filter_block2d_bil_second_pass_armv6
-(
-  const unsigned short *src_ptr,
-  unsigned char        *dst_ptr,
-  int                   dst_pitch,
-  unsigned int          height,
-  unsigned int          width,
-  const short         *vp9_filter
-);
-
-#endif /* BILINEARFILTER_ARM_H */
--- a/vp8/common/arm/filter_arm.c
+++ /dev/null
@@ -1,198 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include <math.h>
-#include "vp8/common/filter.h"
-#include "vp8/common/subpixel.h"
-#include "vpx_ports/mem.h"
-
-extern void vp9_filter_block2d_first_pass_armv6
-(
-  unsigned char *src_ptr,
-  short         *output_ptr,
-  unsigned int src_pixels_per_line,
-  unsigned int output_width,
-  unsigned int output_height,
-  const short *vp9_filter
-);
-
-// 8x8
-extern void vp9_filter_block2d_first_pass_8x8_armv6
-(
-  unsigned char *src_ptr,
-  short         *output_ptr,
-  unsigned int src_pixels_per_line,
-  unsigned int output_width,
-  unsigned int output_height,
-  const short *vp9_filter
-);
-
-// 16x16
-extern void vp9_filter_block2d_first_pass_16x16_armv6
-(
-  unsigned char *src_ptr,
-  short         *output_ptr,
-  unsigned int src_pixels_per_line,
-  unsigned int output_width,
-  unsigned int output_height,
-  const short *vp9_filter
-);
-
-extern void vp9_filter_block2d_second_pass_armv6
-(
-  short         *src_ptr,
-  unsigned char *output_ptr,
-  unsigned int output_pitch,
-  unsigned int cnt,
-  const short *vp9_filter
-);
-
-extern void vp9_filter4_block2d_second_pass_armv6
-(
-  short         *src_ptr,
-  unsigned char *output_ptr,
-  unsigned int output_pitch,
-  unsigned int cnt,
-  const short *vp9_filter
-);
-
-extern void vp9_filter_block2d_first_pass_only_armv6
-(
-  unsigned char *src_ptr,
-  unsigned char *output_ptr,
-  unsigned int src_pixels_per_line,
-  unsigned int cnt,
-  unsigned int output_pitch,
-  const short *vp9_filter
-);
-
-
-extern void vp9_filter_block2d_second_pass_only_armv6
-(
-  unsigned char *src_ptr,
-  unsigned char *output_ptr,
-  unsigned int src_pixels_per_line,
-  unsigned int cnt,
-  unsigned int output_pitch,
-  const short *vp9_filter
-);
-
-#if HAVE_ARMV6
-void vp9_sixtap_predict_armv6
-(
-  unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  unsigned char *dst_ptr,
-  int  dst_pitch
-) {
-  const short  *HFilter;
-  const short  *VFilter;
-  DECLARE_ALIGNED_ARRAY(4, short, FData, 12 * 4); /* Temp data buffer used in filtering */
-
-
-  HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
-  VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
-
-  /* Vfilter is null. First pass only */
-  if (xoffset && !yoffset) {
-    /*vp9_filter_block2d_first_pass_armv6 ( src_ptr, FData+2, src_pixels_per_line, 4, 4, HFilter );
-    vp9_filter_block2d_second_pass_armv6 ( FData+2, dst_ptr, dst_pitch, 4, VFilter );*/
-
-    vp9_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, HFilter);
-  }
-  /* Hfilter is null. Second pass only */
-  else if (!xoffset && yoffset) {
-    vp9_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, VFilter);
-  } else {
-    /* Vfilter is a 4 tap filter */
-    if (yoffset & 0x1) {
-      vp9_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 4, 7, HFilter);
-      vp9_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter);
-    }
-    /* Vfilter is 6 tap filter */
-    else {
-      vp9_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 4, 9, HFilter);
-      vp9_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter);
-    }
-  }
-}
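
The first-pass heights used above (9 for the 6-tap path, 7 for the 4-tap
path) follow from the separable-filter identity rows = out_height + vtaps - 1;
the source pointer correspondingly starts 2 (or 1) rows above the block.
A one-line sketch of the arithmetic:

    /* Intermediate rows the horizontal pass must produce (sketch).
     * 4x4: 4+6-1 = 9 or 4+4-1 = 7; 8x8: 13 or 11; 16x16: 21 or 19. */
    static int first_pass_rows(int out_height, int vtaps) {
      return out_height + vtaps - 1;
    }
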
-
-void vp9_sixtap_predict8x8_armv6
-(
-  unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  unsigned char *dst_ptr,
-  int  dst_pitch
-) {
-  const short  *HFilter;
-  const short  *VFilter;
-  DECLARE_ALIGNED_ARRAY(4, short, FData, 16 * 8); /* Temp data buffer used in filtering */
-
-  HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
-  VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
-
-  if (xoffset && !yoffset) {
-    vp9_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, HFilter);
-  }
-  /* Hfilter is null. Second pass only */
-  else if (!xoffset && yoffset) {
-    vp9_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, VFilter);
-  } else {
-    if (yoffset & 0x1) {
-      vp9_filter_block2d_first_pass_8x8_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 8, 11, HFilter);
-      vp9_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
-    } else {
-      vp9_filter_block2d_first_pass_8x8_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 8, 13, HFilter);
-      vp9_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
-    }
-  }
-}
-
-
-void vp9_sixtap_predict16x16_armv6
-(
-  unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  unsigned char *dst_ptr,
-  int  dst_pitch
-) {
-  const short  *HFilter;
-  const short  *VFilter;
-  DECLARE_ALIGNED_ARRAY(4, short, FData, 24 * 16);  /* Temp data buffer used in filtering */
-
-  HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
-  VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
-
-  if (xoffset && !yoffset) {
-    vp9_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, HFilter);
-  }
-  /* Hfilter is null. Second pass only */
-  else if (!xoffset && yoffset) {
-    vp9_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, VFilter);
-  } else {
-    if (yoffset & 0x1) {
-      vp9_filter_block2d_first_pass_16x16_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 16, 19, HFilter);
-      vp9_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
-    } else {
-      vp9_filter_block2d_first_pass_16x16_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 16, 21, HFilter);
-      vp9_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
-    }
-  }
-
-}
-#endif
--- a/vp8/common/arm/idct_arm.h
+++ /dev/null
@@ -1,65 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef IDCT_ARM_H
-#define IDCT_ARM_H
-
-#if HAVE_ARMV6
-extern prototype_idct(vp9_short_idct4x4llm_1_v6);
-extern prototype_idct(vp9_short_idct4x4llm_v6_dual);
-extern prototype_idct_scalar_add(vp9_dc_only_idct_add_v6);
-extern prototype_second_order(vp9_short_inv_walsh4x4_1_v6);
-extern prototype_second_order(vp9_short_inv_walsh4x4_v6);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp9_idct_idct1
-#define vp9_idct_idct1 vp9_short_idct4x4llm_1_v6
-
-#undef  vp9_idct_idct16
-#define vp9_idct_idct16 vp9_short_idct4x4llm_v6_dual
-
-#undef  vp9_idct_idct1_scalar_add
-#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_v6
-
-#undef  vp8_idct_iwalsh1
-#define vp8_idct_iwalsh1 vp9_short_inv_walsh4x4_1_v6
-
-#undef  vp8_idct_iwalsh16
-#define vp8_idct_iwalsh16 vp9_short_inv_walsh4x4_v6
-#endif
-#endif
-
-#if HAVE_ARMV7
-extern prototype_idct(vp9_short_idct4x4llm_1_neon);
-extern prototype_idct(vp9_short_idct4x4llm_neon);
-extern prototype_idct_scalar_add(vp9_dc_only_idct_add_neon);
-extern prototype_second_order(vp9_short_inv_walsh4x4_1_neon);
-extern prototype_second_order(vp9_short_inv_walsh4x4_neon);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp9_idct_idct1
-#define vp9_idct_idct1 vp9_short_idct4x4llm_1_neon
-
-#undef  vp9_idct_idct16
-#define vp9_idct_idct16 vp9_short_idct4x4llm_neon
-
-#undef  vp9_idct_idct1_scalar_add
-#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_neon
-
-#undef  vp8_idct_iwalsh1
-#define vp8_idct_iwalsh1 vp9_short_inv_walsh4x4_1_neon
-
-#undef  vp8_idct_iwalsh16
-#define vp8_idct_iwalsh16 vp9_short_inv_walsh4x4_neon
-#endif
-#endif
-
-#endif
--- a/vp8/common/arm/loopfilter_arm.c
+++ /dev/null
@@ -1,166 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_config.h"
-#include "vp8/common/loopfilter.h"
-#include "vp8/common/onyxc_int.h"
-
-#if HAVE_ARMV6
-extern prototype_loopfilter(vp9_loop_filter_horizontal_edge_armv6);
-extern prototype_loopfilter(vp9_loop_filter_vertical_edge_armv6);
-extern prototype_loopfilter(vp9_mbloop_filter_horizontal_edge_armv6);
-extern prototype_loopfilter(vp9_mbloop_filter_vertical_edge_armv6);
-#endif
-
-#if HAVE_ARMV7
-typedef void loopfilter_y_neon(unsigned char *src, int pitch,
-                               unsigned char blimit, unsigned char limit, unsigned char thresh);
-typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
-                                unsigned char blimit, unsigned char limit, unsigned char thresh,
-                                unsigned char *v);
-
-extern loopfilter_y_neon vp9_loop_filter_horizontal_edge_y_neon;
-extern loopfilter_y_neon vp9_loop_filter_vertical_edge_y_neon;
-extern loopfilter_y_neon vp9_mbloop_filter_horizontal_edge_y_neon;
-extern loopfilter_y_neon vp9_mbloop_filter_vertical_edge_y_neon;
-
-extern loopfilter_uv_neon vp9_loop_filter_horizontal_edge_uv_neon;
-extern loopfilter_uv_neon vp9_loop_filter_vertical_edge_uv_neon;
-extern loopfilter_uv_neon vp9_mbloop_filter_horizontal_edge_uv_neon;
-extern loopfilter_uv_neon vp9_mbloop_filter_vertical_edge_uv_neon;
-#endif
-
-#if HAVE_ARMV6
-/*ARMV6 loopfilter functions*/
-/* Horizontal MB filtering */
-void vp9_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi) {
-  vp9_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp9_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp9_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-}
-
-/* Vertical MB Filtering */
-void vp9_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi) {
-  vp9_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp9_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp9_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-}
-
-/* Horizontal B Filtering */
-void vp9_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi) {
-  vp9_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-  vp9_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-  vp9_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp9_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp9_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
-}
-
-void vp9_loop_filter_bhs_armv6(unsigned char *y_ptr, int y_stride,
-                               const unsigned char *blimit) {
-  vp9_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, blimit);
-}
-
-/* Vertical B Filtering */
-void vp9_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi) {
-  vp9_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-  vp9_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-  vp9_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp9_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp9_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
-}
-
-void vp9_loop_filter_bvs_armv6(unsigned char *y_ptr, int y_stride,
-                               const unsigned char *blimit) {
-  vp9_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, blimit);
-  vp9_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, blimit);
-  vp9_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, blimit);
-}
-#endif
-
-#if HAVE_ARMV7
-/* NEON loopfilter functions */
-/* Horizontal MB filtering */
-void vp9_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi) {
-  unsigned char mblim = *lfi->mblim;
-  unsigned char lim = *lfi->lim;
-  unsigned char hev_thr = *lfi->hev_thr;
-  vp9_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr);
-
-  if (u_ptr)
-    vp9_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
-}
-
-/* Vertical MB Filtering */
-void vp9_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi) {
-  unsigned char mblim = *lfi->mblim;
-  unsigned char lim = *lfi->lim;
-  unsigned char hev_thr = *lfi->hev_thr;
-
-  vp9_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr);
-
-  if (u_ptr)
-    vp9_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
-}
-
-/* Horizontal B Filtering */
-void vp9_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                             int y_stride, int uv_stride, loop_filter_info *lfi) {
-  unsigned char blim = *lfi->blim;
-  unsigned char lim = *lfi->lim;
-  unsigned char hev_thr = *lfi->hev_thr;
-
-  vp9_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, blim, lim, hev_thr);
-  vp9_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, blim, lim, hev_thr);
-  vp9_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, blim, lim, hev_thr);
-
-  if (u_ptr)
-    vp9_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, blim, lim, hev_thr, v_ptr + 4 * uv_stride);
-}
-
-/* Vertical B Filtering */
-void vp9_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                             int y_stride, int uv_stride, loop_filter_info *lfi) {
-  unsigned char blim = *lfi->blim;
-  unsigned char lim = *lfi->lim;
-  unsigned char hev_thr = *lfi->hev_thr;
-
-  vp9_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, blim, lim, hev_thr);
-  vp9_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, blim, lim, hev_thr);
-  vp9_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, blim, lim, hev_thr);
-
-  if (u_ptr)
-    vp9_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, blim, lim, hev_thr, v_ptr + 4);
-}
-#endif
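
Two patterns in these wrappers are worth noting: the NEON variants
dereference lfi->mblim/blim/lim/hev_thr into plain bytes because the NEON
kernels take thresholds by value and broadcast them across vector lanes, and
the B (block) filters visit the three interior edges of the 16x16
macroblock. A sketch of those edge offsets, illustrative only:

    /* Interior edge positions filtered by the B variants: the edge between
     * 4x4 sub-blocks n and n+1 sits at pixel offset 4 * (n + 1), i.e.
     * 4, 8 and 12 (scaled by the stride for horizontal edges). */
    static void b_edge_offsets(int offsets[3]) {
      int n;
      for (n = 0; n < 3; n++)
        offsets[n] = 4 * (n + 1);
    }
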
--- a/vp8/common/arm/loopfilter_arm.h
+++ /dev/null
@@ -1,41 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef LOOPFILTER_ARM_H
-#define LOOPFILTER_ARM_H
-
-#include "vpx_config.h"
-
-#if HAVE_ARMV6
-extern prototype_loopfilter_block(vp9_loop_filter_mbv_armv6);
-extern prototype_loopfilter_block(vp9_loop_filter_bv_armv6);
-extern prototype_loopfilter_block(vp9_loop_filter_mbh_armv6);
-extern prototype_loopfilter_block(vp9_loop_filter_bh_armv6);
-extern prototype_simple_loopfilter(vp9_loop_filter_bvs_armv6);
-extern prototype_simple_loopfilter(vp9_loop_filter_bhs_armv6);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_armv6);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_armv6);
-
-#endif /* HAVE_ARMV6 */
-
-#if HAVE_ARMV7
-extern prototype_loopfilter_block(vp9_loop_filter_mbv_neon);
-extern prototype_loopfilter_block(vp9_loop_filter_bv_neon);
-extern prototype_loopfilter_block(vp9_loop_filter_mbh_neon);
-extern prototype_loopfilter_block(vp9_loop_filter_bh_neon);
-extern prototype_simple_loopfilter(vp9_loop_filter_mbvs_neon);
-extern prototype_simple_loopfilter(vp9_loop_filter_bvs_neon);
-extern prototype_simple_loopfilter(vp9_loop_filter_mbhs_neon);
-extern prototype_simple_loopfilter(vp9_loop_filter_bhs_neon);
-
-#endif /* HAVE_ARMV7 */
-
-#endif /* LOOPFILTER_ARM_H */
--- a/vp8/common/arm/neon/bilinearpredict16x16_neon.asm
+++ /dev/null
@@ -1,357 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_bilinear_predict16x16_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0    unsigned char  *src_ptr,
-; r1    int  src_pixels_per_line,
-; r2    int  xoffset,
-; r3    int  yoffset,
-; r4    unsigned char *dst_ptr,
-; stack(r5) int  dst_pitch
-
-|vp8_bilinear_predict16x16_neon| PROC
-    push            {r4-r5, lr}
-
-    adr             r12, bifilter16_coeff
-    ldr             r4, [sp, #12]           ;load parameters from stack
-    ldr             r5, [sp, #16]           ;load parameters from stack
-
-    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
-    beq             secondpass_bfilter16x16_only
-
-    add             r2, r12, r2, lsl #3     ;calculate filter location
-
-    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
-
-    vld1.s32        {d31}, [r2]             ;load first_pass filter
-
-    beq             firstpass_bfilter16x16_only
-
-    sub             sp, sp, #272            ;reserve space on stack for temporary storage
-    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data
-    mov             lr, sp
-    vld1.u8         {d5, d6, d7}, [r0], r1
-
-    mov             r2, #3                  ;loop counter
-    vld1.u8         {d8, d9, d10}, [r0], r1
-
-    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1)
-    vld1.u8         {d11, d12, d13}, [r0], r1
-
-    vdup.8          d1, d31[4]
-
-;First Pass: output_height lines x output_width columns (17x16)
-filt_blk2d_fp16x16_loop_neon
-    pld             [r0]
-    pld             [r0, r1]
-    pld             [r0, r1, lsl #1]
-
-    vmull.u8        q7, d2, d0              ;(src_ptr[0] * vp9_filter[0])
-    vmull.u8        q8, d3, d0
-    vmull.u8        q9, d5, d0
-    vmull.u8        q10, d6, d0
-    vmull.u8        q11, d8, d0
-    vmull.u8        q12, d9, d0
-    vmull.u8        q13, d11, d0
-    vmull.u8        q14, d12, d0
-
-    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
-    vext.8          d5, d5, d6, #1
-    vext.8          d8, d8, d9, #1
-    vext.8          d11, d11, d12, #1
-
-    vmlal.u8        q7, d2, d1              ;(src_ptr[1] * vp9_filter[1])
-    vmlal.u8        q9, d5, d1
-    vmlal.u8        q11, d8, d1
-    vmlal.u8        q13, d11, d1
-
-    vext.8          d3, d3, d4, #1
-    vext.8          d6, d6, d7, #1
-    vext.8          d9, d9, d10, #1
-    vext.8          d12, d12, d13, #1
-
-    vmlal.u8        q8, d3, d1              ;(src_ptr[1] * vp9_filter[1])
-    vmlal.u8        q10, d6, d1
-    vmlal.u8        q12, d9, d1
-    vmlal.u8        q14, d12, d1
-
-    subs            r2, r2, #1
-
-    vqrshrn.u16    d14, q7, #7              ;shift/round/saturate to u8
-    vqrshrn.u16    d15, q8, #7
-    vqrshrn.u16    d16, q9, #7
-    vqrshrn.u16    d17, q10, #7
-    vqrshrn.u16    d18, q11, #7
-    vqrshrn.u16    d19, q12, #7
-    vqrshrn.u16    d20, q13, #7
-
-    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data
-    vqrshrn.u16    d21, q14, #7
-    vld1.u8         {d5, d6, d7}, [r0], r1
-
-    vst1.u8         {d14, d15, d16, d17}, [lr]!     ;store result
-    vld1.u8         {d8, d9, d10}, [r0], r1
-    vst1.u8         {d18, d19, d20, d21}, [lr]!
-    vld1.u8         {d11, d12, d13}, [r0], r1
-
-    bne             filt_blk2d_fp16x16_loop_neon
-
-;First-pass filtering for the remaining 5 lines
-    vld1.u8         {d14, d15, d16}, [r0], r1
-
-    vmull.u8        q9, d2, d0              ;(src_ptr[0] * vp9_filter[0])
-    vmull.u8        q10, d3, d0
-    vmull.u8        q11, d5, d0
-    vmull.u8        q12, d6, d0
-    vmull.u8        q13, d8, d0
-    vmull.u8        q14, d9, d0
-
-    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
-    vext.8          d5, d5, d6, #1
-    vext.8          d8, d8, d9, #1
-
-    vmlal.u8        q9, d2, d1              ;(src_ptr[1] * vp9_filter[1])
-    vmlal.u8        q11, d5, d1
-    vmlal.u8        q13, d8, d1
-
-    vext.8          d3, d3, d4, #1
-    vext.8          d6, d6, d7, #1
-    vext.8          d9, d9, d10, #1
-
-    vmlal.u8        q10, d3, d1             ;(src_ptr[1] * vp9_filter[1])
-    vmlal.u8        q12, d6, d1
-    vmlal.u8        q14, d9, d1
-
-    vmull.u8        q1, d11, d0
-    vmull.u8        q2, d12, d0
-    vmull.u8        q3, d14, d0
-    vmull.u8        q4, d15, d0
-
-    vext.8          d11, d11, d12, #1       ;construct src_ptr[1]
-    vext.8          d14, d14, d15, #1
-
-    vmlal.u8        q1, d11, d1             ;(src_ptr[1] * vp9_filter[1])
-    vmlal.u8        q3, d14, d1
-
-    vext.8          d12, d12, d13, #1
-    vext.8          d15, d15, d16, #1
-
-    vmlal.u8        q2, d12, d1             ;(src_ptr[1] * vp9_filter[1])
-    vmlal.u8        q4, d15, d1
-
-    vqrshrn.u16    d10, q9, #7              ;shift/round/saturate to u8
-    vqrshrn.u16    d11, q10, #7
-    vqrshrn.u16    d12, q11, #7
-    vqrshrn.u16    d13, q12, #7
-    vqrshrn.u16    d14, q13, #7
-    vqrshrn.u16    d15, q14, #7
-    vqrshrn.u16    d16, q1, #7
-    vqrshrn.u16    d17, q2, #7
-    vqrshrn.u16    d18, q3, #7
-    vqrshrn.u16    d19, q4, #7
-
-    vst1.u8         {d10, d11, d12, d13}, [lr]!         ;store result
-    vst1.u8         {d14, d15, d16, d17}, [lr]!
-    vst1.u8         {d18, d19}, [lr]!
-
-;Second pass: 16x16
-;secondpass_filter
-    add             r3, r12, r3, lsl #3
-    sub             lr, lr, #272
-
-    vld1.u32        {d31}, [r3]             ;load second_pass filter
-
-    vld1.u8         {d22, d23}, [lr]!       ;load src data
-
-    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
-    vdup.8          d1, d31[4]
-    mov             r12, #4                 ;loop counter
-
-filt_blk2d_sp16x16_loop_neon
-    vld1.u8         {d24, d25}, [lr]!
-    vmull.u8        q1, d22, d0             ;(src_ptr[0] * vp9_filter[0])
-    vld1.u8         {d26, d27}, [lr]!
-    vmull.u8        q2, d23, d0
-    vld1.u8         {d28, d29}, [lr]!
-    vmull.u8        q3, d24, d0
-    vld1.u8         {d30, d31}, [lr]!
-
-    vmull.u8        q4, d25, d0
-    vmull.u8        q5, d26, d0
-    vmull.u8        q6, d27, d0
-    vmull.u8        q7, d28, d0
-    vmull.u8        q8, d29, d0
-
-    vmlal.u8        q1, d24, d1             ;(src_ptr[pixel_step] * vp9_filter[1])
-    vmlal.u8        q2, d25, d1
-    vmlal.u8        q3, d26, d1
-    vmlal.u8        q4, d27, d1
-    vmlal.u8        q5, d28, d1
-    vmlal.u8        q6, d29, d1
-    vmlal.u8        q7, d30, d1
-    vmlal.u8        q8, d31, d1
-
-    subs            r12, r12, #1
-
-    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
-    vqrshrn.u16    d3, q2, #7
-    vqrshrn.u16    d4, q3, #7
-    vqrshrn.u16    d5, q4, #7
-    vqrshrn.u16    d6, q5, #7
-    vqrshrn.u16    d7, q6, #7
-    vqrshrn.u16    d8, q7, #7
-    vqrshrn.u16    d9, q8, #7
-
-    vst1.u8         {d2, d3}, [r4], r5      ;store result
-    vst1.u8         {d4, d5}, [r4], r5
-    vst1.u8         {d6, d7}, [r4], r5
-    vmov            q11, q15
-    vst1.u8         {d8, d9}, [r4], r5
-
-    bne             filt_blk2d_sp16x16_loop_neon
-
-    add             sp, sp, #272
-
-    pop             {r4-r5,pc}
-
-;--------------------
-firstpass_bfilter16x16_only
-    mov             r2, #4                      ;loop counter
-    vdup.8          d0, d31[0]                  ;first_pass filter (d0 d1)
-    vdup.8          d1, d31[4]
-
-;First Pass: output_height lines x output_width columns (16x16)
-filt_blk2d_fpo16x16_loop_neon
-    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data
-    vld1.u8         {d5, d6, d7}, [r0], r1
-    vld1.u8         {d8, d9, d10}, [r0], r1
-    vld1.u8         {d11, d12, d13}, [r0], r1
-
-    pld             [r0]
-    pld             [r0, r1]
-    pld             [r0, r1, lsl #1]
-
-    vmull.u8        q7, d2, d0              ;(src_ptr[0] * vp9_filter[0])
-    vmull.u8        q8, d3, d0
-    vmull.u8        q9, d5, d0
-    vmull.u8        q10, d6, d0
-    vmull.u8        q11, d8, d0
-    vmull.u8        q12, d9, d0
-    vmull.u8        q13, d11, d0
-    vmull.u8        q14, d12, d0
-
-    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
-    vext.8          d5, d5, d6, #1
-    vext.8          d8, d8, d9, #1
-    vext.8          d11, d11, d12, #1
-
-    vmlal.u8        q7, d2, d1              ;(src_ptr[1] * vp9_filter[1])
-    vmlal.u8        q9, d5, d1
-    vmlal.u8        q11, d8, d1
-    vmlal.u8        q13, d11, d1
-
-    vext.8          d3, d3, d4, #1
-    vext.8          d6, d6, d7, #1
-    vext.8          d9, d9, d10, #1
-    vext.8          d12, d12, d13, #1
-
-    vmlal.u8        q8, d3, d1              ;(src_ptr[1] * vp9_filter[1])
-    vmlal.u8        q10, d6, d1
-    vmlal.u8        q12, d9, d1
-    vmlal.u8        q14, d12, d1
-
-    subs            r2, r2, #1
-
-    vqrshrn.u16    d14, q7, #7              ;shift/round/saturate to u8
-    vqrshrn.u16    d15, q8, #7
-    vqrshrn.u16    d16, q9, #7
-    vqrshrn.u16    d17, q10, #7
-    vqrshrn.u16    d18, q11, #7
-    vqrshrn.u16    d19, q12, #7
-    vqrshrn.u16    d20, q13, #7
-    vst1.u8         {d14, d15}, [r4], r5        ;store result
-    vqrshrn.u16    d21, q14, #7
-
-    vst1.u8         {d16, d17}, [r4], r5
-    vst1.u8         {d18, d19}, [r4], r5
-    vst1.u8         {d20, d21}, [r4], r5
-
-    bne             filt_blk2d_fpo16x16_loop_neon
-    pop             {r4-r5,pc}
-
-;---------------------
-secondpass_bfilter16x16_only
-;Second pass: 16x16
-;secondpass_filter
-    add             r3, r12, r3, lsl #3
-    mov             r12, #4                     ;loop counter
-    vld1.u32        {d31}, [r3]                 ;load second_pass filter
-    vld1.u8         {d22, d23}, [r0], r1        ;load src data
-
-    vdup.8          d0, d31[0]                  ;second_pass filter parameters (d0 d1)
-    vdup.8          d1, d31[4]
-
-filt_blk2d_spo16x16_loop_neon
-    vld1.u8         {d24, d25}, [r0], r1
-    vmull.u8        q1, d22, d0             ;(src_ptr[0] * vp9_filter[0])
-    vld1.u8         {d26, d27}, [r0], r1
-    vmull.u8        q2, d23, d0
-    vld1.u8         {d28, d29}, [r0], r1
-    vmull.u8        q3, d24, d0
-    vld1.u8         {d30, d31}, [r0], r1
-
-    vmull.u8        q4, d25, d0
-    vmull.u8        q5, d26, d0
-    vmull.u8        q6, d27, d0
-    vmull.u8        q7, d28, d0
-    vmull.u8        q8, d29, d0
-
-    vmlal.u8        q1, d24, d1             ;(src_ptr[pixel_step] * vp9_filter[1])
-    vmlal.u8        q2, d25, d1
-    vmlal.u8        q3, d26, d1
-    vmlal.u8        q4, d27, d1
-    vmlal.u8        q5, d28, d1
-    vmlal.u8        q6, d29, d1
-    vmlal.u8        q7, d30, d1
-    vmlal.u8        q8, d31, d1
-
-    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
-    vqrshrn.u16    d3, q2, #7
-    vqrshrn.u16    d4, q3, #7
-    vqrshrn.u16    d5, q4, #7
-    vqrshrn.u16    d6, q5, #7
-    vqrshrn.u16    d7, q6, #7
-    vqrshrn.u16    d8, q7, #7
-    vqrshrn.u16    d9, q8, #7
-
-    vst1.u8         {d2, d3}, [r4], r5      ;store result
-    subs            r12, r12, #1
-    vst1.u8         {d4, d5}, [r4], r5
-    vmov            q11, q15
-    vst1.u8         {d6, d7}, [r4], r5
-    vst1.u8         {d8, d9}, [r4], r5
-
-    bne             filt_blk2d_spo16x16_loop_neon
-    pop             {r4-r5,pc}
-
-    ENDP
-
-;-----------------
-
-bifilter16_coeff
-    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
-
-    END
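
bifilter16_coeff is indexed with "add r2, r12, r2, lsl #3": each 1/8-pel
offset selects an 8-byte {first, second} tap pair, and every pair sums to
128. A sketch of that relationship:

    /* Tap pair for a 1/8-pel offset (sketch): offset 0 -> {128, 0},
     * offset 5 -> {48, 80}, matching the bifilter16_coeff table above. */
    static void bilinear_taps(int offset, int *first, int *second) {
      *second = offset * 16;
      *first  = 128 - *second;
    }
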
--- a/vp8/common/arm/neon/bilinearpredict4x4_neon.asm
+++ /dev/null
@@ -1,130 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_bilinear_predict4x4_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0    unsigned char  *src_ptr,
-; r1    int  src_pixels_per_line,
-; r2    int  xoffset,
-; r3    int  yoffset,
-; r4    unsigned char *dst_ptr,
-; stack(lr) int  dst_pitch
-
-|vp8_bilinear_predict4x4_neon| PROC
-    push            {r4, lr}
-
-    adr             r12, bifilter4_coeff
-    ldr             r4, [sp, #8]            ;load parameters from stack
-    ldr             lr, [sp, #12]           ;load parameters from stack
-
-    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
-    beq             skip_firstpass_filter
-
-;First pass: output_height lines x output_width columns (5x4)
-    vld1.u8         {d2}, [r0], r1          ;load src data
-    add             r2, r12, r2, lsl #3     ;calculate Hfilter location (2 coeffs x 4 bytes = 8 bytes)
-
-    vld1.u8         {d3}, [r0], r1
-    vld1.u32        {d31}, [r2]             ;first_pass filter
-
-    vld1.u8         {d4}, [r0], r1
-    vdup.8          d0, d31[0]              ;first_pass filter (d0-d1)
-    vld1.u8         {d5}, [r0], r1
-    vdup.8          d1, d31[4]
-    vld1.u8         {d6}, [r0], r1
-
-    vshr.u64        q4, q1, #8              ;construct src_ptr[1]
-    vshr.u64        q5, q2, #8
-    vshr.u64        d12, d6, #8
-
-    vzip.32         d2, d3                  ;put 2-line data in 1 register (src_ptr[0])
-    vzip.32         d4, d5
-    vzip.32         d8, d9                  ;put 2-line data in 1 register (src_ptr[1])
-    vzip.32         d10, d11
-
-    vmull.u8        q7, d2, d0              ;(src_ptr[0] * vp9_filter[0])
-    vmull.u8        q8, d4, d0
-    vmull.u8        q9, d6, d0
-
-    vmlal.u8        q7, d8, d1              ;(src_ptr[1] * vp9_filter[1])
-    vmlal.u8        q8, d10, d1
-    vmlal.u8        q9, d12, d1
-
-    vqrshrn.u16    d28, q7, #7              ;shift/round/saturate to u8
-    vqrshrn.u16    d29, q8, #7
-    vqrshrn.u16    d30, q9, #7
-
-;Second pass: 4x4
-secondpass_filter
-    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
-    beq             skip_secondpass_filter
-
-    add             r3, r12, r3, lsl #3 ;calculate Vfilter location
-    vld1.u32        {d31}, [r3]         ;load second_pass filter
-
-    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0-d5)
-    vdup.8          d1, d31[4]
-
-    vmull.u8        q1, d28, d0
-    vmull.u8        q2, d29, d0
-
-    vext.8          d26, d28, d29, #4       ;construct src_ptr[pixel_step]
-    vext.8          d27, d29, d30, #4
-
-    vmlal.u8        q1, d26, d1
-    vmlal.u8        q2, d27, d1
-
-    add             r0, r4, lr
-    add             r1, r0, lr
-    add             r2, r1, lr
-
-    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
-    vqrshrn.u16    d3, q2, #7
-
-    vst1.32         {d2[0]}, [r4]           ;store result
-    vst1.32         {d2[1]}, [r0]
-    vst1.32         {d3[0]}, [r1]
-    vst1.32         {d3[1]}, [r2]
-
-    pop             {r4, pc}
-
-;--------------------
-skip_firstpass_filter
-
-    vld1.32         {d28[0]}, [r0], r1      ;load src data
-    vld1.32         {d28[1]}, [r0], r1
-    vld1.32         {d29[0]}, [r0], r1
-    vld1.32         {d29[1]}, [r0], r1
-    vld1.32         {d30[0]}, [r0], r1
-
-    b               secondpass_filter
-
-;---------------------
-skip_secondpass_filter
-    vst1.32         {d28[0]}, [r4], lr      ;store result
-    vst1.32         {d28[1]}, [r4], lr
-    vst1.32         {d29[0]}, [r4], lr
-    vst1.32         {d29[1]}, [r4], lr
-
-    pop             {r4, pc}
-
-    ENDP
-
-;-----------------
-
-bifilter4_coeff
-    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
-
-    END
--- a/vp8/common/arm/neon/bilinearpredict8x4_neon.asm
+++ /dev/null
@@ -1,135 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_bilinear_predict8x4_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0    unsigned char  *src_ptr,
-; r1    int  src_pixels_per_line,
-; r2    int  xoffset,
-; r3    int  yoffset,
-; r4    unsigned char *dst_ptr,
-; stack(lr) int  dst_pitch
-
-|vp8_bilinear_predict8x4_neon| PROC
-    push            {r4, lr}
-
-    adr             r12, bifilter8x4_coeff
-    ldr             r4, [sp, #8]            ;load parameters from stack
-    ldr             lr, [sp, #12]           ;load parameters from stack
-
-    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
-    beq             skip_firstpass_filter
-
-;First pass: output_height lines x output_width columns (5x8)
-    add             r2, r12, r2, lsl #3     ;calculate filter location
-
-    vld1.u8         {q1}, [r0], r1          ;load src data
-    vld1.u32        {d31}, [r2]             ;load first_pass filter
-    vld1.u8         {q2}, [r0], r1
-    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1)
-    vld1.u8         {q3}, [r0], r1
-    vdup.8          d1, d31[4]
-    vld1.u8         {q4}, [r0], r1
-
-    vmull.u8        q6, d2, d0              ;(src_ptr[0] * vp9_filter[0])
-    vld1.u8         {q5}, [r0], r1
-    vmull.u8        q7, d4, d0
-    vmull.u8        q8, d6, d0
-    vmull.u8        q9, d8, d0
-    vmull.u8        q10, d10, d0
-
-    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
-    vext.8          d5, d4, d5, #1
-    vext.8          d7, d6, d7, #1
-    vext.8          d9, d8, d9, #1
-    vext.8          d11, d10, d11, #1
-
-    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * vp9_filter[1])
-    vmlal.u8        q7, d5, d1
-    vmlal.u8        q8, d7, d1
-    vmlal.u8        q9, d9, d1
-    vmlal.u8        q10, d11, d1
-
-    vqrshrn.u16    d22, q6, #7              ;shift/round/saturate to u8
-    vqrshrn.u16    d23, q7, #7
-    vqrshrn.u16    d24, q8, #7
-    vqrshrn.u16    d25, q9, #7
-    vqrshrn.u16    d26, q10, #7
-
-;Second pass: 4x8
-secondpass_filter
-    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
-    beq             skip_secondpass_filter
-
-    add             r3, r12, r3, lsl #3
-    add             r0, r4, lr
-
-    vld1.u32        {d31}, [r3]             ;load second_pass filter
-    add             r1, r0, lr
-
-    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
-    vdup.8          d1, d31[4]
-
-    vmull.u8        q1, d22, d0             ;(src_ptr[0] * vp9_filter[0])
-    vmull.u8        q2, d23, d0
-    vmull.u8        q3, d24, d0
-    vmull.u8        q4, d25, d0
-
-    vmlal.u8        q1, d23, d1             ;(src_ptr[pixel_step] * vp9_filter[1])
-    vmlal.u8        q2, d24, d1
-    vmlal.u8        q3, d25, d1
-    vmlal.u8        q4, d26, d1
-
-    add             r2, r1, lr
-
-    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
-    vqrshrn.u16    d3, q2, #7
-    vqrshrn.u16    d4, q3, #7
-    vqrshrn.u16    d5, q4, #7
-
-    vst1.u8         {d2}, [r4]              ;store result
-    vst1.u8         {d3}, [r0]
-    vst1.u8         {d4}, [r1]
-    vst1.u8         {d5}, [r2]
-
-    pop             {r4, pc}
-
-;--------------------
-skip_firstpass_filter
-    vld1.u8         {d22}, [r0], r1         ;load src data
-    vld1.u8         {d23}, [r0], r1
-    vld1.u8         {d24}, [r0], r1
-    vld1.u8         {d25}, [r0], r1
-    vld1.u8         {d26}, [r0], r1
-
-    b               secondpass_filter
-
-;---------------------
-skip_secondpass_filter
-    vst1.u8         {d22}, [r4], lr         ;store result
-    vst1.u8         {d23}, [r4], lr
-    vst1.u8         {d24}, [r4], lr
-    vst1.u8         {d25}, [r4], lr
-
-    pop             {r4, pc}
-
-    ENDP
-
-;-----------------
-
-bifilter8x4_coeff
-    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
-
-    END
--- a/vp8/common/arm/neon/bilinearpredict8x8_neon.asm
+++ /dev/null
@@ -1,183 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_bilinear_predict8x8_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0    unsigned char  *src_ptr,
-; r1    int  src_pixels_per_line,
-; r2    int  xoffset,
-; r3    int  yoffset,
-; r4    unsigned char *dst_ptr,
-; stack(lr) int  dst_pitch
-
-|vp8_bilinear_predict8x8_neon| PROC
-    push            {r4, lr}
-
-    adr             r12, bifilter8_coeff
-    ldr             r4, [sp, #8]            ;load parameters from stack
-    ldr             lr, [sp, #12]           ;load parameters from stack
-
-    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
-    beq             skip_firstpass_filter
-
-;First pass: output_height lines x output_width columns (9x8)
-    add             r2, r12, r2, lsl #3     ;calculate filter location
-
-    vld1.u8         {q1}, [r0], r1          ;load src data
-    vld1.u32        {d31}, [r2]             ;load first_pass filter
-    vld1.u8         {q2}, [r0], r1
-    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1)
-    vld1.u8         {q3}, [r0], r1
-    vdup.8          d1, d31[4]
-    vld1.u8         {q4}, [r0], r1
-
-    vmull.u8        q6, d2, d0              ;(src_ptr[0] * vp9_filter[0])
-    vmull.u8        q7, d4, d0
-    vmull.u8        q8, d6, d0
-    vmull.u8        q9, d8, d0
-
-    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
-    vext.8          d5, d4, d5, #1
-    vext.8          d7, d6, d7, #1
-    vext.8          d9, d8, d9, #1
-
-    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * vp9_filter[1])
-    vmlal.u8        q7, d5, d1
-    vmlal.u8        q8, d7, d1
-    vmlal.u8        q9, d9, d1
-
-    vld1.u8         {q1}, [r0], r1          ;load src data
-    vqrshrn.u16    d22, q6, #7              ;shift/round/saturate to u8
-    vld1.u8         {q2}, [r0], r1
-    vqrshrn.u16    d23, q7, #7
-    vld1.u8         {q3}, [r0], r1
-    vqrshrn.u16    d24, q8, #7
-    vld1.u8         {q4}, [r0], r1
-    vqrshrn.u16    d25, q9, #7
-
-    ;first_pass filtering on the remaining 5 lines of data
-    vld1.u8         {q5}, [r0], r1
-
-    vmull.u8        q6, d2, d0              ;(src_ptr[0] * vp9_filter[0])
-    vmull.u8        q7, d4, d0
-    vmull.u8        q8, d6, d0
-    vmull.u8        q9, d8, d0
-    vmull.u8        q10, d10, d0
-
-    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
-    vext.8          d5, d4, d5, #1
-    vext.8          d7, d6, d7, #1
-    vext.8          d9, d8, d9, #1
-    vext.8          d11, d10, d11, #1
-
-    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * vp9_filter[1])
-    vmlal.u8        q7, d5, d1
-    vmlal.u8        q8, d7, d1
-    vmlal.u8        q9, d9, d1
-    vmlal.u8        q10, d11, d1
-
-    vqrshrn.u16    d26, q6, #7              ;shift/round/saturate to u8
-    vqrshrn.u16    d27, q7, #7
-    vqrshrn.u16    d28, q8, #7
-    vqrshrn.u16    d29, q9, #7
-    vqrshrn.u16    d30, q10, #7
-
-;Second pass: 8x8
-secondpass_filter
-    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
-    beq             skip_secondpass_filter
-
-    add             r3, r12, r3, lsl #3
-    add             r0, r4, lr
-
-    vld1.u32        {d31}, [r3]             ;load second_pass filter
-    add             r1, r0, lr
-
-    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
-    vdup.8          d1, d31[4]
-
-    vmull.u8        q1, d22, d0             ;(src_ptr[0] * vp9_filter[0])
-    vmull.u8        q2, d23, d0
-    vmull.u8        q3, d24, d0
-    vmull.u8        q4, d25, d0
-    vmull.u8        q5, d26, d0
-    vmull.u8        q6, d27, d0
-    vmull.u8        q7, d28, d0
-    vmull.u8        q8, d29, d0
-
-    vmlal.u8        q1, d23, d1             ;(src_ptr[pixel_step] * vp9_filter[1])
-    vmlal.u8        q2, d24, d1
-    vmlal.u8        q3, d25, d1
-    vmlal.u8        q4, d26, d1
-    vmlal.u8        q5, d27, d1
-    vmlal.u8        q6, d28, d1
-    vmlal.u8        q7, d29, d1
-    vmlal.u8        q8, d30, d1
-
-    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
-    vqrshrn.u16    d3, q2, #7
-    vqrshrn.u16    d4, q3, #7
-    vqrshrn.u16    d5, q4, #7
-    vqrshrn.u16    d6, q5, #7
-    vqrshrn.u16    d7, q6, #7
-    vqrshrn.u16    d8, q7, #7
-    vqrshrn.u16    d9, q8, #7
-
-    vst1.u8         {d2}, [r4]              ;store result
-    vst1.u8         {d3}, [r0]
-    vst1.u8         {d4}, [r1], lr
-    vst1.u8         {d5}, [r1], lr
-    vst1.u8         {d6}, [r1], lr
-    vst1.u8         {d7}, [r1], lr
-    vst1.u8         {d8}, [r1], lr
-    vst1.u8         {d9}, [r1], lr
-
-    pop             {r4, pc}
-
-;--------------------
-skip_firstpass_filter
-    vld1.u8         {d22}, [r0], r1         ;load src data
-    vld1.u8         {d23}, [r0], r1
-    vld1.u8         {d24}, [r0], r1
-    vld1.u8         {d25}, [r0], r1
-    vld1.u8         {d26}, [r0], r1
-    vld1.u8         {d27}, [r0], r1
-    vld1.u8         {d28}, [r0], r1
-    vld1.u8         {d29}, [r0], r1
-    vld1.u8         {d30}, [r0], r1
-
-    b               secondpass_filter
-
-;---------------------
-skip_secondpass_filter
-    vst1.u8         {d22}, [r4], lr         ;store result
-    vst1.u8         {d23}, [r4], lr
-    vst1.u8         {d24}, [r4], lr
-    vst1.u8         {d25}, [r4], lr
-    vst1.u8         {d26}, [r4], lr
-    vst1.u8         {d27}, [r4], lr
-    vst1.u8         {d28}, [r4], lr
-    vst1.u8         {d29}, [r4], lr
-
-    pop             {r4, pc}
-
-    ENDP
-
-;-----------------
-
-bifilter8_coeff
-    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
-
-    END
--- a/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm
+++ /dev/null
@@ -1,584 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_build_intra_predictors_mby_neon_func|
-    EXPORT  |vp8_build_intra_predictors_mby_s_neon_func|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0    unsigned char *y_buffer
-; r1    unsigned char *ypred_ptr
-; r2    int y_stride
-; r3    int mode
-; stack int Up
-; stack int Left
-
-|vp8_build_intra_predictors_mby_neon_func| PROC
-    push            {r4-r8, lr}
-
-    cmp             r3, #0
-    beq             case_dc_pred
-    cmp             r3, #1
-    beq             case_v_pred
-    cmp             r3, #2
-    beq             case_h_pred
-    cmp             r3, #3
-    beq             case_tm_pred
-
-case_dc_pred
-    ldr             r4, [sp, #24]       ; Up
-    ldr             r5, [sp, #28]       ; Left
-
-    ; Default the DC average to 128
-    mov             r12, #128
-    vdup.u8         q0, r12
-
-    ; Zero out running sum
-    mov             r12, #0
-
-    ; compute shift and jump
-    adds            r7, r4, r5
-    beq             skip_dc_pred_up_left
-
-    ; Load above row, if it exists
-    cmp             r4, #0
-    beq             skip_dc_pred_up
-
-    sub             r6, r0, r2
-    vld1.8          {q1}, [r6]
-    vpaddl.u8       q2, q1
-    vpaddl.u16      q3, q2
-    vpaddl.u32      q4, q3
-
-    vmov.32         r4, d8[0]
-    vmov.32         r6, d9[0]
-
-    add             r12, r4, r6
-
-    ; Move back to integer registers
-
-skip_dc_pred_up
-
-    cmp             r5, #0
-    beq             skip_dc_pred_left
-
-    sub             r0, r0, #1
-
-    ; Load left row, if it exists
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-
-    add             r12, r12, r3
-    add             r12, r12, r4
-    add             r12, r12, r5
-    add             r12, r12, r6
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-
-    add             r12, r12, r3
-    add             r12, r12, r4
-    add             r12, r12, r5
-    add             r12, r12, r6
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-
-    add             r12, r12, r3
-    add             r12, r12, r4
-    add             r12, r12, r5
-    add             r12, r12, r6
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0]
-
-    add             r12, r12, r3
-    add             r12, r12, r4
-    add             r12, r12, r5
-    add             r12, r12, r6
-
-skip_dc_pred_left
-    add             r7, r7, #3          ; Shift
-    sub             r4, r7, #1
-    mov             r5, #1
-    add             r12, r12, r5, lsl r4
-    mov             r5, r12, lsr r7     ; expected_dc
-
-    vdup.u8         q0, r5
-
-skip_dc_pred_up_left
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-
-    pop             {r4-r8,pc}
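
The expected_dc arithmetic just above rounds the running sum to the average
of the available border pixels: the shift is 3 plus one per available edge
(Up and Left appear to be 0/1 flags, judging by the shift computation),
giving divisors of 16 or 32. A sketch under that assumption:

    /* Rounded DC average (sketch; assumes up/left are 0/1 availability
     * flags, as case_dc_pred's shift computation suggests). */
    static int expected_dc(int sum, int up, int left) {
      int shift = 3 + up + left;  /* 16 border samples per available edge */
      return (sum + (1 << (shift - 1))) >> shift;
    }
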
-case_v_pred
-    ; Copy down above row
-    sub             r6, r0, r2
-    vld1.8          {q0}, [r6]
-
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    pop             {r4-r8,pc}
-
-case_h_pred
-    ; Load 4x yleft_col
-    sub             r0, r0, #1
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-    vdup.u8         q0, r3
-    vdup.u8         q1, r4
-    vdup.u8         q2, r5
-    vdup.u8         q3, r6
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q1}, [r1]!
-    vst1.u8         {q2}, [r1]!
-    vst1.u8         {q3}, [r1]!
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-    vdup.u8         q0, r3
-    vdup.u8         q1, r4
-    vdup.u8         q2, r5
-    vdup.u8         q3, r6
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q1}, [r1]!
-    vst1.u8         {q2}, [r1]!
-    vst1.u8         {q3}, [r1]!
-
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-    vdup.u8         q0, r3
-    vdup.u8         q1, r4
-    vdup.u8         q2, r5
-    vdup.u8         q3, r6
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q1}, [r1]!
-    vst1.u8         {q2}, [r1]!
-    vst1.u8         {q3}, [r1]!
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-    vdup.u8         q0, r3
-    vdup.u8         q1, r4
-    vdup.u8         q2, r5
-    vdup.u8         q3, r6
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q1}, [r1]!
-    vst1.u8         {q2}, [r1]!
-    vst1.u8         {q3}, [r1]!
-
-    pop             {r4-r8,pc}
-
-case_tm_pred
-    ; Load yabove_row
-    sub             r3, r0, r2
-    vld1.8          {q8}, [r3]
-
-    ; Load ytop_left
-    sub             r3, r3, #1
-    ldrb            r7, [r3]
-
-    vdup.u16        q7, r7
-
-    ; Compute yabove_row - ytop_left
-    mov             r3, #1
-    vdup.u8         q0, r3
-
-    vmull.u8        q4, d16, d0
-    vmull.u8        q5, d17, d0
-
-    vsub.s16        q4, q4, q7
-    vsub.s16        q5, q5, q7
-
-    ; Load 4x yleft_col
-    sub             r0, r0, #1
-    mov             r12, #4
-
-case_tm_pred_loop
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-    vdup.u16        q0, r3
-    vdup.u16        q1, r4
-    vdup.u16        q2, r5
-    vdup.u16        q3, r6
-
-    vqadd.s16       q8, q0, q4
-    vqadd.s16       q9, q0, q5
-
-    vqadd.s16       q10, q1, q4
-    vqadd.s16       q11, q1, q5
-
-    vqadd.s16       q12, q2, q4
-    vqadd.s16       q13, q2, q5
-
-    vqadd.s16       q14, q3, q4
-    vqadd.s16       q15, q3, q5
-
-    vqshrun.s16     d0, q8, #0
-    vqshrun.s16     d1, q9, #0
-
-    vqshrun.s16     d2, q10, #0
-    vqshrun.s16     d3, q11, #0
-
-    vqshrun.s16     d4, q12, #0
-    vqshrun.s16     d5, q13, #0
-
-    vqshrun.s16     d6, q14, #0
-    vqshrun.s16     d7, q15, #0
-
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q1}, [r1]!
-    vst1.u8         {q2}, [r1]!
-    vst1.u8         {q3}, [r1]!
-
-    subs            r12, r12, #1
-    bne             case_tm_pred_loop
-
-    pop             {r4-r8,pc}
-
-    ENDP
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; r0    unsigned char *y_buffer
-; r1    unsigned char *ypred_ptr
-; r2    int y_stride
-; r3    int mode
-; stack int Up
-; stack int Left
-
-|vp8_build_intra_predictors_mby_s_neon_func| PROC
-    push            {r4-r8, lr}
-
-    mov             r1, r0      ;   unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor;
-
-    cmp             r3, #0
-    beq             case_dc_pred_s
-    cmp             r3, #1
-    beq             case_v_pred_s
-    cmp             r3, #2
-    beq             case_h_pred_s
-    cmp             r3, #3
-    beq             case_tm_pred_s
-
-case_dc_pred_s
-    ldr             r4, [sp, #24]       ; Up
-    ldr             r5, [sp, #28]       ; Left
-
-    ; Default the DC average to 128
-    mov             r12, #128
-    vdup.u8         q0, r12
-
-    ; Zero out running sum
-    mov             r12, #0
-
-    ; compute shift and jump
-    adds            r7, r4, r5
-    beq             skip_dc_pred_up_left_s
-
-    ; Load above row, if it exists
-    cmp             r4, #0
-    beq             skip_dc_pred_up_s
-
-    sub             r6, r0, r2
-    vld1.8          {q1}, [r6]
-    vpaddl.u8       q2, q1
-    vpaddl.u16      q3, q2
-    vpaddl.u32      q4, q3
-
-    vmov.32         r4, d8[0]
-    vmov.32         r6, d9[0]
-
-    add             r12, r4, r6
-
-    ; Move back to integer registers
-
-skip_dc_pred_up_s
-
-    cmp             r5, #0
-    beq             skip_dc_pred_left_s
-
-    sub             r0, r0, #1
-
-    ; Load left column, if it exists
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-
-    add             r12, r12, r3
-    add             r12, r12, r4
-    add             r12, r12, r5
-    add             r12, r12, r6
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-
-    add             r12, r12, r3
-    add             r12, r12, r4
-    add             r12, r12, r5
-    add             r12, r12, r6
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-
-    add             r12, r12, r3
-    add             r12, r12, r4
-    add             r12, r12, r5
-    add             r12, r12, r6
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0]
-
-    add             r12, r12, r3
-    add             r12, r12, r4
-    add             r12, r12, r5
-    add             r12, r12, r6
-
-skip_dc_pred_left_s
-    add             r7, r7, #3          ; Shift
-    sub             r4, r7, #1
-    mov             r5, #1
-    add             r12, r12, r5, lsl r4
-    mov             r5, r12, lsr r7     ; expected_dc
-
-    vdup.u8         q0, r5
-
-skip_dc_pred_up_left_s
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-
-    pop             {r4-r8,pc}
-case_v_pred_s
-    ; Copy down above row
-    sub             r6, r0, r2
-    vld1.8          {q0}, [r6]
-
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    pop             {r4-r8,pc}
-
-case_h_pred_s
-    ; Load 4x yleft_col
-    sub             r0, r0, #1
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-    vdup.u8         q0, r3
-    vdup.u8         q1, r4
-    vdup.u8         q2, r5
-    vdup.u8         q3, r6
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q1}, [r1], r2
-    vst1.u8         {q2}, [r1], r2
-    vst1.u8         {q3}, [r1], r2
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-    vdup.u8         q0, r3
-    vdup.u8         q1, r4
-    vdup.u8         q2, r5
-    vdup.u8         q3, r6
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q1}, [r1], r2
-    vst1.u8         {q2}, [r1], r2
-    vst1.u8         {q3}, [r1], r2
-
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-    vdup.u8         q0, r3
-    vdup.u8         q1, r4
-    vdup.u8         q2, r5
-    vdup.u8         q3, r6
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q1}, [r1], r2
-    vst1.u8         {q2}, [r1], r2
-    vst1.u8         {q3}, [r1], r2
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-    vdup.u8         q0, r3
-    vdup.u8         q1, r4
-    vdup.u8         q2, r5
-    vdup.u8         q3, r6
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q1}, [r1], r2
-    vst1.u8         {q2}, [r1], r2
-    vst1.u8         {q3}, [r1], r2
-
-    pop             {r4-r8,pc}
-
-case_tm_pred_s
-    ; Load yabove_row
-    sub             r3, r0, r2
-    vld1.8          {q8}, [r3]
-
-    ; Load ytop_left
-    sub             r3, r3, #1
-    ldrb            r7, [r3]
-
-    vdup.u16        q7, r7
-
-    ; Compute yabove_row - ytop_left
-    mov             r3, #1
-    vdup.u8         q0, r3
-
-    vmull.u8        q4, d16, d0
-    vmull.u8        q5, d17, d0
-
-    vsub.s16        q4, q4, q7
-    vsub.s16        q5, q5, q7
-
-    ; Load 4x yleft_col
-    sub             r0, r0, #1
-    mov             r12, #4
-
-case_tm_pred_loop_s
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-    vdup.u16        q0, r3
-    vdup.u16        q1, r4
-    vdup.u16        q2, r5
-    vdup.u16        q3, r6
-
-    vqadd.s16       q8, q0, q4
-    vqadd.s16       q9, q0, q5
-
-    vqadd.s16       q10, q1, q4
-    vqadd.s16       q11, q1, q5
-
-    vqadd.s16       q12, q2, q4
-    vqadd.s16       q13, q2, q5
-
-    vqadd.s16       q14, q3, q4
-    vqadd.s16       q15, q3, q5
-
-    vqshrun.s16     d0, q8, #0
-    vqshrun.s16     d1, q9, #0
-
-    vqshrun.s16     d2, q10, #0
-    vqshrun.s16     d3, q11, #0
-
-    vqshrun.s16     d4, q12, #0
-    vqshrun.s16     d5, q13, #0
-
-    vqshrun.s16     d6, q14, #0
-    vqshrun.s16     d7, q15, #0
-
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q1}, [r1], r2
-    vst1.u8         {q2}, [r1], r2
-    vst1.u8         {q3}, [r1], r2
-
-    subs            r12, r12, #1
-    bne             case_tm_pred_loop_s
-
-    pop             {r4-r8,pc}
-
-    ENDP
-
-
-    END
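
Before moving on: the DC case in the predictors above reduces to a rounded average of whichever neighbors exist, splatted with vdup. A minimal C sketch, assuming the Up/Left flags are 0 or 1 (the function name and signature are illustrative, not from the source):

    /* DC predictor for a 16x16 block: sum the available neighbors and
     * round-to-nearest with a shift of 3 + up + left, i.e. divide by
     * 16 or 32 depending on how many edges contribute. */
    static int expected_dc_16x16(const unsigned char *above,
                                 const unsigned char *left, int stride,
                                 int up, int left_avail)
    {
        int i, sum = 0;
        int shift = 3 + up + left_avail;

        if (!up && !left_avail)
            return 128;                 /* default when no neighbors exist */
        if (up)
            for (i = 0; i < 16; i++)
                sum += above[i];
        if (left_avail)
            for (i = 0; i < 16; i++)
                sum += left[i * stride];
        return (sum + (1 << (shift - 1))) >> shift;
    }
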
--- a/vp8/common/arm/neon/copymem16x16_neon.asm
+++ /dev/null
@@ -1,59 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_copy_mem16x16_neon|
-    ; ARM
-    ; REQUIRE8
-    ; PRESERVE8
-
-    AREA    Block, CODE, READONLY ; name this block of code
-;void copy_mem16x16_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-|vp9_copy_mem16x16_neon| PROC
-
-    vld1.u8     {q0}, [r0], r1
-    vld1.u8     {q1}, [r0], r1
-    vld1.u8     {q2}, [r0], r1
-    vst1.u8     {q0}, [r2], r3
-    vld1.u8     {q3}, [r0], r1
-    vst1.u8     {q1}, [r2], r3
-    vld1.u8     {q4}, [r0], r1
-    vst1.u8     {q2}, [r2], r3
-    vld1.u8     {q5}, [r0], r1
-    vst1.u8     {q3}, [r2], r3
-    vld1.u8     {q6}, [r0], r1
-    vst1.u8     {q4}, [r2], r3
-    vld1.u8     {q7}, [r0], r1
-    vst1.u8     {q5}, [r2], r3
-    vld1.u8     {q8}, [r0], r1
-    vst1.u8     {q6}, [r2], r3
-    vld1.u8     {q9}, [r0], r1
-    vst1.u8     {q7}, [r2], r3
-    vld1.u8     {q10}, [r0], r1
-    vst1.u8     {q8}, [r2], r3
-    vld1.u8     {q11}, [r0], r1
-    vst1.u8     {q9}, [r2], r3
-    vld1.u8     {q12}, [r0], r1
-    vst1.u8     {q10}, [r2], r3
-    vld1.u8     {q13}, [r0], r1
-    vst1.u8     {q11}, [r2], r3
-    vld1.u8     {q14}, [r0], r1
-    vst1.u8     {q12}, [r2], r3
-    vld1.u8     {q15}, [r0], r1
-    vst1.u8     {q13}, [r2], r3
-    vst1.u8     {q14}, [r2], r3
-    vst1.u8     {q15}, [r2], r3
-
-    mov     pc, lr
-
-    ENDP  ; |vp9_copy_mem16x16_neon|
-
-    END
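
The routine above is a straight 16x16 block move; the NEON version interleaves its loads and stores across sixteen q registers to hide memory latency. A scalar equivalent (the name is illustrative) is just a strided row copy; the 8x4 and 8x8 variants that follow are the same pattern with 8-byte rows:

    #include <string.h>

    /* Row-by-row equivalent of vp9_copy_mem16x16_neon. */
    static void copy_mem16x16_c(const unsigned char *src, int src_stride,
                                unsigned char *dst, int dst_stride)
    {
        int r;
        for (r = 0; r < 16; r++) {
            memcpy(dst, src, 16);
            src += src_stride;
            dst += dst_stride;
        }
    }
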
--- a/vp8/common/arm/neon/copymem8x4_neon.asm
+++ /dev/null
@@ -1,34 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_copy_mem8x4_neon|
-    ; ARM
-    ; REQUIRE8
-    ; PRESERVE8
-
-    AREA    Block, CODE, READONLY ; name this block of code
-;void copy_mem8x4_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-|vp9_copy_mem8x4_neon| PROC
-    vld1.u8     {d0}, [r0], r1
-    vld1.u8     {d1}, [r0], r1
-    vst1.u8     {d0}, [r2], r3
-    vld1.u8     {d2}, [r0], r1
-    vst1.u8     {d1}, [r2], r3
-    vld1.u8     {d3}, [r0], r1
-    vst1.u8     {d2}, [r2], r3
-    vst1.u8     {d3}, [r2], r3
-
-    mov     pc, lr
-
-    ENDP  ; |vp9_copy_mem8x4_neon|
-
-    END
--- a/vp8/common/arm/neon/copymem8x8_neon.asm
+++ /dev/null
@@ -1,43 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_copy_mem8x8_neon|
-    ; ARM
-    ; REQUIRE8
-    ; PRESERVE8
-
-    AREA    Block, CODE, READONLY ; name this block of code
-;void copy_mem8x8_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-|vp9_copy_mem8x8_neon| PROC
-
-    vld1.u8     {d0}, [r0], r1
-    vld1.u8     {d1}, [r0], r1
-    vst1.u8     {d0}, [r2], r3
-    vld1.u8     {d2}, [r0], r1
-    vst1.u8     {d1}, [r2], r3
-    vld1.u8     {d3}, [r0], r1
-    vst1.u8     {d2}, [r2], r3
-    vld1.u8     {d4}, [r0], r1
-    vst1.u8     {d3}, [r2], r3
-    vld1.u8     {d5}, [r0], r1
-    vst1.u8     {d4}, [r2], r3
-    vld1.u8     {d6}, [r0], r1
-    vst1.u8     {d5}, [r2], r3
-    vld1.u8     {d7}, [r0], r1
-    vst1.u8     {d6}, [r2], r3
-    vst1.u8     {d7}, [r2], r3
-
-    mov     pc, lr
-
-    ENDP  ; |vp9_copy_mem8x8_neon|
-
-    END
--- a/vp8/common/arm/neon/dc_only_idct_add_neon.asm
+++ /dev/null
@@ -1,49 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_dc_only_idct_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp8_dc_only_idct_add_neon(short input_dc, unsigned char *pred_ptr,
-;                               unsigned char *dst_ptr, int pitch, int stride)
-; r0  input_dc
-; r1  pred_ptr
-; r2  dst_ptr
-; r3  pitch
-; sp  stride
-|vp8_dc_only_idct_add_neon| PROC
-    add             r0, r0, #4
-    asr             r0, r0, #3
-    ldr             r12, [sp]
-    vdup.16         q0, r0
-
-    vld1.32         {d2[0]}, [r1], r3
-    vld1.32         {d2[1]}, [r1], r3
-    vld1.32         {d4[0]}, [r1], r3
-    vld1.32         {d4[1]}, [r1]
-
-    vaddw.u8        q1, q0, d2
-    vaddw.u8        q2, q0, d4
-
-    vqmovun.s16     d2, q1
-    vqmovun.s16     d4, q2
-
-    vst1.32         {d2[0]}, [r2], r12
-    vst1.32         {d2[1]}, [r2], r12
-    vst1.32         {d4[0]}, [r2], r12
-    vst1.32         {d4[1]}, [r2]
-
-    bx             lr
-
-    ENDP
-    END
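
For orientation, the routine above biases and shifts the DC coefficient, adds it to the 4x4 prediction, and saturates back to 8 bits (the vqmovun.s16 step). A scalar sketch with an illustrative name:

    /* dc = (input_dc + 4) >> 3, added to each predicted pixel and
     * clamped to [0, 255], mirroring vaddw.u8 + vqmovun.s16 above. */
    static void dc_only_idct_add_c(short input_dc, const unsigned char *pred,
                                   unsigned char *dst, int pitch, int stride)
    {
        int dc = (input_dc + 4) >> 3;
        int r, c;

        for (r = 0; r < 4; r++) {
            for (c = 0; c < 4; c++) {
                int v = pred[c] + dc;
                dst[c] = (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
            }
            pred += pitch;
            dst += stride;
        }
    }
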
--- a/vp8/common/arm/neon/iwalsh_neon.asm
+++ /dev/null
@@ -1,80 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-    EXPORT  |vp8_short_inv_walsh4x4_neon|
-    EXPORT  |vp8_short_inv_walsh4x4_1_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA    |.text|, CODE, READONLY  ; name this block of code
-
-;short vp8_short_inv_walsh4x4_neon(short *input, short *output)
-|vp8_short_inv_walsh4x4_neon| PROC
-
-    ; read in all four lines of values: d0->d3
-    vld1.i16 {q0-q1}, [r0@128]
-
-    ; first for loop
-    vadd.s16 d4, d0, d3 ;a = [0] + [12]
-    vadd.s16 d6, d1, d2 ;b = [4] + [8]
-    vsub.s16 d5, d0, d3 ;d = [0] - [12]
-    vsub.s16 d7, d1, d2 ;c = [4] - [8]
-
-    vadd.s16 q0, q2, q3 ; a+b d+c
-    vsub.s16 q1, q2, q3 ; a-b d-c
-
-    vtrn.32 d0, d2 ;d0:  0  1  8  9
-                   ;d2:  2  3 10 11
-    vtrn.32 d1, d3 ;d1:  4  5 12 13
-                   ;d3:  6  7 14 15
-
-    vtrn.16 d0, d1 ;d0:  0  4  8 12
-                   ;d1:  1  5  9 13
-    vtrn.16 d2, d3 ;d2:  2  6 10 14
-                   ;d3:  3  7 11 15
-
-    ; second for loop
-
-    vadd.s16 d4, d0, d3 ;a = [0] + [3]
-    vadd.s16 d6, d1, d2 ;b = [1] + [2]
-    vsub.s16 d5, d0, d3 ;d = [0] - [3]
-    vsub.s16 d7, d1, d2 ;c = [1] - [2]
-
-    vmov.i16 q8, #3
-
-    vadd.s16 q0, q2, q3 ; a+b d+c
-    vsub.s16 q1, q2, q3 ; a-b d-c
-
-    vadd.i16 q0, q0, q8 ;e/f += 3
-    vadd.i16 q1, q1, q8 ;g/h += 3
-
-    vshr.s16 q0, q0, #3 ;e/f >> 3
-    vshr.s16 q1, q1, #3 ;g/h >> 3
-
-    vst4.i16 {d0,d1,d2,d3}, [r1@128]
-
-    bx lr
-    ENDP    ; |vp8_short_inv_walsh4x4_neon|
-
-
-;short vp8_short_inv_walsh4x4_1_neon(short *input, short *output)
-|vp8_short_inv_walsh4x4_1_neon| PROC
-    ldrsh r2, [r0]          ; load input[0]
-    add r3, r2, #3          ; add 3
-    add r2, r1, #16         ; base for last 8 output
-    asr r0, r3, #3          ; right shift 3
-    vdup.16 q0, r0          ; load and duplicate
-    vst1.16 {q0}, [r1@128]  ; write back 8
-    vst1.16 {q0}, [r2@128]  ; write back last 8
-    bx lr
-    ENDP    ; |vp8_short_inv_walsh4x4_1_neon|
-
-    END
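
The two passes above (down the columns, then across the rows, with the +3 bias and >>3 on output) correspond to the following scalar form, reconstructed from the inline comments; treat it as a sketch rather than the canonical reference:

    static void short_inv_walsh4x4_c(short *input, short *output)
    {
        int i, a1, b1, c1, d1, a2, b2, c2, d2;
        short *ip = input;
        short *op = output;

        for (i = 0; i < 4; i++) {       /* first pass: down the columns */
            a1 = ip[0] + ip[12];
            b1 = ip[4] + ip[8];
            c1 = ip[4] - ip[8];
            d1 = ip[0] - ip[12];

            op[0]  = a1 + b1;
            op[4]  = c1 + d1;
            op[8]  = a1 - b1;
            op[12] = d1 - c1;
            ip++;
            op++;
        }

        ip = output;
        op = output;
        for (i = 0; i < 4; i++) {       /* second pass: across the rows */
            a1 = ip[0] + ip[3];
            b1 = ip[1] + ip[2];
            c1 = ip[1] - ip[2];
            d1 = ip[0] - ip[3];

            a2 = a1 + b1;
            b2 = c1 + d1;
            c2 = a1 - b1;
            d2 = d1 - c1;

            op[0] = (a2 + 3) >> 3;      /* e/f/g/h += 3, then >> 3 */
            op[1] = (b2 + 3) >> 3;
            op[2] = (c2 + 3) >> 3;
            op[3] = (d2 + 3) >> 3;
            ip += 4;
            op += 4;
        }
    }
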
--- a/vp8/common/arm/neon/loopfilter_neon.asm
+++ /dev/null
@@ -1,397 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_loop_filter_horizontal_edge_y_neon|
-    EXPORT  |vp9_loop_filter_horizontal_edge_uv_neon|
-    EXPORT  |vp9_loop_filter_vertical_edge_y_neon|
-    EXPORT  |vp9_loop_filter_vertical_edge_uv_neon|
-    ARM
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char *src
-; r1    int pitch
-; r2    unsigned char blimit
-; r3    unsigned char limit
-; sp    unsigned char thresh,
-|vp9_loop_filter_horizontal_edge_y_neon| PROC
-    push        {lr}
-    vdup.u8     q0, r2                     ; duplicate blimit
-    vdup.u8     q1, r3                     ; duplicate limit
-    sub         r2, r0, r1, lsl #2         ; move src pointer down by 4 lines
-    ldr         r3, [sp, #4]               ; load thresh
-    add         r12, r2, r1
-    add         r1, r1, r1
-
-    vdup.u8     q2, r3                     ; duplicate thresh
-
-    vld1.u8     {q3}, [r2@128], r1              ; p3
-    vld1.u8     {q4}, [r12@128], r1             ; p2
-    vld1.u8     {q5}, [r2@128], r1              ; p1
-    vld1.u8     {q6}, [r12@128], r1             ; p0
-    vld1.u8     {q7}, [r2@128], r1              ; q0
-    vld1.u8     {q8}, [r12@128], r1             ; q1
-    vld1.u8     {q9}, [r2@128]                  ; q2
-    vld1.u8     {q10}, [r12@128]                ; q3
-
-    sub         r2, r2, r1, lsl #1
-    sub         r12, r12, r1, lsl #1
-
-    bl          vp9_loop_filter_neon
-
-    vst1.u8     {q5}, [r2@128], r1              ; store op1
-    vst1.u8     {q6}, [r12@128], r1             ; store op0
-    vst1.u8     {q7}, [r2@128], r1              ; store oq0
-    vst1.u8     {q8}, [r12@128], r1             ; store oq1
-
-    pop         {pc}
-    ENDP        ; |vp9_loop_filter_horizontal_edge_y_neon|
-
-
-; r0    unsigned char *u,
-; r1    int pitch,
-; r2    unsigned char blimit
-; r3    unsigned char limit
-; sp    unsigned char thresh,
-; sp+4  unsigned char *v
-|vp9_loop_filter_horizontal_edge_uv_neon| PROC
-    push        {lr}
-    vdup.u8     q0, r2                      ; duplicate blimit
-    vdup.u8     q1, r3                      ; duplicate limit
-    ldr         r12, [sp, #4]               ; load thresh
-    ldr         r2, [sp, #8]                ; load v ptr
-    vdup.u8     q2, r12                     ; duplicate thresh
-
-    sub         r3, r0, r1, lsl #2          ; move u pointer down by 4 lines
-    sub         r12, r2, r1, lsl #2         ; move v pointer down by 4 lines
-
-    vld1.u8     {d6}, [r3@64], r1              ; p3
-    vld1.u8     {d7}, [r12@64], r1             ; p3
-    vld1.u8     {d8}, [r3@64], r1              ; p2
-    vld1.u8     {d9}, [r12@64], r1             ; p2
-    vld1.u8     {d10}, [r3@64], r1             ; p1
-    vld1.u8     {d11}, [r12@64], r1            ; p1
-    vld1.u8     {d12}, [r3@64], r1             ; p0
-    vld1.u8     {d13}, [r12@64], r1            ; p0
-    vld1.u8     {d14}, [r3@64], r1             ; q0
-    vld1.u8     {d15}, [r12@64], r1            ; q0
-    vld1.u8     {d16}, [r3@64], r1             ; q1
-    vld1.u8     {d17}, [r12@64], r1            ; q1
-    vld1.u8     {d18}, [r3@64], r1             ; q2
-    vld1.u8     {d19}, [r12@64], r1            ; q2
-    vld1.u8     {d20}, [r3@64]                 ; q3
-    vld1.u8     {d21}, [r12@64]                ; q3
-
-    bl          vp9_loop_filter_neon
-
-    sub         r0, r0, r1, lsl #1
-    sub         r2, r2, r1, lsl #1
-
-    vst1.u8     {d10}, [r0@64], r1             ; store u op1
-    vst1.u8     {d11}, [r2@64], r1             ; store v op1
-    vst1.u8     {d12}, [r0@64], r1             ; store u op0
-    vst1.u8     {d13}, [r2@64], r1             ; store v op0
-    vst1.u8     {d14}, [r0@64], r1             ; store u oq0
-    vst1.u8     {d15}, [r2@64], r1             ; store v oq0
-    vst1.u8     {d16}, [r0@64]                 ; store u oq1
-    vst1.u8     {d17}, [r2@64]                 ; store v oq1
-
-    pop         {pc}
-    ENDP        ; |vp9_loop_filter_horizontal_edge_uv_neon|
-
-; void vp9_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
-;                                           unsigned char blimit,
-;                                           unsigned char limit,
-;                                           unsigned char thresh)
-; r0    unsigned char *src
-; r1    int pitch
-; r2    unsigned char blimit
-; r3    unsigned char limit
-; sp    unsigned char thresh,
-
-|vp9_loop_filter_vertical_edge_y_neon| PROC
-    push        {lr}
-    vdup.u8     q0, r2                     ; duplicate blimit
-    vdup.u8     q1, r3                     ; duplicate limit
-    sub         r2, r0, #4                 ; src ptr down by 4 columns
-    add         r1, r1, r1
-    ldr         r3, [sp, #4]               ; load thresh
-    add         r12, r2, r1, asr #1
-
-    vld1.u8     {d6}, [r2], r1
-    vld1.u8     {d8}, [r12], r1
-    vld1.u8     {d10}, [r2], r1
-    vld1.u8     {d12}, [r12], r1
-    vld1.u8     {d14}, [r2], r1
-    vld1.u8     {d16}, [r12], r1
-    vld1.u8     {d18}, [r2], r1
-    vld1.u8     {d20}, [r12], r1
-
-    vld1.u8     {d7}, [r2], r1              ; load second 8-line src data
-    vld1.u8     {d9}, [r12], r1
-    vld1.u8     {d11}, [r2], r1
-    vld1.u8     {d13}, [r12], r1
-    vld1.u8     {d15}, [r2], r1
-    vld1.u8     {d17}, [r12], r1
-    vld1.u8     {d19}, [r2]
-    vld1.u8     {d21}, [r12]
-
-    ;transpose to 8x16 matrix
-    vtrn.32     q3, q7
-    vtrn.32     q4, q8
-    vtrn.32     q5, q9
-    vtrn.32     q6, q10
-
-    vdup.u8     q2, r3                     ; duplicate thresh
-
-    vtrn.16     q3, q5
-    vtrn.16     q4, q6
-    vtrn.16     q7, q9
-    vtrn.16     q8, q10
-
-    vtrn.8      q3, q4
-    vtrn.8      q5, q6
-    vtrn.8      q7, q8
-    vtrn.8      q9, q10
-
-    bl          vp9_loop_filter_neon
-
-    vswp        d12, d11
-    vswp        d16, d13
-
-    sub         r0, r0, #2                 ; dst ptr
-
-    vswp        d14, d12
-    vswp        d16, d15
-
-    add         r12, r0, r1, asr #1
-
-    ;store op1, op0, oq0, oq1
-    vst4.8      {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
-    vst4.8      {d10[1], d11[1], d12[1], d13[1]}, [r12], r1
-    vst4.8      {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
-    vst4.8      {d10[3], d11[3], d12[3], d13[3]}, [r12], r1
-    vst4.8      {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
-    vst4.8      {d10[5], d11[5], d12[5], d13[5]}, [r12], r1
-    vst4.8      {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
-    vst4.8      {d10[7], d11[7], d12[7], d13[7]}, [r12], r1
-
-    vst4.8      {d14[0], d15[0], d16[0], d17[0]}, [r0], r1
-    vst4.8      {d14[1], d15[1], d16[1], d17[1]}, [r12], r1
-    vst4.8      {d14[2], d15[2], d16[2], d17[2]}, [r0], r1
-    vst4.8      {d14[3], d15[3], d16[3], d17[3]}, [r12], r1
-    vst4.8      {d14[4], d15[4], d16[4], d17[4]}, [r0], r1
-    vst4.8      {d14[5], d15[5], d16[5], d17[5]}, [r12], r1
-    vst4.8      {d14[6], d15[6], d16[6], d17[6]}, [r0]
-    vst4.8      {d14[7], d15[7], d16[7], d17[7]}, [r12]
-
-    pop         {pc}
-    ENDP        ; |vp9_loop_filter_vertical_edge_y_neon|
-
-; void vp9_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
-;                                            unsigned char blimit,
-;                                            unsigned char limit,
-;                                            unsigned char thresh,
-;                                            unsigned char *v)
-; r0    unsigned char *u,
-; r1    int pitch,
-; r2    unsigned char blimit
-; r3    unsigned char limit
-; sp    unsigned char thresh,
-; sp+4  unsigned char *v
-|vp9_loop_filter_vertical_edge_uv_neon| PROC
-    push        {lr}
-    vdup.u8     q0, r2                      ; duplicate blimit
-    sub         r12, r0, #4                 ; move u pointer down by 4 columns
-    ldr         r2, [sp, #8]                ; load v ptr
-    vdup.u8     q1, r3                      ; duplicate limit
-    sub         r3, r2, #4                  ; move v pointer down by 4 columns
-
-    vld1.u8     {d6}, [r12], r1             ;load u data
-    vld1.u8     {d7}, [r3], r1              ;load v data
-    vld1.u8     {d8}, [r12], r1
-    vld1.u8     {d9}, [r3], r1
-    vld1.u8     {d10}, [r12], r1
-    vld1.u8     {d11}, [r3], r1
-    vld1.u8     {d12}, [r12], r1
-    vld1.u8     {d13}, [r3], r1
-    vld1.u8     {d14}, [r12], r1
-    vld1.u8     {d15}, [r3], r1
-    vld1.u8     {d16}, [r12], r1
-    vld1.u8     {d17}, [r3], r1
-    vld1.u8     {d18}, [r12], r1
-    vld1.u8     {d19}, [r3], r1
-    vld1.u8     {d20}, [r12]
-    vld1.u8     {d21}, [r3]
-
-    ldr        r12, [sp, #4]               ; load thresh
-
-    ;transpose to 8x16 matrix
-    vtrn.32     q3, q7
-    vtrn.32     q4, q8
-    vtrn.32     q5, q9
-    vtrn.32     q6, q10
-
-    vdup.u8     q2, r12                     ; duplicate thresh
-
-    vtrn.16     q3, q5
-    vtrn.16     q4, q6
-    vtrn.16     q7, q9
-    vtrn.16     q8, q10
-
-    vtrn.8      q3, q4
-    vtrn.8      q5, q6
-    vtrn.8      q7, q8
-    vtrn.8      q9, q10
-
-    bl          vp9_loop_filter_neon
-
-    vswp        d12, d11
-    vswp        d16, d13
-    vswp        d14, d12
-    vswp        d16, d15
-
-    sub         r0, r0, #2
-    sub         r2, r2, #2
-
-    ;store op1, op0, oq0, oq1
-    vst4.8      {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
-    vst4.8      {d14[0], d15[0], d16[0], d17[0]}, [r2], r1
-    vst4.8      {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
-    vst4.8      {d14[1], d15[1], d16[1], d17[1]}, [r2], r1
-    vst4.8      {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
-    vst4.8      {d14[2], d15[2], d16[2], d17[2]}, [r2], r1
-    vst4.8      {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
-    vst4.8      {d14[3], d15[3], d16[3], d17[3]}, [r2], r1
-    vst4.8      {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
-    vst4.8      {d14[4], d15[4], d16[4], d17[4]}, [r2], r1
-    vst4.8      {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
-    vst4.8      {d14[5], d15[5], d16[5], d17[5]}, [r2], r1
-    vst4.8      {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
-    vst4.8      {d14[6], d15[6], d16[6], d17[6]}, [r2], r1
-    vst4.8      {d10[7], d11[7], d12[7], d13[7]}, [r0]
-    vst4.8      {d14[7], d15[7], d16[7], d17[7]}, [r2]
-
-    pop         {pc}
-    ENDP        ; |vp9_loop_filter_vertical_edge_uv_neon|
-
-; void vp9_loop_filter_neon();
-; This is a helper function for the loopfilters. The individual functions do the
-; necessary load, transpose (if necessary) and store.
-
-; r0-r3 PRESERVE
-; q0    flimit
-; q1    limit
-; q2    thresh
-; q3    p3
-; q4    p2
-; q5    p1
-; q6    p0
-; q7    q0
-; q8    q1
-; q9    q2
-; q10   q3
-|vp9_loop_filter_neon| PROC
-
-    ; vp9_filter_mask
-    vabd.u8     q11, q3, q4                 ; abs(p3 - p2)
-    vabd.u8     q12, q4, q5                 ; abs(p2 - p1)
-    vabd.u8     q13, q5, q6                 ; abs(p1 - p0)
-    vabd.u8     q14, q8, q7                 ; abs(q1 - q0)
-    vabd.u8     q3, q9, q8                  ; abs(q2 - q1)
-    vabd.u8     q4, q10, q9                 ; abs(q3 - q2)
-
-    vmax.u8     q11, q11, q12
-    vmax.u8     q12, q13, q14
-    vmax.u8     q3, q3, q4
-    vmax.u8     q15, q11, q12
-
-    vabd.u8     q9, q6, q7                  ; abs(p0 - q0)
-
-    ; vp8_hevmask
-    vcgt.u8     q13, q13, q2                ; (abs(p1 - p0) > thresh)*-1
-    vcgt.u8     q14, q14, q2                ; (abs(q1 - q0) > thresh)*-1
-    vmax.u8     q15, q15, q3
-
-    vmov.u8     q10, #0x80                   ; 0x80
-
-    vabd.u8     q2, q5, q8                  ; a = abs(p1 - q1)
-    vqadd.u8    q9, q9, q9                  ; b = abs(p0 - q0) * 2
-
-    vcge.u8     q15, q1, q15
-
-    ; vp9_filter() function
-    ; convert to signed
-    veor        q7, q7, q10                 ; qs0
-    vshr.u8     q2, q2, #1                  ; a = a / 2
-    veor        q6, q6, q10                 ; ps0
-
-    veor        q5, q5, q10                 ; ps1
-    vqadd.u8    q9, q9, q2                  ; a = b + a
-
-    veor        q8, q8, q10                 ; qs1
-
-    vmov.u8     q10, #3                     ; #3
-
-    vsubl.s8    q2, d14, d12                ; ( qs0 - ps0)
-    vsubl.s8    q11, d15, d13
-
-    vcge.u8     q9, q0, q9                  ; (a > flimit * 2 + limit) * -1
-
-    vmovl.u8    q4, d20
-
-    vqsub.s8    q1, q5, q8                  ; vp9_filter = clamp(ps1-qs1)
-    vorr        q14, q13, q14               ; vp8_hevmask
-
-    vmul.i16    q2, q2, q4                  ; 3 * ( qs0 - ps0)
-    vmul.i16    q11, q11, q4
-
-    vand        q1, q1, q14                 ; vp9_filter &= hev
-    vand        q15, q15, q9                ; vp9_filter_mask
-
-    vaddw.s8    q2, q2, d2
-    vaddw.s8    q11, q11, d3
-
-    vmov.u8     q9, #4                      ; #4
-
-    ; vp9_filter = clamp(vp9_filter + 3 * ( qs0 - ps0))
-    vqmovn.s16  d2, q2
-    vqmovn.s16  d3, q11
-    vand        q1, q1, q15                 ; vp9_filter &= mask
-
-    vqadd.s8    q2, q1, q10                 ; Filter2 = clamp(vp9_filter+3)
-    vqadd.s8    q1, q1, q9                  ; Filter1 = clamp(vp9_filter+4)
-    vshr.s8     q2, q2, #3                  ; Filter2 >>= 3
-    vshr.s8     q1, q1, #3                  ; Filter1 >>= 3
-
-
-    vqadd.s8    q11, q6, q2                 ; u = clamp(ps0 + Filter2)
-    vqsub.s8    q10, q7, q1                 ; u = clamp(qs0 - Filter1)
-
-    ; outer tap adjustments: ++vp9_filter >> 1
-    vrshr.s8    q1, q1, #1
-    vbic        q1, q1, q14                 ; vp9_filter &= ~hev
-    vmov.u8     q0, #0x80                   ; 0x80
-    vqadd.s8    q13, q5, q1                 ; u = clamp(ps1 + vp9_filter)
-    vqsub.s8    q12, q8, q1                 ; u = clamp(qs1 - vp9_filter)
-
-    veor        q6, q11, q0                 ; *op0 = u^0x80
-    veor        q7, q10, q0                 ; *oq0 = u^0x80
-    veor        q5, q13, q0                 ; *op1 = u^0x80
-    veor        q8, q12, q0                 ; *oq1 = u^0x80
-
-    bx          lr
-    ENDP        ; |vp9_loop_filter_neon|
-
-;-----------------
-
-    END
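
To make the register-level comments in vp9_loop_filter_neon easier to follow, here is a scalar sketch of one filtered column, assuming a clamp8s() helper that saturates to signed 8 bits (both names are illustrative, not from the source):

    #include <stdlib.h>     /* abs() */

    static signed char clamp8s(int v)   /* saturate like vqadd/vqsub.s8 */
    {
        return (signed char)(v < -128 ? -128 : (v > 127 ? 127 : v));
    }

    /* One column of the normal loop filter, following the comments above:
     * build the flatness mask and hev mask, then apply Filter1/Filter2 to
     * q0/p0 and a rounded half-strength tap to q1/p1 when hev is clear. */
    static void filter4(int blimit, int limit, int thresh,
                        int p3, int p2, int p1, int p0,
                        int q0, int q1, int q2, int q3,
                        unsigned char *op1, unsigned char *op0,
                        unsigned char *oq0, unsigned char *oq1)
    {
        int mask = abs(p3 - p2) <= limit && abs(p2 - p1) <= limit &&
                   abs(p1 - p0) <= limit && abs(q1 - q0) <= limit &&
                   abs(q2 - q1) <= limit && abs(q3 - q2) <= limit &&
                   abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit;
        int hev = abs(p1 - p0) > thresh || abs(q1 - q0) > thresh;
        int ps1 = p1 - 0x80, ps0 = p0 - 0x80;   /* bias to signed */
        int qs0 = q0 - 0x80, qs1 = q1 - 0x80;
        int f, f1, f2;

        *op1 = p1; *op0 = p0; *oq0 = q0; *oq1 = q1;
        if (!mask)
            return;
        f  = hev ? clamp8s(ps1 - qs1) : 0;      /* vp9_filter &= hev */
        f  = clamp8s(f + 3 * (qs0 - ps0));
        f1 = clamp8s(f + 4) >> 3;               /* Filter1 */
        f2 = clamp8s(f + 3) >> 3;               /* Filter2 */
        *oq0 = (unsigned char)(clamp8s(qs0 - f1) + 0x80);
        *op0 = (unsigned char)(clamp8s(ps0 + f2) + 0x80);
        if (!hev) {
            f = (f1 + 1) >> 1;                  /* rounded outer tap, vrshr */
            *oq1 = (unsigned char)(clamp8s(qs1 - f) + 0x80);
            *op1 = (unsigned char)(clamp8s(ps1 + f) + 0x80);
        }
    }
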
--- a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
+++ /dev/null
@@ -1,117 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    ;EXPORT  |vp9_loop_filter_simple_horizontal_edge_neon|
-    EXPORT  |vp9_loop_filter_bhs_neon|
-    EXPORT  |vp9_loop_filter_mbhs_neon|
-    ARM
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char *s, PRESERVE
-; r1    int p, PRESERVE
-; q1    limit, PRESERVE
-
-|vp9_loop_filter_simple_horizontal_edge_neon| PROC
-
-    sub         r3, r0, r1, lsl #1          ; move src pointer down by 2 lines
-
-    vld1.u8     {q7}, [r0@128], r1          ; q0
-    vld1.u8     {q5}, [r3@128], r1          ; p0
-    vld1.u8     {q8}, [r0@128]              ; q1
-    vld1.u8     {q6}, [r3@128]              ; p1
-
-    vabd.u8     q15, q6, q7                 ; abs(p0 - q0)
-    vabd.u8     q14, q5, q8                 ; abs(p1 - q1)
-
-    vqadd.u8    q15, q15, q15               ; abs(p0 - q0) * 2
-    vshr.u8     q14, q14, #1                ; abs(p1 - q1) / 2
-    vmov.u8     q0, #0x80                   ; 0x80
-    vmov.s16    q13, #3
-    vqadd.u8    q15, q15, q14               ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
-
-    veor        q7, q7, q0                  ; qs0: q0 offset to convert to a signed value
-    veor        q6, q6, q0                  ; ps0: p0 offset to convert to a signed value
-    veor        q5, q5, q0                  ; ps1: p1 offset to convert to a signed value
-    veor        q8, q8, q0                  ; qs1: q1 offset to convert to a signed value
-
-    vcge.u8     q15, q1, q15                ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > limit)*-1
-
-    vsubl.s8    q2, d14, d12                ; ( qs0 - ps0)
-    vsubl.s8    q3, d15, d13
-
-    vqsub.s8    q4, q5, q8                  ; q4: vp9_filter = vp9_signed_char_clamp(ps1-qs1)
-
-    vmul.s16    q2, q2, q13                 ;  3 * ( qs0 - ps0)
-    vmul.s16    q3, q3, q13
-
-    vmov.u8     q10, #0x03                  ; 0x03
-    vmov.u8     q9, #0x04                   ; 0x04
-
-    vaddw.s8    q2, q2, d8                  ; vp9_filter + 3 * ( qs0 - ps0)
-    vaddw.s8    q3, q3, d9
-
-    vqmovn.s16  d8, q2                      ; vp9_filter = vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0))
-    vqmovn.s16  d9, q3
-
-    vand        q14, q4, q15                ; vp9_filter &= mask
-
-    vqadd.s8    q2, q14, q10                ; Filter2 = vp9_signed_char_clamp(vp9_filter+3)
-    vqadd.s8    q3, q14, q9                 ; Filter1 = vp9_signed_char_clamp(vp9_filter+4)
-    vshr.s8     q2, q2, #3                  ; Filter2 >>= 3
-    vshr.s8     q4, q3, #3                  ; Filter1 >>= 3
-
-    sub         r0, r0, r1
-
-    ;calculate output
-    vqadd.s8    q11, q6, q2                 ; u = vp9_signed_char_clamp(ps0 + Filter2)
-    vqsub.s8    q10, q7, q4                 ; u = vp9_signed_char_clamp(qs0 - Filter1)
-
-    veor        q6, q11, q0                 ; *op0 = u^0x80
-    veor        q7, q10, q0                 ; *oq0 = u^0x80
-
-    vst1.u8     {q6}, [r3@128]              ; store op0
-    vst1.u8     {q7}, [r0@128]              ; store oq0
-
-    bx          lr
-    ENDP        ; |vp9_loop_filter_simple_horizontal_edge_neon|
-
-; r0    unsigned char *y
-; r1    int ystride
-; r2    const unsigned char *blimit
-
-|vp9_loop_filter_bhs_neon| PROC
-    push        {r4, lr}
-    ldrb        r3, [r2]                    ; load blim from mem
-    vdup.s8     q1, r3                      ; duplicate blim
-
-    add         r0, r0, r1, lsl #2          ; src = y_ptr + 4 * y_stride
-    bl          vp9_loop_filter_simple_horizontal_edge_neon
-    ; vp9_loop_filter_simple_horizontal_edge_neon preserves r0, r1 and q1
-    add         r0, r0, r1, lsl #2          ; src = y_ptr + 8 * y_stride
-    bl          vp9_loop_filter_simple_horizontal_edge_neon
-    add         r0, r0, r1, lsl #2          ; src = y_ptr + 12 * y_stride
-    pop         {r4, lr}
-    b           vp9_loop_filter_simple_horizontal_edge_neon
-    ENDP        ;|vp9_loop_filter_bhs_neon|
-
-; r0    unsigned char *y
-; r1    int ystride
-; r2    const unsigned char *blimit
-
-|vp9_loop_filter_mbhs_neon| PROC
-    ldrb        r3, [r2]                   ; load mblim from mem
-    vdup.s8     q1, r3                     ; duplicate mblim
-    b           vp9_loop_filter_simple_horizontal_edge_neon
-    ENDP        ;|vp9_loop_filter_mbhs_neon|
-
-    END
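
The simple filter above reads only the two pixels on each side of the edge and rewrites just p0/q0. A scalar sketch (illustrative names; clamp8s() saturates to signed 8 bits); the vertical variant in the next file applies the same arithmetic to transposed columns:

    #include <stdlib.h>     /* abs() */

    static signed char clamp8s(int v)
    {
        return (signed char)(v < -128 ? -128 : (v > 127 ? 127 : v));
    }

    /* One column of the simple loop filter shown above. */
    static void simple_filter(int blimit, int p1, unsigned char *p0,
                              unsigned char *q0, int q1)
    {
        int ps1 = p1 - 0x80, ps0 = *p0 - 0x80;
        int qs0 = *q0 - 0x80, qs1 = q1 - 0x80;
        int f, f1, f2;

        if (abs(*p0 - *q0) * 2 + abs(p1 - q1) / 2 > blimit)
            return;                             /* mask fails: no change */
        f  = clamp8s(clamp8s(ps1 - qs1) + 3 * (qs0 - ps0));
        f1 = clamp8s(f + 4) >> 3;               /* Filter1, applied to q0 */
        f2 = clamp8s(f + 3) >> 3;               /* Filter2, applied to p0 */
        *q0 = (unsigned char)(clamp8s(qs0 - f1) + 0x80);
        *p0 = (unsigned char)(clamp8s(ps0 + f2) + 0x80);
    }
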
--- a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
+++ /dev/null
@@ -1,154 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    ;EXPORT  |vp9_loop_filter_simple_vertical_edge_neon|
-    EXPORT |vp9_loop_filter_bvs_neon|
-    EXPORT |vp9_loop_filter_mbvs_neon|
-    ARM
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char *s, PRESERVE
-; r1    int p, PRESERVE
-; q1    limit, PRESERVE
-
-|vp9_loop_filter_simple_vertical_edge_neon| PROC
-    sub         r0, r0, #2                  ; move src pointer down by 2 columns
-    add         r12, r1, r1
-    add         r3, r0, r1
-
-    vld4.8      {d6[0], d7[0], d8[0], d9[0]}, [r0], r12
-    vld4.8      {d6[1], d7[1], d8[1], d9[1]}, [r3], r12
-    vld4.8      {d6[2], d7[2], d8[2], d9[2]}, [r0], r12
-    vld4.8      {d6[3], d7[3], d8[3], d9[3]}, [r3], r12
-    vld4.8      {d6[4], d7[4], d8[4], d9[4]}, [r0], r12
-    vld4.8      {d6[5], d7[5], d8[5], d9[5]}, [r3], r12
-    vld4.8      {d6[6], d7[6], d8[6], d9[6]}, [r0], r12
-    vld4.8      {d6[7], d7[7], d8[7], d9[7]}, [r3], r12
-
-    vld4.8      {d10[0], d11[0], d12[0], d13[0]}, [r0], r12
-    vld4.8      {d10[1], d11[1], d12[1], d13[1]}, [r3], r12
-    vld4.8      {d10[2], d11[2], d12[2], d13[2]}, [r0], r12
-    vld4.8      {d10[3], d11[3], d12[3], d13[3]}, [r3], r12
-    vld4.8      {d10[4], d11[4], d12[4], d13[4]}, [r0], r12
-    vld4.8      {d10[5], d11[5], d12[5], d13[5]}, [r3], r12
-    vld4.8      {d10[6], d11[6], d12[6], d13[6]}, [r0], r12
-    vld4.8      {d10[7], d11[7], d12[7], d13[7]}, [r3]
-
-    vswp        d7, d10
-    vswp        d12, d9
-
-    ;vp9_filter_mask() function
-    ;vp8_hevmask() function
-    sub         r0, r0, r1, lsl #4
-    vabd.u8     q15, q5, q4                 ; abs(p0 - q0)
-    vabd.u8     q14, q3, q6                 ; abs(p1 - q1)
-
-    vqadd.u8    q15, q15, q15               ; abs(p0 - q0) * 2
-    vshr.u8     q14, q14, #1                ; abs(p1 - q1) / 2
-    vmov.u8     q0, #0x80                   ; 0x80
-    vmov.s16    q11, #3
-    vqadd.u8    q15, q15, q14               ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
-
-    veor        q4, q4, q0                  ; qs0: q0 offset to convert to a signed value
-    veor        q5, q5, q0                  ; ps0: p0 offset to convert to a signed value
-    veor        q3, q3, q0                  ; ps1: p1 offset to convert to a signed value
-    veor        q6, q6, q0                  ; qs1: q1 offset to convert to a signed value
-
-    vcge.u8     q15, q1, q15                ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1
-
-    vsubl.s8    q2, d8, d10                 ; ( qs0 - ps0)
-    vsubl.s8    q13, d9, d11
-
-    vqsub.s8    q14, q3, q6                  ; vp9_filter = vp9_signed_char_clamp(ps1-qs1)
-
-    vmul.s16    q2, q2, q11                 ;  3 * ( qs0 - ps0)
-    vmul.s16    q13, q13, q11
-
-    vmov.u8     q11, #0x03                  ; 0x03
-    vmov.u8     q12, #0x04                  ; 0x04
-
-    vaddw.s8    q2, q2, d28                  ; vp9_filter + 3 * ( qs0 - ps0)
-    vaddw.s8    q13, q13, d29
-
-    vqmovn.s16  d28, q2                      ; vp9_filter = vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0))
-    vqmovn.s16  d29, q13
-
-    add         r0, r0, #1
-    add         r3, r0, r1
-
-    vand        q14, q14, q15                 ; vp9_filter &= mask
-
-    vqadd.s8    q2, q14, q11                 ; Filter2 = vp9_signed_char_clamp(vp9_filter+3)
-    vqadd.s8    q3, q14, q12                 ; Filter1 = vp9_signed_char_clamp(vp9_filter+4)
-    vshr.s8     q2, q2, #3                  ; Filter2 >>= 3
-    vshr.s8     q14, q3, #3                  ; Filter1 >>= 3
-
-    ;calculate output
-    vqadd.s8    q11, q5, q2                 ; u = vp9_signed_char_clamp(ps0 + Filter2)
-    vqsub.s8    q10, q4, q14                 ; u = vp9_signed_char_clamp(qs0 - Filter1)
-
-    veor        q6, q11, q0                 ; *op0 = u^0x80
-    veor        q7, q10, q0                 ; *oq0 = u^0x80
-    add         r12, r1, r1
-    vswp        d13, d14
-
-    ;store op1, op0, oq0, oq1
-    vst2.8      {d12[0], d13[0]}, [r0], r12
-    vst2.8      {d12[1], d13[1]}, [r3], r12
-    vst2.8      {d12[2], d13[2]}, [r0], r12
-    vst2.8      {d12[3], d13[3]}, [r3], r12
-    vst2.8      {d12[4], d13[4]}, [r0], r12
-    vst2.8      {d12[5], d13[5]}, [r3], r12
-    vst2.8      {d12[6], d13[6]}, [r0], r12
-    vst2.8      {d12[7], d13[7]}, [r3], r12
-    vst2.8      {d14[0], d15[0]}, [r0], r12
-    vst2.8      {d14[1], d15[1]}, [r3], r12
-    vst2.8      {d14[2], d15[2]}, [r0], r12
-    vst2.8      {d14[3], d15[3]}, [r3], r12
-    vst2.8      {d14[4], d15[4]}, [r0], r12
-    vst2.8      {d14[5], d15[5]}, [r3], r12
-    vst2.8      {d14[6], d15[6]}, [r0], r12
-    vst2.8      {d14[7], d15[7]}, [r3]
-
-    bx          lr
-    ENDP        ; |vp9_loop_filter_simple_vertical_edge_neon|
-
-; r0    unsigned char *y
-; r1    int ystride
-; r2    const unsigned char *blimit
-
-|vp9_loop_filter_bvs_neon| PROC
-    push        {r4, lr}
-    ldrb        r3, [r2]                   ; load blim from mem
-    mov         r4, r0
-    add         r0, r0, #4
-    vdup.s8     q1, r3                     ; duplicate blim
-    bl          vp9_loop_filter_simple_vertical_edge_neon
-    ; vp9_loop_filter_simple_vertical_edge_neon preserves  r1 and q1
-    add         r0, r4, #8
-    bl          vp9_loop_filter_simple_vertical_edge_neon
-    add         r0, r4, #12
-    pop         {r4, lr}
-    b           vp9_loop_filter_simple_vertical_edge_neon
-    ENDP        ;|vp9_loop_filter_bvs_neon|
-
-; r0    unsigned char *y
-; r1    int ystride
-; r2    const unsigned char *blimit
-
-|vp9_loop_filter_mbvs_neon| PROC
-    ldrb        r3, [r2]                   ; load mblim from mem
-    vdup.s8     q1, r3                     ; duplicate mblim
-    b           vp9_loop_filter_simple_vertical_edge_neon
-    ENDP        ;|vp9_loop_filter_mbvs_neon|
-    END
--- a/vp8/common/arm/neon/mbloopfilter_neon.asm
+++ /dev/null
@@ -1,469 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_mbloop_filter_horizontal_edge_y_neon|
-    EXPORT  |vp8_mbloop_filter_horizontal_edge_uv_neon|
-    EXPORT  |vp8_mbloop_filter_vertical_edge_y_neon|
-    EXPORT  |vp8_mbloop_filter_vertical_edge_uv_neon|
-    ARM
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; void vp8_mbloop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch,
-;                                               const unsigned char *blimit,
-;                                               const unsigned char *limit,
-;                                               const unsigned char *thresh)
-; r0    unsigned char *src,
-; r1    int pitch,
-; r2    unsigned char blimit
-; r3    unsigned char limit
-; sp    unsigned char thresh,
-|vp8_mbloop_filter_horizontal_edge_y_neon| PROC
-    push        {lr}
-    add         r1, r1, r1                  ; double stride
-    ldr         r12, [sp, #4]               ; load thresh
-    sub         r0, r0, r1, lsl #1          ; move src pointer down by 4 lines
-    vdup.u8     q2, r12                     ; thresh
-    add         r12, r0, r1,  lsr #1        ; move src pointer up by 1 line
-
-    vld1.u8     {q3}, [r0@128], r1              ; p3
-    vld1.u8     {q4}, [r12@128], r1             ; p2
-    vld1.u8     {q5}, [r0@128], r1              ; p1
-    vld1.u8     {q6}, [r12@128], r1             ; p0
-    vld1.u8     {q7}, [r0@128], r1              ; q0
-    vld1.u8     {q8}, [r12@128], r1             ; q1
-    vld1.u8     {q9}, [r0@128], r1              ; q2
-    vld1.u8     {q10}, [r12@128], r1            ; q3
-
-    bl          vp8_mbloop_filter_neon
-
-    sub         r12, r12, r1, lsl #2
-    add         r0, r12, r1, lsr #1
-
-    vst1.u8     {q4}, [r12@128],r1         ; store op2
-    vst1.u8     {q5}, [r0@128],r1          ; store op1
-    vst1.u8     {q6}, [r12@128], r1        ; store op0
-    vst1.u8     {q7}, [r0@128],r1          ; store oq0
-    vst1.u8     {q8}, [r12@128]            ; store oq1
-    vst1.u8     {q9}, [r0@128]             ; store oq2
-
-    pop         {pc}
-    ENDP        ; |vp8_mbloop_filter_horizontal_edge_y_neon|
-
-; void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch,
-;                                                const unsigned char *blimit,
-;                                                const unsigned char *limit,
-;                                                const unsigned char *thresh,
-;                                                unsigned char *v)
-; r0    unsigned char *u,
-; r1    int pitch,
-; r2    unsigned char blimit
-; r3    unsigned char limit
-; sp    unsigned char thresh,
-; sp+4  unsigned char *v
-
-|vp8_mbloop_filter_horizontal_edge_uv_neon| PROC
-    push        {lr}
-    ldr         r12, [sp, #4]                 ; load thresh
-    sub         r0, r0, r1, lsl #2            ; move u pointer down by 4 lines
-    vdup.u8     q2, r12                       ; thresh
-    ldr         r12, [sp, #8]                 ; load v ptr
-    sub         r12, r12, r1, lsl #2          ; move v pointer down by 4 lines
-
-    vld1.u8     {d6}, [r0@64], r1              ; p3
-    vld1.u8     {d7}, [r12@64], r1              ; p3
-    vld1.u8     {d8}, [r0@64], r1              ; p2
-    vld1.u8     {d9}, [r12@64], r1              ; p2
-    vld1.u8     {d10}, [r0@64], r1             ; p1
-    vld1.u8     {d11}, [r12@64], r1             ; p1
-    vld1.u8     {d12}, [r0@64], r1             ; p0
-    vld1.u8     {d13}, [r12@64], r1             ; p0
-    vld1.u8     {d14}, [r0@64], r1             ; q0
-    vld1.u8     {d15}, [r12@64], r1             ; q0
-    vld1.u8     {d16}, [r0@64], r1             ; q1
-    vld1.u8     {d17}, [r12@64], r1             ; q1
-    vld1.u8     {d18}, [r0@64], r1             ; q2
-    vld1.u8     {d19}, [r12@64], r1             ; q2
-    vld1.u8     {d20}, [r0@64], r1             ; q3
-    vld1.u8     {d21}, [r12@64], r1             ; q3
-
-    bl          vp8_mbloop_filter_neon
-
-    sub         r0, r0, r1, lsl #3
-    sub         r12, r12, r1, lsl #3
-
-    add         r0, r0, r1
-    add         r12, r12, r1
-
-    vst1.u8     {d8}, [r0@64], r1              ; store u op2
-    vst1.u8     {d9}, [r12@64], r1              ; store v op2
-    vst1.u8     {d10}, [r0@64], r1             ; store u op1
-    vst1.u8     {d11}, [r12@64], r1             ; store v op1
-    vst1.u8     {d12}, [r0@64], r1             ; store u op0
-    vst1.u8     {d13}, [r12@64], r1             ; store v op0
-    vst1.u8     {d14}, [r0@64], r1             ; store u oq0
-    vst1.u8     {d15}, [r12@64], r1             ; store v oq0
-    vst1.u8     {d16}, [r0@64], r1             ; store u oq1
-    vst1.u8     {d17}, [r12@64], r1             ; store v oq1
-    vst1.u8     {d18}, [r0@64], r1             ; store u oq2
-    vst1.u8     {d19}, [r12@64], r1             ; store v oq2
-
-    pop         {pc}
-    ENDP        ; |vp8_mbloop_filter_horizontal_edge_uv_neon|
-
-; void vp8_mbloop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
-;                                             const unsigned char *blimit,
-;                                             const unsigned char *limit,
-;                                             const unsigned char *thresh)
-; r0    unsigned char *src,
-; r1    int pitch,
-; r2    unsigned char blimit
-; r3    unsigned char limit
-; sp    unsigned char thresh,
-|vp8_mbloop_filter_vertical_edge_y_neon| PROC
-    push        {lr}
-    ldr         r12, [sp, #4]               ; load thresh
-    sub         r0, r0, #4                  ; move src pointer down by 4 columns
-    vdup.s8     q2, r12                     ; thresh
-    add         r12, r0, r1, lsl #3         ; move src pointer down by 8 lines
-
-    vld1.u8     {d6}, [r0], r1              ; load first 8-line src data
-    vld1.u8     {d7}, [r12], r1             ; load second 8-line src data
-    vld1.u8     {d8}, [r0], r1
-    vld1.u8     {d9}, [r12], r1
-    vld1.u8     {d10}, [r0], r1
-    vld1.u8     {d11}, [r12], r1
-    vld1.u8     {d12}, [r0], r1
-    vld1.u8     {d13}, [r12], r1
-    vld1.u8     {d14}, [r0], r1
-    vld1.u8     {d15}, [r12], r1
-    vld1.u8     {d16}, [r0], r1
-    vld1.u8     {d17}, [r12], r1
-    vld1.u8     {d18}, [r0], r1
-    vld1.u8     {d19}, [r12], r1
-    vld1.u8     {d20}, [r0], r1
-    vld1.u8     {d21}, [r12], r1
-
-    ;transpose to 8x16 matrix
-    vtrn.32     q3, q7
-    vtrn.32     q4, q8
-    vtrn.32     q5, q9
-    vtrn.32     q6, q10
-
-    vtrn.16     q3, q5
-    vtrn.16     q4, q6
-    vtrn.16     q7, q9
-    vtrn.16     q8, q10
-
-    vtrn.8      q3, q4
-    vtrn.8      q5, q6
-    vtrn.8      q7, q8
-    vtrn.8      q9, q10
-
-    sub         r0, r0, r1, lsl #3
-
-    bl          vp8_mbloop_filter_neon
-
-    sub         r12, r12, r1, lsl #3
-
-    ;transpose to 16x8 matrix
-    vtrn.32     q3, q7
-    vtrn.32     q4, q8
-    vtrn.32     q5, q9
-    vtrn.32     q6, q10
-
-    vtrn.16     q3, q5
-    vtrn.16     q4, q6
-    vtrn.16     q7, q9
-    vtrn.16     q8, q10
-
-    vtrn.8      q3, q4
-    vtrn.8      q5, q6
-    vtrn.8      q7, q8
-    vtrn.8      q9, q10
-
-    ;store op2, op1, op0, oq0, oq1, oq2
-    vst1.8      {d6}, [r0], r1
-    vst1.8      {d7}, [r12], r1
-    vst1.8      {d8}, [r0], r1
-    vst1.8      {d9}, [r12], r1
-    vst1.8      {d10}, [r0], r1
-    vst1.8      {d11}, [r12], r1
-    vst1.8      {d12}, [r0], r1
-    vst1.8      {d13}, [r12], r1
-    vst1.8      {d14}, [r0], r1
-    vst1.8      {d15}, [r12], r1
-    vst1.8      {d16}, [r0], r1
-    vst1.8      {d17}, [r12], r1
-    vst1.8      {d18}, [r0], r1
-    vst1.8      {d19}, [r12], r1
-    vst1.8      {d20}, [r0]
-    vst1.8      {d21}, [r12]
-
-    pop         {pc}
-    ENDP        ; |vp8_mbloop_filter_vertical_edge_y_neon|
-
-; void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
-;                                              const unsigned char *blimit,
-;                                              const unsigned char *limit,
-;                                              const unsigned char *thresh,
-;                                              unsigned char *v)
-; r0    unsigned char *u,
-; r1    int pitch,
-; r2    const unsigned char *blimit,
-; r3    const unsigned char *limit,
-; sp    const unsigned char *thresh,
-; sp+4  unsigned char *v
-|vp8_mbloop_filter_vertical_edge_uv_neon| PROC
-    push        {lr}
-    ldr         r12, [sp, #4]               ; load thresh
-    sub         r0, r0, #4                  ; move u pointer down by 4 columns
-    vdup.u8     q2, r12                     ; thresh
-    ldr         r12, [sp, #8]               ; load v ptr
-    sub         r12, r12, #4                ; move v pointer down by 4 columns
-
-    vld1.u8     {d6}, [r0], r1              ;load u data
-    vld1.u8     {d7}, [r12], r1             ;load v data
-    vld1.u8     {d8}, [r0], r1
-    vld1.u8     {d9}, [r12], r1
-    vld1.u8     {d10}, [r0], r1
-    vld1.u8     {d11}, [r12], r1
-    vld1.u8     {d12}, [r0], r1
-    vld1.u8     {d13}, [r12], r1
-    vld1.u8     {d14}, [r0], r1
-    vld1.u8     {d15}, [r12], r1
-    vld1.u8     {d16}, [r0], r1
-    vld1.u8     {d17}, [r12], r1
-    vld1.u8     {d18}, [r0], r1
-    vld1.u8     {d19}, [r12], r1
-    vld1.u8     {d20}, [r0], r1
-    vld1.u8     {d21}, [r12], r1
-
-    ;transpose to 8x16 matrix
-    vtrn.32     q3, q7
-    vtrn.32     q4, q8
-    vtrn.32     q5, q9
-    vtrn.32     q6, q10
-
-    vtrn.16     q3, q5
-    vtrn.16     q4, q6
-    vtrn.16     q7, q9
-    vtrn.16     q8, q10
-
-    vtrn.8      q3, q4
-    vtrn.8      q5, q6
-    vtrn.8      q7, q8
-    vtrn.8      q9, q10
-
-    sub         r0, r0, r1, lsl #3
-
-    bl          vp8_mbloop_filter_neon
-
-    sub         r12, r12, r1, lsl #3
-
-    ;transpose to 16x8 matrix
-    vtrn.32     q3, q7
-    vtrn.32     q4, q8
-    vtrn.32     q5, q9
-    vtrn.32     q6, q10
-
-    vtrn.16     q3, q5
-    vtrn.16     q4, q6
-    vtrn.16     q7, q9
-    vtrn.16     q8, q10
-
-    vtrn.8      q3, q4
-    vtrn.8      q5, q6
-    vtrn.8      q7, q8
-    vtrn.8      q9, q10
-
-    ;store op2, op1, op0, oq0, oq1, oq2
-    vst1.8      {d6}, [r0], r1
-    vst1.8      {d7}, [r12], r1
-    vst1.8      {d8}, [r0], r1
-    vst1.8      {d9}, [r12], r1
-    vst1.8      {d10}, [r0], r1
-    vst1.8      {d11}, [r12], r1
-    vst1.8      {d12}, [r0], r1
-    vst1.8      {d13}, [r12], r1
-    vst1.8      {d14}, [r0], r1
-    vst1.8      {d15}, [r12], r1
-    vst1.8      {d16}, [r0], r1
-    vst1.8      {d17}, [r12], r1
-    vst1.8      {d18}, [r0], r1
-    vst1.8      {d19}, [r12], r1
-    vst1.8      {d20}, [r0]
-    vst1.8      {d21}, [r12]
-
-    pop         {pc}
-    ENDP        ; |vp8_mbloop_filter_vertical_edge_uv_neon|
-
-; void vp8_mbloop_filter_neon()
-; This is a helper function for the macroblock loopfilters. The individual
-; functions do the necessary load, transpose (if necessary), preserve (if
-; necessary) and store.
-
-; r0,r1 PRESERVE
-; r2    mblimit
-; r3    limit
-
-; q2    thresh
-; q3    p3 PRESERVE
-; q4    p2
-; q5    p1
-; q6    p0
-; q7    q0
-; q8    q1
-; q9    q2
-; q10   q3 PRESERVE
-
-|vp8_mbloop_filter_neon| PROC
-
-    ; vp9_filter_mask
-    vabd.u8     q11, q3, q4                 ; abs(p3 - p2)
-    vabd.u8     q12, q4, q5                 ; abs(p2 - p1)
-    vabd.u8     q13, q5, q6                 ; abs(p1 - p0)
-    vabd.u8     q14, q8, q7                 ; abs(q1 - q0)
-    vabd.u8     q1, q9, q8                  ; abs(q2 - q1)
-    vabd.u8     q0, q10, q9                 ; abs(q3 - q2)
-
-    vmax.u8     q11, q11, q12
-    vmax.u8     q12, q13, q14
-    vmax.u8     q1, q1, q0
-    vmax.u8     q15, q11, q12
-
-    vabd.u8     q12, q6, q7                 ; abs(p0 - q0)
-
-    ; vp8_hevmask
-    vcgt.u8     q13, q13, q2                ; (abs(p1 - p0) > thresh) * -1
-    vcgt.u8     q14, q14, q2                ; (abs(q1 - q0) > thresh) * -1
-    vmax.u8     q15, q15, q1
-
-    vdup.u8     q1, r3                      ; limit
-    vdup.u8     q2, r2                      ; mblimit
-
-    vmov.u8     q0, #0x80                   ; 0x80
-
-    vcge.u8     q15, q1, q15
-
-    vabd.u8     q1, q5, q8                  ; a = abs(p1 - q1)
-    vqadd.u8    q12, q12, q12               ; b = abs(p0 - q0) * 2
-    vmov.u16    q11, #3                     ; #3
-
-    ; vp9_filter
-    ; convert to signed
-    veor        q7, q7, q0                  ; qs0
-    vshr.u8     q1, q1, #1                  ; a = a / 2
-    veor        q6, q6, q0                  ; ps0
-    veor        q5, q5, q0                  ; ps1
-
-    vqadd.u8    q12, q12, q1                ; a = b + a
-
-    veor        q8, q8, q0                  ; qs1
-    veor        q4, q4, q0                  ; ps2
-    veor        q9, q9, q0                  ; qs2
-
-    vorr        q14, q13, q14               ; vp8_hevmask
-
-    vcge.u8     q12, q2, q12                ; (a > flimit * 2 + limit) * -1
-
-    vsubl.s8    q2, d14, d12                ; qs0 - ps0
-    vsubl.s8    q13, d15, d13
-
-    vqsub.s8    q1, q5, q8                  ; vp9_filter = clamp(ps1-qs1)
-
-    vmul.i16    q2, q2, q11                 ; 3 * ( qs0 - ps0)
-
-    vand        q15, q15, q12               ; vp9_filter_mask
-
-    vmul.i16    q13, q13, q11
-
-    vmov.u8     q12, #3                     ; #3
-
-    vaddw.s8    q2, q2, d2                  ; vp9_filter + 3 * ( qs0 - ps0)
-    vaddw.s8    q13, q13, d3
-
-    vmov.u8     q11, #4                     ; #4
-
-    ; vp9_filter = clamp(vp9_filter + 3 * ( qs0 - ps0))
-    vqmovn.s16  d2, q2
-    vqmovn.s16  d3, q13
-
-    vand        q1, q1, q15                 ; vp9_filter &= mask
-
-    vmov.u16    q15, #63                    ; #63
-
-    vand        q13, q1, q14                ; Filter2 &= hev
-
-    vqadd.s8    q2, q13, q11                ; Filter1 = clamp(Filter2+4)
-    vqadd.s8    q13, q13, q12               ; Filter2 = clamp(Filter2+3)
-
-    vmov        q0, q15
-
-    vshr.s8     q2, q2, #3                  ; Filter1 >>= 3
-    vshr.s8     q13, q13, #3                ; Filter2 >>= 3
-
-    vmov        q11, q15
-    vmov        q12, q15
-
-    vqsub.s8    q7, q7, q2                  ; qs0 = clamp(qs0 - Filter1)
-
-    vqadd.s8    q6, q6, q13                 ; ps0 = clamp(ps0 + Filter2)
-
-    vbic        q1, q1, q14                 ; vp9_filter &= ~hev
-
-    ; roughly 1/7th difference across boundary
-    ; roughly 2/7th difference across boundary
-    ; roughly 3/7th difference across boundary
-
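For reference, the three update magnitudes computed below are, per pixel (f is the filter value after the masks above are applied):

    int u1 = (63 +  9 * f) >> 7;   /* applied as p2 + u1, q2 - u1 */
    int u2 = (63 + 18 * f) >> 7;   /* applied as p1 + u2, q1 - u2 */
    int u3 = (63 + 27 * f) >> 7;   /* applied as p0 + u3, q0 - u3 */

Each side of the edge moves by u, so the step across the boundary shrinks by about 2u: 2*9/128 ≈ 1/7 of f for the outer pair, 2*18/128 ≈ 2/7 for the middle pair, and 2*27/128 ≈ 3/7 for the innermost pair.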
-    vmov.u8     d5, #9                      ; #9
-    vmov.u8     d4, #18                     ; #18
-
-    vmov        q13, q15
-    vmov        q14, q15
-
-    vmlal.s8    q0, d2, d5                  ; 63 + Filter2 * 9
-    vmlal.s8    q11, d3, d5
-    vmov.u8     d5, #27                     ; #27
-    vmlal.s8    q12, d2, d4                 ; 63 + Filter2 * 18
-    vmlal.s8    q13, d3, d4
-    vmlal.s8    q14, d2, d5                 ; 63 + Filter2 * 27
-    vmlal.s8    q15, d3, d5
-
-    vqshrn.s16  d0, q0, #7                  ; u = clamp((63 + Filter2 * 9)>>7)
-    vqshrn.s16  d1, q11, #7
-    vqshrn.s16  d24, q12, #7                ; u = clamp((63 + Filter2 * 18)>>7)
-    vqshrn.s16  d25, q13, #7
-    vqshrn.s16  d28, q14, #7                ; u = clamp((63 + Filter2 * 27)>>7)
-    vqshrn.s16  d29, q15, #7
-
-    vmov.u8     q1, #0x80                   ; 0x80
-
-    vqsub.s8    q11, q9, q0                 ; s = clamp(qs2 - u)
-    vqadd.s8    q0, q4, q0                  ; s = clamp(ps2 + u)
-    vqsub.s8    q13, q8, q12                ; s = clamp(qs1 - u)
-    vqadd.s8    q12, q5, q12                ; s = clamp(ps1 + u)
-    vqsub.s8    q15, q7, q14                ; s = clamp(qs0 - u)
-    vqadd.s8    q14, q6, q14                ; s = clamp(ps0 + u)
-
-    veor        q9, q11, q1                 ; *oq2 = s^0x80
-    veor        q4, q0, q1                  ; *op2 = s^0x80
-    veor        q8, q13, q1                 ; *oq1 = s^0x80
-    veor        q5, q12, q1                 ; *op1 = s^0x80
-    veor        q7, q15, q1                 ; *oq0 = s^0x80
-    veor        q6, q14, q1                 ; *op0 = s^0x80
-
-    bx          lr
-    ENDP        ; |vp8_mbloop_filter_neon|
-
-;-----------------
-
-    END
--- a/vp8/common/arm/neon/recon16x16mb_neon.asm
+++ /dev/null
@@ -1,131 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_recon16x16mb_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char  *pred_ptr,
-; r1    short *diff_ptr,
-; r2    unsigned char *dst_ptr,
-; r3    int ystride,
-; stack unsigned char *udst_ptr,
-; stack unsigned char *vdst_ptr
-
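In scalar terms the routine computes dst = CLAMP(pred + diff) over the 16x16 Y plane and then the two 8x8 chroma planes. A hedged C sketch of one plane (helper name hypothetical; pred and diff are assumed to stream contiguously, as in the predictor/diff buffers):

    static void recon_plane(const unsigned char *pred, const short *diff,
                            unsigned char *dst, int stride, int w, int h) {
      int r, c;
      for (r = 0; r < h; r++) {
        for (c = 0; c < w; c++) {
          int v = pred[c] + diff[c];                 /* vadd.s16 */
          dst[c] = v < 0 ? 0 : v > 255 ? 255 : v;    /* vqmovun.s16 */
        }
        pred += w; diff += w; dst += stride;
      }
    }

The loops below vectorize exactly this: w = h = 16 with ystride for Y, then w = h = 8 with ystride >> 1 for each of U and V.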
-|vp8_recon16x16mb_neon| PROC
-    mov             r12, #4             ;loop counter for Y loop
-
-recon16x16mb_loop_y
-    vld1.u8         {q12, q13}, [r0]!   ;load data from pred_ptr
-    vld1.16         {q8, q9}, [r1]!     ;load data from diff_ptr
-    vld1.u8         {q14, q15}, [r0]!
-    vld1.16         {q10, q11}, [r1]!
-
-    vmovl.u8        q0, d24             ;widen pred data from 8 bits to 16 bits
-    vmovl.u8        q1, d25
-    vmovl.u8        q2, d26
-    vmovl.u8        q3, d27
-    vmovl.u8        q4, d28
-    vmovl.u8        q5, d29
-    vmovl.u8        q6, d30
-    vld1.16         {q12, q13}, [r1]!
-    vmovl.u8        q7, d31
-    vld1.16         {q14, q15}, [r1]!
-
-    pld             [r0]
-    pld             [r1]
-    pld             [r1, #64]
-
-    vadd.s16        q0, q0, q8          ;add Diff data and Pred data together
-    vadd.s16        q1, q1, q9
-    vadd.s16        q2, q2, q10
-    vadd.s16        q3, q3, q11
-    vadd.s16        q4, q4, q12
-    vadd.s16        q5, q5, q13
-    vadd.s16        q6, q6, q14
-    vadd.s16        q7, q7, q15
-
-    vqmovun.s16     d0, q0              ;CLAMP() saturation
-    vqmovun.s16     d1, q1
-    vqmovun.s16     d2, q2
-    vqmovun.s16     d3, q3
-    vqmovun.s16     d4, q4
-    vqmovun.s16     d5, q5
-    vst1.u8         {q0}, [r2], r3      ;store result
-    vqmovun.s16     d6, q6
-    vst1.u8         {q1}, [r2], r3
-    vqmovun.s16     d7, q7
-    vst1.u8         {q2}, [r2], r3
-    subs            r12, r12, #1
-
-    moveq           r12, #2             ;loop counter for UV loop
-
-    vst1.u8         {q3}, [r2], r3
-    bne             recon16x16mb_loop_y
-
-    mov             r3, r3, lsr #1      ;uv_stride = ystride>>1
-    ldr             r2, [sp]            ;load upred_ptr
-
-recon16x16mb_loop_uv
-    vld1.u8         {q12, q13}, [r0]!   ;load data from pred_ptr
-    vld1.16         {q8, q9}, [r1]!     ;load data from diff_ptr
-    vld1.u8         {q14, q15}, [r0]!
-    vld1.16         {q10, q11}, [r1]!
-
-    vmovl.u8        q0, d24             ;widen pred data from 8 bits to 16 bits
-    vmovl.u8        q1, d25
-    vmovl.u8        q2, d26
-    vmovl.u8        q3, d27
-    vmovl.u8        q4, d28
-    vmovl.u8        q5, d29
-    vmovl.u8        q6, d30
-    vld1.16         {q12, q13}, [r1]!
-    vmovl.u8        q7, d31
-    vld1.16         {q14, q15}, [r1]!
-
-    vadd.s16        q0, q0, q8          ;add Diff data and Pred data together
-    vadd.s16        q1, q1, q9
-    vadd.s16        q2, q2, q10
-    vadd.s16        q3, q3, q11
-    vadd.s16        q4, q4, q12
-    vadd.s16        q5, q5, q13
-    vadd.s16        q6, q6, q14
-
-    vqmovun.s16     d0, q0              ;CLAMP() saturation
-    vadd.s16        q7, q7, q15
-    vqmovun.s16     d1, q1
-    vqmovun.s16     d2, q2
-    vqmovun.s16     d3, q3
-    vst1.u8         {d0}, [r2], r3      ;store result
-    vqmovun.s16     d4, q4
-    vst1.u8         {d1}, [r2], r3
-    vqmovun.s16     d5, q5
-    vst1.u8         {d2}, [r2], r3
-    vqmovun.s16     d6, q6
-    vst1.u8         {d3}, [r2], r3
-    vqmovun.s16     d7, q7
-    vst1.u8         {d4}, [r2], r3
-    subs            r12, r12, #1
-
-    vst1.u8         {d5}, [r2], r3
-    vst1.u8         {d6}, [r2], r3
-    vst1.u8         {d7}, [r2], r3
-
-    ldrne           r2, [sp, #4]        ;load vpred_ptr
-    bne             recon16x16mb_loop_uv
-
-    bx             lr
-
-    ENDP
-    END
--- a/vp8/common/arm/neon/recon2b_neon.asm
+++ /dev/null
@@ -1,54 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_recon2b_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char  *pred_ptr,
-; r1    short *diff_ptr,
-; r2    unsigned char *dst_ptr,
-; r3    int stride
-
-|vp8_recon2b_neon| PROC
-    vld1.u8         {q8, q9}, [r0]      ;load data from pred_ptr
-    vld1.16         {q4, q5}, [r1]!     ;load data from diff_ptr
-
-    vmovl.u8        q0, d16             ;widen pred data from 8 bits to 16 bits
-    vld1.16         {q6, q7}, [r1]!
-    vmovl.u8        q1, d17
-    vmovl.u8        q2, d18
-    vmovl.u8        q3, d19
-
-    vadd.s16        q0, q0, q4          ;add Diff data and Pred data together
-    vadd.s16        q1, q1, q5
-    vadd.s16        q2, q2, q6
-    vadd.s16        q3, q3, q7
-
-    vqmovun.s16     d0, q0              ;CLAMP() saturation
-    vqmovun.s16     d1, q1
-    vqmovun.s16     d2, q2
-    vqmovun.s16     d3, q3
-    add             r0, r2, r3
-
-    vst1.u8         {d0}, [r2]          ;store result
-    vst1.u8         {d1}, [r0], r3
-    add             r2, r0, r3
-    vst1.u8         {d2}, [r0]
-    vst1.u8         {d3}, [r2], r3
-
-    bx             lr
-
-    ENDP
-    END
--- a/vp8/common/arm/neon/recon4b_neon.asm
+++ /dev/null
@@ -1,69 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_recon4b_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char  *pred_ptr,
-; r1    short *diff_ptr,
-; r2    unsigned char *dst_ptr,
-; r3    int stride
-
-|vp8_recon4b_neon| PROC
-    vld1.u8         {q12, q13}, [r0]!   ;load data from pred_ptr
-    vld1.16         {q8, q9}, [r1]!     ;load data from diff_ptr
-    vld1.u8         {q14, q15}, [r0]
-    vld1.16         {q10, q11}, [r1]!
-
-    vmovl.u8        q0, d24             ;widen pred data from 8 bits to 16 bits
-    vmovl.u8        q1, d25
-    vmovl.u8        q2, d26
-    vmovl.u8        q3, d27
-    vmovl.u8        q4, d28
-    vmovl.u8        q5, d29
-    vmovl.u8        q6, d30
-    vld1.16         {q12, q13}, [r1]!
-    vmovl.u8        q7, d31
-    vld1.16         {q14, q15}, [r1]
-
-    vadd.s16        q0, q0, q8          ;add Diff data and Pred data together
-    vadd.s16        q1, q1, q9
-    vadd.s16        q2, q2, q10
-    vadd.s16        q3, q3, q11
-    vadd.s16        q4, q4, q12
-    vadd.s16        q5, q5, q13
-    vadd.s16        q6, q6, q14
-    vadd.s16        q7, q7, q15
-
-    vqmovun.s16     d0, q0              ;CLAMP() saturation
-    vqmovun.s16     d1, q1
-    vqmovun.s16     d2, q2
-    vqmovun.s16     d3, q3
-    vqmovun.s16     d4, q4
-    vqmovun.s16     d5, q5
-    vqmovun.s16     d6, q6
-    vqmovun.s16     d7, q7
-    add             r0, r2, r3
-
-    vst1.u8         {q0}, [r2]          ;store result
-    vst1.u8         {q1}, [r0], r3
-    add             r2, r0, r3
-    vst1.u8         {q2}, [r0]
-    vst1.u8         {q3}, [r2], r3
-
-    bx             lr
-
-    ENDP
-    END
--- a/vp8/common/arm/neon/recon_neon.c
+++ /dev/null
@@ -1,29 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "vp8/common/recon.h"
-#include "vp8/common/blockd.h"
-
-extern void vp8_recon16x16mb_neon(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int ystride, unsigned char *udst_ptr, unsigned char *vdst_ptr);
-
-void vp8_recon_mb_neon(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *xd) {
-  unsigned char *pred_ptr = &xd->predictor[0];
-  short *diff_ptr = &xd->diff[0];
-  unsigned char *dst_ptr = xd->dst.y_buffer;
-  unsigned char *udst_ptr = xd->dst.u_buffer;
-  unsigned char *vdst_ptr = xd->dst.v_buffer;
-  int ystride = xd->dst.y_stride;
-  /*int uv_stride = xd->dst.uv_stride;*/
-
-  vp8_recon16x16mb_neon(pred_ptr, diff_ptr, dst_ptr, ystride,
-                        udst_ptr, vdst_ptr);
-}
--- a/vp8/common/arm/neon/reconb_neon.asm
+++ /dev/null
@@ -1,61 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_recon_b_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char  *pred_ptr,
-; r1    short *diff_ptr,
-; r2    unsigned char *dst_ptr,
-; r3    int stride
-
-|vp8_recon_b_neon| PROC
-    mov             r12, #16
-
-    vld1.u8         {d28}, [r0], r12    ;load 4 data/line from pred_ptr
-    vld1.16         {q10, q11}, [r1]!   ;load data from diff_ptr
-    vld1.u8         {d29}, [r0], r12
-    vld1.16         {q11, q12}, [r1]!
-    vld1.u8         {d30}, [r0], r12
-    vld1.16         {q12, q13}, [r1]!
-    vld1.u8         {d31}, [r0], r12
-    vld1.16         {q13}, [r1]
-
-    vmovl.u8        q0, d28             ;widen pred data from 8 bits to 16 bits
-    vmovl.u8        q1, d29             ;Pred data in d0, d2, d4, d6
-    vmovl.u8        q2, d30
-    vmovl.u8        q3, d31
-
-    vadd.s16        d0, d0, d20         ;add Diff data and Pred data together
-    vadd.s16        d2, d2, d22
-    vadd.s16        d4, d4, d24
-    vadd.s16        d6, d6, d26
-
-    vqmovun.s16     d0, q0              ;CLAMP() saturation
-    vqmovun.s16     d1, q1
-    vqmovun.s16     d2, q2
-    vqmovun.s16     d3, q3
-    add             r1, r2, r3
-
-    vst1.32         {d0[0]}, [r2]       ;store result
-    vst1.32         {d1[0]}, [r1], r3
-    add             r2, r1, r3
-    vst1.32         {d2[0]}, [r1]
-    vst1.32         {d3[0]}, [r2], r3
-
-    bx             lr
-
-    ENDP
-    END
--- a/vp8/common/arm/neon/save_neon_reg.asm
+++ /dev/null
@@ -1,36 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_push_neon|
-    EXPORT  |vp9_pop_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-|vp9_push_neon| PROC
-    vst1.i64            {d8, d9, d10, d11}, [r0]!
-    vst1.i64            {d12, d13, d14, d15}, [r0]!
-    bx              lr
-
-    ENDP
-
-|vp9_pop_neon| PROC
-    vld1.i64            {d8, d9, d10, d11}, [r0]!
-    vld1.i64            {d12, d13, d14, d15}, [r0]!
-    bx              lr
-
-    ENDP
-
-    END
-
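A note on the pair above, hedged: under the ARM AAPCS, d8-d15 (q4-q7) are the callee-saved NEON registers, which is why exactly that 64-byte block is spilled through the caller-supplied buffer. A minimal C-side usage sketch (wrapper name hypothetical):

    #include <stdint.h>

    extern void vp9_push_neon(int64_t *store);
    extern void vp9_pop_neon(int64_t *store);

    static void with_neon_saved(void (*fn)(void)) {
      int64_t store[8];        /* room for d8..d15 */
      vp9_push_neon(store);
      fn();                    /* may clobber q4-q7 */
      vp9_pop_neon(store);
    }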
--- a/vp8/common/arm/neon/shortidct4x4llm_1_neon.asm
+++ /dev/null
@@ -1,67 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_short_idct4x4llm_1_neon|
-    EXPORT  |vp8_dc_only_idct_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch);
-; r0    short *input;
-; r1    short *output;
-; r2    int pitch;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
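Both entry points in this file reduce to the same hedged scalar sketch: with only a DC value present, the 4x4 inverse transform degenerates to a rounded shift replicated to all 16 outputs (pitch appears to be a byte pitch, since the assembly adds it directly to the output address):

    void idct4x4_dc_sketch(short dc, short *output, int pitch) {
      short a = (short)((dc + 4) >> 3);      /* vrshr.s16 #3 (rounding) */
      int row = pitch / (int)sizeof(short);  /* shorts per output row */
      int r, c;
      for (r = 0; r < 4; r++)
        for (c = 0; c < 4; c++)
          output[r * row + c] = a;
    }

vp8_short_idct4x4llm_1_neon takes dc from input[0]; vp8_dc_only_idct_neon takes it from its first argument.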
-|vp8_short_idct4x4llm_1_neon| PROC
-    vld1.16         {d0[]}, [r0]            ;load input[0]
-
-    add             r3, r1, r2
-    add             r12, r3, r2
-
-    vrshr.s16       d0, d0, #3
-
-    add             r0, r12, r2
-
-    vst1.16         {d0}, [r1]
-    vst1.16         {d0}, [r3]
-    vst1.16         {d0}, [r12]
-    vst1.16         {d0}, [r0]
-
-    bx             lr
-    ENDP
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;void vp8_dc_only_idct_c(short input_dc, short *output, int pitch);
-; r0    short input_dc;
-; r1    short *output;
-; r2    int pitch;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-|vp8_dc_only_idct_neon| PROC
-    vdup.16         d0, r0
-
-    add             r3, r1, r2
-    add             r12, r3, r2
-
-    vrshr.s16       d0, d0, #3
-
-    add             r0, r12, r2
-
-    vst1.16         {d0}, [r1]
-    vst1.16         {d0}, [r3]
-    vst1.16         {d0}, [r12]
-    vst1.16         {d0}, [r0]
-
-    bx             lr
-
-    ENDP
-    END
--- a/vp8/common/arm/neon/shortidct4x4llm_neon.asm
+++ /dev/null
@@ -1,122 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_short_idct4x4llm_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;*************************************************************
-;void vp8_short_idct4x4llm_c(short *input, short *output, int pitch)
-;r0 short * input
-;r1 short * output
-;r2 int pitch
-;*************************************************************
-;static const int cospi8sqrt2minus1=20091;
-;static const int sinpi8sqrt2      =35468;
-;static const int rounding = 0;
-;Optimization note: the data resulting from dequantization are signed 13-bit
-;values in the range [-4096, 4095]. This allows the NEON "vqdmulh" instruction
-;to be used, since the product cannot overflow (13+16+1 = 30 bits < 32 bits).
-;vqdmulh returns the high half of the multiplication, which is what the IDCT needs.
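A hedged scalar model of the vqdmulh + vshr + vqadd sequence on the sinpi8sqrt2 path below (the same multiply pattern is used for both constants): vqdmulh.s16 returns the saturated high half of 2*a*b, and 35468 does not fit in a signed 16-bit lane, so the constant is read as 35468 - 65536 = -30068; the trailing vqadd of the unscaled input restores the missing 65536/65536:

    /* valid over the dequantized input range [-4096, 4095] */
    int mul_sinpi8sqrt2(int x) {
      int t = (2 * x * -30068) >> 16;  /* vqdmulh.s16 with 0x8a8c */
      t >>= 1;                         /* vshr.s16 #1 */
      return t + x;                    /* vqadd.s16: equals (x * 35468) >> 16 */
    }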
-
-|vp8_short_idct4x4llm_neon| PROC
-    adr             r12, idct_coeff
-    vld1.16         {q1, q2}, [r0]
-    vld1.16         {d0}, [r12]
-
-    vswp            d3, d4                  ;q2(vp[4] vp[12])
-
-    vqdmulh.s16     q3, q2, d0[2]
-    vqdmulh.s16     q4, q2, d0[0]
-
-    vqadd.s16       d12, d2, d3             ;a1
-    vqsub.s16       d13, d2, d3             ;b1
-
-    vshr.s16        q3, q3, #1
-    vshr.s16        q4, q4, #1
-
-    vqadd.s16       q3, q3, q2              ;correct since sinpi8sqrt2 > 65536/2 (read as a negative s16)
-    vqadd.s16       q4, q4, q2
-
-    ;d6 - c1:temp1
-    ;d7 - d1:temp2
-    ;d8 - d1:temp1
-    ;d9 - c1:temp2
-
-    vqsub.s16       d10, d6, d9             ;c1
-    vqadd.s16       d11, d7, d8             ;d1
-
-    vqadd.s16       d2, d12, d11
-    vqadd.s16       d3, d13, d10
-    vqsub.s16       d4, d13, d10
-    vqsub.s16       d5, d12, d11
-
-    vtrn.32         d2, d4
-    vtrn.32         d3, d5
-    vtrn.16         d2, d3
-    vtrn.16         d4, d5
-
-    vswp            d3, d4
-
-    vqdmulh.s16     q3, q2, d0[2]
-    vqdmulh.s16     q4, q2, d0[0]
-
-    vqadd.s16       d12, d2, d3             ;a1
-    vqsub.s16       d13, d2, d3             ;b1
-
-    vshr.s16        q3, q3, #1
-    vshr.s16        q4, q4, #1
-
-    vqadd.s16       q3, q3, q2              ;correct since sinpi8sqrt2 > 65536/2 (read as a negative s16)
-    vqadd.s16       q4, q4, q2
-
-    vqsub.s16       d10, d6, d9             ;c1
-    vqadd.s16       d11, d7, d8             ;d1
-
-    vqadd.s16       d2, d12, d11
-    vqadd.s16       d3, d13, d10
-    vqsub.s16       d4, d13, d10
-    vqsub.s16       d5, d12, d11
-
-    vrshr.s16       d2, d2, #3
-    vrshr.s16       d3, d3, #3
-    vrshr.s16       d4, d4, #3
-    vrshr.s16       d5, d5, #3
-
-    add             r3, r1, r2
-    add             r12, r3, r2
-    add             r0, r12, r2
-
-    vtrn.32         d2, d4
-    vtrn.32         d3, d5
-    vtrn.16         d2, d3
-    vtrn.16         d4, d5
-
-    vst1.16         {d2}, [r1]
-    vst1.16         {d3}, [r3]
-    vst1.16         {d4}, [r12]
-    vst1.16         {d5}, [r0]
-
-    bx             lr
-
-    ENDP
-
-;-----------------
-
-idct_coeff
-    DCD     0x4e7b4e7b, 0x8a8c8a8c
-
-;20091, 20091, 35468, 35468
-
-    END
--- a/vp8/common/arm/neon/sixtappredict16x16_neon.asm
+++ /dev/null
@@ -1,490 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_sixtap_predict16x16_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-filter16_coeff
-    DCD     0,  0,  128,    0,   0,  0,   0,  0
-    DCD     0, -6,  123,   12,  -1,  0,   0,  0
-    DCD     2, -11, 108,   36,  -8,  1,   0,  0
-    DCD     0, -9,   93,   50,  -6,  0,   0,  0
-    DCD     3, -16,  77,   77, -16,  3,   0,  0
-    DCD     0, -6,   50,   93,  -9,  0,   0,  0
-    DCD     1, -8,   36,  108, -11,  2,   0,  0
-    DCD     0, -1,   12,  123,  -6,  0,   0,  0
-
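(Each row of this table sums to 128, so the vqrshrun.s16 #7 at the end of each filter pass, i.e. (sum + 64) >> 7 with unsigned saturation, restores unity gain.)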
-; r0    unsigned char  *src_ptr,
-; r1    int  src_pixels_per_line,
-; r2    int  xoffset,
-; r3    int  yoffset,
-; r4    unsigned char *dst_ptr,
-; stack(r5) int  dst_pitch
-
-;Note: to take advantage of the 8-bit multiplication instructions in NEON, first
-; apply abs() to the filter coeffs to make them u8, then use vmlsl for the
-; negative coeffs. After multiplication the result can be negative and is treated
-; as s16. But the result can also be a large positive number (> 2^15-1), which
-; would be misread as a negative number. To avoid that, apply the filter coeffs
-; in the order 0, 1, 4, 5, 2, which keeps the running sum in s16 range. Finally,
-; add the tap-3 product with a saturating add. The same applies to the other filter functions.
-
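The ordering rule from the note above, as a hedged scalar sketch (helper names hypothetical; k points at one row of the coefficient table, s at src_ptr[0], with taps spanning s[-2]..s[3]):

    #include <stdlib.h>  /* abs() */

    static int sat_s16(int v) { return v < -32768 ? -32768 : v > 32767 ? 32767 : v; }

    static int sixtap_pixel(const unsigned char *s, const int *k) {
      int sum;                                  /* lives in s16 in the NEON code,
                                                   hence the accumulation order */
      sum  = s[-2] * abs(k[0]);                 /* vmull.u8, tap 0 */
      sum -= s[-1] * abs(k[1]);                 /* vmlsl.u8, tap 1 */
      sum -= s[ 2] * abs(k[4]);                 /* vmlsl.u8, tap 4 */
      sum += s[ 3] * abs(k[5]);                 /* vmlal.u8, tap 5 */
      sum += s[ 0] * abs(k[2]);                 /* vmlal.u8, tap 2 */
      sum  = sat_s16(sum + s[1] * abs(k[3]));   /* separate vmull + vqadd.s16 */
      sum  = (sum + 64) >> 7;                   /* vqrshrun.s16 #7 */
      return sum < 0 ? 0 : sum > 255 ? 255 : sum;
    }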
-|vp8_sixtap_predict16x16_neon| PROC
-    push            {r4-r5, lr}
-
-    adr             r12, filter16_coeff
-    ldr             r4, [sp, #12]           ;load parameters from stack
-    ldr             r5, [sp, #16]           ;load parameters from stack
-
-    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
-    beq             secondpass_filter16x16_only
-
-    add             r2, r12, r2, lsl #5     ;calculate filter location
-
-    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
-
-    vld1.s32        {q14, q15}, [r2]        ;load first_pass filter
-
-    beq             firstpass_filter16x16_only
-
-    sub             sp, sp, #336            ;reserve space on stack for temporary storage
-    mov             lr, sp
-
-    vabs.s32        q12, q14
-    vabs.s32        q13, q15
-
-    mov             r2, #7                  ;loop counter
-    sub             r0, r0, #2              ;move srcptr back to (line-2) and (column-2)
-    sub             r0, r0, r1, lsl #1
-
-    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
-    vdup.8          d1, d24[4]
-    vdup.8          d2, d25[0]
-    vdup.8          d3, d25[4]
-    vdup.8          d4, d26[0]
-    vdup.8          d5, d26[4]
-
-;First Pass: output_height lines x output_width columns (21x16); 16 output rows + 5 extra for the 6-tap filter
-filt_blk2d_fp16x16_loop_neon
-    vld1.u8         {d6, d7, d8}, [r0], r1      ;load src data
-    vld1.u8         {d9, d10, d11}, [r0], r1
-    vld1.u8         {d12, d13, d14}, [r0], r1
-
-    pld             [r0]
-    pld             [r0, r1]
-    pld             [r0, r1, lsl #1]
-
-    vmull.u8        q8, d6, d0              ;(src_ptr[-2] * vp9_filter[0])
-    vmull.u8        q9, d7, d0
-    vmull.u8        q10, d9, d0
-    vmull.u8        q11, d10, d0
-    vmull.u8        q12, d12, d0
-    vmull.u8        q13, d13, d0
-
-    vext.8          d28, d6, d7, #1         ;construct src_ptr[-1]
-    vext.8          d29, d9, d10, #1
-    vext.8          d30, d12, d13, #1
-
-    vmlsl.u8        q8, d28, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q10, d29, d1
-    vmlsl.u8        q12, d30, d1
-
-    vext.8          d28, d7, d8, #1
-    vext.8          d29, d10, d11, #1
-    vext.8          d30, d13, d14, #1
-
-    vmlsl.u8        q9, d28, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q11, d29, d1
-    vmlsl.u8        q13, d30, d1
-
-    vext.8          d28, d6, d7, #4         ;construct src_ptr[2]
-    vext.8          d29, d9, d10, #4
-    vext.8          d30, d12, d13, #4
-
-    vmlsl.u8        q8, d28, d4             ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q10, d29, d4
-    vmlsl.u8        q12, d30, d4
-
-    vext.8          d28, d7, d8, #4
-    vext.8          d29, d10, d11, #4
-    vext.8          d30, d13, d14, #4
-
-    vmlsl.u8        q9, d28, d4             ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q11, d29, d4
-    vmlsl.u8        q13, d30, d4
-
-    vext.8          d28, d6, d7, #5         ;construct src_ptr[3]
-    vext.8          d29, d9, d10, #5
-    vext.8          d30, d12, d13, #5
-
-    vmlal.u8        q8, d28, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmlal.u8        q10, d29, d5
-    vmlal.u8        q12, d30, d5
-
-    vext.8          d28, d7, d8, #5
-    vext.8          d29, d10, d11, #5
-    vext.8          d30, d13, d14, #5
-
-    vmlal.u8        q9, d28, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmlal.u8        q11, d29, d5
-    vmlal.u8        q13, d30, d5
-
-    vext.8          d28, d6, d7, #2         ;construct src_ptr[0]
-    vext.8          d29, d9, d10, #2
-    vext.8          d30, d12, d13, #2
-
-    vmlal.u8        q8, d28, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q10, d29, d2
-    vmlal.u8        q12, d30, d2
-
-    vext.8          d28, d7, d8, #2
-    vext.8          d29, d10, d11, #2
-    vext.8          d30, d13, d14, #2
-
-    vmlal.u8        q9, d28, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q11, d29, d2
-    vmlal.u8        q13, d30, d2
-
-    vext.8          d28, d6, d7, #3         ;construct src_ptr[1]
-    vext.8          d29, d9, d10, #3
-    vext.8          d30, d12, d13, #3
-
-    vext.8          d15, d7, d8, #3
-    vext.8          d31, d10, d11, #3
-    vext.8          d6, d13, d14, #3
-
-    vmull.u8        q4, d28, d3             ;(src_ptr[1] * vp9_filter[3])
-    vmull.u8        q5, d29, d3
-    vmull.u8        q6, d30, d3
-
-    vqadd.s16       q8, q4                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q10, q5
-    vqadd.s16       q12, q6
-
-    vmull.u8        q6, d15, d3             ;(src_ptr[1] * vp9_filter[3])
-    vmull.u8        q7, d31, d3
-    vmull.u8        q3, d6, d3
-
-    subs            r2, r2, #1
-
-    vqadd.s16       q9, q6
-    vqadd.s16       q11, q7
-    vqadd.s16       q13, q3
-
-    vqrshrun.s16    d6, q8, #7              ;shift/round/saturate to u8
-    vqrshrun.s16    d7, q9, #7
-    vqrshrun.s16    d8, q10, #7
-    vqrshrun.s16    d9, q11, #7
-    vqrshrun.s16    d10, q12, #7
-    vqrshrun.s16    d11, q13, #7
-
-    vst1.u8         {d6, d7, d8}, [lr]!     ;store result
-    vst1.u8         {d9, d10, d11}, [lr]!
-
-    bne             filt_blk2d_fp16x16_loop_neon
-
-;Second pass: 16x16
-;secondpass_filter - filter the first 8 columns, then the second 8
-    add             r3, r12, r3, lsl #5
-    sub             lr, lr, #336
-
-    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
-    mov             r3, #2                  ;loop counter
-
-    vabs.s32        q7, q5
-    vabs.s32        q8, q6
-
-    mov             r2, #16
-
-    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
-    vdup.8          d1, d14[4]
-    vdup.8          d2, d15[0]
-    vdup.8          d3, d15[4]
-    vdup.8          d4, d16[0]
-    vdup.8          d5, d16[4]
-
-filt_blk2d_sp16x16_outloop_neon
-    vld1.u8         {d18}, [lr], r2         ;load src data
-    vld1.u8         {d19}, [lr], r2
-    vld1.u8         {d20}, [lr], r2
-    vld1.u8         {d21}, [lr], r2
-    mov             r12, #4                 ;loop counter
-    vld1.u8         {d22}, [lr], r2
-
-secondpass_inner_loop_neon
-    vld1.u8         {d23}, [lr], r2         ;load src data
-    vld1.u8         {d24}, [lr], r2
-    vld1.u8         {d25}, [lr], r2
-    vld1.u8         {d26}, [lr], r2
-
-    vmull.u8        q3, d18, d0             ;(src_ptr[-2] * vp9_filter[0])
-    vmull.u8        q4, d19, d0
-    vmull.u8        q5, d20, d0
-    vmull.u8        q6, d21, d0
-
-    vmlsl.u8        q3, d19, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q4, d20, d1
-    vmlsl.u8        q5, d21, d1
-    vmlsl.u8        q6, d22, d1
-
-    vmlsl.u8        q3, d22, d4             ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q4, d23, d4
-    vmlsl.u8        q5, d24, d4
-    vmlsl.u8        q6, d25, d4
-
-    vmlal.u8        q3, d20, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q4, d21, d2
-    vmlal.u8        q5, d22, d2
-    vmlal.u8        q6, d23, d2
-
-    vmlal.u8        q3, d23, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmlal.u8        q4, d24, d5
-    vmlal.u8        q5, d25, d5
-    vmlal.u8        q6, d26, d5
-
-    vmull.u8        q7, d21, d3             ;(src_ptr[1] * vp9_filter[3])
-    vmull.u8        q8, d22, d3
-    vmull.u8        q9, d23, d3
-    vmull.u8        q10, d24, d3
-
-    subs            r12, r12, #1
-
-    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q4
-    vqadd.s16       q9, q5
-    vqadd.s16       q10, q6
-
-    vqrshrun.s16    d6, q7, #7              ;shift/round/saturate to u8
-    vqrshrun.s16    d7, q8, #7
-    vqrshrun.s16    d8, q9, #7
-    vqrshrun.s16    d9, q10, #7
-
-    vst1.u8         {d6}, [r4], r5          ;store result
-    vmov            q9, q11
-    vst1.u8         {d7}, [r4], r5
-    vmov            q10, q12
-    vst1.u8         {d8}, [r4], r5
-    vmov            d22, d26
-    vst1.u8         {d9}, [r4], r5
-
-    bne             secondpass_inner_loop_neon
-
-    subs            r3, r3, #1
-    sub             lr, lr, #336
-    add             lr, lr, #8
-
-    sub             r4, r4, r5, lsl #4
-    add             r4, r4, #8
-
-    bne filt_blk2d_sp16x16_outloop_neon
-
-    add             sp, sp, #336
-    pop             {r4-r5,pc}
-
-;--------------------
-firstpass_filter16x16_only
-    vabs.s32        q12, q14
-    vabs.s32        q13, q15
-
-    mov             r2, #8                  ;loop counter
-    sub             r0, r0, #2              ;move srcptr back to (column-2)
-
-    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
-    vdup.8          d1, d24[4]
-    vdup.8          d2, d25[0]
-    vdup.8          d3, d25[4]
-    vdup.8          d4, d26[0]
-    vdup.8          d5, d26[4]
-
-;First Pass: output_height lines x output_width columns (16x16)
-filt_blk2d_fpo16x16_loop_neon
-    vld1.u8         {d6, d7, d8}, [r0], r1      ;load src data
-    vld1.u8         {d9, d10, d11}, [r0], r1
-
-    pld             [r0]
-    pld             [r0, r1]
-
-    vmull.u8        q6, d6, d0              ;(src_ptr[-2] * vp9_filter[0])
-    vmull.u8        q7, d7, d0
-    vmull.u8        q8, d9, d0
-    vmull.u8        q9, d10, d0
-
-    vext.8          d20, d6, d7, #1         ;construct src_ptr[-1]
-    vext.8          d21, d9, d10, #1
-    vext.8          d22, d7, d8, #1
-    vext.8          d23, d10, d11, #1
-    vext.8          d24, d6, d7, #4         ;construct src_ptr[2]
-    vext.8          d25, d9, d10, #4
-    vext.8          d26, d7, d8, #4
-    vext.8          d27, d10, d11, #4
-    vext.8          d28, d6, d7, #5         ;construct src_ptr[3]
-    vext.8          d29, d9, d10, #5
-
-    vmlsl.u8        q6, d20, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q8, d21, d1
-    vmlsl.u8        q7, d22, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q9, d23, d1
-    vmlsl.u8        q6, d24, d4             ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q8, d25, d4
-    vmlsl.u8        q7, d26, d4             ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q9, d27, d4
-    vmlal.u8        q6, d28, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmlal.u8        q8, d29, d5
-
-    vext.8          d20, d7, d8, #5
-    vext.8          d21, d10, d11, #5
-    vext.8          d22, d6, d7, #2         ;construct src_ptr[0]
-    vext.8          d23, d9, d10, #2
-    vext.8          d24, d7, d8, #2
-    vext.8          d25, d10, d11, #2
-
-    vext.8          d26, d6, d7, #3         ;construct src_ptr[1]
-    vext.8          d27, d9, d10, #3
-    vext.8          d28, d7, d8, #3
-    vext.8          d29, d10, d11, #3
-
-    vmlal.u8        q7, d20, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmlal.u8        q9, d21, d5
-    vmlal.u8        q6, d22, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q8, d23, d2
-    vmlal.u8        q7, d24, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q9, d25, d2
-
-    vmull.u8        q10, d26, d3            ;(src_ptr[1] * vp9_filter[3])
-    vmull.u8        q11, d27, d3
-    vmull.u8        q12, d28, d3            ;(src_ptr[1] * vp9_filter[3])
-    vmull.u8        q15, d29, d3
-
-    vqadd.s16       q6, q10                 ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q11
-    vqadd.s16       q7, q12
-    vqadd.s16       q9, q15
-
-    subs            r2, r2, #1
-
-    vqrshrun.s16    d6, q6, #7              ;shift/round/saturate to u8
-    vqrshrun.s16    d7, q7, #7
-    vqrshrun.s16    d8, q8, #7
-    vqrshrun.s16    d9, q9, #7
-
-    vst1.u8         {q3}, [r4], r5              ;store result
-    vst1.u8         {q4}, [r4], r5
-
-    bne             filt_blk2d_fpo16x16_loop_neon
-
-    pop             {r4-r5,pc}
-
-;--------------------
-secondpass_filter16x16_only
-;Second pass: 16x16
-    add             r3, r12, r3, lsl #5
-    sub             r0, r0, r1, lsl #1
-
-    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
-    mov             r3, #2                  ;loop counter
-
-    vabs.s32        q7, q5
-    vabs.s32        q8, q6
-
-    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
-    vdup.8          d1, d14[4]
-    vdup.8          d2, d15[0]
-    vdup.8          d3, d15[4]
-    vdup.8          d4, d16[0]
-    vdup.8          d5, d16[4]
-
-filt_blk2d_spo16x16_outloop_neon
-    vld1.u8         {d18}, [r0], r1         ;load src data
-    vld1.u8         {d19}, [r0], r1
-    vld1.u8         {d20}, [r0], r1
-    vld1.u8         {d21}, [r0], r1
-    mov             r12, #4                 ;loop counter
-    vld1.u8         {d22}, [r0], r1
-
-secondpass_only_inner_loop_neon
-    vld1.u8         {d23}, [r0], r1         ;load src data
-    vld1.u8         {d24}, [r0], r1
-    vld1.u8         {d25}, [r0], r1
-    vld1.u8         {d26}, [r0], r1
-
-    vmull.u8        q3, d18, d0             ;(src_ptr[-2] * vp9_filter[0])
-    vmull.u8        q4, d19, d0
-    vmull.u8        q5, d20, d0
-    vmull.u8        q6, d21, d0
-
-    vmlsl.u8        q3, d19, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q4, d20, d1
-    vmlsl.u8        q5, d21, d1
-    vmlsl.u8        q6, d22, d1
-
-    vmlsl.u8        q3, d22, d4             ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q4, d23, d4
-    vmlsl.u8        q5, d24, d4
-    vmlsl.u8        q6, d25, d4
-
-    vmlal.u8        q3, d20, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q4, d21, d2
-    vmlal.u8        q5, d22, d2
-    vmlal.u8        q6, d23, d2
-
-    vmlal.u8        q3, d23, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmlal.u8        q4, d24, d5
-    vmlal.u8        q5, d25, d5
-    vmlal.u8        q6, d26, d5
-
-    vmull.u8        q7, d21, d3             ;(src_ptr[1] * vp9_filter[3])
-    vmull.u8        q8, d22, d3
-    vmull.u8        q9, d23, d3
-    vmull.u8        q10, d24, d3
-
-    subs            r12, r12, #1
-
-    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q4
-    vqadd.s16       q9, q5
-    vqadd.s16       q10, q6
-
-    vqrshrun.s16    d6, q7, #7              ;shift/round/saturate to u8
-    vqrshrun.s16    d7, q8, #7
-    vqrshrun.s16    d8, q9, #7
-    vqrshrun.s16    d9, q10, #7
-
-    vst1.u8         {d6}, [r4], r5          ;store result
-    vmov            q9, q11
-    vst1.u8         {d7}, [r4], r5
-    vmov            q10, q12
-    vst1.u8         {d8}, [r4], r5
-    vmov            d22, d26
-    vst1.u8         {d9}, [r4], r5
-
-    bne             secondpass_only_inner_loop_neon
-
-    subs            r3, r3, #1
-    sub             r0, r0, r1, lsl #4
-    sub             r0, r0, r1, lsl #2
-    sub             r0, r0, r1
-    add             r0, r0, #8
-
-    sub             r4, r4, r5, lsl #4
-    add             r4, r4, #8
-
-    bne filt_blk2d_spo16x16_outloop_neon
-
-    pop             {r4-r5,pc}
-
-    ENDP
-
-;-----------------
-    END
--- a/vp8/common/arm/neon/sixtappredict4x4_neon.asm
+++ /dev/null
@@ -1,422 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_sixtap_predict_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-filter4_coeff
-    DCD     0,  0,  128,    0,   0,  0,   0,  0
-    DCD     0, -6,  123,   12,  -1,  0,   0,  0
-    DCD     2, -11, 108,   36,  -8,  1,   0,  0
-    DCD     0, -9,   93,   50,  -6,  0,   0,  0
-    DCD     3, -16,  77,   77, -16,  3,   0,  0
-    DCD     0, -6,   50,   93,  -9,  0,   0,  0
-    DCD     1, -8,   36,  108, -11,  2,   0,  0
-    DCD     0, -1,   12,  123,  -6,  0,   0,  0
-
-; r0    unsigned char  *src_ptr,
-; r1    int  src_pixels_per_line,
-; r2    int  xoffset,
-; r3    int  yoffset,
-; stack(r4) unsigned char *dst_ptr,
-; stack(lr) int  dst_pitch
-
-|vp8_sixtap_predict_neon| PROC
-    push            {r4, lr}
-
-    adr             r12, filter4_coeff
-    ldr             r4, [sp, #8]            ;load parameters from stack
-    ldr             lr, [sp, #12]           ;load parameters from stack
-
-    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
-    beq             secondpass_filter4x4_only
-
-    add             r2, r12, r2, lsl #5     ;calculate filter location
-
-    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
-    vld1.s32        {q14, q15}, [r2]        ;load first_pass filter
-
-    beq             firstpass_filter4x4_only
-
-    vabs.s32        q12, q14                ;get abs(filter_parameters)
-    vabs.s32        q13, q15
-
-    sub             r0, r0, #2              ;go back 2 columns of src data
-    sub             r0, r0, r1, lsl #1      ;go back 2 lines of src data
-
-;First pass: output_height lines x output_width columns (9x4); 4 output rows + 5 extra for the 6-tap filter
-    vld1.u8         {q3}, [r0], r1          ;load first 4-line src data
-    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
-    vld1.u8         {q4}, [r0], r1
-    vdup.8          d1, d24[4]
-    vld1.u8         {q5}, [r0], r1
-    vdup.8          d2, d25[0]
-    vld1.u8         {q6}, [r0], r1
-    vdup.8          d3, d25[4]
-    vdup.8          d4, d26[0]
-    vdup.8          d5, d26[4]
-
-    pld             [r0]
-    pld             [r0, r1]
-    pld             [r0, r1, lsl #1]
-
-    vext.8          d18, d6, d7, #5         ;construct src_ptr[3]
-    vext.8          d19, d8, d9, #5
-    vext.8          d20, d10, d11, #5
-    vext.8          d21, d12, d13, #5
-
-    vswp            d7, d8                  ;discard 2nd half data after src_ptr[3] is done
-    vswp            d11, d12
-
-    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[3])
-    vzip.32         d20, d21
-    vmull.u8        q7, d18, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmull.u8        q8, d20, d5
-
-    vmov            q4, q3                  ;keep original src data in q4 q6
-    vmov            q6, q5
-
-    vzip.32         d6, d7                  ;construct src_ptr[-2], and put 2-line data together
-    vzip.32         d10, d11
-    vshr.u64        q9, q4, #8              ;construct src_ptr[-1]
-    vshr.u64        q10, q6, #8
-    vmlal.u8        q7, d6, d0              ;+(src_ptr[-2] * vp9_filter[0])
-    vmlal.u8        q8, d10, d0
-
-    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[-1])
-    vzip.32         d20, d21
-    vshr.u64        q3, q4, #32             ;construct src_ptr[2]
-    vshr.u64        q5, q6, #32
-    vmlsl.u8        q7, d18, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q8, d20, d1
-
-    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[2])
-    vzip.32         d10, d11
-    vshr.u64        q9, q4, #16             ;construct src_ptr[0]
-    vshr.u64        q10, q6, #16
-    vmlsl.u8        q7, d6, d4              ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q8, d10, d4
-
-    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[0])
-    vzip.32         d20, d21
-    vshr.u64        q3, q4, #24             ;construct src_ptr[1]
-    vshr.u64        q5, q6, #24
-    vmlal.u8        q7, d18, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q8, d20, d2
-
-    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[1])
-    vzip.32         d10, d11
-    vmull.u8        q9, d6, d3              ;(src_ptr[1] * vp9_filter[3])
-    vmull.u8        q10, d10, d3
-
-    vld1.u8         {q3}, [r0], r1          ;load the remaining 5 lines of src data
-    vld1.u8         {q4}, [r0], r1
-
-    vqadd.s16       q7, q9                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q10
-
-    vld1.u8         {q5}, [r0], r1
-    vld1.u8         {q6}, [r0], r1
-
-    vqrshrun.s16    d27, q7, #7             ;shift/round/saturate to u8
-    vqrshrun.s16    d28, q8, #7
-
-    ;First Pass on the remaining 5 lines of data
-    vld1.u8         {q11}, [r0], r1
-
-    vext.8          d18, d6, d7, #5         ;construct src_ptr[3]
-    vext.8          d19, d8, d9, #5
-    vext.8          d20, d10, d11, #5
-    vext.8          d21, d12, d13, #5
-
-    vswp            d7, d8                  ;discard 2nd half data after src_ptr[3] is done
-    vswp            d11, d12
-
-    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[3])
-    vzip.32         d20, d21
-    vext.8          d31, d22, d23, #5       ;construct src_ptr[3]
-    vmull.u8        q7, d18, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmull.u8        q8, d20, d5
-    vmull.u8        q12, d31, d5            ;(src_ptr[3] * vp9_filter[5])
-
-    vmov            q4, q3                  ;keep original src data in q4 q6
-    vmov            q6, q5
-
-    vzip.32         d6, d7                  ;construct src_ptr[-2], and put 2-line data together
-    vzip.32         d10, d11
-    vshr.u64        q9, q4, #8              ;construct src_ptr[-1]
-    vshr.u64        q10, q6, #8
-
-    vmlal.u8        q7, d6, d0              ;+(src_ptr[-2] * vp9_filter[0])
-    vmlal.u8        q8, d10, d0
-    vmlal.u8        q12, d22, d0            ;(src_ptr[-2] * vp9_filter[0])
-
-    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[-1])
-    vzip.32         d20, d21
-    vshr.u64        q3, q4, #32             ;construct src_ptr[2]
-    vshr.u64        q5, q6, #32
-    vext.8          d31, d22, d23, #1       ;construct src_ptr[-1]
-
-    vmlsl.u8        q7, d18, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q8, d20, d1
-    vmlsl.u8        q12, d31, d1            ;-(src_ptr[-1] * vp9_filter[1])
-
-    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[2])
-    vzip.32         d10, d11
-    vshr.u64        q9, q4, #16             ;construct src_ptr[0]
-    vshr.u64        q10, q6, #16
-    vext.8          d31, d22, d23, #4       ;construct src_ptr[2]
-
-    vmlsl.u8        q7, d6, d4              ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q8, d10, d4
-    vmlsl.u8        q12, d31, d4            ;-(src_ptr[2] * vp9_filter[4])
-
-    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[0])
-    vzip.32         d20, d21
-    vshr.u64        q3, q4, #24             ;construct src_ptr[1]
-    vshr.u64        q5, q6, #24
-    vext.8          d31, d22, d23, #2       ;construct src_ptr[0]
-
-    vmlal.u8        q7, d18, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q8, d20, d2
-    vmlal.u8        q12, d31, d2            ;(src_ptr[0] * vp9_filter[2])
-
-    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[1])
-    vzip.32         d10, d11
-    vext.8          d31, d22, d23, #3       ;construct src_ptr[1]
-    vmull.u8        q9, d6, d3              ;(src_ptr[1] * vp9_filter[3])
-    vmull.u8        q10, d10, d3
-    vmull.u8        q11, d31, d3            ;(src_ptr[1] * vp9_filter[3])
-
-    add             r3, r12, r3, lsl #5
-
-    vqadd.s16       q7, q9                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q10
-    vqadd.s16       q12, q11
-
-    vext.8          d23, d27, d28, #4
-    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
-
-    vqrshrun.s16    d29, q7, #7             ;shift/round/saturate to u8
-    vqrshrun.s16    d30, q8, #7
-    vqrshrun.s16    d31, q12, #7
-
-;Second pass: 4x4
-    vabs.s32        q7, q5
-    vabs.s32        q8, q6
-
-    vext.8          d24, d28, d29, #4
-    vext.8          d25, d29, d30, #4
-    vext.8          d26, d30, d31, #4
-
-    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
-    vdup.8          d1, d14[4]
-    vdup.8          d2, d15[0]
-    vdup.8          d3, d15[4]
-    vdup.8          d4, d16[0]
-    vdup.8          d5, d16[4]
-
-    vmull.u8        q3, d27, d0             ;(src_ptr[-2] * vp9_filter[0])
-    vmull.u8        q4, d28, d0
-
-    vmull.u8        q5, d25, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmull.u8        q6, d26, d5
-
-    vmlsl.u8        q3, d29, d4             ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q4, d30, d4
-
-    vmlsl.u8        q5, d23, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q6, d24, d1
-
-    vmlal.u8        q3, d28, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q4, d29, d2
-
-    vmlal.u8        q5, d24, d3             ;(src_ptr[1] * vp9_filter[3])
-    vmlal.u8        q6, d25, d3
-
-    add             r0, r4, lr
-    add             r1, r0, lr
-    add             r2, r1, lr
-
-    vqadd.s16       q5, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q6, q4
-
-    vqrshrun.s16    d3, q5, #7              ;shift/round/saturate to u8
-    vqrshrun.s16    d4, q6, #7
-
-    vst1.32         {d3[0]}, [r4]           ;store result
-    vst1.32         {d3[1]}, [r0]
-    vst1.32         {d4[0]}, [r1]
-    vst1.32         {d4[1]}, [r2]
-
-    pop             {r4, pc}
-
-
-;---------------------
-firstpass_filter4x4_only
-    vabs.s32        q12, q14                ;get abs(filter_parameters)
-    vabs.s32        q13, q15
-
-    sub             r0, r0, #2              ;go back 2 columns of src data
-
-;First pass: output_height lines x output_width columns (4x4)
-    vld1.u8         {q3}, [r0], r1          ;load first 4-line src data
-    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
-    vld1.u8         {q4}, [r0], r1
-    vdup.8          d1, d24[4]
-    vld1.u8         {q5}, [r0], r1
-    vdup.8          d2, d25[0]
-    vld1.u8         {q6}, [r0], r1
-
-    vdup.8          d3, d25[4]
-    vdup.8          d4, d26[0]
-    vdup.8          d5, d26[4]
-
-    vext.8          d18, d6, d7, #5         ;construct src_ptr[3]
-    vext.8          d19, d8, d9, #5
-    vext.8          d20, d10, d11, #5
-    vext.8          d21, d12, d13, #5
-
-    vswp            d7, d8                  ;discard 2nd half data after src_ptr[3] is done
-    vswp            d11, d12
-
-    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[3])
-    vzip.32         d20, d21
-    vmull.u8        q7, d18, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmull.u8        q8, d20, d5
-
-    vmov            q4, q3                  ;keep original src data in q4 q6
-    vmov            q6, q5
-
-    vzip.32         d6, d7                  ;construct src_ptr[-2], and put 2-line data together
-    vzip.32         d10, d11
-    vshr.u64        q9, q4, #8              ;construct src_ptr[-1]
-    vshr.u64        q10, q6, #8
-    vmlal.u8        q7, d6, d0              ;+(src_ptr[-2] * vp9_filter[0])
-    vmlal.u8        q8, d10, d0
-
-    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[-1])
-    vzip.32         d20, d21
-    vshr.u64        q3, q4, #32             ;construct src_ptr[2]
-    vshr.u64        q5, q6, #32
-    vmlsl.u8        q7, d18, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q8, d20, d1
-
-    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[2])
-    vzip.32         d10, d11
-    vshr.u64        q9, q4, #16             ;construct src_ptr[0]
-    vshr.u64        q10, q6, #16
-    vmlsl.u8        q7, d6, d4              ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q8, d10, d4
-
-    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[0])
-    vzip.32         d20, d21
-    vshr.u64        q3, q4, #24             ;construct src_ptr[1]
-    vshr.u64        q5, q6, #24
-    vmlal.u8        q7, d18, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q8, d20, d2
-
-    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[1])
-    vzip.32         d10, d11
-    vmull.u8        q9, d6, d3              ;(src_ptr[1] * vp9_filter[3])
-    vmull.u8        q10, d10, d3
-
-    add             r0, r4, lr
-    add             r1, r0, lr
-    add             r2, r1, lr
-
-    vqadd.s16       q7, q9                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q10
-
-    vqrshrun.s16    d27, q7, #7             ;shift/round/saturate to u8
-    vqrshrun.s16    d28, q8, #7
-
-    vst1.32         {d27[0]}, [r4]          ;store result
-    vst1.32         {d27[1]}, [r0]
-    vst1.32         {d28[0]}, [r1]
-    vst1.32         {d28[1]}, [r2]
-
-    pop             {r4, pc}
-
-
-;---------------------
-secondpass_filter4x4_only
-    sub             r0, r0, r1, lsl #1
-    add             r3, r12, r3, lsl #5
-
-    vld1.32         {d27[0]}, [r0], r1      ;load src data
-    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
-    vld1.32         {d27[1]}, [r0], r1
-    vabs.s32        q7, q5
-    vld1.32         {d28[0]}, [r0], r1
-    vabs.s32        q8, q6
-    vld1.32         {d28[1]}, [r0], r1
-    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
-    vld1.32         {d29[0]}, [r0], r1
-    vdup.8          d1, d14[4]
-    vld1.32         {d29[1]}, [r0], r1
-    vdup.8          d2, d15[0]
-    vld1.32         {d30[0]}, [r0], r1
-    vdup.8          d3, d15[4]
-    vld1.32         {d30[1]}, [r0], r1
-    vdup.8          d4, d16[0]
-    vld1.32         {d31[0]}, [r0], r1
-    vdup.8          d5, d16[4]
-
-    vext.8          d23, d27, d28, #4
-    vext.8          d24, d28, d29, #4
-    vext.8          d25, d29, d30, #4
-    vext.8          d26, d30, d31, #4
-
-    vmull.u8        q3, d27, d0             ;(src_ptr[-2] * vp9_filter[0])
-    vmull.u8        q4, d28, d0
-
-    vmull.u8        q5, d25, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmull.u8        q6, d26, d5
-
-    vmlsl.u8        q3, d29, d4             ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q4, d30, d4
-
-    vmlsl.u8        q5, d23, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q6, d24, d1
-
-    vmlal.u8        q3, d28, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q4, d29, d2
-
-    vmlal.u8        q5, d24, d3             ;(src_ptr[1] * vp9_filter[3])
-    vmlal.u8        q6, d25, d3
-
-    add             r0, r4, lr
-    add             r1, r0, lr
-    add             r2, r1, lr
-
-    vqadd.s16       q5, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q6, q4
-
-    vqrshrun.s16    d3, q5, #7              ;shift/round/saturate to u8
-    vqrshrun.s16    d4, q6, #7
-
-    vst1.32         {d3[0]}, [r4]           ;store result
-    vst1.32         {d3[1]}, [r0]
-    vst1.32         {d4[0]}, [r1]
-    vst1.32         {d4[1]}, [r2]
-
-    pop             {r4, pc}
-
-    ENDP
-
-;-----------------
-
-    END
--- a/vp8/common/arm/neon/sixtappredict8x4_neon.asm
+++ /dev/null
@@ -1,473 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_sixtap_predict8x4_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-filter8_coeff
-    DCD     0,  0,  128,    0,   0,  0,   0,  0
-    DCD     0, -6,  123,   12,  -1,  0,   0,  0
-    DCD     2, -11, 108,   36,  -8,  1,   0,  0
-    DCD     0, -9,   93,   50,  -6,  0,   0,  0
-    DCD     3, -16,  77,   77, -16,  3,   0,  0
-    DCD     0, -6,   50,   93,  -9,  0,   0,  0
-    DCD     1, -8,   36,  108, -11,  2,   0,  0
-    DCD     0, -1,   12,  123,  -6,  0,   0,  0
-
-; r0    unsigned char  *src_ptr,
-; r1    int  src_pixels_per_line,
-; r2    int  xoffset,
-; r3    int  yoffset,
-; r4    unsigned char *dst_ptr,
-; stack(r5) int  dst_pitch
-
-|vp8_sixtap_predict8x4_neon| PROC
-    push            {r4-r5, lr}
-
-    adr             r12, filter8_coeff
-    ldr             r4, [sp, #12]           ;load parameters from stack
-    ldr             r5, [sp, #16]           ;load parameters from stack
-
-    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
-    beq             secondpass_filter8x4_only
-
-    add             r2, r12, r2, lsl #5     ;calculate filter location
-
-    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
-
-    vld1.s32        {q14, q15}, [r2]        ;load first_pass filter
-
-    beq             firstpass_filter8x4_only
-
-    sub             sp, sp, #32             ;reserve space on stack for temporary storage
-    vabs.s32        q12, q14
-    vabs.s32        q13, q15
-
-    sub             r0, r0, #2              ;move srcptr back to (line-2) and (column-2)
-    mov             lr, sp
-    sub             r0, r0, r1, lsl #1
-
-    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
-    vdup.8          d1, d24[4]
-    vdup.8          d2, d25[0]
-
-;First pass: output_height lines x output_width columns (9x8); 4 output rows + 5 extra for the 6-tap filter
-    vld1.u8         {q3}, [r0], r1          ;load src data
-    vdup.8          d3, d25[4]
-    vld1.u8         {q4}, [r0], r1
-    vdup.8          d4, d26[0]
-    vld1.u8         {q5}, [r0], r1
-    vdup.8          d5, d26[4]
-    vld1.u8         {q6}, [r0], r1
-
-    pld             [r0]
-    pld             [r0, r1]
-    pld             [r0, r1, lsl #1]
-
-    vmull.u8        q7, d6, d0              ;(src_ptr[-2] * vp9_filter[0])
-    vmull.u8        q8, d8, d0
-    vmull.u8        q9, d10, d0
-    vmull.u8        q10, d12, d0
-
-    vext.8          d28, d6, d7, #1         ;construct src_ptr[-1]
-    vext.8          d29, d8, d9, #1
-    vext.8          d30, d10, d11, #1
-    vext.8          d31, d12, d13, #1
-
-    vmlsl.u8        q7, d28, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q8, d29, d1
-    vmlsl.u8        q9, d30, d1
-    vmlsl.u8        q10, d31, d1
-
-    vext.8          d28, d6, d7, #4         ;construct src_ptr[2]
-    vext.8          d29, d8, d9, #4
-    vext.8          d30, d10, d11, #4
-    vext.8          d31, d12, d13, #4
-
-    vmlsl.u8        q7, d28, d4             ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q8, d29, d4
-    vmlsl.u8        q9, d30, d4
-    vmlsl.u8        q10, d31, d4
-
-    vext.8          d28, d6, d7, #2         ;construct src_ptr[0]
-    vext.8          d29, d8, d9, #2
-    vext.8          d30, d10, d11, #2
-    vext.8          d31, d12, d13, #2
-
-    vmlal.u8        q7, d28, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q8, d29, d2
-    vmlal.u8        q9, d30, d2
-    vmlal.u8        q10, d31, d2
-
-    vext.8          d28, d6, d7, #5         ;construct src_ptr[3]
-    vext.8          d29, d8, d9, #5
-    vext.8          d30, d10, d11, #5
-    vext.8          d31, d12, d13, #5
-
-    vmlal.u8        q7, d28, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmlal.u8        q8, d29, d5
-    vmlal.u8        q9, d30, d5
-    vmlal.u8        q10, d31, d5
-
-    vext.8          d28, d6, d7, #3         ;construct src_ptr[1]
-    vext.8          d29, d8, d9, #3
-    vext.8          d30, d10, d11, #3
-    vext.8          d31, d12, d13, #3
-
-    vmull.u8        q3, d28, d3             ;(src_ptr[1] * vp9_filter[3])
-    vmull.u8        q4, d29, d3
-    vmull.u8        q5, d30, d3
-    vmull.u8        q6, d31, d3
-
-    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q4
-    vqadd.s16       q9, q5
-    vqadd.s16       q10, q6
-
-    vld1.u8         {q3}, [r0], r1          ;load src data
-
-    vqrshrun.s16    d22, q7, #7             ;shift/round/saturate to u8
-    vqrshrun.s16    d23, q8, #7
-    vqrshrun.s16    d24, q9, #7
-    vqrshrun.s16    d25, q10, #7
-
-    vld1.u8         {q4}, [r0], r1
-    vst1.u8         {d22}, [lr]!            ;store result
-    vld1.u8         {q5}, [r0], r1
-    vst1.u8         {d23}, [lr]!
-    vld1.u8         {q6}, [r0], r1
-    vst1.u8         {d24}, [lr]!
-    vld1.u8         {q7}, [r0], r1
-    vst1.u8         {d25}, [lr]!
-
-    ;first_pass filtering on the remaining 5 lines of data
-    vmull.u8        q8, d6, d0              ;(src_ptr[-2] * vp9_filter[0])
-    vmull.u8        q9, d8, d0
-    vmull.u8        q10, d10, d0
-    vmull.u8        q11, d12, d0
-    vmull.u8        q12, d14, d0
-
-    vext.8          d27, d6, d7, #1         ;construct src_ptr[-1]
-    vext.8          d28, d8, d9, #1
-    vext.8          d29, d10, d11, #1
-    vext.8          d30, d12, d13, #1
-    vext.8          d31, d14, d15, #1
-
-    vmlsl.u8        q8, d27, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q9, d28, d1
-    vmlsl.u8        q10, d29, d1
-    vmlsl.u8        q11, d30, d1
-    vmlsl.u8        q12, d31, d1
-
-    vext.8          d27, d6, d7, #4         ;construct src_ptr[2]
-    vext.8          d28, d8, d9, #4
-    vext.8          d29, d10, d11, #4
-    vext.8          d30, d12, d13, #4
-    vext.8          d31, d14, d15, #4
-
-    vmlsl.u8        q8, d27, d4             ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q9, d28, d4
-    vmlsl.u8        q10, d29, d4
-    vmlsl.u8        q11, d30, d4
-    vmlsl.u8        q12, d31, d4
-
-    vext.8          d27, d6, d7, #2         ;construct src_ptr[0]
-    vext.8          d28, d8, d9, #2
-    vext.8          d29, d10, d11, #2
-    vext.8          d30, d12, d13, #2
-    vext.8          d31, d14, d15, #2
-
-    vmlal.u8        q8, d27, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q9, d28, d2
-    vmlal.u8        q10, d29, d2
-    vmlal.u8        q11, d30, d2
-    vmlal.u8        q12, d31, d2
-
-    vext.8          d27, d6, d7, #5         ;construct src_ptr[3]
-    vext.8          d28, d8, d9, #5
-    vext.8          d29, d10, d11, #5
-    vext.8          d30, d12, d13, #5
-    vext.8          d31, d14, d15, #5
-
-    vmlal.u8        q8, d27, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmlal.u8        q9, d28, d5
-    vmlal.u8        q10, d29, d5
-    vmlal.u8        q11, d30, d5
-    vmlal.u8        q12, d31, d5
-
-    vext.8          d27, d6, d7, #3         ;construct src_ptr[1]
-    vext.8          d28, d8, d9, #3
-    vext.8          d29, d10, d11, #3
-    vext.8          d30, d12, d13, #3
-    vext.8          d31, d14, d15, #3
-
-    vmull.u8        q3, d27, d3             ;(src_ptr[1] * vp9_filter[3])
-    vmull.u8        q4, d28, d3
-    vmull.u8        q5, d29, d3
-    vmull.u8        q6, d30, d3
-    vmull.u8        q7, d31, d3
-
-    vqadd.s16       q8, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q9, q4
-    vqadd.s16       q10, q5
-    vqadd.s16       q11, q6
-    vqadd.s16       q12, q7
-
-    vqrshrun.s16    d26, q8, #7             ;shift/round/saturate to u8
-    vqrshrun.s16    d27, q9, #7
-    vqrshrun.s16    d28, q10, #7
-    vqrshrun.s16    d29, q11, #7
-    vqrshrun.s16    d30, q12, #7
-
-;Second pass: 8x4
-;secondpass_filter
-    add             r3, r12, r3, lsl #5
-    sub             lr, lr, #32
-
-    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
-    vld1.u8         {q11}, [lr]!
-
-    vabs.s32        q7, q5
-    vabs.s32        q8, q6
-
-    vld1.u8         {q12}, [lr]!
-
-    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
-    vdup.8          d1, d14[4]
-    vdup.8          d2, d15[0]
-    vdup.8          d3, d15[4]
-    vdup.8          d4, d16[0]
-    vdup.8          d5, d16[4]
-
-    vmull.u8        q3, d22, d0             ;(src_ptr[-2] * vp9_filter[0])
-    vmull.u8        q4, d23, d0
-    vmull.u8        q5, d24, d0
-    vmull.u8        q6, d25, d0
-
-    vmlsl.u8        q3, d23, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q4, d24, d1
-    vmlsl.u8        q5, d25, d1
-    vmlsl.u8        q6, d26, d1
-
-    vmlsl.u8        q3, d26, d4             ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q4, d27, d4
-    vmlsl.u8        q5, d28, d4
-    vmlsl.u8        q6, d29, d4
-
-    vmlal.u8        q3, d24, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q4, d25, d2
-    vmlal.u8        q5, d26, d2
-    vmlal.u8        q6, d27, d2
-
-    vmlal.u8        q3, d27, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmlal.u8        q4, d28, d5
-    vmlal.u8        q5, d29, d5
-    vmlal.u8        q6, d30, d5
-
-    vmull.u8        q7, d25, d3             ;(src_ptr[1] * vp9_filter[3])
-    vmull.u8        q8, d26, d3
-    vmull.u8        q9, d27, d3
-    vmull.u8        q10, d28, d3
-
-    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q4
-    vqadd.s16       q9, q5
-    vqadd.s16       q10, q6
-
-    vqrshrun.s16    d6, q7, #7              ;shift/round/saturate to u8
-    vqrshrun.s16    d7, q8, #7
-    vqrshrun.s16    d8, q9, #7
-    vqrshrun.s16    d9, q10, #7
-
-    vst1.u8         {d6}, [r4], r5          ;store result
-    vst1.u8         {d7}, [r4], r5
-    vst1.u8         {d8}, [r4], r5
-    vst1.u8         {d9}, [r4], r5
-
-    add             sp, sp, #32
-    pop             {r4-r5,pc}
-
-;--------------------
-firstpass_filter8x4_only
-    vabs.s32        q12, q14
-    vabs.s32        q13, q15
-
-    sub             r0, r0, #2              ;move srcptr back to (line-2) and (column-2)
-    vld1.u8         {q3}, [r0], r1          ;load src data
-
-    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
-    vld1.u8         {q4}, [r0], r1
-    vdup.8          d1, d24[4]
-    vld1.u8         {q5}, [r0], r1
-    vdup.8          d2, d25[0]
-    vld1.u8         {q6}, [r0], r1
-    vdup.8          d3, d25[4]
-    vdup.8          d4, d26[0]
-    vdup.8          d5, d26[4]
-
-;First pass: output_height lines x output_width columns (4x8)
-    pld             [r0]
-    pld             [r0, r1]
-    pld             [r0, r1, lsl #1]
-
-    vmull.u8        q7, d6, d0              ;(src_ptr[-2] * vp9_filter[0])
-    vmull.u8        q8, d8, d0
-    vmull.u8        q9, d10, d0
-    vmull.u8        q10, d12, d0
-
-    vext.8          d28, d6, d7, #1         ;construct src_ptr[-1]
-    vext.8          d29, d8, d9, #1
-    vext.8          d30, d10, d11, #1
-    vext.8          d31, d12, d13, #1
-
-    vmlsl.u8        q7, d28, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q8, d29, d1
-    vmlsl.u8        q9, d30, d1
-    vmlsl.u8        q10, d31, d1
-
-    vext.8          d28, d6, d7, #4         ;construct src_ptr[2]
-    vext.8          d29, d8, d9, #4
-    vext.8          d30, d10, d11, #4
-    vext.8          d31, d12, d13, #4
-
-    vmlsl.u8        q7, d28, d4             ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q8, d29, d4
-    vmlsl.u8        q9, d30, d4
-    vmlsl.u8        q10, d31, d4
-
-    vext.8          d28, d6, d7, #2         ;construct src_ptr[0]
-    vext.8          d29, d8, d9, #2
-    vext.8          d30, d10, d11, #2
-    vext.8          d31, d12, d13, #2
-
-    vmlal.u8        q7, d28, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q8, d29, d2
-    vmlal.u8        q9, d30, d2
-    vmlal.u8        q10, d31, d2
-
-    vext.8          d28, d6, d7, #5         ;construct src_ptr[3]
-    vext.8          d29, d8, d9, #5
-    vext.8          d30, d10, d11, #5
-    vext.8          d31, d12, d13, #5
-
-    vmlal.u8        q7, d28, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmlal.u8        q8, d29, d5
-    vmlal.u8        q9, d30, d5
-    vmlal.u8        q10, d31, d5
-
-    vext.8          d28, d6, d7, #3         ;construct src_ptr[1]
-    vext.8          d29, d8, d9, #3
-    vext.8          d30, d10, d11, #3
-    vext.8          d31, d12, d13, #3
-
-    vmull.u8        q3, d28, d3             ;(src_ptr[1] * vp9_filter[3])
-    vmull.u8        q4, d29, d3
-    vmull.u8        q5, d30, d3
-    vmull.u8        q6, d31, d3
-
-    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q4
-    vqadd.s16       q9, q5
-    vqadd.s16       q10, q6
-
-    vqrshrun.s16    d22, q7, #7             ;shift/round/saturate to u8
-    vqrshrun.s16    d23, q8, #7
-    vqrshrun.s16    d24, q9, #7
-    vqrshrun.s16    d25, q10, #7
-
-    vst1.u8         {d22}, [r4], r5         ;store result
-    vst1.u8         {d23}, [r4], r5
-    vst1.u8         {d24}, [r4], r5
-    vst1.u8         {d25}, [r4], r5
-
-    pop             {r4-r5,pc}
-
-;---------------------
-secondpass_filter8x4_only
-;Second pass: 8x4
-    add             r3, r12, r3, lsl #5
-    sub             r0, r0, r1, lsl #1
-    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
-    vabs.s32        q7, q5
-    vabs.s32        q8, q6
-
-    vld1.u8         {d22}, [r0], r1
-    vld1.u8         {d23}, [r0], r1
-    vld1.u8         {d24}, [r0], r1
-    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
-    vld1.u8         {d25}, [r0], r1
-    vdup.8          d1, d14[4]
-    vld1.u8         {d26}, [r0], r1
-    vdup.8          d2, d15[0]
-    vld1.u8         {d27}, [r0], r1
-    vdup.8          d3, d15[4]
-    vld1.u8         {d28}, [r0], r1
-    vdup.8          d4, d16[0]
-    vld1.u8         {d29}, [r0], r1
-    vdup.8          d5, d16[4]
-    vld1.u8         {d30}, [r0], r1
-
-    vmull.u8        q3, d22, d0             ;(src_ptr[-2] * vp9_filter[0])
-    vmull.u8        q4, d23, d0
-    vmull.u8        q5, d24, d0
-    vmull.u8        q6, d25, d0
-
-    vmlsl.u8        q3, d23, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q4, d24, d1
-    vmlsl.u8        q5, d25, d1
-    vmlsl.u8        q6, d26, d1
-
-    vmlsl.u8        q3, d26, d4             ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q4, d27, d4
-    vmlsl.u8        q5, d28, d4
-    vmlsl.u8        q6, d29, d4
-
-    vmlal.u8        q3, d24, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q4, d25, d2
-    vmlal.u8        q5, d26, d2
-    vmlal.u8        q6, d27, d2
-
-    vmlal.u8        q3, d27, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmlal.u8        q4, d28, d5
-    vmlal.u8        q5, d29, d5
-    vmlal.u8        q6, d30, d5
-
-    vmull.u8        q7, d25, d3             ;(src_ptr[1] * vp9_filter[3])
-    vmull.u8        q8, d26, d3
-    vmull.u8        q9, d27, d3
-    vmull.u8        q10, d28, d3
-
-    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q4
-    vqadd.s16       q9, q5
-    vqadd.s16       q10, q6
-
-    vqrshrun.s16    d6, q7, #7              ;shift/round/saturate to u8
-    vqrshrun.s16    d7, q8, #7
-    vqrshrun.s16    d8, q9, #7
-    vqrshrun.s16    d9, q10, #7
-
-    vst1.u8         {d6}, [r4], r5          ;store result
-    vst1.u8         {d7}, [r4], r5
-    vst1.u8         {d8}, [r4], r5
-    vst1.u8         {d9}, [r4], r5
-
-    pop             {r4-r5,pc}
-
-    ENDP
-
-;-----------------
-
-    END
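Both six-tap predictors in this file implement the same separable scheme: a horizontal pass over (output_height + 5) source rows into a temporary buffer, then a vertical pass over that buffer, with each pass rounded back to 8 bits by vqrshrun.s16 #7 (add 64, shift right by 7, saturate to [0, 255]). A minimal C sketch of that computation, assuming signed taps taken from one filter8_coeff row (function and variable names here are illustrative, not from the tree):

static unsigned char clamp8(int v) {
  return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* src points at the top-left pixel of the block; each tap row sums to 128. */
static void sixtap_8xH_c(const unsigned char *src, int src_stride,
                         unsigned char *dst, int dst_pitch,
                         const int hf[6], const int vf[6], int h) {
  unsigned char tmp[(8 + 5) * 8];   /* (h + 5) rows of 8, h <= 8 */
  int r, c, k;

  /* First pass: horizontal filter, starting at src_ptr[-2] of row -2. */
  for (r = 0; r < h + 5; r++)
    for (c = 0; c < 8; c++) {
      int sum = 0;
      for (k = 0; k < 6; k++)
        sum += hf[k] * src[(r - 2) * src_stride + (c + k - 2)];
      tmp[r * 8 + c] = clamp8((sum + 64) >> 7);   /* vqrshrun.s16 #7 */
    }

  /* Second pass: vertical filter over the intermediate rows. */
  for (r = 0; r < h; r++)
    for (c = 0; c < 8; c++) {
      int sum = 0;
      for (k = 0; k < 6; k++)
        sum += vf[k] * tmp[(r + k) * 8 + c];
      dst[r * dst_pitch + c] = clamp8((sum + 64) >> 7);
    }
}

The assembly reaches the same signs by taking vabs of the coefficients and using vmlsl (multiply-subtract) for the negative taps 1 and 4.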
--- a/vp8/common/arm/neon/sixtappredict8x8_neon.asm
+++ /dev/null
@@ -1,524 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_sixtap_predict8x8_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-filter8_coeff
-    DCD     0,  0,  128,    0,   0,  0,   0,  0
-    DCD     0, -6,  123,   12,  -1,  0,   0,  0
-    DCD     2, -11, 108,   36,  -8,  1,   0,  0
-    DCD     0, -9,   93,   50,  -6,  0,   0,  0
-    DCD     3, -16,  77,   77, -16,  3,   0,  0
-    DCD     0, -6,   50,   93,  -9,  0,   0,  0
-    DCD     1, -8,   36,  108, -11,  2,   0,  0
-    DCD     0, -1,   12,  123,  -6,  0,   0,  0
-
-; r0    unsigned char  *src_ptr,
-; r1    int  src_pixels_per_line,
-; r2    int  xoffset,
-; r3    int  yoffset,
-; stack(r4) unsigned char *dst_ptr,
-; stack(r5) int  dst_pitch
-
-|vp8_sixtap_predict8x8_neon| PROC
-    push            {r4-r5, lr}
-
-    adr             r12, filter8_coeff
-
-    ldr             r4, [sp, #12]           ;load parameters from stack
-    ldr             r5, [sp, #16]           ;load parameters from stack
-
-    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
-    beq             secondpass_filter8x8_only
-
-    add             r2, r12, r2, lsl #5     ;calculate filter location
-
-    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
-
-    vld1.s32        {q14, q15}, [r2]        ;load first_pass filter
-
-    beq             firstpass_filter8x8_only
-
-    sub             sp, sp, #64             ;reserve space on stack for temporary storage
-    mov             lr, sp
-
-    vabs.s32        q12, q14
-    vabs.s32        q13, q15
-
-    mov             r2, #2                  ;loop counter
-    sub             r0, r0, #2              ;move srcptr back to (line-2) and (column-2)
-    sub             r0, r0, r1, lsl #1
-
-    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
-    vdup.8          d1, d24[4]
-    vdup.8          d2, d25[0]
-
-;First pass: output_height lines x output_width columns (13x8)
-    vld1.u8         {q3}, [r0], r1          ;load src data
-    vdup.8          d3, d25[4]
-    vld1.u8         {q4}, [r0], r1
-    vdup.8          d4, d26[0]
-    vld1.u8         {q5}, [r0], r1
-    vdup.8          d5, d26[4]
-    vld1.u8         {q6}, [r0], r1
-
-filt_blk2d_fp8x8_loop_neon
-    pld             [r0]
-    pld             [r0, r1]
-    pld             [r0, r1, lsl #1]
-
-    vmull.u8        q7, d6, d0              ;(src_ptr[-2] * vp9_filter[0])
-    vmull.u8        q8, d8, d0
-    vmull.u8        q9, d10, d0
-    vmull.u8        q10, d12, d0
-
-    vext.8          d28, d6, d7, #1         ;construct src_ptr[-1]
-    vext.8          d29, d8, d9, #1
-    vext.8          d30, d10, d11, #1
-    vext.8          d31, d12, d13, #1
-
-    vmlsl.u8        q7, d28, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q8, d29, d1
-    vmlsl.u8        q9, d30, d1
-    vmlsl.u8        q10, d31, d1
-
-    vext.8          d28, d6, d7, #4         ;construct src_ptr[2]
-    vext.8          d29, d8, d9, #4
-    vext.8          d30, d10, d11, #4
-    vext.8          d31, d12, d13, #4
-
-    vmlsl.u8        q7, d28, d4             ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q8, d29, d4
-    vmlsl.u8        q9, d30, d4
-    vmlsl.u8        q10, d31, d4
-
-    vext.8          d28, d6, d7, #2         ;construct src_ptr[0]
-    vext.8          d29, d8, d9, #2
-    vext.8          d30, d10, d11, #2
-    vext.8          d31, d12, d13, #2
-
-    vmlal.u8        q7, d28, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q8, d29, d2
-    vmlal.u8        q9, d30, d2
-    vmlal.u8        q10, d31, d2
-
-    vext.8          d28, d6, d7, #5         ;construct src_ptr[3]
-    vext.8          d29, d8, d9, #5
-    vext.8          d30, d10, d11, #5
-    vext.8          d31, d12, d13, #5
-
-    vmlal.u8        q7, d28, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmlal.u8        q8, d29, d5
-    vmlal.u8        q9, d30, d5
-    vmlal.u8        q10, d31, d5
-
-    vext.8          d28, d6, d7, #3         ;construct src_ptr[1]
-    vext.8          d29, d8, d9, #3
-    vext.8          d30, d10, d11, #3
-    vext.8          d31, d12, d13, #3
-
-    vmull.u8        q3, d28, d3             ;(src_ptr[1] * vp9_filter[3])
-    vmull.u8        q4, d29, d3
-    vmull.u8        q5, d30, d3
-    vmull.u8        q6, d31, d3
-
-    subs            r2, r2, #1
-
-    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q4
-    vqadd.s16       q9, q5
-    vqadd.s16       q10, q6
-
-    vld1.u8         {q3}, [r0], r1          ;load src data
-
-    vqrshrun.s16    d22, q7, #7             ;shift/round/saturate to u8
-    vqrshrun.s16    d23, q8, #7
-    vqrshrun.s16    d24, q9, #7
-    vqrshrun.s16    d25, q10, #7
-
-    vst1.u8         {d22}, [lr]!            ;store result
-    vld1.u8         {q4}, [r0], r1
-    vst1.u8         {d23}, [lr]!
-    vld1.u8         {q5}, [r0], r1
-    vst1.u8         {d24}, [lr]!
-    vld1.u8         {q6}, [r0], r1
-    vst1.u8         {d25}, [lr]!
-
-    bne             filt_blk2d_fp8x8_loop_neon
-
-    ;first_pass filtering on the remaining 5 lines of data
-    ;vld1.u8            {q3}, [r0], r1          ;load src data
-    ;vld1.u8            {q4}, [r0], r1
-    ;vld1.u8            {q5}, [r0], r1
-    ;vld1.u8            {q6}, [r0], r1
-    vld1.u8         {q7}, [r0], r1
-
-    vmull.u8        q8, d6, d0              ;(src_ptr[-2] * vp9_filter[0])
-    vmull.u8        q9, d8, d0
-    vmull.u8        q10, d10, d0
-    vmull.u8        q11, d12, d0
-    vmull.u8        q12, d14, d0
-
-    vext.8          d27, d6, d7, #1         ;construct src_ptr[-1]
-    vext.8          d28, d8, d9, #1
-    vext.8          d29, d10, d11, #1
-    vext.8          d30, d12, d13, #1
-    vext.8          d31, d14, d15, #1
-
-    vmlsl.u8        q8, d27, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q9, d28, d1
-    vmlsl.u8        q10, d29, d1
-    vmlsl.u8        q11, d30, d1
-    vmlsl.u8        q12, d31, d1
-
-    vext.8          d27, d6, d7, #4         ;construct src_ptr[2]
-    vext.8          d28, d8, d9, #4
-    vext.8          d29, d10, d11, #4
-    vext.8          d30, d12, d13, #4
-    vext.8          d31, d14, d15, #4
-
-    vmlsl.u8        q8, d27, d4             ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q9, d28, d4
-    vmlsl.u8        q10, d29, d4
-    vmlsl.u8        q11, d30, d4
-    vmlsl.u8        q12, d31, d4
-
-    vext.8          d27, d6, d7, #2         ;construct src_ptr[0]
-    vext.8          d28, d8, d9, #2
-    vext.8          d29, d10, d11, #2
-    vext.8          d30, d12, d13, #2
-    vext.8          d31, d14, d15, #2
-
-    vmlal.u8        q8, d27, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q9, d28, d2
-    vmlal.u8        q10, d29, d2
-    vmlal.u8        q11, d30, d2
-    vmlal.u8        q12, d31, d2
-
-    vext.8          d27, d6, d7, #5         ;construct src_ptr[3]
-    vext.8          d28, d8, d9, #5
-    vext.8          d29, d10, d11, #5
-    vext.8          d30, d12, d13, #5
-    vext.8          d31, d14, d15, #5
-
-    vmlal.u8        q8, d27, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmlal.u8        q9, d28, d5
-    vmlal.u8        q10, d29, d5
-    vmlal.u8        q11, d30, d5
-    vmlal.u8        q12, d31, d5
-
-    vext.8          d27, d6, d7, #3         ;construct src_ptr[1]
-    vext.8          d28, d8, d9, #3
-    vext.8          d29, d10, d11, #3
-    vext.8          d30, d12, d13, #3
-    vext.8          d31, d14, d15, #3
-
-    vmull.u8        q3, d27, d3             ;(src_ptr[1] * vp9_filter[3])
-    vmull.u8        q4, d28, d3
-    vmull.u8        q5, d29, d3
-    vmull.u8        q6, d30, d3
-    vmull.u8        q7, d31, d3
-
-    vqadd.s16       q8, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q9, q4
-    vqadd.s16       q10, q5
-    vqadd.s16       q11, q6
-    vqadd.s16       q12, q7
-
-    add             r3, r12, r3, lsl #5
-
-    vqrshrun.s16    d26, q8, #7             ;shift/round/saturate to u8
-    sub             lr, lr, #64
-    vqrshrun.s16    d27, q9, #7
-    vld1.u8         {q9}, [lr]!             ;load intermediate data from stack
-    vqrshrun.s16    d28, q10, #7
-    vld1.u8         {q10}, [lr]!
-
-    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
-
-    vqrshrun.s16    d29, q11, #7
-    vld1.u8         {q11}, [lr]!
-
-    vabs.s32        q7, q5
-    vabs.s32        q8, q6
-
-    vqrshrun.s16    d30, q12, #7
-    vld1.u8         {q12}, [lr]!
-
-;Second pass: 8x8
-    mov             r3, #2                  ;loop counter
-
-    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
-    vdup.8          d1, d14[4]
-    vdup.8          d2, d15[0]
-    vdup.8          d3, d15[4]
-    vdup.8          d4, d16[0]
-    vdup.8          d5, d16[4]
-
-filt_blk2d_sp8x8_loop_neon
-    vmull.u8        q3, d18, d0             ;(src_ptr[-2] * vp9_filter[0])
-    vmull.u8        q4, d19, d0
-    vmull.u8        q5, d20, d0
-    vmull.u8        q6, d21, d0
-
-    vmlsl.u8        q3, d19, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q4, d20, d1
-    vmlsl.u8        q5, d21, d1
-    vmlsl.u8        q6, d22, d1
-
-    vmlsl.u8        q3, d22, d4             ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q4, d23, d4
-    vmlsl.u8        q5, d24, d4
-    vmlsl.u8        q6, d25, d4
-
-    vmlal.u8        q3, d20, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q4, d21, d2
-    vmlal.u8        q5, d22, d2
-    vmlal.u8        q6, d23, d2
-
-    vmlal.u8        q3, d23, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmlal.u8        q4, d24, d5
-    vmlal.u8        q5, d25, d5
-    vmlal.u8        q6, d26, d5
-
-    vmull.u8        q7, d21, d3             ;(src_ptr[1] * vp9_filter[3])
-    vmull.u8        q8, d22, d3
-    vmull.u8        q9, d23, d3
-    vmull.u8        q10, d24, d3
-
-    subs            r3, r3, #1
-
-    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q4
-    vqadd.s16       q9, q5
-    vqadd.s16       q10, q6
-
-    vqrshrun.s16    d6, q7, #7              ;shift/round/saturate to u8
-    vqrshrun.s16    d7, q8, #7
-    vqrshrun.s16    d8, q9, #7
-    vqrshrun.s16    d9, q10, #7
-
-    vmov            q9, q11
-    vst1.u8         {d6}, [r4], r5          ;store result
-    vmov            q10, q12
-    vst1.u8         {d7}, [r4], r5
-    vmov            q11, q13
-    vst1.u8         {d8}, [r4], r5
-    vmov            q12, q14
-    vst1.u8         {d9}, [r4], r5
-    vmov            d26, d30
-
-    bne filt_blk2d_sp8x8_loop_neon
-
-    add             sp, sp, #64
-    pop             {r4-r5,pc}
-
-;---------------------
-firstpass_filter8x8_only
-    ;add                r2, r12, r2, lsl #5     ;calculate filter location
-    ;vld1.s32       {q14, q15}, [r2]        ;load first_pass filter
-    vabs.s32        q12, q14
-    vabs.s32        q13, q15
-
-    mov             r2, #2                  ;loop counter
-    sub             r0, r0, #2              ;move srcptr back to (line-2) and (column-2)
-
-    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
-    vdup.8          d1, d24[4]
-    vdup.8          d2, d25[0]
-    vdup.8          d3, d25[4]
-    vdup.8          d4, d26[0]
-    vdup.8          d5, d26[4]
-
-;First pass: output_height lines x output_width columns (8x8)
-filt_blk2d_fpo8x8_loop_neon
-    vld1.u8         {q3}, [r0], r1          ;load src data
-    vld1.u8         {q4}, [r0], r1
-    vld1.u8         {q5}, [r0], r1
-    vld1.u8         {q6}, [r0], r1
-
-    pld             [r0]
-    pld             [r0, r1]
-    pld             [r0, r1, lsl #1]
-
-    vmull.u8        q7, d6, d0              ;(src_ptr[-2] * vp9_filter[0])
-    vmull.u8        q8, d8, d0
-    vmull.u8        q9, d10, d0
-    vmull.u8        q10, d12, d0
-
-    vext.8          d28, d6, d7, #1         ;construct src_ptr[-1]
-    vext.8          d29, d8, d9, #1
-    vext.8          d30, d10, d11, #1
-    vext.8          d31, d12, d13, #1
-
-    vmlsl.u8        q7, d28, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q8, d29, d1
-    vmlsl.u8        q9, d30, d1
-    vmlsl.u8        q10, d31, d1
-
-    vext.8          d28, d6, d7, #4         ;construct src_ptr[2]
-    vext.8          d29, d8, d9, #4
-    vext.8          d30, d10, d11, #4
-    vext.8          d31, d12, d13, #4
-
-    vmlsl.u8        q7, d28, d4             ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q8, d29, d4
-    vmlsl.u8        q9, d30, d4
-    vmlsl.u8        q10, d31, d4
-
-    vext.8          d28, d6, d7, #2         ;construct src_ptr[0]
-    vext.8          d29, d8, d9, #2
-    vext.8          d30, d10, d11, #2
-    vext.8          d31, d12, d13, #2
-
-    vmlal.u8        q7, d28, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q8, d29, d2
-    vmlal.u8        q9, d30, d2
-    vmlal.u8        q10, d31, d2
-
-    vext.8          d28, d6, d7, #5         ;construct src_ptr[3]
-    vext.8          d29, d8, d9, #5
-    vext.8          d30, d10, d11, #5
-    vext.8          d31, d12, d13, #5
-
-    vmlal.u8        q7, d28, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmlal.u8        q8, d29, d5
-    vmlal.u8        q9, d30, d5
-    vmlal.u8        q10, d31, d5
-
-    vext.8          d28, d6, d7, #3         ;construct src_ptr[1]
-    vext.8          d29, d8, d9, #3
-    vext.8          d30, d10, d11, #3
-    vext.8          d31, d12, d13, #3
-
-    vmull.u8        q3, d28, d3             ;(src_ptr[1] * vp9_filter[3])
-    vmull.u8        q4, d29, d3
-    vmull.u8        q5, d30, d3
-    vmull.u8        q6, d31, d3
-
-    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q4
-    vqadd.s16       q9, q5
-    vqadd.s16       q10, q6
-
-    subs            r2, r2, #1
-
-    vqrshrun.s16    d22, q7, #7             ;shift/round/saturate to u8
-    vqrshrun.s16    d23, q8, #7
-    vqrshrun.s16    d24, q9, #7
-    vqrshrun.s16    d25, q10, #7
-
-    vst1.u8         {d22}, [r4], r5         ;store result
-    vst1.u8         {d23}, [r4], r5
-    vst1.u8         {d24}, [r4], r5
-    vst1.u8         {d25}, [r4], r5
-
-    bne             filt_blk2d_fpo8x8_loop_neon
-
-    pop             {r4-r5,pc}
-
-;---------------------
-secondpass_filter8x8_only
-    sub             r0, r0, r1, lsl #1
-    add             r3, r12, r3, lsl #5
-
-    vld1.u8         {d18}, [r0], r1         ;load src data
-    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
-    vld1.u8         {d19}, [r0], r1
-    vabs.s32        q7, q5
-    vld1.u8         {d20}, [r0], r1
-    vabs.s32        q8, q6
-    vld1.u8         {d21}, [r0], r1
-    mov             r3, #2                  ;loop counter
-    vld1.u8         {d22}, [r0], r1
-    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
-    vld1.u8         {d23}, [r0], r1
-    vdup.8          d1, d14[4]
-    vld1.u8         {d24}, [r0], r1
-    vdup.8          d2, d15[0]
-    vld1.u8         {d25}, [r0], r1
-    vdup.8          d3, d15[4]
-    vld1.u8         {d26}, [r0], r1
-    vdup.8          d4, d16[0]
-    vld1.u8         {d27}, [r0], r1
-    vdup.8          d5, d16[4]
-    vld1.u8         {d28}, [r0], r1
-    vld1.u8         {d29}, [r0], r1
-    vld1.u8         {d30}, [r0], r1
-
-;Second pass: 8x8
-filt_blk2d_spo8x8_loop_neon
-    vmull.u8        q3, d18, d0             ;(src_ptr[-2] * vp9_filter[0])
-    vmull.u8        q4, d19, d0
-    vmull.u8        q5, d20, d0
-    vmull.u8        q6, d21, d0
-
-    vmlsl.u8        q3, d19, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q4, d20, d1
-    vmlsl.u8        q5, d21, d1
-    vmlsl.u8        q6, d22, d1
-
-    vmlsl.u8        q3, d22, d4             ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q4, d23, d4
-    vmlsl.u8        q5, d24, d4
-    vmlsl.u8        q6, d25, d4
-
-    vmlal.u8        q3, d20, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q4, d21, d2
-    vmlal.u8        q5, d22, d2
-    vmlal.u8        q6, d23, d2
-
-    vmlal.u8        q3, d23, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmlal.u8        q4, d24, d5
-    vmlal.u8        q5, d25, d5
-    vmlal.u8        q6, d26, d5
-
-    vmull.u8        q7, d21, d3             ;(src_ptr[1] * vp9_filter[3])
-    vmull.u8        q8, d22, d3
-    vmull.u8        q9, d23, d3
-    vmull.u8        q10, d24, d3
-
-    subs            r3, r3, #1
-
-    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q4
-    vqadd.s16       q9, q5
-    vqadd.s16       q10, q6
-
-    vqrshrun.s16    d6, q7, #7              ;shift/round/saturate to u8
-    vqrshrun.s16    d7, q8, #7
-    vqrshrun.s16    d8, q9, #7
-    vqrshrun.s16    d9, q10, #7
-
-    vmov            q9, q11
-    vst1.u8         {d6}, [r4], r5          ;store result
-    vmov            q10, q12
-    vst1.u8         {d7}, [r4], r5
-    vmov            q11, q13
-    vst1.u8         {d8}, [r4], r5
-    vmov            q12, q14
-    vst1.u8         {d9}, [r4], r5
-    vmov            d26, d30
-
-    bne filt_blk2d_spo8x8_loop_neon
-
-    pop             {r4-r5,pc}
-
-    ENDP
-
-;-----------------
-
-    END
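The first-pass dimensions noted in the comments (9x8 for the 8x4 predictor, 13x8 here) are just the vertical filter's footprint; as a one-line sketch:

/* Source rows needed when an N-tap vertical filter produces out_h rows: */
static int first_pass_rows(int out_h, int taps) { return out_h + taps - 1; }
/* 8x8: 8 + 6 - 1 = 13;  8x4: 4 + 6 - 1 = 9. */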
--- a/vp8/common/arm/recon_arm.h
+++ /dev/null
@@ -1,90 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef RECON_ARM_H
-#define RECON_ARM_H
-
-#if HAVE_ARMV6
-extern prototype_recon_block(vp9_recon_b_armv6);
-extern prototype_recon_block(vp9_recon2b_armv6);
-extern prototype_recon_block(vp9_recon4b_armv6);
-
-extern prototype_copy_block(vp9_copy_mem8x8_v6);
-extern prototype_copy_block(vp9_copy_mem8x4_v6);
-extern prototype_copy_block(vp9_copy_mem16x16_v6);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_recon_recon
-#define vp8_recon_recon vp9_recon_b_armv6
-
-#undef  vp8_recon_recon2
-#define vp8_recon_recon2 vp9_recon2b_armv6
-
-#undef  vp8_recon_recon4
-#define vp8_recon_recon4 vp9_recon4b_armv6
-
-#undef  vp8_recon_copy8x8
-#define vp8_recon_copy8x8 vp9_copy_mem8x8_v6
-
-#undef  vp8_recon_copy8x4
-#define vp8_recon_copy8x4 vp9_copy_mem8x4_v6
-
-#undef  vp8_recon_copy16x16
-#define vp8_recon_copy16x16 vp9_copy_mem16x16_v6
-#endif
-#endif
-
-#if HAVE_ARMV7
-extern prototype_recon_block(vp9_recon_b_neon);
-extern prototype_recon_block(vp9_recon2b_neon);
-extern prototype_recon_block(vp9_recon4b_neon);
-
-extern prototype_copy_block(vp9_copy_mem8x8_neon);
-extern prototype_copy_block(vp9_copy_mem8x4_neon);
-extern prototype_copy_block(vp9_copy_mem16x16_neon);
-
-extern prototype_recon_macroblock(vp9_recon_mb_neon);
-
-extern prototype_build_intra_predictors(vp9_build_intra_predictors_mby_neon);
-extern prototype_build_intra_predictors(vp9_build_intra_predictors_mby_s_neon);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_recon_recon
-#define vp8_recon_recon vp9_recon_b_neon
-
-#undef  vp8_recon_recon2
-#define vp8_recon_recon2 vp9_recon2b_neon
-
-#undef  vp8_recon_recon4
-#define vp8_recon_recon4 vp9_recon4b_neon
-
-#undef  vp8_recon_copy8x8
-#define vp8_recon_copy8x8 vp9_copy_mem8x8_neon
-
-#undef  vp8_recon_copy8x4
-#define vp8_recon_copy8x4 vp9_copy_mem8x4_neon
-
-#undef  vp8_recon_copy16x16
-#define vp8_recon_copy16x16 vp9_copy_mem16x16_neon
-
-#undef  vp8_recon_recon_mb
-#define vp8_recon_recon_mb vp9_recon_mb_neon
-
-#undef  vp9_recon_build_intra_predictors_mby
-#define vp9_recon_build_intra_predictors_mby vp9_build_intra_predictors_mby_neon
-
-#undef  vp9_recon_build_intra_predictors_mby_s
-#define vp9_recon_build_intra_predictors_mby_s vp9_build_intra_predictors_mby_s_neon
-
-#endif
-#endif
-
-#endif
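The #undef/#define pairs above are the compile-time side of the dispatch scheme: a generic header binds each vp8_recon_* name to a default, and a platform header like this one rebinds it when runtime CPU detection is off. A hedged sketch of the shape of that mechanism (the fallback name, exact signature, and vtable layout below are assumptions, not copied from the tree):

/* Generic-header side (sketch): a default binding each platform header
   may override, exactly as recon_arm.h does above. */
#ifndef vp8_recon_recon
#define vp8_recon_recon vp9_recon_b_c            /* plain C fallback */
#endif

/* With CONFIG_RUNTIME_CPU_DETECT, the same names are assumed to resolve
   through a table of function pointers filled in after probing the CPU: */
typedef struct {
  void (*recon)(unsigned char *pred, short *diff,
                unsigned char *dst, int pitch);  /* shape assumed */
} recon_rtcd_vtable_t;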
--- a/vp8/common/arm/reconintra_arm.c
+++ /dev/null
@@ -1,62 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "vp8/common/blockd.h"
-#include "vp8/common/reconintra.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp8/common/recon.h"
-
-#if HAVE_ARMV7
-extern void vp9_build_intra_predictors_mby_neon_func(
-  unsigned char *y_buffer,
-  unsigned char *ypred_ptr,
-  int y_stride,
-  int mode,
-  int Up,
-  int Left);
-
-void vp9_build_intra_predictors_mby_neon(MACROBLOCKD *xd) {
-  unsigned char *y_buffer = xd->dst.y_buffer;
-  unsigned char *ypred_ptr = xd->predictor;
-  int y_stride = xd->dst.y_stride;
-  int mode = xd->mode_info_context->mbmi.mode;
-  int Up = xd->up_available;
-  int Left = xd->left_available;
-
-  vp9_build_intra_predictors_mby_neon_func(y_buffer, ypred_ptr,
-                                           y_stride, mode, Up, Left);
-}
-#endif
-
-
-#if HAVE_ARMV7
-extern void vp9_build_intra_predictors_mby_s_neon_func(
-  unsigned char *y_buffer,
-  unsigned char *ypred_ptr,
-  int y_stride,
-  int mode,
-  int Up,
-  int Left);
-
-void vp9_build_intra_predictors_mby_s_neon(MACROBLOCKD *xd) {
-  unsigned char *y_buffer = xd->dst.y_buffer;
-  unsigned char *ypred_ptr = xd->predictor;
-  int y_stride = xd->dst.y_stride;
-  int mode = xd->mode_info_context->mbmi.mode;
-  int Up = xd->up_available;
-  int Left = xd->left_available;
-
-  vp9_build_intra_predictors_mby_s_neon_func(y_buffer, ypred_ptr,
-                                             y_stride, mode, Up, Left);
-}
-
-#endif
--- a/vp8/common/arm/subpixel_arm.h
+++ /dev/null
@@ -1,89 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef SUBPIXEL_ARM_H
-#define SUBPIXEL_ARM_H
-
-#if HAVE_ARMV6
-extern prototype_subpixel_predict(vp9_sixtap_predict16x16_armv6);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x8_armv6);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x4_armv6);
-extern prototype_subpixel_predict(vp9_sixtap_predict_armv6);
-extern prototype_subpixel_predict(vp9_bilinear_predict16x16_armv6);
-extern prototype_subpixel_predict(vp9_bilinear_predict8x8_armv6);
-extern prototype_subpixel_predict(vp9_bilinear_predict8x4_armv6);
-extern prototype_subpixel_predict(vp9_bilinear_predict4x4_armv6);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp9_subpix_sixtap16x16
-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_armv6
-
-#undef  vp9_subpix_sixtap8x8
-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_armv6
-
-#undef  vp9_subpix_sixtap8x4
-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_armv6
-
-#undef  vp9_subpix_sixtap4x4
-#define vp9_subpix_sixtap4x4 vp9_sixtap_predict_armv6
-
-#undef  vp9_subpix_bilinear16x16
-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_armv6
-
-#undef  vp9_subpix_bilinear8x8
-#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_armv6
-
-#undef  vp9_subpix_bilinear8x4
-#define vp9_subpix_bilinear8x4 vp9_bilinear_predict8x4_armv6
-
-#undef  vp9_subpix_bilinear4x4
-#define vp9_subpix_bilinear4x4 vp9_bilinear_predict4x4_armv6
-#endif
-#endif
-
-#if HAVE_ARMV7
-extern prototype_subpixel_predict(vp9_sixtap_predict16x16_neon);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x8_neon);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x4_neon);
-extern prototype_subpixel_predict(vp9_sixtap_predict_neon);
-extern prototype_subpixel_predict(vp9_bilinear_predict16x16_neon);
-extern prototype_subpixel_predict(vp9_bilinear_predict8x8_neon);
-extern prototype_subpixel_predict(vp9_bilinear_predict8x4_neon);
-extern prototype_subpixel_predict(vp9_bilinear_predict4x4_neon);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp9_subpix_sixtap16x16
-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_neon
-
-#undef  vp9_subpix_sixtap8x8
-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_neon
-
-#undef  vp9_subpix_sixtap8x4
-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_neon
-
-#undef  vp9_subpix_sixtap4x4
-#define vp9_subpix_sixtap4x4 vp9_sixtap_predict_neon
-
-#undef  vp9_subpix_bilinear16x16
-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_neon
-
-#undef  vp9_subpix_bilinear8x8
-#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_neon
-
-#undef  vp9_subpix_bilinear8x4
-#define vp9_subpix_bilinear8x4 vp9_bilinear_predict8x4_neon
-
-#undef  vp9_subpix_bilinear4x4
-#define vp9_subpix_bilinear4x4 vp9_bilinear_predict4x4_neon
-#endif
-#endif
-
-#endif
--- a/vp8/common/asm_com_offsets.c
+++ /dev/null
@@ -1,40 +1,0 @@
-/*
- *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_config.h"
-#include "vpx/vpx_codec.h"
-#include "vpx_ports/asm_offsets.h"
-#include "vpx_scale/yv12config.h"
-
-BEGIN
-
-/* vpx_scale */
-DEFINE(yv12_buffer_config_y_width,              offsetof(YV12_BUFFER_CONFIG, y_width));
-DEFINE(yv12_buffer_config_y_height,             offsetof(YV12_BUFFER_CONFIG, y_height));
-DEFINE(yv12_buffer_config_y_stride,             offsetof(YV12_BUFFER_CONFIG, y_stride));
-DEFINE(yv12_buffer_config_uv_width,             offsetof(YV12_BUFFER_CONFIG, uv_width));
-DEFINE(yv12_buffer_config_uv_height,            offsetof(YV12_BUFFER_CONFIG, uv_height));
-DEFINE(yv12_buffer_config_uv_stride,            offsetof(YV12_BUFFER_CONFIG, uv_stride));
-DEFINE(yv12_buffer_config_y_buffer,             offsetof(YV12_BUFFER_CONFIG, y_buffer));
-DEFINE(yv12_buffer_config_u_buffer,             offsetof(YV12_BUFFER_CONFIG, u_buffer));
-DEFINE(yv12_buffer_config_v_buffer,             offsetof(YV12_BUFFER_CONFIG, v_buffer));
-DEFINE(yv12_buffer_config_border,               offsetof(YV12_BUFFER_CONFIG, border));
-DEFINE(VP8BORDERINPIXELS_VAL,                   VP8BORDERINPIXELS);
-
-END
-
-/* add asserts for any offset that is not supported by assembly code */
-/* add asserts for any size that is not supported by assembly code */
-
-#if HAVE_ARMV7
-/* vp8_yv12_extend_frame_borders_neon makes several assumptions based on this */
-ct_assert(VP8BORDERINPIXELS_VAL, VP8BORDERINPIXELS == 32)
-#endif
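This file exists so assembly can share struct layouts with C: each DEFINE() is compiled, and the build scrapes the resulting constants into an include file for the assembler. A sketch of the equivalence being relied on (the EQU spelling of the generated include is an assumption):

#include <stddef.h>
#include "vpx_scale/yv12config.h"

/* The generated assembler include is assumed to contain lines equivalent to
 *     yv12_buffer_config_y_stride EQU <offsetof(YV12_BUFFER_CONFIG, y_stride)>
 * letting ARM code address fields symbolically:
 *     ldr r2, [r0, #yv12_buffer_config_y_stride]  ; r0 = YV12_BUFFER_CONFIG *
 * which matches the address the C compiler computes for: */
static int y_stride_of(const YV12_BUFFER_CONFIG *cfg) {
  return cfg->y_stride;
}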
--- a/vp8/common/blockd.c
+++ /dev/null
@@ -1,29 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "blockd.h"
-#include "vpx_mem/vpx_mem.h"
-
-
-const unsigned char vp9_block2left[25] = {
-  0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
-};
-const unsigned char vp9_block2above[25] = {
-  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8
-};
-
-const unsigned char vp9_block2left_8x8[25] = {
-  0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8
-};
-const unsigned char vp9_block2above_8x8[25] = {
-  0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8
-};
-
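These tables map a block index (16 Y + 4 U + 4 V + 1 Y2 = 25) to a slot in the per-macroblock entropy context. The assumed consumer pattern, using the ENTROPY_CONTEXT_PLANES type declared in blockd.h below, flattens the struct (y1[0..3], u[0..1], v[0..1], y2 = 9 contexts) and indexes it directly; a sketch:

#include "vp8/common/blockd.h"  /* pre-rename path; MACROBLOCKD, ENTROPY_CONTEXT */

static void block_contexts(MACROBLOCKD *xd, int b,
                           ENTROPY_CONTEXT **a, ENTROPY_CONTEXT **l) {
  *a = (ENTROPY_CONTEXT *)xd->above_context + vp9_block2above[b];
  *l = (ENTROPY_CONTEXT *)xd->left_context  + vp9_block2left[b];
}
/* e.g. b == 18 (a U block): vp9_block2left[18] == 5, i.e. the u[1] context. */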
--- a/vp8/common/blockd.h
+++ /dev/null
@@ -1,518 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_BLOCKD_H
-#define __INC_BLOCKD_H
-
-void vpx_log(const char *format, ...);
-
-#include "vpx_ports/config.h"
-#include "vpx_scale/yv12config.h"
-#include "mv.h"
-#include "treecoder.h"
-#include "subpixel.h"
-#include "vpx_ports/mem.h"
-#include "common.h"
-
-#define TRUE    1
-#define FALSE   0
-
-// #define MODE_STATS
-
-/*#define DCPRED 1*/
-#define DCPREDSIMTHRESH 0
-#define DCPREDCNTTHRESH 3
-
-#define MB_FEATURE_TREE_PROBS   3
-#define PREDICTION_PROBS 3
-
-#define MBSKIP_CONTEXTS 3
-
-#define MAX_MB_SEGMENTS         4
-
-#define MAX_REF_LF_DELTAS       4
-#define MAX_MODE_LF_DELTAS      4
-
-/* Segment Feature Masks */
-#define SEGMENT_DELTADATA   0
-#define SEGMENT_ABSDATA     1
-#if CONFIG_NEWBESTREFMV || CONFIG_NEW_MVREF
-#define MAX_MV_REFS 19
-#endif
-
-typedef struct {
-  int r, c;
-} POS;
-
-typedef enum PlaneType {
-  PLANE_TYPE_Y_NO_DC = 0,
-  PLANE_TYPE_Y2,
-  PLANE_TYPE_UV,
-  PLANE_TYPE_Y_WITH_DC,
-} PLANE_TYPE;
-
-typedef char ENTROPY_CONTEXT;
-typedef struct {
-  ENTROPY_CONTEXT y1[4];
-  ENTROPY_CONTEXT u[2];
-  ENTROPY_CONTEXT v[2];
-  ENTROPY_CONTEXT y2;
-} ENTROPY_CONTEXT_PLANES;
-
-extern const unsigned char vp9_block2left[25];
-extern const unsigned char vp9_block2above[25];
-extern const unsigned char vp9_block2left_8x8[25];
-extern const unsigned char vp9_block2above_8x8[25];
-
-#define VP9_COMBINEENTROPYCONTEXTS( Dest, A, B) \
-  Dest = ((A)!=0) + ((B)!=0);
-
-typedef enum {
-  KEY_FRAME = 0,
-  INTER_FRAME = 1
-} FRAME_TYPE;
-
-typedef enum
-{
-  SIXTAP   = 0,
-  BILINEAR = 1,
-  EIGHTTAP = 2,
-  EIGHTTAP_SHARP = 3,
-  SWITCHABLE  /* should be the last one */
-} INTERPOLATIONFILTERTYPE;
-
-typedef enum
-{
-  DC_PRED,            /* average of above and left pixels */
-  V_PRED,             /* vertical prediction */
-  H_PRED,             /* horizontal prediction */
-  D45_PRED,           /* Directional 45 deg prediction  [anti-clockwise from 0 deg hor] */
-  D135_PRED,          /* Directional 135 deg prediction [anti-clockwise from 0 deg hor] */
-  D117_PRED,          /* Directional 117 deg prediction [anti-clockwise from 0 deg hor] */
-  D153_PRED,          /* Directional 153 deg prediction [anti-clockwise from 0 deg hor] */
-  D27_PRED,           /* Directional 27 deg prediction  [anti-clockwise from 0 deg hor] */
-  D63_PRED,           /* Directional 63 deg prediction  [anti-clockwise from 0 deg hor] */
-  TM_PRED,            /* Truemotion prediction */
-  I8X8_PRED,          /* 8x8 based prediction, each 8x8 has its own prediction mode */
-  B_PRED,             /* block based prediction, each block has its own prediction mode */
-
-  NEARESTMV,
-  NEARMV,
-  ZEROMV,
-  NEWMV,
-  SPLITMV,
-
-  MB_MODE_COUNT
-} MB_PREDICTION_MODE;
-
-// Segment level features.
-typedef enum {
-  SEG_LVL_ALT_Q = 0,               // Use alternate quantizer
-  SEG_LVL_ALT_LF = 1,              // Use alternate loop filter value
-  SEG_LVL_REF_FRAME = 2,           // Optional Segment reference frame
-  SEG_LVL_MODE = 3,                // Optional Segment mode
-  SEG_LVL_EOB = 4,                 // EOB end stop marker.
-  SEG_LVL_TRANSFORM = 5,           // Block transform size.
-  SEG_LVL_MAX = 6                  // Number of MB level features supported
-
-} SEG_LVL_FEATURES;
-
-// Supported transform sizes.
-typedef enum {
-  TX_4X4,                      // 4x4 dct transform
-  TX_8X8,                      // 8x8 dct transform
-  TX_16X16,                    // 16x16 dct transform
-  TX_SIZE_MAX                  // Number of different transforms available
-} TX_SIZE;
-
-typedef enum {
-  DCT_DCT   = 0,                      // DCT  in both horizontal and vertical
-  ADST_DCT  = 1,                      // ADST in vertical, DCT in horizontal
-  DCT_ADST  = 2,                      // DCT  in vertical, ADST in horizontal
-  ADST_ADST = 3                       // ADST in both directions
-} TX_TYPE;
-
-#define VP9_YMODES  (B_PRED + 1)
-#define VP9_UV_MODES (TM_PRED + 1)
-#define VP9_I8X8_MODES (TM_PRED + 1)
-#define VP9_I32X32_MODES (TM_PRED + 1)
-
-#define VP9_MVREFS (1 + SPLITMV - NEARESTMV)
-
-typedef enum {
-  B_DC_PRED,          /* average of above and left pixels */
-  B_TM_PRED,
-
-  B_VE_PRED,           /* vertical prediction */
-  B_HE_PRED,           /* horizontal prediction */
-
-  B_LD_PRED,
-  B_RD_PRED,
-
-  B_VR_PRED,
-  B_VL_PRED,
-  B_HD_PRED,
-  B_HU_PRED,
-
-  LEFT4X4,
-  ABOVE4X4,
-  ZERO4X4,
-  NEW4X4,
-
-  B_MODE_COUNT
-} B_PREDICTION_MODE;
-
-#define VP9_BINTRAMODES (B_HU_PRED + 1)  /* 10 */
-#define VP9_SUBMVREFS (1 + NEW4X4 - LEFT4X4)
-
-typedef enum {
-  PARTITIONING_16X8 = 0,
-  PARTITIONING_8X16,
-  PARTITIONING_8X8,
-  PARTITIONING_4X4,
-  NB_PARTITIONINGS,
-} SPLITMV_PARTITIONING_TYPE;
-
-/* For keyframes, intra block modes are predicted by the (already decoded)
-   modes for the Y blocks to the left and above us; for interframes, there
-   is a single probability table. */
-
-union b_mode_info {
-  struct {
-    B_PREDICTION_MODE first;
-    TX_TYPE           tx_type;
-
-#if CONFIG_COMP_INTRA_PRED
-    B_PREDICTION_MODE second;
-#endif
-  } as_mode;
-  struct {
-    int_mv first;
-    int_mv second;
-  } as_mv;
-};
-
-typedef enum {
-  INTRA_FRAME = 0,
-  LAST_FRAME = 1,
-  GOLDEN_FRAME = 2,
-  ALTREF_FRAME = 3,
-  MAX_REF_FRAMES = 4
-} MV_REFERENCE_FRAME;
-
-typedef struct {
-  MB_PREDICTION_MODE mode, uv_mode;
-#if CONFIG_COMP_INTRA_PRED
-  MB_PREDICTION_MODE second_mode, second_uv_mode;
-#endif
-  MV_REFERENCE_FRAME ref_frame, second_ref_frame;
-  TX_SIZE txfm_size;
-  int_mv mv[2]; // for each reference frame used
-#if CONFIG_NEWBESTREFMV || CONFIG_NEW_MVREF
-  int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REFS];
-#endif
-
-  SPLITMV_PARTITIONING_TYPE partitioning;
-  unsigned char mb_skip_coeff;                                /* does this MB have any coefficients? 1 = no coefficients, 0 = tokens must be decoded */
-  unsigned char need_to_clamp_mvs;
-  unsigned char need_to_clamp_secondmv;
-  unsigned char segment_id;                  /* Which set of segmentation parameters should be used for this MB */
-
-  // Flags used for the prediction status of various bitstream signals
-  unsigned char seg_id_predicted;
-  unsigned char ref_predicted;
-
-  // Indicates if the mb is part of the image (1) vs border (0)
-  // This can be useful in determining whether the MB provides
-  // a valid predictor
-  unsigned char mb_in_image;
-
-#if CONFIG_PRED_FILTER
-  // Flag to turn the prediction signal filter on (1) / off (0) at the MB level
-  unsigned int pred_filter_enabled;
-#endif
-  INTERPOLATIONFILTERTYPE interp_filter;
-
-#if CONFIG_SUPERBLOCKS
-  // FIXME need a SB array of 4 MB_MODE_INFOs that
-  // only needs one encoded_as_sb.
-  unsigned char encoded_as_sb;
-#endif
-} MB_MODE_INFO;
-
-typedef struct {
-  MB_MODE_INFO mbmi;
-  union b_mode_info bmi[16];
-} MODE_INFO;
-
-typedef struct blockd {
-  short *qcoeff;
-  short *dqcoeff;
-  unsigned char  *predictor;
-  short *diff;
-  short *dequant;
-
-  /* 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries */
-  unsigned char **base_pre;
-  unsigned char **base_second_pre;
-  int pre;
-  int pre_stride;
-
-  unsigned char **base_dst;
-  int dst;
-  int dst_stride;
-
-  int eob;
-
-  union b_mode_info bmi;
-} BLOCKD;
-
-typedef struct macroblockd {
-  DECLARE_ALIGNED(16, short, diff[400]);      /* from idct diff */
-  DECLARE_ALIGNED(16, unsigned char,  predictor[384]);
-  DECLARE_ALIGNED(16, short, qcoeff[400]);
-  DECLARE_ALIGNED(16, short, dqcoeff[400]);
-  DECLARE_ALIGNED(16, char,  eobs[25]);
-
-  /* 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries. */
-  BLOCKD block[25];
-  int fullpixel_mask;
-
-  YV12_BUFFER_CONFIG pre; /* Filtered copy of previous frame reconstruction */
-  struct {
-    uint8_t *y_buffer, *u_buffer, *v_buffer;
-  } second_pre;
-  YV12_BUFFER_CONFIG dst;
-
-  MODE_INFO *prev_mode_info_context;
-  MODE_INFO *mode_info_context;
-  int mode_info_stride;
-
-  FRAME_TYPE frame_type;
-
-  int up_available;
-  int left_available;
-
-  /* Y,U,V,Y2 */
-  ENTROPY_CONTEXT_PLANES *above_context;
-  ENTROPY_CONTEXT_PLANES *left_context;
-
-  /* 0 indicates segmentation at MB level is not enabled. Otherwise the individual bits indicate which features are active. */
-  unsigned char segmentation_enabled;
-
-  /* 0 (do not update) 1 (update) the macroblock segmentation map. */
-  unsigned char update_mb_segmentation_map;
-
-  /* 0 (do not update) 1 (update) the macroblock segmentation feature data. */
-  unsigned char update_mb_segmentation_data;
-
-  /* 0 (SEGMENT_DELTADATA) feature data is coded as deltas, 1 (SEGMENT_ABSDATA) as absolute values. */
-  unsigned char mb_segment_abs_delta;
-
-  /* Per-frame flags that define which MB-level features (such as quantizer or loop filter level) */
-  /* are enabled and, when enabled, the probabilities used to decode the per-MB flags in MB_MODE_INFO */
-
-  // Probability Tree used to code Segment number
-  vp9_prob mb_segment_tree_probs[MB_FEATURE_TREE_PROBS];
-
-#if CONFIG_NEW_MVREF
-  vp9_prob mb_mv_ref_id_probs[MAX_REF_FRAMES][3];
-#endif
-
-  // Segment features
-  signed char segment_feature_data[MAX_MB_SEGMENTS][SEG_LVL_MAX];
-  unsigned int segment_feature_mask[MAX_MB_SEGMENTS];
-
-  /* mode_based Loop filter adjustment */
-  unsigned char mode_ref_lf_delta_enabled;
-  unsigned char mode_ref_lf_delta_update;
-
-  /* Delta values have the range +/- MAX_LOOP_FILTER */
-  signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS];                /* 0 = Intra, Last, GF, ARF */
-  signed char ref_lf_deltas[MAX_REF_LF_DELTAS];                     /* 0 = Intra, Last, GF, ARF */
-  signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];              /* 0 = BPRED, ZERO_MV, MV, SPLIT */
-  signed char mode_lf_deltas[MAX_MODE_LF_DELTAS];                   /* 0 = BPRED, ZERO_MV, MV, SPLIT */
-
-  /* Distance of MB away from frame edges */
-  int mb_to_left_edge;
-  int mb_to_right_edge;
-  int mb_to_top_edge;
-  int mb_to_bottom_edge;
-
-  unsigned int frames_since_golden;
-  unsigned int frames_till_alt_ref_frame;
-  vp9_subpix_fn_t  subpixel_predict;
-  vp9_subpix_fn_t  subpixel_predict8x4;
-  vp9_subpix_fn_t  subpixel_predict8x8;
-  vp9_subpix_fn_t  subpixel_predict16x16;
-  vp9_subpix_fn_t  subpixel_predict_avg;
-  vp9_subpix_fn_t  subpixel_predict_avg8x4;
-  vp9_subpix_fn_t  subpixel_predict_avg8x8;
-  vp9_subpix_fn_t  subpixel_predict_avg16x16;
-  int allow_high_precision_mv;
-
-  int corrupted;
-
-#if !CONFIG_SUPERBLOCKS && (ARCH_X86 || ARCH_X86_64)
-  /* This is an intermediate buffer currently used in sub-pixel motion search
-   * to keep a copy of the reference area. This buffer can be used for other
-   * purposes.
-   */
-  DECLARE_ALIGNED(32, unsigned char, y_buf[22 * 32]);
-#endif
-
-#if CONFIG_RUNTIME_CPU_DETECT
-  struct VP9_COMMON_RTCD  *rtcd;
-#endif
-
-  int mb_index;   // Index of the MB in the SB (0..3)
-  int q_index;
-
-} MACROBLOCKD;
-
-#define ACTIVE_HT 110                // quantization stepsize threshold
-
-#define ACTIVE_HT8 300
-
-#define ACTIVE_HT16 300
-
-// convert MB_PREDICTION_MODE to B_PREDICTION_MODE
-static B_PREDICTION_MODE pred_mode_conv(MB_PREDICTION_MODE mode) {
-  B_PREDICTION_MODE b_mode;
-  switch (mode) {
-    case DC_PRED:
-      b_mode = B_DC_PRED;
-      break;
-    case V_PRED:
-      b_mode = B_VE_PRED;
-      break;
-    case H_PRED:
-      b_mode = B_HE_PRED;
-      break;
-    case TM_PRED:
-      b_mode = B_TM_PRED;
-      break;
-    case D45_PRED:
-      b_mode = B_LD_PRED;
-      break;
-    case D135_PRED:
-      b_mode = B_RD_PRED;
-      break;
-    case D117_PRED:
-      b_mode = B_VR_PRED;
-      break;
-    case D153_PRED:
-      b_mode = B_HD_PRED;
-      break;
-    case D27_PRED:
-      b_mode = B_HU_PRED;
-      break;
-    case D63_PRED:
-      b_mode = B_VL_PRED;
-      break;
-    default :
-      // for debugging purposes; to be removed after full testing
-      assert(0);
-      break;
-  }
-  return b_mode;
-}
-
-// transform mapping
-static TX_TYPE txfm_map(B_PREDICTION_MODE bmode) {
-  // map transform type
-  TX_TYPE tx_type;
-  switch (bmode) {
-    case B_TM_PRED :
-    case B_RD_PRED :
-      tx_type = ADST_ADST;
-      break;
-
-    case B_VE_PRED :
-    case B_VR_PRED :
-      tx_type = ADST_DCT;
-      break;
-
-    case B_HE_PRED :
-    case B_HD_PRED :
-    case B_HU_PRED :
-      tx_type = DCT_ADST;
-      break;
-
-    default :
-      tx_type = DCT_DCT;
-      break;
-  }
-  return tx_type;
-}
-
-static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, const BLOCKD *b) {
-  TX_TYPE tx_type = DCT_DCT;
-  if (xd->mode_info_context->mbmi.mode == B_PRED &&
-      xd->q_index < ACTIVE_HT) {
-    tx_type = txfm_map(b->bmi.as_mode.first);
-  }
-  return tx_type;
-}
-
-static TX_TYPE get_tx_type_8x8(const MACROBLOCKD *xd, const BLOCKD *b) {
-  TX_TYPE tx_type = DCT_DCT;
-  if (xd->mode_info_context->mbmi.mode == I8X8_PRED &&
-      xd->q_index < ACTIVE_HT8) {
-    tx_type = txfm_map(pred_mode_conv(b->bmi.as_mode.first));
-  }
-  return tx_type;
-}
-
-static TX_TYPE get_tx_type_16x16(const MACROBLOCKD *xd, const BLOCKD *b) {
-  TX_TYPE tx_type = DCT_DCT;
-  if (xd->mode_info_context->mbmi.mode < I8X8_PRED &&
-      xd->q_index < ACTIVE_HT16) {
-    tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode));
-  }
-  return tx_type;
-}
-
-static TX_TYPE get_tx_type(const MACROBLOCKD *xd, const BLOCKD *b) {
-  TX_TYPE tx_type = DCT_DCT;
-  int ib = (b - xd->block);
-  if (ib >= 16)
-    return tx_type;
-  if (xd->mode_info_context->mbmi.txfm_size == TX_16X16) {
-    tx_type = get_tx_type_16x16(xd, b);
-  }
-  if (xd->mode_info_context->mbmi.txfm_size  == TX_8X8) {
-    ib = (ib & 8) + ((ib & 4) >> 1);
-    tx_type = get_tx_type_8x8(xd, &xd->block[ib]);
-  }
-  if (xd->mode_info_context->mbmi.txfm_size  == TX_4X4) {
-    tx_type = get_tx_type_4x4(xd, b);
-  }
-  return tx_type;
-}
-
-extern void vp9_build_block_doffsets(MACROBLOCKD *xd);
-extern void vp9_setup_block_dptrs(MACROBLOCKD *xd);
-
-static void update_blockd_bmi(MACROBLOCKD *xd) {
-  int i;
-  int is_4x4;
-  is_4x4 = (xd->mode_info_context->mbmi.mode == SPLITMV) ||
-           (xd->mode_info_context->mbmi.mode == I8X8_PRED) ||
-           (xd->mode_info_context->mbmi.mode == B_PRED);
-
-  if (is_4x4) {
-    for (i = 0; i < 16; i++) {
-      xd->block[i].bmi = xd->mode_info_context->bmi[i];
-    }
-  }
-}
-#endif  /* __INC_BLOCKD_H */
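One subtlety above is the remap in get_tx_type() for TX_8X8: ib = (ib & 8) + ((ib & 4) >> 1). Assuming the usual layout in this codebase, where each 8x8 transform's 64 coefficients occupy four consecutive 16-entry blocks (0-3, 4-7, 8-11, 12-15) while per-8x8 mode info lives at bmi slots 0, 2, 8 and 10, the expression converts a coefficient-block index into the matching bmi slot; a worked check:

static int tx8x8_bmi_slot(int ib) {
  return (ib & 8) + ((ib & 4) >> 1);
}
/* tx8x8_bmi_slot(0..3)  == 0,  tx8x8_bmi_slot(4..7)   == 2,
   tx8x8_bmi_slot(8..11) == 8,  tx8x8_bmi_slot(12..15) == 10 */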
--- a/vp8/common/coefupdateprobs.h
+++ /dev/null
@@ -1,16 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/* Update probabilities for the nodes in the token entropy tree.
-   Generated file included by entropy.c */
-#define COEF_UPDATE_PROB 252
-#define COEF_UPDATE_PROB_8X8 252
-#define COEF_UPDATE_PROB_16X16 252
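
These constants set how cheaply the frame header can say "no change": each tree node's update flag is coded with probability 252/256, so leaving a probability untouched costs a small fraction of a bit, while an actual update pays for the flag plus an 8-bit literal. A decoder-side sketch of that pattern; vp9_read() and vp9_read_literal() are assumed bool-decoder primitives here, not confirmed API:

/* Sketch: conditionally refresh one node probability (assumed API). */
static void sketch_maybe_update_node(BOOL_DECODER *bd, vp9_prob *p) {
  if (vp9_read(bd, COEF_UPDATE_PROB))        /* rare: "update" flag set */
    *p = (vp9_prob)vp9_read_literal(bd, 8);  /* read replacement prob  */
}
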
--- a/vp8/common/common.h
+++ /dev/null
@@ -1,41 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef common_h
-#define common_h 1
-
-#include <assert.h>
-#include "vpx_config.h"
-/* Interface header for common constant data structures and lookup tables */
-
-#include "vpx_mem/vpx_mem.h"
-
-#include "common_types.h"
-
-/* Only need this for fixed-size arrays, for structs just assign. */
-
-#define vp9_copy( Dest, Src) { \
-    assert( sizeof( Dest) == sizeof( Src)); \
-    vpx_memcpy( Dest, Src, sizeof( Src)); \
-  }
-
-/* Use this for variably-sized arrays. */
-
-#define vp9_copy_array( Dest, Src, N) { \
-    assert( sizeof( *Dest) == sizeof( *Src)); \
-    vpx_memcpy( Dest, Src, N * sizeof( *Src)); \
-  }
-
-#define vp9_zero( Dest)  vpx_memset( &Dest, 0, sizeof( Dest));
-
-#define vp9_zero_array( Dest, N)  vpx_memset( Dest, 0, N * sizeof( *Dest));
-
-#endif  /* common_h */
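
Note that vp9_copy() relies on sizeof() of its arguments, so it only guards genuine arrays (or same-typed objects); with a pointer argument sizeof collapses to the pointer size, which is why vp9_copy_array() and vp9_zero_array() take an explicit element count for heap buffers. A usage sketch; the sketch_stats type is illustrative, not from the tree:

/* Usage sketch for the copy/zero helpers above. */
typedef struct {
  unsigned int counts[32];  /* fixed-size array: vp9_copy()/vp9_zero() */
  unsigned int *per_mb;     /* heap buffer: use the _array variants    */
} sketch_stats;

static void sketch_clone(sketch_stats *dst, const sketch_stats *src,
                         int num_mbs) {
  vp9_copy(dst->counts, src->counts);  /* sizes checked by the assert */
  vp9_copy_array(dst->per_mb, src->per_mb, num_mbs);
}

static void sketch_reset(sketch_stats *s, int num_mbs) {
  vp9_zero(s->counts);
  vp9_zero_array(s->per_mb, num_mbs);
}
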
--- a/vp8/common/common_types.h
+++ /dev/null
@@ -1,18 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_COMMON_TYPES
-#define __INC_COMMON_TYPES
-
-#define TRUE    1
-#define FALSE   0
-
-#endif
--- a/vp8/common/context.c
+++ /dev/null
@@ -1,397 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "entropy.h"
-
-/* *** GENERATED FILE: DO NOT EDIT *** */
-
-#if 0
-int Contexts[vp8_coef_counter_dimen];
-
-const int default_contexts[vp8_coef_counter_dimen] = {
-  {
-    // Block Type ( 0 )
-    {
-      // Coeff Band ( 0 )
-      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-    },
-    {
-      // Coeff Band ( 1 )
-      {30190, 26544, 225,  24,   4,   0,   0,   0,   0,   0,   0, 4171593},
-      {26846, 25157, 1241, 130,  26,   6,   1,   0,   0,   0,   0, 149987},
-      {10484, 9538, 1006, 160,  36,  18,   0,   0,   0,   0,   0, 15104},
-    },
-    {
-      // Coeff Band ( 2 )
-      {25842, 40456, 1126,  83,  11,   2,   0,   0,   0,   0,   0,   0},
-      {9338, 8010, 512,  73,   7,   3,   2,   0,   0,   0,   0, 43294},
-      {1047, 751, 149,  31,  13,   6,   1,   0,   0,   0,   0, 879},
-    },
-    {
-      // Coeff Band ( 3 )
-      {26136, 9826, 252,  13,   0,   0,   0,   0,   0,   0,   0,   0},
-      {8134, 5574, 191,  14,   2,   0,   0,   0,   0,   0,   0, 35302},
-      { 605, 677, 116,   9,   1,   0,   0,   0,   0,   0,   0, 611},
-    },
-    {
-      // Coeff Band ( 4 )
-      {10263, 15463, 283,  17,   0,   0,   0,   0,   0,   0,   0,   0},
-      {2773, 2191, 128,   9,   2,   2,   0,   0,   0,   0,   0, 10073},
-      { 134, 125,  32,   4,   0,   2,   0,   0,   0,   0,   0,  50},
-    },
-    {
-      // Coeff Band ( 5 )
-      {10483, 2663,  23,   1,   0,   0,   0,   0,   0,   0,   0,   0},
-      {2137, 1251,  27,   1,   1,   0,   0,   0,   0,   0,   0, 14362},
-      { 116, 156,  14,   2,   1,   0,   0,   0,   0,   0,   0, 190},
-    },
-    {
-      // Coeff Band ( 6 )
-      {40977, 27614, 412,  28,   0,   0,   0,   0,   0,   0,   0,   0},
-      {6113, 5213, 261,  22,   3,   0,   0,   0,   0,   0,   0, 26164},
-      { 382, 312,  50,  14,   2,   0,   0,   0,   0,   0,   0, 345},
-    },
-    {
-      // Coeff Band ( 7 )
-      {   0,  26,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-      {   0,  13,   0,   0,   0,   0,   0,   0,   0,   0,   0, 319},
-      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   8},
-    },
-  },
-  {
-    // Block Type ( 1 )
-    {
-      // Coeff Band ( 0 )
-      {3268, 19382, 1043, 250,  93,  82,  49,  26,  17,   8,  25, 82289},
-      {8758, 32110, 5436, 1832, 827, 668, 420, 153,  24,   0,   3, 52914},
-      {9337, 23725, 8487, 3954, 2107, 1836, 1069, 399,  59,   0,   0, 18620},
-    },
-    {
-      // Coeff Band ( 1 )
-      {12419, 8420, 452,  62,   9,   1,   0,   0,   0,   0,   0,   0},
-      {11715, 8705, 693,  92,  15,   7,   2,   0,   0,   0,   0, 53988},
-      {7603, 8585, 2306, 778, 270, 145,  39,   5,   0,   0,   0, 9136},
-    },
-    {
-      // Coeff Band ( 2 )
-      {15938, 14335, 1207, 184,  55,  13,   4,   1,   0,   0,   0,   0},
-      {7415, 6829, 1138, 244,  71,  26,   7,   0,   0,   0,   0, 9980},
-      {1580, 1824, 655, 241,  89,  46,  10,   2,   0,   0,   0, 429},
-    },
-    {
-      // Coeff Band ( 3 )
-      {19453, 5260, 201,  19,   0,   0,   0,   0,   0,   0,   0,   0},
-      {9173, 3758, 213,  22,   1,   1,   0,   0,   0,   0,   0, 9820},
-      {1689, 1277, 276,  51,  17,   4,   0,   0,   0,   0,   0, 679},
-    },
-    {
-      // Coeff Band ( 4 )
-      {12076, 10667, 620,  85,  19,   9,   5,   0,   0,   0,   0,   0},
-      {4665, 3625, 423,  55,  19,   9,   0,   0,   0,   0,   0, 5127},
-      { 415, 440, 143,  34,  20,   7,   2,   0,   0,   0,   0, 101},
-    },
-    {
-      // Coeff Band ( 5 )
-      {12183, 4846, 115,  11,   1,   0,   0,   0,   0,   0,   0,   0},
-      {4226, 3149, 177,  21,   2,   0,   0,   0,   0,   0,   0, 7157},
-      { 375, 621, 189,  51,  11,   4,   1,   0,   0,   0,   0, 198},
-    },
-    {
-      // Coeff Band ( 6 )
-      {61658, 37743, 1203,  94,  10,   3,   0,   0,   0,   0,   0,   0},
-      {15514, 11563, 903, 111,  14,   5,   0,   0,   0,   0,   0, 25195},
-      { 929, 1077, 291,  78,  14,   7,   1,   0,   0,   0,   0, 507},
-    },
-    {
-      // Coeff Band ( 7 )
-      {   0, 990,  15,   3,   0,   0,   0,   0,   0,   0,   0,   0},
-      {   0, 412,  13,   0,   0,   0,   0,   0,   0,   0,   0, 1641},
-      {   0,  18,   7,   1,   0,   0,   0,   0,   0,   0,   0,  30},
-    },
-  },
-  {
-    // Block Type ( 2 )
-    {
-      // Coeff Band ( 0 )
-      { 953, 24519, 628, 120,  28,  12,   4,   0,   0,   0,   0, 2248798},
-      {1525, 25654, 2647, 617, 239, 143,  42,   5,   0,   0,   0, 66837},
-      {1180, 11011, 3001, 1237, 532, 448, 239,  54,   5,   0,   0, 7122},
-    },
-    {
-      // Coeff Band ( 1 )
-      {1356, 2220,  67,  10,   4,   1,   0,   0,   0,   0,   0,   0},
-      {1450, 2544, 102,  18,   4,   3,   0,   0,   0,   0,   0, 57063},
-      {1182, 2110, 470, 130,  41,  21,   0,   0,   0,   0,   0, 6047},
-    },
-    {
-      // Coeff Band ( 2 )
-      { 370, 3378, 200,  30,   5,   4,   1,   0,   0,   0,   0,   0},
-      { 293, 1006, 131,  29,  11,   0,   0,   0,   0,   0,   0, 5404},
-      { 114, 387,  98,  23,   4,   8,   1,   0,   0,   0,   0, 236},
-    },
-    {
-      // Coeff Band ( 3 )
-      { 579, 194,   4,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-      { 395, 213,   5,   1,   0,   0,   0,   0,   0,   0,   0, 4157},
-      { 119, 122,   4,   0,   0,   0,   0,   0,   0,   0,   0, 300},
-    },
-    {
-      // Coeff Band ( 4 )
-      {  38, 557,  19,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-      {  21, 114,  12,   1,   0,   0,   0,   0,   0,   0,   0, 427},
-      {   0,   5,   0,   0,   0,   0,   0,   0,   0,   0,   0,   7},
-    },
-    {
-      // Coeff Band ( 5 )
-      {  52,   7,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-      {  18,   6,   0,   0,   0,   0,   0,   0,   0,   0,   0, 652},
-      {   1,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,  30},
-    },
-    {
-      // Coeff Band ( 6 )
-      { 640, 569,  10,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-      {  25,  77,   2,   0,   0,   0,   0,   0,   0,   0,   0, 517},
-      {   4,   7,   0,   0,   0,   0,   0,   0,   0,   0,   0,   3},
-    },
-    {
-      // Coeff Band ( 7 )
-      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-    },
-  },
-  {
-    // Block Type ( 3 )
-    {
-      // Coeff Band ( 0 )
-      {2506, 20161, 2707, 767, 261, 178, 107,  30,  14,   3,   0, 100694},
-      {8806, 36478, 8817, 3268, 1280, 850, 401, 114,  42,   0,   0, 58572},
-      {11003, 27214, 11798, 5716, 2482, 2072, 1048, 175,  32,   0,   0, 19284},
-    },
-    {
-      // Coeff Band ( 1 )
-      {9738, 11313, 959, 205,  70,  18,  11,   1,   0,   0,   0,   0},
-      {12628, 15085, 1507, 273,  52,  19,   9,   0,   0,   0,   0, 54280},
-      {10701, 15846, 5561, 1926, 813, 570, 249,  36,   0,   0,   0, 6460},
-    },
-    {
-      // Coeff Band ( 2 )
-      {6781, 22539, 2784, 634, 182, 123,  20,   4,   0,   0,   0,   0},
-      {6263, 11544, 2649, 790, 259, 168,  27,   5,   0,   0,   0, 20539},
-      {3109, 4075, 2031, 896, 457, 386, 158,  29,   0,   0,   0, 1138},
-    },
-    {
-      // Coeff Band ( 3 )
-      {11515, 4079, 465,  73,   5,  14,   2,   0,   0,   0,   0,   0},
-      {9361, 5834, 650,  96,  24,   8,   4,   0,   0,   0,   0, 22181},
-      {4343, 3974, 1360, 415, 132,  96,  14,   1,   0,   0,   0, 1267},
-    },
-    {
-      // Coeff Band ( 4 )
-      {4787, 9297, 823, 168,  44,  12,   4,   0,   0,   0,   0,   0},
-      {3619, 4472, 719, 198,  60,  31,   3,   0,   0,   0,   0, 8401},
-      {1157, 1175, 483, 182,  88,  31,   8,   0,   0,   0,   0, 268},
-    },
-    {
-      // Coeff Band ( 5 )
-      {8299, 1226,  32,   5,   1,   0,   0,   0,   0,   0,   0,   0},
-      {3502, 1568,  57,   4,   1,   1,   0,   0,   0,   0,   0, 9811},
-      {1055, 1070, 166,  29,   6,   1,   0,   0,   0,   0,   0, 527},
-    },
-    {
-      // Coeff Band ( 6 )
-      {27414, 27927, 1989, 347,  69,  26,   0,   0,   0,   0,   0,   0},
-      {5876, 10074, 1574, 341,  91,  24,   4,   0,   0,   0,   0, 21954},
-      {1571, 2171, 778, 324, 124,  65,  16,   0,   0,   0,   0, 979},
-    },
-    {
-      // Coeff Band ( 7 )
-      {   0,  29,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-      {   0,  23,   0,   0,   0,   0,   0,   0,   0,   0,   0, 459},
-      {   0,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,  13},
-    },
-  },
-};
-
-// Update probabilities for the nodes in the token entropy tree.
-const vp9_prob tree_update_probs[vp9_coef_tree_dimen] = {
-  {
-    {
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255, },
-      {250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255, },
-      {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-  },
-  {
-    {
-      {217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255, },
-      {234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255, },
-    },
-    {
-      {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-  },
-  {
-    {
-      {186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255, },
-      {251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255, },
-    },
-    {
-      {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-  },
-  {
-    {
-      {248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255, },
-      {248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-  },
-};
-#endif
--- a/vp8/common/debugmodes.c
+++ /dev/null
@@ -1,146 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stdio.h>
-#include "blockd.h"
-
-void vp9_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols,
-                                        int frame) {
-  int mb_row;
-  int mb_col;
-  int mb_index = 0;
-  FILE *mvs = fopen("mvs.stt", "a");
-
-  if (!mvs)
-    return;
-
-  /* print out the macroblock Y modes */
-  mb_index = 0;
-  fprintf(mvs, "Mb Modes for Frame %d\n", frame);
-
-  for (mb_row = 0; mb_row < rows; mb_row++) {
-    for (mb_col = 0; mb_col < cols; mb_col++) {
-
-      fprintf(mvs, "%2d ", mi[mb_index].mbmi.mode);
-
-      mb_index++;
-    }
-
-    fprintf(mvs, "\n");
-    mb_index++;
-  }
-
-  fprintf(mvs, "\n");
-
-  mb_index = 0;
-  fprintf(mvs, "Mb mv ref for Frame %d\n", frame);
-
-  for (mb_row = 0; mb_row < rows; mb_row++) {
-    for (mb_col = 0; mb_col < cols; mb_col++) {
-
-      fprintf(mvs, "%2d ", mi[mb_index].mbmi.ref_frame);
-
-      mb_index++;
-    }
-
-    fprintf(mvs, "\n");
-    mb_index++;
-  }
-
-  fprintf(mvs, "\n");
-
-  /* print out the macroblock UV modes */
-  mb_index = 0;
-  fprintf(mvs, "UV Modes for Frame %d\n", frame);
-
-  for (mb_row = 0; mb_row < rows; mb_row++) {
-    for (mb_col = 0; mb_col < cols; mb_col++) {
-
-      fprintf(mvs, "%2d ", mi[mb_index].mbmi.uv_mode);
-
-      mb_index++;
-    }
-
-    mb_index++;
-    fprintf(mvs, "\n");
-  }
-
-  fprintf(mvs, "\n");
-
-  /* print out the block modes */
-  mb_index = 0;
-  fprintf(mvs, "Mbs for Frame %d\n", frame);
-  {
-    int b_row;
-
-    for (b_row = 0; b_row < 4 * rows; b_row++) {
-      int b_col;
-      int bindex;
-
-      for (b_col = 0; b_col < 4 * cols; b_col++) {
-        mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
-        bindex = (b_row & 3) * 4 + (b_col & 3);
-
-        if (mi[mb_index].mbmi.mode == B_PRED) {
-          fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].as_mode.first);
-#if CONFIG_COMP_INTRA_PRED
-          fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].as_mode.second);
-#endif
-        } else
-          fprintf(mvs, "xx ");
-
-      }
-
-      fprintf(mvs, "\n");
-    }
-  }
-  fprintf(mvs, "\n");
-
-  /* print out the macroblock mvs */
-  mb_index = 0;
-  fprintf(mvs, "MVs for Frame %d\n", frame);
-
-  for (mb_row = 0; mb_row < rows; mb_row++) {
-    for (mb_col = 0; mb_col < cols; mb_col++) {
-      fprintf(mvs, "%5d:%-5d", mi[mb_index].mbmi.mv[0].as_mv.row / 2,
-          mi[mb_index].mbmi.mv[0].as_mv.col / 2);
-
-      mb_index++;
-    }
-
-    mb_index++;
-    fprintf(mvs, "\n");
-  }
-
-  fprintf(mvs, "\n");
-
-  /* print out the block motion vectors */
-  mb_index = 0;
-  fprintf(mvs, "Block MVs for Frame %d\n", frame);
-  {
-    int b_row;
-
-    for (b_row = 0; b_row < 4 * rows; b_row++) {
-      int b_col;
-      int bindex;
-
-      for (b_col = 0; b_col < 4 * cols; b_col++) {
-        mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
-        bindex = (b_row & 3) * 4 + (b_col & 3);
-        fprintf(mvs, "%3d:%-3d ",
-                mi[mb_index].bmi[bindex].as_mv.first.as_mv.row,
-                mi[mb_index].bmi[bindex].as_mv.first.as_mv.col);
-
-      }
-
-      fprintf(mvs, "\n");
-    }
-  }
-  fprintf(mvs, "\n");
-
-  fclose(mvs);
-}
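
Two details of this dump are easy to miss: the extra mb_index++ after each row, and the (cols + 1) stride in the per-block loops, both reflect that the MODE_INFO array carries one border entry per macroblock row. A hypothetical call site; the VP9_COMMON name and its mi/mb_rows/mb_cols fields follow this tree's conventions but are assumptions here:

/* Sketch: append this frame's modes and MVs to mvs.stt. */
static void sketch_debug_dump(VP9_COMMON *cm, int frame_number) {
  vp9_print_modes_and_motion_vectors(cm->mi, cm->mb_rows, cm->mb_cols,
                                     frame_number);
}
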
--- a/vp8/common/default_coef_probs.h
+++ /dev/null
@@ -1,1377 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
-*/
-
-
-/* Generated file, included by entropy.c */
-
-
-static const vp9_prob default_coef_probs [BLOCK_TYPES]
-                                         [COEF_BANDS]
-                                         [PREV_COEF_CONTEXTS]
-                                         [ENTROPY_NODES] = {
-  {
-    /* Block Type ( 0 ) */
-    {
-      /* Coeff Band ( 0 )*/
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 1 )*/
-      { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 },
-      { 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 },
-      { 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 },
-      { 90, 116, 227, 252, 214, 209, 255, 255, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 2 )*/
-      {   1,  98, 248, 255, 236, 226, 255, 255, 128, 128, 128 },
-      { 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 },
-      {  78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 },
-      {  64, 128, 202, 247, 198, 180, 255, 219, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 3 )*/
-      {   1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 },
-      { 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 },
-      {  77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 },
-      {  64, 100, 216, 255, 236, 230, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 4 )*/
-      {   1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 },
-      { 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 },
-      {  37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 },
-      {  28, 110, 196, 243, 228, 255, 255, 255, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 5 )*/
-      {   1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 },
-      { 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 },
-      { 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 },
-      { 90, 90, 231, 255, 211, 171, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 6 )*/
-      {   1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 },
-      { 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 },
-      {  80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 },
-      {  64, 120, 211, 255, 194, 224, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 7 )*/
-      {   1,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 246,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-    }
-  },
-  {
-    /* Block Type ( 1 ) */
-    {
-      /* Coeff Band ( 0 )*/
-      { 198,  35, 237, 223, 193, 187, 162, 160, 145, 155,  62 },
-      { 131,  45, 198, 221, 172, 176, 220, 157, 252, 221,   1 },
-      {  68,  47, 146, 208, 149, 167, 221, 162, 255, 223, 128 },
-      {  48,  32, 146, 208, 149, 167, 221, 162, 255, 223, 128 },
-    },
-    {
-      /* Coeff Band ( 1 )*/
-      {   1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 },
-      { 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 },
-      {  81,  99, 181, 242, 176, 190, 249, 202, 255, 255, 128 },
-      {  66,  90, 181, 242, 176, 190, 249, 202, 255, 255, 128 },
-    },
-    {
-      /* Coeff Band ( 2 )*/
-      {   1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 },
-      {  99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 },
-      {  23,  91, 163, 242, 170, 187, 247, 210, 255, 255, 128 },
-      {  18,  80, 163, 242, 170, 187, 247, 210, 255, 255, 128 },
-    },
-    {
-      /* Coeff Band ( 3 )*/
-      {   1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 },
-      { 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 },
-      {  44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 },
-      {  36, 120, 201, 253, 205, 192, 255, 255, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 4 )*/
-      {   1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 },
-      {  94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 },
-      {  22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 },
-      {  18, 90, 174, 245, 186, 161, 255, 199, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 5 )*/
-      {   1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 },
-      { 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 },
-      {  35,  77, 181, 251, 193, 211, 255, 205, 128, 128, 128 },
-      {  28,  70, 181, 251, 193, 211, 255, 205, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 6 )*/
-      {   1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 },
-      { 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 },
-      {  45,  99, 188, 251, 195, 217, 255, 224, 128, 128, 128 },
-      {  40,  90, 188, 251, 195, 217, 255, 224, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 7 )*/
-      {   1,   1, 251, 255, 213, 255, 128, 128, 128, 128, 128 },
-      { 203,   1, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
-      { 137,   1, 177, 255, 224, 255, 128, 128, 128, 128, 128 },
-      { 137,   1, 177, 255, 224, 255, 128, 128, 128, 128, 128 },
-    }
-  },
-  {
-    /* Block Type ( 2 ) */
-    {
-      /* Coeff Band ( 0 )*/
-      { 253,   9, 248, 251, 207, 208, 255, 192, 128, 128, 128 },
-      { 175,  13, 224, 243, 193, 185, 249, 198, 255, 255, 128 },
-      {  73,  17, 171, 221, 161, 179, 236, 167, 255, 234, 128 },
-      {  64,  17, 171, 221, 161, 179, 236, 167, 255, 234, 128 },
-    },
-    {
-      /* Coeff Band ( 1 )*/
-      {   1,  95, 247, 253, 212, 183, 255, 255, 128, 128, 128 },
-      { 239,  90, 244, 250, 211, 209, 255, 255, 128, 128, 128 },
-      { 155,  77, 195, 248, 188, 195, 255, 255, 128, 128, 128 },
-      { 140,  70, 195, 248, 188, 195, 255, 255, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 2 )*/
-      {   1,  24, 239, 251, 218, 219, 255, 205, 128, 128, 128 },
-      { 201,  51, 219, 255, 196, 186, 128, 128, 128, 128, 128 },
-      {  69,  46, 190, 239, 201, 218, 255, 228, 128, 128, 128 },
-      {  60,  40, 190, 239, 201, 218, 255, 228, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 3 )*/
-      {   1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 },
-      { 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 },
-      { 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
-      { 132, 118, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 4 )*/
-      {   1,  16, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
-      { 190,  36, 230, 255, 236, 255, 128, 128, 128, 128, 128 },
-      { 149,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 149,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 5 )*/
-      {   1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 6 )*/
-      {   1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 },
-      { 213,  62, 250, 255, 255, 128, 128, 128, 128, 128, 128 },
-      {  55,  93, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      {  48,  85, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 7 )*/
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-    }
-  },
-  {
-    /* Block Type ( 3 ) */
-    {
-      /* Coeff Band ( 0 )*/
-      { 202,  24, 213, 235, 186, 191, 220, 160, 240, 175, 255 },
-      { 126,  38, 182, 232, 169, 184, 228, 174, 255, 187, 128 },
-      {  63,  48, 138, 219, 151, 178, 240, 170, 255, 216, 128 },
-      {  54,  40, 138, 219, 151, 178, 240, 170, 255, 216, 128 },
-    },
-    {
-      /* Coeff Band ( 1 )*/
-      {   1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 },
-      { 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 },
-      {  44,  84, 162, 232, 172, 180, 245, 178, 255, 255, 128 },
-      {  32,  70, 162, 232, 172, 180, 245, 178, 255, 255, 128 },
-    },
-    {
-      /* Coeff Band ( 2 )*/
-      {   1,  52, 220, 246, 198, 199, 249, 220, 255, 255, 128 },
-      { 124,  74, 191, 243, 183, 193, 250, 221, 255, 255, 128 },
-      {  24,  71, 130, 219, 154, 170, 243, 182, 255, 255, 128 },
-      {  24,  71, 130, 219, 154, 170, 243, 182, 255, 255, 128 },
-    },
-    {
-      /* Coeff Band ( 3 )*/
-      {   1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 },
-      { 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 },
-      {  28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 },
-      {  26, 104, 170, 242, 183, 194, 254, 223, 255, 255, 128 },
-    },
-    {
-      /* Coeff Band ( 4 )*/
-      {   1,  81, 230, 252, 204, 203, 255, 192, 128, 128, 128 },
-      { 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 },
-      {  20,  95, 153, 243, 164, 173, 255, 203, 128, 128, 128 },
-      {  20,  95, 153, 243, 164, 173, 255, 203, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 5 )*/
-      {   1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 },
-      { 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 },
-      {  47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 },
-      {  47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 6 )*/
-      {   1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 },
-      { 141,  84, 213, 252, 201, 202, 255, 219, 128, 128, 128 },
-      {  42,  80, 160, 240, 162, 185, 255, 205, 128, 128, 128 },
-      {  42,  80, 160, 240, 162, 185, 255, 205, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 7 )*/
-      {   1,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 244,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 238,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 238,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-    }
-  }
-};
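
Each entry above is an 8-bit bool-coder probability: a node value p means the 0-branch is taken with probability p/256, so 128 is the uninformative 50/50 filler and values near 1 or 255 mark branches that are almost always resolved one way. A small sketch of the coding cost this implies:

#include <math.h>

/* Sketch: bits spent at one tree node with probability p (1..255). */
static void sketch_node_cost(int p, double *bits_if_0, double *bits_if_1) {
  *bits_if_0 = -log2(p / 256.0);        /* p = 128 -> exactly 1.0 bit   */
  *bits_if_1 = -log2(1.0 - p / 256.0);  /* p = 246 -> ~4.7 bits for a 1 */
}
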
-
-static const vp9_prob default_hybrid_coef_probs [BLOCK_TYPES]
-                                                [COEF_BANDS]
-                                                [PREV_COEF_CONTEXTS]
-                                                [ENTROPY_NODES] = {
-  {
-    /* Block Type ( 0 ) */
-    {
-      /* Coeff Band ( 0 )*/
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 1 )*/
-      { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 },
-      { 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 },
-      { 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 },
-      { 90, 116, 227, 252, 214, 209, 255, 255, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 2 )*/
-      {   1,  98, 248, 255, 236, 226, 255, 255, 128, 128, 128 },
-      { 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 },
-      {  78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 },
-      {  64, 128, 202, 247, 198, 180, 255, 219, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 3 )*/
-      {   1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 },
-      { 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 },
-      {  77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 },
-      {  64, 100, 216, 255, 236, 230, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 4 )*/
-      {   1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 },
-      { 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 },
-      {  37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 },
-      {  28, 110, 196, 243, 228, 255, 255, 255, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 5 )*/
-      {   1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 },
-      { 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 },
-      { 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 },
-      { 90, 90, 231, 255, 211, 171, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 6 )*/
-      {   1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 },
-      { 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 },
-      {  80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 },
-      {  64, 120, 211, 255, 194, 224, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 7 )*/
-      {   1,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 246,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-    }
-  },
-  {
-    /* Block Type ( 1 ) */
-    {
-      /* Coeff Band ( 0 )*/
-      { 198,  35, 237, 223, 193, 187, 162, 160, 145, 155,  62 },
-      { 131,  45, 198, 221, 172, 176, 220, 157, 252, 221,   1 },
-      {  68,  47, 146, 208, 149, 167, 221, 162, 255, 223, 128 },
-      {  48,  32, 146, 208, 149, 167, 221, 162, 255, 223, 128 },
-    },
-    {
-      /* Coeff Band ( 1 )*/
-      {   1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 },
-      { 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 },
-      {  81,  99, 181, 242, 176, 190, 249, 202, 255, 255, 128 },
-      {  66,  90, 181, 242, 176, 190, 249, 202, 255, 255, 128 },
-    },
-    {
-      /* Coeff Band ( 2 )*/
-      {   1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 },
-      {  99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 },
-      {  23,  91, 163, 242, 170, 187, 247, 210, 255, 255, 128 },
-      {  18,  80, 163, 242, 170, 187, 247, 210, 255, 255, 128 },
-    },
-    {
-      /* Coeff Band ( 3 )*/
-      {   1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 },
-      { 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 },
-      {  44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 },
-      {  36, 120, 201, 253, 205, 192, 255, 255, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 4 )*/
-      {   1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 },
-      {  94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 },
-      {  22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 },
-      {  18, 90, 174, 245, 186, 161, 255, 199, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 5 )*/
-      {   1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 },
-      { 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 },
-      {  35,  77, 181, 251, 193, 211, 255, 205, 128, 128, 128 },
-      {  28,  70, 181, 251, 193, 211, 255, 205, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 6 )*/
-      {   1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 },
-      { 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 },
-      {  45,  99, 188, 251, 195, 217, 255, 224, 128, 128, 128 },
-      {  40,  90, 188, 251, 195, 217, 255, 224, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 7 )*/
-      {   1,   1, 251, 255, 213, 255, 128, 128, 128, 128, 128 },
-      { 203,   1, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
-      { 137,   1, 177, 255, 224, 255, 128, 128, 128, 128, 128 },
-      { 137,   1, 177, 255, 224, 255, 128, 128, 128, 128, 128 },
-    }
-  },
-  {
-    /* Block Type ( 2 ) */
-    {
-      /* Coeff Band ( 0 )*/
-      { 253,   9, 248, 251, 207, 208, 255, 192, 128, 128, 128 },
-      { 175,  13, 224, 243, 193, 185, 249, 198, 255, 255, 128 },
-      {  73,  17, 171, 221, 161, 179, 236, 167, 255, 234, 128 },
-      {  64,  17, 171, 221, 161, 179, 236, 167, 255, 234, 128 },
-    },
-    {
-      /* Coeff Band ( 1 )*/
-      {   1,  95, 247, 253, 212, 183, 255, 255, 128, 128, 128 },
-      { 239,  90, 244, 250, 211, 209, 255, 255, 128, 128, 128 },
-      { 155,  77, 195, 248, 188, 195, 255, 255, 128, 128, 128 },
-      { 140,  70, 195, 248, 188, 195, 255, 255, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 2 )*/
-      {   1,  24, 239, 251, 218, 219, 255, 205, 128, 128, 128 },
-      { 201,  51, 219, 255, 196, 186, 128, 128, 128, 128, 128 },
-      {  69,  46, 190, 239, 201, 218, 255, 228, 128, 128, 128 },
-      {  60,  40, 190, 239, 201, 218, 255, 228, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 3 )*/
-      {   1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 },
-      { 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 },
-      { 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
-      { 132, 118, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 4 )*/
-      {   1,  16, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
-      { 190,  36, 230, 255, 236, 255, 128, 128, 128, 128, 128 },
-      { 149,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 149,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 5 )*/
-      {   1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 6 )*/
-      {   1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 },
-      { 213,  62, 250, 255, 255, 128, 128, 128, 128, 128, 128 },
-      {  55,  93, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      {  48,  85, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 7 )*/
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-    }
-  },
-  {
-    /* Block Type ( 3 ) */
-    {
-      /* Coeff Band ( 0 )*/
-      { 202,  24, 213, 235, 186, 191, 220, 160, 240, 175, 255 },
-      { 126,  38, 182, 232, 169, 184, 228, 174, 255, 187, 128 },
-      {  63,  48, 138, 219, 151, 178, 240, 170, 255, 216, 128 },
-      {  54,  40, 138, 219, 151, 178, 240, 170, 255, 216, 128 },
-    },
-    {
-      /* Coeff Band ( 1 )*/
-      {   1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 },
-      { 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 },
-      {  44,  84, 162, 232, 172, 180, 245, 178, 255, 255, 128 },
-      {  32,  70, 162, 232, 172, 180, 245, 178, 255, 255, 128 },
-    },
-    {
-      /* Coeff Band ( 2 )*/
-      {   1,  52, 220, 246, 198, 199, 249, 220, 255, 255, 128 },
-      { 124,  74, 191, 243, 183, 193, 250, 221, 255, 255, 128 },
-      {  24,  71, 130, 219, 154, 170, 243, 182, 255, 255, 128 },
-      {  24,  71, 130, 219, 154, 170, 243, 182, 255, 255, 128 },
-    },
-    {
-      /* Coeff Band ( 3 )*/
-      {   1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 },
-      { 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 },
-      {  28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 },
-      {  26, 104, 170, 242, 183, 194, 254, 223, 255, 255, 128 },
-    },
-    {
-      /* Coeff Band ( 4 )*/
-      {   1,  81, 230, 252, 204, 203, 255, 192, 128, 128, 128 },
-      { 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 },
-      {  20,  95, 153, 243, 164, 173, 255, 203, 128, 128, 128 },
-      {  20,  95, 153, 243, 164, 173, 255, 203, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 5 )*/
-      {   1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 },
-      { 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 },
-      {  47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 },
-      {  47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 6 )*/
-      {   1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 },
-      { 141,  84, 213, 252, 201, 202, 255, 219, 128, 128, 128 },
-      {  42,  80, 160, 240, 162, 185, 255, 205, 128, 128, 128 },
-      {  42,  80, 160, 240, 162, 185, 255, 205, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 7 )*/
-      {   1,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 244,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 238,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 238,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-    }
-  }
-};
-
-static const vp9_prob
-default_coef_probs_8x8[BLOCK_TYPES_8X8]
-                      [COEF_BANDS]
-                      [PREV_COEF_CONTEXTS]
-                      [ENTROPY_NODES] = {
-  {
-    /* block Type 0 */
-    {
-      /* Coeff Band 0 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 1 */
-      { 60, 140, 195, 255, 212, 214, 128, 128, 128, 128, 128},
-      { 75, 221, 231, 255, 203, 255, 128, 128, 128, 128, 128},
-      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128},
-      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 2 */
-      { 1, 227, 226, 255, 215, 215, 128, 128, 128, 128, 128},
-      { 5, 163, 209, 255, 212, 212, 255, 255, 128, 128, 128},
-      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128},
-      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 3 */
-      { 1, 226, 225, 255, 228, 236, 128, 128, 128, 128, 128},
-      { 6, 163, 208, 255, 224, 234, 255, 255, 128, 128, 128},
-      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128},
-      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 4 */
-      { 1, 222, 197, 254, 193, 216, 255, 236, 128, 128, 128},
-      { 7, 140, 163, 251, 195, 211, 255, 238, 128, 128, 128},
-      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128},
-      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 5 */
-      { 1, 226, 218, 255, 216, 241, 255, 255, 128, 128, 128},
-      { 6, 154, 191, 255, 218, 240, 255, 255, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 6 */
-      { 1, 221, 217, 255, 208, 217, 255, 232, 128, 128, 128},
-      { 11, 155, 189, 254, 203, 211, 255, 249, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 7 */
-      { 1, 207, 235, 255, 232, 240, 128, 128, 128, 128, 128},
-      { 58, 161, 216, 255, 229, 235, 255, 255, 128, 128, 128},
-      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128},
-      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128}
-    }
-  },
-  {
-    /* block Type 1 */
-    {
-      /* Coeff Band 0 */
-      /* Note: only three of the four PREV_COEF_CONTEXTS rows are listed;
-         the missing fourth row is implicitly zero-initialized. */
-      { 134, 152, 233, 224, 234, 52, 255, 166, 128, 128, 128},
-      { 97, 132, 185, 234, 186, 189, 197, 171, 255, 212, 128},
-      { 84, 110, 185, 237, 182, 182, 145, 145, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 1 */
-      { 1, 124, 213, 247, 192, 212, 255, 255, 128, 128, 128},
-      { 88, 111, 178, 254, 189, 211, 255, 255, 128, 128, 128},
-      { 12, 59, 129, 236, 150, 179, 239, 195, 255, 255, 128},
-      { 12, 59, 129, 236, 150, 179, 239, 195, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 2 */
-      { 1, 102, 225, 255, 210, 240, 128, 128, 128, 128, 128},
-      { 110, 78, 195, 254, 200, 191, 255, 255, 128, 128, 128},
-      { 37, 63, 177, 255, 194, 195, 128, 128, 128, 128, 128},
-      { 37, 63, 177, 255, 194, 195, 128, 128, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 3 */
-      { 1, 1, 229, 255, 202, 224, 128, 128, 128, 128, 128},
-      { 150, 1, 192, 255, 206, 226, 128, 128, 128, 128, 128},
-      { 75, 1, 138, 255, 172, 228, 128, 128, 128, 128, 128},
-      { 75, 1, 138, 255, 172, 228, 128, 128, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 4 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 5 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 6 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 7 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    }
-  },
-  {
-    /* block Type 2 */
-    {
-      /* Coeff Band 0 */
-      { 11, 181, 226, 199, 183, 255, 255, 255, 128, 128, 128},
-      { 2, 147, 185, 248, 163, 180, 255, 236, 128, 128, 128},
-      { 1, 123, 157, 238, 154, 176, 255, 226, 255, 255, 128},
-      { 1, 123, 157, 238, 154, 176, 255, 226, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 1 */
-      { 1, 150, 191, 246, 174, 188, 255, 235, 128, 128, 128},
-      { 1, 125, 166, 245, 165, 185, 255, 234, 128, 128, 128},
-      { 1, 79, 125, 240, 148, 179, 255, 234, 255, 255, 128},
-      { 1, 79, 125, 240, 148, 179, 255, 234, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 2 */
-      { 1, 146, 184, 242, 167, 183, 255, 230, 255, 255, 128},
-      { 1, 119, 160, 239, 156, 178, 255, 231, 255, 255, 128},
-      { 1, 75, 115, 234, 142, 173, 255, 225, 255, 255, 128},
-      { 1, 75, 115, 234, 142, 173, 255, 225, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 3 */
-      { 1, 150, 188, 244, 169, 183, 255, 233, 255, 255, 128},
-      { 1, 123, 162, 243, 161, 180, 255, 233, 128, 128, 128},
-      { 1, 76, 120, 238, 148, 178, 255, 230, 255, 255, 128},
-      { 1, 76, 120, 238, 148, 178, 255, 230, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 4 */
-      { 1, 163, 202, 252, 188, 204, 255, 248, 128, 128, 128},
-      { 1, 136, 180, 251, 181, 201, 255, 246, 128, 128, 128},
-      { 1, 92, 146, 249, 170, 197, 255, 245, 128, 128, 128},
-      { 1, 92, 146, 249, 170, 197, 255, 245, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 5 */
-      { 1, 156, 195, 249, 179, 193, 255, 241, 255, 255, 128},
-      { 1, 128, 169, 248, 171, 192, 255, 242, 255, 255, 128},
-      { 1, 84, 132, 245, 158, 187, 255, 240, 255, 255, 128},
-      { 1, 84, 132, 245, 158, 187, 255, 240, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 6 */
-      { 1, 36, 71, 251, 192, 201, 255, 243, 255, 255, 128},
-      { 1, 49, 185, 250, 184, 199, 255, 242, 128, 128, 128},
-      { 1, 95, 147, 247, 168, 190, 255, 239, 255, 255, 128},
-      { 1, 95, 147, 247, 168, 190, 255, 239, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 7 */
-      { 1, 19, 98, 255, 218, 222, 255, 255, 128, 128, 128},
-      { 36, 50, 210, 255, 212, 221, 255, 255, 128, 128, 128},
-      { 6, 117, 180, 254, 199, 216, 255, 251, 128, 128, 128},
-      { 6, 117, 180, 254, 199, 216, 255, 251, 128, 128, 128}
-    }
-  },
-  { /* block Type 3 */
-    { /* Coeff Band 0 */
-      { 192, 18, 155, 172, 145, 164, 192, 135, 246, 223, 255},
-      { 94, 29, 97, 131, 131, 153, 171, 121, 250, 190, 255},
-      { 25, 29, 63, 128, 119, 147, 168, 124, 251, 183, 255},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    { /* Coeff Band 1 */
-      { 1, 108, 192, 220, 186, 173, 255, 194, 255, 255, 128},
-      { 123, 104, 188, 221, 165, 171, 247, 180, 255, 255, 128},
-      { 23, 76, 152, 216, 154, 166, 226, 182, 255, 209, 128},
-      { 1, 26, 52, 162, 109, 152, 208, 144, 255, 231, 128}
-    },
-    { /* Coeff Band 2 */
-      { 1, 57, 179, 220, 156, 175, 210, 158, 255, 223, 128},
-      { 48, 57, 134, 212, 151, 170, 219, 185, 255, 248, 128},
-      { 4, 35, 63, 189, 120, 156, 221, 159, 255, 241, 128},
-      { 1, 17, 23, 110, 97, 143, 187, 120, 255, 234, 128}
-    },
-    { /* Coeff Band 3 */
-      { 1, 115, 205, 243, 182, 187, 254, 218, 255, 255, 128},
-      { 80, 101, 186, 241, 183, 186, 249, 182, 255, 255, 128},
-      { 10, 81, 144, 229, 164, 175, 241, 185, 255, 255, 128},
-      { 1, 44, 81, 192, 130, 148, 240, 180, 255, 255, 128}
-    },
-    { /* Coeff Band 4 */
-      { 1, 161, 207, 249, 187, 176, 255, 180, 128, 128, 128},
-      { 79, 148, 196, 240, 186, 182, 253, 171, 255, 255, 128},
-      { 14, 111, 171, 233, 170, 178, 235, 204, 255, 255, 128},
-      { 1, 63, 103, 202, 143, 162, 240, 178, 255, 255, 128}
-    },
-    { /* Coeff Band 5 */
-      { 1, 101, 202, 239, 185, 184, 252, 186, 255, 255, 128},
-      { 43, 67, 166, 237, 178, 190, 246, 194, 255, 255, 128},
-      { 4, 49, 85, 220, 140, 168, 253, 182, 255, 255, 128},
-      { 1, 24, 35, 144, 93, 135, 239, 159, 255, 253, 128}
-    },
-    { /* Coeff Band 6 */
-      { 1, 212, 243, 255, 240, 234, 255, 255, 128, 128, 128},
-      { 98, 168, 234, 255, 229, 234, 255, 255, 128, 128, 128},
-      { 19, 127, 199, 255, 212, 198, 255, 255, 128, 128, 128},
-      { 1, 103, 162, 253, 186, 151, 255, 255, 128, 128, 128}
-    },
-    { /* Coeff Band 7 */
-      { 1, 188, 253, 255, 255, 128, 128, 128, 128, 128, 128},
-      { 191, 68, 242, 255, 255, 128, 128, 128, 128, 128, 128},
-      { 8, 132, 255, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    }
-  }
-};
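
The four dimensions select, in order, the block type, the coefficient band of the scan position, the context derived from previously coded coefficients, and the probability for each node of the token tree. A lookup sketch; the band/context derivations are not shown and the parameter names are illustrative:

/* Sketch: fetch the ENTROPY_NODES probabilities for one coefficient. */
static const vp9_prob *sketch_probs_8x8(int block_type, int band, int ctx) {
  return default_coef_probs_8x8[block_type][band][ctx];
}
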
-
-static const vp9_prob
-default_hybrid_coef_probs_8x8[BLOCK_TYPES_8X8]
-                             [COEF_BANDS]
-                             [PREV_COEF_CONTEXTS]
-                             [ENTROPY_NODES] = {
-  {
-    /* block Type 0 */
-    {
-      /* Coeff Band 0 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 1 */
-      { 60, 140, 195, 255, 212, 214, 128, 128, 128, 128, 128},
-      { 75, 221, 231, 255, 203, 255, 128, 128, 128, 128, 128},
-      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128},
-      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 2 */
-      { 1, 227, 226, 255, 215, 215, 128, 128, 128, 128, 128},
-      { 5, 163, 209, 255, 212, 212, 255, 255, 128, 128, 128},
-      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128},
-      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 3 */
-      { 1, 226, 225, 255, 228, 236, 128, 128, 128, 128, 128},
-      { 6, 163, 208, 255, 224, 234, 255, 255, 128, 128, 128},
-      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128},
-      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 4 */
-      { 1, 222, 197, 254, 193, 216, 255, 236, 128, 128, 128},
-      { 7, 140, 163, 251, 195, 211, 255, 238, 128, 128, 128},
-      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128},
-      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 5 */
-      { 1, 226, 218, 255, 216, 241, 255, 255, 128, 128, 128},
-      { 6, 154, 191, 255, 218, 240, 255, 255, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 6 */
-      { 1, 221, 217, 255, 208, 217, 255, 232, 128, 128, 128},
-      { 11, 155, 189, 254, 203, 211, 255, 249, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 7 */
-      { 1, 207, 235, 255, 232, 240, 128, 128, 128, 128, 128},
-      { 58, 161, 216, 255, 229, 235, 255, 255, 128, 128, 128},
-      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128},
-      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128}
-    }
-  },
-  {
-    /* block Type 1 */
-    {
-      /* Coeff Band 0 */
-      /* Note: only three of the four PREV_COEF_CONTEXTS rows are listed;
-         the missing fourth row is implicitly zero-initialized. */
-      { 134, 152, 233, 224, 234, 52, 255, 166, 128, 128, 128},
-      { 97, 132, 185, 234, 186, 189, 197, 171, 255, 212, 128},
-      { 84, 110, 185, 237, 182, 182, 145, 145, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 1 */
-      { 1, 124, 213, 247, 192, 212, 255, 255, 128, 128, 128},
-      { 88, 111, 178, 254, 189, 211, 255, 255, 128, 128, 128},
-      { 12, 59, 129, 236, 150, 179, 239, 195, 255, 255, 128},
-      { 12, 59, 129, 236, 150, 179, 239, 195, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 2 */
-      { 1, 102, 225, 255, 210, 240, 128, 128, 128, 128, 128},
-      { 110, 78, 195, 254, 200, 191, 255, 255, 128, 128, 128},
-      { 37, 63, 177, 255, 194, 195, 128, 128, 128, 128, 128},
-      { 37, 63, 177, 255, 194, 195, 128, 128, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 3 */
-      { 1, 1, 229, 255, 202, 224, 128, 128, 128, 128, 128},
-      { 150, 1, 192, 255, 206, 226, 128, 128, 128, 128, 128},
-      { 75, 1, 138, 255, 172, 228, 128, 128, 128, 128, 128},
-      { 75, 1, 138, 255, 172, 228, 128, 128, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 4 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 5 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 6 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 7 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    }
-  },
-  {
-    /* block Type 2 */
-    {
-      /* Coeff Band 0 */
-      { 11, 181, 226, 199, 183, 255, 255, 255, 128, 128, 128},
-      { 2, 147, 185, 248, 163, 180, 255, 236, 128, 128, 128},
-      { 1, 123, 157, 238, 154, 176, 255, 226, 255, 255, 128},
-      { 1, 123, 157, 238, 154, 176, 255, 226, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 1 */
-      { 1, 150, 191, 246, 174, 188, 255, 235, 128, 128, 128},
-      { 1, 125, 166, 245, 165, 185, 255, 234, 128, 128, 128},
-      { 1, 79, 125, 240, 148, 179, 255, 234, 255, 255, 128},
-      { 1, 79, 125, 240, 148, 179, 255, 234, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 2 */
-      { 1, 146, 184, 242, 167, 183, 255, 230, 255, 255, 128},
-      { 1, 119, 160, 239, 156, 178, 255, 231, 255, 255, 128},
-      { 1, 75, 115, 234, 142, 173, 255, 225, 255, 255, 128},
-      { 1, 75, 115, 234, 142, 173, 255, 225, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 3 */
-      { 1, 150, 188, 244, 169, 183, 255, 233, 255, 255, 128},
-      { 1, 123, 162, 243, 161, 180, 255, 233, 128, 128, 128},
-      { 1, 76, 120, 238, 148, 178, 255, 230, 255, 255, 128},
-      { 1, 76, 120, 238, 148, 178, 255, 230, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 4 */
-      { 1, 163, 202, 252, 188, 204, 255, 248, 128, 128, 128},
-      { 1, 136, 180, 251, 181, 201, 255, 246, 128, 128, 128},
-      { 1, 92, 146, 249, 170, 197, 255, 245, 128, 128, 128},
-      { 1, 92, 146, 249, 170, 197, 255, 245, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 5 */
-      { 1, 156, 195, 249, 179, 193, 255, 241, 255, 255, 128},
-      { 1, 128, 169, 248, 171, 192, 255, 242, 255, 255, 128},
-      { 1, 84, 132, 245, 158, 187, 255, 240, 255, 255, 128},
-      { 1, 84, 132, 245, 158, 187, 255, 240, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 6 */
-      { 1, 36, 71, 251, 192, 201, 255, 243, 255, 255, 128},
-      { 1, 49, 185, 250, 184, 199, 255, 242, 128, 128, 128},
-      { 1, 95, 147, 247, 168, 190, 255, 239, 255, 255, 128},
-      { 1, 95, 147, 247, 168, 190, 255, 239, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 7 */
-      { 1, 19, 98, 255, 218, 222, 255, 255, 128, 128, 128},
-      { 36, 50, 210, 255, 212, 221, 255, 255, 128, 128, 128},
-      { 6, 117, 180, 254, 199, 216, 255, 251, 128, 128, 128},
-      { 6, 117, 180, 254, 199, 216, 255, 251, 128, 128, 128}
-    }
-  },
-  { /* block Type 3 */
-    { /* Coeff Band 0 */
-      { 192, 18, 155, 172, 145, 164, 192, 135, 246, 223, 255},
-      { 94, 29, 97, 131, 131, 153, 171, 121, 250, 190, 255},
-      { 25, 29, 63, 128, 119, 147, 168, 124, 251, 183, 255},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    { /* Coeff Band 1 */
-      { 1, 108, 192, 220, 186, 173, 255, 194, 255, 255, 128},
-      { 123, 104, 188, 221, 165, 171, 247, 180, 255, 255, 128},
-      { 23, 76, 152, 216, 154, 166, 226, 182, 255, 209, 128},
-      { 1, 26, 52, 162, 109, 152, 208, 144, 255, 231, 128}
-    },
-    { /* Coeff Band 2 */
-      { 1, 57, 179, 220, 156, 175, 210, 158, 255, 223, 128},
-      { 48, 57, 134, 212, 151, 170, 219, 185, 255, 248, 128},
-      { 4, 35, 63, 189, 120, 156, 221, 159, 255, 241, 128},
-      { 1, 17, 23, 110, 97, 143, 187, 120, 255, 234, 128}
-    },
-    { /* Coeff Band 3 */
-      { 1, 115, 205, 243, 182, 187, 254, 218, 255, 255, 128},
-      { 80, 101, 186, 241, 183, 186, 249, 182, 255, 255, 128},
-      { 10, 81, 144, 229, 164, 175, 241, 185, 255, 255, 128},
-      { 1, 44, 81, 192, 130, 148, 240, 180, 255, 255, 128}
-    },
-    { /* Coeff Band 4 */
-      { 1, 161, 207, 249, 187, 176, 255, 180, 128, 128, 128},
-      { 79, 148, 196, 240, 186, 182, 253, 171, 255, 255, 128},
-      { 14, 111, 171, 233, 170, 178, 235, 204, 255, 255, 128},
-      { 1, 63, 103, 202, 143, 162, 240, 178, 255, 255, 128}
-    },
-    { /* Coeff Band 5 */
-      { 1, 101, 202, 239, 185, 184, 252, 186, 255, 255, 128},
-      { 43, 67, 166, 237, 178, 190, 246, 194, 255, 255, 128},
-      { 4, 49, 85, 220, 140, 168, 253, 182, 255, 255, 128},
-      { 1, 24, 35, 144, 93, 135, 239, 159, 255, 253, 128}
-    },
-    { /* Coeff Band 6 */
-      { 1, 212, 243, 255, 240, 234, 255, 255, 128, 128, 128},
-      { 98, 168, 234, 255, 229, 234, 255, 255, 128, 128, 128},
-      { 19, 127, 199, 255, 212, 198, 255, 255, 128, 128, 128},
-      { 1, 103, 162, 253, 186, 151, 255, 255, 128, 128, 128}
-    },
-    { /* Coeff Band 7 */
-      { 1, 188, 253, 255, 255, 128, 128, 128, 128, 128, 128},
-      { 191, 68, 242, 255, 255, 128, 128, 128, 128, 128, 128},
-      { 8, 132, 255, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    }
-  }
-};
-
-static const vp9_prob
-  default_coef_probs_16x16[BLOCK_TYPES_16X16]
-                          [COEF_BANDS]
-                          [PREV_COEF_CONTEXTS]
-                          [ENTROPY_NODES] = {
-  { /* block Type 0 */
-    { /* Coeff Band 0 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    { /* Coeff Band 1 */
-      { 60, 140, 195, 255, 212, 214, 128, 128, 128, 128, 128},
-      { 75, 221, 231, 255, 203, 255, 128, 128, 128, 128, 128},
-      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128},
-      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128}
-    },
-    { /* Coeff Band 2 */
-      { 1, 227, 226, 255, 215, 215, 128, 128, 128, 128, 128},
-      { 5, 163, 209, 255, 212, 212, 255, 255, 128, 128, 128},
-      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128},
-      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128}
-    },
-    { /* Coeff Band 3 */
-      { 1, 226, 225, 255, 228, 236, 128, 128, 128, 128, 128},
-      { 6, 163, 208, 255, 224, 234, 255, 255, 128, 128, 128},
-      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128},
-      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128}
-    },
-    { /* Coeff Band 4 */
-      { 1, 222, 197, 254, 193, 216, 255, 236, 128, 128, 128},
-      { 7, 140, 163, 251, 195, 211, 255, 238, 128, 128, 128},
-      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128},
-      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128}
-    },
-    { /* Coeff Band 5 */
-      { 1, 226, 218, 255, 216, 241, 255, 255, 128, 128, 128},
-      { 6, 154, 191, 255, 218, 240, 255, 255, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
-    },
-    { /* Coeff Band 6 */
-      { 1, 221, 217, 255, 208, 217, 255, 232, 128, 128, 128},
-      { 11, 155, 189, 254, 203, 211, 255, 249, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
-    },
-    { /* Coeff Band 7 */
-      { 1, 207, 235, 255, 232, 240, 128, 128, 128, 128, 128},
-      { 58, 161, 216, 255, 229, 235, 255, 255, 128, 128, 128},
-      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128},
-      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128}
-    }
-  },
-  { /* block Type 1 */
-      { /* Coeff Band 0 */
-        { 1, 30, 103, 204, 142, 168, 235, 161, 255, 228, 128},
-        { 1, 35, 90, 192, 130, 161, 227, 158, 255, 226, 255},
-        { 1, 36, 78, 180, 122, 156, 221, 153, 255, 222, 255},
-        { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-      },
-      { /* Coeff Band 1 */
-        { 1, 163, 228, 253, 212, 194, 255, 205, 128, 128, 128},
-        { 67, 160, 226, 253, 210, 202, 245, 172, 255, 255, 128},
-        { 51, 147, 219, 251, 207, 207, 255, 217, 128, 128, 128},
-        { 25, 107, 175, 245, 183, 190, 254, 209, 255, 255, 128}
-      },
-      { /* Coeff Band 2 */
-        { 1, 66, 170, 240, 177, 186, 252, 203, 255, 245, 128},
-        { 23, 64, 145, 230, 161, 177, 252, 198, 255, 255, 128},
-        { 6, 51, 99, 208, 135, 163, 249, 178, 255, 248, 128},
-        { 1, 33, 59, 161, 104, 151, 238, 164, 255, 237, 128}
-      },
-      { /* Coeff Band 3 */
-        { 1, 76, 216, 250, 198, 199, 255, 226, 255, 255, 128},
-        { 86, 83, 200, 247, 189, 193, 255, 224, 255, 255, 128},
-        { 30, 75, 164, 242, 172, 184, 254, 218, 255, 255, 128},
-        { 3, 54, 103, 227, 140, 172, 253, 201, 255, 255, 128}
-      },
-      { /* Coeff Band 4 */
-        { 1, 241, 247, 255, 233, 223, 255, 255, 128, 128, 128},
-        { 78, 212, 242, 255, 226, 230, 255, 255, 128, 128, 128},
-        { 10, 167, 224, 255, 217, 225, 255, 128, 128, 128, 128},
-        { 1, 104, 176, 250, 166, 219, 255, 255, 128, 128, 128}
-      },
-      { /* Coeff Band 5 */
-        { 1, 194, 241, 254, 228, 214, 248, 237, 255, 255, 128},
-        { 95, 133, 228, 254, 218, 215, 255, 229, 128, 128, 128},
-        { 24, 119, 201, 252, 202, 205, 255, 229, 128, 128, 128},
-        { 1, 88, 155, 246, 183, 193, 255, 205, 128, 128, 128}
-      },
-      { /* Coeff Band 6 */
-        { 1, 204, 236, 255, 222, 220, 255, 239, 128, 128, 128},
-        { 126, 105, 225, 254, 214, 217, 255, 254, 128, 128, 128},
-        { 44, 86, 190, 251, 197, 204, 255, 233, 128, 128, 128},
-        { 6, 71, 130, 240, 164, 188, 255, 246, 128, 128, 128}
-      },
-      { /* Coeff Band 7 */
-        { 1, 195, 250, 255, 239, 197, 128, 128, 128, 128, 128},
-        { 167, 102, 249, 255, 234, 255, 128, 128, 128, 128, 128},
-        { 65, 91, 222, 255, 217, 255, 128, 128, 128, 128, 128},
-        { 1, 59, 128, 255, 154, 255, 128, 128, 128, 128, 128}
-      }
-  },
-  { /* block Type 2 */
-      { /* Coeff Band 0 */
-        { 1, 30, 103, 204, 142, 168, 235, 161, 255, 228, 128},
-        { 1, 35, 90, 192, 130, 161, 227, 158, 255, 226, 255},
-        { 1, 36, 78, 180, 122, 156, 221, 153, 255, 222, 255},
-        { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-      },
-      { /* Coeff Band 1 */
-        { 1, 163, 228, 253, 212, 194, 255, 205, 128, 128, 128},
-        { 67, 160, 226, 253, 210, 202, 245, 172, 255, 255, 128},
-        { 51, 147, 219, 251, 207, 207, 255, 217, 128, 128, 128},
-        { 25, 107, 175, 245, 183, 190, 254, 209, 255, 255, 128}
-      },
-      { /* Coeff Band 2 */
-        { 1, 66, 170, 240, 177, 186, 252, 203, 255, 245, 128},
-        { 23, 64, 145, 230, 161, 177, 252, 198, 255, 255, 128},
-        { 6, 51, 99, 208, 135, 163, 249, 178, 255, 248, 128},
-        { 1, 33, 59, 161, 104, 151, 238, 164, 255, 237, 128}
-      },
-      { /* Coeff Band 3 */
-        { 1, 76, 216, 250, 198, 199, 255, 226, 255, 255, 128},
-        { 86, 83, 200, 247, 189, 193, 255, 224, 255, 255, 128},
-        { 30, 75, 164, 242, 172, 184, 254, 218, 255, 255, 128},
-        { 3, 54, 103, 227, 140, 172, 253, 201, 255, 255, 128}
-      },
-      { /* Coeff Band 4 */
-        { 1, 241, 247, 255, 233, 223, 255, 255, 128, 128, 128},
-        { 78, 212, 242, 255, 226, 230, 255, 255, 128, 128, 128},
-        { 10, 167, 224, 255, 217, 225, 255, 128, 128, 128, 128},
-        { 1, 104, 176, 250, 166, 219, 255, 255, 128, 128, 128}
-      },
-      { /* Coeff Band 5 */
-        { 1, 194, 241, 254, 228, 214, 248, 237, 255, 255, 128},
-        { 95, 133, 228, 254, 218, 215, 255, 229, 128, 128, 128},
-        { 24, 119, 201, 252, 202, 205, 255, 229, 128, 128, 128},
-        { 1, 88, 155, 246, 183, 193, 255, 205, 128, 128, 128}
-      },
-      { /* Coeff Band 6 */
-        { 1, 204, 236, 255, 222, 220, 255, 239, 128, 128, 128},
-        { 126, 105, 225, 254, 214, 217, 255, 254, 128, 128, 128},
-        { 44, 86, 190, 251, 197, 204, 255, 233, 128, 128, 128},
-        { 6, 71, 130, 240, 164, 188, 255, 246, 128, 128, 128}
-      },
-      { /* Coeff Band 7 */
-        { 1, 195, 250, 255, 239, 197, 128, 128, 128, 128, 128},
-        { 167, 102, 249, 255, 234, 255, 128, 128, 128, 128, 128},
-        { 65, 91, 222, 255, 217, 255, 128, 128, 128, 128, 128},
-        { 1, 59, 128, 255, 154, 255, 128, 128, 128, 128, 128}
-      }
-  },
-  { /* block Type 3 */
-    { /* Coeff Band 0 */
-      { 17, 105, 227, 195, 164, 170, 168, 137, 221, 160, 184},
-      { 6, 92, 166, 193, 158, 169, 179, 142, 236, 175, 200},
-      { 2, 68, 118, 193, 147, 168, 187, 149, 241, 178, 247},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    { /* Coeff Band 1 */
-      { 1, 193, 221, 246, 198, 194, 244, 176, 255, 192, 128},
-      { 112, 160, 209, 244, 196, 194, 243, 175, 255, 209, 128},
-      { 45, 123, 175, 240, 184, 195, 239, 178, 255, 218, 255},
-      { 16, 53, 75, 169, 119, 152, 209, 146, 255, 219, 255}
-    },
-    { /* Coeff Band 2 */
-      { 1, 141, 183, 240, 176, 187, 246, 198, 255, 218, 128},
-      { 36, 97, 150, 231, 161, 180, 243, 191, 255, 217, 255},
-      { 8, 65, 111, 210, 143, 166, 230, 167, 255, 224, 255},
-      { 2, 35, 61, 157, 113, 149, 208, 142, 255, 217, 255}
-    },
-    { /* Coeff Band 3 */
-      { 1, 173, 196, 245, 184, 191, 252, 211, 255, 240, 128},
-      { 35, 119, 175, 242, 177, 187, 252, 209, 255, 235, 128},
-      { 4, 88, 141, 234, 161, 180, 249, 200, 255, 228, 128},
-      { 1, 57, 95, 203, 133, 161, 235, 167, 255, 231, 255}
-    },
-    { /* Coeff Band 4 */
-      { 1, 208, 227, 249, 209, 204, 248, 188, 255, 248, 128},
-      { 28, 162, 211, 247, 203, 200, 252, 188, 255, 232, 128},
-      { 5, 114, 174, 238, 182, 189, 245, 184, 255, 238, 128},
-      { 1, 61, 100, 205, 136, 164, 235, 163, 255, 239, 128}
-    },
-    { /* Coeff Band 5 */
-      { 1, 195, 218, 252, 208, 207, 250, 205, 255, 245, 128},
-      { 22, 141, 196, 249, 198, 201, 250, 202, 255, 244, 128},
-      { 2, 105, 163, 240, 178, 189, 246, 191, 255, 246, 128},
-      { 1, 70, 112, 206, 144, 167, 232, 162, 255, 239, 128}
-    },
-    { /* Coeff Band 6 */
-      { 1, 204, 215, 251, 204, 203, 255, 222, 255, 225, 128},
-      { 15, 140, 194, 249, 194, 199, 254, 221, 255, 253, 128},
-      { 1, 95, 153, 243, 172, 188, 254, 213, 255, 248, 128},
-      { 1, 59, 99, 216, 135, 166, 247, 190, 255, 237, 255}
-    },
-    { /* Coeff Band 7 */
-      { 1, 7, 231, 255, 227, 223, 255, 240, 255, 255, 128},
-      { 15, 157, 217, 255, 218, 219, 255, 239, 255, 255, 128},
-      { 1, 114, 182, 252, 198, 207, 255, 235, 255, 255, 128},
-      { 1, 71, 122, 238, 154, 181, 255, 216, 255, 255, 128}
-    }
-  }
-};
-
-static const vp9_prob
-  default_hybrid_coef_probs_16x16[BLOCK_TYPES_16X16]
-                                 [COEF_BANDS]
-                                 [PREV_COEF_CONTEXTS]
-                                 [ENTROPY_NODES] = {
-  { /* block Type 0 */
-    { /* Coeff Band 0 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    { /* Coeff Band 1 */
-      { 60, 140, 195, 255, 212, 214, 128, 128, 128, 128, 128},
-      { 75, 221, 231, 255, 203, 255, 128, 128, 128, 128, 128},
-      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128},
-      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128}
-    },
-    { /* Coeff Band 2 */
-      { 1, 227, 226, 255, 215, 215, 128, 128, 128, 128, 128},
-      { 5, 163, 209, 255, 212, 212, 255, 255, 128, 128, 128},
-      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128},
-      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128}
-    },
-    { /* Coeff Band 3 */
-      { 1, 226, 225, 255, 228, 236, 128, 128, 128, 128, 128},
-      { 6, 163, 208, 255, 224, 234, 255, 255, 128, 128, 128},
-      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128},
-      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128}
-    },
-    { /* Coeff Band 4 */
-      { 1, 222, 197, 254, 193, 216, 255, 236, 128, 128, 128},
-      { 7, 140, 163, 251, 195, 211, 255, 238, 128, 128, 128},
-      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128},
-      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128}
-    },
-    { /* Coeff Band 5 */
-      { 1, 226, 218, 255, 216, 241, 255, 255, 128, 128, 128},
-      { 6, 154, 191, 255, 218, 240, 255, 255, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
-    },
-    { /* Coeff Band 6 */
-      { 1, 221, 217, 255, 208, 217, 255, 232, 128, 128, 128},
-      { 11, 155, 189, 254, 203, 211, 255, 249, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
-    },
-    { /* Coeff Band 7 */
-      { 1, 207, 235, 255, 232, 240, 128, 128, 128, 128, 128},
-      { 58, 161, 216, 255, 229, 235, 255, 255, 128, 128, 128},
-      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128},
-      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128}
-    }
-  },
-  { /* block Type 1 */
-      { /* Coeff Band 0 */
-        { 1, 30, 103, 204, 142, 168, 235, 161, 255, 228, 128},
-        { 1, 35, 90, 192, 130, 161, 227, 158, 255, 226, 255},
-        { 1, 36, 78, 180, 122, 156, 221, 153, 255, 222, 255},
-        { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-      },
-      { /* Coeff Band 1 */
-        { 1, 163, 228, 253, 212, 194, 255, 205, 128, 128, 128},
-        { 67, 160, 226, 253, 210, 202, 245, 172, 255, 255, 128},
-        { 51, 147, 219, 251, 207, 207, 255, 217, 128, 128, 128},
-        { 25, 107, 175, 245, 183, 190, 254, 209, 255, 255, 128}
-      },
-      { /* Coeff Band 2 */
-        { 1, 66, 170, 240, 177, 186, 252, 203, 255, 245, 128},
-        { 23, 64, 145, 230, 161, 177, 252, 198, 255, 255, 128},
-        { 6, 51, 99, 208, 135, 163, 249, 178, 255, 248, 128},
-        { 1, 33, 59, 161, 104, 151, 238, 164, 255, 237, 128}
-      },
-      { /* Coeff Band 3 */
-        { 1, 76, 216, 250, 198, 199, 255, 226, 255, 255, 128},
-        { 86, 83, 200, 247, 189, 193, 255, 224, 255, 255, 128},
-        { 30, 75, 164, 242, 172, 184, 254, 218, 255, 255, 128},
-        { 3, 54, 103, 227, 140, 172, 253, 201, 255, 255, 128}
-      },
-      { /* Coeff Band 4 */
-        { 1, 241, 247, 255, 233, 223, 255, 255, 128, 128, 128},
-        { 78, 212, 242, 255, 226, 230, 255, 255, 128, 128, 128},
-        { 10, 167, 224, 255, 217, 225, 255, 128, 128, 128, 128},
-        { 1, 104, 176, 250, 166, 219, 255, 255, 128, 128, 128}
-      },
-      { /* Coeff Band 5 */
-        { 1, 194, 241, 254, 228, 214, 248, 237, 255, 255, 128},
-        { 95, 133, 228, 254, 218, 215, 255, 229, 128, 128, 128},
-        { 24, 119, 201, 252, 202, 205, 255, 229, 128, 128, 128},
-        { 1, 88, 155, 246, 183, 193, 255, 205, 128, 128, 128}
-      },
-      { /* Coeff Band 6 */
-        { 1, 204, 236, 255, 222, 220, 255, 239, 128, 128, 128},
-        { 126, 105, 225, 254, 214, 217, 255, 254, 128, 128, 128},
-        { 44, 86, 190, 251, 197, 204, 255, 233, 128, 128, 128},
-        { 6, 71, 130, 240, 164, 188, 255, 246, 128, 128, 128}
-      },
-      { /* Coeff Band 7 */
-        { 1, 195, 250, 255, 239, 197, 128, 128, 128, 128, 128},
-        { 167, 102, 249, 255, 234, 255, 128, 128, 128, 128, 128},
-        { 65, 91, 222, 255, 217, 255, 128, 128, 128, 128, 128},
-        { 1, 59, 128, 255, 154, 255, 128, 128, 128, 128, 128}
-      }
-  },
-  { /* block Type 2 */
-      { /* Coeff Band 0 */
-        { 1, 30, 103, 204, 142, 168, 235, 161, 255, 228, 128},
-        { 1, 35, 90, 192, 130, 161, 227, 158, 255, 226, 255},
-        { 1, 36, 78, 180, 122, 156, 221, 153, 255, 222, 255},
-        { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-      },
-      { /* Coeff Band 1 */
-        { 1, 163, 228, 253, 212, 194, 255, 205, 128, 128, 128},
-        { 67, 160, 226, 253, 210, 202, 245, 172, 255, 255, 128},
-        { 51, 147, 219, 251, 207, 207, 255, 217, 128, 128, 128},
-        { 25, 107, 175, 245, 183, 190, 254, 209, 255, 255, 128}
-      },
-      { /* Coeff Band 2 */
-        { 1, 66, 170, 240, 177, 186, 252, 203, 255, 245, 128},
-        { 23, 64, 145, 230, 161, 177, 252, 198, 255, 255, 128},
-        { 6, 51, 99, 208, 135, 163, 249, 178, 255, 248, 128},
-        { 1, 33, 59, 161, 104, 151, 238, 164, 255, 237, 128}
-      },
-      { /* Coeff Band 3 */
-        { 1, 76, 216, 250, 198, 199, 255, 226, 255, 255, 128},
-        { 86, 83, 200, 247, 189, 193, 255, 224, 255, 255, 128},
-        { 30, 75, 164, 242, 172, 184, 254, 218, 255, 255, 128},
-        { 3, 54, 103, 227, 140, 172, 253, 201, 255, 255, 128}
-      },
-      { /* Coeff Band 4 */
-        { 1, 241, 247, 255, 233, 223, 255, 255, 128, 128, 128},
-        { 78, 212, 242, 255, 226, 230, 255, 255, 128, 128, 128},
-        { 10, 167, 224, 255, 217, 225, 255, 128, 128, 128, 128},
-        { 1, 104, 176, 250, 166, 219, 255, 255, 128, 128, 128}
-      },
-      { /* Coeff Band 5 */
-        { 1, 194, 241, 254, 228, 214, 248, 237, 255, 255, 128},
-        { 95, 133, 228, 254, 218, 215, 255, 229, 128, 128, 128},
-        { 24, 119, 201, 252, 202, 205, 255, 229, 128, 128, 128},
-        { 1, 88, 155, 246, 183, 193, 255, 205, 128, 128, 128}
-      },
-      { /* Coeff Band 6 */
-        { 1, 204, 236, 255, 222, 220, 255, 239, 128, 128, 128},
-        { 126, 105, 225, 254, 214, 217, 255, 254, 128, 128, 128},
-        { 44, 86, 190, 251, 197, 204, 255, 233, 128, 128, 128},
-        { 6, 71, 130, 240, 164, 188, 255, 246, 128, 128, 128}
-      },
-      { /* Coeff Band 7 */
-        { 1, 195, 250, 255, 239, 197, 128, 128, 128, 128, 128},
-        { 167, 102, 249, 255, 234, 255, 128, 128, 128, 128, 128},
-        { 65, 91, 222, 255, 217, 255, 128, 128, 128, 128, 128},
-        { 1, 59, 128, 255, 154, 255, 128, 128, 128, 128, 128}
-      }
-  },
-  { /* block Type 3 */
-    { /* Coeff Band 0 */
-      { 17, 105, 227, 195, 164, 170, 168, 137, 221, 160, 184},
-      { 6, 92, 166, 193, 158, 169, 179, 142, 236, 175, 200},
-      { 2, 68, 118, 193, 147, 168, 187, 149, 241, 178, 247},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    { /* Coeff Band 1 */
-      { 1, 193, 221, 246, 198, 194, 244, 176, 255, 192, 128},
-      { 112, 160, 209, 244, 196, 194, 243, 175, 255, 209, 128},
-      { 45, 123, 175, 240, 184, 195, 239, 178, 255, 218, 255},
-      { 16, 53, 75, 169, 119, 152, 209, 146, 255, 219, 255}
-    },
-    { /* Coeff Band 2 */
-      { 1, 141, 183, 240, 176, 187, 246, 198, 255, 218, 128},
-      { 36, 97, 150, 231, 161, 180, 243, 191, 255, 217, 255},
-      { 8, 65, 111, 210, 143, 166, 230, 167, 255, 224, 255},
-      { 2, 35, 61, 157, 113, 149, 208, 142, 255, 217, 255}
-    },
-    { /* Coeff Band 3 */
-      { 1, 173, 196, 245, 184, 191, 252, 211, 255, 240, 128},
-      { 35, 119, 175, 242, 177, 187, 252, 209, 255, 235, 128},
-      { 4, 88, 141, 234, 161, 180, 249, 200, 255, 228, 128},
-      { 1, 57, 95, 203, 133, 161, 235, 167, 255, 231, 255}
-    },
-    { /* Coeff Band 4 */
-      { 1, 208, 227, 249, 209, 204, 248, 188, 255, 248, 128},
-      { 28, 162, 211, 247, 203, 200, 252, 188, 255, 232, 128},
-      { 5, 114, 174, 238, 182, 189, 245, 184, 255, 238, 128},
-      { 1, 61, 100, 205, 136, 164, 235, 163, 255, 239, 128}
-    },
-    { /* Coeff Band 5 */
-      { 1, 195, 218, 252, 208, 207, 250, 205, 255, 245, 128},
-      { 22, 141, 196, 249, 198, 201, 250, 202, 255, 244, 128},
-      { 2, 105, 163, 240, 178, 189, 246, 191, 255, 246, 128},
-      { 1, 70, 112, 206, 144, 167, 232, 162, 255, 239, 128}
-    },
-    { /* Coeff Band 6 */
-      { 1, 204, 215, 251, 204, 203, 255, 222, 255, 225, 128},
-      { 15, 140, 194, 249, 194, 199, 254, 221, 255, 253, 128},
-      { 1, 95, 153, 243, 172, 188, 254, 213, 255, 248, 128},
-      { 1, 59, 99, 216, 135, 166, 247, 190, 255, 237, 255}
-    },
-    { /* Coeff Band 7 */
-      { 1, 7, 231, 255, 227, 223, 255, 240, 255, 255, 128},
-      { 15, 157, 217, 255, 218, 219, 255, 239, 255, 255, 128},
-      { 1, 114, 182, 252, 198, 207, 255, 235, 255, 255, 128},
-      { 1, 71, 122, 238, 154, 181, 255, 216, 255, 255, 128}
-    }
-  }
-};
--- a/vp8/common/entropy.c
+++ /dev/null
@@ -1,447 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <stdio.h>
-
-#include "entropy.h"
-#include "string.h"
-#include "blockd.h"
-#include "onyxc_int.h"
-#include "entropymode.h"
-#include "vpx_mem/vpx_mem.h"
-
-#define uchar unsigned char     /* typedefs can clash */
-#define uint  unsigned int
-
-typedef const uchar cuchar;
-typedef const uint cuint;
-
-typedef vp9_prob Prob;
-
-#include "coefupdateprobs.h"
-
-const int vp9_i8x8_block[4] = {0, 2, 8, 10};
-
-DECLARE_ALIGNED(16, const unsigned char, vp9_norm[256]) = {
-  0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
-  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-};
-
-DECLARE_ALIGNED(16, const int, vp9_coef_bands[16]) = {
-  0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7
-};
-
-DECLARE_ALIGNED(16, cuchar, vp9_prev_token_class[MAX_ENTROPY_TOKENS]) = {
-  0, 1, 2, 2, 3, 3, 3, 3, 3, 3, 3, 0
-};
-
-DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d[16]) = {
-  0,  1,  4,  8,
-  5,  2,  3,  6,
-  9, 12, 13, 10,
-  7, 11, 14, 15,
-};
-
-DECLARE_ALIGNED(16, const int, vp9_col_scan[16]) = {
-  0, 4,  8, 12,
-  1, 5,  9, 13,
-  2, 6, 10, 14,
-  3, 7, 11, 15
-};
-DECLARE_ALIGNED(16, const int, vp9_row_scan[16]) = {
-  0,   1,  2,  3,
-  4,   5,  6,  7,
-  8,   9, 10, 11,
-  12, 13, 14, 15
-};
-
-
-DECLARE_ALIGNED(64, const int, vp9_coef_bands_8x8[64]) = {
-  0, 1, 2, 3, 5, 4, 4, 5,
-  5, 3, 6, 3, 5, 4, 6, 6,
-  6, 5, 5, 6, 6, 6, 6, 6,
-  6, 6, 6, 6, 6, 6, 6, 6,
-  6, 6, 6, 6, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7
-};
-DECLARE_ALIGNED(64, const int, vp9_default_zig_zag1d_8x8[64]) = {
-  0,  1,  8, 16,  9,  2,  3, 10, 17, 24, 32, 25, 18, 11,  4,  5,
-  12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13,  6,  7, 14, 21, 28,
-  35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51,
-  58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63,
-};
-
-// Table can be optimized.
-DECLARE_ALIGNED(16, const int, vp9_coef_bands_16x16[256]) = {
-    0, 1, 2, 3, 5, 4, 4, 5, 5, 3, 6, 3, 5, 4, 6, 6,
-    6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-};
-DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]) = {
-      0,   1,  16,  32,  17,   2,   3,  18,  33,  48,  64,  49,  34,  19,   4,   5,
-     20,  35,  50,  65,  80,  96,  81,  66,  51,  36,  21,   6,   7,  22,  37,  52,
-     67,  82,  97, 112, 128, 113,  98,  83,  68,  53,  38,  23,   8,   9,  24,  39,
-     54,  69,  84,  99, 114, 129, 144, 160, 145, 130, 115, 100,  85,  70,  55,  40,
-     25,  10,  11,  26,  41,  56,  71,  86, 101, 116, 131, 146, 161, 176, 192, 177,
-    162, 147, 132, 117, 102,  87,  72,  57,  42,  27,  12,  13,  28,  43,  58,  73,
-     88, 103, 118, 133, 148, 163, 178, 193, 208, 224, 209, 194, 179, 164, 149, 134,
-    119, 104,  89,  74,  59,  44,  29,  14,  15,  30,  45,  60,  75,  90, 105, 120,
-    135, 150, 165, 180, 195, 210, 225, 240, 241, 226, 211, 196, 181, 166, 151, 136,
-    121, 106,  91,  76,  61,  46,  31,  47,  62,  77,  92, 107, 122, 137, 152, 167,
-    182, 197, 212, 227, 242, 243, 228, 213, 198, 183, 168, 153, 138, 123, 108,  93,
-     78,  63,  79,  94, 109, 124, 139, 154, 169, 184, 199, 214, 229, 244, 245, 230,
-    215, 200, 185, 170, 155, 140, 125, 110,  95, 111, 126, 141, 156, 171, 186, 201,
-    216, 231, 246, 247, 232, 217, 202, 187, 172, 157, 142, 127, 143, 158, 173, 188,
-    203, 218, 233, 248, 249, 234, 219, 204, 189, 174, 159, 175, 190, 205, 220, 235,
-    250, 251, 236, 221, 206, 191, 207, 222, 237, 252, 253, 238, 223, 239, 254, 255,
-};
-
-
-/* Array indices are identical to previously-existing CONTEXT_NODE indices */
-
-const vp9_tree_index vp9_coef_tree[ 22] =     /* corresponding _CONTEXT_NODEs */
-{
-  -DCT_EOB_TOKEN, 2,                             /* 0 = EOB */
-  -ZERO_TOKEN, 4,                               /* 1 = ZERO */
-  -ONE_TOKEN, 6,                               /* 2 = ONE */
-  8, 12,                                      /* 3 = LOW_VAL */
-  -TWO_TOKEN, 10,                            /* 4 = TWO */
-  -THREE_TOKEN, -FOUR_TOKEN,                /* 5 = THREE */
-  14, 16,                                    /* 6 = HIGH_LOW */
-  -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2,   /* 7 = CAT_ONE */
-  18, 20,                                   /* 8 = CAT_THREEFOUR */
-  -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4,  /* 9 = CAT_THREE */
-  -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6   /* 10 = CAT_FIVE */
-};
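The tree is stored as a flat array: a non-negative entry is the index of the next node pair, and a negative entry is a leaf holding the negated token. A minimal decode walk, assuming a hypothetical bool_decoder type and read_bool(r, prob) primitive (both stand-ins for the real boolean decoder):

    static int decode_token(bool_decoder *r, const vp9_tree_index *tree,
                            const vp9_prob *probs) {
      vp9_tree_index i = 0;
      /* probs[i >> 1] is the probability of taking the 0-branch at node i;
         descend until a non-positive (leaf) entry is reached. */
      while ((i = tree[i + read_bool(r, probs[i >> 1])]) > 0)
        ;
      return -i;  /* leaves store the negated token value */
    }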
-
-struct vp9_token_struct vp9_coef_encodings[MAX_ENTROPY_TOKENS];
-
-/* Trees for extra bits.  Probabilities are constant and
-   do not depend on previously encoded bits */
-
-static const Prob Pcat1[] = { 159};
-static const Prob Pcat2[] = { 165, 145};
-static const Prob Pcat3[] = { 173, 148, 140};
-static const Prob Pcat4[] = { 176, 155, 140, 135};
-static const Prob Pcat5[] = { 180, 157, 141, 134, 130};
-static const Prob Pcat6[] =
-{ 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129};
-
-static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[26];
-
-static void init_bit_tree(vp9_tree_index *p, int n) {
-  int i = 0;
-
-  while (++i < n) {
-    p[0] = p[1] = i << 1;
-    p += 2;
-  }
-
-  p[0] = p[1] = 0;
-}
-
-static void init_bit_trees() {
-  init_bit_tree(cat1, 1);
-  init_bit_tree(cat2, 2);
-  init_bit_tree(cat3, 3);
-  init_bit_tree(cat4, 4);
-  init_bit_tree(cat5, 5);
-  init_bit_tree(cat6, 13);
-}
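init_bit_tree() builds a degenerate tree that always descends, so walking catN reads exactly n bits whatever their values; the final entry of 0 terminates the walk sketched above. For example, after init_bit_tree(cat3, 3):

    /* cat3 == { 2, 2,  4, 4,  0, 0 }:
       node 0 -> node 1 -> node 2 -> leaf, one Pcat3 probability per level. */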
-
-vp9_extra_bit_struct vp9_extra_bits[12] = {
-  { 0, 0, 0, 0},
-  { 0, 0, 0, 1},
-  { 0, 0, 0, 2},
-  { 0, 0, 0, 3},
-  { 0, 0, 0, 4},
-  { cat1, Pcat1, 1, 5},
-  { cat2, Pcat2, 2, 7},
-  { cat3, Pcat3, 3, 11},
-  { cat4, Pcat4, 4, 19},
-  { cat5, Pcat5, 5, 35},
-  { cat6, Pcat6, 13, 67},
-  { 0, 0, 0, 0}
-};
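Each DCT_VAL_CATEGORYn token covers a range of magnitudes starting at base_val; its Len extra bits, coded with the fixed PcatN probabilities, select the offset within that range. A sketch of the reconstruction, equivalent to walking the catN trees and reusing the hypothetical read_bool() from above:

    static int decode_extra(bool_decoder *r, int token) {
      const vp9_extra_bit_struct *e = &vp9_extra_bits[token];
      int v = e->base_val, i;
      /* Len fixed-probability bits, most significant first. */
      for (i = 0; i < e->Len; i++)
        v += read_bool(r, e->prob[i]) << (e->Len - 1 - i);
      return v;  /* e.g. DCT_VAL_CATEGORY3: 11 + 3 bits -> 11..18 */
    }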
-
-#include "default_coef_probs.h"
-
-void vp9_default_coef_probs(VP9_COMMON *pc) {
-  vpx_memcpy(pc->fc.coef_probs, default_coef_probs,
-             sizeof(pc->fc.coef_probs));
-  vpx_memcpy(pc->fc.hybrid_coef_probs, default_hybrid_coef_probs,
-             sizeof(pc->fc.hybrid_coef_probs));
-
-  vpx_memcpy(pc->fc.coef_probs_8x8, default_coef_probs_8x8,
-             sizeof(pc->fc.coef_probs_8x8));
-  vpx_memcpy(pc->fc.hybrid_coef_probs_8x8, default_hybrid_coef_probs_8x8,
-             sizeof(pc->fc.hybrid_coef_probs_8x8));
-
-  vpx_memcpy(pc->fc.coef_probs_16x16, default_coef_probs_16x16,
-             sizeof(pc->fc.coef_probs_16x16));
-  vpx_memcpy(pc->fc.hybrid_coef_probs_16x16,
-             default_hybrid_coef_probs_16x16,
-             sizeof(pc->fc.hybrid_coef_probs_16x16));
-}
-
-void vp9_coef_tree_initialize() {
-  init_bit_trees();
-  vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree);
-}
-
-// #define COEF_COUNT_TESTING
-
-#define COEF_COUNT_SAT 24
-#define COEF_MAX_UPDATE_FACTOR 112
-#define COEF_COUNT_SAT_KEY 24
-#define COEF_MAX_UPDATE_FACTOR_KEY 112
-#define COEF_COUNT_SAT_AFTER_KEY 24
-#define COEF_MAX_UPDATE_FACTOR_AFTER_KEY 128
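These constants drive the per-frame adaptation in vp9_adapt_coef_probs() below: each node's branch count is clamped to count_sat, scaled into a factor out of 256, and used to blend the newly measured probability with the previous frame's. A standalone sketch of the blend that every loop below repeats:

    static vp9_prob blend_prob(vp9_prob pre, vp9_prob measured,
                               int count, int count_sat, int update_factor) {
      int factor, prob;
      if (count > count_sat)
        count = count_sat;
      factor = update_factor * count / count_sat;  /* 0..update_factor */
      prob = ((int)pre * (256 - factor) + (int)measured * factor + 128) >> 8;
      /* probabilities must stay in [1, 255] for the boolean coder */
      return prob < 1 ? 1 : prob > 255 ? 255 : prob;
    }

With count_sat 24 and update_factor 112, a fully saturated count moves the old probability at most 112/256 of the way toward the new estimate; the 128 factor used after a key frame allows a faster move of up to half.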
-
-void vp9_adapt_coef_probs(VP9_COMMON *cm) {
-  int t, i, j, k, count;
-  unsigned int branch_ct[ENTROPY_NODES][2];
-  vp9_prob coef_probs[ENTROPY_NODES];
-  int update_factor; /* denominator 256 */
-  int factor;
-  int count_sat;
-
-  // printf("Frame type: %d\n", cm->frame_type);
-  if (cm->frame_type == KEY_FRAME) {
-    update_factor = COEF_MAX_UPDATE_FACTOR_KEY;
-    count_sat = COEF_COUNT_SAT_KEY;
-  } else if (cm->last_frame_type == KEY_FRAME) {
-    update_factor = COEF_MAX_UPDATE_FACTOR_AFTER_KEY;  /* adapt quickly */
-    count_sat = COEF_COUNT_SAT_AFTER_KEY;
-  } else {
-    update_factor = COEF_MAX_UPDATE_FACTOR;
-    count_sat = COEF_COUNT_SAT;
-  }
-
-#ifdef COEF_COUNT_TESTING
-  {
-    printf("static const unsigned int\ncoef_counts"
-           "[BLOCK_TYPES] [COEF_BANDS]"
-           "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {\n");
-    for (i = 0; i < BLOCK_TYPES; ++i) {
-      printf("  {\n");
-      for (j = 0; j < COEF_BANDS; ++j) {
-        printf("    {\n");
-        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-          printf("      {");
-          for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-            printf("%d, ", cm->fc.coef_counts[i][j][k][t]);
-          printf("},\n");
-        }
-        printf("    },\n");
-      }
-      printf("  },\n");
-    }
-    printf("};\n");
-    printf("static const unsigned int\ncoef_counts_8x8"
-           "[BLOCK_TYPES_8X8] [COEF_BANDS]"
-           "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {\n");
-    for (i = 0; i < BLOCK_TYPES_8X8; ++i) {
-      printf("  {\n");
-      for (j = 0; j < COEF_BANDS; ++j) {
-        printf("    {\n");
-        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-          printf("      {");
-          for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-            printf("%d, ", cm->fc.coef_counts_8x8[i][j][k][t]);
-          printf("},\n");
-        }
-        printf("    },\n");
-      }
-      printf("  },\n");
-    }
-    printf("};\n");
-    printf("static const unsigned int\nhybrid_coef_counts"
-           "[BLOCK_TYPES] [COEF_BANDS]"
-           "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {\n");
-    for (i = 0; i < BLOCK_TYPES; ++i) {
-      printf("  {\n");
-      for (j = 0; j < COEF_BANDS; ++j) {
-        printf("    {\n");
-        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-          printf("      {");
-          for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-            printf("%d, ", cm->fc.hybrid_coef_counts[i][j][k][t]);
-          printf("},\n");
-        }
-        printf("    },\n");
-      }
-      printf("  },\n");
-    }
-    printf("};\n");
-  }
-#endif
-
-  for (i = 0; i < BLOCK_TYPES; ++i)
-    for (j = 0; j < COEF_BANDS; ++j)
-      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-          continue;
-        vp9_tree_probs_from_distribution(
-          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-          coef_probs, branch_ct, cm->fc.coef_counts [i][j][k],
-          256, 1);
-        for (t = 0; t < ENTROPY_NODES; ++t) {
-          int prob;
-          count = branch_ct[t][0] + branch_ct[t][1];
-          count = count > count_sat ? count_sat : count;
-          factor = (update_factor * count / count_sat);
-          prob = ((int)cm->fc.pre_coef_probs[i][j][k][t] * (256 - factor) +
-                  (int)coef_probs[t] * factor + 128) >> 8;
-          if (prob <= 0) cm->fc.coef_probs[i][j][k][t] = 1;
-          else if (prob > 255) cm->fc.coef_probs[i][j][k][t] = 255;
-          else cm->fc.coef_probs[i][j][k][t] = prob;
-        }
-      }
-
-  for (i = 0; i < BLOCK_TYPES; ++i)
-    for (j = 0; j < COEF_BANDS; ++j)
-      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-          continue;
-        vp9_tree_probs_from_distribution(
-          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-          coef_probs, branch_ct, cm->fc.hybrid_coef_counts [i][j][k],
-          256, 1);
-        for (t = 0; t < ENTROPY_NODES; ++t) {
-          int prob;
-          count = branch_ct[t][0] + branch_ct[t][1];
-          count = count > count_sat ? count_sat : count;
-          factor = (update_factor * count / count_sat);
-          prob = ((int)cm->fc.pre_hybrid_coef_probs[i][j][k][t] * (256 - factor) +
-                  (int)coef_probs[t] * factor + 128) >> 8;
-          if (prob <= 0) cm->fc.hybrid_coef_probs[i][j][k][t] = 1;
-          else if (prob > 255) cm->fc.hybrid_coef_probs[i][j][k][t] = 255;
-          else cm->fc.hybrid_coef_probs[i][j][k][t] = prob;
-        }
-      }
-
-  for (i = 0; i < BLOCK_TYPES_8X8; ++i)
-    for (j = 0; j < COEF_BANDS; ++j)
-      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-          continue;
-        vp9_tree_probs_from_distribution(
-          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-          coef_probs, branch_ct, cm->fc.coef_counts_8x8 [i][j][k],
-          256, 1);
-        for (t = 0; t < ENTROPY_NODES; ++t) {
-          int prob;
-          count = branch_ct[t][0] + branch_ct[t][1];
-          count = count > count_sat ? count_sat : count;
-          factor = (update_factor * count / count_sat);
-          prob = ((int)cm->fc.pre_coef_probs_8x8[i][j][k][t] * (256 - factor) +
-                  (int)coef_probs[t] * factor + 128) >> 8;
-          if (prob <= 0) cm->fc.coef_probs_8x8[i][j][k][t] = 1;
-          else if (prob > 255) cm->fc.coef_probs_8x8[i][j][k][t] = 255;
-          else cm->fc.coef_probs_8x8[i][j][k][t] = prob;
-        }
-      }
-
-  for (i = 0; i < BLOCK_TYPES_8X8; ++i)
-    for (j = 0; j < COEF_BANDS; ++j)
-      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-          continue;
-        vp9_tree_probs_from_distribution(
-          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-          coef_probs, branch_ct, cm->fc.hybrid_coef_counts_8x8 [i][j][k],
-          256, 1);
-        for (t = 0; t < ENTROPY_NODES; ++t) {
-          int prob;
-          count = branch_ct[t][0] + branch_ct[t][1];
-          count = count > count_sat ? count_sat : count;
-          factor = (update_factor * count / count_sat);
-          prob = ((int)cm->fc.pre_hybrid_coef_probs_8x8[i][j][k][t] *
-                  (256 - factor) +
-                  (int)coef_probs[t] * factor + 128) >> 8;
-          if (prob <= 0) cm->fc.hybrid_coef_probs_8x8[i][j][k][t] = 1;
-          else if (prob > 255) cm->fc.hybrid_coef_probs_8x8[i][j][k][t] = 255;
-          else cm->fc.hybrid_coef_probs_8x8[i][j][k][t] = prob;
-        }
-      }
-
-  for (i = 0; i < BLOCK_TYPES_16X16; ++i)
-    for (j = 0; j < COEF_BANDS; ++j)
-      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-          continue;
-        vp9_tree_probs_from_distribution(
-          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-          coef_probs, branch_ct, cm->fc.coef_counts_16x16[i][j][k], 256, 1);
-        for (t = 0; t < ENTROPY_NODES; ++t) {
-          int prob;
-          count = branch_ct[t][0] + branch_ct[t][1];
-          count = count > count_sat ? count_sat : count;
-          factor = (update_factor * count / count_sat);
-          prob = ((int)cm->fc.pre_coef_probs_16x16[i][j][k][t] *
-                  (256 - factor) +
-                  (int)coef_probs[t] * factor + 128) >> 8;
-          if (prob <= 0) cm->fc.coef_probs_16x16[i][j][k][t] = 1;
-          else if (prob > 255) cm->fc.coef_probs_16x16[i][j][k][t] = 255;
-          else cm->fc.coef_probs_16x16[i][j][k][t] = prob;
-        }
-      }
-
-  for (i = 0; i < BLOCK_TYPES_16X16; ++i)
-    for (j = 0; j < COEF_BANDS; ++j)
-      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-          continue;
-        vp9_tree_probs_from_distribution(
-          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-          coef_probs, branch_ct, cm->fc.hybrid_coef_counts_16x16[i][j][k], 256, 1);
-        for (t = 0; t < ENTROPY_NODES; ++t) {
-          int prob;
-          count = branch_ct[t][0] + branch_ct[t][1];
-          count = count > count_sat ? count_sat : count;
-          factor = (update_factor * count / count_sat);
-          prob = ((int)cm->fc.pre_hybrid_coef_probs_16x16[i][j][k][t] * (256 - factor) +
-                  (int)coef_probs[t] * factor + 128) >> 8;
-          if (prob <= 0) cm->fc.hybrid_coef_probs_16x16[i][j][k][t] = 1;
-          else if (prob > 255) cm->fc.hybrid_coef_probs_16x16[i][j][k][t] = 255;
-          else cm->fc.hybrid_coef_probs_16x16[i][j][k][t] = prob;
-        }
-      }
-}
--- a/vp8/common/entropy.h
+++ /dev/null
@@ -1,112 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_ENTROPY_H
-#define __INC_ENTROPY_H
-
-#include "treecoder.h"
-#include "blockd.h"
-#include "common.h"
-#include "coefupdateprobs.h"
-
-extern const int vp9_i8x8_block[4];
-
-/* Coefficient token alphabet */
-
-#define ZERO_TOKEN              0       /* 0         Extra Bits 0+0 */
-#define ONE_TOKEN               1       /* 1         Extra Bits 0+1 */
-#define TWO_TOKEN               2       /* 2         Extra Bits 0+1 */
-#define THREE_TOKEN             3       /* 3         Extra Bits 0+1 */
-#define FOUR_TOKEN              4       /* 4         Extra Bits 0+1 */
-#define DCT_VAL_CATEGORY1       5       /* 5-6       Extra Bits 1+1 */
-#define DCT_VAL_CATEGORY2       6       /* 7-10      Extra Bits 2+1 */
-#define DCT_VAL_CATEGORY3       7       /* 11-18     Extra Bits 3+1 */
-#define DCT_VAL_CATEGORY4       8       /* 19-34     Extra Bits 4+1 */
-#define DCT_VAL_CATEGORY5       9       /* 35-66     Extra Bits 5+1 */
-#define DCT_VAL_CATEGORY6       10      /* 67+       Extra Bits 13+1 */
-#define DCT_EOB_TOKEN           11      /* EOB       Extra Bits 0+0 */
-#define MAX_ENTROPY_TOKENS 12
-#define ENTROPY_NODES 11
-#define EOSB_TOKEN              127     /* Not signalled, encoder only */
-
-extern const vp9_tree_index vp9_coef_tree[];
-
-extern struct vp9_token_struct vp9_coef_encodings[MAX_ENTROPY_TOKENS];
-
-typedef struct {
-  vp9_tree_p tree;
-  const vp9_prob *prob;
-  int Len;
-  int base_val;
-} vp9_extra_bit_struct;
-
-extern vp9_extra_bit_struct vp9_extra_bits[12];    /* indexed by token value */
-
-#define PROB_UPDATE_BASELINE_COST   7
-
-#define MAX_PROB                255
-#define DCT_MAX_VALUE           8192
-
-/* Coefficients are predicted via a 3-dimensional probability table. */
-
-/* Outside dimension.  0 = Y no DC, 1 = Y2, 2 = UV, 3 = Y with DC */
-#define BLOCK_TYPES 4
-
-#define BLOCK_TYPES_8X8 4
-
-#define BLOCK_TYPES_16X16 4
-
-/* Middle dimension is a coarsening of the coefficient's
-   position within the 4x4 DCT. */
-
-#define COEF_BANDS 8
-extern DECLARE_ALIGNED(16, const int, vp9_coef_bands[16]);
-extern DECLARE_ALIGNED(64, const int, vp9_coef_bands_8x8[64]);
-extern DECLARE_ALIGNED(16, const int, vp9_coef_bands_16x16[256]);
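Together the first two dimensions plus the context described below select the node probabilities for one coefficient. A lookup sketch, assuming c is the coefficient's index in scan order and pt the left/above context:

    const int band = vp9_coef_bands[c];  /* 4x4; _8x8/_16x16 for larger transforms */
    const vp9_prob *probs = cm->fc.coef_probs[block_type][band][pt];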
-
-/* Inside dimension is 3-valued measure of nearby complexity, that is,
-   the extent to which nearby coefficients are nonzero.  For the first
-   coefficient (DC, unless block type is 0), we look at the (already encoded)
-   blocks above and to the left of the current block.  The context index is
-   then the number (0,1,or 2) of these blocks having nonzero coefficients.
-   After decoding a coefficient, the measure is roughly the size of the
-   most recently decoded coefficient (0 for 0, 1 for 1, 2 for >1).
-   Note that the intuitive meaning of this measure changes as coefficients
-   are decoded, e.g., prior to the first token, a zero means that my neighbors
-   are empty while, after the first token, because of the use of end-of-block,
-   a zero means we just decoded a zero and hence guarantees that a non-zero
-   coefficient will appear later in this block.  However, this shift
-   in meaning is perfectly OK because our context depends also on the
-   coefficient band (and since zigzag positions 0, 1, and 2 are in
-   distinct bands). */
-
-/*# define DC_TOKEN_CONTEXTS        3*/ /* 00, 0!0, !0!0 */
-#define PREV_COEF_CONTEXTS       4
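In code terms, the measure described above starts as a neighbour count and is then refreshed from vp9_prev_token_class after each token; note the tables reserve a fourth slot (PREV_COEF_CONTEXTS is 4) beyond the three values sketched here:

    /* sketch: context for the next coefficient token */
    static int next_context(int first, int left_nz, int above_nz, int last_token) {
      if (first)
        return (left_nz != 0) + (above_nz != 0);  /* 0, 1 or 2 */
      return vp9_prev_token_class[last_token];    /* 0 for 0, 1 for 1, 2 for >1 */
    }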
-
-#define SUBEXP_PARAM                4   /* Subexponential code parameter */
-#define MODULUS_PARAM               13  /* Modulus parameter */
-
-extern DECLARE_ALIGNED(16, const unsigned char, vp9_prev_token_class[MAX_ENTROPY_TOKENS]);
-
-struct VP9Common;
-void vp9_default_coef_probs(struct VP9Common *);
-extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d[16]);
-
-extern DECLARE_ALIGNED(16, const int, vp9_col_scan[16]);
-extern DECLARE_ALIGNED(16, const int, vp9_row_scan[16]);
-
-extern DECLARE_ALIGNED(64, const int, vp9_default_zig_zag1d_8x8[64]);
-void vp9_coef_tree_initialize(void);
-
-extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]);
-void vp9_adapt_coef_probs(struct VP9Common *);
-
-#endif
--- a/vp8/common/entropymode.c
+++ /dev/null
@@ -1,614 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "onyxc_int.h"
-#include "modecont.h"
-#include "vpx_mem/vpx_mem.h"
-
-
-static const unsigned int kf_y_mode_cts[8][VP9_YMODES] = {
-  /* DC V   H  D45 135 117 153 D27 D63 TM i8x8 BPRED */
-  {12,  6,  5,  5,  5,  5,  5,  5,  5,  2, 22, 200},
-  {25, 13, 13,  7,  7,  7,  7,  7,  7,  6, 27, 160},
-  {31, 17, 18,  8,  8,  8,  8,  8,  8,  9, 26, 139},
-  {40, 22, 23,  8,  8,  8,  8,  8,  8, 12, 27, 116},
-  {53, 26, 28,  8,  8,  8,  8,  8,  8, 13, 26,  94},
-  {68, 33, 35,  8,  8,  8,  8,  8,  8, 17, 20,  68},
-  {78, 38, 38,  8,  8,  8,  8,  8,  8, 19, 16,  52},
-  {89, 42, 42,  8,  8,  8,  8,  8,  8, 21, 12,  34},
-};
-
-static const unsigned int y_mode_cts  [VP9_YMODES] = {
-  /* DC V   H  D45 135 117 153 D27 D63 TM i8x8 BPRED */
-  98, 19, 15, 14, 14, 14, 14, 12, 12, 13, 16, 70
-};
-
-static const unsigned int uv_mode_cts [VP9_YMODES] [VP9_UV_MODES] = {
-  /* DC   V   H  D45 135 117 153 D27 D63 TM */
-  { 200, 15, 15, 10, 10, 10, 10, 10, 10,  6}, /* DC */
-  { 130, 75, 10, 10, 10, 10, 10, 10, 10,  6}, /* V */
-  { 130, 10, 75, 10, 10, 10, 10, 10, 10,  6}, /* H */
-  { 130, 15, 10, 75, 10, 10, 10, 10, 10,  6}, /* D45 */
-  { 150, 15, 10, 10, 75, 10, 10, 10, 10,  6}, /* D135 */
-  { 150, 15, 10, 10, 10, 75, 10, 10, 10,  6}, /* D117 */
-  { 150, 15, 10, 10, 10, 10, 75, 10, 10,  6}, /* D153 */
-  { 150, 15, 10, 10, 10, 10, 10, 75, 10,  6}, /* D27 */
-  { 150, 15, 10, 10, 10, 10, 10, 10, 75,  6}, /* D63 */
-  { 160, 30, 30, 10, 10, 10, 10, 10, 10, 16}, /* TM */
-  { 132, 46, 40, 10, 10, 10, 10, 10, 10, 18}, /* i8x8 - never used */
-  { 150, 35, 41, 10, 10, 10, 10, 10, 10, 10}, /* BPRED */
-};
-
-static const unsigned int i8x8_mode_cts  [VP9_I8X8_MODES] = {
-  /* DC V   H D45 135 117 153 D27 D63  TM */
-  73, 49, 61, 30, 30, 30, 30, 30, 30, 13
-};
-
-static const unsigned int kf_uv_mode_cts [VP9_YMODES] [VP9_UV_MODES] = {
-  // DC   V   H  D45 135 117 153 D27 D63 TM
-  { 160, 24, 24, 20, 20, 20, 20, 20, 20,  8}, /* DC */
-  { 102, 64, 30, 20, 20, 20, 20, 20, 20, 10}, /* V */
-  { 102, 30, 64, 20, 20, 20, 20, 20, 20, 10}, /* H */
-  { 102, 33, 20, 64, 20, 20, 20, 20, 20, 14}, /* D45 */
-  { 102, 33, 20, 20, 64, 20, 20, 20, 20, 14}, /* D135 */
-  { 122, 33, 20, 20, 20, 64, 20, 20, 20, 14}, /* D117 */
-  { 102, 33, 20, 20, 20, 20, 64, 20, 20, 14}, /* D153 */
-  { 102, 33, 20, 20, 20, 20, 20, 64, 20, 14}, /* D27 */
-  { 102, 33, 20, 20, 20, 20, 20, 20, 64, 14}, /* D63 */
-  { 132, 36, 30, 20, 20, 20, 20, 20, 20, 18}, /* TM */
-  { 122, 41, 35, 20, 20, 20, 20, 20, 20, 18}, /* i8x8 - never used */
-  { 122, 41, 35, 20, 20, 20, 20, 20, 20, 18}, /* BPRED */
-};
-
-static const unsigned int bmode_cts[VP9_BINTRAMODES] = {
-  /* DC    TM     VE     HE   LD    RD    VR    VL    HD    HU */
-  43891, 17694, 10036, 3920, 3363, 2546, 5119, 3221, 2471, 1723
-};
-
-typedef enum {
-  SUBMVREF_NORMAL,
-  SUBMVREF_LEFT_ZED,
-  SUBMVREF_ABOVE_ZED,
-  SUBMVREF_LEFT_ABOVE_SAME,
-  SUBMVREF_LEFT_ABOVE_ZED
-} submvref_t;
-
-int vp9_mv_cont(const int_mv *l, const int_mv *a) {
-  int lez = (l->as_int == 0);
-  int aez = (a->as_int == 0);
-  int lea = (l->as_int == a->as_int);
-
-  if (lea && lez)
-    return SUBMVREF_LEFT_ABOVE_ZED;
-
-  if (lea)
-    return SUBMVREF_LEFT_ABOVE_SAME;
-
-  if (aez)
-    return SUBMVREF_ABOVE_ZED;
-
-  if (lez)
-    return SUBMVREF_LEFT_ZED;
-
-  return SUBMVREF_NORMAL;
-}
-
-const vp9_prob vp9_sub_mv_ref_prob [VP9_SUBMVREFS - 1] = { 180, 162, 25};
-
-const vp9_prob vp9_sub_mv_ref_prob2 [SUBMVREF_COUNT][VP9_SUBMVREFS - 1] = {
-  { 147, 136, 18 },
-  { 106, 145, 1  },
-  { 179, 121, 1  },
-  { 223, 1, 34 },
-  { 208, 1, 1  }
-};
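A usage sketch tying the table to vp9_mv_cont() above: the returned context indexes the probability rows directly.

    const vp9_prob *p = vp9_sub_mv_ref_prob2[vp9_mv_cont(&left_mv, &above_mv)];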
-
-vp9_mbsplit vp9_mbsplits [VP9_NUMMBSPLITS] = {
-  {
-    0,  0,  0,  0,
-    0,  0,  0,  0,
-    1,  1,  1,  1,
-    1,  1,  1,  1,
-  }, {
-    0,  0,  1,  1,
-    0,  0,  1,  1,
-    0,  0,  1,  1,
-    0,  0,  1,  1,
-  }, {
-    0,  0,  1,  1,
-    0,  0,  1,  1,
-    2,  2,  3,  3,
-    2,  2,  3,  3,
-  }, {
-    0,  1,  2,  3,
-    4,  5,  6,  7,
-    8,  9,  10, 11,
-    12, 13, 14, 15,
-  },
-};
-
-const int vp9_mbsplit_count [VP9_NUMMBSPLITS] = { 2, 2, 4, 16};
-
-const vp9_prob vp9_mbsplit_probs [VP9_NUMMBSPLITS - 1] = { 110, 111, 150};
-
-/* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */
-
-const vp9_tree_index vp9_bmode_tree[VP9_BINTRAMODES * 2 - 2] = /* INTRAMODECONTEXTNODE value */
-{
-  -B_DC_PRED, 2,                             /* 0 = DC_NODE */
-  -B_TM_PRED, 4,                            /* 1 = TM_NODE */
-  -B_VE_PRED, 6,                           /* 2 = VE_NODE */
-  8, 12,                                  /* 3 = COM_NODE */
-  -B_HE_PRED, 10,                        /* 4 = HE_NODE */
-  -B_RD_PRED, -B_VR_PRED,               /* 5 = RD_NODE */
-  -B_LD_PRED, 14,                        /* 6 = LD_NODE */
-  -B_VL_PRED, 16,                      /* 7 = VL_NODE */
-  -B_HD_PRED, -B_HU_PRED             /* 8 = HD_NODE */
-};
-
-/* Again, these trees use the same probability indices as their
-   explicitly-programmed predecessors. */
-const vp9_tree_index vp9_ymode_tree[VP9_YMODES * 2 - 2] = {
-  2, 14,
-  -DC_PRED, 4,
-  6, 8,
-  -D45_PRED, -D135_PRED,
-  10, 12,
-  -D117_PRED, -D153_PRED,
-  -D27_PRED, -D63_PRED,
-  16, 18,
-  -V_PRED, -H_PRED,
-  -TM_PRED, 20,
-  -B_PRED, -I8X8_PRED
-};
-
-const vp9_tree_index vp9_kf_ymode_tree[VP9_YMODES * 2 - 2] = {
-  2, 14,
-  -DC_PRED, 4,
-  6, 8,
-  -D45_PRED, -D135_PRED,
-  10, 12,
-  -D117_PRED, -D153_PRED,
-  -D27_PRED, -D63_PRED,
-  16, 18,
-  -V_PRED, -H_PRED,
-  -TM_PRED, 20,
-  -B_PRED, -I8X8_PRED
-};
-
-const vp9_tree_index vp9_i8x8_mode_tree[VP9_I8X8_MODES * 2 - 2] = {
-  2, 14,
-  -DC_PRED, 4,
-  6, 8,
-  -D45_PRED, -D135_PRED,
-  10, 12,
-  -D117_PRED, -D153_PRED,
-  -D27_PRED, -D63_PRED,
-  -V_PRED, 16,
-  -H_PRED, -TM_PRED
-};
-
-const vp9_tree_index vp9_uv_mode_tree[VP9_UV_MODES * 2 - 2] = {
-  2, 14,
-  -DC_PRED, 4,
-  6, 8,
-  -D45_PRED, -D135_PRED,
-  10, 12,
-  -D117_PRED, -D153_PRED,
-  -D27_PRED, -D63_PRED,
-  -V_PRED, 16,
-  -H_PRED, -TM_PRED
-};
-
-const vp9_tree_index vp9_mbsplit_tree[6] = {
-  -PARTITIONING_4X4,   2,
-  -PARTITIONING_8X8,   4,
-  -PARTITIONING_16X8, -PARTITIONING_8X16,
-};
-
-const vp9_tree_index vp9_mv_ref_tree[8] = {
-  -ZEROMV, 2,
-  -NEARESTMV, 4,
-  -NEARMV, 6,
-  -NEWMV, -SPLITMV
-};
-
-#if CONFIG_SUPERBLOCKS
-const vp9_tree_index vp9_sb_mv_ref_tree[6] = {
-  -ZEROMV, 2,
-  -NEARESTMV, 4,
-  -NEARMV, -NEWMV
-};
-#endif
-
-const vp9_tree_index vp9_sub_mv_ref_tree[6] = {
-  -LEFT4X4, 2,
-  -ABOVE4X4, 4,
-  -ZERO4X4, -NEW4X4
-};
-
-struct vp9_token_struct vp9_bmode_encodings   [VP9_BINTRAMODES];
-struct vp9_token_struct vp9_ymode_encodings   [VP9_YMODES];
-#if CONFIG_SUPERBLOCKS
-struct vp9_token_struct vp9_sb_kf_ymode_encodings [VP9_I32X32_MODES];
-#endif
-struct vp9_token_struct vp9_kf_ymode_encodings [VP9_YMODES];
-struct vp9_token_struct vp9_uv_mode_encodings  [VP9_UV_MODES];
-struct vp9_token_struct vp9_i8x8_mode_encodings  [VP9_I8X8_MODES];
-struct vp9_token_struct vp9_mbsplit_encodings [VP9_NUMMBSPLITS];
-
-struct vp9_token_struct vp9_mv_ref_encoding_array    [VP9_MVREFS];
-#if CONFIG_SUPERBLOCKS
-struct vp9_token_struct vp9_sb_mv_ref_encoding_array  [VP9_MVREFS];
-#endif
-struct vp9_token_struct vp9_sub_mv_ref_encoding_array [VP9_SUBMVREFS];
-
-void vp9_init_mbmode_probs(VP9_COMMON *x) {
-  unsigned int bct [VP9_YMODES] [2];      /* num Ymodes > num UV modes */
-
-  vp9_tree_probs_from_distribution(VP9_YMODES, vp9_ymode_encodings,
-                                   vp9_ymode_tree, x->fc.ymode_prob,
-                                   bct, y_mode_cts, 256, 1);
-  {
-    int i;
-    for (i = 0; i < 8; i++) {
-      vp9_tree_probs_from_distribution(VP9_YMODES, vp9_kf_ymode_encodings,
-                                       vp9_kf_ymode_tree, x->kf_ymode_prob[i],
-                                       bct, kf_y_mode_cts[i], 256, 1);
-#if CONFIG_SUPERBLOCKS
-      vp9_tree_probs_from_distribution(VP9_I32X32_MODES,
-                                       vp9_sb_kf_ymode_encodings,
-                                       vp9_sb_ymode_tree,
-                                       x->sb_kf_ymode_prob[i], bct,
-                                       kf_y_mode_cts[i], 256, 1);
-#endif
-    }
-  }
-  {
-    int i;
-    for (i = 0; i < VP9_YMODES; i++) {
-      vp9_tree_probs_from_distribution(VP9_UV_MODES, vp9_uv_mode_encodings,
-                                       vp9_uv_mode_tree, x->kf_uv_mode_prob[i],
-                                       bct, kf_uv_mode_cts[i], 256, 1);
-      vp9_tree_probs_from_distribution(VP9_UV_MODES, vp9_uv_mode_encodings,
-                                       vp9_uv_mode_tree, x->fc.uv_mode_prob[i],
-                                       bct, uv_mode_cts[i], 256, 1);
-    }
-  }
-
-  vp9_tree_probs_from_distribution(VP9_I8X8_MODES, vp9_i8x8_mode_encodings,
-                                   vp9_i8x8_mode_tree, x->fc.i8x8_mode_prob,
-                                   bct, i8x8_mode_cts, 256, 1);
-
-  vpx_memcpy(x->fc.sub_mv_ref_prob, vp9_sub_mv_ref_prob2,
-             sizeof(vp9_sub_mv_ref_prob2));
-  vpx_memcpy(x->fc.mbsplit_prob, vp9_mbsplit_probs, sizeof(vp9_mbsplit_probs));
-  vpx_memcpy(x->fc.switchable_interp_prob, vp9_switchable_interp_prob,
-             sizeof(vp9_switchable_interp_prob));
-}
-
-
-static void intra_bmode_probs_from_distribution(
-  vp9_prob p [VP9_BINTRAMODES - 1],
-  unsigned int branch_ct [VP9_BINTRAMODES - 1] [2],
-  const unsigned int events [VP9_BINTRAMODES]) {
-  vp9_tree_probs_from_distribution(VP9_BINTRAMODES, vp9_bmode_encodings,
-                                   vp9_bmode_tree, p, branch_ct,
-                                   events, 256, 1);
-}
-
-void vp9_default_bmode_probs(vp9_prob p [VP9_BINTRAMODES - 1]) {
-  unsigned int branch_ct [VP9_BINTRAMODES - 1] [2];
-  intra_bmode_probs_from_distribution(p, branch_ct, bmode_cts);
-}
-
-void vp9_kf_default_bmode_probs(vp9_prob p[VP9_BINTRAMODES][VP9_BINTRAMODES]
-                                          [VP9_BINTRAMODES - 1]) {
-  unsigned int branch_ct[VP9_BINTRAMODES - 1][2];
-  int i, j;
-
-  for (i = 0; i < VP9_BINTRAMODES; i++) {
-    for (j = 0; j < VP9_BINTRAMODES; j++) {
-      intra_bmode_probs_from_distribution(
-        p[i][j], branch_ct, vp9_kf_default_bmode_counts[i][j]);
-    }
-  }
-}
-
-#if VP9_SWITCHABLE_FILTERS == 3
-const vp9_tree_index vp9_switchable_interp_tree[VP9_SWITCHABLE_FILTERS*2-2] = {
-  -0, 2,
-  -1, -2
-};
-struct vp9_token_struct vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS];
-const INTERPOLATIONFILTERTYPE vp9_switchable_interp[VP9_SWITCHABLE_FILTERS] = {
-  EIGHTTAP, SIXTAP, EIGHTTAP_SHARP};
-const int vp9_switchable_interp_map[SWITCHABLE+1] = {1, -1, 0, 2, -1};
-const vp9_prob vp9_switchable_interp_prob [VP9_SWITCHABLE_FILTERS+1]
-                                          [VP9_SWITCHABLE_FILTERS-1] = {
-  {248, 192}, { 32, 248}, { 32,  32}, {192, 160}
-};
-#elif VP9_SWITCHABLE_FILTERS == 2
-const vp9_tree_index vp9_switchable_interp_tree[VP9_SWITCHABLE_FILTERS*2-2] = {
-  -0, -1,
-};
-struct vp9_token_struct vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS];
-const vp9_prob vp9_switchable_interp_prob [VP9_SWITCHABLE_FILTERS+1]
-                                          [VP9_SWITCHABLE_FILTERS-1] = {
-  {248},
-  { 64},
-  {192},
-};
-const INTERPOLATIONFILTERTYPE vp9_switchable_interp[VP9_SWITCHABLE_FILTERS] = {
-  EIGHTTAP, EIGHTTAP_SHARP};
-const int vp9_switchable_interp_map[SWITCHABLE+1] = {-1, -1, 0, 1, -1}; // 8-tap, 8-tap sharp
-#endif
-
-void vp9_entropy_mode_init() {
-  vp9_tokens_from_tree(vp9_bmode_encodings,   vp9_bmode_tree);
-  vp9_tokens_from_tree(vp9_ymode_encodings,   vp9_ymode_tree);
-  vp9_tokens_from_tree(vp9_kf_ymode_encodings, vp9_kf_ymode_tree);
-#if CONFIG_SUPERBLOCKS
-  vp9_tokens_from_tree(vp9_sb_kf_ymode_encodings, vp9_sb_ymode_tree);
-#endif
-  vp9_tokens_from_tree(vp9_uv_mode_encodings,  vp9_uv_mode_tree);
-  vp9_tokens_from_tree(vp9_i8x8_mode_encodings,  vp9_i8x8_mode_tree);
-  vp9_tokens_from_tree(vp9_mbsplit_encodings, vp9_mbsplit_tree);
-  vp9_tokens_from_tree(vp9_switchable_interp_encodings,
-                       vp9_switchable_interp_tree);
-
-  vp9_tokens_from_tree_offset(vp9_mv_ref_encoding_array,
-                              vp9_mv_ref_tree, NEARESTMV);
-#if CONFIG_SUPERBLOCKS
-  vp9_tokens_from_tree_offset(vp9_sb_mv_ref_encoding_array,
-                              vp9_sb_mv_ref_tree, NEARESTMV);
-#endif
-  vp9_tokens_from_tree_offset(vp9_sub_mv_ref_encoding_array,
-                              vp9_sub_mv_ref_tree, LEFT4X4);
-}
-
-void vp9_init_mode_contexts(VP9_COMMON *pc) {
-  vpx_memset(pc->fc.mv_ref_ct, 0, sizeof(pc->fc.mv_ref_ct));
-  vpx_memset(pc->fc.mv_ref_ct_a, 0, sizeof(pc->fc.mv_ref_ct_a));
-
-  vpx_memcpy(pc->fc.mode_context,
-             vp9_default_mode_contexts,
-             sizeof(pc->fc.mode_context));
-  vpx_memcpy(pc->fc.mode_context_a,
-             vp9_default_mode_contexts_a,
-             sizeof(pc->fc.mode_context_a));
-
-}
-
-void vp9_accum_mv_refs(VP9_COMMON *pc,
-                       MB_PREDICTION_MODE m,
-                       const int ct[4]) {
-  int (*mv_ref_ct)[4][2];
-
-  if (pc->refresh_alt_ref_frame)
-    mv_ref_ct = pc->fc.mv_ref_ct_a;
-  else
-    mv_ref_ct = pc->fc.mv_ref_ct;
-
-  if (m == ZEROMV) {
-    ++mv_ref_ct [ct[0]] [0] [0];
-  } else {
-    ++mv_ref_ct [ct[0]] [0] [1];
-    if (m == NEARESTMV) {
-      ++mv_ref_ct [ct[1]] [1] [0];
-    } else {
-      ++mv_ref_ct [ct[1]] [1] [1];
-      if (m == NEARMV) {
-        ++mv_ref_ct [ct[2]] [2] [0];
-      } else {
-        ++mv_ref_ct [ct[2]] [2] [1];
-        if (m == NEWMV) {
-          ++mv_ref_ct [ct[3]] [3] [0];
-        } else {
-          ++mv_ref_ct [ct[3]] [3] [1];
-        }
-      }
-    }
-  }
-}
-
-#define MVREF_COUNT_SAT 20
-#define MVREF_MAX_UPDATE_FACTOR 144
-void vp9_update_mode_context(VP9_COMMON *pc) {
-  int i, j;
-  int (*mv_ref_ct)[4][2];
-  int (*mode_context)[4];
-
-  if (pc->refresh_alt_ref_frame) {
-    mv_ref_ct = pc->fc.mv_ref_ct_a;
-    mode_context = pc->fc.mode_context_a;
-  } else {
-    mv_ref_ct = pc->fc.mv_ref_ct;
-    mode_context = pc->fc.mode_context;
-  }
-
-  for (j = 0; j < 6; j++) {
-    for (i = 0; i < 4; i++) {
-      int this_prob;
-      int count = mv_ref_ct[j][i][0] + mv_ref_ct[j][i][1];
-      int factor;
-      {
-        this_prob = count > 0 ? 256 * mv_ref_ct[j][i][0] / count : 128;
-        count = count > MVREF_COUNT_SAT ? MVREF_COUNT_SAT : count;
-        factor = (MVREF_MAX_UPDATE_FACTOR * count / MVREF_COUNT_SAT);
-        this_prob = (pc->fc.vp8_mode_contexts[j][i] * (256 - factor) +
-                     this_prob * factor + 128) >> 8;
-        this_prob = this_prob ? (this_prob < 255 ? this_prob : 255) : 1;
-        mode_context[j][i] = this_prob;
-      }
-    }
-  }
-}
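For clarity, the saturating blend performed in the loop above, pulled out as a standalone sketch (helper name hypothetical; constants as #defined above):

static int blend_prob(int pre_prob, int ct0, int ct1) {
  int count = ct0 + ct1;
  /* empirical probability of the 0-branch; neutral 128 if unseen */
  int new_prob = count > 0 ? 256 * ct0 / count : 128;
  int factor;
  count = count > MVREF_COUNT_SAT ? MVREF_COUNT_SAT : count;
  factor = MVREF_MAX_UPDATE_FACTOR * count / MVREF_COUNT_SAT;
  /* rounded weighted average of prior and empirical probability */
  new_prob = (pre_prob * (256 - factor) + new_prob * factor + 128) >> 8;
  /* clamp into the legal vp9_prob range [1, 255] */
  return new_prob < 1 ? 1 : (new_prob > 255 ? 255 : new_prob);
}

For example, blend_prob(128, 15, 3) == 171: eighteen observations give factor 129/256, pulling the prior 128 roughly halfway toward the empirical 213.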
-
-#ifdef MODE_STATS
-#include "vp8/common/modecont.h"
-void print_mode_contexts(VP9_COMMON *pc) {
-  int j, i;
-  printf("\n====================\n");
-  for (j = 0; j < 6; j++) {
-    for (i = 0; i < 4; i++) {
-      printf("%4d ", pc->fc.mode_context[j][i]);
-    }
-    printf("\n");
-  }
-  printf("====================\n");
-  for (j = 0; j < 6; j++) {
-    for (i = 0; i < 4; i++) {
-      printf("%4d ", pc->fc.mode_context_a[j][i]);
-    }
-    printf("\n");
-  }
-}
-#endif
-
-// #define MODE_COUNT_TESTING
-#define MODE_COUNT_SAT 20
-#define MODE_MAX_UPDATE_FACTOR 144
-void vp9_adapt_mode_probs(VP9_COMMON *cm) {
-  int i, t, count, factor;
-  unsigned int branch_ct[32][2];
-  vp9_prob ymode_probs[VP9_YMODES - 1];
-  vp9_prob uvmode_probs[VP9_UV_MODES - 1];
-  vp9_prob bmode_probs[VP9_BINTRAMODES - 1];
-  vp9_prob i8x8_mode_probs[VP9_I8X8_MODES - 1];
-  vp9_prob sub_mv_ref_probs[VP9_SUBMVREFS - 1];
-  vp9_prob mbsplit_probs[VP9_NUMMBSPLITS - 1];
-#ifdef MODE_COUNT_TESTING
-  printf("static const unsigned int\nymode_counts"
-         "[VP9_YMODES] = {\n");
-  for (t = 0; t < VP9_YMODES; ++t) printf("%d, ", cm->fc.ymode_counts[t]);
-  printf("};\n");
-  printf("static const unsigned int\nuv_mode_counts"
-         "[VP9_YMODES] [VP9_UV_MODES] = {\n");
-  for (i = 0; i < VP9_YMODES; ++i) {
-    printf("  {");
-    for (t = 0; t < VP9_UV_MODES; ++t) printf("%d, ", cm->fc.uv_mode_counts[i][t]);
-    printf("},\n");
-  }
-  printf("};\n");
-  printf("static const unsigned int\nbmode_counts"
-         "[VP9_BINTRAMODES] = {\n");
-  for (t = 0; t < VP9_BINTRAMODES; ++t) printf("%d, ", cm->fc.bmode_counts[t]);
-  printf("};\n");
-  printf("static const unsigned int\ni8x8_mode_counts"
-         "[VP9_I8X8_MODES] = {\n");
-  for (t = 0; t < VP9_I8X8_MODES; ++t) printf("%d, ", cm->fc.i8x8_mode_counts[t]);
-  printf("};\n");
-  printf("static const unsigned int\nsub_mv_ref_counts"
-         "[SUBMVREF_COUNT] [VP9_SUBMVREFS] = {\n");
-  for (i = 0; i < SUBMVREF_COUNT; ++i) {
-    printf("  {");
-    for (t = 0; t < VP9_SUBMVREFS; ++t) printf("%d, ", cm->fc.sub_mv_ref_counts[i][t]);
-    printf("},\n");
-  }
-  printf("};\n");
-  printf("static const unsigned int\nmbsplit_counts"
-         "[VP9_NUMMBSPLITS] = {\n");
-  for (t = 0; t < VP9_NUMMBSPLITS; ++t) printf("%d, ", cm->fc.mbsplit_counts[t]);
-  printf("};\n");
-#endif
-  vp9_tree_probs_from_distribution(
-    VP9_YMODES, vp9_ymode_encodings, vp9_ymode_tree,
-    ymode_probs, branch_ct, cm->fc.ymode_counts,
-    256, 1);
-  for (t = 0; t < VP9_YMODES - 1; ++t) {
-    int prob;
-    count = branch_ct[t][0] + branch_ct[t][1];
-    count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
-    factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
-    prob = ((int)cm->fc.pre_ymode_prob[t] * (256 - factor) +
-            (int)ymode_probs[t] * factor + 128) >> 8;
-    if (prob <= 0) cm->fc.ymode_prob[t] = 1;
-    else if (prob > 255) cm->fc.ymode_prob[t] = 255;
-    else cm->fc.ymode_prob[t] = prob;
-  }
-  for (i = 0; i < VP9_YMODES; ++i) {
-    vp9_tree_probs_from_distribution(VP9_UV_MODES, vp9_uv_mode_encodings,
-                                     vp9_uv_mode_tree, uvmode_probs, branch_ct,
-                                     cm->fc.uv_mode_counts[i], 256, 1);
-    for (t = 0; t < VP9_UV_MODES - 1; ++t) {
-      int prob;
-      count = branch_ct[t][0] + branch_ct[t][1];
-      count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
-      factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
-      prob = ((int)cm->fc.pre_uv_mode_prob[i][t] * (256 - factor) +
-              (int)uvmode_probs[t] * factor + 128) >> 8;
-      if (prob <= 0) cm->fc.uv_mode_prob[i][t] = 1;
-      else if (prob > 255) cm->fc.uv_mode_prob[i][t] = 255;
-      else cm->fc.uv_mode_prob[i][t] = prob;
-    }
-  }
-  vp9_tree_probs_from_distribution(VP9_BINTRAMODES, vp9_bmode_encodings,
-                                   vp9_bmode_tree, bmode_probs, branch_ct,
-                                   cm->fc.bmode_counts, 256, 1);
-  for (t = 0; t < VP9_BINTRAMODES - 1; ++t) {
-    int prob;
-    count = branch_ct[t][0] + branch_ct[t][1];
-    count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
-    factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
-    prob = ((int)cm->fc.pre_bmode_prob[t] * (256 - factor) +
-            (int)bmode_probs[t] * factor + 128) >> 8;
-    if (prob <= 0) cm->fc.bmode_prob[t] = 1;
-    else if (prob > 255) cm->fc.bmode_prob[t] = 255;
-    else cm->fc.bmode_prob[t] = prob;
-  }
-  vp9_tree_probs_from_distribution(VP9_I8X8_MODES, vp9_i8x8_mode_encodings,
-                                   vp9_i8x8_mode_tree, i8x8_mode_probs,
-                                   branch_ct, cm->fc.i8x8_mode_counts, 256, 1);
-  for (t = 0; t < VP9_I8X8_MODES - 1; ++t) {
-    int prob;
-    count = branch_ct[t][0] + branch_ct[t][1];
-    count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
-    factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
-    prob = ((int)cm->fc.pre_i8x8_mode_prob[t] * (256 - factor) +
-            (int)i8x8_mode_probs[t] * factor + 128) >> 8;
-    if (prob <= 0) cm->fc.i8x8_mode_prob[t] = 1;
-    else if (prob > 255) cm->fc.i8x8_mode_prob[t] = 255;
-    else cm->fc.i8x8_mode_prob[t] = prob;
-  }
-  for (i = 0; i < SUBMVREF_COUNT; ++i) {
-    vp9_tree_probs_from_distribution(VP9_SUBMVREFS,
-                                     vp9_sub_mv_ref_encoding_array,
-                                     vp9_sub_mv_ref_tree, sub_mv_ref_probs,
-                                     branch_ct, cm->fc.sub_mv_ref_counts[i],
-                                     256, 1);
-    for (t = 0; t < VP9_SUBMVREFS - 1; ++t) {
-      int prob;
-      count = branch_ct[t][0] + branch_ct[t][1];
-      count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
-      factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
-      prob = ((int)cm->fc.pre_sub_mv_ref_prob[i][t] * (256 - factor) +
-              (int)sub_mv_ref_probs[t] * factor + 128) >> 8;
-      if (prob <= 0) cm->fc.sub_mv_ref_prob[i][t] = 1;
-      else if (prob > 255) cm->fc.sub_mv_ref_prob[i][t] = 255;
-      else cm->fc.sub_mv_ref_prob[i][t] = prob;
-    }
-  }
-  vp9_tree_probs_from_distribution(VP9_NUMMBSPLITS, vp9_mbsplit_encodings,
-                                   vp9_mbsplit_tree, mbsplit_probs, branch_ct,
-                                   cm->fc.mbsplit_counts, 256, 1);
-  for (t = 0; t < VP9_NUMMBSPLITS - 1; ++t) {
-    int prob;
-    count = branch_ct[t][0] + branch_ct[t][1];
-    count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
-    factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
-    prob = ((int)cm->fc.pre_mbsplit_prob[t] * (256 - factor) +
-            (int)mbsplit_probs[t] * factor + 128) >> 8;
-    if (prob <= 0) cm->fc.mbsplit_prob[t] = 1;
-    else if (prob > 255) cm->fc.mbsplit_prob[t] = 255;
-    else cm->fc.mbsplit_prob[t] = prob;
-  }
-}
--- a/vp8/common/entropymode.h
+++ /dev/null
@@ -1,102 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_ENTROPYMODE_H
-#define __INC_ENTROPYMODE_H
-
-#include "blockd.h"
-#include "treecoder.h"
-
-#define SUBMVREF_COUNT 5
-#define VP9_NUMMBSPLITS 4
-
-typedef const int vp9_mbsplit[16];
-
-extern vp9_mbsplit vp9_mbsplits[VP9_NUMMBSPLITS];
-
-extern const int vp9_mbsplit_count[VP9_NUMMBSPLITS];    /* # of subsets */
-
-extern const vp9_prob vp9_mbsplit_probs[VP9_NUMMBSPLITS - 1];
-
-extern int vp9_mv_cont(const int_mv *l, const int_mv *a);
-
-extern const vp9_prob vp9_sub_mv_ref_prob[VP9_SUBMVREFS - 1];
-
-extern const vp9_prob vp9_sub_mv_ref_prob2[SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
-
-extern const unsigned int vp9_kf_default_bmode_counts[VP9_BINTRAMODES]
-                                                     [VP9_BINTRAMODES]
-                                                     [VP9_BINTRAMODES];
-
-extern const vp9_tree_index vp9_bmode_tree[];
-
-extern const vp9_tree_index  vp9_ymode_tree[];
-extern const vp9_tree_index  vp9_kf_ymode_tree[];
-extern const vp9_tree_index  vp9_uv_mode_tree[];
-#define vp9_sb_ymode_tree vp9_uv_mode_tree
-extern const vp9_tree_index  vp9_i8x8_mode_tree[];
-extern const vp9_tree_index  vp9_mbsplit_tree[];
-extern const vp9_tree_index  vp9_mv_ref_tree[];
-extern const vp9_tree_index  vp9_sb_mv_ref_tree[];
-extern const vp9_tree_index  vp9_sub_mv_ref_tree[];
-
-extern struct vp9_token_struct vp9_bmode_encodings[VP9_BINTRAMODES];
-extern struct vp9_token_struct vp9_ymode_encodings[VP9_YMODES];
-extern struct vp9_token_struct vp9_sb_kf_ymode_encodings[VP9_I32X32_MODES];
-extern struct vp9_token_struct vp9_kf_ymode_encodings[VP9_YMODES];
-extern struct vp9_token_struct vp9_i8x8_mode_encodings[VP9_I8X8_MODES];
-extern struct vp9_token_struct vp9_uv_mode_encodings[VP9_UV_MODES];
-extern struct vp9_token_struct vp9_mbsplit_encodings[VP9_NUMMBSPLITS];
-
-/* Inter mode values do not start at zero */
-
-extern struct vp9_token_struct vp9_mv_ref_encoding_array[VP9_MVREFS];
-extern struct vp9_token_struct vp9_sb_mv_ref_encoding_array[VP9_MVREFS];
-extern struct vp9_token_struct vp9_sub_mv_ref_encoding_array[VP9_SUBMVREFS];
-
-void vp9_entropy_mode_init(void);
-
-struct VP9Common;
-
-void vp9_init_mbmode_probs(struct VP9Common *x);
-
-extern void vp9_init_mode_contexts(struct VP9Common *pc);
-
-extern void vp9_update_mode_context(struct VP9Common *pc);
-
-extern void vp9_accum_mv_refs(struct VP9Common *pc,
-                              MB_PREDICTION_MODE m,
-                              const int ct[4]);
-
-void vp9_default_bmode_probs(vp9_prob dest[VP9_BINTRAMODES - 1]);
-
-void vp9_kf_default_bmode_probs(vp9_prob dest[VP9_BINTRAMODES][VP9_BINTRAMODES]
-                                             [VP9_BINTRAMODES - 1]);
-
-void vp9_adapt_mode_probs(struct VP9Common *);
-
-#define VP9_SWITCHABLE_FILTERS 2 /* number of switchable filters */
-
-extern const  INTERPOLATIONFILTERTYPE vp9_switchable_interp
-                  [VP9_SWITCHABLE_FILTERS];
-
-extern const  int vp9_switchable_interp_map[SWITCHABLE + 1];
-
-extern const  vp9_tree_index vp9_switchable_interp_tree
-                  [2 * (VP9_SWITCHABLE_FILTERS - 1)];
-
-extern struct vp9_token_struct vp9_switchable_interp_encodings
-                  [VP9_SWITCHABLE_FILTERS];
-
-extern const  vp9_prob vp9_switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
-                                                 [VP9_SWITCHABLE_FILTERS - 1];
-
-#endif
--- a/vp8/common/entropymv.c
+++ /dev/null
@@ -1,465 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "onyxc_int.h"
-#include "entropymv.h"
-
-//#define MV_COUNT_TESTING
-
-#define MV_COUNT_SAT 16
-#define MV_MAX_UPDATE_FACTOR 160
-
-#if CONFIG_NEW_MVREF
-/* Integer pel reference mv threshold for use of high-precision 1/8 mv */
-#define COMPANDED_MVREF_THRESH    1000000
-#else
-/* Integer pel reference mv threshold for use of high-precision 1/8 mv */
-#define COMPANDED_MVREF_THRESH    8
-#endif
-
-/* Smooth or bias the mv-counts before prob computation */
-/* #define SMOOTH_MV_COUNTS */
-
-const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2] = {
-  -MV_JOINT_ZERO, 2,
-  -MV_JOINT_HNZVZ, 4,
-  -MV_JOINT_HZVNZ, -MV_JOINT_HNZVNZ
-};
-struct vp9_token_struct vp9_mv_joint_encodings[MV_JOINTS];
-
-const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2] = {
-  -MV_CLASS_0, 2,
-  -MV_CLASS_1, 4,
-  6, 8,
-  -MV_CLASS_2, -MV_CLASS_3,
-  10, 12,
-  -MV_CLASS_4, -MV_CLASS_5,
-  -MV_CLASS_6, -MV_CLASS_7,
-};
-struct vp9_token_struct vp9_mv_class_encodings[MV_CLASSES];
-
-const vp9_tree_index vp9_mv_class0_tree [2 * CLASS0_SIZE - 2] = {
-  -0, -1,
-};
-struct vp9_token_struct vp9_mv_class0_encodings[CLASS0_SIZE];
-
-const vp9_tree_index vp9_mv_fp_tree [2 * 4 - 2] = {
-  -0, 2,
-  -1, 4,
-  -2, -3
-};
-struct vp9_token_struct vp9_mv_fp_encodings[4];
-
-const nmv_context vp9_default_nmv_context = {
-  {32, 64, 96},
-  {
-    { /* vert component */
-      128,                                             /* sign */
-      {224, 144, 192, 168, 192, 176, 192},             /* class */
-      {216},                                           /* class0 */
-      {136, 140, 148, 160, 176, 192, 224},             /* bits */
-      {{128, 128, 64}, {96, 112, 64}},                 /* class0_fp */
-      {64, 96, 64},                                    /* fp */
-      160,                                             /* class0_hp bit */
-      128,                                             /* hp */
-    },
-    { /* hor component */
-      128,                                             /* sign */
-      {216, 128, 176, 160, 176, 176, 192},             /* class */
-      {208},                                           /* class0 */
-      {136, 140, 148, 160, 176, 192, 224},             /* bits */
-      {{128, 128, 64}, {96, 112, 64}},                 /* class0_fp */
-      {64, 96, 64},                                    /* fp */
-      160,                                             /* class0_hp bit */
-      128,                                             /* hp */
-    }
-  },
-};
-
-MV_JOINT_TYPE vp9_get_mv_joint(MV mv) {
-  if (mv.row == 0 && mv.col == 0) return MV_JOINT_ZERO;
-  else if (mv.row == 0 && mv.col != 0) return MV_JOINT_HNZVZ;
-  else if (mv.row != 0 && mv.col == 0) return MV_JOINT_HZVNZ;
-  else return MV_JOINT_HNZVNZ;
-}
-
-#define mv_class_base(c) ((c) ? (CLASS0_SIZE << (c + 2)) : 0)
-
-MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) {
-  MV_CLASS_TYPE c;
-  if      (z < CLASS0_SIZE * 8)    c = MV_CLASS_0;
-  else if (z < CLASS0_SIZE * 16)   c = MV_CLASS_1;
-  else if (z < CLASS0_SIZE * 32)   c = MV_CLASS_2;
-  else if (z < CLASS0_SIZE * 64)   c = MV_CLASS_3;
-  else if (z < CLASS0_SIZE * 128)  c = MV_CLASS_4;
-  else if (z < CLASS0_SIZE * 256)  c = MV_CLASS_5;
-  else if (z < CLASS0_SIZE * 512)  c = MV_CLASS_6;
-  else if (z < CLASS0_SIZE * 1024) c = MV_CLASS_7;
-  else assert(0);
-  if (offset)
-    *offset = z - mv_class_base(c);
-  return c;
-}
-
-int vp9_use_nmv_hp(const MV *ref) {
-  if ((abs(ref->row) >> 3) < COMPANDED_MVREF_THRESH &&
-      (abs(ref->col) >> 3) < COMPANDED_MVREF_THRESH)
-    return 1;
-  else
-    return 0;
-}
-
-int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset) {
-  return mv_class_base(c) + offset;
-}
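An illustrative property of the two helpers above: class and offset form an exact decomposition of the magnitude, so the pair round-trips over the whole legal range (a self-check sketch, not part of the original file):

int z, offset;
MV_CLASS_TYPE c;
for (z = 0; z < CLASS0_SIZE * 1024; ++z) {
  c = vp9_get_mv_class(z, &offset);
  assert(vp9_get_mv_mag(c, offset) == z);  /* mv_class_base(c) + offset */
}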
-
-static void increment_nmv_component_count(int v,
-                                          nmv_component_counts *mvcomp,
-                                          int incr,
-                                          int usehp) {
-  assert (v != 0);            /* should not be zero */
-  mvcomp->mvcount[MV_MAX + v] += incr;
-}
-
-static void increment_nmv_component(int v,
-                                    nmv_component_counts *mvcomp,
-                                    int incr,
-                                    int usehp) {
-  int s, z, c, o, d, e, f;
-  assert (v != 0);            /* should not be zero */
-  s = v < 0;
-  mvcomp->sign[s] += incr;
-  z = (s ? -v : v) - 1;       /* magnitude - 1 */
-
-  c = vp9_get_mv_class(z, &o);
-  mvcomp->classes[c] += incr;
-
-  d = (o >> 3);               /* int mv data */
-  f = (o >> 1) & 3;           /* fractional pel mv data */
-  e = (o & 1);                /* high precision mv data */
-  if (c == MV_CLASS_0) {
-    mvcomp->class0[d] += incr;
-  } else {
-    int i, b;
-    b = c + CLASS0_BITS - 1;  /* number of bits */
-    for (i = 0; i < b; ++i)
-      mvcomp->bits[i][((d >> i) & 1)] += incr;
-  }
-
-  /* Code the fractional pel bits */
-  if (c == MV_CLASS_0) {
-    mvcomp->class0_fp[d][f] += incr;
-  } else {
-    mvcomp->fp[f] += incr;
-  }
-
-  /* Code the high precision bit */
-  if (usehp) {
-    if (c == MV_CLASS_0) {
-      mvcomp->class0_hp[e] += incr;
-    } else {
-      mvcomp->hp[e] += incr;
-    }
-  }
-}
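A worked example of the decomposition above, in eighth-pel units (illustrative):

/* v = 75  ->  s = 0 (positive), z = |v| - 1 = 74
 * 64 <= z < 128    ->  c = MV_CLASS_3, base 64, offset o = 10
 * d = o >> 3       = 1   integer-pel remainder, coded in b = 3 bits
 * f = (o >> 1) & 3 = 1   fractional-pel position
 * e = o & 1        = 0   high-precision (eighth-pel) bit
 */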
-
-#ifdef SMOOTH_MV_COUNTS
-static void smooth_counts(nmv_component_counts *mvcomp) {
-  static const int flen = 3;  // (filter_length + 1) / 2
-  static const int fval[] = {8, 3, 1};
-  static const int fvalbits = 4;
-  int i;
-  unsigned int smvcount[MV_VALS];
-  vpx_memcpy(smvcount, mvcomp->mvcount, sizeof(smvcount));
-  smvcount[MV_MAX] = (smvcount[MV_MAX - 1] + smvcount[MV_MAX + 1]) >> 1;
-  for (i = flen - 1; i <= MV_VALS - flen; ++i) {
-    int j, s = smvcount[i] * fval[0];
-    for (j = 1; j < flen; ++j)
-      s += (smvcount[i - j] + smvcount[i + j]) * fval[j];
-    mvcomp->mvcount[i] = (s + (1 << (fvalbits - 1))) >> fvalbits;
-  }
-}
-#endif
-
-static void counts_to_context(nmv_component_counts *mvcomp, int usehp) {
-  int v;
-  vpx_memset(mvcomp->sign, 0, sizeof(nmv_component_counts) - sizeof(mvcomp->mvcount));
-  for (v = 1; v <= MV_MAX; v++) {
-    increment_nmv_component(-v, mvcomp, mvcomp->mvcount[MV_MAX - v], usehp);
-    increment_nmv_component( v, mvcomp, mvcomp->mvcount[MV_MAX + v], usehp);
-  }
-}
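In effect, counts_to_context() replays every possible magnitude v through increment_nmv_component(), weighted by how often it was observed (mvcount[MV_MAX - v] and mvcount[MV_MAX + v]), converting the raw signed-magnitude histogram into the per-branch sign/class/offset/fp/hp counts consumed by the tree-probability code below.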
-
-void vp9_increment_nmv(const MV *mv, const MV *ref, nmv_context_counts *mvctx,
-                       int usehp) {
-  MV_JOINT_TYPE j = vp9_get_mv_joint(*mv);
-  mvctx->joints[j]++;
-  usehp = usehp && vp9_use_nmv_hp(ref);
-  if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
-    increment_nmv_component_count(mv->row, &mvctx->comps[0], 1, usehp);
-  }
-  if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {
-    increment_nmv_component_count(mv->col, &mvctx->comps[1], 1, usehp);
-  }
-}
-
-static void adapt_prob(vp9_prob *dest, vp9_prob prep, vp9_prob newp,
-                       unsigned int ct[2]) {
-  int factor;
-  int prob;
-  int count = ct[0] + ct[1];
-  if (count) {
-    count = count > MV_COUNT_SAT ? MV_COUNT_SAT : count;
-    factor = (MV_MAX_UPDATE_FACTOR * count / MV_COUNT_SAT);
-    prob = ((int)prep * (256 - factor) + (int)(newp) * factor + 128) >> 8;
-    prob += !prob;
-    prob = (prob > 255 ? 255 : prob);
-    *dest = prob;
-  }
-}
-
-void vp9_counts_to_nmv_context(
-    nmv_context_counts *NMVcount,
-    nmv_context *prob,
-    int usehp,
-    unsigned int (*branch_ct_joint)[2],
-    unsigned int (*branch_ct_sign)[2],
-    unsigned int (*branch_ct_classes)[MV_CLASSES - 1][2],
-    unsigned int (*branch_ct_class0)[CLASS0_SIZE - 1][2],
-    unsigned int (*branch_ct_bits)[MV_OFFSET_BITS][2],
-    unsigned int (*branch_ct_class0_fp)[CLASS0_SIZE][4 - 1][2],
-    unsigned int (*branch_ct_fp)[4 - 1][2],
-    unsigned int (*branch_ct_class0_hp)[2],
-    unsigned int (*branch_ct_hp)[2]) {
-  int i, j, k;
-  counts_to_context(&NMVcount->comps[0], usehp);
-  counts_to_context(&NMVcount->comps[1], usehp);
-  vp9_tree_probs_from_distribution(MV_JOINTS,
-                                   vp9_mv_joint_encodings,
-                                   vp9_mv_joint_tree,
-                                   prob->joints,
-                                   branch_ct_joint,
-                                   NMVcount->joints,
-                                   256, 1);
-  for (i = 0; i < 2; ++i) {
-    prob->comps[i].sign =
-        vp9_bin_prob_from_distribution(NMVcount->comps[i].sign);
-    branch_ct_sign[i][0] = NMVcount->comps[i].sign[0];
-    branch_ct_sign[i][1] = NMVcount->comps[i].sign[1];
-    vp9_tree_probs_from_distribution(MV_CLASSES,
-                                     vp9_mv_class_encodings,
-                                     vp9_mv_class_tree,
-                                     prob->comps[i].classes,
-                                     branch_ct_classes[i],
-                                     NMVcount->comps[i].classes,
-                                     256, 1);
-    vp9_tree_probs_from_distribution(CLASS0_SIZE,
-                                     vp9_mv_class0_encodings,
-                                     vp9_mv_class0_tree,
-                                     prob->comps[i].class0,
-                                     branch_ct_class0[i],
-                                     NMVcount->comps[i].class0,
-                                     256, 1);
-    for (j = 0; j < MV_OFFSET_BITS; ++j) {
-      prob->comps[i].bits[j] = vp9_bin_prob_from_distribution(
-          NMVcount->comps[i].bits[j]);
-      branch_ct_bits[i][j][0] = NMVcount->comps[i].bits[j][0];
-      branch_ct_bits[i][j][1] = NMVcount->comps[i].bits[j][1];
-    }
-  }
-  for (i = 0; i < 2; ++i) {
-    for (k = 0; k < CLASS0_SIZE; ++k) {
-      vp9_tree_probs_from_distribution(4,
-                                       vp9_mv_fp_encodings,
-                                       vp9_mv_fp_tree,
-                                       prob->comps[i].class0_fp[k],
-                                       branch_ct_class0_fp[i][k],
-                                       NMVcount->comps[i].class0_fp[k],
-                                       256, 1);
-    }
-    vp9_tree_probs_from_distribution(4,
-                                     vp9_mv_fp_encodings,
-                                     vp9_mv_fp_tree,
-                                     prob->comps[i].fp,
-                                     branch_ct_fp[i],
-                                     NMVcount->comps[i].fp,
-                                     256, 1);
-  }
-  if (usehp) {
-    for (i = 0; i < 2; ++i) {
-      prob->comps[i].class0_hp = vp9_bin_prob_from_distribution(
-          NMVcount->comps[i].class0_hp);
-      branch_ct_class0_hp[i][0] = NMVcount->comps[i].class0_hp[0];
-      branch_ct_class0_hp[i][1] = NMVcount->comps[i].class0_hp[1];
-
-      prob->comps[i].hp =
-          vp9_bin_prob_from_distribution(NMVcount->comps[i].hp);
-      branch_ct_hp[i][0] = NMVcount->comps[i].hp[0];
-      branch_ct_hp[i][1] = NMVcount->comps[i].hp[1];
-    }
-  }
-}
-
-void vp9_adapt_nmv_probs(VP9_COMMON *cm, int usehp) {
-  int i, j, k;
-  nmv_context prob;
-  unsigned int branch_ct_joint[MV_JOINTS - 1][2];
-  unsigned int branch_ct_sign[2][2];
-  unsigned int branch_ct_classes[2][MV_CLASSES - 1][2];
-  unsigned int branch_ct_class0[2][CLASS0_SIZE - 1][2];
-  unsigned int branch_ct_bits[2][MV_OFFSET_BITS][2];
-  unsigned int branch_ct_class0_fp[2][CLASS0_SIZE][4 - 1][2];
-  unsigned int branch_ct_fp[2][4 - 1][2];
-  unsigned int branch_ct_class0_hp[2][2];
-  unsigned int branch_ct_hp[2][2];
-#ifdef MV_COUNT_TESTING
-  printf("joints count: ");
-  for (j = 0; j < MV_JOINTS; ++j) printf("%d ", cm->fc.NMVcount.joints[j]);
-  printf("\n"); fflush(stdout);
-  printf("signs count:\n");
-  for (i = 0; i < 2; ++i)
-    printf("%d/%d ", cm->fc.NMVcount.comps[i].sign[0], cm->fc.NMVcount.comps[i].sign[1]);
-  printf("\n"); fflush(stdout);
-  printf("classes count:\n");
-  for (i = 0; i < 2; ++i) {
-    for (j = 0; j < MV_CLASSES; ++j)
-      printf("%d ", cm->fc.NMVcount.comps[i].classes[j]);
-    printf("\n"); fflush(stdout);
-  }
-  printf("class0 count:\n");
-  for (i = 0; i < 2; ++i) {
-    for (j = 0; j < CLASS0_SIZE; ++j)
-      printf("%d ", cm->fc.NMVcount.comps[i].class0[j]);
-    printf("\n"); fflush(stdout);
-  }
-  printf("bits count:\n");
-  for (i = 0; i < 2; ++i) {
-    for (j = 0; j < MV_OFFSET_BITS; ++j)
-      printf("%d/%d ", cm->fc.NMVcount.comps[i].bits[j][0],
-                       cm->fc.NMVcount.comps[i].bits[j][1]);
-    printf("\n"); fflush(stdout);
-  }
-  printf("class0_fp count:\n");
-  for (i = 0; i < 2; ++i) {
-    for (j = 0; j < CLASS0_SIZE; ++j) {
-      printf("{");
-      for (k = 0; k < 4; ++k)
-        printf("%d ", cm->fc.NMVcount.comps[i].class0_fp[j][k]);
-      printf("}, ");
-    }
-    printf("\n"); fflush(stdout);
-  }
-  printf("fp count:\n");
-  for (i = 0; i < 2; ++i) {
-    for (j = 0; j < 4; ++j)
-      printf("%d ", cm->fc.NMVcount.comps[i].fp[j]);
-    printf("\n"); fflush(stdout);
-  }
-  if (usehp) {
-    printf("class0_hp count:\n");
-    for (i = 0; i < 2; ++i)
-      printf("%d/%d ", cm->fc.NMVcount.comps[i].class0_hp[0],
-                       cm->fc.NMVcount.comps[i].class0_hp[1]);
-    printf("\n"); fflush(stdout);
-    printf("hp count:\n");
-    for (i = 0; i < 2; ++i)
-      printf("%d/%d ", cm->fc.NMVcount.comps[i].hp[0],
-                       cm->fc.NMVcount.comps[i].hp[1]);
-    printf("\n"); fflush(stdout);
-  }
-#endif
-#ifdef SMOOTH_MV_COUNTS
-  smooth_counts(&cm->fc.NMVcount.comps[0]);
-  smooth_counts(&cm->fc.NMVcount.comps[1]);
-#endif
-  vp9_counts_to_nmv_context(&cm->fc.NMVcount,
-                            &prob,
-                            usehp,
-                            branch_ct_joint,
-                            branch_ct_sign,
-                            branch_ct_classes,
-                            branch_ct_class0,
-                            branch_ct_bits,
-                            branch_ct_class0_fp,
-                            branch_ct_fp,
-                            branch_ct_class0_hp,
-                            branch_ct_hp);
-
-  for (j = 0; j < MV_JOINTS - 1; ++j) {
-    adapt_prob(&cm->fc.nmvc.joints[j],
-               cm->fc.pre_nmvc.joints[j],
-               prob.joints[j],
-               branch_ct_joint[j]);
-  }
-  for (i = 0; i < 2; ++i) {
-    adapt_prob(&cm->fc.nmvc.comps[i].sign,
-               cm->fc.pre_nmvc.comps[i].sign,
-               prob.comps[i].sign,
-               branch_ct_sign[i]);
-    for (j = 0; j < MV_CLASSES - 1; ++j) {
-      adapt_prob(&cm->fc.nmvc.comps[i].classes[j],
-                 cm->fc.pre_nmvc.comps[i].classes[j],
-                 prob.comps[i].classes[j],
-                 branch_ct_classes[i][j]);
-    }
-    for (j = 0; j < CLASS0_SIZE - 1; ++j) {
-      adapt_prob(&cm->fc.nmvc.comps[i].class0[j],
-                 cm->fc.pre_nmvc.comps[i].class0[j],
-                 prob.comps[i].class0[j],
-                 branch_ct_class0[i][j]);
-    }
-    for (j = 0; j < MV_OFFSET_BITS; ++j) {
-      adapt_prob(&cm->fc.nmvc.comps[i].bits[j],
-                 cm->fc.pre_nmvc.comps[i].bits[j],
-                 prob.comps[i].bits[j],
-                 branch_ct_bits[i][j]);
-    }
-  }
-  for (i = 0; i < 2; ++i) {
-    for (j = 0; j < CLASS0_SIZE; ++j) {
-      for (k = 0; k < 3; ++k) {
-        adapt_prob(&cm->fc.nmvc.comps[i].class0_fp[j][k],
-                   cm->fc.pre_nmvc.comps[i].class0_fp[j][k],
-                   prob.comps[i].class0_fp[j][k],
-                   branch_ct_class0_fp[i][j][k]);
-      }
-    }
-    for (j = 0; j < 3; ++j) {
-      adapt_prob(&cm->fc.nmvc.comps[i].fp[j],
-                 cm->fc.pre_nmvc.comps[i].fp[j],
-                 prob.comps[i].fp[j],
-                 branch_ct_fp[i][j]);
-    }
-  }
-  if (usehp) {
-    for (i = 0; i < 2; ++i) {
-      adapt_prob(&cm->fc.nmvc.comps[i].class0_hp,
-                 cm->fc.pre_nmvc.comps[i].class0_hp,
-                 prob.comps[i].class0_hp,
-                 branch_ct_class0_hp[i]);
-      adapt_prob(&cm->fc.nmvc.comps[i].hp,
-                 cm->fc.pre_nmvc.comps[i].hp,
-                 prob.comps[i].hp,
-                 branch_ct_hp[i]);
-    }
-  }
-}
-
-void vp9_entropy_mv_init() {
-  vp9_tokens_from_tree(vp9_mv_joint_encodings, vp9_mv_joint_tree);
-  vp9_tokens_from_tree(vp9_mv_class_encodings, vp9_mv_class_tree);
-  vp9_tokens_from_tree(vp9_mv_class0_encodings, vp9_mv_class0_tree);
-  vp9_tokens_from_tree(vp9_mv_fp_encodings, vp9_mv_fp_tree);
-}
-
-void vp9_init_mv_probs(VP9_COMMON *cm) {
-  vpx_memcpy(&cm->fc.nmvc, &vp9_default_nmv_context, sizeof(nmv_context));
-}
--- a/vp8/common/entropymv.h
+++ /dev/null
@@ -1,129 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_ENTROPYMV_H
-#define __INC_ENTROPYMV_H
-
-#include "treecoder.h"
-#include "vpx_config.h"
-#include "blockd.h"
-
-struct VP9Common;
-
-void vp9_entropy_mv_init();
-void vp9_init_mv_probs(struct VP9Common *cm);
-
-void vp9_adapt_nmv_probs(struct VP9Common *cm, int usehp);
-int vp9_use_nmv_hp(const MV *ref);
-
-#define VP9_NMV_UPDATE_PROB  255
-//#define MV_GROUP_UPDATE
-
-#define LOW_PRECISION_MV_UPDATE  /* Use 7 bit forward update */
-
-/* Symbols for coding which components are zero jointly */
-#define MV_JOINTS     4
-typedef enum {
-  MV_JOINT_ZERO = 0,             /* Zero vector */
-  MV_JOINT_HNZVZ = 1,            /* Vert zero, hor nonzero */
-  MV_JOINT_HZVNZ = 2,            /* Hor zero, vert nonzero */
-  MV_JOINT_HNZVNZ = 3,           /* Both components nonzero */
-} MV_JOINT_TYPE;
-
-extern const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2];
-extern struct vp9_token_struct vp9_mv_joint_encodings [MV_JOINTS];
-
-/* Symbols for coding magnitude class of nonzero components */
-#define MV_CLASSES     8
-typedef enum {
-  MV_CLASS_0 = 0,      /* (0, 2]     integer pel */
-  MV_CLASS_1 = 1,      /* (2, 4]     integer pel */
-  MV_CLASS_2 = 2,      /* (4, 8]     integer pel */
-  MV_CLASS_3 = 3,      /* (8, 16]    integer pel */
-  MV_CLASS_4 = 4,      /* (16, 32]   integer pel */
-  MV_CLASS_5 = 5,      /* (32, 64]   integer pel */
-  MV_CLASS_6 = 6,      /* (64, 128]  integer pel */
-  MV_CLASS_7 = 7,      /* (128, 256] integer pel */
-} MV_CLASS_TYPE;
-
-extern const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2];
-extern struct vp9_token_struct vp9_mv_class_encodings [MV_CLASSES];
-
-#define CLASS0_BITS    1  /* bits at integer precision for class 0 */
-#define CLASS0_SIZE    (1 << CLASS0_BITS)
-#define MV_OFFSET_BITS (MV_CLASSES + CLASS0_BITS - 2)
-
-#define MV_MAX_BITS    (MV_CLASSES + CLASS0_BITS + 2)
-#define MV_MAX         ((1 << MV_MAX_BITS) - 1)
-#define MV_VALS        ((MV_MAX << 1) + 1)
-
-extern const vp9_tree_index vp9_mv_class0_tree[2 * CLASS0_SIZE - 2];
-extern struct vp9_token_struct vp9_mv_class0_encodings[CLASS0_SIZE];
-
-extern const vp9_tree_index vp9_mv_fp_tree[2 * 4 - 2];
-extern struct vp9_token_struct vp9_mv_fp_encodings[4];
-
-typedef struct {
-  vp9_prob sign;
-  vp9_prob classes[MV_CLASSES - 1];
-  vp9_prob class0[CLASS0_SIZE - 1];
-  vp9_prob bits[MV_OFFSET_BITS];
-  vp9_prob class0_fp[CLASS0_SIZE][4 - 1];
-  vp9_prob fp[4 - 1];
-  vp9_prob class0_hp;
-  vp9_prob hp;
-} nmv_component;
-
-typedef struct {
-  vp9_prob joints[MV_JOINTS - 1];
-  nmv_component comps[2];
-} nmv_context;
-
-MV_JOINT_TYPE vp9_get_mv_joint(MV mv);
-MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset);
-int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset);
-
-
-typedef struct {
-  unsigned int mvcount[MV_VALS];
-  unsigned int sign[2];
-  unsigned int classes[MV_CLASSES];
-  unsigned int class0[CLASS0_SIZE];
-  unsigned int bits[MV_OFFSET_BITS][2];
-  unsigned int class0_fp[CLASS0_SIZE][4];
-  unsigned int fp[4];
-  unsigned int class0_hp[2];
-  unsigned int hp[2];
-} nmv_component_counts;
-
-typedef struct {
-  unsigned int joints[MV_JOINTS];
-  nmv_component_counts comps[2];
-} nmv_context_counts;
-
-void vp9_increment_nmv(const MV *mv, const MV *ref, nmv_context_counts *mvctx,
-                       int usehp);
-extern const nmv_context vp9_default_nmv_context;
-void vp9_counts_to_nmv_context(
-    nmv_context_counts *NMVcount,
-    nmv_context *prob,
-    int usehp,
-    unsigned int (*branch_ct_joint)[2],
-    unsigned int (*branch_ct_sign)[2],
-    unsigned int (*branch_ct_classes)[MV_CLASSES - 1][2],
-    unsigned int (*branch_ct_class0)[CLASS0_SIZE - 1][2],
-    unsigned int (*branch_ct_bits)[MV_OFFSET_BITS][2],
-    unsigned int (*branch_ct_class0_fp)[CLASS0_SIZE][4 - 1][2],
-    unsigned int (*branch_ct_fp)[4 - 1][2],
-    unsigned int (*branch_ct_class0_hp)[2],
-    unsigned int (*branch_ct_hp)[2]);
-
-#endif
--- a/vp8/common/extend.c
+++ /dev/null
@@ -1,169 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "extend.h"
-#include "vpx_mem/vpx_mem.h"
-
-static void copy_and_extend_plane(unsigned char *s, /* source */
-                                  int sp,           /* source pitch */
-                                  unsigned char *d, /* destination */
-                                  int dp,           /* destination pitch */
-                                  int h,            /* height */
-                                  int w,            /* width */
-                                  int et,           /* extend top border */
-                                  int el,           /* extend left border */
-                                  int eb,           /* extend bottom border */
-                                  int er) {         /* extend right border */
-  int i;
-  unsigned char *src_ptr1, *src_ptr2;
-  unsigned char *dest_ptr1, *dest_ptr2;
-  int linesize;
-
-  /* copy the left and right most columns out */
-  src_ptr1 = s;
-  src_ptr2 = s + w - 1;
-  dest_ptr1 = d - el;
-  dest_ptr2 = d + w;
-
-  for (i = 0; i < h; i++) {
-    vpx_memset(dest_ptr1, src_ptr1[0], el);
-    vpx_memcpy(dest_ptr1 + el, src_ptr1, w);
-    vpx_memset(dest_ptr2, src_ptr2[0], er);
-    src_ptr1  += sp;
-    src_ptr2  += sp;
-    dest_ptr1 += dp;
-    dest_ptr2 += dp;
-  }
-
-  /* Now copy the top and bottom lines into each line of the respective
-   * borders
-   */
-  src_ptr1 = d - el;
-  src_ptr2 = d + dp * (h - 1) - el;
-  dest_ptr1 = d + dp * (-et) - el;
-  dest_ptr2 = d + dp * (h) - el;
-  linesize = el + er + w;
-
-  for (i = 0; i < et; i++) {
-    vpx_memcpy(dest_ptr1, src_ptr1, linesize);
-    dest_ptr1 += dp;
-  }
-
-  for (i = 0; i < eb; i++) {
-    vpx_memcpy(dest_ptr2, src_ptr2, linesize);
-    dest_ptr2 += dp;
-  }
-}
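A minimal usage sketch for the helper above (buffer sizes hypothetical): extend a 4x4 source by a one-pixel border on every side into a padded 6x6 destination.

unsigned char src[4 * 4];  /* 4x4 source plane, pitch 4 */
unsigned char dst[6 * 6];  /* 6x6 padded destination, pitch 6 */
/* ... fill src ... */
copy_and_extend_plane(src, 4,
                      dst + 6 + 1, 6,  /* d points at the interior origin */
                      4, 4,            /* height, width */
                      1, 1, 1, 1);     /* et, el, eb, er */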
-
-void vp9_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
-                               YV12_BUFFER_CONFIG *dst) {
-  int et = dst->border;
-  int el = dst->border;
-  int eb = dst->border + dst->y_height - src->y_height;
-  int er = dst->border + dst->y_width - src->y_width;
-
-  copy_and_extend_plane(src->y_buffer, src->y_stride,
-                        dst->y_buffer, dst->y_stride,
-                        src->y_height, src->y_width,
-                        et, el, eb, er);
-
-  et = dst->border >> 1;
-  el = dst->border >> 1;
-  eb = (dst->border >> 1) + dst->uv_height - src->uv_height;
-  er = (dst->border >> 1) + dst->uv_width - src->uv_width;
-
-  copy_and_extend_plane(src->u_buffer, src->uv_stride,
-                        dst->u_buffer, dst->uv_stride,
-                        src->uv_height, src->uv_width,
-                        et, el, eb, er);
-
-  copy_and_extend_plane(src->v_buffer, src->uv_stride,
-                        dst->v_buffer, dst->uv_stride,
-                        src->uv_height, src->uv_width,
-                        et, el, eb, er);
-}
-
-void vp9_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src,
-                                         YV12_BUFFER_CONFIG *dst,
-                                         int srcy, int srcx,
-                                         int srch, int srcw) {
-  int et = dst->border;
-  int el = dst->border;
-  int eb = dst->border + dst->y_height - src->y_height;
-  int er = dst->border + dst->y_width - src->y_width;
-  int src_y_offset = srcy * src->y_stride + srcx;
-  int dst_y_offset = srcy * dst->y_stride + srcx;
-  int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1);
-  int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1);
-
-  // If the side is not touching the frame boundary then don't extend.
-  if (srcy)
-    et = 0;
-  if (srcx)
-    el = 0;
-  if (srcy + srch != src->y_height)
-    eb = 0;
-  if (srcx + srcw != src->y_width)
-    er = 0;
-
-  copy_and_extend_plane(src->y_buffer + src_y_offset,
-                        src->y_stride,
-                        dst->y_buffer + dst_y_offset,
-                        dst->y_stride,
-                        srch, srcw,
-                        et, el, eb, er);
-
-  et = (et + 1) >> 1;
-  el = (el + 1) >> 1;
-  eb = (eb + 1) >> 1;
-  er = (er + 1) >> 1;
-  srch = (srch + 1) >> 1;
-  srcw = (srcw + 1) >> 1;
-
-  copy_and_extend_plane(src->u_buffer + src_uv_offset,
-                        src->uv_stride,
-                        dst->u_buffer + dst_uv_offset,
-                        dst->uv_stride,
-                        srch, srcw,
-                        et, el, eb, er);
-
-  copy_and_extend_plane(src->v_buffer + src_uv_offset,
-                        src->uv_stride,
-                        dst->v_buffer + dst_uv_offset,
-                        dst->uv_stride,
-                        srch, srcw,
-                        et, el, eb, er);
-}
-
-/* note: the extension is only for the last row, for intra prediction purposes */
-void vp9_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr,
-                       unsigned char *UPtr, unsigned char *VPtr) {
-  int i;
-
-  YPtr += ybf->y_stride * 14;
-  UPtr += ybf->uv_stride * 6;
-  VPtr += ybf->uv_stride * 6;
-
-  for (i = 0; i < 4; i++) {
-    YPtr[i] = YPtr[-1];
-    UPtr[i] = UPtr[-1];
-    VPtr[i] = VPtr[-1];
-  }
-
-  YPtr += ybf->y_stride;
-  UPtr += ybf->uv_stride;
-  VPtr += ybf->uv_stride;
-
-  for (i = 0; i < 4; i++) {
-    YPtr[i] = YPtr[-1];
-    UPtr[i] = UPtr[-1];
-    VPtr[i] = VPtr[-1];
-  }
-}
--- a/vp8/common/extend.h
+++ /dev/null
@@ -1,27 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef __INC_EXTEND_H
-#define __INC_EXTEND_H
-
-#include "vpx_scale/yv12config.h"
-
-void vp9_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr,
-                       unsigned char *UPtr, unsigned char *VPtr);
-
-void vp9_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
-                               YV12_BUFFER_CONFIG *dst);
-
-void vp9_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src,
-                                         YV12_BUFFER_CONFIG *dst,
-                                         int srcy, int srcx,
-                                         int srch, int srcw);
-
-#endif  // __INC_EXTEND_H
--- a/vp8/common/filter.c
+++ /dev/null
@@ -1,1159 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <stdlib.h>
-#include "filter.h"
-#include "vpx_ports/mem.h"
-#include "vpx_rtcd.h"
-
-DECLARE_ALIGNED(16, const short, vp9_bilinear_filters[SUBPEL_SHIFTS][2]) = {
-  { 128,   0 },
-  { 120,   8 },
-  { 112,  16 },
-  { 104,  24 },
-  {  96,  32 },
-  {  88,  40 },
-  {  80,  48 },
-  {  72,  56 },
-  {  64,  64 },
-  {  56,  72 },
-  {  48,  80 },
-  {  40,  88 },
-  {  32,  96 },
-  {  24, 104 },
-  {  16, 112 },
-  {   8, 120 }
-};
-
-#define FILTER_ALPHA       0
-#define FILTER_ALPHA_SHARP 1
-DECLARE_ALIGNED(16, const short, vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]) = {
-#if FILTER_ALPHA == 0
-  /* Lagrangian interpolation filter */
-  { 0,   0,   0, 128,   0,   0,   0,  0},
-  { 0,   1,  -5, 126,   8,  -3,   1,  0},
-  { -1,   3, -10, 122,  18,  -6,   2,  0},
-  { -1,   4, -13, 118,  27,  -9,   3, -1},
-  { -1,   4, -16, 112,  37, -11,   4, -1},
-  { -1,   5, -18, 105,  48, -14,   4, -1},
-  { -1,   5, -19,  97,  58, -16,   5, -1},
-  { -1,   6, -19,  88,  68, -18,   5, -1},
-  { -1,   6, -19,  78,  78, -19,   6, -1},
-  { -1,   5, -18,  68,  88, -19,   6, -1},
-  { -1,   5, -16,  58,  97, -19,   5, -1},
-  { -1,   4, -14,  48, 105, -18,   5, -1},
-  { -1,   4, -11,  37, 112, -16,   4, -1},
-  { -1,   3,  -9,  27, 118, -13,   4, -1},
-  { 0,   2,  -6,  18, 122, -10,   3, -1},
-  { 0,   1,  -3,   8, 126,  -5,   1,  0}
-#elif FILTER_ALPHA == 50
-  /* Generated using MATLAB:
-   * alpha = 0.5;
-   * b=intfilt(8,4,alpha);
-   * bi=round(128*b);
-   * ba=flipud(reshape([bi 0], 8, 8));
-   * disp(num2str(ba, '%d,'))
-   */
-  { 0,   0,   0, 128,   0,   0,   0,  0},
-  { 0,   1,  -5, 126,   8,  -3,   1,  0},
-  { 0,   2, -10, 122,  18,  -6,   2,  0},
-  { -1,   3, -13, 118,  27,  -9,   3,  0},
-  { -1,   4, -16, 112,  37, -11,   3,  0},
-  { -1,   5, -17, 104,  48, -14,   4, -1},
-  { -1,   5, -18,  96,  58, -16,   5, -1},
-  { -1,   5, -19,  88,  68, -17,   5, -1},
-  { -1,   5, -18,  78,  78, -18,   5, -1},
-  { -1,   5, -17,  68,  88, -19,   5, -1},
-  { -1,   5, -16,  58,  96, -18,   5, -1},
-  { -1,   4, -14,  48, 104, -17,   5, -1},
-  { 0,   3, -11,  37, 112, -16,   4, -1},
-  { 0,   3,  -9,  27, 118, -13,   3, -1},
-  { 0,   2,  -6,  18, 122, -10,   2,  0},
-  { 0,   1,  -3,   8, 126,  -5,   1,  0}
-#endif  /* FILTER_ALPHA */
-};
-
-DECLARE_ALIGNED(16, const short, vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8]) = {
-#if FILTER_ALPHA_SHARP == 1
-  /* dct based filter */
-  {0,   0,   0, 128,   0,   0,   0, 0},
-  {-1,   3,  -7, 127,   8,  -3,   1, 0},
-  {-2,   5, -13, 125,  17,  -6,   3, -1},
-  {-3,   7, -17, 121,  27, -10,   5, -2},
-  {-4,   9, -20, 115,  37, -13,   6, -2},
-  {-4,  10, -23, 108,  48, -16,   8, -3},
-  {-4,  10, -24, 100,  59, -19,   9, -3},
-  {-4,  11, -24,  90,  70, -21,  10, -4},
-  {-4,  11, -23,  80,  80, -23,  11, -4},
-  {-4,  10, -21,  70,  90, -24,  11, -4},
-  {-3,   9, -19,  59, 100, -24,  10, -4},
-  {-3,   8, -16,  48, 108, -23,  10, -4},
-  {-2,   6, -13,  37, 115, -20,   9, -4},
-  {-2,   5, -10,  27, 121, -17,   7, -3},
-  {-1,   3,  -6,  17, 125, -13,   5, -2},
-  {0,   1,  -3,   8, 127,  -7,   3, -1}
-#elif FILTER_ALPHA_SHARP == 75
-  /* alpha = 0.75 */
-  {0,   0,   0, 128,   0,   0,   0, 0},
-  {-1,   2,  -6, 126,   9,  -3,   2, -1},
-  {-1,   4, -11, 123,  18,  -7,   3, -1},
-  {-2,   6, -16, 119,  28, -10,   5, -2},
-  {-2,   7, -19, 113,  38, -13,   6, -2},
-  {-3,   8, -21, 106,  49, -16,   7, -2},
-  {-3,   9, -22,  99,  59, -19,   8, -3},
-  {-3,   9, -23,  90,  70, -21,   9, -3},
-  {-3,   9, -22,  80,  80, -22,   9, -3},
-  {-3,   9, -21,  70,  90, -23,   9, -3},
-  {-3,   8, -19,  59,  99, -22,   9, -3},
-  {-2,   7, -16,  49, 106, -21,   8, -3},
-  {-2,   6, -13,  38, 113, -19,   7, -2},
-  {-2,   5, -10,  28, 119, -16,   6, -2},
-  {-1,   3,  -7,  18, 123, -11,   4, -1},
-  {-1,   2,  -3,   9, 126,  -6,   2, -1}
-#endif  /* FILTER_ALPHA_SHARP */
-};
-
-DECLARE_ALIGNED(16, const short, vp9_sub_pel_filters_6[SUBPEL_SHIFTS][6]) = {
-  {0,   0, 128,   0,   0, 0},
-  {1,  -5, 125,   8,  -2, 1},
-  {1,  -8, 122,  17,  -5, 1},
-  {2, -11, 116,  27,  -8, 2},
-  {3, -14, 110,  37, -10, 2},
-  {3, -15, 103,  47, -12, 2},
-  {3, -16,  95,  57, -14, 3},
-  {3, -16,  86,  67, -15, 3},
-  {3, -16,  77,  77, -16, 3},
-  {3, -15,  67,  86, -16, 3},
-  {3, -14,  57,  95, -16, 3},
-  {2, -12,  47, 103, -15, 3},
-  {2, -10,  37, 110, -14, 3},
-  {2,  -8,  27, 116, -11, 2},
-  {1,  -5,  17, 122,  -8, 1},
-  {1,  -2,   8, 125,  -5, 1}
-};
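One invariant holds across all three tap tables above: every row sums to 128 (VP9_FILTER_WEIGHT), so each sub-pel filter has unity DC gain after the VP9_FILTER_SHIFT normalization. A quick self-check sketch for the 6-tap table:

int i, j, sum;
for (i = 0; i < SUBPEL_SHIFTS; i++) {
  for (sum = 0, j = 0; j < 6; j++)
    sum += vp9_sub_pel_filters_6[i][j];
  assert(sum == 128);  /* unity DC gain */
}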
-
-static void filter_block2d_first_pass_6(unsigned char *src_ptr,
-                                        int *output_ptr,
-                                        unsigned int src_pixels_per_line,
-                                        unsigned int pixel_step,
-                                        unsigned int output_height,
-                                        unsigned int output_width,
-                                        const short *vp9_filter) {
-  unsigned int i, j;
-  int  Temp;
-
-  for (i = 0; i < output_height; i++) {
-    for (j = 0; j < output_width; j++) {
-      Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) +
-             ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) +
-             ((int)src_ptr[0]                    * vp9_filter[2]) +
-             ((int)src_ptr[pixel_step]           * vp9_filter[3]) +
-             ((int)src_ptr[2 * pixel_step]       * vp9_filter[4]) +
-             ((int)src_ptr[3 * pixel_step]       * vp9_filter[5]) +
-             (VP9_FILTER_WEIGHT >> 1);      /* Rounding */
-
-      /* Normalize back to 0-255 */
-      Temp = Temp >> VP9_FILTER_SHIFT;
-
-      if (Temp < 0)
-        Temp = 0;
-      else if (Temp > 255)
-        Temp = 255;
-
-      output_ptr[j] = Temp;
-      src_ptr++;
-    }
-
-    /* Next row... */
-    src_ptr    += src_pixels_per_line - output_width;
-    output_ptr += output_width;
-  }
-}
-
-static void filter_block2d_second_pass_6(int *src_ptr,
-                                         unsigned char *output_ptr,
-                                         int output_pitch,
-                                         unsigned int src_pixels_per_line,
-                                         unsigned int pixel_step,
-                                         unsigned int output_height,
-                                         unsigned int output_width,
-                                         const short *vp9_filter) {
-  unsigned int i, j;
-  int  Temp;
-
-  for (i = 0; i < output_height; i++) {
-    for (j = 0; j < output_width; j++) {
-      /* Apply filter */
-      Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) +
-             ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) +
-             ((int)src_ptr[0]                    * vp9_filter[2]) +
-             ((int)src_ptr[pixel_step]           * vp9_filter[3]) +
-             ((int)src_ptr[2 * pixel_step]       * vp9_filter[4]) +
-             ((int)src_ptr[3 * pixel_step]       * vp9_filter[5]) +
-             (VP9_FILTER_WEIGHT >> 1);   /* Rounding */
-
-      /* Normalize back to 0-255 */
-      Temp = Temp >> VP9_FILTER_SHIFT;
-
-      if (Temp < 0)
-        Temp = 0;
-      else if (Temp > 255)
-        Temp = 255;
-
-      output_ptr[j] = (unsigned char)Temp;
-      src_ptr++;
-    }
-
-    /* Start next row */
-    src_ptr    += src_pixels_per_line - output_width;
-    output_ptr += output_pitch;
-  }
-}
-
-/*
- * The only functional difference between filter_block2d_second_pass_6()
- * and this function is that filter_block2d_second_pass_6() does a sixtap
- * filter on the input and stores it in the output. This function
- * (filter_block2d_second_pass_avg_6()) does the same sixtap filter on the
- * input, and then averages that with the content already present in the
- * output ((filter_result + dest + 1) >> 1) and stores that in the output.
- */
-static void filter_block2d_second_pass_avg_6(int *src_ptr,
-                                             unsigned char *output_ptr,
-                                             int output_pitch,
-                                             unsigned int src_pixels_per_line,
-                                             unsigned int pixel_step,
-                                             unsigned int output_height,
-                                             unsigned int output_width,
-                                             const short *vp9_filter) {
-  unsigned int i, j;
-  int  Temp;
-
-  for (i = 0; i < output_height; i++) {
-    for (j = 0; j < output_width; j++) {
-      /* Apply filter */
-      Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) +
-             ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) +
-             ((int)src_ptr[0]                    * vp9_filter[2]) +
-             ((int)src_ptr[pixel_step]           * vp9_filter[3]) +
-             ((int)src_ptr[2 * pixel_step]       * vp9_filter[4]) +
-             ((int)src_ptr[3 * pixel_step]       * vp9_filter[5]) +
-             (VP9_FILTER_WEIGHT >> 1);   /* Rounding */
-
-      /* Normalize back to 0-255 */
-      Temp = Temp >> VP9_FILTER_SHIFT;
-
-      if (Temp < 0)
-        Temp = 0;
-      else if (Temp > 255)
-        Temp = 255;
-
-      output_ptr[j] = (unsigned char)((output_ptr[j] + Temp + 1) >> 1);
-      src_ptr++;
-    }
-
-    /* Start next row */
-    src_ptr    += src_pixels_per_line - output_width;
-    output_ptr += output_pitch;
-  }
-}
-
-#define Interp_Extend 3
-static void filter_block2d_6(unsigned char  *src_ptr,
-                             unsigned char  *output_ptr,
-                             unsigned int src_pixels_per_line,
-                             int output_pitch,
-                             const short  *HFilter,
-                             const short  *VFilter) {
-  int FData[(3 + Interp_Extend * 2) * 4]; /* Temp data buffer used in filtering */
-
-  /* First filter 1-D horizontally... */
-  filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1,
-                              3 + Interp_Extend * 2, 4, HFilter);
-
-  /* then filter vertically... */
-  filter_block2d_second_pass_6(FData + 4 * (Interp_Extend - 1), output_ptr, output_pitch, 4, 4, 4, 4, VFilter);
-}
-
-
-void vp9_sixtap_predict_c(unsigned char  *src_ptr,
-                          int   src_pixels_per_line,
-                          int  xoffset,
-                          int  yoffset,
-                          unsigned char *dst_ptr,
-                          int dst_pitch) {
-  const short  *HFilter;
-  const short  *VFilter;
-
-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
-
-  filter_block2d_6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter);
-}
-
-/*
- * The difference between filter_block2d_6() and filter_block2d_avg_6 is
- * that filter_block2d_6() does a 6-tap filter and stores it in the output
- * buffer, whereas filter_block2d_avg_6() does the same 6-tap filter, and
- * then averages that with the content already present in the output
- * ((filter_result + dest + 1) >> 1) and stores that in the output.
- */
-static void filter_block2d_avg_6(unsigned char  *src_ptr,
-                                 unsigned char  *output_ptr,
-                                 unsigned int src_pixels_per_line,
-                                 int output_pitch,
-                                 const short  *HFilter,
-                                 const short  *VFilter) {
-  int FData[(3 + Interp_Extend * 2) * 4]; /* Temp data buffer used in filtering */
-
-  /* First filter 1-D horizontally... */
-  filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line),
-                              FData, src_pixels_per_line, 1,
-                              3 + Interp_Extend * 2, 4, HFilter);
-
-  /* then filter vertically... */
-  filter_block2d_second_pass_avg_6(FData + 4 * (Interp_Extend - 1), output_ptr,
-                                   output_pitch, 4, 4, 4, 4, VFilter);
-}
-
-void vp9_sixtap_predict_avg_c(unsigned char *src_ptr,
-                              int src_pixels_per_line,
-                              int xoffset,
-                              int yoffset,
-                              unsigned char *dst_ptr,
-                              int dst_pitch) {
-  const short  *HFilter;
-  const short  *VFilter;
-
-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
-
-  filter_block2d_avg_6(src_ptr, dst_ptr, src_pixels_per_line,
-                       dst_pitch, HFilter, VFilter);
-}
-
-void vp9_sixtap_predict8x8_c(unsigned char *src_ptr,
-                             int src_pixels_per_line,
-                             int xoffset,
-                             int yoffset,
-                             unsigned char *dst_ptr,
-                             int dst_pitch) {
-  const short  *HFilter;
-  const short  *VFilter;
-  int FData[(7 + Interp_Extend * 2) * 8]; /* Temp data buffer used in filtering */
-
-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
-
-  /* First filter 1-D horizontally... */
-  filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1,
-                              7 + Interp_Extend * 2, 8, HFilter);
-
-  /* then filter vertically... */
-  filter_block2d_second_pass_6(FData + 8 * (Interp_Extend - 1), dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter);
-}
-
-void vp9_sixtap_predict_avg8x8_c(unsigned char *src_ptr,
-                                 int src_pixels_per_line,
-                                 int xoffset,
-                                 int yoffset,
-                                 unsigned char *dst_ptr,
-                                 int dst_pitch) {
-  const short  *HFilter;
-  const short  *VFilter;
-  int FData[(7 + Interp_Extend * 2) * 8]; /* Temp data buffer used in filtering */
-
-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
-
-  /* First filter 1-D horizontally... */
-  filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1,
-                              7 + Interp_Extend * 2, 8, HFilter);
-
-  /* then filter vertically... */
-  filter_block2d_second_pass_avg_6(FData + 8 * (Interp_Extend - 1), dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter);
-}
-
-void vp9_sixtap_predict8x4_c(unsigned char *src_ptr,
-                             int src_pixels_per_line,
-                             int xoffset,
-                             int yoffset,
-                             unsigned char *dst_ptr,
-                             int dst_pitch) {
-  const short  *HFilter;
-  const short  *VFilter;
-  int FData[(3 + Interp_Extend * 2) * 8]; /* Temp data buffer used in filtering */
-
-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
-
-  /* First filter 1-D horizontally... */
-  filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1,
-                              3 + Interp_Extend * 2, 8, HFilter);
-
-  /* then filter vertically... */
-  filter_block2d_second_pass_6(FData + 8 * (Interp_Extend - 1), dst_ptr, dst_pitch, 8, 8, 4, 8, VFilter);
-}
-
-void vp9_sixtap_predict16x16_c(unsigned char *src_ptr,
-                               int src_pixels_per_line,
-                               int xoffset,
-                               int yoffset,
-                               unsigned char *dst_ptr,
-                               int dst_pitch) {
-  const short  *HFilter;
-  const short  *VFilter;
-  int FData[(15 + Interp_Extend * 2) * 16]; /* Temp data buffer used in filtering */
-
-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
-
-  /* First filter 1-D horizontally... */
-  filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1,
-                              15 + Interp_Extend * 2, 16, HFilter);
-
-  /* then filter vertically... */
-  filter_block2d_second_pass_6(FData + 16 * (Interp_Extend - 1), dst_ptr, dst_pitch, 16, 16, 16, 16, VFilter);
-}
-
-void vp9_sixtap_predict_avg16x16_c(unsigned char *src_ptr,
-                                   int src_pixels_per_line,
-                                   int xoffset,
-                                   int yoffset,
-                                   unsigned char *dst_ptr,
-                                   int dst_pitch) {
-  const short  *HFilter;
-  const short  *VFilter;
-  int FData[(15 + Interp_Extend * 2) * 16]; /* Temp data buffer used in filtering */
-
-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
-
-  /* First filter 1-D horizontally... */
-  filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,
-                              src_pixels_per_line, 1, 15 + Interp_Extend * 2, 16, HFilter);
-
-  /* then filter vertically... */
-  filter_block2d_second_pass_avg_6(FData + 16 * (Interp_Extend - 1), dst_ptr, dst_pitch,
-                                   16, 16, 16, 16, VFilter);
-}
-
-typedef enum {
-  VPX_FILTER_4x4 = 0,
-  VPX_FILTER_8x8 = 1,
-  VPX_FILTER_8x4 = 2,
-  VPX_FILTER_16x16 = 3,
-} filter_size_t;
-
-static const unsigned int filter_size_to_wh[][2] = {
-  {4, 4},
-  {8, 8},
-  {8, 4},
-  {16, 16},
-};
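-/* Indexed by filter_size_t: element [0] is the output width and [1] the
- * output height, so e.g. filter_size_to_wh[VPX_FILTER_8x4] is {8, 4}. */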
-
-static const unsigned int filter_max_height = 16;
-static const unsigned int filter_max_width = 16;
-
-static void filter_block2d_8_c(const unsigned char *src_ptr,
-                               const unsigned int   src_stride,
-                               const short *HFilter,
-                               const short *VFilter,
-                               const filter_size_t filter_size,
-                               unsigned char *dst_ptr,
-                               unsigned int   dst_stride) {
-  const unsigned int output_width = filter_size_to_wh[filter_size][0];
-  const unsigned int output_height = filter_size_to_wh[filter_size][1];
-
-  // Between passes, we use an intermediate buffer whose height is extended to
-  // have enough horizontally filtered values as input for the vertical pass.
-  // This buffer is allocated to be big enough for the largest block type we
-  // support.
-  const int kInterp_Extend = 4;
-  const unsigned int intermediate_height =
-    (kInterp_Extend - 1) +     output_height + kInterp_Extend;
-  const unsigned int max_intermediate_height =
-    (kInterp_Extend - 1) + filter_max_height + kInterp_Extend;
-#ifdef _MSC_VER
-  // MSVC does not support C99 style declaration
-  unsigned char intermediate_buffer[23 * 16];
-#else
-  unsigned char intermediate_buffer[max_intermediate_height * filter_max_width];
-#endif
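-  /* For the largest supported block (16x16) with kInterp_Extend == 4,
-   * max_intermediate_height is (4 - 1) + 16 + 4 == 23, which is why the
-   * fixed-size MSVC fallback buffer above is 23 * 16 bytes. */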
-  const int intermediate_next_stride = 1 - intermediate_height * output_width;
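-  /* The intermediate buffer is written transposed: the horizontal pass
-   * advances output_ptr by intermediate_height per output pixel, then this
-   * (negative) stride steps it back to the next transposed row (+1 net). */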
-
-  // Horizontal pass (src -> transposed intermediate).
-  {
-    unsigned char *output_ptr = intermediate_buffer;
-    const int src_next_row_stride = src_stride - output_width;
-    unsigned int i, j;
-    src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
-    for (i = 0; i < intermediate_height; i++) {
-      for (j = 0; j < output_width; j++) {
-        // Apply filter...
-        int temp = ((int)src_ptr[0] * HFilter[0]) +
-                   ((int)src_ptr[1] * HFilter[1]) +
-                   ((int)src_ptr[2] * HFilter[2]) +
-                   ((int)src_ptr[3] * HFilter[3]) +
-                   ((int)src_ptr[4] * HFilter[4]) +
-                   ((int)src_ptr[5] * HFilter[5]) +
-                   ((int)src_ptr[6] * HFilter[6]) +
-                   ((int)src_ptr[7] * HFilter[7]) +
-                   (VP9_FILTER_WEIGHT >> 1); // Rounding
-
-        // Normalize back to 0-255...
-        temp >>= VP9_FILTER_SHIFT;
-        if (temp < 0) {
-          temp = 0;
-        } else if (temp > 255) {
-          temp = 255;
-        }
-        src_ptr++;
-        *output_ptr = temp;
-        output_ptr += intermediate_height;
-      }
-      src_ptr += src_next_row_stride;
-      output_ptr += intermediate_next_stride;
-    }
-  }
-
-  // Vertical pass (transposed intermediate -> dst).
-  {
-    unsigned char *src_ptr = intermediate_buffer;
-    const int dst_next_row_stride = dst_stride - output_width;
-    unsigned int i, j;
-    for (i = 0; i < output_height; i++) {
-      for (j = 0; j < output_width; j++) {
-        // Apply filter...
-        int temp = ((int)src_ptr[0] * VFilter[0]) +
-                   ((int)src_ptr[1] * VFilter[1]) +
-                   ((int)src_ptr[2] * VFilter[2]) +
-                   ((int)src_ptr[3] * VFilter[3]) +
-                   ((int)src_ptr[4] * VFilter[4]) +
-                   ((int)src_ptr[5] * VFilter[5]) +
-                   ((int)src_ptr[6] * VFilter[6]) +
-                   ((int)src_ptr[7] * VFilter[7]) +
-                   (VP9_FILTER_WEIGHT >> 1); // Rounding
-
-        // Normalize back to 0-255...
-        temp >>= VP9_FILTER_SHIFT;
-        if (temp < 0) {
-          temp = 0;
-        } else if (temp > 255) {
-          temp = 255;
-        }
-
-        src_ptr += intermediate_height;
-        *dst_ptr++ = (unsigned char)temp;
-      }
-      src_ptr += intermediate_next_stride;
-      dst_ptr += dst_next_row_stride;
-    }
-  }
-}
-
-void vp9_filter_block2d_4x4_8_c(const unsigned char *src_ptr,
-                                const unsigned int src_stride,
-                                const short *HFilter_aligned16,
-                                const short *VFilter_aligned16,
-                                unsigned char *dst_ptr,
-                                unsigned int dst_stride) {
-  filter_block2d_8_c(src_ptr, src_stride,
-                     HFilter_aligned16, VFilter_aligned16,
-                     VPX_FILTER_4x4, dst_ptr, dst_stride);
-}
-
-void vp9_filter_block2d_8x4_8_c(const unsigned char *src_ptr,
-                                const unsigned int src_stride,
-                                const short *HFilter_aligned16,
-                                const short *VFilter_aligned16,
-                                unsigned char *dst_ptr,
-                                unsigned int dst_stride) {
-  filter_block2d_8_c(src_ptr, src_stride,
-                     HFilter_aligned16, VFilter_aligned16,
-                     VPX_FILTER_8x4, dst_ptr, dst_stride);
-}
-
-void vp9_filter_block2d_8x8_8_c(const unsigned char *src_ptr,
-                                const unsigned int src_stride,
-                                const short *HFilter_aligned16,
-                                const short *VFilter_aligned16,
-                                unsigned char *dst_ptr,
-                                unsigned int dst_stride) {
-  filter_block2d_8_c(src_ptr, src_stride,
-                     HFilter_aligned16, VFilter_aligned16,
-                     VPX_FILTER_8x8, dst_ptr, dst_stride);
-}
-
-void vp9_filter_block2d_16x16_8_c(const unsigned char *src_ptr,
-                                  const unsigned int src_stride,
-                                  const short *HFilter_aligned16,
-                                  const short *VFilter_aligned16,
-                                  unsigned char *dst_ptr,
-                                  unsigned int dst_stride) {
-  filter_block2d_8_c(src_ptr, src_stride,
-                     HFilter_aligned16, VFilter_aligned16,
-                     VPX_FILTER_16x16, dst_ptr, dst_stride);
-}
-
-static void block2d_average_c(unsigned char *src,
-                              unsigned int   src_stride,
-                              unsigned char *output_ptr,
-                              unsigned int output_stride,
-                              const filter_size_t filter_size) {
-  const unsigned int output_width = filter_size_to_wh[filter_size][0];
-  const unsigned int output_height = filter_size_to_wh[filter_size][1];
-
-  unsigned int i, j;
-  for (i = 0; i < output_height; i++) {
-    for (j = 0; j < output_width; j++) {
-      output_ptr[j] = (output_ptr[j] + src[i * src_stride + j] + 1) >> 1;
-    }
-    output_ptr += output_stride;
-  }
-}
-
-#define block2d_average block2d_average_c
-
-void vp9_eighttap_predict_c(unsigned char  *src_ptr,
-                            int   src_pixels_per_line,
-                            int  xoffset,
-                            int  yoffset,
-                            unsigned char *dst_ptr,
-                            int dst_pitch) {
-  const short  *HFilter;
-  const short  *VFilter;
-
-  HFilter = vp9_sub_pel_filters_8[xoffset];
-  VFilter = vp9_sub_pel_filters_8[yoffset];
-
-  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line,
-                           HFilter, VFilter,
-                           dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict_avg4x4_c(unsigned char  *src_ptr,
-                                   int   src_pixels_per_line,
-                                   int  xoffset,
-                                   int  yoffset,
-                                   unsigned char *dst_ptr,
-                                   int dst_pitch) {
-  const short  *HFilter = vp9_sub_pel_filters_8[xoffset];
-  const short  *VFilter = vp9_sub_pel_filters_8[yoffset];
-  unsigned char tmp[4 * 4];
-
-  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line,
-                           HFilter, VFilter,
-                           tmp, 4);
-  block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4);
-}
-
-void vp9_eighttap_predict_sharp_c(unsigned char  *src_ptr,
-                                  int   src_pixels_per_line,
-                                  int  xoffset,
-                                  int  yoffset,
-                                  unsigned char *dst_ptr,
-                                  int dst_pitch) {
-  const short  *HFilter;
-  const short  *VFilter;
-
-  HFilter = vp9_sub_pel_filters_8s[xoffset];
-  VFilter = vp9_sub_pel_filters_8s[yoffset];
-
-  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line,
-                           HFilter, VFilter,
-                           dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict_avg4x4_sharp_c(unsigned char  *src_ptr,
-                                         int   src_pixels_per_line,
-                                         int  xoffset,
-                                         int  yoffset,
-                                         unsigned char *dst_ptr,
-                                         int dst_pitch) {
-  const short  *HFilter = vp9_sub_pel_filters_8s[xoffset];
-  const short  *VFilter = vp9_sub_pel_filters_8s[yoffset];
-  unsigned char tmp[4 * 4];
-
-  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line,
-                           HFilter, VFilter,
-                           tmp, 4);
-  block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4);
-}
-
-void vp9_eighttap_predict8x8_c(unsigned char  *src_ptr,
-                               int  src_pixels_per_line,
-                               int  xoffset,
-                               int  yoffset,
-                               unsigned char *dst_ptr,
-                               int  dst_pitch) {
-  const short  *HFilter = vp9_sub_pel_filters_8[xoffset];
-  const short  *VFilter = vp9_sub_pel_filters_8[yoffset];
-
-  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line,
-                           HFilter, VFilter,
-                           dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict8x8_sharp_c(unsigned char  *src_ptr,
-                                     int  src_pixels_per_line,
-                                     int  xoffset,
-                                     int  yoffset,
-                                     unsigned char *dst_ptr,
-                                     int  dst_pitch) {
-  const short  *HFilter = vp9_sub_pel_filters_8s[xoffset];
-  const short  *VFilter = vp9_sub_pel_filters_8s[yoffset];
-
-  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line,
-                           HFilter, VFilter,
-                           dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict_avg8x8_c(unsigned char  *src_ptr,
-                                   int  src_pixels_per_line,
-                                   int  xoffset,
-                                   int  yoffset,
-                                   unsigned char *dst_ptr,
-                                   int  dst_pitch) {
-  unsigned char tmp[8 * 8];
-  const short  *HFilter = vp9_sub_pel_filters_8[xoffset];
-  const short  *VFilter = vp9_sub_pel_filters_8[yoffset];
-
-  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line,
-                           HFilter, VFilter,
-                           tmp, 8);
-  block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8);
-}
-
-void vp9_eighttap_predict_avg8x8_sharp_c(unsigned char  *src_ptr,
-                                         int  src_pixels_per_line,
-                                         int  xoffset,
-                                         int  yoffset,
-                                         unsigned char *dst_ptr,
-                                         int  dst_pitch) {
-  unsigned char tmp[8 * 8];
-  const short  *HFilter = vp9_sub_pel_filters_8s[xoffset];
-  const short  *VFilter = vp9_sub_pel_filters_8s[yoffset];
-
-  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line,
-                           HFilter, VFilter,
-                           tmp, 8);
-  block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8);
-}
-
-void vp9_eighttap_predict8x4_c(unsigned char  *src_ptr,
-                               int  src_pixels_per_line,
-                               int  xoffset,
-                               int  yoffset,
-                               unsigned char *dst_ptr,
-                               int  dst_pitch) {
-  const short  *HFilter = vp9_sub_pel_filters_8[xoffset];
-  const short  *VFilter = vp9_sub_pel_filters_8[yoffset];
-
-  vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line,
-                           HFilter, VFilter,
-                           dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict8x4_sharp_c(unsigned char  *src_ptr,
-                                     int  src_pixels_per_line,
-                                     int  xoffset,
-                                     int  yoffset,
-                                     unsigned char *dst_ptr,
-                                     int  dst_pitch) {
-  const short  *HFilter = vp9_sub_pel_filters_8s[xoffset];
-  const short  *VFilter = vp9_sub_pel_filters_8s[yoffset];
-
-  vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line,
-                           HFilter, VFilter,
-                           dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict16x16_c(unsigned char  *src_ptr,
-                                 int  src_pixels_per_line,
-                                 int  xoffset,
-                                 int  yoffset,
-                                 unsigned char *dst_ptr,
-                                 int  dst_pitch) {
-  const short  *HFilter = vp9_sub_pel_filters_8[xoffset];
-  const short  *VFilter = vp9_sub_pel_filters_8[yoffset];
-
-  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line,
-                             HFilter, VFilter,
-                             dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict16x16_sharp_c(unsigned char  *src_ptr,
-                                       int  src_pixels_per_line,
-                                       int  xoffset,
-                                       int  yoffset,
-                                       unsigned char *dst_ptr,
-                                       int  dst_pitch) {
-  const short  *HFilter = vp9_sub_pel_filters_8s[xoffset];
-  const short  *VFilter = vp9_sub_pel_filters_8s[yoffset];
-
-  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line,
-                             HFilter, VFilter,
-                             dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict_avg16x16_c(unsigned char  *src_ptr,
-                                     int  src_pixels_per_line,
-                                     int  xoffset,
-                                     int  yoffset,
-                                     unsigned char *dst_ptr,
-                                     int  dst_pitch) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 16 * 16);
-  const short  *HFilter = vp9_sub_pel_filters_8[xoffset];
-  const short  *VFilter = vp9_sub_pel_filters_8[yoffset];
-
-  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line,
-                             HFilter, VFilter,
-                             tmp, 16);
-  block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16);
-}
-
-void vp9_eighttap_predict_avg16x16_sharp_c(unsigned char  *src_ptr,
-                                           int  src_pixels_per_line,
-                                           int  xoffset,
-                                           int  yoffset,
-                                           unsigned char *dst_ptr,
-                                           int  dst_pitch) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 16 * 16);
-  const short  *HFilter = vp9_sub_pel_filters_8s[xoffset];
-  const short  *VFilter = vp9_sub_pel_filters_8s[yoffset];
-
-  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line,
-                             HFilter, VFilter,
-                             tmp, 16);
-  block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16);
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : filter_block2d_bil_first_pass
- *
- *  INPUTS        : UINT8  *src_ptr    : Pointer to source block.
- *                  UINT32  src_stride : Stride of source block.
- *                  UINT32  height     : Block height.
- *                  UINT32  width      : Block width.
- *                  INT16  *vp9_filter : Array of 2 bi-linear filter taps.
- *
- *  OUTPUTS       : UINT16 *dst_ptr    : Pointer to filtered block.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block
- *                  in the horizontal direction to produce the filtered output
- *                  block. Used to implement first-pass of 2-D separable filter.
- *
- *  SPECIAL NOTES : Produces UINT16 output to retain precision for next pass.
- *                  Two filter taps should sum to VP9_FILTER_WEIGHT.
- *
- ****************************************************************************/
-static void filter_block2d_bil_first_pass(unsigned char  *src_ptr,
-                                          unsigned short *dst_ptr,
-                                          unsigned int    src_stride,
-                                          unsigned int    height,
-                                          unsigned int    width,
-                                          const short    *vp9_filter) {
-  unsigned int i, j;
-
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      /* Apply bilinear filter */
-      dst_ptr[j] = (((int)src_ptr[0] * vp9_filter[0]) +
-                    ((int)src_ptr[1] * vp9_filter[1]) +
-                    (VP9_FILTER_WEIGHT / 2)) >> VP9_FILTER_SHIFT;
-      src_ptr++;
-    }
-
-    /* Next row... */
-    src_ptr += src_stride - width;
-    dst_ptr += width;
-  }
-}
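-/*
- * Illustrative half-pel case (assuming the half-pel entry of the bilinear
- * table is {64, 64}, taps summing to VP9_FILTER_WEIGHT == 128): the
- * expression above reduces to
- *   dst = (64 * a + 64 * b + 64) >> 7 == (a + b + 1) >> 1,
- * i.e. a rounded average of the two neighbouring source pixels.
- */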
-
-/****************************************************************************
- *
- *  ROUTINE       : filter_block2d_bil_second_pass
- *
- *  INPUTS        : UINT16 *src_ptr    : Pointer to source block.
- *                  UINT32  dst_pitch  : Destination block pitch.
- *                  UINT32  height     : Block height.
- *                  UINT32  width      : Block width.
- *                  INT16  *vp9_filter : Array of 2 bi-linear filter taps.
- *
- *  OUTPUTS       : UINT8  *dst_ptr    : Pointer to filtered block.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block
- *                  in the vertical direction to produce the filtered output
- *                  block. Used to implement second-pass of 2-D separable filter.
- *
- *  SPECIAL NOTES : Requires 16-bit input as produced by filter_block2d_bil_first_pass.
- *                  Two filter taps should sum to VP9_FILTER_WEIGHT.
- *
- ****************************************************************************/
-static void filter_block2d_bil_second_pass(unsigned short *src_ptr,
-                                           unsigned char  *dst_ptr,
-                                           int             dst_pitch,
-                                           unsigned int    height,
-                                           unsigned int    width,
-                                           const short    *vp9_filter) {
-  unsigned int  i, j;
-  int  Temp;
-
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      /* Apply filter */
-      Temp = ((int)src_ptr[0]     * vp9_filter[0]) +
-             ((int)src_ptr[width] * vp9_filter[1]) +
-             (VP9_FILTER_WEIGHT / 2);
-      dst_ptr[j] = (unsigned char)(Temp >> VP9_FILTER_SHIFT);
-      src_ptr++;
-    }
-
-    /* Next row... */
-    dst_ptr += dst_pitch;
-  }
-}
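-/* Note: in the vertical pass src_ptr[width] is the sample one row below
- * src_ptr[0], because the first pass stored its output in rows of exactly
- * `width` entries. */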
-
-/*
- * As with the earlier filter_block2d_second_pass_avg(), the functional
- * difference between filter_block2d_bil_second_pass() and
- * filter_block2d_bil_second_pass_avg() is that the former stores the
- * bilinear filter result directly in the output, whereas the latter
- * averages the filter result with the value already present in the output
- * ((filter_result + dest + 1) >> 1) and stores that back into the output.
- */
-static void filter_block2d_bil_second_pass_avg(unsigned short *src_ptr,
-                                               unsigned char  *dst_ptr,
-                                               int             dst_pitch,
-                                               unsigned int    height,
-                                               unsigned int    width,
-                                               const short    *vp9_filter) {
-  unsigned int  i, j;
-  int  Temp;
-
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      /* Apply filter */
-      Temp = ((int)src_ptr[0]     * vp9_filter[0]) +
-             ((int)src_ptr[width] * vp9_filter[1]) +
-             (VP9_FILTER_WEIGHT / 2);
-      dst_ptr[j] = (unsigned char)(((Temp >> VP9_FILTER_SHIFT) + dst_ptr[j] + 1) >> 1);
-      src_ptr++;
-    }
-
-    /* Next row... */
-    dst_ptr += dst_pitch;
-  }
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : filter_block2d_bil
- *
- *  INPUTS        : UINT8  *src_ptr          : Pointer to source block.
- *                  UINT32  src_pitch        : Stride of source block.
- *                  UINT32  dst_pitch        : Stride of destination block.
- *                  INT16  *HFilter          : Array of 2 horizontal filter taps.
- *                  INT16  *VFilter          : Array of 2 vertical filter taps.
- *                  INT32  Width             : Block width
- *                  INT32  Height            : Block height
- *
- *  OUTPUTS       : UINT8  *dst_ptr       : Pointer to filtered block.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : 2-D filters an input block by applying a 2-tap
- *                  bi-linear filter horizontally followed by a 2-tap
- *                  bi-linear filter vertically on the result.
- *
- *  SPECIAL NOTES : The largest block size that can be handled here is 16x16.
- *
- ****************************************************************************/
-static void filter_block2d_bil(unsigned char *src_ptr,
-                               unsigned char *dst_ptr,
-                               unsigned int   src_pitch,
-                               unsigned int   dst_pitch,
-                               const short   *HFilter,
-                               const short   *VFilter,
-                               int            Width,
-                               int            Height) {
-  unsigned short FData[17 * 16];  /* Temp data buffer used in filtering */
-
-  /* First filter 1-D horizontally... */
-  filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
-
-  /* then 1-D vertically... */
-  filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
-}
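-/* The 2-tap vertical pass reads one row beyond the block, so the first pass
- * produces Height + 1 rows; for the largest 16x16 block that is 17 rows of
- * 16 samples, matching the FData[17 * 16] buffer above. */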
-
-static void filter_block2d_bil_avg(unsigned char *src_ptr,
-                                   unsigned char *dst_ptr,
-                                   unsigned int   src_pitch,
-                                   unsigned int   dst_pitch,
-                                   const short   *HFilter,
-                                   const short   *VFilter,
-                                   int            Width,
-                                   int            Height) {
-  unsigned short FData[17 * 16];  /* Temp data buffer used in filtering */
-
-  /* First filter 1-D horizontally... */
-  filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
-
-  /* then 1-D vertically... */
-  filter_block2d_bil_second_pass_avg(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
-}
-
-void vp9_bilinear_predict4x4_c(unsigned char  *src_ptr,
-                               int   src_pixels_per_line,
-                               int  xoffset,
-                               int  yoffset,
-                               unsigned char *dst_ptr,
-                               int dst_pitch) {
-  const short *HFilter;
-  const short *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
-}
-
-void vp9_bilinear_predict_avg4x4_c(unsigned char  *src_ptr,
-                                   int   src_pixels_per_line,
-                                   int  xoffset,
-                                   int  yoffset,
-                                   unsigned char *dst_ptr,
-                                   int dst_pitch) {
-  const short *HFilter;
-  const short *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line,
-                         dst_pitch, HFilter, VFilter, 4, 4);
-}
-
-void vp9_bilinear_predict8x8_c(unsigned char  *src_ptr,
-                               int  src_pixels_per_line,
-                               int  xoffset,
-                               int  yoffset,
-                               unsigned char *dst_ptr,
-                               int  dst_pitch) {
-  const short *HFilter;
-  const short *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
-
-}
-
-void vp9_bilinear_predict_avg8x8_c(unsigned char  *src_ptr,
-                                   int  src_pixels_per_line,
-                                   int  xoffset,
-                                   int  yoffset,
-                                   unsigned char *dst_ptr,
-                                   int  dst_pitch) {
-  const short *HFilter;
-  const short *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line,
-                         dst_pitch, HFilter, VFilter, 8, 8);
-}
-
-void vp9_bilinear_predict8x4_c(unsigned char  *src_ptr,
-                               int  src_pixels_per_line,
-                               int  xoffset,
-                               int  yoffset,
-                               unsigned char *dst_ptr,
-                               int  dst_pitch) {
-  const short *HFilter;
-  const short *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
-
-}
-
-void vp9_bilinear_predict16x16_c(unsigned char  *src_ptr,
-                                 int  src_pixels_per_line,
-                                 int  xoffset,
-                                 int  yoffset,
-                                 unsigned char *dst_ptr,
-                                 int  dst_pitch) {
-  const short *HFilter;
-  const short *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
-}
-
-void vp9_bilinear_predict_avg16x16_c(unsigned char  *src_ptr,
-                                     int  src_pixels_per_line,
-                                     int  xoffset,
-                                     int  yoffset,
-                                     unsigned char *dst_ptr,
-                                     int  dst_pitch) {
-  const short *HFilter;
-  const short *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line,
-                         dst_pitch, HFilter, VFilter, 16, 16);
-}
--- a/vp8/common/filter.h
+++ /dev/null
@@ -1,28 +1,0 @@
-/*
- *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef FILTER_H
-#define FILTER_H
-
-#include "vpx_config.h"
-#include "vpx_scale/yv12config.h"
-
-#define BLOCK_HEIGHT_WIDTH 4
-#define VP9_FILTER_WEIGHT 128
-#define VP9_FILTER_SHIFT  7
-
-#define SUBPEL_SHIFTS 16
-
-extern const short vp9_bilinear_filters[SUBPEL_SHIFTS][2];
-extern const short vp9_sub_pel_filters_6[SUBPEL_SHIFTS][6];
-extern const short vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8];
-extern const short vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8];
-
-#endif // FILTER_H
--- a/vp8/common/findnearmv.c
+++ /dev/null
@@ -1,327 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "findnearmv.h"
-#include "vp8/common/sadmxn.h"
-#include <limits.h>
-
-const unsigned char vp9_mbsplit_offset[4][16] = {
-  { 0,  8,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
-  { 0,  2,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
-  { 0,  2,  8, 10,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
-  { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15}
-};
-
-static void lower_mv_precision(int_mv *mv, int usehp) {
-  if (!usehp || !vp9_use_nmv_hp(&mv->as_mv)) {
-    if (mv->as_mv.row & 1)
-      mv->as_mv.row += (mv->as_mv.row > 0 ? -1 : 1);
-    if (mv->as_mv.col & 1)
-      mv->as_mv.col += (mv->as_mv.col > 0 ? -1 : 1);
-  }
-}
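-/* Example: with high precision disabled, a row component of 5 becomes 4 and
- * -5 becomes -4, i.e. the 1/8th-pel bit is truncated towards zero. */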
-
-/* Predict motion vectors using those from already-decoded nearby blocks.
-   Note that we only consider one 4x4 subblock from each candidate 16x16
-   macroblock.   */
-
-void vp9_find_near_mvs(MACROBLOCKD *xd,
-                       const MODE_INFO *here,
-                       const MODE_INFO *lf_here,
-                       int_mv *nearest,
-                       int_mv *nearby,
-                       int_mv *best_mv,
-                       int cnt[4],
-                       int refframe,
-                       int *ref_frame_sign_bias) {
-  const MODE_INFO *above = here - xd->mode_info_stride;
-  const MODE_INFO *left = here - 1;
-  const MODE_INFO *aboveleft = above - 1;
-  const MODE_INFO *third = NULL;
-  int_mv            near_mvs[4];
-  int_mv           *mv = near_mvs;
-  int             *cntx = cnt;
-  enum {CNT_INTRA, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV};
-
-  /* Zero accumulators */
-  mv[0].as_int = mv[1].as_int = mv[2].as_int = 0;
-  cnt[0] = cnt[1] = cnt[2] = cnt[3] = 0;
-
-  /* Process above */
-  if (above->mbmi.ref_frame != INTRA_FRAME) {
-    if (above->mbmi.mv[0].as_int) {
-      ++mv;
-      mv->as_int = above->mbmi.mv[0].as_int;
-      mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame],
-              refframe, mv, ref_frame_sign_bias);
-      ++cntx;
-    }
-    *cntx += 2;
-  }
-
-  /* Process left */
-  if (left->mbmi.ref_frame != INTRA_FRAME) {
-    if (left->mbmi.mv[0].as_int) {
-      int_mv this_mv;
-      this_mv.as_int = left->mbmi.mv[0].as_int;
-      mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame],
-              refframe, &this_mv, ref_frame_sign_bias);
-
-      if (this_mv.as_int != mv->as_int) {
-        ++mv;
-        mv->as_int = this_mv.as_int;
-        ++cntx;
-      }
-      *cntx += 2;
-    } else
-      cnt[CNT_INTRA] += 2;
-  }
-  /* Process above left or the one from last frame */
-  if (aboveleft->mbmi.ref_frame != INTRA_FRAME ||
-      (lf_here->mbmi.ref_frame == LAST_FRAME && refframe == LAST_FRAME)) {
-    if (aboveleft->mbmi.mv[0].as_int) {
-      third = aboveleft;
-    } else if (lf_here->mbmi.mv[0].as_int) {
-      third = lf_here;
-    }
-    if (third) {
-      int_mv this_mv;
-      this_mv.as_int = third->mbmi.mv[0].as_int;
-      mv_bias(ref_frame_sign_bias[third->mbmi.ref_frame],
-              refframe, &this_mv, ref_frame_sign_bias);
-
-      if (this_mv.as_int != mv->as_int) {
-        ++mv;
-        mv->as_int = this_mv.as_int;
-        ++cntx;
-      }
-      *cntx += 1;
-    } else
-      cnt[CNT_INTRA] += 1;
-  }
-
-  /* If we have three distinct MVs... */
-  if (cnt[CNT_SPLITMV]) {
-    /* See if the third MV can be merged with NEAREST */
-    if (mv->as_int == near_mvs[CNT_NEAREST].as_int)
-      cnt[CNT_NEAREST] += 1;
-  }
-
-  cnt[CNT_SPLITMV] = ((above->mbmi.mode == SPLITMV)
-                      + (left->mbmi.mode == SPLITMV)) * 2
-                     + (
-                       lf_here->mbmi.mode == SPLITMV ||
-                       aboveleft->mbmi.mode == SPLITMV);
-
-  /* Swap near and nearest if necessary */
-  if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
-    int tmp;
-    tmp = cnt[CNT_NEAREST];
-    cnt[CNT_NEAREST] = cnt[CNT_NEAR];
-    cnt[CNT_NEAR] = tmp;
-    tmp = near_mvs[CNT_NEAREST].as_int;
-    near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int;
-    near_mvs[CNT_NEAR].as_int = tmp;
-  }
-
-  /* Use near_mvs[0] to store the "best" MV */
-  if (cnt[CNT_NEAREST] >= cnt[CNT_INTRA])
-    near_mvs[CNT_INTRA] = near_mvs[CNT_NEAREST];
-
-  /* Set up return values */
-  best_mv->as_int = near_mvs[0].as_int;
-  nearest->as_int = near_mvs[CNT_NEAREST].as_int;
-  nearby->as_int = near_mvs[CNT_NEAR].as_int;
-
-  /* Make sure that the 1/8th-pel bits of the MVs are zero if high precision
-   * is not being used, by truncating the last bit towards 0.
-   */
-  lower_mv_precision(best_mv, xd->allow_high_precision_mv);
-  lower_mv_precision(nearest, xd->allow_high_precision_mv);
-  lower_mv_precision(nearby, xd->allow_high_precision_mv);
-
-  // TODO: move clamp outside findnearmv
-  clamp_mv2(nearest, xd);
-  clamp_mv2(nearby, xd);
-  clamp_mv2(best_mv, xd);
-}
-
-vp9_prob *vp9_mv_ref_probs(VP9_COMMON *pc,
-                           vp9_prob p[VP9_MVREFS - 1],
-                           const int near_mv_ref_ct[4]) {
-  p[0] = pc->fc.vp8_mode_contexts[near_mv_ref_ct[0]][0];
-  p[1] = pc->fc.vp8_mode_contexts[near_mv_ref_ct[1]][1];
-  p[2] = pc->fc.vp8_mode_contexts[near_mv_ref_ct[2]][2];
-  p[3] = pc->fc.vp8_mode_contexts[near_mv_ref_ct[3]][3];
-  return p;
-}
-
-#if CONFIG_NEWBESTREFMV
-#define SP(x) (((x) & 7) << 1)
-unsigned int vp9_sad3x16_c(const unsigned char *src_ptr,
-                           int src_stride,
-                           const unsigned char *ref_ptr,
-                           int ref_stride,
-                           int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 3, 16);
-}
-
-unsigned int vp9_sad16x3_c(const unsigned char *src_ptr,
-                           int src_stride,
-                           const unsigned char *ref_ptr,
-                           int ref_stride,
-                           int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 3);
-}
-
-/* Check a list of motion vectors by SAD score, using a number of rows of
- * pixels above and a number of columns of pixels to the left, to select the
- * one with the best score to use as the reference motion vector.
- */
-void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
-                           unsigned char *ref_y_buffer,
-                           int ref_y_stride,
-                           int_mv *mvlist,
-                           int_mv *best_mv,
-                           int_mv *nearest,
-                           int_mv *near) {
-  int i, j;
-  unsigned char *above_src;
-  unsigned char *left_src;
-  unsigned char *above_ref;
-  unsigned char *left_ref;
-  int score;
-  int sse;
-  int ref_scores[MAX_MV_REFS] = {0};
-  int_mv sorted_mvs[MAX_MV_REFS];
-  int zero_seen = FALSE;
-
-  // Default all to 0,0 if nothing else available
-  best_mv->as_int = nearest->as_int = near->as_int = 0;
-  vpx_memset(sorted_mvs, 0, sizeof(sorted_mvs));
-
-#if CONFIG_SUBPELREFMV
-  above_src = xd->dst.y_buffer - xd->dst.y_stride * 2;
-  left_src  = xd->dst.y_buffer - 2;
-  above_ref = ref_y_buffer - ref_y_stride * 2;
-  left_ref  = ref_y_buffer - 2;
-#else
-  above_src = xd->dst.y_buffer - xd->dst.y_stride * 3;
-  left_src  = xd->dst.y_buffer - 3;
-  above_ref = ref_y_buffer - ref_y_stride * 3;
-  left_ref  = ref_y_buffer - 3;
-#endif
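-  /* With CONFIG_SUBPELREFMV the scoring border is 2 pixels wide and scored
-   * with sub-pel variance (16x2 above, 2x16 left); otherwise it is 3 pixels
-   * wide and scored with whole-pel SAD (16x3 above, 3x16 left). */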
-
-  // Limit the search to the 4 best predicted candidates (rather than all
-  // MAX_MV_REFS entries).
-  for (i = 0; i < 4; ++i) {
-    int_mv this_mv;
-    int offset = 0;
-    int row_offset, col_offset;
-
-    this_mv.as_int = mvlist[i].as_int;
-
-    // If we see a 0,0 vector for a second time we have reached the end of
-    // the list of valid candidate vectors.
-    if (!this_mv.as_int && zero_seen)
-      break;
-
-    zero_seen = zero_seen || !this_mv.as_int;
-
-    clamp_mv(&this_mv,
-             xd->mb_to_left_edge - LEFT_TOP_MARGIN + 24,
-             xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
-             xd->mb_to_top_edge - LEFT_TOP_MARGIN + 24,
-             xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
-#if CONFIG_SUBPELREFMV
-    row_offset = this_mv.as_mv.row >> 3;
-    col_offset = this_mv.as_mv.col >> 3;
-    offset = ref_y_stride * row_offset + col_offset;
-    score = 0;
-    if (xd->up_available) {
-      vp9_sub_pixel_variance16x2_c(above_ref + offset, ref_y_stride,
-                                   SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
-                                   above_src, xd->dst.y_stride, &sse);
-      score += sse;
-    }
-    if (xd->left_available) {
-      vp9_sub_pixel_variance2x16_c(left_ref + offset, ref_y_stride,
-                                   SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
-                                   left_src, xd->dst.y_stride, &sse);
-      score += sse;
-    }
-#else
-    row_offset = (this_mv.as_mv.row > 0) ?
-        ((this_mv.as_mv.row + 3) >> 3) : ((this_mv.as_mv.row + 4) >> 3);
-    col_offset = (this_mv.as_mv.col > 0) ?
-        ((this_mv.as_mv.col + 3) >> 3) : ((this_mv.as_mv.col + 4) >> 3);
-    offset = ref_y_stride * row_offset + col_offset;
-    score = 0;
-    if (xd->up_available) {
-      score += vp9_sad16x3(above_src, xd->dst.y_stride,
-                           above_ref + offset, ref_y_stride, INT_MAX);
-    }
-    if (xd->left_available) {
-      score += vp9_sad3x16(left_src, xd->dst.y_stride,
-                           left_ref + offset, ref_y_stride, INT_MAX);
-    }
-#endif
-    // Add the entry to our list and then resort the list on score.
-    ref_scores[i] = score;
-    sorted_mvs[i].as_int = this_mv.as_int;
-    j = i;
-    while (j > 0) {
-      if (ref_scores[j] < ref_scores[j-1]) {
-        ref_scores[j] = ref_scores[j-1];
-        sorted_mvs[j].as_int = sorted_mvs[j-1].as_int;
-        ref_scores[j-1] = score;
-        sorted_mvs[j-1].as_int = this_mv.as_int;
-        j--;
-      } else
-        break;
-    }
-  }
-
-  // Make sure all the candidates are properly clamped etc
-  for (i = 0; i < 4; ++i) {
-    lower_mv_precision(&sorted_mvs[i], xd->allow_high_precision_mv);
-    clamp_mv2(&sorted_mvs[i], xd);
-  }
-
-  // Set the best mv to the first entry in the sorted list
-  best_mv->as_int = sorted_mvs[0].as_int;
-
-  // Provided that there are non-zero vectors available, there will not be
-  // more than one 0,0 entry in the sorted list. The best ref mv is always
-  // set to the first entry (which gave the best result). The nearest is set
-  // to the first non-zero vector if available, and near to the second
-  // non-zero vector if available.
-  // We do not use 0,0 as a nearest or near as 0,0 has its own mode.
-  if (sorted_mvs[0].as_int) {
-    nearest->as_int = sorted_mvs[0].as_int;
-    if (sorted_mvs[1].as_int)
-      near->as_int = sorted_mvs[1].as_int;
-    else
-      near->as_int = sorted_mvs[2].as_int;
-  } else {
-    nearest->as_int = sorted_mvs[1].as_int;
-    near->as_int = sorted_mvs[2].as_int;
-  }
-
-  // Copy back the re-ordered mv list
-  vpx_memcpy(mvlist, sorted_mvs, sizeof(sorted_mvs));
-}
-
-#endif  // CONFIG_NEWBESTREFMV
--- a/vp8/common/findnearmv.h
+++ /dev/null
@@ -1,188 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_FINDNEARMV_H
-#define __INC_FINDNEARMV_H
-
-#include "mv.h"
-#include "blockd.h"
-#include "modecont.h"
-#include "treecoder.h"
-#include "onyxc_int.h"
-
-#if CONFIG_NEWBESTREFMV
-/* Check a list of motion vectors by SAD score, using a number of rows of
- * pixels above and a number of columns of pixels to the left, to select the
- * one with the best score to use as the reference motion vector.
- */
-void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
-                           unsigned char *ref_y_buffer,
-                           int ref_y_stride,
-                           int_mv *mvlist,
-                           int_mv *best_mv,
-                           int_mv *nearest,
-                           int_mv *near);
-#endif
-
-static void mv_bias(int refmb_ref_frame_sign_bias, int refframe,
-                    int_mv *mvp, const int *ref_frame_sign_bias) {
-  MV xmv;
-  xmv = mvp->as_mv;
-
-  if (refmb_ref_frame_sign_bias != ref_frame_sign_bias[refframe]) {
-    xmv.row *= -1;
-    xmv.col *= -1;
-  }
-
-  mvp->as_mv = xmv;
-}
-
-#define LEFT_TOP_MARGIN (16 << 3)
-#define RIGHT_BOTTOM_MARGIN (16 << 3)
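-/* Both margins are in 1/8th-pel units: (16 << 3) == 128, so a motion vector
- * may point at most 16 whole pixels beyond the corresponding frame edge. */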
-
-static void clamp_mv(int_mv *mv,
-                     int mb_to_left_edge,
-                     int mb_to_right_edge,
-                     int mb_to_top_edge,
-                     int mb_to_bottom_edge) {
-  mv->as_mv.col = (mv->as_mv.col < mb_to_left_edge) ?
-                  mb_to_left_edge : mv->as_mv.col;
-  mv->as_mv.col = (mv->as_mv.col > mb_to_right_edge) ?
-                  mb_to_right_edge : mv->as_mv.col;
-  mv->as_mv.row = (mv->as_mv.row < mb_to_top_edge) ?
-                  mb_to_top_edge : mv->as_mv.row;
-  mv->as_mv.row = (mv->as_mv.row > mb_to_bottom_edge) ?
-                  mb_to_bottom_edge : mv->as_mv.row;
-}
-
-static void clamp_mv2(int_mv *mv, const MACROBLOCKD *xd) {
-  clamp_mv(mv,
-           xd->mb_to_left_edge - LEFT_TOP_MARGIN,
-           xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
-           xd->mb_to_top_edge - LEFT_TOP_MARGIN,
-           xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
-}
-
-static unsigned int check_mv_bounds(int_mv *mv,
-                                    int mb_to_left_edge,
-                                    int mb_to_right_edge,
-                                    int mb_to_top_edge,
-                                    int mb_to_bottom_edge) {
-  return (mv->as_mv.col < mb_to_left_edge) ||
-         (mv->as_mv.col > mb_to_right_edge) ||
-         (mv->as_mv.row < mb_to_top_edge) ||
-         (mv->as_mv.row > mb_to_bottom_edge);
-}
-
-void vp9_find_near_mvs(MACROBLOCKD *xd,
-                       const MODE_INFO *here,
-                       const MODE_INFO *lfhere,
-                       int_mv *nearest, int_mv *nearby, int_mv *best,
-                       int near_mv_ref_cts[4],
-                       int refframe,
-                       int *ref_frame_sign_bias);
-
-vp9_prob *vp9_mv_ref_probs(VP9_COMMON *pc,
-                           vp9_prob p[VP9_MVREFS - 1],
-                           const int near_mv_ref_ct[4]);
-
-extern const unsigned char vp9_mbsplit_offset[4][16];
-
-static int left_block_mv(const MODE_INFO *cur_mb, int b) {
-  if (!(b & 3)) {
-    /* On L edge, get from MB to left of us */
-    --cur_mb;
-
-    if (cur_mb->mbmi.mode != SPLITMV)
-      return cur_mb->mbmi.mv[0].as_int;
-    b += 4;
-  }
-
-  return (cur_mb->bmi + b - 1)->as_mv.first.as_int;
-}
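-/* The subblock index b runs 0..15 in raster order within a macroblock, so
- * (b & 3) == 0 marks the leftmost column and (b >> 2) == 0 the top row; the
- * b += 4 / b += 16 adjustments in these helpers re-index into the last
- * column or row of the neighbouring macroblock's bmi array. */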
-
-static int left_block_second_mv(const MODE_INFO *cur_mb, int b) {
-  if (!(b & 3)) {
-    /* On L edge, get from MB to left of us */
-    --cur_mb;
-
-    if (cur_mb->mbmi.mode != SPLITMV)
-      return cur_mb->mbmi.second_ref_frame ? cur_mb->mbmi.mv[1].as_int
-                                           : cur_mb->mbmi.mv[0].as_int;
-    b += 4;
-  }
-
-  return cur_mb->mbmi.second_ref_frame ?
-         (cur_mb->bmi + b - 1)->as_mv.second.as_int :
-         (cur_mb->bmi + b - 1)->as_mv.first.as_int;
-}
-
-static int above_block_mv(const MODE_INFO *cur_mb, int b, int mi_stride) {
-  if (!(b >> 2)) {
-    /* On top edge, get from MB above us */
-    cur_mb -= mi_stride;
-
-    if (cur_mb->mbmi.mode != SPLITMV)
-      return cur_mb->mbmi.mv[0].as_int;
-    b += 16;
-  }
-
-  return (cur_mb->bmi + b - 4)->as_mv.first.as_int;
-}
-
-static int above_block_second_mv(const MODE_INFO *cur_mb, int b, int mi_stride) {
-  if (!(b >> 2)) {
-    /* On top edge, get from MB above us */
-    cur_mb -= mi_stride;
-
-    if (cur_mb->mbmi.mode != SPLITMV)
-      return cur_mb->mbmi.second_ref_frame ? cur_mb->mbmi.mv[1].as_int
-                                           : cur_mb->mbmi.mv[0].as_int;
-    b += 16;
-  }
-
-  return cur_mb->mbmi.second_ref_frame ?
-         (cur_mb->bmi + b - 4)->as_mv.second.as_int :
-         (cur_mb->bmi + b - 4)->as_mv.first.as_int;
-}
-
-static B_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) {
-  if (!(b & 3)) {
-    /* On L edge, get from MB to left of us */
-    --cur_mb;
-
-    if (cur_mb->mbmi.mode < I8X8_PRED) {
-      return pred_mode_conv(cur_mb->mbmi.mode);
-    } else if (cur_mb->mbmi.mode == I8X8_PRED) {
-      return pred_mode_conv((cur_mb->bmi + 3 + b)->as_mode.first);
-    } else if (cur_mb->mbmi.mode == B_PRED) {
-      return ((cur_mb->bmi + 3 + b)->as_mode.first);
-    } else {
-      return B_DC_PRED;
-    }
-  }
-  return (cur_mb->bmi + b - 1)->as_mode.first;
-}
-
-static B_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb,
-                                          int b, int mi_stride) {
-  if (!(b >> 2)) {
-    /* On top edge, get from MB above us */
-    cur_mb -= mi_stride;
-
-    if (cur_mb->mbmi.mode < I8X8_PRED) {
-      return pred_mode_conv(cur_mb->mbmi.mode);
-    } else if (cur_mb->mbmi.mode == I8X8_PRED) {
-      return pred_mode_conv((cur_mb->bmi + 12 + b)->as_mode.first);
-    } else if (cur_mb->mbmi.mode == B_PRED) {
-      return ((cur_mb->bmi + 12 + b)->as_mode.first);
-    } else {
-      return B_DC_PRED;
-    }
-  }
-
-  return (cur_mb->bmi + b - 4)->as_mode.first;
-}
-
-#endif
--- a/vp8/common/generic/systemdependent.c
+++ /dev/null
@@ -1,87 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "vpx_rtcd.h"
-#include "vp8/common/subpixel.h"
-#include "vp8/common/loopfilter.h"
-#include "vp8/common/idct.h"
-#include "vp8/common/onyxc_int.h"
-
-extern void vp9_arch_x86_common_init(VP9_COMMON *ctx);
-extern void vp9_arch_arm_common_init(VP9_COMMON *ctx);
-
-void vp9_machine_specific_config(VP9_COMMON *ctx) {
-#if CONFIG_RUNTIME_CPU_DETECT
-  VP9_COMMON_RTCD *rtcd = &ctx->rtcd;
-
-  rtcd->idct.idct1        = vp9_short_idct4x4llm_1_c;
-  rtcd->idct.idct16       = vp9_short_idct4x4llm_c;
-  rtcd->idct.idct1_scalar_add = vp9_dc_only_idct_add_c;
-  rtcd->idct.iwalsh1      = vp9_short_inv_walsh4x4_1_c;
-  rtcd->idct.iwalsh16     = vp9_short_inv_walsh4x4_c;
-  rtcd->idct.idct8        = vp9_short_idct8x8_c;
-  rtcd->idct.idct1_scalar_add_8x8 = vp9_dc_only_idct_add_8x8_c;
-  rtcd->idct.ihaar2       = vp9_short_ihaar2x2_c;
-  rtcd->idct.idct16x16    = vp9_short_idct16x16_c;
-
-  rtcd->subpix.eighttap16x16       = vp9_eighttap_predict16x16_c;
-  rtcd->subpix.eighttap8x8         = vp9_eighttap_predict8x8_c;
-  rtcd->subpix.eighttap_avg16x16   = vp9_eighttap_predict_avg16x16_c;
-  rtcd->subpix.eighttap_avg8x8     = vp9_eighttap_predict_avg8x8_c;
-  rtcd->subpix.eighttap_avg4x4     = vp9_eighttap_predict_avg4x4_c;
-  rtcd->subpix.eighttap8x4         = vp9_eighttap_predict8x4_c;
-  rtcd->subpix.eighttap4x4         = vp9_eighttap_predict_c;
-  rtcd->subpix.eighttap16x16_sharp     = vp9_eighttap_predict16x16_sharp_c;
-  rtcd->subpix.eighttap8x8_sharp       = vp9_eighttap_predict8x8_sharp_c;
-  rtcd->subpix.eighttap_avg16x16_sharp = vp9_eighttap_predict_avg16x16_sharp_c;
-  rtcd->subpix.eighttap_avg8x8_sharp   = vp9_eighttap_predict_avg8x8_sharp_c;
-  rtcd->subpix.eighttap_avg4x4_sharp   = vp9_eighttap_predict_avg4x4_sharp_c;
-  rtcd->subpix.eighttap8x4_sharp       = vp9_eighttap_predict8x4_sharp_c;
-  rtcd->subpix.eighttap4x4_sharp       = vp9_eighttap_predict_sharp_c;
-
-  rtcd->subpix.sixtap16x16       = vp9_sixtap_predict16x16_c;
-  rtcd->subpix.sixtap8x8         = vp9_sixtap_predict8x8_c;
-  rtcd->subpix.sixtap_avg16x16   = vp9_sixtap_predict_avg16x16_c;
-  rtcd->subpix.sixtap_avg8x8     = vp9_sixtap_predict_avg8x8_c;
-  rtcd->subpix.sixtap8x4         = vp9_sixtap_predict8x4_c;
-  rtcd->subpix.sixtap4x4         = vp9_sixtap_predict_c;
-  rtcd->subpix.sixtap_avg4x4     = vp9_sixtap_predict_avg_c;
-  rtcd->subpix.bilinear16x16     = vp9_bilinear_predict16x16_c;
-  rtcd->subpix.bilinear8x8       = vp9_bilinear_predict8x8_c;
-  rtcd->subpix.bilinear_avg16x16 = vp9_bilinear_predict_avg16x16_c;
-  rtcd->subpix.bilinear_avg8x8   = vp9_bilinear_predict_avg8x8_c;
-  rtcd->subpix.bilinear8x4       = vp9_bilinear_predict8x4_c;
-  rtcd->subpix.bilinear4x4       = vp9_bilinear_predict4x4_c;
-  rtcd->subpix.bilinear_avg4x4   = vp9_bilinear_predict_avg4x4_c;
-
-#if CONFIG_POSTPROC || (CONFIG_VP8_ENCODER && CONFIG_INTERNAL_STATS)
-  rtcd->postproc.down             = vp9_mbpost_proc_down_c;
-  rtcd->postproc.across           = vp9_mbpost_proc_across_ip_c;
-  rtcd->postproc.downacross       = vp9_post_proc_down_and_across_c;
-  rtcd->postproc.addnoise         = vp9_plane_add_noise_c;
-  rtcd->postproc.blend_mb_inner   = vp9_blend_mb_inner_c;
-  rtcd->postproc.blend_mb_outer   = vp9_blend_mb_outer_c;
-  rtcd->postproc.blend_b          = vp9_blend_b_c;
-#endif
-
-#endif
-
-#if ARCH_X86 || ARCH_X86_64
-  vp9_arch_x86_common_init(ctx);
-#endif
-
-#if ARCH_ARM
-  vp9_arch_arm_common_init(ctx);
-#endif
-
-  vpx_rtcd();
-}
--- a/vp8/common/header.h
+++ /dev/null
@@ -1,42 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_HEADER_H
-#define __INC_HEADER_H
-
-/* 24 bits total */
-typedef struct {
-  unsigned int type: 1;
-  unsigned int version: 3;
-  unsigned int show_frame: 1;
-
-  /* Allow 2^19 bytes = 4 megabits for the first partition */
-
-  unsigned int first_partition_length_in_bytes: 19;
-
-#ifdef PACKET_TESTING
-  unsigned int frame_number;
-  unsigned int update_gold: 1;
-  unsigned int uses_gold: 1;
-  unsigned int update_last: 1;
-  unsigned int uses_last: 1;
-#endif
-
-} VP9_HEADER;
-
-#ifdef PACKET_TESTING
-#define VP9_HEADER_SIZE 8
-#else
-#define VP9_HEADER_SIZE 3
-#endif
-
-
-#endif
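The three bytes of VP9_HEADER_SIZE hold exactly the 1 + 3 + 1 + 19 = 24 bits declared in the struct above. A minimal packing sketch (the helper name and LSB-first bit order are illustrative assumptions, not taken from the bitstream spec):

    #include <stdint.h>

    /* Illustrative only: pack type(1) | version(3) | show_frame(1) |
     * first_partition_length_in_bytes(19) into 3 bytes, LSB first. */
    static void pack_frame_header(uint8_t out[3], unsigned type,
                                  unsigned version, unsigned show_frame,
                                  unsigned first_part_len) {
      uint32_t bits = (type & 1) |
                      ((version & 7) << 1) |
                      ((show_frame & 1) << 4) |
                      ((first_part_len & 0x7FFFFu) << 5);
      out[0] = bits & 0xFF;
      out[1] = (bits >> 8) & 0xFF;
      out[2] = (bits >> 16) & 0xFF;
    }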
--- a/vp8/common/idct.h
+++ /dev/null
@@ -1,144 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_IDCT_H
-#define __INC_IDCT_H
-
-#include "vp8/common/blockd.h"
-
-#define prototype_second_order(sym) \
-  void sym(short *input, short *output)
-
-#define prototype_idct(sym) \
-  void sym(short *input, short *output, int pitch)
-
-#define prototype_idct_scalar_add(sym) \
-  void sym(short input, \
-           unsigned char *pred, unsigned char *output, \
-           int pitch, int stride)
-
-#if ARCH_X86 || ARCH_X86_64
-#include "x86/idct_x86.h"
-#endif
-
-#ifdef _MSC_VER
-/* TODO: remove these after integer implementations are done */
-#define M_PI       3.14159265358979323846
-#define round(x) (((x)>0)? floor((x)+0.5): ceil((x)-0.5))
-#endif
-
-
-#if ARCH_ARM
-#include "arm/idct_arm.h"
-#endif
-
-#if CONFIG_LOSSLESS
-#define WHT_UPSCALE_FACTOR 3
-#define Y2_WHT_UPSCALE_FACTOR 2
-#endif
-
-#ifndef vp9_idct_idct16x16
-#define vp9_idct_idct16x16 vp9_short_idct16x16_c
-#endif
-extern prototype_idct(vp9_idct_idct16x16);
-
-#ifndef vp9_idct_idct8
-#define vp9_idct_idct8 vp9_short_idct8x8_c
-#endif
-extern prototype_idct(vp9_idct_idct8);
-
-#ifndef vp9_idct_idct8_1
-#define vp9_idct_idct8_1 vp9_short_idct8x8_1_c
-#endif
-extern prototype_idct(vp9_idct_idct8_1);
-
-#ifndef vp9_idct_ihaar2
-#define vp9_idct_ihaar2 vp9_short_ihaar2x2_c
-#endif
-extern prototype_idct(vp9_idct_ihaar2);
-
-#ifndef vp9_idct_ihaar2_1
-#define vp9_idct_ihaar2_1 vp9_short_ihaar2x2_1_c
-#endif
-extern prototype_idct(vp9_idct_ihaar2_1);
-
-#ifndef vp9_idct_idct1_scalar_add_8x8
-#define vp9_idct_idct1_scalar_add_8x8 vp9_dc_only_idct_add_8x8_c
-#endif
-extern prototype_idct_scalar_add(vp9_idct_idct1_scalar_add_8x8);
-
-
-
-#ifndef vp9_idct_idct1
-#define vp9_idct_idct1 vp9_short_idct4x4llm_1_c
-#endif
-extern prototype_idct(vp9_idct_idct1);
-
-#ifndef vp9_idct_idct16
-#define vp9_idct_idct16 vp9_short_idct4x4llm_c
-#endif
-extern prototype_idct(vp9_idct_idct16);
-
-#ifndef vp9_idct_idct1_scalar_add
-#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_c
-#endif
-extern prototype_idct_scalar_add(vp9_idct_idct1_scalar_add);
-
-
-#ifndef vp9_idct_iwalsh1
-#define vp9_idct_iwalsh1 vp9_short_inv_walsh4x4_1_c
-#endif
-extern prototype_second_order(vp9_idct_iwalsh1);
-
-#ifndef vp9_idct_iwalsh16
-#define vp9_idct_iwalsh16 vp9_short_inv_walsh4x4_c
-#endif
-extern prototype_second_order(vp9_idct_iwalsh16);
-
-#if CONFIG_LOSSLESS
-extern prototype_idct(vp9_short_inv_walsh4x4_x8_c);
-extern prototype_idct(vp9_short_inv_walsh4x4_1_x8_c);
-extern prototype_idct_scalar_add(vp9_dc_only_inv_walsh_add_c);
-extern prototype_second_order(vp9_short_inv_walsh4x4_lossless_c);
-extern prototype_second_order(vp9_short_inv_walsh4x4_1_lossless_c);
-#endif
-
-void vp9_ihtllm_c(const int16_t *input, int16_t *output, int pitch,
-                  TX_TYPE tx_type, int tx_dim);
-
-typedef prototype_idct((*vp9_idct_fn_t));
-typedef prototype_idct_scalar_add((*vp9_idct_scalar_add_fn_t));
-typedef prototype_second_order((*vp9_second_order_fn_t));
-
-typedef struct {
-  vp9_idct_fn_t            idct1;
-  vp9_idct_fn_t            idct16;
-  vp9_idct_scalar_add_fn_t idct1_scalar_add;
-
-  vp9_second_order_fn_t iwalsh1;
-  vp9_second_order_fn_t iwalsh16;
-
-  vp9_idct_fn_t            idct8;
-  vp9_idct_fn_t            idct8_1;
-  vp9_idct_scalar_add_fn_t idct1_scalar_add_8x8;
-  vp9_idct_fn_t ihaar2;
-  vp9_idct_fn_t ihaar2_1;
-
-  vp9_idct_fn_t            idct16x16;
-} vp9_idct_rtcd_vtable_t;
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define IDCT_INVOKE(ctx,fn) (ctx)->fn
-#else
-#define IDCT_INVOKE(ctx,fn) vp9_idct_##fn
-#endif
-
-#endif
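A hedged example of how the macro is consumed (the wrapper name is illustrative; real call sites appear in invtrans.c later in this patch):

    /* With CONFIG_RUNTIME_CPU_DETECT, IDCT_INVOKE(rtcd, idct16)(...) reads
     * the function pointer rtcd->idct16; otherwise the macro pastes
     * vp9_idct_idct16, which the defaults above resolve to
     * vp9_short_idct4x4llm_c. */
    static void inverse_4x4_example(const vp9_idct_rtcd_vtable_t *rtcd,
                                    short *dqcoeff, short *diff) {
      IDCT_INVOKE(rtcd, idct16)(dqcoeff, diff, 32);
    }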
--- a/vp8/common/idctllm.c
+++ /dev/null
@@ -1,1275 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/****************************************************************************
- * Notes:
- *
- * This implementation makes use of 16 bit fixed point versions of two
- * multiply constants:
- *         1.   sqrt(2) * cos (pi/8)
- *         2.   sqrt(2) * sin (pi/8)
- * Because the first constant is bigger than 1, to maintain the same 16 bit
- * fixed point precision as the second one, we use the trick
- *         x * a = x + x*(a-1)
- * so
- *         x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
- **************************************************************************/
-#include <assert.h>
-#include <math.h>
-#include "vpx_ports/config.h"
-#include "vp8/common/idct.h"
-#include "vp8/common/systemdependent.h"
-
-#include "vp8/common/blockd.h"
-
-static const int cospi8sqrt2minus1 = 20091;
-static const int sinpi8sqrt2      = 35468;
-static const int rounding = 0;
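As a quick sanity check of the two Q16 constants above (a standalone sketch, assuming a POSIX M_PI):

    #include <math.h>
    #include <stdio.h>

    int main(void) {
      double c = sqrt(2.0) * cos(M_PI / 8) - 1.0;  /* ~0.3065630 */
      double s = sqrt(2.0) * sin(M_PI / 8);        /* ~0.5411961 */

      /* prints "20091 35468", matching the constants above */
      printf("%d %d\n", (int)(c * 65536 + 0.5), (int)(s * 65536 + 0.5));
      return 0;
    }

The trick from the notes then computes x * sqrt(2) * cos(pi/8) as x + ((x * 20091) >> 16), staying within the 16 bit fixed point precision.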
-
-// TODO: these transforms can be further converted into integer forms
-//       for complexity optimization
-static const float idct_4[16] = {
-  0.500000000000000,   0.653281482438188,   0.500000000000000,   0.270598050073099,
-  0.500000000000000,   0.270598050073099,  -0.500000000000000,  -0.653281482438188,
-  0.500000000000000,  -0.270598050073099,  -0.500000000000000,   0.653281482438188,
-  0.500000000000000,  -0.653281482438188,   0.500000000000000,  -0.270598050073099
-};
-
-static const float iadst_4[16] = {
-  0.228013428883779,   0.577350269189626,   0.656538502008139,   0.428525073124360,
-  0.428525073124360,   0.577350269189626,  -0.228013428883779,  -0.656538502008139,
-  0.577350269189626,                   0,  -0.577350269189626,   0.577350269189626,
-  0.656538502008139,  -0.577350269189626,   0.428525073124359,  -0.228013428883779
-};
-
-static const float idct_8[64] = {
-  0.353553390593274,   0.490392640201615,   0.461939766255643,   0.415734806151273,
-  0.353553390593274,   0.277785116509801,   0.191341716182545,   0.097545161008064,
-  0.353553390593274,   0.415734806151273,   0.191341716182545,  -0.097545161008064,
- -0.353553390593274,  -0.490392640201615,  -0.461939766255643,  -0.277785116509801,
-  0.353553390593274,   0.277785116509801,  -0.191341716182545,  -0.490392640201615,
- -0.353553390593274,   0.097545161008064,   0.461939766255643,   0.415734806151273,
-  0.353553390593274,   0.097545161008064,  -0.461939766255643,  -0.277785116509801,
-  0.353553390593274,   0.415734806151273,  -0.191341716182545,  -0.490392640201615,
-  0.353553390593274,  -0.097545161008064,  -0.461939766255643,   0.277785116509801,
-  0.353553390593274,  -0.415734806151273,  -0.191341716182545,   0.490392640201615,
-  0.353553390593274,  -0.277785116509801,  -0.191341716182545,   0.490392640201615,
- -0.353553390593274,  -0.097545161008064,   0.461939766255643,  -0.415734806151273,
-  0.353553390593274,  -0.415734806151273,   0.191341716182545,   0.097545161008064,
- -0.353553390593274,   0.490392640201615,  -0.461939766255643,   0.277785116509801,
-  0.353553390593274,  -0.490392640201615,   0.461939766255643,  -0.415734806151273,
-  0.353553390593274,  -0.277785116509801,   0.191341716182545,  -0.097545161008064
-};
-
-static const float iadst_8[64] = {
-  0.089131608307533,   0.255357107325376,   0.387095214016349,   0.466553967085785,
-  0.483002021635509,   0.434217976756762,   0.326790388032145,   0.175227946595735,
-  0.175227946595735,   0.434217976756762,   0.466553967085785,   0.255357107325376,
- -0.089131608307533,  -0.387095214016348,  -0.483002021635509,  -0.326790388032145,
-  0.255357107325376,   0.483002021635509,   0.175227946595735,  -0.326790388032145,
- -0.466553967085785,  -0.089131608307533,   0.387095214016349,   0.434217976756762,
-  0.326790388032145,   0.387095214016349,  -0.255357107325376,  -0.434217976756762,
-  0.175227946595735,   0.466553967085786,  -0.089131608307534,  -0.483002021635509,
-  0.387095214016349,   0.175227946595735,  -0.483002021635509,   0.089131608307533,
-  0.434217976756762,  -0.326790388032145,  -0.255357107325377,   0.466553967085785,
-  0.434217976756762,  -0.089131608307533,  -0.326790388032145,   0.483002021635509,
- -0.255357107325376,  -0.175227946595735,   0.466553967085785,  -0.387095214016348,
-  0.466553967085785,  -0.326790388032145,   0.089131608307533,   0.175227946595735,
- -0.387095214016348,   0.483002021635509,  -0.434217976756762,   0.255357107325376,
-  0.483002021635509,  -0.466553967085785,   0.434217976756762,  -0.387095214016348,
-  0.326790388032145,  -0.255357107325375,   0.175227946595736,  -0.089131608307532
-};
-
-static const int16_t idct_i4[16] = {
-  8192,  10703,  8192,   4433,
-  8192,   4433, -8192, -10703,
-  8192,  -4433, -8192,  10703,
-  8192, -10703,  8192,  -4433
-};
-
-static const int16_t iadst_i4[16] = {
-   3736,  9459, 10757,   7021,
-   7021,  9459, -3736, -10757,
-   9459,     0, -9459,   9459,
-  10757, -9459,  7021,  -3736
-};
-
-static const int16_t idct_i8[64] = {
-   5793,  8035,  7568,  6811,
-   5793,  4551,  3135,  1598,
-   5793,  6811,  3135, -1598,
-  -5793, -8035, -7568, -4551,
-   5793,  4551, -3135, -8035,
-  -5793,  1598,  7568,  6811,
-   5793,  1598, -7568, -4551,
-   5793,  6811, -3135, -8035,
-   5793, -1598, -7568,  4551,
-   5793, -6811, -3135,  8035,
-   5793, -4551, -3135,  8035,
-  -5793, -1598,  7568, -6811,
-   5793, -6811,  3135,  1598,
-  -5793,  8035, -7568,  4551,
-   5793, -8035,  7568, -6811,
-   5793, -4551,  3135, -1598
-};
-
-static const int16_t iadst_i8[64] = {
-   1460,  4184,  6342,  7644,
-   7914,  7114,  5354,  2871,
-   2871,  7114,  7644,  4184,
-  -1460, -6342, -7914, -5354,
-   4184,  7914,  2871, -5354,
-  -7644, -1460,  6342,  7114,
-   5354,  6342, -4184, -7114,
-   2871,  7644, -1460, -7914,
-   6342,  2871, -7914,  1460,
-   7114, -5354, -4184,  7644,
-   7114, -1460, -5354,  7914,
-  -4184, -2871,  7644, -6342,
-   7644, -5354,  1460,  2871,
-  -6342,  7914, -7114,  4184,
-   7914, -7644,  7114, -6342,
-   5354, -4184,  2871, -1460
-};
-
-static float idct_16[256] = {
-  0.250000,  0.351851,  0.346760,  0.338330,  0.326641,  0.311806,  0.293969,  0.273300,
-  0.250000,  0.224292,  0.196424,  0.166664,  0.135299,  0.102631,  0.068975,  0.034654,
-  0.250000,  0.338330,  0.293969,  0.224292,  0.135299,  0.034654, -0.068975, -0.166664,
- -0.250000, -0.311806, -0.346760, -0.351851, -0.326641, -0.273300, -0.196424, -0.102631,
-  0.250000,  0.311806,  0.196424,  0.034654, -0.135299, -0.273300, -0.346760, -0.338330,
- -0.250000, -0.102631,  0.068975,  0.224292,  0.326641,  0.351851,  0.293969,  0.166664,
-  0.250000,  0.273300,  0.068975, -0.166664, -0.326641, -0.338330, -0.196424,  0.034654,
-  0.250000,  0.351851,  0.293969,  0.102631, -0.135299, -0.311806, -0.346760, -0.224292,
-  0.250000,  0.224292, -0.068975, -0.311806, -0.326641, -0.102631,  0.196424,  0.351851,
-  0.250000, -0.034654, -0.293969, -0.338330, -0.135299,  0.166664,  0.346760,  0.273300,
-  0.250000,  0.166664, -0.196424, -0.351851, -0.135299,  0.224292,  0.346760,  0.102631,
- -0.250000, -0.338330, -0.068975,  0.273300,  0.326641,  0.034654, -0.293969, -0.311806,
-  0.250000,  0.102631, -0.293969, -0.273300,  0.135299,  0.351851,  0.068975, -0.311806,
- -0.250000,  0.166664,  0.346760,  0.034654, -0.326641, -0.224292,  0.196424,  0.338330,
-  0.250000,  0.034654, -0.346760, -0.102631,  0.326641,  0.166664, -0.293969, -0.224292,
-  0.250000,  0.273300, -0.196424, -0.311806,  0.135299,  0.338330, -0.068975, -0.351851,
-  0.250000, -0.034654, -0.346760,  0.102631,  0.326641, -0.166664, -0.293969,  0.224292,
-  0.250000, -0.273300, -0.196424,  0.311806,  0.135299, -0.338330, -0.068975,  0.351851,
-  0.250000, -0.102631, -0.293969,  0.273300,  0.135299, -0.351851,  0.068975,  0.311806,
- -0.250000, -0.166664,  0.346760, -0.034654, -0.326641,  0.224292,  0.196424, -0.338330,
-  0.250000, -0.166664, -0.196424,  0.351851, -0.135299, -0.224292,  0.346760, -0.102631,
- -0.250000,  0.338330, -0.068975, -0.273300,  0.326641, -0.034654, -0.293969,  0.311806,
-  0.250000, -0.224292, -0.068975,  0.311806, -0.326641,  0.102631,  0.196424, -0.351851,
-  0.250000,  0.034654, -0.293969,  0.338330, -0.135299, -0.166664,  0.346760, -0.273300,
-  0.250000, -0.273300,  0.068975,  0.166664, -0.326641,  0.338330, -0.196424, -0.034654,
-  0.250000, -0.351851,  0.293969, -0.102631, -0.135299,  0.311806, -0.346760,  0.224292,
-  0.250000, -0.311806,  0.196424, -0.034654, -0.135299,  0.273300, -0.346760,  0.338330,
- -0.250000,  0.102631,  0.068975, -0.224292,  0.326641, -0.351851,  0.293969, -0.166664,
-  0.250000, -0.338330,  0.293969, -0.224292,  0.135299, -0.034654, -0.068975,  0.166664,
- -0.250000,  0.311806, -0.346760,  0.351851, -0.326641,  0.273300, -0.196424,  0.102631,
-  0.250000, -0.351851,  0.346760, -0.338330,  0.326641, -0.311806,  0.293969, -0.273300,
-  0.250000, -0.224292,  0.196424, -0.166664,  0.135299, -0.102631,  0.068975, -0.034654
-};
-
-static float iadst_16[256] = {
-  0.033094,  0.098087,  0.159534,  0.215215,  0.263118,  0.301511,  0.329007,  0.344612,
-  0.347761,  0.338341,  0.316693,  0.283599,  0.240255,  0.188227,  0.129396,  0.065889,
-  0.065889,  0.188227,  0.283599,  0.338341,  0.344612,  0.301511,  0.215215,  0.098087,
- -0.033094, -0.159534, -0.263118, -0.329007, -0.347761, -0.316693, -0.240255, -0.129396,
-  0.098087,  0.263118,  0.344612,  0.316693,  0.188227,  0.000000, -0.188227, -0.316693,
- -0.344612, -0.263118, -0.098087,  0.098087,  0.263118,  0.344612,  0.316693,  0.188227,
-  0.129396,  0.316693,  0.329007,  0.159534, -0.098087, -0.301511, -0.338341, -0.188227,
-  0.065889,  0.283599,  0.344612,  0.215215, -0.033094, -0.263118, -0.347761, -0.240255,
-  0.159534,  0.344612,  0.240255, -0.065889, -0.316693, -0.301511, -0.033094,  0.263118,
-  0.338341,  0.129396, -0.188227, -0.347761, -0.215215,  0.098087,  0.329007,  0.283599,
-  0.188227,  0.344612,  0.098087, -0.263118, -0.316693, -0.000000,  0.316693,  0.263118,
- -0.098087, -0.344612, -0.188227,  0.188227,  0.344612,  0.098087, -0.263118, -0.316693,
-  0.215215,  0.316693, -0.065889, -0.347761, -0.098087,  0.301511,  0.240255, -0.188227,
- -0.329007,  0.033094,  0.344612,  0.129396, -0.283599, -0.263118,  0.159534,  0.338341,
-  0.240255,  0.263118, -0.215215, -0.283599,  0.188227,  0.301511, -0.159534, -0.316693,
-  0.129396,  0.329007, -0.098087, -0.338341,  0.065889,  0.344612, -0.033094, -0.347761,
-  0.263118,  0.188227, -0.316693, -0.098087,  0.344612,  0.000000, -0.344612,  0.098087,
-  0.316693, -0.188227, -0.263118,  0.263118,  0.188227, -0.316693, -0.098087,  0.344612,
-  0.283599,  0.098087, -0.347761,  0.129396,  0.263118, -0.301511, -0.065889,  0.344612,
- -0.159534, -0.240255,  0.316693,  0.033094, -0.338341,  0.188227,  0.215215, -0.329007,
-  0.301511,  0.000000, -0.301511,  0.301511,  0.000000, -0.301511,  0.301511,  0.000000,
- -0.301511,  0.301511,  0.000000, -0.301511,  0.301511,  0.000000, -0.301511,  0.301511,
-  0.316693, -0.098087, -0.188227,  0.344612, -0.263118, -0.000000,  0.263118, -0.344612,
-  0.188227,  0.098087, -0.316693,  0.316693, -0.098087, -0.188227,  0.344612, -0.263118,
-  0.329007, -0.188227, -0.033094,  0.240255, -0.344612,  0.301511, -0.129396, -0.098087,
-  0.283599, -0.347761,  0.263118, -0.065889, -0.159534,  0.316693, -0.338341,  0.215215,
-  0.338341, -0.263118,  0.129396,  0.033094, -0.188227,  0.301511, -0.347761,  0.316693,
- -0.215215,  0.065889,  0.098087, -0.240255,  0.329007, -0.344612,  0.283599, -0.159534,
-  0.344612, -0.316693,  0.263118, -0.188227,  0.098087,  0.000000, -0.098087,  0.188227,
- -0.263118,  0.316693, -0.344612,  0.344612, -0.316693,  0.263118, -0.188227,  0.098087,
-  0.347761, -0.344612,  0.338341, -0.329007,  0.316693, -0.301511,  0.283599, -0.263118,
-  0.240255, -0.215215,  0.188227, -0.159534,  0.129396, -0.098087,  0.065889, -0.033094
-};
-
-static const int16_t idct_i16[256] = {
-   4096,  5765,  5681,  5543,  5352,  5109,  4816,  4478,
-   4096,  3675,  3218,  2731,  2217,  1682,  1130,   568,
-   4096,  5543,  4816,  3675,  2217,   568, -1130, -2731,
-  -4096, -5109, -5681, -5765, -5352, -4478, -3218, -1682,
-   4096,  5109,  3218,   568, -2217, -4478, -5681, -5543,
-  -4096, -1682,  1130,  3675,  5352,  5765,  4816,  2731,
-   4096,  4478,  1130, -2731, -5352, -5543, -3218,   568,
-   4096,  5765,  4816,  1682, -2217, -5109, -5681, -3675,
-   4096,  3675, -1130, -5109, -5352, -1682,  3218,  5765,
-   4096,  -568, -4816, -5543, -2217,  2731,  5681,  4478,
-   4096,  2731, -3218, -5765, -2217,  3675,  5681,  1682,
-  -4096, -5543, -1130,  4478,  5352,   568, -4816, -5109,
-   4096,  1682, -4816, -4478,  2217,  5765,  1130, -5109,
-  -4096,  2731,  5681,   568, -5352, -3675,  3218,  5543,
-   4096,   568, -5681, -1682,  5352,  2731, -4816, -3675,
-   4096,  4478, -3218, -5109,  2217,  5543, -1130, -5765,
-   4096,  -568, -5681,  1682,  5352, -2731, -4816,  3675,
-   4096, -4478, -3218,  5109,  2217, -5543, -1130,  5765,
-   4096, -1682, -4816,  4478,  2217, -5765,  1130,  5109,
-  -4096, -2731,  5681,  -568, -5352,  3675,  3218, -5543,
-   4096, -2731, -3218,  5765, -2217, -3675,  5681, -1682,
-  -4096,  5543, -1130, -4478,  5352,  -568, -4816,  5109,
-   4096, -3675, -1130,  5109, -5352,  1682,  3218, -5765,
-   4096,   568, -4816,  5543, -2217, -2731,  5681, -4478,
-   4096, -4478,  1130,  2731, -5352,  5543, -3218,  -568,
-   4096, -5765,  4816, -1682, -2217,  5109, -5681,  3675,
-   4096, -5109,  3218,  -568, -2217,  4478, -5681,  5543,
-  -4096,  1682,  1130, -3675,  5352, -5765,  4816, -2731,
-   4096, -5543,  4816, -3675,  2217,  -568, -1130,  2731,
-  -4096,  5109, -5681,  5765, -5352,  4478, -3218,  1682,
-   4096, -5765,  5681, -5543,  5352, -5109,  4816, -4478,
-   4096, -3675,  3218, -2731,  2217, -1682,  1130,  -568
-};
-
-static const int16_t iadst_i16[256] = {
-    542,  1607,  2614,  3526,  4311,  4940,  5390,  5646,
-   5698,  5543,  5189,  4646,  3936,  3084,  2120,  1080,
-   1080,  3084,  4646,  5543,  5646,  4940,  3526,  1607,
-   -542, -2614, -4311, -5390, -5698, -5189, -3936, -2120,
-   1607,  4311,  5646,  5189,  3084,     0, -3084, -5189,
-  -5646, -4311, -1607,  1607,  4311,  5646,  5189,  3084,
-   2120,  5189,  5390,  2614, -1607, -4940, -5543, -3084,
-   1080,  4646,  5646,  3526, -542,  -4311, -5698, -3936,
-   2614,  5646,  3936, -1080, -5189, -4940,  -542,  4311,
-   5543,  2120, -3084, -5698, -3526,  1607,  5390,  4646,
-   3084,  5646,  1607, -4311, -5189,     0,  5189,  4311,
-  -1607, -5646, -3084,  3084,  5646,  1607, -4311, -5189,
-   3526,  5189, -1080, -5698, -1607,  4940,  3936, -3084,
-  -5390,   542,  5646,  2120, -4646, -4311,  2614,  5543,
-   3936,  4311, -3526, -4646,  3084,  4940, -2614, -5189,
-   2120,  5390, -1607, -5543,  1080,  5646,  -542, -5698,
-   4311,  3084, -5189, -1607,  5646,     0, -5646,  1607,
-   5189, -3084, -4311,  4311,  3084, -5189, -1607,  5646,
-   4646,  1607, -5698,  2120,  4311, -4940, -1080,  5646,
-  -2614, -3936,  5189,   542, -5543,  3084,  3526, -5390,
-   4940,     0, -4940,  4940,     0, -4940,  4940,     0,
-  -4940,  4940,     0, -4940,  4940,     0, -4940,  4940,
-   5189, -1607, -3084,  5646, -4311,     0,  4311, -5646,
-   3084,  1607, -5189,  5189, -1607, -3084,  5646, -4311,
-   5390, -3084,  -542,  3936, -5646,  4940, -2120, -1607,
-   4646, -5698,  4311, -1080, -2614,  5189, -5543,  3526,
-   5543, -4311,  2120,   542, -3084,  4940, -5698,  5189,
-  -3526,  1080,  1607, -3936,  5390, -5646,  4646, -2614,
-   5646, -5189,  4311, -3084,  1607,     0, -1607,  3084,
-  -4311,  5189, -5646,  5646, -5189,  4311, -3084,  1607,
-   5698, -5646,  5543, -5390,  5189, -4940,  4646, -4311,
-   3936, -3526,  3084, -2614,  2120, -1607,  1080,  -542
-};
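The int16_t tables above appear to be the float tables scaled by 2^14 and rounded to nearest, e.g. 0.353553390593274 * 16384 ~ 5793 and 0.653281482438188 * 16384 ~ 10703, which is consistent with VERTICAL_SHIFT being 14 below. A generator sketch under that assumption (not the actual tool used):

    #include <math.h>
    #include <stdint.h>

    /* Assumed relationship: int table = round(float table * 2^14), so
     * quantize_table(idct_4, out, 4) reproduces idct_i4. */
    static void quantize_table(const float *src, int16_t *dst, int dim) {
      int i;

      for (i = 0; i < dim * dim; i++)
        dst[i] = (int16_t)floor(src[i] * 16384.0 + 0.5);
    }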
-
-/* For test */
-#define TEST_INT 1
-#if TEST_INT
-#define vp9_ihtllm_int_c vp9_ihtllm_c
-#else
-#define vp9_ihtllm_float_c vp9_ihtllm_c
-#endif
-
-void vp9_ihtllm_float_c(const int16_t *input, int16_t *output, int pitch,
-                  TX_TYPE tx_type, int tx_dim) {
-  vp9_clear_system_state();  // Make it simd safe : __asm emms;
-  {
-    int i, j, k;
-    float bufa[256], bufb[256];  // buffers are for floating-point testing;
-                                 // the implementation could be simplified in
-                                 // conjunction with the integer transform
-    const int16_t *ip = input;
-    int16_t *op = output;
-    int shortpitch = pitch >> 1;
-
-    float *pfa = &bufa[0];
-    float *pfb = &bufb[0];
-
-    // pointers to vertical and horizontal transforms
-    const float *ptv, *pth;
-
-    assert(tx_type != DCT_DCT);
-    // load and convert residual array into floating-point
-    for(j = 0; j < tx_dim; j++) {
-      for(i = 0; i < tx_dim; i++) {
-        pfa[i] = (float)ip[i];
-      }
-      pfa += tx_dim;
-      ip  += tx_dim;
-    }
-
-    // vertical transformation
-    pfa = &bufa[0];
-    pfb = &bufb[0];
-
-    switch(tx_type) {
-      case ADST_ADST :
-      case ADST_DCT  :
-        ptv = (tx_dim == 4) ? &iadst_4[0] :
-                              ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);
-        break;
-
-      default :
-        ptv = (tx_dim == 4) ? &idct_4[0] :
-                              ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
-        break;
-    }
-
-    for(j = 0; j < tx_dim; j++) {
-      for(i = 0; i < tx_dim; i++) {
-        pfb[i] = 0;
-        for(k = 0; k < tx_dim; k++) {
-          pfb[i] += ptv[k] * pfa[(k * tx_dim)];
-        }
-        pfa += 1;
-      }
-
-      pfb += tx_dim;
-      ptv += tx_dim;
-      pfa = &bufa[0];
-    }
-
-    // horizontal transformation
-    pfa = &bufa[0];
-    pfb = &bufb[0];
-
-    switch(tx_type) {
-      case ADST_ADST :
-      case  DCT_ADST :
-        pth = (tx_dim == 4) ? &iadst_4[0] :
-                              ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);
-        break;
-
-      default :
-        pth = (tx_dim == 4) ? &idct_4[0] :
-                              ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
-        break;
-    }
-
-    for(j = 0; j < tx_dim; j++) {
-      for(i = 0; i < tx_dim; i++) {
-        pfa[i] = 0;
-        for(k = 0; k < tx_dim; k++) {
-          pfa[i] += pfb[k] * pth[k];
-        }
-        pth += tx_dim;
-      }
-
-      pfa += tx_dim;
-      pfb += tx_dim;
-
-      switch(tx_type) {
-        case ADST_ADST :
-        case  DCT_ADST :
-          pth = (tx_dim == 4) ? &iadst_4[0] :
-                                ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);
-          break;
-
-        default :
-          pth = (tx_dim == 4) ? &idct_4[0] :
-                                ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
-          break;
-      }
-    }
-
-    // convert to short integer format and load BLOCKD buffer
-    op  = output;
-    pfa = &bufa[0];
-
-    for(j = 0; j < tx_dim; j++) {
-      for(i = 0; i < tx_dim; i++) {
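-        /* divide by 8 and round to nearest, symmetrically (0.49 ~ 0.5) */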
-        op[i] = (pfa[i] > 0) ? (int16_t)(pfa[i] / 8 + 0.49) :
-                              -(int16_t)(-pfa[i] / 8 + 0.49);
-      }
-
-      op += shortpitch;
-      pfa += tx_dim;
-    }
-  }
-  vp9_clear_system_state(); // Make it simd safe : __asm emms;
-}
-
-/* Converted the transforms to integer form. */
-#define VERTICAL_SHIFT 14  // 16
-#define VERTICAL_ROUNDING ((1 << (VERTICAL_SHIFT - 1)) - 1)
-#define HORIZONTAL_SHIFT 17  // 15
-#define HORIZONTAL_ROUNDING ((1 << (HORIZONTAL_SHIFT - 1)) - 1)
-void vp9_ihtllm_int_c(const int16_t *input, int16_t *output, int pitch,
-                      TX_TYPE tx_type, int tx_dim) {
-  int i, j, k;
-  int16_t imbuf[256];
-
-  const int16_t *ip = input;
-  int16_t *op = output;
-  int16_t *im = &imbuf[0];
-
-  /* pointers to vertical and horizontal transforms. */
-  const int16_t *ptv = NULL, *pth = NULL;
-  int shortpitch = pitch >> 1;
-
-  switch (tx_type) {
-    case ADST_ADST :
-      ptv = pth = (tx_dim == 4) ? &iadst_i4[0]
-                                  : ((tx_dim == 8) ? &iadst_i8[0]
-                                                     : &iadst_i16[0]);
-      break;
-    case ADST_DCT  :
-      ptv = (tx_dim == 4) ? &iadst_i4[0]
-                            : ((tx_dim == 8) ? &iadst_i8[0] : &iadst_i16[0]);
-      pth = (tx_dim == 4) ? &idct_i4[0]
-                            : ((tx_dim == 8) ? &idct_i8[0] : &idct_i16[0]);
-      break;
-    case  DCT_ADST :
-      ptv = (tx_dim == 4) ? &idct_i4[0]
-                            : ((tx_dim == 8) ? &idct_i8[0] : &idct_i16[0]);
-      pth = (tx_dim == 4) ? &iadst_i4[0]
-                            : ((tx_dim == 8) ? &iadst_i8[0] : &iadst_i16[0]);
-      break;
-    case  DCT_DCT :
-      ptv = pth = (tx_dim == 4) ? &idct_i4[0]
-                                  : ((tx_dim == 8) ? &idct_i8[0]
-                                                     : &idct_i16[0]);
-      break;
-    default:
-      assert(0);
-      break;
-  }
-
-  /* vertical transformation */
-  for (j = 0; j < tx_dim; j++) {
-    for (i = 0; i < tx_dim; i++) {
-      int temp = 0;
-
-      for (k = 0; k < tx_dim; k++) {
-        temp += ptv[k] * ip[(k * tx_dim)];
-      }
-
-      im[i] = (int16_t)((temp + VERTICAL_ROUNDING) >> VERTICAL_SHIFT);
-      ip++;
-    }
-    im += tx_dim;  // 16
-    ptv += tx_dim;
-    ip = input;
-  }
-
-  /* horizontal transformation */
-  im = &imbuf[0];
-
-  for (j = 0; j < tx_dim; j++) {
-    const int16_t *pthc = pth;
-
-    for (i = 0; i < tx_dim; i++) {
-      int temp = 0;
-
-      for (k = 0; k < tx_dim; k++) {
-        temp += im[k] * pthc[k];
-      }
-
-      op[i] = (int16_t)((temp + HORIZONTAL_ROUNDING) >> HORIZONTAL_SHIFT);
-      pthc += tx_dim;
-    }
-
-    im += tx_dim;  // 16
-    op += shortpitch;
-  }
-}
-
-void vp9_short_idct4x4llm_c(short *input, short *output, int pitch) {
-  int i;
-  int a1, b1, c1, d1;
-
-  short *ip = input;
-  short *op = output;
-  int temp1, temp2;
-  int shortpitch = pitch >> 1;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0] + ip[8];
-    b1 = ip[0] - ip[8];
-
-    temp1 = (ip[4] * sinpi8sqrt2 + rounding) >> 16;
-    temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1 + rounding) >> 16);
-    c1 = temp1 - temp2;
-
-    temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1 + rounding) >> 16);
-    temp2 = (ip[12] * sinpi8sqrt2 + rounding) >> 16;
-    d1 = temp1 + temp2;
-
-    op[shortpitch * 0] = a1 + d1;
-    op[shortpitch * 3] = a1 - d1;
-
-    op[shortpitch * 1] = b1 + c1;
-    op[shortpitch * 2] = b1 - c1;
-
-    ip++;
-    op++;
-  }
-
-  ip = output;
-  op = output;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0] + ip[2];
-    b1 = ip[0] - ip[2];
-
-    temp1 = (ip[1] * sinpi8sqrt2 + rounding) >> 16;
-    temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1 + rounding) >> 16);
-    c1 = temp1 - temp2;
-
-    temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1 + rounding) >> 16);
-    temp2 = (ip[3] * sinpi8sqrt2 + rounding) >> 16;
-    d1 = temp1 + temp2;
-
-    op[0] = (a1 + d1 + 16) >> 5;
-    op[3] = (a1 - d1 + 16) >> 5;
-
-    op[1] = (b1 + c1 + 16) >> 5;
-    op[2] = (b1 - c1 + 16) >> 5;
-
-    ip += shortpitch;
-    op += shortpitch;
-  }
-}
-
-void vp9_short_idct4x4llm_1_c(short *input, short *output, int pitch) {
-  int i;
-  int a1;
-  short *op = output;
-  int shortpitch = pitch >> 1;
-  a1 = ((input[0] + 16) >> 5);
-  for (i = 0; i < 4; i++) {
-    op[0] = a1;
-    op[1] = a1;
-    op[2] = a1;
-    op[3] = a1;
-    op += shortpitch;
-  }
-}
-
-void vp9_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
-                            unsigned char *dst_ptr, int pitch, int stride) {
-  int a1 = ((input_dc + 16) >> 5);
-  int r, c;
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++) {
-      int a = a1 + pred_ptr[c];
-
-      if (a < 0)
-        a = 0;
-
-      if (a > 255)
-        a = 255;
-
-      dst_ptr[c] = (unsigned char) a;
-    }
-
-    dst_ptr += stride;
-    pred_ptr += pitch;
-  }
-}
-
-void vp9_short_inv_walsh4x4_c(short *input, short *output) {
-  int i;
-  int a1, b1, c1, d1;
-  short *ip = input;
-  short *op = output;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ((ip[0] + ip[3]));
-    b1 = ((ip[1] + ip[2]));
-    c1 = ((ip[1] - ip[2]));
-    d1 = ((ip[0] - ip[3]));
-
-    op[0] = (a1 + b1 + 1) >> 1;
-    op[1] = (c1 + d1) >> 1;
-    op[2] = (a1 - b1) >> 1;
-    op[3] = (d1 - c1) >> 1;
-
-    ip += 4;
-    op += 4;
-  }
-
-  ip = output;
-  op = output;
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0] + ip[12];
-    b1 = ip[4] + ip[8];
-    c1 = ip[4] - ip[8];
-    d1 = ip[0] - ip[12];
-    op[0] = (a1 + b1 + 1) >> 1;
-    op[4] = (c1 + d1) >> 1;
-    op[8] = (a1 - b1) >> 1;
-    op[12] = (d1 - c1) >> 1;
-    ip++;
-    op++;
-  }
-}
-
-void vp9_short_inv_walsh4x4_1_c(short *in, short *out) {
-  int i;
-  short tmp[4];
-  short *ip = in;
-  short *op = tmp;
-
-  op[0] = (ip[0] + 1) >> 1;
-  op[1] = op[2] = op[3] = (ip[0] >> 1);
-
-  ip = tmp;
-  op = out;
-  for (i = 0; i < 4; i++) {
-    op[0] = (ip[0] + 1) >> 1;
-    op[4] = op[8] = op[12] = (ip[0] >> 1);
-    ip++;
-    op++;
-  }
-}
-
-#if CONFIG_LOSSLESS
-void vp9_short_inv_walsh4x4_lossless_c(short *input, short *output) {
-  int i;
-  int a1, b1, c1, d1;
-  short *ip = input;
-  short *op = output;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ((ip[0] + ip[3])) >> Y2_WHT_UPSCALE_FACTOR;
-    b1 = ((ip[1] + ip[2])) >> Y2_WHT_UPSCALE_FACTOR;
-    c1 = ((ip[1] - ip[2])) >> Y2_WHT_UPSCALE_FACTOR;
-    d1 = ((ip[0] - ip[3])) >> Y2_WHT_UPSCALE_FACTOR;
-
-    op[0] = (a1 + b1 + 1) >> 1;
-    op[1] = (c1 + d1) >> 1;
-    op[2] = (a1 - b1) >> 1;
-    op[3] = (d1 - c1) >> 1;
-
-    ip += 4;
-    op += 4;
-  }
-
-  ip = output;
-  op = output;
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0] + ip[12];
-    b1 = ip[4] + ip[8];
-    c1 = ip[4] - ip[8];
-    d1 = ip[0] - ip[12];
-
-
-    op[0] = ((a1 + b1 + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
-    op[4] = ((c1 + d1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
-    op[8] = ((a1 - b1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
-    op[12] = ((d1 - c1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
-
-    ip++;
-    op++;
-  }
-}
-
-void vp9_short_inv_walsh4x4_1_lossless_c(short *in, short *out) {
-  int i;
-  short tmp[4];
-  short *ip = in;
-  short *op = tmp;
-
-  op[0] = ((ip[0] >> Y2_WHT_UPSCALE_FACTOR) + 1) >> 1;
-  op[1] = op[2] = op[3] = ((ip[0] >> Y2_WHT_UPSCALE_FACTOR) >> 1);
-
-  ip = tmp;
-  op = out;
-  for (i = 0; i < 4; i++) {
-    op[0] = ((ip[0] + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
-    op[4] = op[8] = op[12] = ((ip[0] >> 1)) << Y2_WHT_UPSCALE_FACTOR;
-    ip++;
-    op++;
-  }
-}
-
-void vp9_short_inv_walsh4x4_x8_c(short *input, short *output, int pitch) {
-  int i;
-  int a1, b1, c1, d1;
-  short *ip = input;
-  short *op = output;
-  int shortpitch = pitch >> 1;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ((ip[0] + ip[3])) >> WHT_UPSCALE_FACTOR;
-    b1 = ((ip[1] + ip[2])) >> WHT_UPSCALE_FACTOR;
-    c1 = ((ip[1] - ip[2])) >> WHT_UPSCALE_FACTOR;
-    d1 = ((ip[0] - ip[3])) >> WHT_UPSCALE_FACTOR;
-
-    op[0] = (a1 + b1 + 1) >> 1;
-    op[1] = (c1 + d1) >> 1;
-    op[2] = (a1 - b1) >> 1;
-    op[3] = (d1 - c1) >> 1;
-
-    ip += 4;
-    op += shortpitch;
-  }
-
-  ip = output;
-  op = output;
-  for (i = 0; i < 4; i++) {
-    a1 = ip[shortpitch * 0] + ip[shortpitch * 3];
-    b1 = ip[shortpitch * 1] + ip[shortpitch * 2];
-    c1 = ip[shortpitch * 1] - ip[shortpitch * 2];
-    d1 = ip[shortpitch * 0] - ip[shortpitch * 3];
-
-
-    op[shortpitch * 0] = (a1 + b1 + 1) >> 1;
-    op[shortpitch * 1] = (c1 + d1) >> 1;
-    op[shortpitch * 2] = (a1 - b1) >> 1;
-    op[shortpitch * 3] = (d1 - c1) >> 1;
-
-    ip++;
-    op++;
-  }
-}
-
-void vp9_short_inv_walsh4x4_1_x8_c(short *in, short *out, int pitch) {
-  int i;
-  short tmp[4];
-  short *ip = in;
-  short *op = tmp;
-  int shortpitch = pitch >> 1;
-
-  op[0] = ((ip[0] >> WHT_UPSCALE_FACTOR) + 1) >> 1;
-  op[1] = op[2] = op[3] = ((ip[0] >> WHT_UPSCALE_FACTOR) >> 1);
-
-
-  ip = tmp;
-  op = out;
-  for (i = 0; i < 4; i++) {
-    op[shortpitch * 0] = (ip[0] + 1) >> 1;
-    op[shortpitch * 1] = op[shortpitch * 2] = op[shortpitch * 3] = ip[0] >> 1;
-    ip++;
-    op++;
-  }
-}
-
-void vp9_dc_only_inv_walsh_add_c(short input_dc, unsigned char *pred_ptr,
-                                 unsigned char *dst_ptr,
-                                 int pitch, int stride) {
-  int r, c;
-  short tmp[16];
-  vp9_short_inv_walsh4x4_1_x8_c(&input_dc, tmp, 4 << 1);
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++) {
-      int a = tmp[r * 4 + c] + pred_ptr[c];
-      if (a < 0)
-        a = 0;
-
-      if (a > 255)
-        a = 255;
-
-      dst_ptr[c] = (unsigned char) a;
-    }
-
-    dst_ptr += stride;
-    pred_ptr += pitch;
-  }
-}
-#endif
-
-void vp9_dc_only_idct_add_8x8_c(short input_dc,
-                                unsigned char *pred_ptr,
-                                unsigned char *dst_ptr,
-                                int pitch, int stride) {
-  int a1 = ((input_dc + 16) >> 5);
-  int r, c, b;
-  unsigned char *orig_pred = pred_ptr;
-  unsigned char *orig_dst = dst_ptr;
-  for (b = 0; b < 4; b++) {
-    for (r = 0; r < 4; r++) {
-      for (c = 0; c < 4; c++) {
-        int a = a1 + pred_ptr[c];
-
-        if (a < 0)
-          a = 0;
-
-        if (a > 255)
-          a = 255;
-
-        dst_ptr[c] = (unsigned char) a;
-      }
-
-      dst_ptr += stride;
-      pred_ptr += pitch;
-    }
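-    /* advance to the next 4x4 sub-block: top-right, bottom-left, bottom-right */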
-    dst_ptr = orig_dst + (b + 1) % 2 * 4 + (b + 1) / 2 * 4 * stride;
-    pred_ptr = orig_pred + (b + 1) % 2 * 4 + (b + 1) / 2 * 4 * pitch;
-  }
-}
-
-#define W1 2841                 /* 2048*sqrt(2)*cos(1*pi/16) */
-#define W2 2676                 /* 2048*sqrt(2)*cos(2*pi/16) */
-#define W3 2408                 /* 2048*sqrt(2)*cos(3*pi/16) */
-#define W5 1609                 /* 2048*sqrt(2)*cos(5*pi/16) */
-#define W6 1108                 /* 2048*sqrt(2)*cos(6*pi/16) */
-#define W7 565                  /* 2048*sqrt(2)*cos(7*pi/16) */
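A standalone check of the constants above (assuming M_PI; k = 4 is absent because 2048*sqrt(2)*cos(4*pi/16) is exactly 2048 and folds into the shifts):

    #include <math.h>
    #include <stdio.h>

    int main(void) {
      int k;

      /* prints 2841 2676 2408 1609 1108 565, matching W1..W7 above */
      for (k = 1; k <= 7; k++)
        if (k != 4)
          printf("%d ", (int)(2048 * sqrt(2.0) * cos(k * M_PI / 16) + 0.5));
      printf("\n");
      return 0;
    }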
-
-/* row (horizontal) IDCT
- *
- *           7                       pi         1
- * dst[k] = sum c[l] * src[l] * cos( -- * ( k + - ) * l )
- *          l=0                      8          2
- *
- * where: c[0]    = 128
- *        c[1..7] = 128*sqrt(2)
- */
-
-static void idctrow(int *blk) {
-  int x0, x1, x2, x3, x4, x5, x6, x7, x8;
-  /* shortcut */
-  if (!((x1 = blk[4] << 11) | (x2 = blk[6]) | (x3 = blk[2]) |
-        (x4 = blk[1]) | (x5 = blk[7]) | (x6 = blk[5]) | (x7 = blk[3]))) {
-    blk[0] = blk[1] = blk[2] = blk[3] = blk[4] = blk[5] = blk[6] = blk[7] =
-        blk[0] << 3;
-    return;
-  }
-
-  x0 = (blk[0] << 11) + 128;    /* for proper rounding in the fourth stage */
-  /* first stage */
-  x8 = W7 * (x4 + x5);
-  x4 = x8 + (W1 - W7) * x4;
-  x5 = x8 - (W1 + W7) * x5;
-  x8 = W3 * (x6 + x7);
-  x6 = x8 - (W3 - W5) * x6;
-  x7 = x8 - (W3 + W5) * x7;
-
-  /* second stage */
-  x8 = x0 + x1;
-  x0 -= x1;
-  x1 = W6 * (x3 + x2);
-  x2 = x1 - (W2 + W6) * x2;
-  x3 = x1 + (W2 - W6) * x3;
-  x1 = x4 + x6;
-  x4 -= x6;
-  x6 = x5 + x7;
-  x5 -= x7;
-
-  /* third stage */
-  x7 = x8 + x3;
-  x8 -= x3;
-  x3 = x0 + x2;
-  x0 -= x2;
-  x2 = (181 * (x4 + x5) + 128) >> 8;
-  x4 = (181 * (x4 - x5) + 128) >> 8;
-
-  /* fourth stage */
-  blk[0] = (x7 + x1) >> 8;
-  blk[1] = (x3 + x2) >> 8;
-  blk[2] = (x0 + x4) >> 8;
-  blk[3] = (x8 + x6) >> 8;
-  blk[4] = (x8 - x6) >> 8;
-  blk[5] = (x0 - x4) >> 8;
-  blk[6] = (x3 - x2) >> 8;
-  blk[7] = (x7 - x1) >> 8;
-}
-
-/* column (vertical) IDCT
- *
- *             7                         pi         1
- * dst[8*k] = sum c[l] * src[8*l] * cos( -- * ( k + - ) * l )
- *            l=0                        8          2
- *
- * where: c[0]    = 1/1024
- *        c[1..7] = (1/1024)*sqrt(2)
- */
-static void idctcol(int *blk) {
-  int x0, x1, x2, x3, x4, x5, x6, x7, x8;
-
-  /* shortcut */
-  if (!((x1 = (blk[8 * 4] << 8)) | (x2 = blk[8 * 6]) | (x3 = blk[8 * 2]) |
-        (x4 = blk[8 * 1]) | (x5 = blk[8 * 7]) | (x6 = blk[8 * 5]) |
-        (x7 = blk[8 * 3]))) {
-    blk[8 * 0] = blk[8 * 1] = blk[8 * 2] = blk[8 * 3] = blk[8 * 4] =
-        blk[8 * 5] = blk[8 * 6] = blk[8 * 7] = ((blk[8 * 0] + 32) >> 6);
-    return;
-  }
-
-  x0 = (blk[8 * 0] << 8) + 16384;
-
-  /* first stage */
-  x8 = W7 * (x4 + x5) + 4;
-  x4 = (x8 + (W1 - W7) * x4) >> 3;
-  x5 = (x8 - (W1 + W7) * x5) >> 3;
-  x8 = W3 * (x6 + x7) + 4;
-  x6 = (x8 - (W3 - W5) * x6) >> 3;
-  x7 = (x8 - (W3 + W5) * x7) >> 3;
-
-  /* second stage */
-  x8 = x0 + x1;
-  x0 -= x1;
-  x1 = W6 * (x3 + x2) + 4;
-  x2 = (x1 - (W2 + W6) * x2) >> 3;
-  x3 = (x1 + (W2 - W6) * x3) >> 3;
-  x1 = x4 + x6;
-  x4 -= x6;
-  x6 = x5 + x7;
-  x5 -= x7;
-
-  /* third stage */
-  x7 = x8 + x3;
-  x8 -= x3;
-  x3 = x0 + x2;
-  x0 -= x2;
-  x2 = (181 * (x4 + x5) + 128) >> 8;
-  x4 = (181 * (x4 - x5) + 128) >> 8;
-
-  /* fourth stage */
-  blk[8 * 0] = (x7 + x1) >> 14;
-  blk[8 * 1] = (x3 + x2) >> 14;
-  blk[8 * 2] = (x0 + x4) >> 14;
-  blk[8 * 3] = (x8 + x6) >> 14;
-  blk[8 * 4] = (x8 - x6) >> 14;
-  blk[8 * 5] = (x0 - x4) >> 14;
-  blk[8 * 6] = (x3 - x2) >> 14;
-  blk[8 * 7] = (x7 - x1) >> 14;
-}
-
-#define TX_DIM 8
-void vp9_short_idct8x8_c(short *coefs, short *block, int pitch) {
-  int X[TX_DIM * TX_DIM];
-  int i, j;
-  int shortpitch = pitch >> 1;
-
-  for (i = 0; i < TX_DIM; i++) {
-    for (j = 0; j < TX_DIM; j++) {
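-      /* pre-scale each coefficient: divide by 4, rounded symmetrically about zero */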
-      X[i * TX_DIM + j] = (int)(coefs[i * TX_DIM + j] + 1
-                                + (coefs[i * TX_DIM + j] < 0)) >> 2;
-    }
-  }
-  for (i = 0; i < 8; i++)
-    idctrow(X + 8 * i);
-
-  for (i = 0; i < 8; i++)
-    idctcol(X + i);
-
-  for (i = 0; i < TX_DIM; i++) {
-    for (j = 0; j < TX_DIM; j++) {
-      block[i * shortpitch + j]  = X[i * TX_DIM + j] >> 1;
-    }
-  }
-}
-
-
-void vp9_short_ihaar2x2_c(short *input, short *output, int pitch) {
-  int i;
-  short *ip = input; // 0,1, 4, 8
-  short *op = output;
-  for (i = 0; i < 16; i++) {
-    op[i] = 0;
-  }
-
-  op[0] = (ip[0] + ip[1] + ip[4] + ip[8] + 1) >> 1;
-  op[1] = (ip[0] - ip[1] + ip[4] - ip[8]) >> 1;
-  op[4] = (ip[0] + ip[1] - ip[4] - ip[8]) >> 1;
-  op[8] = (ip[0] - ip[1] - ip[4] + ip[8]) >> 1;
-}
-
-
-#if 0
-// Keep a really bad float version as reference for now.
-void vp9_short_idct16x16_c(short *input, short *output, int pitch) {
-
-  vp9_clear_system_state(); // Make it simd safe : __asm emms;
-  {
-    double x;
-    const int short_pitch = pitch >> 1;
-    int i, j, k, l;
-    for (l = 0; l < 16; ++l) {
-      for (k = 0; k < 16; ++k) {
-        double s = 0;
-        for (i = 0; i < 16; ++i) {
-          for (j = 0; j < 16; ++j) {
-            x=cos(PI*j*(l+0.5)/16.0)*cos(PI*i*(k+0.5)/16.0)*input[i*16+j]/32;
-            if (i != 0)
-              x *= sqrt(2.0);
-            if (j != 0)
-              x *= sqrt(2.0);
-            s += x;
-          }
-        }
-        output[k*short_pitch+l] = (short)round(s);
-      }
-    }
-  }
-  vp9_clear_system_state(); // Make it simd safe : __asm emms;
-}
-#endif
-
-static const double C1 = 0.995184726672197;
-static const double C2 = 0.98078528040323;
-static const double C3 = 0.956940335732209;
-static const double C4 = 0.923879532511287;
-static const double C5 = 0.881921264348355;
-static const double C6 = 0.831469612302545;
-static const double C7 = 0.773010453362737;
-static const double C8 = 0.707106781186548;
-static const double C9 = 0.634393284163646;
-static const double C10 = 0.555570233019602;
-static const double C11 = 0.471396736825998;
-static const double C12 = 0.38268343236509;
-static const double C13 = 0.290284677254462;
-static const double C14 = 0.195090322016128;
-static const double C15 = 0.098017140329561;
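The fifteen constants above are cos(k*pi/32) for k = 1..15 (so C8 = cos(pi/4) = 1/sqrt(2)). A generation sketch under that reading:

    #include <math.h>
    #include <stdio.h>

    int main(void) {
      int k;

      for (k = 1; k <= 15; k++)  /* reproduces C1..C15 */
        printf("C%d = %.15f\n", k, cos(k * M_PI / 32));
      return 0;
    }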
-
-
-static void butterfly_16x16_idct_1d(double input[16], double output[16]) {
-
-  vp9_clear_system_state(); // Make it simd safe : __asm emms;
-  {
-    double step[16];
-    double intermediate[16];
-    double temp1, temp2;
-
-
-    // step 1 and 2
-    step[ 0] = input[0] + input[8];
-    step[ 1] = input[0] - input[8];
-
-    temp1 = input[4]*C12;
-    temp2 = input[12]*C4;
-
-    temp1 -= temp2;
-    temp1 *= C8;
-
-    step[ 2] = 2*(temp1);
-
-    temp1 = input[4]*C4;
-    temp2 = input[12]*C12;
-    temp1 += temp2;
-    temp1 = (temp1);
-    temp1 *= C8;
-    step[ 3] = 2*(temp1);
-
-    temp1 = input[2]*C8;
-    temp1 = 2*(temp1);
-    temp2 = input[6] + input[10];
-
-    step[ 4] = temp1 + temp2;
-    step[ 5] = temp1 - temp2;
-
-    temp1 = input[14]*C8;
-    temp1 = 2*(temp1);
-    temp2 = input[6] - input[10];
-
-    step[ 6] = temp2 - temp1;
-    step[ 7] = temp2 + temp1;
-
-    // for odd input
-    temp1 = input[3]*C12;
-    temp2 = input[13]*C4;
-    temp1 += temp2;
-    temp1 = (temp1);
-    temp1 *= C8;
-    intermediate[ 8] = 2*(temp1);
-
-    temp1 = input[3]*C4;
-    temp2 = input[13]*C12;
-    temp2 -= temp1;
-    temp2 = (temp2);
-    temp2 *= C8;
-    intermediate[ 9] = 2*(temp2);
-
-    intermediate[10] = 2*(input[9]*C8);
-    intermediate[11] = input[15] - input[1];
-    intermediate[12] = input[15] + input[1];
-    intermediate[13] = 2*((input[7]*C8));
-
-    temp1 = input[11]*C12;
-    temp2 = input[5]*C4;
-    temp2 -= temp1;
-    temp2 = (temp2);
-    temp2 *= C8;
-    intermediate[14] = 2*(temp2);
-
-    temp1 = input[11]*C4;
-    temp2 = input[5]*C12;
-    temp1 += temp2;
-    temp1 = (temp1);
-    temp1 *= C8;
-    intermediate[15] = 2*(temp1);
-
-    step[ 8] = intermediate[ 8] + intermediate[14];
-    step[ 9] = intermediate[ 9] + intermediate[15];
-    step[10] = intermediate[10] + intermediate[11];
-    step[11] = intermediate[10] - intermediate[11];
-    step[12] = intermediate[12] + intermediate[13];
-    step[13] = intermediate[12] - intermediate[13];
-    step[14] = intermediate[ 8] - intermediate[14];
-    step[15] = intermediate[ 9] - intermediate[15];
-
-    // step 3
-    output[0] = step[ 0] + step[ 3];
-    output[1] = step[ 1] + step[ 2];
-    output[2] = step[ 1] - step[ 2];
-    output[3] = step[ 0] - step[ 3];
-
-    temp1 = step[ 4]*C14;
-    temp2 = step[ 7]*C2;
-    temp1 -= temp2;
-    output[4] =  (temp1);
-
-    temp1 = step[ 4]*C2;
-    temp2 = step[ 7]*C14;
-    temp1 += temp2;
-    output[7] =  (temp1);
-
-    temp1 = step[ 5]*C10;
-    temp2 = step[ 6]*C6;
-    temp1 -= temp2;
-    output[5] =  (temp1);
-
-    temp1 = step[ 5]*C6;
-    temp2 = step[ 6]*C10;
-    temp1 += temp2;
-    output[6] =  (temp1);
-
-    output[8] = step[ 8] + step[11];
-    output[9] = step[ 9] + step[10];
-    output[10] = step[ 9] - step[10];
-    output[11] = step[ 8] - step[11];
-    output[12] = step[12] + step[15];
-    output[13] = step[13] + step[14];
-    output[14] = step[13] - step[14];
-    output[15] = step[12] - step[15];
-
-    // output 4
-    step[ 0] = output[0] + output[7];
-    step[ 1] = output[1] + output[6];
-    step[ 2] = output[2] + output[5];
-    step[ 3] = output[3] + output[4];
-    step[ 4] = output[3] - output[4];
-    step[ 5] = output[2] - output[5];
-    step[ 6] = output[1] - output[6];
-    step[ 7] = output[0] - output[7];
-
-    temp1 = output[8]*C7;
-    temp2 = output[15]*C9;
-    temp1 -= temp2;
-    step[ 8] = (temp1);
-
-    temp1 = output[9]*C11;
-    temp2 = output[14]*C5;
-    temp1 += temp2;
-    step[ 9] = (temp1);
-
-    temp1 = output[10]*C3;
-    temp2 = output[13]*C13;
-    temp1 -= temp2;
-    step[10] = (temp1);
-
-    temp1 = output[11]*C15;
-    temp2 = output[12]*C1;
-    temp1 += temp2;
-    step[11] = (temp1);
-
-    temp1 = output[11]*C1;
-    temp2 = output[12]*C15;
-    temp2 -= temp1;
-    step[12] = (temp2);
-
-    temp1 = output[10]*C13;
-    temp2 = output[13]*C3;
-    temp1 += temp2;
-    step[13] = (temp1);
-
-    temp1 = output[9]*C5;
-    temp2 = output[14]*C11;
-    temp2 -= temp1;
-    step[14] = (temp2);
-
-    temp1 = output[8]*C9;
-    temp2 = output[15]*C7;
-    temp1 += temp2;
-    step[15] = (temp1);
-
-    // step 5
-    output[0] = (step[0] + step[15]);
-    output[1] = (step[1] + step[14]);
-    output[2] = (step[2] + step[13]);
-    output[3] = (step[3] + step[12]);
-    output[4] = (step[4] + step[11]);
-    output[5] = (step[5] + step[10]);
-    output[6] = (step[6] + step[ 9]);
-    output[7] = (step[7] + step[ 8]);
-
-    output[15] = (step[0] - step[15]);
-    output[14] = (step[1] - step[14]);
-    output[13] = (step[2] - step[13]);
-    output[12] = (step[3] - step[12]);
-    output[11] = (step[4] - step[11]);
-    output[10] = (step[5] - step[10]);
-    output[9] = (step[6] - step[ 9]);
-    output[8] = (step[7] - step[ 8]);
-  }
-  vp9_clear_system_state(); // Make it simd safe : __asm emms;
-}
-
-// Remove once an int version of iDCT is written
-#if 0
-void reference_16x16_idct_1d(double input[16], double output[16]) {
-
-  vp9_clear_system_state(); // Make it simd safe : __asm emms;
-  {
-    const double kPi = 3.141592653589793238462643383279502884;
-    const double kSqrt2 = 1.414213562373095048801688724209698;
-    for (int k = 0; k < 16; k++) {
-      output[k] = 0.0;
-      for (int n = 0; n < 16; n++) {
-        output[k] += input[n]*cos(kPi*(2*k+1)*n/32.0);
-        if (n == 0)
-          output[k] = output[k]/kSqrt2;
-      }
-    }
-  }
-  vp9_clear_system_state(); // Make it simd safe : __asm emms;
-}
-#endif
-
-void vp9_short_idct16x16_c(short *input, short *output, int pitch) {
-
-  vp9_clear_system_state(); // Make it simd safe : __asm emms;
-  {
-    double out[16*16], out2[16*16];
-    const int short_pitch = pitch >> 1;
-    int i, j;
-      // First transform rows
-    for (i = 0; i < 16; ++i) {
-      double temp_in[16], temp_out[16];
-      for (j = 0; j < 16; ++j)
-        temp_in[j] = input[j + i*short_pitch];
-      butterfly_16x16_idct_1d(temp_in, temp_out);
-      for (j = 0; j < 16; ++j)
-        out[j + i*16] = temp_out[j];
-    }
-    // Then transform columns
-    for (i = 0; i < 16; ++i) {
-      double temp_in[16], temp_out[16];
-      for (j = 0; j < 16; ++j)
-        temp_in[j] = out[j*16 + i];
-      butterfly_16x16_idct_1d(temp_in, temp_out);
-      for (j = 0; j < 16; ++j)
-        out2[j*16 + i] = temp_out[j];
-    }
-    for (i = 0; i < 16*16; ++i)
-      output[i] = round(out2[i]/128);
-  }
-  vp9_clear_system_state(); // Make it simd safe : __asm emms;
-}
--- a/vp8/common/implicit_segmentation.c
+++ /dev/null
@@ -1,255 +1,0 @@
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vp8/common/onyxc_int.h"
-
-#define MAX_REGIONS 24000
-#ifndef NULL
-#define NULL 0
-#endif
-
-#define min_mbs_in_region 3
-
-// this linked list structure holds equivalences for connected
-// component labeling
-struct list_el {
-  int label;
-  int seg_value;
-  int count;
-  struct list_el *next;
-};
-typedef struct list_el item;
-
-// connected color segments
-typedef struct {
-  int min_x;
-  int min_y;
-  int max_x;
-  int max_y;
-  long long sum_x;
-  long long sum_y;
-  int pixels;
-  int seg_value;
-  int label;
-} segment_info;
-
-
-typedef enum {
-  SEGMENT_MODE,
-  SEGMENT_MV,
-  SEGMENT_REFFRAME,
-  SEGMENT_SKIPPED
-} SEGMENT_TYPE;
-
-
-// this merges the two equivalence lists and
-// then makes sure that every label points to the same
-// equivalence list
-void merge(item *labels, int u, int v) {
-  item *a = labels[u].next;
-  item *b = labels[v].next;
-  item c;
-  item *it = &c;
-  int count;
-
-  // check if they are already merged
-  if (u == v || a == b)
-    return;
-
-  count = a->count + b->count;
-
-  // merge 2 sorted linked lists.
-  while (a != NULL && b != NULL) {
-    if (a->label < b->label) {
-      it->next = a;
-      a = a->next;
-    } else {
-      it->next = b;
-      b = b->next;
-    }
-
-    it = it->next;
-  }
-
-  if (a == NULL)
-    it->next = b;
-  else
-    it->next = a;
-
-  it = c.next;
-
-  // make sure every equivalence in the linked list points to this new list
-  while (it != NULL) {
-    labels[it->label].next = c.next;
-    it = it->next;
-  }
-  c.next->count = count;
-
-}
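Usage note for merge(): the labelling pass below calls merge(labels, al, ll) whenever the above and left neighbours carry equivalent seg_values under different labels. A hedged illustration of the postcondition:

    /* after two labels, say 2 and 5, are found equivalent: */
    merge(labels, 2, 5);
    /* both now share one sorted class list, i.e.
     * labels[2].next == labels[5].next, and ->count holds the
     * combined macroblock count of the merged region. */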
-
-void segment_via_mode_info(VP9_COMMON *oci, int how) {
-  MODE_INFO *mi = oci->mi;
-  int i, j;
-  int mb_index = 0;
-
-  int label = 1;
-  int pitch = oci->mb_cols;
-
-  // holds linked list equivalences
-  // the max should probably be allocated at a higher level in oci
-  item equivalences[MAX_REGIONS];
-  int eq_ptr = 0;
-  item labels[MAX_REGIONS];
-  segment_info segments[MAX_REGIONS];
-  int label_count = 1;
-  int labeling[400 * 300];
-  int *lp = labeling;
-
-  label_count = 1;
-  memset(labels, 0, sizeof(labels));
-  memset(segments, 0, sizeof(segments));
-
-  /* Go through each macroblock: first-pass labelling */
-  for (i = 0; i < oci->mb_rows; i++, lp += pitch) {
-    for (j = 0; j < oci->mb_cols; j++) {
-      // seg_values: a = above, l = left, n = this macroblock
-      int a = -1, l = -1, n = -1;
-
-      // above label, left label
-      int al = -1, ll = -1;
-      if (i) {
-        al = lp[j - pitch];
-        a = labels[al].next->seg_value;
-      }
-      if (j) {
-        ll = lp[j - 1];
-        l = labels[ll].next->seg_value;
-      }
-
-      // which setting to base the implicit segmentation on
-      switch (how) {
-        case SEGMENT_MODE:
-          n = mi[mb_index].mbmi.mode;
-          break;
-        case SEGMENT_MV:
-          n = mi[mb_index].mbmi.mv[0].as_int;
-          if (mi[mb_index].mbmi.ref_frame == INTRA_FRAME)
-            n = -9999999;
-          break;
-        case SEGMENT_REFFRAME:
-          n = mi[mb_index].mbmi.ref_frame;
-          break;
-        case SEGMENT_SKIPPED:
-          n = mi[mb_index].mbmi.mb_skip_coeff;
-          break;
-      }
-
-      // above and left both have the same seg_value
-      if (n == a && n == l) {
-        // pick the lowest label
-        lp[j] = (al < ll ? al : ll);
-        labels[lp[j]].next->count++;
-
-        // merge the above and left equivalencies
-        merge(labels, al, ll);
-      }
-      // this matches above seg_value
-      else if (n == a) {
-        // give it the same label as above
-        lp[j] = al;
-        labels[al].next->count++;
-      }
-      // this matches left seg_value
-      else if (n == l) {
-        // give it the same label as left
-        lp[j] = ll;
-        labels[ll].next->count++;
-      } else {
-        // new label doesn't match either
-        item *e = &labels[label];
-        item *nl = &equivalences[eq_ptr++];
-        lp[j] = label;
-        nl->label = label;
-        nl->next = 0;
-        nl->seg_value = n;
-        nl->count = 1;
-        e->next = nl;
-        label++;
-      }
-      mb_index++;
-    }
-    mb_index++;
-  }
-  lp = labeling;
-
-  // give new labels to regions
-  for (i = 1; i < label; i++)
-    if (labels[i].next->count > min_mbs_in_region &&
-        labels[labels[i].next->label].label == 0) {
-      segment_info *cs = &segments[label_count];
-      cs->label = label_count;
-      labels[labels[i].next->label].label = label_count++;
-      labels[labels[i].next->label].seg_value  = labels[i].next->seg_value;
-      cs->seg_value = labels[labels[i].next->label].seg_value;
-      cs->min_x = oci->mb_cols;
-      cs->min_y = oci->mb_rows;
-      cs->max_x = 0;
-      cs->max_y = 0;
-      cs->sum_x = 0;
-      cs->sum_y = 0;
-      cs->pixels = 0;
-
-    }
-  lp = labeling;
-
-  // this is just to gather stats...
-  for (i = 0; i < oci->mb_rows; i++, lp += pitch) {
-    for (j = 0; j < oci->mb_cols; j++) {
-      segment_info *cs;
-      int oldlab = labels[lp[j]].next->label;
-      int lab = labels[oldlab].label;
-      lp[j] = lab;
-
-      cs = &segments[lab];
-
-      cs->min_x = (j < cs->min_x ? j : cs->min_x);
-      cs->max_x = (j > cs->max_x ? j : cs->max_x);
-      cs->min_y = (i < cs->min_y ? i : cs->min_y);
-      cs->max_y = (i > cs->max_y ? i : cs->max_y);
-      cs->sum_x += j;
-      cs->sum_y += i;
-      cs->pixels++;
-
-      lp[j] = lab;
-      mb_index++;
-    }
-    mb_index++;
-  }
-
-  {
-    lp = labeling;
-    printf("labelling \n");
-    mb_index = 0;
-    for (i = 0; i < oci->mb_rows; i++, lp += pitch) {
-      for (j = 0; j < oci->mb_cols; j++) {
-        printf("%4d", lp[j]);
-      }
-      printf("            ");
-      for (j = 0; j < oci->mb_cols; j++, mb_index++) {
-        // printf("%3d",mi[mb_index].mbmi.mode );
-        printf("%4d:%4d", mi[mb_index].mbmi.mv[0].as_mv.row,
-            mi[mb_index].mbmi.mv[0].as_mv.col);
-      }
-      printf("\n");
-      ++mb_index;
-    }
-    printf("\n");
-  }
-}
-
--- a/vp8/common/invtrans.c
+++ /dev/null
@@ -1,135 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "invtrans.h"
-
-static void recon_dcblock(MACROBLOCKD *xd) {
-  BLOCKD *b = &xd->block[24];
-  int i;
-
-  for (i = 0; i < 16; i++) {
-    xd->block[i].dqcoeff[0] = b->diff[i];
-  }
-}
-
-static void recon_dcblock_8x8(MACROBLOCKD *xd) {
-  BLOCKD *b = &xd->block[24]; // for coeff 0, 2, 8, 10
-
-  xd->block[0].dqcoeff[0] = b->diff[0];
-  xd->block[4].dqcoeff[0] = b->diff[1];
-  xd->block[8].dqcoeff[0] = b->diff[4];
-  xd->block[12].dqcoeff[0] = b->diff[8];
-}
-
-void vp9_inverse_transform_b_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
-                                 BLOCKD *b, int pitch) {
-  if (b->eob <= 1)
-    IDCT_INVOKE(rtcd, idct1)(b->dqcoeff, b->diff, pitch);
-  else
-    IDCT_INVOKE(rtcd, idct16)(b->dqcoeff, b->diff, pitch);
-}
-
-void vp9_inverse_transform_mby_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
-                                   MACROBLOCKD *xd) {
-  int i;
-  BLOCKD *blockd = xd->block;
-
-  if (xd->mode_info_context->mbmi.mode != SPLITMV) {
-    /* do 2nd order transform on the dc block */
-    IDCT_INVOKE(rtcd, iwalsh16)(blockd[24].dqcoeff, blockd[24].diff);
-    recon_dcblock(xd);
-  }
-
-  for (i = 0; i < 16; i++) {
-    vp9_inverse_transform_b_4x4(rtcd, &blockd[i], 32);
-  }
-}
-
-void vp9_inverse_transform_mbuv_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
-                                    MACROBLOCKD *xd) {
-  int i;
-  BLOCKD *blockd = xd->block;
-
-  for (i = 16; i < 24; i++) {
-    vp9_inverse_transform_b_4x4(rtcd, &blockd[i], 16);
-  }
-}
-
-void vp9_inverse_transform_mb_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
-                                  MACROBLOCKD *xd) {
-  vp9_inverse_transform_mby_4x4(rtcd, xd);
-  vp9_inverse_transform_mbuv_4x4(rtcd, xd);
-}
-
-void vp9_inverse_transform_b_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
-                                 short *input_dqcoeff, short *output_coeff,
-                                 int pitch) {
-  // int b,i;
-  // if (b->eob > 1)
-  IDCT_INVOKE(rtcd, idct8)(input_dqcoeff, output_coeff, pitch);
-  // else
-  // IDCT_INVOKE(rtcd, idct8_1)(b->dqcoeff, b->diff, pitch);//pitch
-}
-
-void vp9_inverse_transform_mby_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
-                                   MACROBLOCKD *xd) {
-  int i;
-  BLOCKD *blockd = xd->block;
-
-  if (xd->mode_info_context->mbmi.mode != SPLITMV) {
-    // do 2nd order transform on the dc block
-    IDCT_INVOKE(rtcd, ihaar2)(blockd[24].dqcoeff, blockd[24].diff, 8);
-    recon_dcblock_8x8(xd); // need to change for 8x8
-  }
-
-  for (i = 0; i < 9; i += 8) {
-    vp9_inverse_transform_b_8x8(rtcd, &blockd[i].dqcoeff[0],
-                                &blockd[i].diff[0], 32);
-  }
-  for (i = 2; i < 11; i += 8) {
-    vp9_inverse_transform_b_8x8(rtcd, &blockd[i + 2].dqcoeff[0],
-                                &blockd[i].diff[0], 32);
-  }
-}
-
-void vp9_inverse_transform_mbuv_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
-                                    MACROBLOCKD *xd) {
-  int i;
-  BLOCKD *blockd = xd->block;
-
-  for (i = 16; i < 24; i += 4) {
-    vp9_inverse_transform_b_8x8(rtcd, &blockd[i].dqcoeff[0],
-                                &blockd[i].diff[0], 16);
-  }
-}
-
-void vp9_inverse_transform_mb_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
-                                  MACROBLOCKD *xd) {
-  vp9_inverse_transform_mby_8x8(rtcd, xd);
-  vp9_inverse_transform_mbuv_8x8(rtcd, xd);
-}
-
-void vp9_inverse_transform_b_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
-                                   short *input_dqcoeff,
-                                   short *output_coeff, int pitch) {
-  IDCT_INVOKE(rtcd, idct16x16)(input_dqcoeff, output_coeff, pitch);
-}
-
-void vp9_inverse_transform_mby_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
-                                     MACROBLOCKD *xd) {
-  vp9_inverse_transform_b_16x16(rtcd, &xd->block[0].dqcoeff[0],
-                                &xd->block[0].diff[0], 32);
-}
-
-void vp9_inverse_transform_mb_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
-                                    MACROBLOCKD *xd) {
-  vp9_inverse_transform_mby_16x16(rtcd, xd);
-  vp9_inverse_transform_mbuv_8x8(rtcd, xd);
-}
--- a/vp8/common/invtrans.h
+++ /dev/null
@@ -1,53 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef __INC_INVTRANS_H
-#define __INC_INVTRANS_H
-
-#include "vpx_ports/config.h"
-#include "idct.h"
-#include "blockd.h"
-
-extern void vp9_inverse_transform_b_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
-                                        BLOCKD *b, int pitch);
-
-extern void vp9_inverse_transform_mb_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
-                                         MACROBLOCKD *xd);
-
-extern void vp9_inverse_transform_mby_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
-                                          MACROBLOCKD *xd);
-
-extern void vp9_inverse_transform_mbuv_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
-                                           MACROBLOCKD *xd);
-
-extern void vp9_inverse_transform_b_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
-                                        short *input_dqcoeff,
-                                        short *output_coeff, int pitch);
-
-extern void vp9_inverse_transform_mb_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
-                                         MACROBLOCKD *xd);
-
-extern void vp9_inverse_transform_mby_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
-                                          MACROBLOCKD *xd);
-
-extern void vp9_inverse_transform_mbuv_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
-                                           MACROBLOCKD *xd);
-
-extern void vp9_inverse_transform_b_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
-                                          short *input_dqcoeff,
-                                          short *output_coeff, int pitch);
-
-extern void vp9_inverse_transform_mb_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
-                                           MACROBLOCKD *xd);
-
-extern void vp9_inverse_transform_mby_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
-                                            MACROBLOCKD *xd);
-
-#endif  // __INC_INVTRANS_H
--- a/vp8/common/loopfilter.c
+++ /dev/null
@@ -1,524 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "loopfilter.h"
-#include "onyxc_int.h"
-#include "vpx_mem/vpx_mem.h"
-
-#include "vp8/common/seg_common.h"
-
-static void lf_init_lut(loop_filter_info_n *lfi) {
-  int filt_lvl;
-
-  for (filt_lvl = 0; filt_lvl <= MAX_LOOP_FILTER; filt_lvl++) {
-    if (filt_lvl >= 40) {
-      lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 2;
-      lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 3;
-    } else if (filt_lvl >= 20) {
-      lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
-      lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 2;
-    } else if (filt_lvl >= 15) {
-      lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
-      lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 1;
-    } else {
-      lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 0;
-      lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 0;
-    }
-  }
-
-  lfi->mode_lf_lut[DC_PRED] = 1;
-  lfi->mode_lf_lut[D45_PRED] = 1;
-  lfi->mode_lf_lut[D135_PRED] = 1;
-  lfi->mode_lf_lut[D117_PRED] = 1;
-  lfi->mode_lf_lut[D153_PRED] = 1;
-  lfi->mode_lf_lut[D27_PRED] = 1;
-  lfi->mode_lf_lut[D63_PRED] = 1;
-  lfi->mode_lf_lut[V_PRED] = 1;
-  lfi->mode_lf_lut[H_PRED] = 1;
-  lfi->mode_lf_lut[TM_PRED] = 1;
-  lfi->mode_lf_lut[B_PRED]  = 0;
-  lfi->mode_lf_lut[I8X8_PRED] = 0;
-  lfi->mode_lf_lut[ZEROMV]  = 1;
-  lfi->mode_lf_lut[NEARESTMV] = 2;
-  lfi->mode_lf_lut[NEARMV] = 2;
-  lfi->mode_lf_lut[NEWMV] = 2;
-  lfi->mode_lf_lut[SPLITMV] = 3;
-}
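The level-to-threshold table built above amounts to a small pure mapping; a sketch (the helper name is hypothetical, not from the tree):

    /* Closed form of hev_thr_lut: filter level and frame type in,
     * high-edge-variance threshold out. */
    static int hev_threshold(int filt_lvl, int is_key_frame) {
      if (filt_lvl >= 40) return is_key_frame ? 2 : 3;
      if (filt_lvl >= 20) return is_key_frame ? 1 : 2;
      if (filt_lvl >= 15) return 1;  /* same for both frame types */
      return 0;
    }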
-
-void vp9_loop_filter_update_sharpness(loop_filter_info_n *lfi,
-                                      int sharpness_lvl) {
-  int i;
-
-  /* For each possible value for the loop filter fill out limits */
-  for (i = 0; i <= MAX_LOOP_FILTER; i++) {
-    int filt_lvl = i;
-    int block_inside_limit = 0;
-
-    /* Set loop filter parameters that control sharpness. */
-    block_inside_limit = filt_lvl >> (sharpness_lvl > 0);
-    block_inside_limit = block_inside_limit >> (sharpness_lvl > 4);
-
-    if (sharpness_lvl > 0) {
-      if (block_inside_limit > (9 - sharpness_lvl))
-        block_inside_limit = (9 - sharpness_lvl);
-    }
-
-    if (block_inside_limit < 1)
-      block_inside_limit = 1;
-
-    vpx_memset(lfi->lim[i], block_inside_limit, SIMD_WIDTH);
-    vpx_memset(lfi->blim[i], (2 * filt_lvl + block_inside_limit),
-               SIMD_WIDTH);
-    vpx_memset(lfi->mblim[i], (2 * (filt_lvl + 2) + block_inside_limit),
-               SIMD_WIDTH);
-  }
-}
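To make the limit computation above concrete, here is a self-contained rework with one worked input pair (illustration only):

    #include <stdio.h>

    /* Recomputes block_inside_limit exactly as the loop above does. */
    static int inside_limit(int filt_lvl, int sharpness_lvl) {
      int lim = filt_lvl >> (sharpness_lvl > 0);
      lim >>= (sharpness_lvl > 4);
      if (sharpness_lvl > 0 && lim > 9 - sharpness_lvl)
        lim = 9 - sharpness_lvl;
      return lim < 1 ? 1 : lim;
    }

    int main(void) {
      /* filt_lvl 40, sharpness 5: 40>>1 = 20, 20>>1 = 10, capped at 9-5 = 4 */
      int lim = inside_limit(40, 5);
      printf("lim=%d blim=%d mblim=%d\n",
             lim, 2 * 40 + lim, 2 * (40 + 2) + lim);  /* 4, 84, 88 */
      return 0;
    }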
-
-void vp9_loop_filter_init(VP9_COMMON *cm) {
-  loop_filter_info_n *lfi = &cm->lf_info;
-  int i;
-
-  /* init limits for given sharpness */
-  vp9_loop_filter_update_sharpness(lfi, cm->sharpness_level);
-  cm->last_sharpness_level = cm->sharpness_level;
-
-  /* init LUT for lvl and hev thr picking */
-  lf_init_lut(lfi);
-
-  /* init hev threshold const vectors */
-  for (i = 0; i < 4; i++) {
-    vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH);
-  }
-}
-
-void vp9_loop_filter_frame_init(VP9_COMMON *cm,
-                                MACROBLOCKD *xd,
-                                int default_filt_lvl) {
-  int seg,  /* segment number */
-      ref,  /* index in ref_lf_deltas */
-      mode; /* index in mode_lf_deltas */
-
-  loop_filter_info_n *lfi = &cm->lf_info;
-
-  /* update limits if sharpness has changed */
-  if (cm->last_sharpness_level != cm->sharpness_level) {
-    vp9_loop_filter_update_sharpness(lfi, cm->sharpness_level);
-    cm->last_sharpness_level = cm->sharpness_level;
-  }
-
-  for (seg = 0; seg < MAX_MB_SEGMENTS; seg++) {
-    int lvl_seg = default_filt_lvl;
-    int lvl_ref, lvl_mode;
-
-    // Set the baseline filter values for each segment
-    if (vp9_segfeature_active(xd, seg, SEG_LVL_ALT_LF)) {
-      /* Abs value */
-      if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) {
-        lvl_seg = vp9_get_segdata(xd, seg, SEG_LVL_ALT_LF);
-      } else { /* Delta Value */
-        lvl_seg += vp9_get_segdata(xd, seg, SEG_LVL_ALT_LF);
-        lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 63 : lvl_seg) : 0;
-      }
-    }
-
-    if (!xd->mode_ref_lf_delta_enabled) {
-      /* we could get rid of this if we assume that deltas are set to
-       * zero when not in use; encoder always uses deltas
-       */
-      vpx_memset(lfi->lvl[seg][0], lvl_seg, 4 * 4);
-      continue;
-    }
-
-    lvl_ref = lvl_seg;
-
-    /* INTRA_FRAME */
-    ref = INTRA_FRAME;
-
-    /* Apply delta for reference frame */
-    lvl_ref += xd->ref_lf_deltas[ref];
-
-    /* Apply delta for Intra modes */
-    mode = 0; /* B_PRED */
-    /* Only the split mode B_PRED has a further special case */
-    lvl_mode = lvl_ref +  xd->mode_lf_deltas[mode];
-    lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */
-
-    lfi->lvl[seg][ref][mode] = lvl_mode;
-
-    mode = 1; /* all the rest of Intra modes */
-    lvl_mode = (lvl_ref > 0) ? (lvl_ref > 63 ? 63 : lvl_ref)  : 0; /* clamp */
-    lfi->lvl[seg][ref][mode] = lvl_mode;
-
-    /* LAST, GOLDEN, ALT */
-    for (ref = 1; ref < MAX_REF_FRAMES; ref++) {
-      int lvl_ref = lvl_seg;
-
-      /* Apply delta for reference frame */
-      lvl_ref += xd->ref_lf_deltas[ref];
-
-      /* Apply delta for Inter modes */
-      for (mode = 1; mode < 4; mode++) {
-        lvl_mode = lvl_ref + xd->mode_lf_deltas[mode];
-        lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */
-
-        lfi->lvl[seg][ref][mode] = lvl_mode;
-      }
-    }
-  }
-}
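The clamp-to-[0, 63] ternary above appears four times in this function; a hypothetical helper (not in the tree) expressing it once:

    /* Clamp a loop filter level to the legal 0..MAX_LOOP_FILTER range. */
    static int clamp_filter_level(int lvl) {
      return lvl < 0 ? 0 : (lvl > 63 ? 63 : lvl);
    }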
-
-void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd) {
-  YV12_BUFFER_CONFIG *post = cm->frame_to_show;
-  loop_filter_info_n *lfi_n = &cm->lf_info;
-  struct loop_filter_info lfi;
-
-  FRAME_TYPE frame_type = cm->frame_type;
-
-  int mb_row;
-  int mb_col;
-
-  int filter_level;
-
-  unsigned char *y_ptr, *u_ptr, *v_ptr;
-
-  /* Point at base of Mb MODE_INFO list */
-  const MODE_INFO *mode_info_context = cm->mi;
-
-  /* Initialize the loop filter for this frame. */
-  vp9_loop_filter_frame_init(cm, xd, cm->filter_level);
-
-  /* Set up the buffer pointers */
-  y_ptr = post->y_buffer;
-  u_ptr = post->u_buffer;
-  v_ptr = post->v_buffer;
-
-  /* vp9_filter each macro block */
-  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
-    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
-      int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
-                     mode_info_context->mbmi.mode != I8X8_PRED &&
-                     mode_info_context->mbmi.mode != SPLITMV &&
-                     mode_info_context->mbmi.mb_skip_coeff);
-
-      const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
-      const int seg = mode_info_context->mbmi.segment_id;
-      const int ref_frame = mode_info_context->mbmi.ref_frame;
-      int tx_type = mode_info_context->mbmi.txfm_size;
-      filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
-
-      if (filter_level) {
-        if (cm->filter_type == NORMAL_LOOPFILTER) {
-          const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
-          lfi.mblim = lfi_n->mblim[filter_level];
-          lfi.blim = lfi_n->blim[filter_level];
-          lfi.lim = lfi_n->lim[filter_level];
-          lfi.hev_thr = lfi_n->hev_thr[hev_index];
-
-          if (mb_col > 0
-#if CONFIG_SUPERBLOCKS
-              && !((mb_col & 1) && mode_info_context->mbmi.encoded_as_sb &&
-                   mode_info_context[0].mbmi.mb_skip_coeff &&
-                   mode_info_context[-1].mbmi.mb_skip_coeff)
-#endif
-              )
-            vp9_loop_filter_mbv(y_ptr, u_ptr, v_ptr, post->y_stride,
-                                post->uv_stride, &lfi);
-
-          if (!skip_lf && tx_type != TX_16X16) {
-            if (tx_type == TX_8X8)
-              vp9_loop_filter_bv8x8(y_ptr, u_ptr, v_ptr, post->y_stride,
-                                    post->uv_stride, &lfi);
-            else
-              vp9_loop_filter_bv(y_ptr, u_ptr, v_ptr, post->y_stride,
-                                 post->uv_stride, &lfi);
-
-          }
-
-          /* don't apply across umv border */
-          if (mb_row > 0
-#if CONFIG_SUPERBLOCKS
-              && !((mb_row & 1) && mode_info_context->mbmi.encoded_as_sb &&
-                   mode_info_context[0].mbmi.mb_skip_coeff &&
-                   mode_info_context[-cm->mode_info_stride].mbmi.mb_skip_coeff)
-#endif
-              )
-            vp9_loop_filter_mbh(y_ptr, u_ptr, v_ptr, post->y_stride,
-                                post->uv_stride, &lfi);
-
-          if (!skip_lf && tx_type != TX_16X16) {
-            if (tx_type == TX_8X8)
-              vp9_loop_filter_bh8x8(y_ptr, u_ptr, v_ptr, post->y_stride,
-                                    post->uv_stride, &lfi);
-            else
-              vp9_loop_filter_bh(y_ptr, u_ptr, v_ptr, post->y_stride,
-                                 post->uv_stride, &lfi);
-          }
-        } else {
-          // FIXME: Not 8x8 aware
-          if (mb_col > 0
-#if CONFIG_SUPERBLOCKS
-              && !((mb_col & 1) && mode_info_context->mbmi.encoded_as_sb &&
-                   mode_info_context[0].mbmi.mb_skip_coeff &&
-                   mode_info_context[-1].mbmi.mb_skip_coeff)
-#endif
-              )
-            vp9_loop_filter_simple_mbv(y_ptr, post->y_stride,
-                                       lfi_n->mblim[filter_level]);
-
-          if (!skip_lf)
-            vp9_loop_filter_simple_bv(y_ptr, post->y_stride,
-                                      lfi_n->blim[filter_level]);
-
-          /* don't apply across umv border */
-          if (mb_row > 0
-#if CONFIG_SUPERBLOCKS
-              && !((mb_row & 1) && mode_info_context->mbmi.encoded_as_sb &&
-                   mode_info_context[0].mbmi.mb_skip_coeff &&
-                   mode_info_context[-cm->mode_info_stride].mbmi.mb_skip_coeff)
-#endif
-              )
-            vp9_loop_filter_simple_mbh(y_ptr, post->y_stride,
-                                       lfi_n->mblim[filter_level]);
-
-          if (!skip_lf)
-            vp9_loop_filter_simple_bh(y_ptr, post->y_stride,
-                                      lfi_n->blim[filter_level]);
-        }
-      }
-
-      y_ptr += 16;
-      u_ptr += 8;
-      v_ptr += 8;
-
-      mode_info_context++;     /* step to next MB */
-    }
-
-    y_ptr += post->y_stride  * 16 - post->y_width;
-    u_ptr += post->uv_stride *  8 - post->uv_width;
-    v_ptr += post->uv_stride *  8 - post->uv_width;
-
-    mode_info_context++;         /* Skip border mb */
-  }
-}
-
-void vp9_loop_filter_frame_yonly(VP9_COMMON *cm, MACROBLOCKD *xd,
-                                 int default_filt_lvl) {
-  YV12_BUFFER_CONFIG *post = cm->frame_to_show;
-
-  unsigned char *y_ptr;
-  int mb_row;
-  int mb_col;
-
-  loop_filter_info_n *lfi_n = &cm->lf_info;
-  struct loop_filter_info lfi;
-
-  int filter_level;
-  FRAME_TYPE frame_type = cm->frame_type;
-
-  /* Point at base of Mb MODE_INFO list */
-  const MODE_INFO *mode_info_context = cm->mi;
-
-#if 0
-  if (default_filt_lvl == 0) /* no filter applied */
-    return;
-#endif
-
-  /* Initialize the loop filter for this frame. */
-  vp9_loop_filter_frame_init(cm, xd, default_filt_lvl);
-
-  /* Set up the buffer pointers */
-  y_ptr = post->y_buffer;
-
-  /* vp9_filter each macro block */
-  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
-    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
-      int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
-                     mode_info_context->mbmi.mode != I8X8_PRED &&
-                     mode_info_context->mbmi.mode != SPLITMV &&
-                     mode_info_context->mbmi.mb_skip_coeff);
-
-      const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
-      const int seg = mode_info_context->mbmi.segment_id;
-      const int ref_frame = mode_info_context->mbmi.ref_frame;
-      int tx_type = mode_info_context->mbmi.txfm_size;
-      filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
-
-      if (filter_level) {
-        if (cm->filter_type == NORMAL_LOOPFILTER) {
-          const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
-          lfi.mblim = lfi_n->mblim[filter_level];
-          lfi.blim = lfi_n->blim[filter_level];
-          lfi.lim = lfi_n->lim[filter_level];
-          lfi.hev_thr = lfi_n->hev_thr[hev_index];
-
-          if (mb_col > 0)
-            vp9_loop_filter_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi);
-
-          if (!skip_lf && tx_type != TX_16X16) {
-            if (tx_type == TX_8X8)
-              vp9_loop_filter_bv8x8(y_ptr, 0, 0, post->y_stride, 0, &lfi);
-            else
-              vp9_loop_filter_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi);
-          }
-
-          /* don't apply across umv border */
-          if (mb_row > 0)
-            vp9_loop_filter_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi);
-
-          if (!skip_lf && tx_type != TX_16X16) {
-            if (tx_type == TX_8X8)
-              vp9_loop_filter_bh8x8(y_ptr, 0, 0, post->y_stride, 0, &lfi);
-            else
-              vp9_loop_filter_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi);
-          }
-        } else {
-          // FIXME: Not 8x8 aware
-          if (mb_col > 0)
-            vp9_loop_filter_simple_mbv(y_ptr, post->y_stride,
-                                       lfi_n->mblim[filter_level]);
-
-          if (!skip_lf)
-            vp9_loop_filter_simple_bv(y_ptr, post->y_stride,
-                                      lfi_n->blim[filter_level]);
-
-          /* don't apply across umv border */
-          if (mb_row > 0)
-            vp9_loop_filter_simple_mbh(y_ptr, post->y_stride,
-                                       lfi_n->mblim[filter_level]);
-
-          if (!skip_lf)
-            vp9_loop_filter_simple_bh(y_ptr, post->y_stride,
-                                      lfi_n->blim[filter_level]);
-        }
-      }
-
-      y_ptr += 16;
-      mode_info_context++;        /* step to next MB */
-    }
-
-    y_ptr += post->y_stride  * 16 - post->y_width;
-    mode_info_context++;            /* Skip border mb */
-  }
-}
-
-void vp9_loop_filter_partial_frame(VP9_COMMON *cm, MACROBLOCKD *xd,
-                                   int default_filt_lvl) {
-  YV12_BUFFER_CONFIG *post = cm->frame_to_show;
-
-  unsigned char *y_ptr;
-  int mb_row;
-  int mb_col;
-  int mb_cols = post->y_width  >> 4;
-
-  int linestocopy, i;
-
-  loop_filter_info_n *lfi_n = &cm->lf_info;
-  struct loop_filter_info lfi;
-
-  int filter_level;
-  int alt_flt_enabled = xd->segmentation_enabled;
-  FRAME_TYPE frame_type = cm->frame_type;
-
-  const MODE_INFO *mode_info_context;
-
-  int lvl_seg[MAX_MB_SEGMENTS];
-
-  mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1);
-
-  /* 3 is a magic number. 4 is probably magic too */
-  linestocopy = (post->y_height >> (4 + 3));
-
-  if (linestocopy < 1)
-    linestocopy = 1;
-
-  linestocopy <<= 4;
-
-  /* Note the baseline filter values for each segment */
-  /* See vp9_loop_filter_frame_init. Rather than call that for each change
-   * to default_filt_lvl, copy the relevant calculation here.
-   */
-  if (alt_flt_enabled) {
-    for (i = 0; i < MAX_MB_SEGMENTS; i++) {
-      /* Abs value */
-      if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) {
-        lvl_seg[i] = vp9_get_segdata(xd, i, SEG_LVL_ALT_LF);
-      }
-      /* Delta Value */
-      else {
-        lvl_seg[i] = default_filt_lvl +
-                     vp9_get_segdata(xd, i, SEG_LVL_ALT_LF);
-        lvl_seg[i] = (lvl_seg[i] > 0) ?
-                     ((lvl_seg[i] > 63) ? 63 : lvl_seg[i]) : 0;
-      }
-    }
-  }
-
-  /* Set up the buffer pointers */
-  y_ptr = post->y_buffer + (post->y_height >> 5) * 16 * post->y_stride;
-
-  /* vp9_filter each macro block */
-  for (mb_row = 0; mb_row < (linestocopy >> 4); mb_row++) {
-    for (mb_col = 0; mb_col < mb_cols; mb_col++) {
-      int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
-                     mode_info_context->mbmi.mode != I8X8_PRED &&
-                     mode_info_context->mbmi.mode != SPLITMV &&
-                     mode_info_context->mbmi.mb_skip_coeff);
-
-      if (alt_flt_enabled)
-        filter_level = lvl_seg[mode_info_context->mbmi.segment_id];
-      else
-        filter_level = default_filt_lvl;
-
-      if (filter_level) {
-        if (cm->filter_type == NORMAL_LOOPFILTER) {
-          const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
-          lfi.mblim = lfi_n->mblim[filter_level];
-          lfi.blim = lfi_n->blim[filter_level];
-          lfi.lim = lfi_n->lim[filter_level];
-          lfi.hev_thr = lfi_n->hev_thr[hev_index];
-
-          if (mb_col > 0)
-            vp9_loop_filter_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi);
-
-          if (!skip_lf)
-            vp9_loop_filter_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi);
-
-          vp9_loop_filter_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi);
-
-          if (!skip_lf)
-            vp9_loop_filter_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi);
-        } else {
-          if (mb_col > 0)
-            vp9_loop_filter_simple_mbv (y_ptr, post->y_stride,
-                                        lfi_n->mblim[filter_level]);
-
-          if (!skip_lf)
-            vp9_loop_filter_simple_bv(y_ptr, post->y_stride,
-                                      lfi_n->blim[filter_level]);
-
-          vp9_loop_filter_simple_mbh(y_ptr, post->y_stride,
-                                     lfi_n->mblim[filter_level]);
-
-          if (!skip_lf)
-            vp9_loop_filter_simple_bh(y_ptr, post->y_stride,
-                                      lfi_n->blim[filter_level]);
-        }
-      }
-
-      y_ptr += 16;
-      mode_info_context += 1;      /* step to next MB */
-    }
-
-    y_ptr += post->y_stride  * 16 - post->y_width;
-    mode_info_context += 1;          /* Skip border mb */
-  }
-}
--- a/vp8/common/loopfilter.h
+++ /dev/null
@@ -1,104 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef loopfilter_h
-#define loopfilter_h
-
-#include "vpx_ports/mem.h"
-#include "vpx_config.h"
-#include "blockd.h"
-
-#define MAX_LOOP_FILTER 63
-
-typedef enum {
-  NORMAL_LOOPFILTER = 0,
-  SIMPLE_LOOPFILTER = 1
-} LOOPFILTERTYPE;
-
-#if ARCH_ARM
-#define SIMD_WIDTH 1
-#else
-#define SIMD_WIDTH 16
-#endif
-
-/* Need to align this structure so when it is declared and
- * passed it can be loaded into vector registers.
- */
-typedef struct {
-  DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
-                  mblim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
-  DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
-                  blim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
-  DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
-                  lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
-  DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
-                  hev_thr[4][SIMD_WIDTH]);
-  unsigned char lvl[4][4][4];
-  unsigned char hev_thr_lut[2][MAX_LOOP_FILTER + 1];
-  unsigned char mode_lf_lut[MB_MODE_COUNT];
-} loop_filter_info_n;
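A usage sketch for the tables above (the helper name is hypothetical; the indexing mirrors the frame loops in loopfilter.c): lvl is indexed by segment, reference frame, and the mode class that mode_lf_lut assigns to each prediction mode.

    static int lookup_filter_level(const loop_filter_info_n *lfi,
                                   int segment_id, int ref_frame, int mode) {
      /* mode_lf_lut collapses MB_MODE_COUNT modes to one of four classes */
      return lfi->lvl[segment_id][ref_frame][lfi->mode_lf_lut[mode]];
    }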
-
-struct loop_filter_info {
-  const unsigned char *mblim;
-  const unsigned char *blim;
-  const unsigned char *lim;
-  const unsigned char *hev_thr;
-};
-
-#define prototype_loopfilter(sym) \
-  void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
-           const unsigned char *limit, const unsigned char *thresh, int count)
-
-#define prototype_loopfilter_block(sym) \
-  void sym(unsigned char *y, unsigned char *u, unsigned char *v, \
-           int ystride, int uv_stride, struct loop_filter_info *lfi)
-
-#define prototype_simple_loopfilter(sym) \
-  void sym(unsigned char *y, int ystride, const unsigned char *blimit)
-
-#if ARCH_X86 || ARCH_X86_64
-#include "x86/loopfilter_x86.h"
-#endif
-
-#if ARCH_ARM
-#include "arm/loopfilter_arm.h"
-#endif
-
-typedef void loop_filter_uvfunction(unsigned char *u,   /* source pointer */
-                                    int p,              /* pitch */
-                                    const unsigned char *blimit,
-                                    const unsigned char *limit,
-                                    const unsigned char *thresh,
-                                    unsigned char *v);
-
-/* assorted loopfilter functions which get used elsewhere */
-struct VP9Common;
-struct macroblockd;
-
-void vp9_loop_filter_init(struct VP9Common *cm);
-
-void vp9_loop_filter_frame_init(struct VP9Common *cm,
-                                struct macroblockd *mbd,
-                                int default_filt_lvl);
-
-void vp9_loop_filter_frame(struct VP9Common *cm, struct macroblockd *mbd);
-
-void vp9_loop_filter_partial_frame(struct VP9Common *cm,
-                                   struct macroblockd *mbd,
-                                   int default_filt_lvl);
-
-void vp9_loop_filter_frame_yonly(struct VP9Common *cm,
-                                 struct macroblockd *mbd,
-                                 int default_filt_lvl);
-
-void vp9_loop_filter_update_sharpness(loop_filter_info_n *lfi,
-                                      int sharpness_lvl);
-
-#endif  // loopfilter_h
--- a/vp8/common/loopfilter_filters.c
+++ /dev/null
@@ -1,480 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stdlib.h>
-#include "vpx_config.h"
-#include "loopfilter.h"
-#include "onyxc_int.h"
-
-typedef unsigned char uc;
-
-static __inline signed char signed_char_clamp(int t) {
-  t = (t < -128 ? -128 : t);
-  t = (t > 127 ? 127 : t);
-  return (signed char) t;
-}
-
-
-/* should we apply any filter at all (11111111 yes, 00000000 no) */
-static __inline signed char filter_mask(uc limit, uc blimit,
-                                        uc p3, uc p2, uc p1, uc p0,
-                                        uc q0, uc q1, uc q2, uc q3) {
-  signed char mask = 0;
-  mask |= (abs(p3 - p2) > limit) * -1;
-  mask |= (abs(p2 - p1) > limit) * -1;
-  mask |= (abs(p1 - p0) > limit) * -1;
-  mask |= (abs(q1 - q0) > limit) * -1;
-  mask |= (abs(q2 - q1) > limit) * -1;
-  mask |= (abs(q3 - q2) > limit) * -1;
-  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-  mask = ~mask;
-  return mask;
-}
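The * -1 idiom above is a branchless way to turn a comparison into an all-ones or all-zeros byte, so the result can be used directly as a select mask. A self-contained demonstration:

    #include <stdio.h>
    #include <stdlib.h>

    int main(void) {
      unsigned char limit = 10, p1 = 50, p0 = 80;
      signed char mask = 0;
      mask |= (abs(p1 - p0) > limit) * -1;  /* true * -1 == all ones */
      mask = ~mask;                         /* invert: 0x00 now means skip */
      /* the 30-step edge exceeds the limit, so this prints mask = 0x00 */
      printf("mask = 0x%02x\n", (unsigned char)mask);
      return 0;
    }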
-
-/* is there a high-variance internal edge (11111111 yes, 00000000 no) */
-static __inline signed char hevmask(uc thresh, uc p1, uc p0, uc q0, uc q1) {
-  signed char hev = 0;
-  hev  |= (abs(p1 - p0) > thresh) * -1;
-  hev  |= (abs(q1 - q0) > thresh) * -1;
-  return hev;
-}
-
-static __inline void filter(signed char mask, uc hev, uc *op1,
-                            uc *op0, uc *oq0, uc *oq1) {
-  signed char ps0, qs0;
-  signed char ps1, qs1;
-  signed char filter, Filter1, Filter2;
-  signed char u;
-
-  ps1 = (signed char) * op1 ^ 0x80;
-  ps0 = (signed char) * op0 ^ 0x80;
-  qs0 = (signed char) * oq0 ^ 0x80;
-  qs1 = (signed char) * oq1 ^ 0x80;
-
-  /* add outer taps if we have high edge variance */
-  filter = signed_char_clamp(ps1 - qs1);
-  filter &= hev;
-
-  /* inner taps */
-  filter = signed_char_clamp(filter + 3 * (qs0 - ps0));
-  filter &= mask;
-
-  /* save bottom 3 bits so that we round one side +4 and the other +3;
-   * if it equals 4 we adjust by -1 to account for the fact that we
-   * would otherwise round 3 the other way
-   */
-  Filter1 = signed_char_clamp(filter + 4);
-  Filter2 = signed_char_clamp(filter + 3);
-  Filter1 >>= 3;
-  Filter2 >>= 3;
-  u = signed_char_clamp(qs0 - Filter1);
-  *oq0 = u ^ 0x80;
-  u = signed_char_clamp(ps0 + Filter2);
-  *op0 = u ^ 0x80;
-  filter = Filter1;
-
-  /* outer tap adjustments */
-  filter += 1;
-  filter >>= 1;
-  filter &= ~hev;
-
-  u = signed_char_clamp(qs1 - filter);
-  *oq1 = u ^ 0x80;
-  u = signed_char_clamp(ps1 + filter);
-  *op1 = u ^ 0x80;
-}
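A self-contained illustration of the +4/+3 rounding split described in the comment above: q0 is moved by (filter + 4) >> 3 and p0 by (filter + 3) >> 3, so the two sides share the correction with opposite rounding (for example, filter = 12 moves q0 by 2 and p0 by 1):

    #include <stdio.h>

    static signed char clamp_sc(int t) {
      return (signed char)(t < -128 ? -128 : (t > 127 ? 127 : t));
    }

    int main(void) {
      int f;
      for (f = 1; f <= 12; f++)
        printf("f=%2d  Filter1=%d  Filter2=%d\n",
               f, clamp_sc(f + 4) >> 3, clamp_sc(f + 3) >> 3);
      return 0;
    }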
-
-void vp9_loop_filter_horizontal_edge_c(unsigned char *s,
-                                       int p, /* pitch */
-                                       const unsigned char *blimit,
-                                       const unsigned char *limit,
-                                       const unsigned char *thresh,
-                                       int count) {
-  int  hev = 0; /* high edge variance */
-  signed char mask = 0;
-  int i = 0;
-
-  /* loop filter designed to work using chars so that we can make maximum use
-   * of 8 bit simd instructions.
-   */
-  do {
-    mask = filter_mask(limit[0], blimit[0],
-                       s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
-                       s[0 * p], s[1 * p], s[2 * p], s[3 * p]);
-
-    hev = hevmask(thresh[0], s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]);
-
-    filter(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p);
-
-    ++s;
-  } while (++i < count * 8);
-}
-
-void vp9_loop_filter_vertical_edge_c(unsigned char *s,
-                                     int p,
-                                     const unsigned char *blimit,
-                                     const unsigned char *limit,
-                                     const unsigned char *thresh,
-                                     int count) {
-  int  hev = 0; /* high edge variance */
-  signed char mask = 0;
-  int i = 0;
-
-  /* loop filter designed to work using chars so that we can make maximum use
-   * of 8 bit simd instructions.
-   */
-  do {
-    mask = filter_mask(limit[0], blimit[0],
-                       s[-4], s[-3], s[-2], s[-1],
-                       s[0], s[1], s[2], s[3]);
-
-    hev = hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
-
-    filter(mask, hev, s - 2, s - 1, s, s + 1);
-
-    s += p;
-  } while (++i < count * 8);
-}
-
-static __inline signed char flatmask(uc thresh,
-                                     uc p4, uc p3, uc p2, uc p1, uc p0,
-                                     uc q0, uc q1, uc q2, uc q3, uc q4) {
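-  /* note: thresh is accepted but unused here; every difference below is
-   * compared against a hard-coded 1 */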
-  signed char flat = 0;
-  flat |= (abs(p1 - p0) > 1) * -1;
-  flat |= (abs(q1 - q0) > 1) * -1;
-  flat |= (abs(p0 - p2) > 1) * -1;
-  flat |= (abs(q0 - q2) > 1) * -1;
-  flat |= (abs(p3 - p0) > 1) * -1;
-  flat |= (abs(q3 - q0) > 1) * -1;
-  flat |= (abs(p4 - p0) > 1) * -1;
-  flat |= (abs(q4 - q0) > 1) * -1;
-  flat = ~flat;
-  return flat;
-}
-
-static __inline void mbfilter(signed char mask, uc hev, uc flat,
-                              uc *op4, uc *op3, uc *op2, uc *op1, uc *op0,
-                              uc *oq0, uc *oq1, uc *oq2, uc *oq3, uc *oq4) {
-  /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
-  if (flat && mask) {
-    unsigned char p0, q0;
-    unsigned char p1, q1;
-    unsigned char p2, q2;
-    unsigned char p3, q3;
-    unsigned char p4, q4;
-
-    p4 = *op4;
-    p3 = *op3;
-    p2 = *op2;
-    p1 = *op1;
-    p0 = *op0;
-    q0 = *oq0;
-    q1 = *oq1;
-    q2 = *oq2;
-    q3 = *oq3;
-    q4 = *oq4;
-
-    *op2 = (p4 + p4 + p3 + p2 + p2 + p1 + p0 + q0 + 4) >> 3;
-    *op1 = (p4 + p3 + p2 + p1 + p1 + p0 + q0 + q1 + 4) >> 3;
-    *op0 = (p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2 + 4) >> 3;
-    *oq0 = (p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3 + 4) >> 3;
-    *oq1 = (p1 + p0 + q0 + q1 + q1 + q2 + q3 + q4 + 4) >> 3;
-    *oq2 = (p0 + q0 + q1 + q2 + q2 + q3 + q4 + q4 + 4) >> 3;
-  } else {
-    signed char ps0, qs0;
-    signed char ps1, qs1;
-    signed char filter, Filter1, Filter2;
-    signed char u;
-
-    ps1 = (signed char) * op1 ^ 0x80;
-    ps0 = (signed char) * op0 ^ 0x80;
-    qs0 = (signed char) * oq0 ^ 0x80;
-    qs1 = (signed char) * oq1 ^ 0x80;
-
-    /* add outer taps if we have high edge variance */
-    filter = signed_char_clamp(ps1 - qs1);
-    filter &= hev;
-
-    /* inner taps */
-    filter = signed_char_clamp(filter + 3 * (qs0 - ps0));
-    filter &= mask;
-
-    Filter1 = signed_char_clamp(filter + 4);
-    Filter2 = signed_char_clamp(filter + 3);
-    Filter1 >>= 3;
-    Filter2 >>= 3;
-
-    u = signed_char_clamp(qs0 - Filter1);
-    *oq0 = u ^ 0x80;
-    u = signed_char_clamp(ps0 + Filter2);
-    *op0 = u ^ 0x80;
-    filter = Filter1;
-
-    /* outer tap adjustments */
-    filter += 1;
-    filter >>= 1;
-    filter &= ~hev;
-
-    u = signed_char_clamp(qs1 - filter);
-    *oq1 = u ^ 0x80;
-    u = signed_char_clamp(ps1 + filter);
-    *op1 = u ^ 0x80;
-  }
-}
-
-void vp9_mbloop_filter_horizontal_edge_c(unsigned char *s,
-                                         int p,
-                                         const unsigned char *blimit,
-                                         const unsigned char *limit,
-                                         const unsigned char *thresh,
-                                         int count) {
-  signed char hev = 0; /* high edge variance */
-  signed char mask = 0;
-  signed char flat = 0;
-  int i = 0;
-
-  /* loop filter designed to work using chars so that we can make maximum use
-   * of 8 bit simd instructions.
-   */
-  do {
-    mask = filter_mask(limit[0], blimit[0],
-                       s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
-                       s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p]);
-
-    hev = hevmask(thresh[0], s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]);
-
-    flat = flatmask(thresh[0],
-                    s[-5 * p], s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
-                    s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p], s[ 4 * p]);
-    mbfilter(mask, hev, flat,
-             s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
-             s,       s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p);
-
-    ++s;
-  } while (++i < count * 8);
-}
-
-void vp9_mbloop_filter_vertical_edge_c(unsigned char *s,
-                                       int p,
-                                       const unsigned char *blimit,
-                                       const unsigned char *limit,
-                                       const unsigned char *thresh,
-                                       int count) {
-  signed char hev = 0; /* high edge variance */
-  signed char mask = 0;
-  signed char flat = 0;
-  int i = 0;
-
-  do {
-    mask = filter_mask(limit[0], blimit[0],
-                       s[-4], s[-3], s[-2], s[-1],
-                       s[0], s[1], s[2], s[3]);
-
-    hev = hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
-    flat = flatmask(thresh[0],
-                    s[-5], s[-4], s[-3], s[-2], s[-1],
-                    s[ 0], s[ 1], s[ 2], s[ 3], s[ 4]);
-    mbfilter(mask, hev, flat,
-             s - 5, s - 4, s - 3, s - 2, s - 1,
-             s,     s + 1, s + 2, s + 3, s + 4);
-    s += p;
-  } while (++i < count * 8);
-}
-
-/* should we apply any filter at all (11111111 yes, 00000000 no) */
-static __inline signed char simple_filter_mask(uc blimit,
-                                               uc p1, uc p0,
-                                               uc q0, uc q1) {
-  /* Why does this cause problems for win32?
-   * error C2143: syntax error : missing ';' before 'type'
-   *  (void) limit;
-   */
-  signed char mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  <= blimit) * -1;
-  return mask;
-}
-
-static __inline void simple_filter(signed char mask,
-                                   uc *op1, uc *op0,
-                                   uc *oq0, uc *oq1) {
-  signed char filter, Filter1, Filter2;
-  signed char p1 = (signed char) * op1 ^ 0x80;
-  signed char p0 = (signed char) * op0 ^ 0x80;
-  signed char q0 = (signed char) * oq0 ^ 0x80;
-  signed char q1 = (signed char) * oq1 ^ 0x80;
-  signed char u;
-
-  filter = signed_char_clamp(p1 - q1);
-  filter = signed_char_clamp(filter + 3 * (q0 - p0));
-  filter &= mask;
-
-  /* save bottom 3 bits so that we round one side +4 and the other +3 */
-  Filter1 = signed_char_clamp(filter + 4);
-  Filter1 >>= 3;
-  u = signed_char_clamp(q0 - Filter1);
-  *oq0  = u ^ 0x80;
-
-  Filter2 = signed_char_clamp(filter + 3);
-  Filter2 >>= 3;
-  u = signed_char_clamp(p0 + Filter2);
-  *op0 = u ^ 0x80;
-}
-
-void vp9_loop_filter_simple_horizontal_edge_c(unsigned char *s, int p,
-                                              const unsigned char *blimit) {
-  signed char mask = 0;
-  int i = 0;
-
-  do {
-    mask = simple_filter_mask(blimit[0],
-                              s[-2 * p], s[-1 * p],
-                              s[0 * p], s[1 * p]);
-    simple_filter(mask,
-                  s - 2 * p, s - 1 * p,
-                  s, s + 1 * p);
-    ++s;
-  } while (++i < 16);
-}
-
-void vp9_loop_filter_simple_vertical_edge_c(unsigned char *s, int p,
-                                            const unsigned char *blimit) {
-  signed char mask = 0;
-  int i = 0;
-
-  do {
-    mask = simple_filter_mask(blimit[0], s[-2], s[-1], s[0], s[1]);
-    simple_filter(mask, s - 2, s - 1, s, s + 1);
-    s += p;
-  } while (++i < 16);
-}
-
-/* Vertical MB Filtering */
-void vp9_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr,
-                           unsigned char *v_ptr, int y_stride, int uv_stride,
-                           struct loop_filter_info *lfi) {
-  vp9_mbloop_filter_vertical_edge_c(y_ptr, y_stride,
-                                    lfi->mblim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp9_mbloop_filter_vertical_edge_c(u_ptr, uv_stride,
-                                      lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp9_mbloop_filter_vertical_edge_c(v_ptr, uv_stride,
-                                      lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-}
-
-/* Vertical B Filtering */
-void vp9_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr,
-                          unsigned char *v_ptr, int y_stride, int uv_stride,
-                          struct loop_filter_info *lfi) {
-  vp9_loop_filter_vertical_edge_c(y_ptr + 4, y_stride,
-                                  lfi->blim, lfi->lim, lfi->hev_thr, 2);
-  vp9_loop_filter_vertical_edge_c(y_ptr + 8, y_stride,
-                                  lfi->blim, lfi->lim, lfi->hev_thr, 2);
-  vp9_loop_filter_vertical_edge_c(y_ptr + 12, y_stride,
-                                  lfi->blim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp9_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride,
-                                    lfi->blim, lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp9_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride,
-                                    lfi->blim, lfi->lim, lfi->hev_thr, 1);
-}
-
-/* Horizontal MB filtering */
-void vp9_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr,
-                           unsigned char *v_ptr, int y_stride, int uv_stride,
-                           struct loop_filter_info *lfi) {
-  vp9_mbloop_filter_horizontal_edge_c(y_ptr, y_stride,
-                                      lfi->mblim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp9_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride,
-                                        lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp9_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride,
-                                        lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-}
-
-/* Horizontal B Filtering */
-void vp9_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr,
-                          unsigned char *v_ptr, int y_stride, int uv_stride,
-                          struct loop_filter_info *lfi) {
-  vp9_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride,
-                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);
-  vp9_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride,
-                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);
-  vp9_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride,
-                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp9_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride,
-                                      lfi->blim, lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp9_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride,
-                                      lfi->blim, lfi->lim, lfi->hev_thr, 1);
-}
-
-void vp9_loop_filter_bh8x8_c(unsigned char *y_ptr, unsigned char *u_ptr,
-                             unsigned char *v_ptr, int y_stride, int uv_stride,
-                             struct loop_filter_info *lfi) {
-  vp9_mbloop_filter_horizontal_edge_c(
-    y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-}
-
-void vp9_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride,
-                           const unsigned char *blimit) {
-  vp9_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride,
-                                           y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride,
-                                           y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride,
-                                           y_stride, blimit);
-}
-
-void vp9_loop_filter_bv8x8_c(unsigned char *y_ptr, unsigned char *u_ptr,
-                             unsigned char *v_ptr, int y_stride, int uv_stride,
-                             struct loop_filter_info *lfi) {
-  vp9_mbloop_filter_vertical_edge_c(
-    y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-}
-
-void vp9_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride,
-                           const unsigned char *blimit) {
-  vp9_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, blimit);
-  vp9_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, blimit);
-  vp9_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, blimit);
-}
--- a/vp8/common/maskingmv.c
+++ /dev/null
@@ -1,806 +1,0 @@
-/*
- ============================================================================
- Name        : maskingmv.c
- Author      : jimbankoski
- Version     :
- Copyright   : Your copyright notice
- Description : Hello World in C, Ansi-style
- ============================================================================
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-extern unsigned int vp9_sad16x16_sse3(
-  unsigned char *src_ptr,
-  int  src_stride,
-  unsigned char *ref_ptr,
-  int  ref_stride,
-  int  max_err);
-
-extern void vp9_sad16x16x3_sse3(
-  unsigned char *src_ptr,
-  int  src_stride,
-  unsigned char *ref_ptr,
-  int  ref_stride,
-  int  *results);
-
-extern int vp8_growmaskmb_sse3(
-  unsigned char *om,
-  unsigned char *nm);
-
-extern void vp8_makemask_sse3(
-  unsigned char *y,
-  unsigned char *u,
-  unsigned char *v,
-  unsigned char *ym,
-  int yp,
-  int uvp,
-  int ys,
-  int us,
-  int vs,
-  int yt,
-  int ut,
-  int vt);
-
-unsigned int vp9_sad16x16_unmasked_wmt(
-  unsigned char *src_ptr,
-  int  src_stride,
-  unsigned char *ref_ptr,
-  int  ref_stride,
-  unsigned char *mask);
-
-unsigned int vp9_sad16x16_masked_wmt(
-  unsigned char *src_ptr,
-  int  src_stride,
-  unsigned char *ref_ptr,
-  int  ref_stride,
-  unsigned char *mask);
-
-unsigned int vp8_masked_predictor_wmt(
-  unsigned char *masked,
-  unsigned char *unmasked,
-  int  src_stride,
-  unsigned char *dst_ptr,
-  int  dst_stride,
-  unsigned char *mask);
-unsigned int vp8_masked_predictor_uv_wmt(
-  unsigned char *masked,
-  unsigned char *unmasked,
-  int  src_stride,
-  unsigned char *dst_ptr,
-  int  dst_stride,
-  unsigned char *mask);
-unsigned int vp8_uv_from_y_mask(
-  unsigned char *ymask,
-  unsigned char *uvmask);
-int yp = 16;
-unsigned char sxy[] = {
-  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90
-};
-
-unsigned char sts[] = {
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-};
-unsigned char str[] = {
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
-};
-
-unsigned char y[] = {
-  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
-  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
-  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40,
-  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40,
-  40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40,
-  60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40,
-  60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40,
-  60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40,
-  40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40,
-  40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40,
-  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40,
-  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40,
-  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
-  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
-  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
-  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40
-};
-int uvp = 8;
-unsigned char u[] = {
-  90, 80, 70, 70, 90, 90, 90, 17,
-  90, 80, 70, 70, 90, 90, 90, 17,
-  84, 70, 70, 90, 90, 90, 17, 17,
-  84, 70, 70, 90, 90, 90, 17, 17,
-  80, 70, 70, 90, 90, 90, 17, 17,
-  90, 80, 70, 70, 90, 90, 90, 17,
-  90, 80, 70, 70, 90, 90, 90, 17,
-  90, 80, 70, 70, 90, 90, 90, 17
-};
-
-unsigned char v[] = {
-  80, 80, 80, 80, 80, 80, 80, 80,
-  80, 80, 80, 80, 80, 80, 80, 80,
-  80, 80, 80, 80, 80, 80, 80, 80,
-  80, 80, 80, 80, 80, 80, 80, 80,
-  80, 80, 80, 80, 80, 80, 80, 80,
-  80, 80, 80, 80, 80, 80, 80, 80,
-  80, 80, 80, 80, 80, 80, 80, 80,
-  80, 80, 80, 80, 80, 80, 80, 80
-};
-
-unsigned char ym[256];
-unsigned char uvm[64];
-typedef struct {
-  unsigned char y;
-  unsigned char yt;
-  unsigned char u;
-  unsigned char ut;
-  unsigned char v;
-  unsigned char vt;
-  unsigned char use;
-} COLOR_SEG_ELEMENT;
-
-/*
-COLOR_SEG_ELEMENT segmentation[]=
-{
-    { 60,4,80,17,80,10, 1},
-    { 40,4,15,10,80,10, 1},
-};
-*/
-
-COLOR_SEG_ELEMENT segmentation[] = {
-  { 79, 44, 92, 44, 237, 60, 1},
-};
-
-unsigned char pixel_mask(unsigned char y, unsigned char u, unsigned char v,
-                         COLOR_SEG_ELEMENT sgm[],
-                         int c) {
-  COLOR_SEG_ELEMENT *s = sgm;
-  unsigned char m = 0;
-  int i;
-  for (i = 0; i < c; i++, s++)
-    m |= (abs(y - s->y) < s->yt &&
-          abs(u - s->u) < s->ut &&
-          abs(v - s->v) < s->vt ? 255 : 0);
-
-  return m;
-}
-int neighbors[256][8];
-int makeneighbors(void) {
-  int i, j;
-  for (i = 0; i < 256; i++) {
-    int r = (i >> 4), c = (i & 15);
-    int ni = 0;
-    for (j = 0; j < 8; j++)
-      neighbors[i][j] = i;
-    /* collect the (up to 8) true neighbors; skip i itself, which is
-     * already ORed in by grow_ymask and would overflow the 8 slots */
-    for (j = 0; j < 256; j++) {
-      int nr = (j >> 4), nc = (j & 15);
-      if (j != i && abs(nr - r) < 2 && abs(nc - c) < 2)
-        neighbors[i][ni++] = j;
-    }
-  }
-  return 0;
-}
-
-void grow_ymask(unsigned char *ym) {
-  unsigned char nym[256];
-  int i, j;
-
-  for (i = 0; i < 256; i++) {
-    nym[i] = ym[i];
-    for (j = 0; j < 8; j++) {
-      nym[i] |= ym[neighbors[i][j]];
-    }
-  }
-  for (i = 0; i < 256; i++)
-    ym[i] = nym[i];
-}
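Usage sketch for the two helpers above (values illustrative): makeneighbors must run once before any mask is grown; grow_ymask then dilates a 16x16 mask by one pixel in every direction.

    static void grow_ymask_demo(void) {
      unsigned char mask[256] = {0};
      mask[8 * 16 + 8] = 255;  /* single set pixel at row 8, col 8 */
      makeneighbors();         /* build the neighbor table once */
      grow_ymask(mask);        /* the full 3x3 block around (8, 8) is now set */
    }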
-void make_mb_mask(unsigned char *y, unsigned char *u, unsigned char *v,
-                  unsigned char *ym, unsigned char *uvm,
-                  int yp, int uvp,
-                  COLOR_SEG_ELEMENT sgm[],
-                  int count) {
-  int r, c;
-  unsigned char *oym = ym;
-
-  memset(ym, 20, 256);
-  for (r = 0; r < 8; r++, uvm += 8, u += uvp, v += uvp, y += (yp + yp), ym += 32)
-    for (c = 0; c < 8; c++) {
-      int y1 = y[c << 1];
-      int u1 = u[c];
-      int v1 = v[c];
-      int m = pixel_mask(y1, u1, v1, sgm, count);
-      uvm[c] = m;
-      ym[c << 1] = uvm[c]; // = pixel_mask(y[c<<1],u[c],v[c],sgm,count);
-      ym[(c << 1) + 1] = pixel_mask(y[1 + (c << 1)], u[c], v[c], sgm, count);
-      ym[(c << 1) + 16] = pixel_mask(y[yp + (c << 1)], u[c], v[c], sgm, count);
-      ym[(c << 1) + 17] = pixel_mask(y[1 + yp + (c << 1)], u[c], v[c], sgm, count);
-    }
-  grow_ymask(oym);
-}
-
-int masked_sad(unsigned char *src, int p, unsigned char *dst, int dp,
-               unsigned char *ym) {
-  int i, j;
-  unsigned sad = 0;
-  for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16)
-    for (j = 0; j < 16; j++)
-      if (ym[j])
-        sad += abs(src[j] - dst[j]);
-
-  return sad;
-}
-
-int compare_masks(unsigned char *sym, unsigned char *ym) {
-  int i, j;
-  unsigned sad = 0;
-  for (i = 0; i < 16; i++, sym += 16, ym += 16)
-    for (j = 0; j < 16; j++)
-      sad += (sym[j] != ym[j] ? 1 : 0);
-
-  return sad;
-}
-int unmasked_sad(unsigned char *src, int p, unsigned char *dst, int dp,
-                 unsigned char *ym) {
-  int i, j;
-  unsigned sad = 0;
-  for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16)
-    for (j = 0; j < 16; j++)
-      if (!ym[j])
-        sad += abs(src[j] - dst[j]);
-
-  return sad;
-}
-int masked_motion_search(unsigned char *y, unsigned char *u, unsigned char *v,
-                         int yp, int uvp,
-                         unsigned char *dy, unsigned char *du, unsigned char *dv,
-                         int dyp, int duvp,
-                         COLOR_SEG_ELEMENT sgm[],
-                         int count,
-                         int *mi,
-                         int *mj,
-                         int *ui,
-                         int *uj,
-                         int *wm) {
-  int i, j;
-
-  unsigned char ym[256];
-  unsigned char uvm[64];
-  unsigned char dym[256];
-  unsigned char duvm[64];
-  unsigned int e = 0;
-  int beste = 256;
-  int bmi = -32, bmj = -32;
-  int bui = -32, buj = -32;
-  int beste1 = 256;
-  int bmi1 = -32, bmj1 = -32;
-  int bui1 = -32, buj1 = -32;
-  int obeste;
-
-  // first try finding best mask and then unmasked
-  beste = 0xffffffff;
-
-  // find best unmasked mv
-  for (i = -32; i < 32; i++) {
-    unsigned char *dyz = i * dyp + dy;
-    unsigned char *duz = i / 2 * duvp + du;
-    unsigned char *dvz = i / 2 * duvp + dv;
-    for (j = -32; j < 32; j++) {
-      // 0,0  masked destination
-      make_mb_mask(dyz + j, duz + j / 2, dvz + j / 2, dym, duvm, dyp, duvp, sgm, count);
-
-      e = unmasked_sad(y, yp, dyz + j, dyp, dym);
-
-      if (e < beste) {
-        bui = i;
-        buj = j;
-        beste = e;
-      }
-    }
-  }
-  // bui=0;buj=0;
-  // best mv masked destination
-  make_mb_mask(dy + bui * dyp + buj, du + bui / 2 * duvp + buj / 2, dv + bui / 2 * duvp + buj / 2,
-               dym, duvm, dyp, duvp, sgm, count);
-
-  obeste = beste;
-  beste = 0xffffffff;
-
-  // find best masked
-  for (i = -32; i < 32; i++) {
-    unsigned char *dyz = i * dyp + dy;
-    for (j = -32; j < 32; j++) {
-      e = masked_sad(y, yp, dyz + j, dyp, dym);
-
-      if (e < beste) {
-        bmi = i;
-        bmj = j;
-        beste = e;
-      }
-    }
-  }
-  beste1 = beste + obeste;
-  bmi1 = bmi;
-  bmj1 = bmj;
-  bui1 = bui;
-  buj1 = buj;
-
-  beste = 0xffffffff;
-  // source mask
-  make_mb_mask(y, u, v, ym, uvm, yp, uvp, sgm, count);
-
-  // find best mask
-  for (i = -32; i < 32; i++) {
-    unsigned char *dyz = i * dyp + dy;
-    unsigned char *duz = i / 2 * duvp + du;
-    unsigned char *dvz = i / 2 * duvp + dv;
-    for (j = -32; j < 32; j++) {
-      // 0,0  masked destination
-      make_mb_mask(dyz + j, duz + j / 2, dvz + j / 2, dym, duvm, dyp, duvp, sgm, count);
-
-      e = compare_masks(ym, dym);
-
-      if (e < beste) {
-        bmi = i;
-        bmj = j;
-        beste = e;
-      }
-    }
-  }
-
-  // best mv masked destination
-  make_mb_mask(dy + bmi * dyp + bmj, du + bmi / 2 * duvp + bmj / 2, dv + bmi / 2 * duvp + bmj / 2,
-               dym, duvm, dyp, duvp, sgm, count);
-
-  obeste = masked_sad(y, yp, dy + bmi * dyp + bmj, dyp, dym);
-
-  beste = 0xffffffff;
-
-  // find best unmasked mv
-  for (i = -32; i < 32; i++) {
-    unsigned char *dyz = i * dyp + dy;
-    for (j = -32; j < 32; j++) {
-      e = unmasked_sad(y, yp, dyz + j, dyp, dym);
-
-      if (e < beste) {
-        bui = i;
-        buj = j;
-        beste = e;
-      }
-    }
-  }
-  beste += obeste;
-
-  if (beste < beste1) {
-    *mi = bmi;
-    *mj = bmj;
-    *ui = bui;
-    *uj = buj;
-    *wm = 1;
-  } else {
-    *mi = bmi1;
-    *mj = bmj1;
-    *ui = bui1;
-    *uj = buj1;
-    *wm = 0;
-
-  }
-}
-
-int predict(unsigned char *src, int p, unsigned char *dst, int dp,
-            unsigned char *ym, unsigned char *prd) {
-  int i, j;
-  for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16, prd += 16)
-    for (j = 0; j < 16; j++)
-      prd[j] = (ym[j] ? src[j] : dst[j]);
-  return 0;
-}
-
-int fast_masked_motion_search(unsigned char *y, unsigned char *u, unsigned char *v,
-                              int yp, int uvp,
-                              unsigned char *dy, unsigned char *du, unsigned char *dv,
-                              int dyp, int duvp,
-                              COLOR_SEG_ELEMENT sgm[],
-                              int count,
-                              int *mi,
-                              int *mj,
-                              int *ui,
-                              int *uj,
-                              int *wm) {
-  int i, j;
-
-  unsigned char ym[256];
-  unsigned char ym2[256];
-  unsigned char uvm[64];
-  unsigned char dym2[256];
-  unsigned char dym[256];
-  unsigned char duvm[64];
-  unsigned int e = 0;
-  int beste = 256;
-  int bmi = -32, bmj = -32;
-  int bui = -32, buj = -32;
-  int beste1 = 256;
-  int bmi1 = -32, bmj1 = -32;
-  int bui1 = -32, buj1 = -32;
-  int obeste;
-
-  // first try finding best mask and then unmasked
-  beste = 0xffffffff;
-
-#if 0
-  for (i = 0; i < 16; i++) {
-    unsigned char *dy = i * yp + y;
-    for (j = 0; j < 16; j++)
-      printf("%2x", dy[j]);
-    printf("\n");
-  }
-  printf("\n");
-
-  for (i = -32; i < 48; i++) {
-    unsigned char *dyz = i * dyp + dy;
-    for (j = -32; j < 48; j++)
-      printf("%2x", dyz[j]);
-    printf("\n");
-  }
-#endif
-
-  // find best unmasked mv
-  for (i = -32; i < 32; i++) {
-    unsigned char *dyz = i * dyp + dy;
-    unsigned char *duz = i / 2 * duvp + du;
-    unsigned char *dvz = i / 2 * duvp + dv;
-    for (j = -32; j < 32; j++) {
-      // 0,0  masked destination
-      vp8_makemask_sse3(dyz + j, duz + j / 2, dvz + j / 2, dym, dyp, duvp,
-                        sgm[0].y, sgm[0].u, sgm[0].v,
-                        sgm[0].yt, sgm[0].ut, sgm[0].vt);
-
-      vp8_growmaskmb_sse3(dym, dym2);
-
-      e = vp9_sad16x16_unmasked_wmt(y, yp, dyz + j, dyp, dym2);
-
-      if (e < beste) {
-        bui = i;
-        buj = j;
-        beste = e;
-      }
-    }
-  }
-  // bui=0;buj=0;
-  // best mv masked destination
-
-  vp8_makemask_sse3(dy + bui * dyp + buj, du + bui / 2 * duvp + buj / 2, dv + bui / 2 * duvp + buj / 2,
-                    dym, dyp, duvp,
-                    sgm[0].y, sgm[0].u, sgm[0].v,
-                    sgm[0].yt, sgm[0].ut, sgm[0].vt);
-
-  vp8_growmaskmb_sse3(dym, dym2);
-
-  obeste = beste;
-  beste = 0xffffffff;
-
-  // find best masked
-  for (i = -32; i < 32; i++) {
-    unsigned char *dyz = i * dyp + dy;
-    for (j = -32; j < 32; j++) {
-      e = vp9_sad16x16_masked_wmt(y, yp, dyz + j, dyp, dym2);
-      if (e < beste) {
-        bmi = i;
-        bmj = j;
-        beste = e;
-      }
-    }
-  }
-  beste1 = beste + obeste;
-  bmi1 = bmi;
-  bmj1 = bmj;
-  bui1 = bui;
-  buj1 = buj;
-
-  // source mask
-  vp8_makemask_sse3(y, u, v,
-                    ym, yp, uvp,
-                    sgm[0].y, sgm[0].u, sgm[0].v,
-                    sgm[0].yt, sgm[0].ut, sgm[0].vt);
-
-  vp8_growmaskmb_sse3(ym, ym2);
-
-  // find best mask
-  for (i = -32; i < 32; i++) {
-    unsigned char *dyz = i * dyp + dy;
-    unsigned char *duz = i / 2 * duvp + du;
-    unsigned char *dvz = i / 2 * duvp + dv;
-    for (j = -32; j < 32; j++) {
-      // 0,0  masked destination
-      vp8_makemask_sse3(dyz + j, duz + j / 2, dvz + j / 2, dym, dyp, duvp,
-                        sgm[0].y, sgm[0].u, sgm[0].v,
-                        sgm[0].yt, sgm[0].ut, sgm[0].vt);
-
-      vp8_growmaskmb_sse3(dym, dym2);
-
-      e = compare_masks(ym2, dym2);
-
-      if (e < beste) {
-        bmi = i;
-        bmj = j;
-        beste = e;
-      }
-    }
-  }
-
-  vp8_makemask_sse3(dy + bmi * dyp + bmj, du + bmi / 2 * duvp + bmj / 2, dv + bmi / 2 * duvp + bmj / 2,
-                    dym, dyp, duvp,
-                    sgm[0].y, sgm[0].u, sgm[0].v,
-                    sgm[0].yt, sgm[0].ut, sgm[0].vt);
-
-  vp8_growmaskmb_sse3(dym, dym2);
-
-  obeste = vp9_sad16x16_masked_wmt(y, yp, dy + bmi * dyp + bmj, dyp, dym2);
-
-  beste = 0xffffffff;
-
-  // find best unmasked mv
-  for (i = -32; i < 32; i++) {
-    unsigned char *dyz = i * dyp + dy;
-    for (j = -32; j < 32; j++) {
-      e = vp9_sad16x16_unmasked_wmt(y, yp, dyz + j, dyp, dym2);
-
-      if (e < beste) {
-        bui = i;
-        buj = j;
-        beste = e;
-      }
-    }
-  }
-  beste += obeste;
-
-  if (beste < beste1) {
-    *mi = bmi;
-    *mj = bmj;
-    *ui = bui;
-    *uj = buj;
-    *wm = 1;
-  } else {
-    *mi = bmi1;
-    *mj = bmj1;
-    *ui = bui1;
-    *uj = buj1;
-    *wm = 0;
-    beste = beste1;
-
-  }
-  return beste;
-}
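For scale: each of the three scans above is an exhaustive +/-32-pel full search, so the per-call cost is easy to bound. A rough count, assuming the five color candidates the caller below tries per macroblock:

    // 64 * 64 = 4096 positions per scan, 3 scans per call, 5 candidate
    // segmentations per macroblock:
    //   4096 * 3 * 5 = 61,440 SAD / mask-compare evaluations per MB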
-
-int predict_all(unsigned char *ym, unsigned char *um, unsigned char *vm,
-                int ymp, int uvmp,
-                unsigned char *yp, unsigned char *up, unsigned char *vp,
-                int ypp, int uvpp,
-                COLOR_SEG_ELEMENT sgm[],
-                int count,
-                int mi,
-                int mj,
-                int ui,
-                int uj,
-                int wm) {
-  int i, j;
-  unsigned char dym[256];
-  unsigned char dym2[256];
-  unsigned char duvm[64];
-  unsigned char *yu = ym, *uu = um, *vu = vm;
-
-  unsigned char *dym3 = dym2;
-
-  ym += mi * ymp + mj;
-  um += mi / 2 * uvmp + mj / 2;
-  vm += mi / 2 * uvmp + mj / 2;
-
-  yu += ui * ymp + uj;
-  uu += ui / 2 * uvmp + uj / 2;
-  vu += ui / 2 * uvmp + uj / 2;
-
-  // best mv masked destination
-  if (wm)
-    vp8_makemask_sse3(ym, um, vm, dym, ymp, uvmp,
-                      sgm[0].y, sgm[0].u, sgm[0].v,
-                      sgm[0].yt, sgm[0].ut, sgm[0].vt);
-  else
-    vp8_makemask_sse3(yu, uu, vu, dym, ymp, uvmp,
-                      sgm[0].y, sgm[0].u, sgm[0].v,
-                      sgm[0].yt, sgm[0].ut, sgm[0].vt);
-
-  vp8_growmaskmb_sse3(dym, dym2);
-  vp8_masked_predictor_wmt(ym, yu, ymp, yp, ypp, dym3);
-  vp8_uv_from_y_mask(dym3, duvm);
-  vp8_masked_predictor_uv_wmt(um, uu, uvmp, up, uvpp, duvm);
-  vp8_masked_predictor_uv_wmt(vm, vu, uvmp, vp, uvpp, duvm);
-
-  return 0;
-}
-
-unsigned char f0p[1280 * 720 * 3 / 2];
-unsigned char f1p[1280 * 720 * 3 / 2];
-unsigned char prd[1280 * 720 * 3 / 2];
-unsigned char msk[1280 * 720 * 3 / 2];
-
-
-int mainz(int argc, char *argv[]) {
-
-  FILE *f = fopen(argv[1], "rb");
-  FILE *g = fopen(argv[2], "wb");
-  int w = atoi(argv[3]), h = atoi(argv[4]);
-  int y_stride = w, uv_stride = w / 2;
-  int r, c;
-  unsigned char *f0 = f0p, *f1 = f1p, *t;
-  unsigned char ym[256], uvm[64];
-  unsigned char ym2[256], uvm2[64];
-  unsigned char ym3[256], uvm3[64];
-  int a, b;
-
-  COLOR_SEG_ELEMENT last = { 20, 20, 20, 20, 230, 20, 1}, best;
-#if 0
-  makeneighbors();
-  COLOR_SEG_ELEMENT segmentation[] = {
-    { 60, 4, 80, 17, 80, 10, 1},
-    { 40, 4, 15, 10, 80, 10, 1},
-  };
-  make_mb_mask(y, u, v, ym2, uvm2, 16, 8, segmentation, 1);
-
-  vp8_makemask_sse3(y, u, v, ym, (int) 16, (int) 8,
-                    (int) segmentation[0].y, (int) segmentation[0].u, (int) segmentation[0].v,
-                    segmentation[0].yt, segmentation[0].ut, segmentation[0].vt);
-
-  vp8_growmaskmb_sse3(ym, ym3);
-
-  a = vp9_sad16x16_masked_wmt(str, 16, sts, 16, ym3);
-  b = vp9_sad16x16_unmasked_wmt(str, 16, sts, 16, ym3);
-
-  vp8_masked_predictor_wmt(str, sts, 16, ym, 16, ym3);
-
-  vp8_uv_from_y_mask(ym3, uvm3);
-
-  return 4;
-#endif
-  makeneighbors();
-
-
-  memset(prd, 128, w * h * 3 / 2);
-
-  fread(f0, w * h * 3 / 2, 1, f);
-
-  while (!feof(f)) {
-    unsigned char *ys = f1, *yd = f0, *yp = prd;
-    unsigned char *us = f1 + w * h, *ud = f0 + w * h, *up = prd + w * h;
-    unsigned char *vs = f1 + w * h * 5 / 4, *vd = f0 + w * h * 5 / 4, *vp = prd + w * h * 5 / 4;
-    fread(f1, w * h * 3 / 2, 1, f);
-
-    ys += 32 * y_stride;
-    yd += 32 * y_stride;
-    yp += 32 * y_stride;
-    us += 16 * uv_stride;
-    ud += 16 * uv_stride;
-    up += 16 * uv_stride;
-    vs += 16 * uv_stride;
-    vd += 16 * uv_stride;
-    vp += 16 * uv_stride;
-    for (r = 32; r < h - 32; r += 16,
-         ys += 16 * w, yd += 16 * w, yp += 16 * w,
-         us += 8 * uv_stride, ud += 8 * uv_stride, up += 8 * uv_stride,
-         vs += 8 * uv_stride, vd += 8 * uv_stride, vp += 8 * uv_stride) {
-      for (c = 32; c < w - 32; c += 16) {
-        int mi, mj, ui, uj, wm;
-        int bmi, bmj, bui, buj, bwm;
-        unsigned char ym[256];
-
-        if (vp9_sad16x16_sse3(ys + c, y_stride, yd + c, y_stride, 0xffff) == 0)
-          bmi = bmj = bui = buj = bwm = 0;
-        else {
-          COLOR_SEG_ELEMENT cs[5];
-          int j;
-          unsigned int beste = 0xffffffff;
-          unsigned int bestj = 0;
-
-          // try color from last mb segmentation
-          cs[0] = last;
-
-          // try color segs from 4 pixels in mb recon as segmentation
-          cs[1].y = yd[c + y_stride + 1];
-          cs[1].u = ud[c / 2 + uv_stride];
-          cs[1].v = vd[c / 2 + uv_stride];
-          cs[1].yt = cs[1].ut = cs[1].vt = 20;
-          cs[2].y = yd[c + y_stride + 14];
-          cs[2].u = ud[c / 2 + uv_stride + 7];
-          cs[2].v = vd[c / 2 + uv_stride + 7];
-          cs[2].yt = cs[2].ut = cs[2].vt = 20;
-          cs[3].y = yd[c + y_stride * 14 + 1];
-          cs[3].u = ud[c / 2 + uv_stride * 7];
-          cs[3].v = vd[c / 2 + uv_stride * 7];
-          cs[3].yt = cs[3].ut = cs[3].vt = 20;
-          cs[4].y = yd[c + y_stride * 14 + 14];
-          cs[4].u = ud[c / 2 + uv_stride * 7 + 7];
-          cs[4].v = vd[c / 2 + uv_stride * 7 + 7];
-          cs[4].yt = cs[4].ut = cs[4].vt = 20;
-
-          for (j = 0; j < 5; j++) {
-            int e;
-
-            e = fast_masked_motion_search(
-                  ys + c, us + c / 2, vs + c / 2, y_stride, uv_stride,
-                  yd + c, ud + c / 2, vd + c / 2, y_stride, uv_stride,
-                  &cs[j], 1, &mi, &mj, &ui, &uj, &wm);
-
-            if (e < beste) {
-              bmi = mi;
-              bmj = mj;
-              bui = ui;
-              buj = uj;
-              bwm = wm;
-              bestj = j;
-              beste = e;
-            }
-          }
-          best = cs[bestj];
-          // best = segmentation[0];
-          last = best;
-        }
-        predict_all(yd + c, ud + c / 2, vd + c / 2, w, uv_stride,
-                    yp + c, up + c / 2, vp + c / 2, w, uv_stride,
-                    &best, 1, bmi, bmj, bui, buj, bwm);
-
-      }
-    }
-    fwrite(prd, w * h * 3 / 2, 1, g);
-    t = f0;
-    f0 = f1;
-    f1 = t;
-
-  }
-  fclose(f);
-  fclose(g);
-  return 0;
-}
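mainz() expects a raw planar 4:2:0 YUV file, an output path, and the frame dimensions as arguments. A hypothetical driver (the binary name and argument values are assumptions, not part of this patch):

    // Equivalent to running: masker input_1280x720.yuv predicted.yuv 1280 720
    char *args[] = { "masker", "input_1280x720.yuv", "predicted.yuv",
                     "1280", "720" };
    mainz(5, args);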
--- a/vp8/common/mbpitch.c
+++ /dev/null
@@ -1,124 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "blockd.h"
-
-typedef enum {
-  PRED = 0,
-  DEST = 1
-} BLOCKSET;
-
-static void setup_block
-(
-  BLOCKD *b,
-  int mv_stride,
-  unsigned char **base,
-  unsigned char **base2,
-  int Stride,
-  int offset,
-  BLOCKSET bs
-) {
-
-  if (bs == DEST) {
-    b->dst_stride = Stride;
-    b->dst = offset;
-    b->base_dst = base;
-  } else {
-    b->pre_stride = Stride;
-    b->pre = offset;
-    b->base_pre = base;
-    b->base_second_pre = base2;
-  }
-
-}
-
-
-static void setup_macroblock(MACROBLOCKD *xd, BLOCKSET bs) {
-  int block;
-
-  unsigned char **y, **u, **v;
-  unsigned char **y2, **u2, **v2;
-  BLOCKD *blockd = xd->block;
-  int stride;
-
-  if (bs == DEST) {
-    y = &xd->dst.y_buffer;
-    u = &xd->dst.u_buffer;
-    v = &xd->dst.v_buffer;
-  } else {
-    y = &xd->pre.y_buffer;
-    u = &xd->pre.u_buffer;
-    v = &xd->pre.v_buffer;
-
-    y2 = &xd->second_pre.y_buffer;
-    u2 = &xd->second_pre.u_buffer;
-    v2 = &xd->second_pre.v_buffer;
-  }
-
-  stride = xd->dst.y_stride;
-  for (block = 0; block < 16; block++) { /* y blocks */
-    setup_block(&blockd[block], stride, y, y2, stride,
-                (block >> 2) * 4 * stride + (block & 3) * 4, bs);
-  }
-
-  stride = xd->dst.uv_stride;
-  for (block = 16; block < 20; block++) { /* U and V blocks */
-    setup_block(&blockd[block], stride, u, u2, stride,
-      ((block - 16) >> 1) * 4 * stride + (block & 1) * 4, bs);
-
-    setup_block(&blockd[block + 4], stride, v, v2, stride,
-      ((block - 16) >> 1) * 4 * stride + (block & 1) * 4, bs);
-  }
-}
-
-void vp9_setup_block_dptrs(MACROBLOCKD *xd) {
-  int r, c;
-  BLOCKD *blockd = xd->block;
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++) {
-      blockd[r * 4 + c].diff = &xd->diff[r * 4 * 16 + c * 4];
-      blockd[r * 4 + c].predictor = xd->predictor + r * 4 * 16 + c * 4;
-    }
-  }
-
-  for (r = 0; r < 2; r++) {
-    for (c = 0; c < 2; c++) {
-      blockd[16 + r * 2 + c].diff = &xd->diff[256 + r * 4 * 8 + c * 4];
-      blockd[16 + r * 2 + c].predictor =
-        xd->predictor + 256 + r * 4 * 8 + c * 4;
-
-    }
-  }
-
-  for (r = 0; r < 2; r++) {
-    for (c = 0; c < 2; c++) {
-      blockd[20 + r * 2 + c].diff = &xd->diff[320 + r * 4 * 8 + c * 4];
-      blockd[20 + r * 2 + c].predictor =
-        xd->predictor + 320 + r * 4 * 8 + c * 4;
-
-    }
-  }
-
-  blockd[24].diff = &xd->diff[384];
-
-  for (r = 0; r < 25; r++) {
-    blockd[r].qcoeff  = xd->qcoeff  + r * 16;
-    blockd[r].dqcoeff = xd->dqcoeff + r * 16;
-  }
-}
-
-void vp9_build_block_doffsets(MACROBLOCKD *xd) {
-
-  /* handle the destination and prediction pitch features */
-  setup_macroblock(xd, DEST);
-  setup_macroblock(xd, PRED);
-}
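The offset arithmetic in setup_block() maps a raster block index to a pixel offset within the macroblock. A worked example for luma block 5, assuming a Y stride of 32:

    //   (5 >> 2) * 4 * 32 + (5 & 3) * 4  =  128 + 4  =  132
    // i.e. block 5 starts 4 pixel rows down and 4 pixels right of the
    // macroblock origin (grid row 1, column 1 of the 4x4 luma blocks).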
--- a/vp8/common/modecont.c
+++ /dev/null
@@ -1,64 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "entropy.h"
-const int vp9_default_mode_contexts[6][4] = {
-  {
-    /* 0 */
-    7,     1,     1,   183
-  },
-  {
-    /* 1 */
-    14,    18,    14,   147
-  },
-  {
-    /* 2 */
-    135,    64,    57,    68
-  },
-  {
-    /* 3 */
-    60,    56,   128,   65
-  },
-  {
-    /* 4 */
-    159,   134,   128,   34
-  },
-  {
-    /* 5 */
-    234,   188,   128,   28
-  },
-};
-const int vp9_default_mode_contexts_a[6][4] = {
-  {
-    /* 0 */
-    4,     1,    1,   143
-  },
-  {
-    /* 1 */
-    7,     9,    7,   107
-  },
-  {
-    /* 2 */
-    95,    34,   57,    68
-  },
-  {
-    /* 3 */
-    95,    56,   128,   65
-  },
-  {
-    /* 4 */
-    159,   67,   128,   34
-  },
-  {
-    /* 5 */
-    234,   94,   128,   28
-  },
-};
--- a/vp8/common/modecont.h
+++ /dev/null
@@ -1,17 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_MODECONT_H
-#define __INC_MODECONT_H
-
-extern const int vp9_default_mode_contexts[6][4];
-extern const int vp9_default_mode_contexts_a[6][4];
-#endif
--- a/vp8/common/modecontext.c
+++ /dev/null
@@ -1,145 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "entropymode.h"
-
-const unsigned int vp9_kf_default_bmode_counts[VP9_BINTRAMODES][VP9_BINTRAMODES][VP9_BINTRAMODES] = {
-  {
-    /*Above Mode :  0*/
-    { 43438,   2195,    470,    316,    615,    171,    217,    412,    124,    160, }, /* left_mode 0 */
-    {  5722,   2751,    296,    291,     81,     68,     80,    101,    100,    170, }, /* left_mode 1 */
-    {  1629,    201,    307,     25,     47,     16,     34,     72,     19,     28, }, /* left_mode 2 */
-    {   332,    266,     36,    500,     20,     65,     23,     14,    154,    106, }, /* left_mode 3 */
-    {   450,     97,     10,     24,    117,     10,      2,     12,      8,     71, }, /* left_mode 4 */
-    {   384,     49,     29,     44,     12,    162,     51,      5,     87,     42, }, /* left_mode 5 */
-    {   495,     53,    157,     27,     14,     57,    180,     17,     17,     34, }, /* left_mode 6 */
-    {   695,     64,     62,      9,     27,      5,      3,    147,     10,     26, }, /* left_mode 7 */
-    {   230,     54,     20,    124,     16,    125,     29,     12,    283,     37, }, /* left_mode 8 */
-    {   260,     87,     21,    120,     32,     16,     33,     16,     33,    203, }, /* left_mode 9 */
-  },
-  {
-    /*Above Mode :  1*/
-    {  3934,   2573,    355,    137,    128,     87,    133,    117,     37,     27, }, /* left_mode 0 */
-    {  1036,   1929,    278,    135,     27,     37,     48,     55,     41,     91, }, /* left_mode 1 */
-    {   223,    256,    253,     15,     13,      9,     28,     64,      3,      3, }, /* left_mode 2 */
-    {   120,    129,     17,    316,     15,     11,      9,      4,     53,     74, }, /* left_mode 3 */
-    {   129,     58,      6,     11,     38,      2,      0,      5,      2,     67, }, /* left_mode 4 */
-    {    53,     22,     11,     16,      8,     26,     14,      3,     19,     12, }, /* left_mode 5 */
-    {    59,     26,     61,     11,      4,      9,     35,     13,      8,      8, }, /* left_mode 6 */
-    {   101,     52,     40,      8,      5,      2,      8,     59,      2,     20, }, /* left_mode 7 */
-    {    48,     34,     10,     52,      8,     15,      6,      6,     63,     20, }, /* left_mode 8 */
-    {    96,     48,     22,     63,     11,     14,      5,      8,      9,     96, }, /* left_mode 9 */
-  },
-  {
-    /*Above Mode :  2*/
-    {   709,    461,    506,     36,     27,     33,    151,     98,     24,      6, }, /* left_mode 0 */
-    {   201,    375,    442,     27,     13,      8,     46,     58,      6,     19, }, /* left_mode 1 */
-    {   122,    140,    417,      4,     13,      3,     33,     59,      4,      2, }, /* left_mode 2 */
-    {    36,     17,     22,     16,      6,      8,     12,     17,      9,     21, }, /* left_mode 3 */
-    {    51,     15,      7,      1,     14,      0,      4,      5,      3,     22, }, /* left_mode 4 */
-    {    18,     11,     30,      9,      7,     20,     11,      5,      2,      6, }, /* left_mode 5 */
-    {    38,     21,    103,      9,      4,     12,     79,     13,      2,      5, }, /* left_mode 6 */
-    {    64,     17,     66,      2,     12,      4,      2,     65,      4,      5, }, /* left_mode 7 */
-    {    14,      7,      7,     16,      3,     11,      4,     13,     15,     16, }, /* left_mode 8 */
-    {    36,      8,     32,      9,      9,      4,     14,      7,      6,     24, }, /* left_mode 9 */
-  },
-  {
-    /*Above Mode :  3*/
-    {  1340,    173,     36,    119,     30,     10,     13,     10,     20,     26, }, /* left_mode 0 */
-    {   156,    293,     26,    108,      5,     16,      2,      4,     23,     30, }, /* left_mode 1 */
-    {    60,     34,     13,      7,      3,      3,      0,      8,      4,      5, }, /* left_mode 2 */
-    {    72,     64,      1,    235,      3,      9,      2,      7,     28,     38, }, /* left_mode 3 */
-    {    29,     14,      1,      3,      5,      0,      2,      2,      5,     13, }, /* left_mode 4 */
-    {    22,      7,      4,     11,      2,      5,      1,      2,      6,      4, }, /* left_mode 5 */
-    {    18,     14,      5,      6,      4,      3,     14,      0,      9,      2, }, /* left_mode 6 */
-    {    41,     10,      7,      1,      2,      0,      0,     10,      2,      1, }, /* left_mode 7 */
-    {    23,     19,      2,     33,      1,      5,      2,      0,     51,      8, }, /* left_mode 8 */
-    {    33,     26,      7,     53,      3,      9,      3,      3,      9,     19, }, /* left_mode 9 */
-  },
-  {
-    /*Above Mode :  4*/
-    {   410,    165,     43,     31,     66,     15,     30,     54,      8,     17, }, /* left_mode 0 */
-    {   115,     64,     27,     18,     30,      7,     11,     15,      4,     19, }, /* left_mode 1 */
-    {    31,     23,     25,      1,      7,      2,      2,     10,      0,      5, }, /* left_mode 2 */
-    {    17,      4,      1,      6,      8,      2,      7,      5,      5,     21, }, /* left_mode 3 */
-    {   120,     12,      1,      2,     83,      3,      0,      4,      1,     40, }, /* left_mode 4 */
-    {     4,      3,      1,      2,      1,      2,      5,      0,      3,      6, }, /* left_mode 5 */
-    {    10,      2,     13,      6,      6,      6,      8,      2,      4,      5, }, /* left_mode 6 */
-    {    58,     10,      5,      1,     28,      1,      1,     33,      1,      9, }, /* left_mode 7 */
-    {     8,      2,      1,      4,      2,      5,      1,      1,      2,     10, }, /* left_mode 8 */
-    {    76,      7,      5,      7,     18,      2,      2,      0,      5,     45, }, /* left_mode 9 */
-  },
-  {
-    /*Above Mode :  5*/
-    {   444,     46,     47,     20,     14,    110,     60,     14,     60,      7, }, /* left_mode 0 */
-    {    59,     57,     25,     18,      3,     17,     21,      6,     14,      6, }, /* left_mode 1 */
-    {    24,     17,     20,      6,      4,     13,      7,      2,      3,      2, }, /* left_mode 2 */
-    {    13,     11,      5,     14,      4,      9,      2,      4,     15,      7, }, /* left_mode 3 */
-    {     8,      5,      2,      1,      4,      0,      1,      1,      2,     12, }, /* left_mode 4 */
-    {    19,      5,      5,      7,      4,     40,      6,      3,     10,      4, }, /* left_mode 5 */
-    {    16,      5,      9,      1,      1,     16,     26,      2,     10,      4, }, /* left_mode 6 */
-    {    11,      4,      8,      1,      1,      4,      4,      5,      4,      1, }, /* left_mode 7 */
-    {    15,      1,      3,      7,      3,     21,      7,      1,     34,      5, }, /* left_mode 8 */
-    {    18,      5,      1,      3,      4,      3,      7,      1,      2,      9, }, /* left_mode 9 */
-  },
-  {
-    /*Above Mode :  6*/
-    {   476,    149,     94,     13,     14,     77,    291,     27,     23,      3, }, /* left_mode 0 */
-    {    79,     83,     42,     14,      2,     12,     63,      2,      4,     14, }, /* left_mode 1 */
-    {    43,     36,     55,      1,      3,      8,     42,     11,      5,      1, }, /* left_mode 2 */
-    {     9,      9,      6,     16,      1,      5,      6,      3,     11,     10, }, /* left_mode 3 */
-    {    10,      3,      1,      3,     10,      1,      0,      1,      1,      4, }, /* left_mode 4 */
-    {    14,      6,     15,      5,      1,     20,     25,      2,      5,      0, }, /* left_mode 5 */
-    {    28,      7,     51,      1,      0,      8,    127,      6,      2,      5, }, /* left_mode 6 */
-    {    13,      3,      3,      2,      3,      1,      2,      8,      1,      2, }, /* left_mode 7 */
-    {    10,      3,      3,      3,      3,      8,      2,      2,      9,      3, }, /* left_mode 8 */
-    {    13,      7,     11,      4,      0,      4,      6,      2,      5,      8, }, /* left_mode 9 */
-  },
-  {
-    /*Above Mode :  7*/
-    {   376,    135,    119,      6,     32,      8,     31,    224,      9,      3, }, /* left_mode 0 */
-    {    93,     60,     54,      6,     13,      7,      8,     92,      2,     12, }, /* left_mode 1 */
-    {    74,     36,     84,      0,      3,      2,      9,     67,      2,      1, }, /* left_mode 2 */
-    {    19,      4,      4,      8,      8,      2,      4,      7,      6,     16, }, /* left_mode 3 */
-    {    51,      7,      4,      1,     77,      3,      0,     14,      1,     15, }, /* left_mode 4 */
-    {     7,      7,      5,      7,      4,      7,      4,      5,      0,      3, }, /* left_mode 5 */
-    {    18,      2,     19,      2,      2,      4,     12,     11,      1,      2, }, /* left_mode 6 */
-    {   129,      6,     27,      1,     21,      3,      0,    189,      0,      6, }, /* left_mode 7 */
-    {     9,      1,      2,      8,      3,      7,      0,      5,      3,      3, }, /* left_mode 8 */
-    {    20,      4,      5,     10,      4,      2,      7,     17,      3,     16, }, /* left_mode 9 */
-  },
-  {
-    /*Above Mode :  8*/
-    {   617,     68,     34,     79,     11,     27,     25,     14,     75,     13, }, /* left_mode 0 */
-    {    51,     82,     21,     26,      6,     12,     13,      1,     26,     16, }, /* left_mode 1 */
-    {    29,      9,     12,     11,      3,      7,      1,     10,      2,      2, }, /* left_mode 2 */
-    {    17,     19,     11,     74,      4,      3,      2,      0,     58,     13, }, /* left_mode 3 */
-    {    10,      1,      1,      3,      4,      1,      0,      2,      1,      8, }, /* left_mode 4 */
-    {    14,      4,      5,      5,      1,     13,      2,      0,     27,      8, }, /* left_mode 5 */
-    {    10,      3,      5,      4,      1,      7,      6,      4,      5,      1, }, /* left_mode 6 */
-    {    10,      2,      6,      2,      1,      1,      1,      4,      2,      1, }, /* left_mode 7 */
-    {    14,      8,      5,     23,      2,     12,      6,      2,    117,      5, }, /* left_mode 8 */
-    {     9,      6,      2,     19,      1,      6,      3,      2,      9,      9, }, /* left_mode 9 */
-  },
-  {
-    /*Above Mode :  9*/
-    {   680,     73,     22,     38,     42,      5,     11,      9,      6,     28, }, /* left_mode 0 */
-    {   113,    112,     21,     22,     10,      2,      8,      4,      6,     42, }, /* left_mode 1 */
-    {    44,     20,     24,      6,      5,      4,      3,      3,      1,      2, }, /* left_mode 2 */
-    {    40,     23,      7,     71,      5,      2,      4,      1,      7,     22, }, /* left_mode 3 */
-    {    85,      9,      4,      4,     17,      2,      0,      3,      2,     23, }, /* left_mode 4 */
-    {    13,      4,      2,      6,      1,      7,      0,      1,      7,      6, }, /* left_mode 5 */
-    {    26,      6,      8,      3,      2,      3,      8,      1,      5,      4, }, /* left_mode 6 */
-    {    54,      8,      9,      6,      7,      0,      1,     11,      1,      3, }, /* left_mode 7 */
-    {     9,     10,      4,     13,      2,      5,      4,      2,     14,      8, }, /* left_mode 8 */
-    {    92,      9,      5,     19,     15,      3,      3,      1,      6,     58, }, /* left_mode 9 */
-  },
-};
--- a/vp8/common/mv.h
+++ /dev/null
@@ -1,26 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_MV_H
-#define __INC_MV_H
-#include "vpx/vpx_integer.h"
-
-typedef struct {
-  short row;
-  short col;
-} MV;
-
-typedef union {
-  uint32_t  as_int;
-  MV        as_mv;
-} int_mv;        /* facilitates faster equality tests and copies */
-
-#endif
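As the comment says, the union exists so a whole vector can be compared or copied as one 32-bit word. A minimal sketch:

    int_mv a, b;
    a.as_mv.row = 3;
    a.as_mv.col = -4;
    b.as_int = a.as_int;           // one 32-bit copy moves both components
    if (a.as_int == b.as_int) {
      // equal vectors: a single integer compare instead of two short compares
    }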
--- a/vp8/common/mvref_common.c
+++ /dev/null
@@ -1,342 +1,0 @@
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "mvref_common.h"
-
-#if CONFIG_NEWBESTREFMV
-
-#define MVREF_NEIGHBOURS 8
-static int mv_ref_search[MVREF_NEIGHBOURS][2] =
-  { {0,-1},{-1,0},{-1,-1},{0,-2},{-2,0},{-1,-2},{-2,-1},{-2,-2} };
-static int ref_distance_weight[MVREF_NEIGHBOURS] =
-  { 3,3,2,1,1,1,1,1 };
-
-// clamp_mv
-#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units
-static void clamp_mv(const MACROBLOCKD *xd, int_mv *mv) {
-
-  if (mv->as_mv.col < (xd->mb_to_left_edge - MV_BORDER))
-    mv->as_mv.col = xd->mb_to_left_edge - MV_BORDER;
-  else if (mv->as_mv.col > xd->mb_to_right_edge + MV_BORDER)
-    mv->as_mv.col = xd->mb_to_right_edge + MV_BORDER;
-
-  if (mv->as_mv.row < (xd->mb_to_top_edge - MV_BORDER))
-    mv->as_mv.row = xd->mb_to_top_edge - MV_BORDER;
-  else if (mv->as_mv.row > xd->mb_to_bottom_edge + MV_BORDER)
-    mv->as_mv.row = xd->mb_to_bottom_edge + MV_BORDER;
-}
-
-
-// Gets the best matching candidate reference motion vector
-// from the given mode info structure (if available)
-static int get_candidate_mvref(
-  const MODE_INFO *candidate_mi,
-  MV_REFERENCE_FRAME ref_frame,
-  MV_REFERENCE_FRAME *c_ref_frame,
-  int_mv *c_mv,
-  MV_REFERENCE_FRAME *c2_ref_frame,
-  int_mv *c2_mv
-) {
-
-  int ret_val = FALSE;
-  c2_mv->as_int = 0;
-  *c2_ref_frame = INTRA_FRAME;
-
-  // Target ref frame matches candidate first ref frame
-  if (ref_frame == candidate_mi->mbmi.ref_frame) {
-    c_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
-    *c_ref_frame = ref_frame;
-    ret_val = TRUE;
-
-    // Is there a second non-zero vector we can use?
-    if ((candidate_mi->mbmi.second_ref_frame != INTRA_FRAME) &&
-        (candidate_mi->mbmi.mv[1].as_int != 0) &&
-        (candidate_mi->mbmi.mv[1].as_int != c_mv->as_int)) {
-      c2_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
-      *c2_ref_frame = candidate_mi->mbmi.second_ref_frame;
-    }
-
-  // Target ref frame matches candidate second ref frame
-  } else if (ref_frame == candidate_mi->mbmi.second_ref_frame) {
-    c_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
-    *c_ref_frame = ref_frame;
-    ret_val = TRUE;
-
-    // Is there a second non-zero vector we can use?
-    if ((candidate_mi->mbmi.ref_frame != INTRA_FRAME) &&
-        (candidate_mi->mbmi.mv[0].as_int != 0) &&
-        (candidate_mi->mbmi.mv[0].as_int != c_mv->as_int)) {
-      c2_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
-      *c2_ref_frame = candidate_mi->mbmi.ref_frame;
-    }
-
-  // No ref frame matches, so use the first ref mv as the first choice
-  } else if (candidate_mi->mbmi.ref_frame != INTRA_FRAME) {
-    c_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
-    *c_ref_frame = candidate_mi->mbmi.ref_frame;
-    ret_val = TRUE;
-
-    // Is there a second non-zero vector we can use?
-    if ((candidate_mi->mbmi.second_ref_frame != INTRA_FRAME) &&
-        (candidate_mi->mbmi.mv[1].as_int != 0) &&
-        (candidate_mi->mbmi.mv[1].as_int != c_mv->as_int)) {
-      c2_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
-      *c2_ref_frame = candidate_mi->mbmi.second_ref_frame;
-    }
-
-  // If only the second ref mv is valid. (This should not trigger in the
-  // current code base given the possible compound prediction options.)
-  } else if (candidate_mi->mbmi.second_ref_frame != INTRA_FRAME) {
-    c_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
-    *c_ref_frame = candidate_mi->mbmi.second_ref_frame;
-    ret_val = TRUE;
-  }
-
-  return ret_val;
-}
-
-// Performs mv adjustment based on reference frame and clamps the MV
-// if it goes off the edge of the buffer.
-static void scale_mv(
-  MACROBLOCKD *xd,
-  MV_REFERENCE_FRAME this_ref_frame,
-  MV_REFERENCE_FRAME candidate_ref_frame,
-  int_mv *candidate_mv,
-  int *ref_sign_bias
-) {
-
-  if (candidate_ref_frame != this_ref_frame) {
-
-    //int frame_distances[MAX_REF_FRAMES];
-    //int last_distance = 1;
-    //int gf_distance = xd->frames_since_golden;
-    //int arf_distance = xd->frames_till_alt_ref_frame;
-
-    // Sign inversion where appropriate.
-    if (ref_sign_bias[candidate_ref_frame] != ref_sign_bias[this_ref_frame]) {
-      candidate_mv->as_mv.row = -candidate_mv->as_mv.row;
-      candidate_mv->as_mv.col = -candidate_mv->as_mv.col;
-    }
-
-    // Scale based on frame distance if the reference frames are not the same.
-    /*frame_distances[INTRA_FRAME] = 1;   // should never be used
-    frame_distances[LAST_FRAME] = 1;
-    frame_distances[GOLDEN_FRAME] =
-      (xd->frames_since_golden) ? xd->frames_since_golden : 1;
-    frame_distances[ALTREF_FRAME] =
-      (xd->frames_till_alt_ref_frame) ? xd->frames_till_alt_ref_frame : 1;
-
-    if (frame_distances[this_ref_frame] &&
-        frame_distances[candidate_ref_frame]) {
-      candidate_mv->as_mv.row =
-        (short)(((int)(candidate_mv->as_mv.row) *
-                 frame_distances[this_ref_frame]) /
-                frame_distances[candidate_ref_frame]);
-
-      candidate_mv->as_mv.col =
-        (short)(((int)(candidate_mv->as_mv.col) *
-                 frame_distances[this_ref_frame]) /
-                frame_distances[candidate_ref_frame]);
-    }
-    */
-  }
-
-  // Clamp the MV so it does not point out of the frame buffer
-  clamp_mv(xd, candidate_mv);
-}
-
-// Adds a new candidate reference vector to the list if indeed it is new.
-// If it is not new then the score of the existing candidate that it matches
-// is increased and the list is resorted.
-static void addmv_and_shuffle(
-  int_mv *mv_list,
-  int *mv_scores,
-  int *index,
-  int_mv candidate_mv,
-  int weight
-) {
-
-  int i = *index;
-  int duplicate_found = FALSE;
-
-  // Check for duplicates. If there is one, increment its score.
-  // A duplicate is defined as the same full-pel vector after rounding.
-  while (i > 0) {
-    i--;
-
-    if (candidate_mv.as_int == mv_list[i].as_int) {
-      duplicate_found = TRUE;
-      mv_scores[i] += weight;
-      break;
-    }
-  }
-
-  // If no duplicate was found add the new vector and give it a weight
-  if (!duplicate_found) {
-    mv_list[*index].as_int = candidate_mv.as_int;
-    mv_scores[*index] = weight;
-    i = *index;
-    (*index)++;
-  }
-
-  // Reshuffle the list so that the highest scoring mvs are at the top.
-  while (i > 0) {
-    if (mv_scores[i] > mv_scores[i-1]) {
-      int tmp_score = mv_scores[i-1];
-      int_mv tmp_mv = mv_list[i-1];
-
-      mv_scores[i-1] = mv_scores[i];
-      mv_list[i-1] = mv_list[i];
-      mv_scores[i] = tmp_score;
-      mv_list[i] = tmp_mv;
-      i--;
-    } else
-      break;
-  }
-}
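A short usage sketch (values are illustrative): inserting a duplicate bumps the existing entry's score, and the bubble-up loop keeps the list sorted by descending score.

    int_mv mv_list[MAX_MV_REFS] = {{ 0 }};
    int mv_scores[MAX_MV_REFS] = { 0 };
    int index = 0;
    int_mv mv;

    mv.as_mv.row = 4;
    mv.as_mv.col = -2;
    addmv_and_shuffle(mv_list, mv_scores, &index, mv, 3);  // new entry, score 3
    addmv_and_shuffle(mv_list, mv_scores, &index, mv, 2);  // duplicate, score 5
    // index is still 1; the single entry carries the combined score.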
-
-// This function searches the neighbourhood of a given MB/SB and populates a
-// list of candidate reference vectors.
-//
-void vp9_find_mv_refs(
-  MACROBLOCKD *xd,
-  MODE_INFO *here,
-  MODE_INFO *lf_here,
-  MV_REFERENCE_FRAME ref_frame,
-  int_mv *mv_ref_list,
-  int *ref_sign_bias
-) {
-
-  int i;
-  MODE_INFO *candidate_mi;
-  int_mv candidate_mvs[MAX_MV_REFS];
-  int_mv c_refmv;
-  MV_REFERENCE_FRAME c_ref_frame;
-  int_mv c2_refmv;
-  MV_REFERENCE_FRAME c2_ref_frame;
-  int candidate_scores[MAX_MV_REFS];
-  int index = 0;
-  int ref_weight = 0;
-  int valid_mv_ref;
-
-  // Blank the reference vector lists and other local structures.
-  vpx_memset(mv_ref_list, 0, sizeof(int_mv) * MAX_MV_REFS);
-  vpx_memset(candidate_mvs, 0, sizeof(int_mv) * MAX_MV_REFS);
-  vpx_memset(candidate_scores, 0, sizeof(candidate_scores));
-
-  // Populate a list with candidate reference vectors from the
-  // spatial neighbours.
-  for (i = 0; i < 2; ++i) {
-    if (((mv_ref_search[i][0] << 7) >= xd->mb_to_left_edge) &&
-        ((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) {
-
-      candidate_mi = here + mv_ref_search[i][0] +
-                     (mv_ref_search[i][1] * xd->mode_info_stride);
-
-      valid_mv_ref = get_candidate_mvref(candidate_mi, ref_frame,
-                                         &c_ref_frame, &c_refmv,
-                                         &c2_ref_frame, &c2_refmv);
-
-      // If there is a valid MV candidate then add it to the list
-      if (valid_mv_ref) {
-        scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias );
-        ref_weight = ref_distance_weight[i] +
-                     ((c_ref_frame == ref_frame) << 4);
-
-        addmv_and_shuffle(candidate_mvs, candidate_scores,
-                          &index, c_refmv, ref_weight);
-
-        // If there is a second valid mv then add it as well.
-        if (c2_ref_frame != INTRA_FRAME) {
-          scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias );
-          ref_weight = ref_distance_weight[i] +
-                       ((c2_ref_frame == ref_frame) << 4);
-
-          addmv_and_shuffle(candidate_mvs, candidate_scores,
-                            &index, c2_refmv, ref_weight);
-        }
-      }
-    }
-  }
-
-  // Look at the corresponding vector in the last frame
-  candidate_mi = lf_here;
-  valid_mv_ref = get_candidate_mvref(candidate_mi, ref_frame,
-                                     &c_ref_frame, &c_refmv,
-                                     &c2_ref_frame, &c2_refmv);
-
-  // If there is a valid MV candidate then add it to the list
-  if (valid_mv_ref) {
-    scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias );
-    ref_weight = 2 + ((c_ref_frame == ref_frame) << 4);
-    addmv_and_shuffle(candidate_mvs, candidate_scores,
-                      &index, c_refmv, ref_weight);
-
-    // If there is a second valid mv then add it as well.
-    if (c2_ref_frame != INTRA_FRAME) {
-      scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias );
-      ref_weight = 2 + ((c2_ref_frame == ref_frame) << 4);
-
-      addmv_and_shuffle(candidate_mvs, candidate_scores,
-                        &index, c2_refmv, ref_weight);
-    }
-  }
-
-  // Populate the list with candidate reference vectors from the
-  // remaining spatial neighbours.
-  for (i = 2; i < MVREF_NEIGHBOURS; ++i) {
-    if (((mv_ref_search[i][0] << 7) >= xd->mb_to_left_edge) &&
-        ((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) {
-
-      candidate_mi = here + mv_ref_search[i][0] +
-                     (mv_ref_search[i][1] * xd->mode_info_stride);
-
-      valid_mv_ref = get_candidate_mvref(candidate_mi, ref_frame,
-                                         &c_ref_frame, &c_refmv,
-                                         &c2_ref_frame, &c2_refmv);
-
-      // If there is a valid MV candidate then add it to the list
-      if (valid_mv_ref) {
-        scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias );
-        ref_weight = ref_distance_weight[i] +
-                     ((c_ref_frame == ref_frame) << 4);
-
-        addmv_and_shuffle(candidate_mvs, candidate_scores,
-                          &index, c_refmv, ref_weight);
-
-        // If there is a second valid mv then add it as well.
-        if (c2_ref_frame != INTRA_FRAME) {
-          scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias );
-          ref_weight = ref_distance_weight[i] +
-                       ((c2_ref_frame == ref_frame) << 4);
-
-          addmv_and_shuffle(candidate_mvs, candidate_scores,
-                            &index, c2_refmv, ref_weight);
-        }
-      }
-    }
-  }
-
-  // 0,0 is always a valid reference.
-  for (i = 0; i < index; ++i)
-    if (candidate_mvs[i].as_int == 0)
-      break;
-  if (i == index) {
-    c_refmv.as_int = 0;
-    addmv_and_shuffle(candidate_mvs, candidate_scores,
-                      &index, c_refmv, candidate_scores[3] + 1);
-  }
-
-  // Copy over the candidate list.
-  vpx_memcpy(mv_ref_list, candidate_mvs, sizeof(candidate_mvs));
-}
-
-#endif
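The scoring above makes same-reference-frame candidates dominate: the distance weight is at most 3, while a matching reference frame adds 1 << 4. A worked example:

    // Nearest left neighbour (ref_distance_weight[0] == 3) whose reference
    // frame matches the one being searched:
    //   ref_weight = 3 + (1 << 4) = 19
    // The same neighbour pointing at a different reference frame scores 3,
    // so any same-frame candidate sorts above every cross-frame one.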
--- a/vp8/common/mvref_common.h
+++ /dev/null
@@ -1,31 +1,0 @@
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "onyxc_int.h"
-#include "blockd.h"
-
-// MV reference entropy header file.
-#if CONFIG_NEWBESTREFMV
-
-#ifndef __INC_MVREF_COMMON_H
-#define __INC_MVREF_COMMON_H
-
-void vp9_find_mv_refs(
-  MACROBLOCKD *xd,
-  MODE_INFO *here,
-  MODE_INFO *lf_here,
-  MV_REFERENCE_FRAME ref_frame,
-  int_mv * mv_ref_list,
-  int *ref_sign_bias
-);
-
-#endif
-
-#endif
--- a/vp8/common/onyx.h
+++ /dev/null
@@ -1,225 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_ONYX_H
-#define __INC_ONYX_H
-
-#ifdef __cplusplus
-extern "C"
-{
-#endif
-
-#include "vpx/internal/vpx_codec_internal.h"
-#include "vpx/vp8cx.h"
-#include "vpx_scale/yv12config.h"
-#include "type_aliases.h"
-#include "ppflags.h"
-  typedef int *VP9_PTR;
-
-  /* Create/destroy static data structures. */
-
-  typedef enum {
-    NORMAL      = 0,
-    FOURFIVE    = 1,
-    THREEFIVE   = 2,
-    ONETWO      = 3
-
-  } VPX_SCALING;
-
-  typedef enum {
-    VP9_LAST_FLAG = 1,
-    VP9_GOLD_FLAG = 2,
-    VP9_ALT_FLAG = 4
-  } VP9_REFFRAME;
-
-
-  typedef enum {
-    USAGE_STREAM_FROM_SERVER    = 0x0,
-    USAGE_LOCAL_FILE_PLAYBACK   = 0x1,
-    USAGE_CONSTRAINED_QUALITY   = 0x2
-  } END_USAGE;
-
-
-  typedef enum {
-    MODE_GOODQUALITY    = 0x1,
-    MODE_BESTQUALITY    = 0x2,
-    MODE_FIRSTPASS      = 0x3,
-    MODE_SECONDPASS     = 0x4,
-    MODE_SECONDPASS_BEST = 0x5,
-  } MODE;
-
-  typedef enum {
-    FRAMEFLAGS_KEY    = 1,
-    FRAMEFLAGS_GOLDEN = 2,
-    FRAMEFLAGS_ALTREF = 4,
-  } FRAMETYPE_FLAGS;
-
-
-#include <assert.h>
-  static __inline void Scale2Ratio(int mode, int *hr, int *hs) {
-    switch (mode) {
-      case    NORMAL:
-        *hr = 1;
-        *hs = 1;
-        break;
-      case    FOURFIVE:
-        *hr = 4;
-        *hs = 5;
-        break;
-      case    THREEFIVE:
-        *hr = 3;
-        *hs = 5;
-        break;
-      case    ONETWO:
-        *hr = 1;
-        *hs = 2;
-        break;
-      default:
-        *hr = 1;
-        *hs = 1;
-        assert(0);
-        break;
-    }
-  }
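Scale2Ratio() turns the VPX_SCALING enum into a numerator/denominator pair. A minimal usage sketch (the width value is illustrative):

    int hr, hs, scaled_width;
    Scale2Ratio(FOURFIVE, &hr, &hs);      // hr = 4, hs = 5
    scaled_width = 1280 * hr / hs;        // 1024: a 4/5 horizontal downscale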
-
-  typedef struct {
-    int Version;            // 4 bitstream versions are defined: 0 = best quality/slowest decode, 3 = lowest quality/fastest decode
-    int Width;              // width of data passed to the compressor
-    int Height;             // height of data passed to the compressor
-    double frame_rate;       // set to the passed-in framerate
-    int target_bandwidth;    // bandwidth to be used in kilobits per second
-
-    int noise_sensitivity;   // parameter used for applying pre-processing blur: recommendation 0
-    int Sharpness;          // parameter used for sharpening output: recommendation 0
-    int cpu_used;
-    unsigned int rc_max_intra_bitrate_pct;
-
-    // mode ->
-    // (0)=Realtime/Live Encoding. This mode is optimized for realtime encoding (for example, capturing
-    //    a television signal or feed from a live camera). ( speed setting controls how fast )
-    // (1)=Good Quality Fast Encoding. The encoder balances quality with the amount of time it takes to
-    //    encode the output. ( speed setting controls how fast )
-    // (2)=One Pass - Best Quality. The encoder places priority on the quality of the output over encoding
-    //    speed. The output is compressed at the highest possible quality. This option takes the longest
-    //    amount of time to encode. ( speed setting ignored )
-    // (3)=Two Pass - First Pass. The encoder generates a file of statistics for use in the second encoding
-    //    pass. ( speed setting controls how fast )
-    // (4)=Two Pass - Second Pass. The encoder uses the statistics that were generated in the first encoding
-    //    pass to create the compressed output. ( speed setting controls how fast )
-    // (5)=Two Pass - Second Pass Best.  The encoder uses the statistics that were generated in the first
-    //    encoding pass to create the compressed output using the highest possible quality, and taking a
-    //    longer amount of time to encode. ( speed setting ignored )
-    int Mode;
-
-    // Key Framing Operations
-    int auto_key;            // automatically detect cut scenes and set the keyframes
-    int key_freq;            // maximum distance to key frame.
-
-    int allow_lag;           // allow lagged compression (if 0, lag_in_frames is ignored)
-    int lag_in_frames;        // how many frames of lag before we start encoding
-
-    // ----------------------------------------------------------------
-    // DATARATE CONTROL OPTIONS
-
-    int end_usage; // vbr or cbr
-
-    // buffer targeting aggressiveness
-    int under_shoot_pct;
-    int over_shoot_pct;
-
-    // buffering parameters
-    int starting_buffer_level;  // in seconds
-    int optimal_buffer_level;
-    int maximum_buffer_size;
-
-    // controlling quality
-    int fixed_q;
-    int worst_allowed_q;
-    int best_allowed_q;
-    int cq_level;
-    int lossless;
-
-    // two pass datarate control
-    int two_pass_vbrbias;        // two pass datarate control tweaks
-    int two_pass_vbrmin_section;
-    int two_pass_vbrmax_section;
-    // END DATARATE CONTROL OPTIONS
-    // ----------------------------------------------------------------
-
-
-    // these parameters aren't to be used in the final build; don't use!
-    int play_alternate;
-    int alt_freq;
-
-    int encode_breakout;  // early breakout encode threshold : for video conf recommend 800
-
-    int arnr_max_frames;
-    int arnr_strength;
-    int arnr_type;
-
-    struct vpx_fixed_buf         two_pass_stats_in;
-    struct vpx_codec_pkt_list  *output_pkt_list;
-
-    vp8e_tuning tuning;
-  } VP9_CONFIG;
-
-
-  void vp9_initialize_enc();
-
-  VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf);
-  void vp9_remove_compressor(VP9_PTR *comp);
-
-  void vp9_change_config(VP9_PTR onyx, VP9_CONFIG *oxcf);
-
-// Receive a frame's worth of data. The caller can assume that a copy of this
-// frame is made and not just a copy of the pointer.
-  int vp9_receive_raw_frame(VP9_PTR comp, unsigned int frame_flags,
-                            YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
-                            int64_t end_time_stamp);
-
-  int vp9_get_compressed_data(VP9_PTR comp, unsigned int *frame_flags,
-                              unsigned long *size, unsigned char *dest,
-                              int64_t *time_stamp, int64_t *time_end,
-                              int flush);
-
-  int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest,
-                                vp9_ppflags_t *flags);
-
-  int vp9_use_as_reference(VP9_PTR comp, int ref_frame_flags);
-
-  int vp9_update_reference(VP9_PTR comp, int ref_frame_flags);
-
-  int vp9_get_reference_enc(VP9_PTR comp, VP9_REFFRAME ref_frame_flag,
-                            YV12_BUFFER_CONFIG *sd);
-
-  int vp9_set_reference_enc(VP9_PTR comp, VP9_REFFRAME ref_frame_flag,
-                            YV12_BUFFER_CONFIG *sd);
-
-  int vp9_update_entropy(VP9_PTR comp, int update);
-
-  int vp9_set_roimap(VP9_PTR comp, unsigned char *map,
-                     unsigned int rows, unsigned int cols,
-                     int delta_q[4], int delta_lf[4],
-                     unsigned int threshold[4]);
-
-  int vp9_set_active_map(VP9_PTR comp, unsigned char *map,
-                         unsigned int rows, unsigned int cols);
-
-  int vp9_set_internal_size(VP9_PTR comp,
-                            VPX_SCALING horiz_mode, VPX_SCALING vert_mode);
-
-  int vp9_get_quantizer(VP9_PTR c);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif  // __INC_ONYX_H
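The declarations above imply the usual create/feed/drain/destroy encoder lifecycle. A minimal sketch, assuming oxcf and raw are populated elsewhere (error handling omitted; not part of this patch):

    VP9_CONFIG oxcf;               // assumed filled in by the caller
    YV12_BUFFER_CONFIG raw;        // assumed to hold one input frame
    unsigned char buf[1024 * 1024];
    unsigned long size = 0;
    unsigned int flags = 0;
    int64_t pts = 0, pts_end = 0;
    VP9_PTR enc;

    vp9_initialize_enc();
    enc = vp9_create_compressor(&oxcf);
    vp9_receive_raw_frame(enc, 0, &raw, 0, 1);
    vp9_get_compressed_data(enc, &flags, &size, buf, &pts, &pts_end, 0);
    vp9_remove_compressor(&enc);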
--- a/vp8/common/onyxc_int.h
+++ /dev/null
@@ -1,314 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_ONYXC_INT_H
-#define __INC_ONYXC_INT_H
-
-#include "vpx_config.h"
-#include "vpx/internal/vpx_codec_internal.h"
-#include "vpx_rtcd.h"
-#include "loopfilter.h"
-#include "entropymv.h"
-#include "entropy.h"
-#include "entropymode.h"
-#include "idct.h"
-#if CONFIG_POSTPROC
-#include "postproc.h"
-#endif
-
-/*#ifdef PACKET_TESTING*/
-#include "header.h"
-/*#endif*/
-
-/* Create/destroy static data structures. */
-
-void vp9_initialize_common(void);
-
-#define MINQ 0
-
-#define MAXQ 255
-#define QINDEX_BITS 8
-
-#define QINDEX_RANGE (MAXQ + 1)
-
-#define NUM_YV12_BUFFERS 4
-
-#define COMP_PRED_CONTEXTS   2
-
-typedef struct frame_contexts {
-  vp9_prob bmode_prob [VP9_BINTRAMODES - 1];
-  vp9_prob ymode_prob [VP9_YMODES - 1]; /* interframe intra mode probs */
-  vp9_prob uv_mode_prob [VP9_YMODES][VP9_UV_MODES - 1];
-  vp9_prob i8x8_mode_prob [VP9_I8X8_MODES - 1];
-  vp9_prob sub_mv_ref_prob [SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
-  vp9_prob mbsplit_prob [VP9_NUMMBSPLITS - 1];
-  vp9_prob coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  vp9_prob hybrid_coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  vp9_prob coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  vp9_prob hybrid_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  vp9_prob coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  vp9_prob hybrid_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-
-  nmv_context nmvc;
-  nmv_context pre_nmvc;
-  vp9_prob pre_bmode_prob [VP9_BINTRAMODES - 1];
-  vp9_prob pre_ymode_prob [VP9_YMODES - 1]; /* interframe intra mode probs */
-  vp9_prob pre_uv_mode_prob [VP9_YMODES][VP9_UV_MODES - 1];
-  vp9_prob pre_i8x8_mode_prob [VP9_I8X8_MODES - 1];
-  vp9_prob pre_sub_mv_ref_prob [SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
-  vp9_prob pre_mbsplit_prob [VP9_NUMMBSPLITS - 1];
-  unsigned int bmode_counts [VP9_BINTRAMODES];
-  unsigned int ymode_counts [VP9_YMODES];   /* interframe intra mode probs */
-  unsigned int uv_mode_counts [VP9_YMODES][VP9_UV_MODES];
-  unsigned int i8x8_mode_counts [VP9_I8X8_MODES];   /* interframe intra mode probs */
-  unsigned int sub_mv_ref_counts [SUBMVREF_COUNT][VP9_SUBMVREFS];
-  unsigned int mbsplit_counts [VP9_NUMMBSPLITS];
-
-  vp9_prob pre_coef_probs [BLOCK_TYPES] [COEF_BANDS]
-      [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  vp9_prob pre_hybrid_coef_probs [BLOCK_TYPES] [COEF_BANDS]
-      [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-
-  vp9_prob pre_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS]
-      [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  vp9_prob pre_hybrid_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS]
-      [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-
-  vp9_prob pre_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]
-      [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  vp9_prob pre_hybrid_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]
-      [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-
-  unsigned int coef_counts [BLOCK_TYPES] [COEF_BANDS]
-      [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-  unsigned int hybrid_coef_counts [BLOCK_TYPES] [COEF_BANDS]
-      [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-
-  unsigned int coef_counts_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS]
-      [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-  unsigned int hybrid_coef_counts_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS]
-      [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-
-  unsigned int coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]
-      [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-  unsigned int hybrid_coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]
-      [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-
-  nmv_context_counts NMVcount;
-  vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
-                                 [VP9_SWITCHABLE_FILTERS - 1];
-
-  int mode_context[6][4];
-  int mode_context_a[6][4];
-  int vp8_mode_contexts[6][4];
-  int mv_ref_ct[6][4][2];
-  int mv_ref_ct_a[6][4][2];
-} FRAME_CONTEXT;
-
-typedef enum {
-  RECON_CLAMP_REQUIRED        = 0,
-  RECON_CLAMP_NOTREQUIRED     = 1
-} CLAMP_TYPE;
-
-typedef enum {
-  SINGLE_PREDICTION_ONLY = 0,
-  COMP_PREDICTION_ONLY   = 1,
-  HYBRID_PREDICTION      = 2,
-  NB_PREDICTION_TYPES    = 3,
-} COMPPREDMODE_TYPE;
-
-typedef enum {
-  ONLY_4X4            = 0,
-  ALLOW_8X8           = 1,
-  ALLOW_16X16         = 2,
-  TX_MODE_SELECT      = 3,
-  NB_TXFM_MODES       = 4,
-} TXFM_MODE;
-
-typedef struct VP9_COMMON_RTCD {
-#if CONFIG_RUNTIME_CPU_DETECT
-  vp9_idct_rtcd_vtable_t        idct;
-  vp9_subpix_rtcd_vtable_t      subpix;
-#if CONFIG_POSTPROC
-  vp9_postproc_rtcd_vtable_t    postproc;
-#endif
-  int                           flags;
-#else
-  int unused;
-#endif
-} VP9_COMMON_RTCD;
-
-typedef struct VP9Common {
-  struct vpx_internal_error_info  error;
-
-  DECLARE_ALIGNED(16, short, Y1dequant[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, short, Y2dequant[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, short, UVdequant[QINDEX_RANGE][16]);
-
-  int Width;
-  int Height;
-  int horiz_scale;
-  int vert_scale;
-
-  YUV_TYPE clr_type;
-  CLAMP_TYPE  clamp_type;
-
-  YV12_BUFFER_CONFIG *frame_to_show;
-
-  YV12_BUFFER_CONFIG yv12_fb[NUM_YV12_BUFFERS];
-  int fb_idx_ref_cnt[NUM_YV12_BUFFERS];
-  int new_fb_idx, lst_fb_idx, gld_fb_idx, alt_fb_idx;
-
-  YV12_BUFFER_CONFIG post_proc_buffer;
-  YV12_BUFFER_CONFIG temp_scale_frame;
-
-
-  FRAME_TYPE last_frame_type;  /* Save last frame's frame type for motion search. */
-  FRAME_TYPE frame_type;
-
-  int show_frame;
-
-  int frame_flags;
-  int MBs;
-  int mb_rows;
-  int mb_cols;
-  int mode_info_stride;
-
-  /* profile settings */
-  int experimental;
-  int mb_no_coeff_skip;
-  TXFM_MODE txfm_mode;
-  COMPPREDMODE_TYPE comp_pred_mode;
-  int no_lpf;
-  int use_bilinear_mc_filter;
-  int full_pixel;
-
-  int base_qindex;
-  int last_kf_gf_q;  /* Q used on the last GF or KF */
-
-  int y1dc_delta_q;
-  int y2dc_delta_q;
-  int y2ac_delta_q;
-  int uvdc_delta_q;
-  int uvac_delta_q;
-
-  unsigned int frames_since_golden;
-  unsigned int frames_till_alt_ref_frame;
-
-  /* We allocate a MODE_INFO struct for each macroblock, together with
-     an extra row on top and column on the left to simplify prediction. */
-
-  MODE_INFO *mip; /* Base of allocated array */
-  MODE_INFO *mi;  /* Corresponds to upper left visible macroblock */
-  MODE_INFO *prev_mip; /* MODE_INFO array 'mip' from last decoded frame */
-  MODE_INFO *prev_mi;  /* 'mi' from last frame (points into prev_mip) */
-
-
-  // Persistent mb segment id map used in prediction.
-  unsigned char *last_frame_seg_map;
-
-  INTERPOLATIONFILTERTYPE mcomp_filter_type;
-  LOOPFILTERTYPE filter_type;
-
-  loop_filter_info_n lf_info;
-
-  int filter_level;
-  int last_sharpness_level;
-  int sharpness_level;
-
-  int refresh_last_frame;       /* Two state 0 = NO, 1 = YES */
-  int refresh_golden_frame;     /* Two state 0 = NO, 1 = YES */
-  int refresh_alt_ref_frame;     /* Two state 0 = NO, 1 = YES */
-
-  int copy_buffer_to_gf;         /* 0 none, 1 Last to GF, 2 ARF to GF */
-  int copy_buffer_to_arf;        /* 0 none, 1 Last to ARF, 2 GF to ARF */
-
-  int refresh_entropy_probs;    /* Two state 0 = NO, 1 = YES */
-
-  int ref_frame_sign_bias[MAX_REF_FRAMES];    /* Two state 0, 1 */
-
-  /* Y,U,V,Y2 */
-  ENTROPY_CONTEXT_PLANES *above_context;   /* row of context for each plane */
-  ENTROPY_CONTEXT_PLANES left_context[2];  /* (up to) 4 contexts "" */
-
-  /* keyframe block modes are predicted by their above, left neighbors */
-
-  vp9_prob kf_bmode_prob [VP9_BINTRAMODES] [VP9_BINTRAMODES] [VP9_BINTRAMODES - 1];
-  vp9_prob kf_ymode_prob[8][VP9_YMODES - 1]; /* keyframe "" */
-#if CONFIG_SUPERBLOCKS
-  vp9_prob sb_kf_ymode_prob[8][VP9_I32X32_MODES - 1];
-#endif
-  int kf_ymode_probs_index;
-  int kf_ymode_probs_update;
-  vp9_prob kf_uv_mode_prob[VP9_YMODES] [VP9_UV_MODES - 1];
-
-  vp9_prob prob_intra_coded;
-  vp9_prob prob_last_coded;
-  vp9_prob prob_gf_coded;
-#if CONFIG_SUPERBLOCKS
-  vp9_prob sb_coded;
-#endif
-
-  // Context probabilities when using predictive coding of segment id
-  vp9_prob segment_pred_probs[PREDICTION_PROBS];
-  unsigned char temporal_update;
-
-  // Context probabilities for reference frame prediction
-  unsigned char ref_scores[MAX_REF_FRAMES];
-  vp9_prob ref_pred_probs[PREDICTION_PROBS];
-  vp9_prob mod_refprobs[MAX_REF_FRAMES][PREDICTION_PROBS];
-
-  vp9_prob prob_comppred[COMP_PRED_CONTEXTS];
-
-  // FIXME contextualize
-  vp9_prob prob_tx[TX_SIZE_MAX - 1];
-
-  vp9_prob mbskip_pred_probs[MBSKIP_CONTEXTS];
-
-  FRAME_CONTEXT lfc_a; /* last alt ref entropy */
-  FRAME_CONTEXT lfc; /* last frame entropy */
-  FRAME_CONTEXT fc;  /* this frame entropy */
-
-  // int mv_ref_ct[6][4][2];
-  // int mv_ref_ct_a[6][4][2];
-  // int mode_context[6][4];
-  // int mode_context_a[6][4];
-  // int vp8_mode_contexts[6][4];
-
-  unsigned int current_video_frame;
-  int near_boffset[3];
-  int version;
-
-#ifdef PACKET_TESTING
-  VP9_HEADER oh;
-#endif
-  double bitrate;
-  double framerate;
-
-#if CONFIG_RUNTIME_CPU_DETECT
-  VP9_COMMON_RTCD rtcd;
-#endif
-
-#if CONFIG_POSTPROC
-  struct postproc_state  postproc_state;
-#endif
-
-#if CONFIG_PRED_FILTER
-  /* Prediction filter variables */
-  int pred_filter_mode;   // 0=disabled at the frame level (no MB filtered)
-                          // 1=enabled at the frame level (all MB filtered)
-                          // 2=specified per MB (1=filtered, 0=non-filtered)
-  vp9_prob prob_pred_filter_off;
-#endif
-
-} VP9_COMMON;
-
-#endif  // __INC_ONYX_INT_H
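The MODE_INFO comment in the struct above describes one border row on top and one border column on the left of the visible frame. A minimal sketch of the resulting pointer setup, assuming the usual companion field mode_info_stride (declared elsewhere in this struct) and a VP9_COMMON instance cm:

    /* mip points at the full allocation including the border; mi is offset
       to the first visible macroblock so mi[-1] and mi[-stride] are valid. */
    cm->mi      = cm->mip      + cm->mode_info_stride + 1;
    cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1;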
--- a/vp8/common/onyxd.h
+++ /dev/null
@@ -1,68 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_ONYXD_H
-#define __INC_ONYXD_H
-
-
-/* Create/destroy static data structures. */
-#ifdef __cplusplus
-extern "C"
-{
-#endif
-#include "type_aliases.h"
-#include "vpx_scale/yv12config.h"
-#include "ppflags.h"
-#include "vpx_ports/mem.h"
-#include "vpx/vpx_codec.h"
-
-  typedef void   *VP9D_PTR;
-  typedef struct {
-    int     Width;
-    int     Height;
-    int     Version;
-    int     postprocess;
-    int     max_threads;
-    int     input_partition;
-  } VP9D_CONFIG;
-  typedef enum {
-    VP9_LAST_FLAG = 1,
-    VP9_GOLD_FLAG = 2,
-    VP9_ALT_FLAG = 4
-  } VP9_REFFRAME;
-
-  void vp9_initialize_dec(void);
-
-  int vp9_receive_compressed_data(VP9D_PTR comp, unsigned long size,
-                                  const unsigned char *dest,
-                                  int64_t time_stamp);
-
-  int vp9_get_raw_frame(VP9D_PTR comp, YV12_BUFFER_CONFIG *sd,
-                        int64_t *time_stamp, int64_t *time_end_stamp,
-                        vp9_ppflags_t *flags);
-
-  vpx_codec_err_t vp9_get_reference_dec(VP9D_PTR comp,
-                                        VP9_REFFRAME ref_frame_flag,
-                                        YV12_BUFFER_CONFIG *sd);
-
-  vpx_codec_err_t vp9_set_reference_dec(VP9D_PTR comp,
-                                        VP9_REFFRAME ref_frame_flag,
-                                        YV12_BUFFER_CONFIG *sd);
-
-  VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf);
-
-  void vp9_remove_decompressor(VP9D_PTR comp);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif  // __INC_ONYXD_H
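A hedged usage sketch of the decoder API above; buf, buf_size and pts are placeholders for real bitstream input, and error handling is elided:

    VP9D_CONFIG cfg = { 0 };
    VP9D_PTR dec;
    YV12_BUFFER_CONFIG frame;
    vp9_ppflags_t ppflags = { 0 };
    int64_t pts_out, pts_end;
    /* buf / buf_size / pts come from the demuxer (placeholders here) */
    const unsigned char *buf = 0;
    unsigned long buf_size = 0;
    int64_t pts = 0;

    cfg.max_threads = 1;
    vp9_initialize_dec();
    dec = vp9_create_decompressor(&cfg);
    if (vp9_receive_compressed_data(dec, buf_size, buf, pts) == 0 &&
        vp9_get_raw_frame(dec, &frame, &pts_out, &pts_end, &ppflags) == 0) {
      /* frame now describes the decoded (optionally post-processed) picture */
    }
    vp9_remove_decompressor(dec);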
--- a/vp8/common/postproc.c
+++ /dev/null
@@ -1,1035 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "vpx_scale/yv12config.h"
-#include "postproc.h"
-#include "vpx_scale/yv12extend.h"
-#include "vpx_scale/vpxscale.h"
-#include "systemdependent.h"
-
-#include <math.h>
-#include <stdlib.h>
-#include <stdio.h>
-
-#define RGB_TO_YUV(t)                                            \
-  ( (0.257*(float)(t >> 16))  + (0.504*(float)(t >> 8 & 0xff)) + \
-    (0.098*(float)(t & 0xff)) + 16),                             \
-  (-(0.148*(float)(t >> 16))  - (0.291*(float)(t >> 8 & 0xff)) + \
-    (0.439*(float)(t & 0xff)) + 128),                            \
-  ( (0.439*(float)(t >> 16))  - (0.368*(float)(t >> 8 & 0xff)) - \
-    (0.071*(float)(t & 0xff)) + 128)
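The macro is the BT.601 studio-swing RGB-to-YUV transform applied to a packed 0xRRGGBB literal, expanding to three comma-separated values. Worked through for pure green 0x00FF00 (R=0, G=255, B=0): Y = 0.504*255 + 16 = 144.5, U = -0.291*255 + 128 = 53.8, V = -0.368*255 + 128 = 34.2, which truncate to {144, 53, 34} when stored in the unsigned char tables below.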
-
-/* global constants */
-#if CONFIG_POSTPROC_VISUALIZER
-static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] = {
-  { RGB_TO_YUV(0x98FB98) },   /* PaleGreen */
-  { RGB_TO_YUV(0x00FF00) },   /* Green */
-  { RGB_TO_YUV(0xADFF2F) },   /* GreenYellow */
-  { RGB_TO_YUV(0x8F0000) },   /* Dark Red */
-  { RGB_TO_YUV(0x008F8F) },   /* Dark Cyan */
-  { RGB_TO_YUV(0x008F8F) },   /* Dark Cyan */
-  { RGB_TO_YUV(0x008F8F) },   /* Dark Cyan */
-  { RGB_TO_YUV(0x8F0000) },   /* Dark Red */
-  { RGB_TO_YUV(0x8F0000) },   /* Dark Red */
-  { RGB_TO_YUV(0x228B22) },   /* ForestGreen */
-  { RGB_TO_YUV(0x006400) },   /* DarkGreen */
-  { RGB_TO_YUV(0x98F5FF) },   /* Cadet Blue */
-  { RGB_TO_YUV(0x6CA6CD) },   /* Sky Blue */
-  { RGB_TO_YUV(0x00008B) },   /* Dark blue */
-  { RGB_TO_YUV(0x551A8B) },   /* Purple */
-  { RGB_TO_YUV(0xFF0000) },   /* Red */
-  { RGB_TO_YUV(0xCC33FF) },   /* Magenta */
-};
-
-static const unsigned char B_PREDICTION_MODE_colors[B_MODE_COUNT][3] = {
-  { RGB_TO_YUV(0x6633ff) },   /* Purple */
-  { RGB_TO_YUV(0xcc33ff) },   /* Magenta */
-  { RGB_TO_YUV(0xff33cc) },   /* Pink */
-  { RGB_TO_YUV(0xff3366) },   /* Coral */
-  { RGB_TO_YUV(0x3366ff) },   /* Blue */
-  { RGB_TO_YUV(0xed00f5) },   /* Dark Blue */
-  { RGB_TO_YUV(0x2e00b8) },   /* Dark Purple */
-  { RGB_TO_YUV(0xff6633) },   /* Orange */
-  { RGB_TO_YUV(0x33ccff) },   /* Light Blue */
-  { RGB_TO_YUV(0x8ab800) },   /* Green */
-  { RGB_TO_YUV(0xffcc33) },   /* Light Orange */
-  { RGB_TO_YUV(0x33ffcc) },   /* Aqua */
-  { RGB_TO_YUV(0x66ff33) },   /* Light Green */
-  { RGB_TO_YUV(0xccff33) },   /* Yellow */
-};
-
-static const unsigned char MV_REFERENCE_FRAME_colors[MAX_REF_FRAMES][3] = {
-  { RGB_TO_YUV(0x00ff00) },   /* Green */
-  { RGB_TO_YUV(0x0000ff) },   /* Blue */
-  { RGB_TO_YUV(0xffff00) },   /* Yellow */
-  { RGB_TO_YUV(0xff0000) },   /* Red */
-};
-#endif
-
-static const short kernel5[] = {
-  1, 1, 4, 1, 1
-};
-
-const short vp9_rv[] = {
-  8, 5, 2, 2, 8, 12, 4, 9, 8, 3,
-  0, 3, 9, 0, 0, 0, 8, 3, 14, 4,
-  10, 1, 11, 14, 1, 14, 9, 6, 12, 11,
-  8, 6, 10, 0, 0, 8, 9, 0, 3, 14,
-  8, 11, 13, 4, 2, 9, 0, 3, 9, 6,
-  1, 2, 3, 14, 13, 1, 8, 2, 9, 7,
-  3, 3, 1, 13, 13, 6, 6, 5, 2, 7,
-  11, 9, 11, 8, 7, 3, 2, 0, 13, 13,
-  14, 4, 12, 5, 12, 10, 8, 10, 13, 10,
-  4, 14, 4, 10, 0, 8, 11, 1, 13, 7,
-  7, 14, 6, 14, 13, 2, 13, 5, 4, 4,
-  0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
-  8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
-  3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
-  3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
-  13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
-  5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
-  9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
-  4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
-  3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
-  11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
-  5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
-  0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
-  10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
-  4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
-  0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
-  8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
-  3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
-  3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
-  13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
-  5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
-  9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
-  4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
-  3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
-  11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
-  5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
-  0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
-  10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
-  4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
-  3, 8, 3, 7, 8, 5, 11, 4, 12, 3,
-  11, 9, 14, 8, 14, 13, 4, 3, 1, 2,
-  14, 6, 5, 4, 4, 11, 4, 6, 2, 1,
-  5, 8, 8, 12, 13, 5, 14, 10, 12, 13,
-  0, 9, 5, 5, 11, 10, 13, 9, 10, 13,
-};
-
-
-extern void vp9_blit_text(const char *msg, unsigned char *address,
-                          const int pitch);
-extern void vp9_blit_line(int x0, int x1, int y0, int y1,
-                          unsigned char *image, const int pitch);
-/****************************************************************************
- *  Thresholded 5-tap {1, 1, 4, 1, 1} smoothing: each row is filtered down
- *  (vertically) into dst, then across (horizontally) in place; pixels whose
- *  neighborhood differs by more than flimit are passed through unchanged.
- ****************************************************************************/
-void vp9_post_proc_down_and_across_c(unsigned char *src_ptr,
-                                     unsigned char *dst_ptr,
-                                     int src_pixels_per_line,
-                                     int dst_pixels_per_line,
-                                     int rows,
-                                     int cols,
-                                     int flimit) {
-  unsigned char *p_src, *p_dst;
-  int row;
-  int col;
-  int i;
-  int v;
-  int pitch = src_pixels_per_line;
-  unsigned char d[8];
-  (void)dst_pixels_per_line;
-
-  for (row = 0; row < rows; row++) {
-    /* post_proc_down for one row */
-    p_src = src_ptr;
-    p_dst = dst_ptr;
-
-    for (col = 0; col < cols; col++) {
-
-      int kernel = 4;
-      int v = p_src[col];
-
-      for (i = -2; i <= 2; i++) {
-        if (abs(v - p_src[col + i * pitch]) > flimit)
-          goto down_skip_convolve;
-
-        kernel += kernel5[2 + i] * p_src[col + i * pitch];
-      }
-
-      v = (kernel >> 3);
-    down_skip_convolve:
-      p_dst[col] = v;
-    }
-
-    /* now post_proc_across */
-    p_src = dst_ptr;
-    p_dst = dst_ptr;
-
-    for (i = 0; i < 8; i++)
-      d[i] = p_src[i];
-
-    for (col = 0; col < cols; col++) {
-      int kernel = 4;
-      v = p_src[col];
-
-      d[col & 7] = v;
-
-      for (i = -2; i <= 2; i++) {
-        if (abs(v - p_src[col + i]) > flimit)
-          goto across_skip_convolve;
-
-        kernel += kernel5[2 + i] * p_src[col + i];
-      }
-
-      d[col & 7] = (kernel >> 3);
-    across_skip_convolve:
-
-      if (col >= 2)
-        p_dst[col - 2] = d[(col - 2) & 7];
-    }
-
-    /* handle the last two pixels */
-    p_dst[col - 2] = d[(col - 2) & 7];
-    p_dst[col - 1] = d[(col - 1) & 7];
-
-
-    /* next row */
-    src_ptr += pitch;
-    dst_ptr += pitch;
-  }
-}
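A 1-D distillation of the kernel logic above, as a hypothetical standalone helper (p points at the center pixel; requires <stdlib.h> for abs): the taps {1, 1, 4, 1, 1} sum to 8, the initial 4 is the rounding bias for the >> 3 divide, and any neighbor further than flimit from the center aborts the convolution so edges pass through unfiltered.

    static unsigned char smooth5(const unsigned char *p, int flimit) {
      static const short k[5] = { 1, 1, 4, 1, 1 };
      int acc = 4;  /* rounding bias: (sum + 4) >> 3 rounds to nearest */
      int i;
      for (i = -2; i <= 2; i++) {
        if (abs(p[0] - p[i]) > flimit)
          return p[0];  /* neighborhood too busy: keep the pixel */
        acc += k[2 + i] * p[i];
      }
      return (unsigned char)(acc >> 3);
    }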
-
-static int q2mbl(int x) {
-  if (x < 20) x = 20;
-
-  x = 50 + (x - 50) * 10 / 8;
-  return x * x / 3;
-}
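q2mbl() turns a quantizer into the squared flatness limit used by the two macroblock post-filters below. Worked through for q = 40: x = 50 + (40 - 50) * 10 / 8 = 38 with integer division, so the limit is 38 * 38 / 3 = 481; inputs below 20 are clamped first, giving a floor of q2mbl(20) = 56.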
-
-void vp9_mbpost_proc_across_ip_c(unsigned char *src, int pitch,
-                                 int rows, int cols, int flimit) {
-  int r, c, i;
-
-  unsigned char *s = src;
-  unsigned char d[16];
-
-
-  for (r = 0; r < rows; r++) {
-    int sumsq = 0;
-    int sum   = 0;
-
-    for (i = -8; i <= 6; i++) {
-      sumsq += s[i] * s[i];
-      sum   += s[i];
-      d[i + 8] = 0;
-    }
-
-    for (c = 0; c < cols + 8; c++) {
-      int x = s[c + 7] - s[c - 8];
-      int y = s[c + 7] + s[c - 8];
-
-      sum  += x;
-      sumsq += x * y;
-
-      d[c & 15] = s[c];
-
-      if (sumsq * 15 - sum * sum < flimit) {
-        d[c & 15] = (8 + sum + s[c]) >> 4;
-      }
-
-      s[c - 8] = d[(c - 8) & 15];
-    }
-
-    s += pitch;
-  }
-}
-
-void vp9_mbpost_proc_down_c(unsigned char *dst, int pitch,
-                            int rows, int cols, int flimit) {
-  int r, c, i;
-  const short *rv3 = &vp9_rv[63 & rand()];
-
-  for (c = 0; c < cols; c++) {
-    unsigned char *s = &dst[c];
-    int sumsq = 0;
-    int sum   = 0;
-    unsigned char d[16];
-    const short *rv2 = rv3 + ((c * 17) & 127);
-
-    for (i = -8; i <= 6; i++) {
-      sumsq += s[i * pitch] * s[i * pitch];
-      sum   += s[i * pitch];
-    }
-
-    for (r = 0; r < rows + 8; r++) {
-      sumsq += s[7 * pitch] * s[7 * pitch] - s[-8 * pitch] * s[-8 * pitch];
-      sum  += s[7 * pitch] - s[-8 * pitch];
-      d[r & 15] = s[0];
-
-      if (sumsq * 15 - sum * sum < flimit) {
-        d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4;
-      }
-
-      s[-8 * pitch] = d[(r - 8) & 15];
-      s += pitch;
-    }
-  }
-}
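Both loops above keep a running sum and sum of squares over a sliding 15-pixel window and test sumsq * 15 - sum * sum < flimit. For n samples, n * sum(x^2) - (sum(x))^2 = n^2 * variance, so the (rounding + sum + center) >> 4 average of 16 values is applied only where the window is nearly flat; in the vertical pass the vp9_rv table supplies per-position pseudo-random rounding offsets so the fixed /16 does not band.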
-
-static void deblock_and_de_macro_block(YV12_BUFFER_CONFIG   *source,
-                                       YV12_BUFFER_CONFIG   *post,
-                                       int                   q,
-                                       int                   low_var_thresh,
-                                       int                   flag,
-                                       vp9_postproc_rtcd_vtable_t *rtcd) {
-  double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
-  int ppl = (int)(level + .5);
-  (void) low_var_thresh;
-  (void) flag;
-
-  POSTPROC_INVOKE(rtcd, downacross)(source->y_buffer, post->y_buffer,
-                                    source->y_stride,  post->y_stride,
-                                    source->y_height, source->y_width,  ppl);
-  POSTPROC_INVOKE(rtcd, across)(post->y_buffer, post->y_stride,
-                                post->y_height, post->y_width, q2mbl(q));
-  POSTPROC_INVOKE(rtcd, down)(post->y_buffer, post->y_stride,
-                              post->y_height, post->y_width, q2mbl(q));
-
-  POSTPROC_INVOKE(rtcd, downacross)(source->u_buffer, post->u_buffer,
-                                    source->uv_stride, post->uv_stride,
-                                    source->uv_height, source->uv_width, ppl);
-  POSTPROC_INVOKE(rtcd, downacross)(source->v_buffer, post->v_buffer,
-                                    source->uv_stride, post->uv_stride,
-                                    source->uv_height, source->uv_width, ppl);
-}
-
-void vp9_deblock(YV12_BUFFER_CONFIG         *source,
-                 YV12_BUFFER_CONFIG         *post,
-                 int                         q,
-                 int                         low_var_thresh,
-                 int                         flag,
-                 vp9_postproc_rtcd_vtable_t *rtcd) {
-  double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
-  int ppl = (int)(level + .5);
-  (void) low_var_thresh;
-  (void) flag;
-
-  POSTPROC_INVOKE(rtcd, downacross)(source->y_buffer, post->y_buffer,
-                                    source->y_stride,  post->y_stride,
-                                    source->y_height, source->y_width,   ppl);
-  POSTPROC_INVOKE(rtcd, downacross)(source->u_buffer, post->u_buffer,
-                                    source->uv_stride, post->uv_stride,
-                                    source->uv_height, source->uv_width, ppl);
-  POSTPROC_INVOKE(rtcd, downacross)(source->v_buffer, post->v_buffer,
-                                    source->uv_stride, post->uv_stride,
-                                    source->uv_height, source->uv_width, ppl);
-}
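deblock_and_de_macro_block() and vp9_deblock() share the same strength mapping: level = 6e-5*q^3 - 0.0067*q^2 + 0.306*q + 0.0065, rounded to the nearest integer. For q = 32 that is 1.966 - 6.861 + 9.792 + 0.007 = 4.904, so ppl = 5 becomes the flimit handed to the down/across filter.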
-
-void vp9_de_noise(YV12_BUFFER_CONFIG         *src,
-                  YV12_BUFFER_CONFIG         *post,
-                  int                         q,
-                  int                         low_var_thresh,
-                  int                         flag,
-                  vp9_postproc_rtcd_vtable_t *rtcd) {
-  double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
-  int ppl = (int)(level + .5);
-  (void) post;
-  (void) low_var_thresh;
-  (void) flag;
-
-  POSTPROC_INVOKE(rtcd, downacross)(src->y_buffer + 2 * src->y_stride + 2,
-                                    src->y_buffer + 2 * src->y_stride + 2,
-                                    src->y_stride,
-                                    src->y_stride,
-                                    src->y_height - 4,
-                                    src->y_width - 4,
-                                    ppl);
-  POSTPROC_INVOKE(rtcd, downacross)(src->u_buffer + 2 * src->uv_stride + 2,
-                                    src->u_buffer + 2 * src->uv_stride + 2,
-                                    src->uv_stride,
-                                    src->uv_stride,
-                                    src->uv_height - 4,
-                                    src->uv_width - 4, ppl);
-  POSTPROC_INVOKE(rtcd, downacross)(src->v_buffer + 2 * src->uv_stride + 2,
-                                    src->v_buffer + 2 * src->uv_stride + 2,
-                                    src->uv_stride,
-                                    src->uv_stride,
-                                    src->uv_height - 4,
-                                    src->uv_width - 4, ppl);
-}
-
-double vp9_gaussian(double sigma, double mu, double x) {
-  return 1 / (sigma * sqrt(2.0 * 3.14159265)) *
-         (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma)));
-}
-
-static void fillrd(struct postproc_state *state, int q, int a) {
-  char char_dist[300];
-
-  double sigma;
-  int ai = a, qi = q, i;
-
-  vp9_clear_system_state();
-
-  sigma = ai + .5 + .6 * (63 - qi) / 63.0;
-
-  /* set up a lookup table of 256 entries that matches
-   * a gaussian distribution with sigma determined by q.
-   */
-  {
-    double i;
-    int next, j;
-
-    next = 0;
-
-    for (i = -32; i < 32; i++) {
-      int a = (int)(.5 + 256 * vp9_gaussian(sigma, 0, i));
-
-      if (a) {
-        for (j = 0; j < a; j++) {
-          char_dist[next + j] = (char) i;
-        }
-
-        next = next + j;
-      }
-
-    }
-
-    for (; next < 256; next++)
-      char_dist[next] = 0;
-  }
-
-  for (i = 0; i < 3072; i++) {
-    state->noise[i] = char_dist[rand() & 0xff];
-  }
-
-  for (i = 0; i < 16; i++) {
-    state->blackclamp[i] = -char_dist[0];
-    state->whiteclamp[i] = -char_dist[0];
-    state->bothclamp[i] = -2 * char_dist[0];
-  }
-
-  state->last_q = q;
-  state->last_noise = a;
-}
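The table construction above is an inverse-CDF draw: each value i gets round(256 * N(i; 0, sigma)) consecutive slots of char_dist and leftover slots are zeroed, so indexing with a uniform rand() & 0xff (as the noise loop below does) samples an approximately Gaussian distribution. For example, at the minimum sigma = 0.5 + 0.6 = 1.1 (q = 0, a = 0), the value 0 alone occupies round(256 / (1.1 * sqrt(2*pi))) = 93 of the 256 slots. All three clamps derive from char_dist[0], the most negative value generated, relying on the distribution's symmetry.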
-
-/****************************************************************************
- *
- *  ROUTINE       : plane_add_noise_c
- *
- *  INPUTS        : unsigned char *Start  starting address of buffer to
- *                                        add gaussian noise to
- *                  unsigned int Width    width of plane
- *                  unsigned int Height   height of plane
- *                  int  Pitch    distance between subsequent lines of frame
- *                  int  q        quantizer used to determine amount of noise
- *                                  to add
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void.
- *
- *  FUNCTION      : adds gaussian noise to a plane of pixels
- *
- *  SPECIAL NOTES : None.
- *
- ****************************************************************************/
-void vp9_plane_add_noise_c(unsigned char *Start, char *noise,
-                           char blackclamp[16],
-                           char whiteclamp[16],
-                           char bothclamp[16],
-                           unsigned int Width, unsigned int Height, int Pitch) {
-  unsigned int i, j;
-
-  for (i = 0; i < Height; i++) {
-    unsigned char *Pos = Start + i * Pitch;
-    char  *Ref = (char *)(noise + (rand() & 0xff));
-
-    for (j = 0; j < Width; j++) {
-      if (Pos[j] < blackclamp[0])
-        Pos[j] = blackclamp[0];
-
-      /* whiteclamp[0] is positive (fillrd() stores -char_dist[0]); clamp
-         bright pixels down so adding positive noise cannot exceed 255 */
-      if (Pos[j] > 255 - whiteclamp[0])
-        Pos[j] = 255 - whiteclamp[0];
-
-      Pos[j] += Ref[j];
-    }
-  }
-}
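A bound check on the add above, using the values fillrd() stored: blackclamp[0] = whiteclamp[0] = -char_dist[0], the magnitude of the most negative noise sample (by symmetry also roughly the most positive one). After the two pre-clamps,

    -char_dist[0] <= Pos[j] <= 255 - whiteclamp[0]
     char_dist[0] <= Ref[j] <= whiteclamp[0]

so 0 <= Pos[j] + Ref[j] <= 255 and the unsigned addition cannot wrap.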
-
-/* Blend the macroblock with a solid colored square, leaving the
- * edges unblended so that adjacent macroblocks drawn in the same
- * color remain visually distinct.
- */
-void vp9_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v,
-                          int y1, int u1, int v1, int alpha, int stride) {
-  int i, j;
-  int y1_const = y1 * ((1 << 16) - alpha);
-  int u1_const = u1 * ((1 << 16) - alpha);
-  int v1_const = v1 * ((1 << 16) - alpha);
-
-  y += 2 * stride + 2;
-  for (i = 0; i < 12; i++) {
-    for (j = 0; j < 12; j++) {
-      y[j] = (y[j] * alpha + y1_const) >> 16;
-    }
-    y += stride;
-  }
-
-  stride >>= 1;
-
-  u += stride + 1;
-  v += stride + 1;
-
-  for (i = 0; i < 6; i++) {
-    for (j = 0; j < 6; j++) {
-      u[j] = (u[j] * alpha + u1_const) >> 16;
-      v[j] = (v[j] * alpha + v1_const) >> 16;
-    }
-    u += stride;
-    v += stride;
-  }
-}
-
-/* Blend only the edge of the macro block.  Leave center
- * unblended to allow for other visualizations to be layered.
- */
-void vp9_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v,
-                          int y1, int u1, int v1, int alpha, int stride) {
-  int i, j;
-  int y1_const = y1 * ((1 << 16) - alpha);
-  int u1_const = u1 * ((1 << 16) - alpha);
-  int v1_const = v1 * ((1 << 16) - alpha);
-
-  for (i = 0; i < 2; i++) {
-    for (j = 0; j < 16; j++) {
-      y[j] = (y[j] * alpha + y1_const) >> 16;
-    }
-    y += stride;
-  }
-
-  for (i = 0; i < 12; i++) {
-    y[0]  = (y[0] * alpha  + y1_const) >> 16;
-    y[1]  = (y[1] * alpha  + y1_const) >> 16;
-    y[14] = (y[14] * alpha + y1_const) >> 16;
-    y[15] = (y[15] * alpha + y1_const) >> 16;
-    y += stride;
-  }
-
-  for (i = 0; i < 2; i++) {
-    for (j = 0; j < 16; j++) {
-      y[j] = (y[j] * alpha + y1_const) >> 16;
-    }
-    y += stride;
-  }
-
-  stride >>= 1;
-
-  for (j = 0; j < 8; j++) {
-    u[j] = (u[j] * alpha + u1_const) >> 16;
-    v[j] = (v[j] * alpha + v1_const) >> 16;
-  }
-  u += stride;
-  v += stride;
-
-  for (i = 0; i < 6; i++) {
-    u[0] = (u[0] * alpha + u1_const) >> 16;
-    v[0] = (v[0] * alpha + v1_const) >> 16;
-
-    u[7] = (u[7] * alpha + u1_const) >> 16;
-    v[7] = (v[7] * alpha + v1_const) >> 16;
-
-    u += stride;
-    v += stride;
-  }
-
-  for (j = 0; j < 8; j++) {
-    u[j] = (u[j] * alpha + u1_const) >> 16;
-    v[j] = (v[j] * alpha + v1_const) >> 16;
-  }
-}
-
-void vp9_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v,
-                   int y1, int u1, int v1, int alpha, int stride) {
-  int i, j;
-  int y1_const = y1 * ((1 << 16) - alpha);
-  int u1_const = u1 * ((1 << 16) - alpha);
-  int v1_const = v1 * ((1 << 16) - alpha);
-
-  for (i = 0; i < 4; i++) {
-    for (j = 0; j < 4; j++) {
-      y[j] = (y[j] * alpha + y1_const) >> 16;
-    }
-    y += stride;
-  }
-
-  stride >>= 1;
-
-  for (i = 0; i < 2; i++) {
-    for (j = 0; j < 2; j++) {
-      u[j] = (u[j] * alpha + u1_const) >> 16;
-      v[j] = (v[j] * alpha + v1_const) >> 16;
-    }
-    u += stride;
-    v += stride;
-  }
-}
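All three blend routines above use the same Q16 fixed-point mix, shown here as a hypothetical scalar helper; with the callers' alpha = 0xc000 (49152/65536 = 0.75) the result is 75% source pixel, 25% overlay color:

    static unsigned char blend_q16(unsigned char pixel, unsigned char overlay,
                                   int alpha) {
      /* out = alpha*pixel + (1 - alpha)*overlay, in 16.16 fixed point */
      return (unsigned char)((pixel * alpha +
                              overlay * ((1 << 16) - alpha)) >> 16);
    }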
-
-static void constrain_line(int x0, int *x1, int y0, int *y1,
-                           int width, int height) {
-  int dx;
-  int dy;
-
-  if (*x1 > width) {
-    dx = *x1 - x0;
-    dy = *y1 - y0;
-
-    *x1 = width;
-    if (dx)
-      *y1 = ((width - x0) * dy) / dx + y0;
-  }
-  if (*x1 < 0) {
-    dx = *x1 - x0;
-    dy = *y1 - y0;
-
-    *x1 = 0;
-    if (dx)
-      *y1 = ((0 - x0) * dy) / dx + y0;
-  }
-  if (*y1 > height) {
-    dx = *x1 - x0;
-    dy = *y1 - y0;
-
-    *y1 = height;
-    if (dy)
-      *x1 = ((height - y0) * dx) / dy + x0;
-  }
-  if (*y1 < 0) {
-    dx = *x1 - x0;
-    dy = *y1 - y0;
-
-    *y1 = 0;
-    if (dy)
-      *x1 = ((0 - y0) * dx) / dy + x0;
-  }
-}
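constrain_line() clips the far endpoint of each motion-vector ray against a frame edge by similar triangles: an overshoot x1 > width is pulled back to x1 = width with y1 = y0 + (width - x0) * dy / dx, and likewise for the other three edges. For example, clipping (10,10)-(30,20) against width = 20 gives y1 = 10 + (20 - 10) * 10 / 20 = 15, so the segment ends at (20, 15).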
-
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define RTCD_VTABLE(oci) (&(oci)->rtcd.postproc)
-#else
-#define RTCD_VTABLE(oci) NULL
-#endif
-
-int vp9_post_proc_frame(VP9_COMMON *oci, YV12_BUFFER_CONFIG *dest,
-                        vp9_ppflags_t *ppflags) {
-  int q = oci->filter_level * 10 / 6;
-  int flags = ppflags->post_proc_flag;
-  int deblock_level = ppflags->deblocking_level;
-  int noise_level = ppflags->noise_level;
-
-  if (!oci->frame_to_show)
-    return -1;
-
-  if (q > 63)
-    q = 63;
-
-  if (!flags) {
-    *dest = *oci->frame_to_show;
-
-    /* handle problem with extending borders */
-    dest->y_width = oci->Width;
-    dest->y_height = oci->Height;
-    dest->uv_height = dest->y_height / 2;
-    return 0;
-
-  }
-
-#if ARCH_X86 || ARCH_X86_64
-  vpx_reset_mmx_state();
-#endif
-
-  if (flags & VP9D_DEMACROBLOCK) {
-    deblock_and_de_macro_block(oci->frame_to_show, &oci->post_proc_buffer,
-                               q + (deblock_level - 5) * 10, 1, 0,
-                               RTCD_VTABLE(oci));
-  } else if (flags & VP9D_DEBLOCK) {
-    vp9_deblock(oci->frame_to_show, &oci->post_proc_buffer,
-                q, 1, 0, RTCD_VTABLE(oci));
-  } else {
-    vp8_yv12_copy_frame_ptr(oci->frame_to_show, &oci->post_proc_buffer);
-  }
-
-  if (flags & VP9D_ADDNOISE) {
-    /* fillrd() records its strength argument, 63 - q, as last_q */
-    if (oci->postproc_state.last_q != 63 - q
-        || oci->postproc_state.last_noise != noise_level) {
-      fillrd(&oci->postproc_state, 63 - q, noise_level);
-    }
-
-    POSTPROC_INVOKE(RTCD_VTABLE(oci), addnoise)(oci->post_proc_buffer.y_buffer,
-                                                oci->postproc_state.noise,
-                                                oci->postproc_state.blackclamp,
-                                                oci->postproc_state.whiteclamp,
-                                                oci->postproc_state.bothclamp,
-                                                oci->post_proc_buffer.y_width,
-                                                oci->post_proc_buffer.y_height,
-                                                oci->post_proc_buffer.y_stride);
-  }
-
-#if CONFIG_POSTPROC_VISUALIZER
-  if (flags & VP9D_DEBUG_TXT_FRAME_INFO) {
-    char message[512];
-    sprintf(message, "F%1dG%1dQ%3dF%3dP%d_s%dx%d",
-            (oci->frame_type == KEY_FRAME),
-            oci->refresh_golden_frame,
-            oci->base_qindex,
-            oci->filter_level,
-            flags,
-            oci->mb_cols, oci->mb_rows);
-    vp9_blit_text(message, oci->post_proc_buffer.y_buffer,
-                  oci->post_proc_buffer.y_stride);
-  }
-
-  if (flags & VP9D_DEBUG_TXT_MBLK_MODES) {
-    int i, j;
-    unsigned char *y_ptr;
-    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
-    int mb_rows = post->y_height >> 4;
-    int mb_cols = post->y_width  >> 4;
-    int mb_index = 0;
-    MODE_INFO *mi = oci->mi;
-
-    y_ptr = post->y_buffer + 4 * post->y_stride + 4;
-
-    /* vp9_filter each macro block */
-    for (i = 0; i < mb_rows; i++) {
-      for (j = 0; j < mb_cols; j++) {
-        char zz[4];
-
-        sprintf(zz, "%c", mi[mb_index].mbmi.mode + 'a');
-
-        vp9_blit_text(zz, y_ptr, post->y_stride);
-        mb_index++;
-        y_ptr += 16;
-      }
-
-      mb_index++; /* border */
-      y_ptr += post->y_stride  * 16 - post->y_width;
-
-    }
-  }
-
-  if (flags & VP9D_DEBUG_TXT_DC_DIFF) {
-    int i, j;
-    unsigned char *y_ptr;
-    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
-    int mb_rows = post->y_height >> 4;
-    int mb_cols = post->y_width  >> 4;
-    int mb_index = 0;
-    MODE_INFO *mi = oci->mi;
-
-    y_ptr = post->y_buffer + 4 * post->y_stride + 4;
-
-    /* vp9_filter each macro block */
-    for (i = 0; i < mb_rows; i++) {
-      for (j = 0; j < mb_cols; j++) {
-        char zz[4];
-        int dc_diff = !(mi[mb_index].mbmi.mode != B_PRED &&
-                        mi[mb_index].mbmi.mode != SPLITMV &&
-                        mi[mb_index].mbmi.mb_skip_coeff);
-
-        if (oci->frame_type == KEY_FRAME)
-          sprintf(zz, "a");
-        else
-          sprintf(zz, "%c", dc_diff + '0');
-
-        vp9_blit_text(zz, y_ptr, post->y_stride);
-        mb_index++;
-        y_ptr += 16;
-      }
-
-      mb_index++; /* border */
-      y_ptr += post->y_stride  * 16 - post->y_width;
-
-    }
-  }
-
-  if (flags & VP9D_DEBUG_TXT_RATE_INFO) {
-    char message[512];
-    snprintf(message, sizeof(message),
-             "Bitrate: %10.2f frame_rate: %10.2f ",
-             oci->bitrate, oci->framerate);
-    vp9_blit_text(message, oci->post_proc_buffer.y_buffer,
-                  oci->post_proc_buffer.y_stride);
-  }
-
-  /* Draw motion vectors */
-  if ((flags & VP9D_DEBUG_DRAW_MV) && ppflags->display_mv_flag) {
-    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
-    int width  = post->y_width;
-    int height = post->y_height;
-    unsigned char *y_buffer = oci->post_proc_buffer.y_buffer;
-    int y_stride = oci->post_proc_buffer.y_stride;
-    MODE_INFO *mi = oci->mi;
-    int x0, y0;
-
-    for (y0 = 0; y0 < height; y0 += 16) {
-      for (x0 = 0; x0 < width; x0 += 16) {
-        int x1, y1;
-
-        if (!(ppflags->display_mv_flag & (1 << mi->mbmi.mode))) {
-          mi++;
-          continue;
-        }
-
-        if (mi->mbmi.mode == SPLITMV) {
-          switch (mi->mbmi.partitioning) {
-            case PARTITIONING_16X8 : {  /* mv_top_bottom */
-              union b_mode_info *bmi = &mi->bmi[0];
-              MV *mv = &bmi->mv.as_mv;
-
-              x1 = x0 + 8 + (mv->col >> 3);
-              y1 = y0 + 4 + (mv->row >> 3);
-
-              constrain_line(x0 + 8, &x1, y0 + 4, &y1, width, height);
-              vp9_blit_line(x0 + 8,  x1, y0 + 4,  y1, y_buffer, y_stride);
-
-              bmi = &mi->bmi[8];
-              mv = &bmi->mv.as_mv;  /* vector of the bottom half-block */
-
-              x1 = x0 + 8 + (mv->col >> 3);
-              y1 = y0 + 12 + (mv->row >> 3);
-
-              constrain_line(x0 + 8, &x1, y0 + 12, &y1, width, height);
-              vp9_blit_line(x0 + 8,  x1, y0 + 12,  y1, y_buffer, y_stride);
-
-              break;
-            }
-            case PARTITIONING_8X16 : {  /* mv_left_right */
-              union b_mode_info *bmi = &mi->bmi[0];
-              MV *mv = &bmi->mv.as_mv;
-
-              x1 = x0 + 4 + (mv->col >> 3);
-              y1 = y0 + 8 + (mv->row >> 3);
-
-              constrain_line(x0 + 4, &x1, y0 + 8, &y1, width, height);
-              vp9_blit_line(x0 + 4,  x1, y0 + 8,  y1, y_buffer, y_stride);
-
-              bmi = &mi->bmi[2];
-              mv = &bmi->mv.as_mv;  /* vector of the right half-block */
-
-              x1 = x0 + 12 + (mv->col >> 3);
-              y1 = y0 + 8 + (mv->row >> 3);
-
-              constrain_line(x0 + 12, &x1, y0 + 8, &y1, width, height);
-              vp9_blit_line(x0 + 12,  x1, y0 + 8,  y1, y_buffer, y_stride);
-
-              break;
-            }
-            case PARTITIONING_8X8 : {  /* mv_quarters   */
-              union b_mode_info *bmi = &mi->bmi[0];
-              MV *mv = &bmi->mv.as_mv;
-
-              x1 = x0 + 4 + (mv->col >> 3);
-              y1 = y0 + 4 + (mv->row >> 3);
-
-              constrain_line(x0 + 4, &x1, y0 + 4, &y1, width, height);
-              vp9_blit_line(x0 + 4,  x1, y0 + 4,  y1, y_buffer, y_stride);
-
-              bmi = &mi->bmi[2];
-              mv = &bmi->mv.as_mv;  /* top-right quarter */
-
-              x1 = x0 + 12 + (mv->col >> 3);
-              y1 = y0 + 4 + (mv->row >> 3);
-
-              constrain_line(x0 + 12, &x1, y0 + 4, &y1, width, height);
-              vp9_blit_line(x0 + 12,  x1, y0 + 4,  y1, y_buffer, y_stride);
-
-              bmi = &mi->bmi[8];
-              mv = &bmi->mv.as_mv;  /* bottom-left quarter */
-
-              x1 = x0 + 4 + (mv->col >> 3);
-              y1 = y0 + 12 + (mv->row >> 3);
-
-              constrain_line(x0 + 4, &x1, y0 + 12, &y1, width, height);
-              vp9_blit_line(x0 + 4,  x1, y0 + 12,  y1, y_buffer, y_stride);
-
-              bmi = &mi->bmi[10];
-              mv = &bmi->mv.as_mv;  /* bottom-right quarter */
-
-              x1 = x0 + 12 + (mv->col >> 3);
-              y1 = y0 + 12 + (mv->row >> 3);
-
-              constrain_line(x0 + 12, &x1, y0 + 12, &y1, width, height);
-              vp9_blit_line(x0 + 12,  x1, y0 + 12,  y1, y_buffer, y_stride);
-              break;
-            }
-            case PARTITIONING_4X4:
-            default : {
-              union b_mode_info *bmi = mi->bmi;
-              int bx0, by0;
-
-              for (by0 = y0; by0 < (y0 + 16); by0 += 4) {
-                for (bx0 = x0; bx0 < (x0 + 16); bx0 += 4) {
-                  MV *mv = &bmi->mv.as_mv;
-
-                  x1 = bx0 + 2 + (mv->col >> 3);
-                  y1 = by0 + 2 + (mv->row >> 3);
-
-                  constrain_line(bx0 + 2, &x1, by0 + 2, &y1, width, height);
-                  vp9_blit_line(bx0 + 2,  x1, by0 + 2,  y1, y_buffer, y_stride);
-
-                  bmi++;
-                }
-              }
-            }
-          }
-        } else if (mi->mbmi.mode >= NEARESTMV) {
-          MV *mv = &mi->mbmi.mv.as_mv;
-          const int lx0 = x0 + 8;
-          const int ly0 = y0 + 8;
-
-          x1 = lx0 + (mv->col >> 3);
-          y1 = ly0 + (mv->row >> 3);
-
-          if (x1 != lx0 && y1 != ly0) {
-            constrain_line(lx0, &x1, ly0 - 1, &y1, width, height);
-            vp9_blit_line(lx0,  x1, ly0 - 1,  y1, y_buffer, y_stride);
-
-            constrain_line(lx0, &x1, ly0 + 1, &y1, width, height);
-            vp9_blit_line(lx0,  x1, ly0 + 1,  y1, y_buffer, y_stride);
-          } else
-            vp9_blit_line(lx0,  x1, ly0,  y1, y_buffer, y_stride);
-        }
-
-        mi++;
-      }
-      mi++;
-    }
-  }
-
-  /* Color in block modes */
-  if ((flags & VP9D_DEBUG_CLR_BLK_MODES)
-      && (ppflags->display_mb_modes_flag || ppflags->display_b_modes_flag)) {
-    int y, x;
-    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
-    int width  = post->y_width;
-    int height = post->y_height;
-    unsigned char *y_ptr = oci->post_proc_buffer.y_buffer;
-    unsigned char *u_ptr = oci->post_proc_buffer.u_buffer;
-    unsigned char *v_ptr = oci->post_proc_buffer.v_buffer;
-    int y_stride = oci->post_proc_buffer.y_stride;
-    MODE_INFO *mi = oci->mi;
-
-    for (y = 0; y < height; y += 16) {
-      for (x = 0; x < width; x += 16) {
-        int Y = 0, U = 0, V = 0;
-
-        if (mi->mbmi.mode == B_PRED &&
-            ((ppflags->display_mb_modes_flag & B_PRED) ||
-             ppflags->display_b_modes_flag)) {
-          int by, bx;
-          unsigned char *yl, *ul, *vl;
-          union b_mode_info *bmi = mi->bmi;
-
-          yl = y_ptr + x;
-          ul = u_ptr + (x >> 1);
-          vl = v_ptr + (x >> 1);
-
-          for (by = 0; by < 16; by += 4) {
-            for (bx = 0; bx < 16; bx += 4) {
-              if ((ppflags->display_b_modes_flag & (1 << mi->mbmi.mode))
-                  || (ppflags->display_mb_modes_flag & B_PRED)) {
-                Y = B_PREDICTION_MODE_colors[bmi->as_mode.first][0];
-                U = B_PREDICTION_MODE_colors[bmi->as_mode.first][1];
-                V = B_PREDICTION_MODE_colors[bmi->as_mode.first][2];
-
-                POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_b)(yl + bx,
-                                                           ul + (bx >> 1),
-                                                           vl + (bx >> 1),
-                                                           Y, U, V,
-                                                           0xc000, y_stride);
-              }
-              bmi++;
-            }
-
-            yl += y_stride * 4;
-            ul += y_stride * 1;
-            vl += y_stride * 1;
-          }
-        } else if (ppflags->display_mb_modes_flag & (1 << mi->mbmi.mode)) {
-          Y = MB_PREDICTION_MODE_colors[mi->mbmi.mode][0];
-          U = MB_PREDICTION_MODE_colors[mi->mbmi.mode][1];
-          V = MB_PREDICTION_MODE_colors[mi->mbmi.mode][2];
-
-          POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb_inner)(y_ptr + x,
-                                                            u_ptr + (x >> 1),
-                                                            v_ptr + (x >> 1),
-                                                            Y, U, V,
-                                                            0xc000, y_stride);
-        }
-
-        mi++;
-      }
-      y_ptr += y_stride * 16;
-      u_ptr += y_stride * 4;
-      v_ptr += y_stride * 4;
-
-      mi++;
-    }
-  }
-
-  /* Color in frame reference blocks */
-  if ((flags & VP9D_DEBUG_CLR_FRM_REF_BLKS) &&
-      ppflags->display_ref_frame_flag) {
-    int y, x;
-    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
-    int width  = post->y_width;
-    int height = post->y_height;
-    unsigned char *y_ptr = oci->post_proc_buffer.y_buffer;
-    unsigned char *u_ptr = oci->post_proc_buffer.u_buffer;
-    unsigned char *v_ptr = oci->post_proc_buffer.v_buffer;
-    int y_stride = oci->post_proc_buffer.y_stride;
-    MODE_INFO *mi = oci->mi;
-
-    for (y = 0; y < height; y += 16) {
-      for (x = 0; x < width; x += 16) {
-        int Y = 0, U = 0, V = 0;
-
-        if (ppflags->display_ref_frame_flag & (1 << mi->mbmi.ref_frame)) {
-          Y = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][0];
-          U = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][1];
-          V = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][2];
-
-          POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb_outer)(y_ptr + x,
-                                                            u_ptr + (x >> 1),
-                                                            v_ptr + (x >> 1),
-                                                            Y, U, V,
-                                                            0xc000, y_stride);
-        }
-
-        mi++;
-      }
-      y_ptr += y_stride * 16;
-      u_ptr += y_stride * 4;
-      v_ptr += y_stride * 4;
-
-      mi++;
-    }
-  }
-#endif
-
-  *dest = oci->post_proc_buffer;
-
-  /* handle problem with extending borders */
-  dest->y_width = oci->Width;
-  dest->y_height = oci->Height;
-  dest->uv_height = dest->y_height / 2;
-
-  return 0;
-}
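A hedged usage sketch for the function above; cm stands for an initialized VP9_COMMON whose frame_to_show has been decoded, and is not part of the patch:

    vp9_ppflags_t flags = { 0 };
    YV12_BUFFER_CONFIG display;

    flags.post_proc_flag   = VP9D_DEBLOCK | VP9D_ADDNOISE;
    flags.deblocking_level = 5;
    flags.noise_level      = 1;
    if (vp9_post_proc_frame(&cm, &display, &flags) == 0) {
      /* display aliases cm.post_proc_buffer, cropped to Width x Height */
    }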
--- a/vp8/common/postproc.h
+++ /dev/null
@@ -1,128 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef POSTPROC_H
-#define POSTPROC_H
-
-#define prototype_postproc_inplace(sym)\
-  void sym(unsigned char *dst, int pitch, int rows, int cols, int flimit)
-
-#define prototype_postproc(sym)\
-  void sym(unsigned char *src, unsigned char *dst, int src_pitch, \
-           int dst_pitch, int rows, int cols, int flimit)
-
-#define prototype_postproc_addnoise(sym) \
-  void sym(unsigned char *s, char *noise, char blackclamp[16], \
-           char whiteclamp[16], char bothclamp[16], \
-           unsigned int w, unsigned int h, int pitch)
-
-#define prototype_postproc_blend_mb_inner(sym)\
-  void sym(unsigned char *y, unsigned char *u, unsigned char *v, \
-           int y1, int u1, int v1, int alpha, int stride)
-
-#define prototype_postproc_blend_mb_outer(sym)\
-  void sym(unsigned char *y, unsigned char *u, unsigned char *v, \
-           int y1, int u1, int v1, int alpha, int stride)
-
-#define prototype_postproc_blend_b(sym)\
-  void sym(unsigned char *y, unsigned char *u, unsigned char *v, \
-           int y1, int u1, int v1, int alpha, int stride)
-
-#if ARCH_X86 || ARCH_X86_64
-#include "x86/postproc_x86.h"
-#endif
-
-#ifndef vp9_postproc_down
-#define vp9_postproc_down vp9_mbpost_proc_down_c
-#endif
-extern prototype_postproc_inplace(vp9_postproc_down);
-
-#ifndef vp9_postproc_across
-#define vp9_postproc_across vp9_mbpost_proc_across_ip_c
-#endif
-extern prototype_postproc_inplace(vp9_postproc_across);
-
-#ifndef vp9_postproc_downacross
-#define vp9_postproc_downacross vp9_post_proc_down_and_across_c
-#endif
-extern prototype_postproc(vp9_postproc_downacross);
-
-#ifndef vp9_postproc_addnoise
-#define vp9_postproc_addnoise vp9_plane_add_noise_c
-#endif
-extern prototype_postproc_addnoise(vp9_postproc_addnoise);
-
-#ifndef vp9_postproc_blend_mb_inner
-#define vp9_postproc_blend_mb_inner vp9_blend_mb_inner_c
-#endif
-extern prototype_postproc_blend_mb_inner(vp9_postproc_blend_mb_inner);
-
-#ifndef vp9_postproc_blend_mb_outer
-#define vp9_postproc_blend_mb_outer vp9_blend_mb_outer_c
-#endif
-extern prototype_postproc_blend_mb_outer(vp9_postproc_blend_mb_outer);
-
-#ifndef vp9_postproc_blend_b
-#define vp9_postproc_blend_b vp9_blend_b_c
-#endif
-extern prototype_postproc_blend_b(vp9_postproc_blend_b);
-
-typedef prototype_postproc((*vp9_postproc_fn_t));
-typedef prototype_postproc_inplace((*vp9_postproc_inplace_fn_t));
-typedef prototype_postproc_addnoise((*vp9_postproc_addnoise_fn_t));
-typedef prototype_postproc_blend_mb_inner((*vp9_postproc_blend_mb_inner_fn_t));
-typedef prototype_postproc_blend_mb_outer((*vp9_postproc_blend_mb_outer_fn_t));
-typedef prototype_postproc_blend_b((*vp9_postproc_blend_b_fn_t));
-typedef struct {
-  vp9_postproc_inplace_fn_t           down;
-  vp9_postproc_inplace_fn_t           across;
-  vp9_postproc_fn_t                   downacross;
-  vp9_postproc_addnoise_fn_t          addnoise;
-  vp9_postproc_blend_mb_inner_fn_t    blend_mb_inner;
-  vp9_postproc_blend_mb_outer_fn_t    blend_mb_outer;
-  vp9_postproc_blend_b_fn_t           blend_b;
-} vp9_postproc_rtcd_vtable_t;
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define POSTPROC_INVOKE(ctx,fn) (ctx)->fn
-#else
-#define POSTPROC_INVOKE(ctx,fn) vp9_postproc_##fn
-#endif
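Expanding the dispatch pair above for the `down` entry (the RTCD_VTABLE macro comes from postproc.c earlier in this patch):

    /* CONFIG_RUNTIME_CPU_DETECT: indirect through the per-instance vtable */
    POSTPROC_INVOKE(RTCD_VTABLE(oci), down)(...)
        /* => (&(oci)->rtcd.postproc)->down(...) */

    /* otherwise: a direct call, bound at compile time to vp9_postproc_down,
       which the #ifndef defaults above resolve to vp9_mbpost_proc_down_c
       unless an arch header (e.g. x86/postproc_x86.h) overrode it */
    POSTPROC_INVOKE(RTCD_VTABLE(oci), down)(...)
        /* => vp9_mbpost_proc_down_c(...) */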
-
-#include "vpx_ports/mem.h"
-struct postproc_state {
-  int           last_q;
-  int           last_noise;
-  char          noise[3072];
-  DECLARE_ALIGNED(16, char, blackclamp[16]);
-  DECLARE_ALIGNED(16, char, whiteclamp[16]);
-  DECLARE_ALIGNED(16, char, bothclamp[16]);
-};
-#include "onyxc_int.h"
-#include "ppflags.h"
-int vp9_post_proc_frame(struct VP9Common *oci, YV12_BUFFER_CONFIG *dest,
-                        vp9_ppflags_t *flags);
-
-
-void vp9_de_noise(YV12_BUFFER_CONFIG         *source,
-                  YV12_BUFFER_CONFIG         *post,
-                  int                         q,
-                  int                         low_var_thresh,
-                  int                         flag,
-                  vp9_postproc_rtcd_vtable_t *rtcd);
-
-void vp9_deblock(YV12_BUFFER_CONFIG         *source,
-                 YV12_BUFFER_CONFIG         *post,
-                 int                         q,
-                 int                         low_var_thresh,
-                 int                         flag,
-                 vp9_postproc_rtcd_vtable_t *rtcd);
-#endif
--- a/vp8/common/ppc/copy_altivec.asm
+++ /dev/null
@@ -1,47 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl copy_mem16x16_ppc
-
-;# r3 unsigned char *src
-;# r4 int src_stride
-;# r5 unsigned char *dst
-;# r6 int dst_stride
-
-;# Make the assumption that input will not be aligned,
-;#  but the output will be.  So two reads and a perm
-;#  for the input, but only one store for the output.
-copy_mem16x16_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xe000
-    mtspr   256, r12            ;# set VRSAVE
-
-    li      r10, 16
-    mtctr   r10
-
-cp_16x16_loop:
-    lvsl    v0,  0, r3          ;# permutate value for alignment
-
-    lvx     v1,   0, r3
-    lvx     v2, r10, r3
-
-    vperm   v1, v1, v2, v0
-
-    stvx    v1,  0, r5
-
-    add     r3, r3, r4          ;# increment source pointer
-    add     r5, r5, r6          ;# increment destination pointer
-
-    bdnz    cp_16x16_loop
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
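A C sketch of the lvsl/lvx/vperm idiom in the loop above, assuming only that the source address may be unaligned: two aligned 16-byte loads bracket the address, and a byte select (the job vperm does with the lvsl-generated permute vector) extracts the unaligned 16 bytes.

    #include <stdint.h>

    static void load_unaligned16(const unsigned char *src,
                                 unsigned char out[16]) {
      const unsigned char *base =
          (const unsigned char *)((uintptr_t)src & ~(uintptr_t)15);
      unsigned int shift = (unsigned int)((uintptr_t)src & 15);
      unsigned int i;
      for (i = 0; i < 16; i++)
        out[i] = base[shift + i];  /* spans at most two aligned vectors */
    }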
--- a/vp8/common/ppc/filter_altivec.asm
+++ /dev/null
@@ -1,1013 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl sixtap_predict_ppc
-    .globl sixtap_predict8x4_ppc
-    .globl sixtap_predict8x8_ppc
-    .globl sixtap_predict16x16_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
-    lis     \R0, \LABEL@ha
-    la      \R1, \LABEL@l(\R0)
-    lvx     \V, \OFF, \R1
-.endm
-
-.macro load_hfilter V0, V1
-    load_c \V0, HFilter, r5, r9, r10
-
-    addi    r5,  r5, 16
-    lvx     \V1, r5, r10
-.endm
-
-;# Vertical filtering
-.macro Vprolog
-    load_c v0, VFilter, r6, r3, r10
-
-    vspltish v5, 8
-    vspltish v6, 3
-    vslh    v6, v5, v6      ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    vspltb  v1, v0, 1
-    vspltb  v2, v0, 2
-    vspltb  v3, v0, 3
-    vspltb  v4, v0, 4
-    vspltb  v5, v0, 5
-    vspltb  v0, v0, 0
-.endm
-
-.macro vpre_load
-    Vprolog
-    li      r10,  16
-    lvx     v10,   0, r9    ;# v10..v14 = first 5 rows
-    lvx     v11, r10, r9
-    addi    r9,   r9, 32
-    lvx     v12,   0, r9
-    lvx     v13, r10, r9
-    addi    r9,   r9, 32
-    lvx     v14,   0, r9
-.endm
-
-.macro Msum Re, Ro, V, T, TMP
-                                ;# (Re,Ro) += (V*T)
-    vmuleub \TMP, \V, \T        ;# trashes v8
-    vadduhm \Re, \Re, \TMP      ;# Re = evens, saturation unnecessary
-    vmuloub \TMP, \V, \T
-    vadduhm \Ro, \Ro, \TMP      ;# Ro = odds
-.endm
-
-.macro vinterp_no_store P0 P1 P2 P3 P4 P5
-    vmuleub  v8, \P0, v0        ;# 64 + 4 positive taps
-    vadduhm v16, v6, v8
-    vmuloub  v8, \P0, v0
-    vadduhm v17, v6, v8
-    Msum v16, v17, \P2, v2, v8
-    Msum v16, v17, \P3, v3, v8
-    Msum v16, v17, \P5, v5, v8
-
-    vmuleub v18, \P1, v1        ;# 2 negative taps
-    vmuloub v19, \P1, v1
-    Msum v18, v19, \P4, v4, v8
-
-    vsubuhs v16, v16, v18       ;# subtract neg from pos
-    vsubuhs v17, v17, v19
-    vsrh    v16, v16, v7        ;# divide by 128
-    vsrh    v17, v17, v7        ;# v16 v17 = evens, odds
-    vmrghh  v18, v16, v17       ;# v18 v19 = 16-bit result in order
-    vmrglh  v19, v16, v17
-    vpkuhus  \P0, v18, v19      ;# P0 = 8-bit result
-.endm
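A scalar sketch of what one vinterp_no_store invocation computes per output pixel. The tap magnitudes t[0..5] come from the VFilter table (outside this hunk); taps 1 and 4 are the filter's negative coefficients, kept apart so the vector code can accumulate in unsigned registers, subtract with saturation, round with the +64 bias, shift by 7 (the "divide by 128"), and saturate to a byte as vpkuhus does:

    static unsigned char sixtap_pixel(const unsigned char *row[6], int x,
                                      const short t[6]) {
      int pos = 64 + t[0] * row[0][x] + t[2] * row[2][x]
                   + t[3] * row[3][x] + t[5] * row[5][x];
      int neg = t[1] * row[1][x] + t[4] * row[4][x];
      int v = pos > neg ? (pos - neg) >> 7 : 0;  /* vsubuhs clamps at 0 */
      return (unsigned char)(v > 255 ? 255 : v); /* vpkuhus saturates   */
    }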
-
-.macro vinterp_no_store_8x8 P0 P1 P2 P3 P4 P5
-    vmuleub v24, \P0, v13       ;# 64 + 4 positive taps
-    vadduhm v21, v20, v24
-    vmuloub v24, \P0, v13
-    vadduhm v22, v20, v24
-    Msum v21, v22, \P2, v15, v25
-    Msum v21, v22, \P3, v16, v25
-    Msum v21, v22, \P5, v18, v25
-
-    vmuleub v23, \P1, v14       ;# 2 negative taps
-    vmuloub v24, \P1, v14
-    Msum v23, v24, \P4, v17, v25
-
-    vsubuhs v21, v21, v23       ;# subtract neg from pos
-    vsubuhs v22, v22, v24
-    vsrh    v21, v21, v19       ;# divide by 128
-    vsrh    v22, v22, v19       ;# v16 v17 = evens, odds
-    vmrghh  v23, v21, v22       ;# v18 v19 = 16-bit result in order
-    vmrglh  v24, v21, v22
-    vpkuhus \P0, v23, v24       ;# P0 = 8-bit result
-.endm
-
-
-.macro Vinterp P0 P1 P2 P3 P4 P5
-    vinterp_no_store \P0, \P1, \P2, \P3, \P4, \P5
-    stvx    \P0, 0, r7
-    add     r7, r7, r8      ;# 33 ops per 16 pels
-.endm
-
-
-.macro luma_v P0, P1, P2, P3, P4, P5
-    addi    r9,   r9, 16        ;# P5 = newest input row
-    lvx     \P5,   0, r9
-    Vinterp \P0, \P1, \P2, \P3, \P4, \P5
-.endm
-
-.macro luma_vtwo
-    luma_v v10, v11, v12, v13, v14, v15
-    luma_v v11, v12, v13, v14, v15, v10
-.endm
-
-.macro luma_vfour
-    luma_vtwo
-    luma_v v12, v13, v14, v15, v10, v11
-    luma_v v13, v14, v15, v10, v11, v12
-.endm
-
-.macro luma_vsix
-    luma_vfour
-    luma_v v14, v15, v10, v11, v12, v13
-    luma_v v15, v10, v11, v12, v13, v14
-.endm
-
-.macro Interp4 R I I4
-    vmsummbm \R, v13, \I, v15
-    vmsummbm \R, v14, \I4, \R
-.endm
-
-.macro Read8x8 VD, RS, RP, increment_counter
-    lvsl    v21,  0, \RS        ;# permutate value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;#  input can span three vectors if not aligned correctly.
-    lvx     \VD,   0, \RS
-    lvx     v20, r10, \RS
-
-.if \increment_counter
-    add     \RS, \RS, \RP
-.endif
-
-    vperm   \VD, \VD, v20, v21
-.endm
-
-.macro interp_8x8 R
-    vperm   v20, \R, \R, v16    ;# v20 = 0123 1234 2345 3456
-    vperm   v21, \R, \R, v17    ;# v21 = 4567 5678 6789 789A
-    Interp4 v20, v20,  v21      ;# v20 = result 0 1 2 3
-    vperm   \R, \R, \R, v18     ;# R   = 89AB 9ABC ABCx BCxx
-    Interp4 v21, v21, \R        ;# v21 = result 4 5 6 7
-
-    vpkswus \R, v20, v21        ;#  R = 0 1 2 3 4 5 6 7
-    vsrh    \R, \R, v19
-
-    vpkuhus \R, \R, \R          ;# saturate and pack
-
-.endm
-
-.macro Read4x4 VD, RS, RP, increment_counter
-    lvsl    v21,  0, \RS        ;# permutate value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;#  input can span three vectors if not aligned correctly.
-    lvx     v20,   0, \RS
-
-.if \increment_counter
-    add     \RS, \RS, \RP
-.endif
-
-    vperm   \VD, v20, v20, v21
-.endm
-    .text
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-sixtap_predict_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xff87
-    ori     r12, r12, 0xffc0
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    slwi.   r5, r5, 5           ;# index into horizontal filter array
-
-    vspltish v19, 7
-
-    ;# If there isn't any filtering to be done for the horizontal, then
-    ;#  just skip to the second pass.
-    beq-    vertical_only_4x4
-
-    ;# load up horizontal filter
-    load_hfilter v13, v14
-
-    ;# rounding added in on the multiply
-    vspltisw v16, 8
-    vspltisw v15, 3
-    vslw    v15, v16, v15       ;# 0x00000040000000400000004000000040
-
-    ;# Load up permutation constants
-    load_c v16, B_0123, 0, r9, r10
-    load_c v17, B_4567, 0, r9, r10
-    load_c v18, B_89AB, 0, r9, r10
-
-    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
-    addi    r3, r3, -2
-
-    addi    r9, r3, 0
-    li      r10, 16
-    Read8x8 v2, r3, r4, 1
-    Read8x8 v3, r3, r4, 1
-    Read8x8 v4, r3, r4, 1
-    Read8x8 v5, r3, r4, 1
-
-    slwi.   r6, r6, 4           ;# index into vertical filter array
-
-    ;# filter a line
-    interp_8x8 v2
-    interp_8x8 v3
-    interp_8x8 v4
-    interp_8x8 v5
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional 5 lines that are needed
-    ;#  for the vertical filter.
-    beq-    store_4x4
-
-    ;# only needed if there is a vertical filter present
-    ;# if the second filter is not null then need to back off by 2*pitch
-    sub     r9, r9, r4
-    sub     r9, r9, r4
-
-    Read8x8 v0, r9, r4, 1
-    Read8x8 v1, r9, r4, 0
-    Read8x8 v6, r3, r4, 1
-    Read8x8 v7, r3, r4, 1
-    Read8x8 v8, r3, r4, 0
-
-    interp_8x8 v0
-    interp_8x8 v1
-    interp_8x8 v6
-    interp_8x8 v7
-    interp_8x8 v8
-
-    b       second_pass_4x4
-
-vertical_only_4x4:
-    ;# only needed if there is a vertical filter present
-    ;# if the second filter is not null then need to back off by 2*pitch
-    sub     r3, r3, r4
-    sub     r3, r3, r4
-    li      r10, 16
-
-    Read8x8 v0, r3, r4, 1
-    Read8x8 v1, r3, r4, 1
-    Read8x8 v2, r3, r4, 1
-    Read8x8 v3, r3, r4, 1
-    Read8x8 v4, r3, r4, 1
-    Read8x8 v5, r3, r4, 1
-    Read8x8 v6, r3, r4, 1
-    Read8x8 v7, r3, r4, 1
-    Read8x8 v8, r3, r4, 0
-
-    slwi    r6, r6, 4           ;# index into vertical filter array
-
-second_pass_4x4:
-    load_c   v20, b_hilo_4x4, 0, r9, r10
-    load_c   v21, b_hilo, 0, r9, r10
-
-    ;# reposition input so that it can go through the
-    ;# filtering phase with one pass.
-    vperm   v0, v0, v1, v20     ;# 0 1 x x
-    vperm   v2, v2, v3, v20     ;# 2 3 x x
-    vperm   v4, v4, v5, v20     ;# 4 5 x x
-    vperm   v6, v6, v7, v20     ;# 6 7 x x
-
-    vperm   v0, v0, v2, v21     ;# 0 1 2 3
-    vperm   v4, v4, v6, v21     ;# 4 5 6 7
-
-    vsldoi  v1, v0, v4, 4
-    vsldoi  v2, v0, v4, 8
-    vsldoi  v3, v0, v4, 12
-
-    vsldoi  v5, v4, v8, 4
-
-    load_c   v13, VFilter, r6, r9, r10
-
-    vspltish v15, 8
-    vspltish v20, 3
-    vslh    v20, v15, v20       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    vspltb  v14, v13, 1
-    vspltb  v15, v13, 2
-    vspltb  v16, v13, 3
-    vspltb  v17, v13, 4
-    vspltb  v18, v13, 5
-    vspltb  v13, v13, 0
-
-    vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
-
-    stvx    v0, 0, r1
-
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    lwz     r0, 4(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    lwz     r0, 8(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    lwz     r0, 12(r1)
-    stw     r0, 0(r7)
-
-    b       exit_4x4
-
-store_4x4:
-
-    stvx    v2, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    stvx    v3, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    stvx    v4, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    stvx    v5, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-
-exit_4x4:
-
-    addi    r1, r1, 32          ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-.macro w_8x8 V, D, R, P
-    stvx    \V, 0, r1
-    lwz     \R, 0(r1)
-    stw     \R, 0(r7)
-    lwz     \R, 4(r1)
-    stw     \R, 4(r7)
-    add     \D, \D, \P
-.endm
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-
-sixtap_predict8x4_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xffc0
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    slwi.   r5, r5, 5           ;# index into horizontal filter array
-
-    vspltish v19, 7
-
-    ;# If there isn't any filtering to be done for the horizontal, then
-    ;#  just skip to the second pass.
-    beq-    second_pass_pre_copy_8x4
-
-    load_hfilter v13, v14
-
-    ;# rounding added in on the multiply
-    vspltisw v16, 8
-    vspltisw v15, 3
-    vslw    v15, v16, v15       ;# 0x00000040000000400000004000000040
-
-    ;# Load up permutation constants
-    load_c v16, B_0123, 0, r9, r10
-    load_c v17, B_4567, 0, r9, r10
-    load_c v18, B_89AB, 0, r9, r10
-
-    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
-    addi    r3, r3, -2
-
-    addi    r9, r3, 0
-    li      r10, 16
-    Read8x8 v2, r3, r4, 1
-    Read8x8 v3, r3, r4, 1
-    Read8x8 v4, r3, r4, 1
-    Read8x8 v5, r3, r4, 1
-
-    slwi.   r6, r6, 4           ;# index into vertical filter array
-
-    ;# filter a line
-    interp_8x8 v2
-    interp_8x8 v3
-    interp_8x8 v4
-    interp_8x8 v5
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional 5 lines that are needed
-    ;#  for the vertical filter.
-    beq-    store_8x4
-
-    ;# only needed if there is a vertical filter present
-    ;# if the second filter is not null then need to back off by 2*pitch
-    sub     r9, r9, r4
-    sub     r9, r9, r4
-
-    Read8x8 v0, r9, r4, 1
-    Read8x8 v1, r9, r4, 0
-    Read8x8 v6, r3, r4, 1
-    Read8x8 v7, r3, r4, 1
-    Read8x8 v8, r3, r4, 0
-
-    interp_8x8 v0
-    interp_8x8 v1
-    interp_8x8 v6
-    interp_8x8 v7
-    interp_8x8 v8
-
-    b       second_pass_8x4
-
-second_pass_pre_copy_8x4:
-    ;# only needed if there is a vertical filter present
-    ;# if the second filter is not null then need to back off by 2*pitch
-    sub     r3, r3, r4
-    sub     r3, r3, r4
-    li      r10, 16
-
-    Read8x8 v0,  r3, r4, 1
-    Read8x8 v1,  r3, r4, 1
-    Read8x8 v2,  r3, r4, 1
-    Read8x8 v3,  r3, r4, 1
-    Read8x8 v4,  r3, r4, 1
-    Read8x8 v5,  r3, r4, 1
-    Read8x8 v6,  r3, r4, 1
-    Read8x8 v7,  r3, r4, 1
-    Read8x8 v8,  r3, r4, 1
-
-    slwi    r6, r6, 4           ;# index into vertical filter array
-
-second_pass_8x4:
-    load_c v13, VFilter, r6, r9, r10
-
-    vspltish v15, 8
-    vspltish v20, 3
-    vslh    v20, v15, v20       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    vspltb  v14, v13, 1
-    vspltb  v15, v13, 2
-    vspltb  v16, v13, 3
-    vspltb  v17, v13, 4
-    vspltb  v18, v13, 5
-    vspltb  v13, v13, 0
-
-    vinterp_no_store_8x8 v0, v1, v2, v3,  v4,  v5
-    vinterp_no_store_8x8 v1, v2, v3, v4,  v5,  v6
-    vinterp_no_store_8x8 v2, v3, v4, v5,  v6,  v7
-    vinterp_no_store_8x8 v3, v4, v5, v6,  v7,  v8
-
-    cmpi    cr0, r8, 8
-    beq     cr0, store_aligned_8x4
-
-    w_8x8   v0, r7, r0, r8
-    w_8x8   v1, r7, r0, r8
-    w_8x8   v2, r7, r0, r8
-    w_8x8   v3, r7, r0, r8
-
-    b       exit_8x4
-
-store_aligned_8x4:
-
-    load_c v10, b_hilo, 0, r9, r10
-
-    vperm   v0, v0, v1, v10
-    vperm   v2, v2, v3, v10
-
-    stvx    v0, 0, r7
-    addi    r7, r7, 16
-    stvx    v2, 0, r7
-
-    b       exit_8x4
-
-store_8x4:
-    cmpi    cr0, r8, 8
-    beq     cr0, store_aligned2_8x4
-
-    w_8x8   v2, r7, r0, r8
-    w_8x8   v3, r7, r0, r8
-    w_8x8   v4, r7, r0, r8
-    w_8x8   v5, r7, r0, r8
-
-    b       exit_8x4
-
-store_aligned2_8x4:
-    load_c v10, b_hilo, 0, r9, r10
-
-    vperm   v2, v2, v3, v10
-    vperm   v4, v4, v5, v10
-
-    stvx    v2, 0, r7
-    addi    r7, r7, 16
-    stvx    v4, 0, r7
-
-exit_8x4:
-
-    addi    r1, r1, 32          ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-
-    blr
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-
-;# Because the width that needs to be filtered fits in a single AltiVec
-;#  register, there is no need to loop.  Everything can stay in registers.
-sixtap_predict8x8_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xffc0
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    slwi.   r5, r5, 5           ;# index into horizontal filter array
-
-    vspltish v19, 7
-
-    ;# If there isn't any filtering to be done for the horizontal, then
-    ;#  just skip to the second pass.
-    beq-    second_pass_pre_copy_8x8
-
-    load_hfilter v13, v14
-
-    ;# rounding added in on the multiply
-    vspltisw v16, 8
-    vspltisw v15, 3
-    vslw    v15, v16, v15       ;# 0x00000040000000400000004000000040
-
-    ;# Load up permutation constants
-    load_c v16, B_0123, 0, r9, r10
-    load_c v17, B_4567, 0, r9, r10
-    load_c v18, B_89AB, 0, r9, r10
-
-    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
-    addi    r3, r3, -2
-
-    addi    r9, r3, 0
-    li      r10, 16
-    Read8x8 v2, r3, r4, 1
-    Read8x8 v3, r3, r4, 1
-    Read8x8 v4, r3, r4, 1
-    Read8x8 v5, r3, r4, 1
-    Read8x8 v6, r3, r4, 1
-    Read8x8 v7, r3, r4, 1
-    Read8x8 v8, r3, r4, 1
-    Read8x8 v9, r3, r4, 1
-
-    slwi.   r6, r6, 4           ;# index into vertical filter array
-
-    ;# filter a line
-    interp_8x8 v2
-    interp_8x8 v3
-    interp_8x8 v4
-    interp_8x8 v5
-    interp_8x8 v6
-    interp_8x8 v7
-    interp_8x8 v8
-    interp_8x8 v9
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional 5 lines that are needed
-    ;#  for the vertical filter.
-    beq-    store_8x8
-
-    ;# only needed if there is a vertical filter present
-    ;# if the second filter is not null then need to back off by 2*pitch
-    sub     r9, r9, r4
-    sub     r9, r9, r4
-
-    Read8x8 v0,  r9, r4, 1
-    Read8x8 v1,  r9, r4, 0
-    Read8x8 v10, r3, r4, 1
-    Read8x8 v11, r3, r4, 1
-    Read8x8 v12, r3, r4, 0
-
-    interp_8x8 v0
-    interp_8x8 v1
-    interp_8x8 v10
-    interp_8x8 v11
-    interp_8x8 v12
-
-    b       second_pass_8x8
-
-second_pass_pre_copy_8x8:
-    ;# only needed if there is a vertical filter present
-    ;# if the second filter is not null then need to back off by 2*pitch
-    sub     r3, r3, r4
-    sub     r3, r3, r4
-    li      r10, 16
-
-    Read8x8 v0,  r3, r4, 1
-    Read8x8 v1,  r3, r4, 1
-    Read8x8 v2,  r3, r4, 1
-    Read8x8 v3,  r3, r4, 1
-    Read8x8 v4,  r3, r4, 1
-    Read8x8 v5,  r3, r4, 1
-    Read8x8 v6,  r3, r4, 1
-    Read8x8 v7,  r3, r4, 1
-    Read8x8 v8,  r3, r4, 1
-    Read8x8 v9,  r3, r4, 1
-    Read8x8 v10, r3, r4, 1
-    Read8x8 v11, r3, r4, 1
-    Read8x8 v12, r3, r4, 0
-
-    slwi    r6, r6, 4           ;# index into vertical filter array
-
-second_pass_8x8:
-    load_c v13, VFilter, r6, r9, r10
-
-    vspltish v15, 8
-    vspltish v20, 3
-    vslh    v20, v15, v20       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    vspltb  v14, v13, 1
-    vspltb  v15, v13, 2
-    vspltb  v16, v13, 3
-    vspltb  v17, v13, 4
-    vspltb  v18, v13, 5
-    vspltb  v13, v13, 0
-
-    vinterp_no_store_8x8 v0, v1, v2, v3,  v4,  v5
-    vinterp_no_store_8x8 v1, v2, v3, v4,  v5,  v6
-    vinterp_no_store_8x8 v2, v3, v4, v5,  v6,  v7
-    vinterp_no_store_8x8 v3, v4, v5, v6,  v7,  v8
-    vinterp_no_store_8x8 v4, v5, v6, v7,  v8,  v9
-    vinterp_no_store_8x8 v5, v6, v7, v8,  v9,  v10
-    vinterp_no_store_8x8 v6, v7, v8, v9,  v10, v11
-    vinterp_no_store_8x8 v7, v8, v9, v10, v11, v12
-
-    cmpi    cr0, r8, 8
-    beq     cr0, store_aligned_8x8
-
-    w_8x8   v0, r7, r0, r8
-    w_8x8   v1, r7, r0, r8
-    w_8x8   v2, r7, r0, r8
-    w_8x8   v3, r7, r0, r8
-    w_8x8   v4, r7, r0, r8
-    w_8x8   v5, r7, r0, r8
-    w_8x8   v6, r7, r0, r8
-    w_8x8   v7, r7, r0, r8
-
-    b       exit_8x8
-
-store_aligned_8x8:
-
-    load_c v10, b_hilo, 0, r9, r10
-
-    vperm   v0, v0, v1, v10
-    vperm   v2, v2, v3, v10
-    vperm   v4, v4, v5, v10
-    vperm   v6, v6, v7, v10
-
-    stvx    v0, 0, r7
-    addi    r7, r7, 16
-    stvx    v2, 0, r7
-    addi    r7, r7, 16
-    stvx    v4, 0, r7
-    addi    r7, r7, 16
-    stvx    v6, 0, r7
-
-    b       exit_8x8
-
-store_8x8:
-    cmpi    cr0, r8, 8
-    beq     cr0, store_aligned2_8x8
-
-    w_8x8   v2, r7, r0, r8
-    w_8x8   v3, r7, r0, r8
-    w_8x8   v4, r7, r0, r8
-    w_8x8   v5, r7, r0, r8
-    w_8x8   v6, r7, r0, r8
-    w_8x8   v7, r7, r0, r8
-    w_8x8   v8, r7, r0, r8
-    w_8x8   v9, r7, r0, r8
-
-    b       exit_8x8
-
-store_aligned2_8x8:
-    load_c v10, b_hilo, 0, r9, r10
-
-    vperm   v2, v2, v3, v10
-    vperm   v4, v4, v5, v10
-    vperm   v6, v6, v7, v10
-    vperm   v8, v8, v9, v10
-
-    stvx    v2, 0, r7
-    addi    r7, r7, 16
-    stvx    v4, 0, r7
-    addi    r7, r7, 16
-    stvx    v6, 0, r7
-    addi    r7, r7, 16
-    stvx    v8, 0, r7
-
-exit_8x8:
-
-    addi    r1, r1, 32          ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-
-;# Two pass filtering.  First pass is Horizontal edges, second pass is vertical
-;#  edges.  One of the filters can be null, but both won't be.  Needs to use a
-;#  temporary buffer because the source buffer can't be modified and the buffer
-;#  for the destination is not large enough to hold the temporary data.
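For reference, a minimal scalar model of the two-pass six-tap predictor implemented below (an illustrative sketch, not part of the original source; it assumes Q7 taps with +64 rounding and a >>7 shift, and a first pass that produces h+5 rows so the vertical six-tap has its 2 rows before and 3 after):

/* Scalar sketch of the two-pass six-tap predictor. */
static unsigned char clamp255(int v) {
    return (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
}

static void sixtap_predict_scalar(const unsigned char *src, int src_pitch,
                                  const int *hfilter, const int *vfilter,
                                  unsigned char *dst, int dst_pitch,
                                  int w, int h) {
    unsigned char tmp[21 * 16];                    /* 16x16 plus 5 extra rows */
    const unsigned char *s = src - 2 * src_pitch;  /* back off by 2*pitch */
    int r, c, k;

    /* First pass: horizontal six-tap into the temporary buffer. */
    for (r = 0; r < h + 5; r++, s += src_pitch)
        for (c = 0; c < w; c++) {
            int sum = 64;                          /* rounding */
            for (k = 0; k < 6; k++)
                sum += hfilter[k] * s[c - 2 + k];  /* 2 before, 3 after */
            tmp[r * w + c] = clamp255(sum >> 7);
        }

    /* Second pass: vertical six-tap from the temporary buffer into dst. */
    for (r = 0; r < h; r++)
        for (c = 0; c < w; c++) {
            int sum = 64;
            for (k = 0; k < 6; k++)
                sum += vfilter[k] * tmp[(r + k) * w + c];
            dst[r * dst_pitch + c] = clamp255(sum >> 7);
        }
}

When either tap set is the identity {0, 0, 128, 0, 0, 0} the corresponding pass degenerates to a copy, which is exactly the three-case dispatch described next.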
-sixtap_predict16x16_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xf000
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-416(r1)         ;# create space on the stack
-
-    ;# Three possibilities
-    ;#  1. First filter is null.  Don't use a temp buffer.
-    ;#  2. Second filter is null.  Don't use a temp buffer.
-    ;#  3. Neither are null, use temp buffer.
-
-    ;# First Pass (horizontal edge)
-    ;#  setup pointers for src
-    ;#  if possibility (1) then set up the src pointer to be the original and
-    ;#  jump to the second pass.  This is based on whether x_offset is 0.
-
-    ;# load up horizontal filter
-    slwi.   r5, r5, 5           ;# index into horizontal filter array
-
-    load_hfilter v4, v5
-
-    beq-    copy_horizontal_16x21
-
-    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
-    addi    r3, r3, -2
-
-    slwi.   r6, r6, 4           ;# index into vertical filter array
-
-    ;# setup constants
-    ;# v14 permutation value for alignment
-    load_c v14, b_hperm, 0, r9, r10
-
-    ;# These statements assume that there won't be a second pass;
-    ;#  if there is one, they are set again before the bypass label.
-    li      r0, 16              ;# prepare for no vertical filter
-
-    ;# Change the output pointer and pitch to be the actual
-    ;#  destination instead of a temporary buffer.
-    addi    r9, r7, 0
-    addi    r5, r8, 0
-
-    ;# no vertical filter, so write the output from the first pass
-    ;#  directly into the output buffer.
-    beq-    no_vertical_filter_bypass
-
-    ;# if the second filter is not null then need to back off by 2*pitch
-    sub     r3, r3, r4
-    sub     r3, r3, r4
-
-    ;# setup counter for the number of lines that are going to be filtered
-    li      r0, 21
-
-    ;# use the stack as temporary storage
-    la      r9, 48(r1)
-    li      r5, 16
-
-no_vertical_filter_bypass:
-
-    mtctr   r0
-
-    ;# rounding added in on the multiply
-    vspltisw v10, 8
-    vspltisw v12, 3
-    vslw    v12, v10, v12       ;# 0x00000040000000400000004000000040
-
-    ;# downshift by 7 ( divide by 128 ) at the end
-    vspltish v13, 7
-
-    ;# index to the next set of vectors in the row.
-    li      r10, 16
-    li      r12, 32
-
-horizontal_loop_16x16:
-
-    lvsl    v15,  0, r3         ;# permutate value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;#  input can span three vectors if not aligned correctly.
-    lvx     v1,   0, r3
-    lvx     v2, r10, r3
-    lvx     v3, r12, r3
-
-    vperm   v8, v1, v2, v15
-    vperm   v9, v2, v3, v15     ;# v8 v9 = 21 input pixels left-justified
-
-    vsldoi  v11, v8, v9, 4
-
-    ;# set 0
-    vmsummbm v6, v4, v8, v12    ;# taps times elements
-    vmsummbm v0, v5, v11, v6
-
-    ;# set 1
-    vsldoi  v10, v8, v9, 1
-    vsldoi  v11, v8, v9, 5
-
-    vmsummbm v6, v4, v10, v12
-    vmsummbm v1, v5, v11, v6
-
-    ;# set 2
-    vsldoi  v10, v8, v9, 2
-    vsldoi  v11, v8, v9, 6
-
-    vmsummbm v6, v4, v10, v12
-    vmsummbm v2, v5, v11, v6
-
-    ;# set 3
-    vsldoi  v10, v8, v9, 3
-    vsldoi  v11, v8, v9, 7
-
-    vmsummbm v6, v4, v10, v12
-    vmsummbm v3, v5, v11, v6
-
-    vpkswus v0, v0, v1          ;# v0 = 0 4 8 C 1 5 9 D (16-bit)
-    vpkswus v1, v2, v3          ;# v1 = 2 6 A E 3 7 B F
-
-    vsrh    v0, v0, v13         ;# divide v0, v1 by 128
-    vsrh    v1, v1, v13
-
-    vpkuhus v0, v0, v1          ;# v0 = scrambled 8-bit result
-    vperm   v0, v0, v0, v14     ;# v0 = correctly-ordered result
-
-    stvx    v0,  0, r9
-    add     r9, r9, r5
-
-    add     r3, r3, r4
-
-    bdnz    horizontal_loop_16x16
-
-    ;# check again to see if vertical filter needs to be done.
-    cmpi    cr0, r6, 0
-    beq     cr0, end_16x16
-
-    ;# yes there is, so go to the second pass
-    b       second_pass_16x16
-
-copy_horizontal_16x21:
-    li      r10, 21
-    mtctr   r10
-
-    li      r10, 16
-
-    sub     r3, r3, r4
-    sub     r3, r3, r4
-
-    ;# this is done above if there is a horizontal filter,
-    ;#  if not it needs to be done down here.
-    slwi    r6, r6, 4           ;# index into vertical filter array
-
-    ;# always write to the stack when doing a horizontal copy
-    la      r9, 48(r1)
-
-copy_horizontal_loop_16x21:
-    lvsl    v15,  0, r3         ;# permutate value for alignment
-
-    lvx     v1,   0, r3
-    lvx     v2, r10, r3
-
-    vperm   v8, v1, v2, v15
-
-    stvx    v8,  0, r9
-    addi    r9, r9, 16
-
-    add     r3, r3, r4
-
-    bdnz    copy_horizontal_loop_16x21
-
-second_pass_16x16:
-
-    ;# always read from the stack when doing a vertical filter
-    la      r9, 48(r1)
-
-    ;# downshift by 7 ( divide by 128 ) at the end
-    vspltish v7, 7
-
-    vpre_load
-
-    luma_vsix
-    luma_vsix
-    luma_vfour
-
-end_16x16:
-
-    addi    r1, r1, 416         ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .data
-
-    .align 4
-HFilter:
-    .byte     0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0
-    .byte     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte     0, -6,123, 12,  0, -6,123, 12,  0, -6,123, 12,  0, -6,123, 12
-    .byte    -1,  0,  0,  0, -1,  0,  0,  0, -1,  0,  0,  0, -1,  0,  0,  0
-    .byte     2,-11,108, 36,  2,-11,108, 36,  2,-11,108, 36,  2,-11,108, 36
-    .byte    -8,  1,  0,  0, -8,  1,  0,  0, -8,  1,  0,  0, -8,  1,  0,  0
-    .byte     0, -9, 93, 50,  0, -9, 93, 50,  0, -9, 93, 50,  0, -9, 93, 50
-    .byte    -6,  0,  0,  0, -6,  0,  0,  0, -6,  0,  0,  0, -6,  0,  0,  0
-    .byte     3,-16, 77, 77,  3,-16, 77, 77,  3,-16, 77, 77,  3,-16, 77, 77
-    .byte   -16,  3,  0,  0,-16,  3,  0,  0,-16,  3,  0,  0,-16,  3,  0,  0
-    .byte     0, -6, 50, 93,  0, -6, 50, 93,  0, -6, 50, 93,  0, -6, 50, 93
-    .byte    -9,  0,  0,  0, -9,  0,  0,  0, -9,  0,  0,  0, -9,  0,  0,  0
-    .byte     1, -8, 36,108,  1, -8, 36,108,  1, -8, 36,108,  1, -8, 36,108
-    .byte   -11,  2,  0,  0,-11,  2,  0,  0,-11,  2,  0,  0,-11,  2,  0,  0
-    .byte     0, -1, 12,123,  0, -1, 12,123,  0, -1, 12,123,  0, -1, 12,123
-    .byte    -6,  0,  0,  0, -6,  0,  0,  0, -6,  0,  0,  0, -6,  0,  0,  0
-
-    .align 4
-VFilter:
-    .byte     0,  0,128,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte     0,  6,123, 12,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte     2, 11,108, 36,  8,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte     0,  9, 93, 50,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte     3, 16, 77, 77, 16,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte     0,  6, 50, 93,  9,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte     1,  8, 36,108, 11,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte     0,  1, 12,123,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-
-    .align 4
-b_hperm:
-    .byte     0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
-
-    .align 4
-B_0123:
-    .byte     0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
-
-    .align 4
-B_4567:
-    .byte     4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
-
-    .align 4
-B_89AB:
-    .byte     8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
-
-    .align 4
-b_hilo:
-    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
-
-    .align 4
-b_hilo_4x4:
-    .byte     0,  1,  2,  3, 16, 17, 18, 19,  0,  0,  0,  0,  0,  0,  0,  0
--- a/vp8/common/ppc/filter_bilinear_altivec.asm
+++ /dev/null
@@ -1,677 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl bilinear_predict4x4_ppc
-    .globl bilinear_predict8x4_ppc
-    .globl bilinear_predict8x8_ppc
-    .globl bilinear_predict16x16_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
-    lis     \R0, \LABEL@ha
-    la      \R1, \LABEL@l(\R0)
-    lvx     \V, \OFF, \R1
-.endm
-
-.macro load_vfilter V0, V1
-    load_c \V0, vfilter_b, r6, r9, r10
-
-    addi    r6,  r6, 16
-    lvx     \V1, r6, r10
-.endm
-
-.macro HProlog jump_label
-    ;# load up horizontal filter
-    slwi.   r5, r5, 4           ;# index into horizontal filter array
-
-    ;# index to the next set of vectors in the row.
-    li      r10, 16
-    li      r12, 32
-
-    ;# downshift by 7 ( divide by 128 ) at the end
-    vspltish v19, 7
-
-    ;# If there isn't any filtering to be done for the horizontal, then
-    ;#  just skip to the second pass.
-    beq     \jump_label
-
-    load_c v20, hfilter_b, r5, r9, r0
-
-    ;# setup constants
-    ;# v14 permutation value for alignment
-    load_c v28, b_hperm_b, 0, r9, r0
-
-    ;# rounding added in on the multiply
-    vspltisw v21, 8
-    vspltisw v18, 3
-    vslw    v18, v21, v18       ;# 0x00000040000000400000004000000040
-
-    slwi.   r6, r6, 5           ;# index into vertical filter array
-.endm
-
-;# Filters a horizontal line
-;# expects:
-;#  r3  src_ptr
-;#  r4  pitch
-;#  r10 16
-;#  r12 32
-;#  v17 perm input
-;#  v18 rounding
-;#  v19 shift
-;#  v20 filter taps
-;#  v21 tmp
-;#  v22 tmp
-;#  v23 tmp
-;#  v24 tmp
-;#  v25 tmp
-;#  v26 tmp
-;#  v27 tmp
-;#  v28 perm output
-;#
-.macro HFilter V
-    vperm   v24, v21, v21, v10  ;# v24 = 0123 1234 2345 3456
-    vperm   v25, v21, v21, v11  ;# v25 = 4567 5678 6789 789A
-
-    vmsummbm v24, v20, v24, v18
-    vmsummbm v25, v20, v25, v18
-
-    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
-
-    vsrh    v24, v24, v19       ;# divide v24 by 128
-
-    vpkuhus \V, v24, v24        ;# \V = scrambled 8-bit result
-.endm
-
-.macro hfilter_8 V, increment_counter
-    lvsl    v17,  0, r3         ;# permutate value for alignment
-
-    ;# input to filter is 9 bytes wide, output is 8 bytes.
-    lvx     v21,   0, r3
-    lvx     v22, r10, r3
-
-.if \increment_counter
-    add     r3, r3, r4
-.endif
-    vperm   v21, v21, v22, v17
-
-    HFilter \V
-.endm
-
-
-.macro load_and_align_8 V, increment_counter
-    lvsl    v17,  0, r3         ;# permutate value for alignment
-
-    ;# input to filter is 9 bytes wide, output is 8 bytes.
-    ;#  input can span two vectors if not aligned correctly.
-    lvx     v21,   0, r3
-    lvx     v22, r10, r3
-
-.if \increment_counter
-    add     r3, r3, r4
-.endif
-
-    vperm   \V, v21, v22, v17
-.endm
-
-.macro write_aligned_8 V, increment_counter
-    stvx    \V,  0, r7
-
-.if \increment_counter
-    add     r7, r7, r8
-.endif
-.endm
-
-.macro vfilter_16 P0 P1
-    vmuleub v22, \P0, v20       ;# 64 + 4 positive taps
-    vadduhm v22, v18, v22
-    vmuloub v23, \P0, v20
-    vadduhm v23, v18, v23
-
-    vmuleub v24, \P1, v21
-    vadduhm v22, v22, v24       ;# Re = evens, saturation unnecessary
-    vmuloub v25, \P1, v21
-    vadduhm v23, v23, v25       ;# Ro = odds
-
-    vsrh    v22, v22, v19       ;# divide by 128
-    vsrh    v23, v23, v19       ;# v22 v23 = evens, odds
-    vmrghh  \P0, v22, v23       ;# merge to get 16-bit result in order
-    vmrglh  v23, v22, v23
-    vpkuhus \P0, \P0, v23       ;# P0 = 8-bit result
-.endm
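Both bilinear passes reduce to the same two-tap blend; vfilter_16 above computes it vertically in even/odd byte halves. In scalar form (a sketch, not part of the original file; the hfilter_b/vfilter_b tables below hold the taps 128-16k and 16k for offset k):

/* Two-tap bilinear blend, Q7 taps (128 - f, f), +64 rounding. */
static unsigned char bilinear_tap(unsigned char a, unsigned char b, int f) {
    return (unsigned char)((a * (128 - f) + b * f + 64) >> 7);
}

The horizontal pass applies the same blend to horizontally adjacent pixels with f = 16 * x_offset; the vertical pass uses f = 16 * y_offset on the first-pass rows.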
-
-
-;# Write the 8-byte result in V to dst (r7) when dst_pitch != 8:
-;#  bounce the vector through the stack, copy 8 bytes with scalar
-;#  loads/stores, then advance the dst pointer by the pitch.
-.macro w_8x8 V, D, R, P
-    stvx    \V, 0, r1           ;# spill vector to the stack
-    lwz     \R, 0(r1)           ;# copy low 8 bytes via a scalar reg
-    stw     \R, 0(r7)
-    lwz     \R, 4(r1)
-    stw     \R, 4(r7)
-    add     \D, \D, \P          ;# dst += dst_pitch
-.endm
-
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict4x4_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xf830
-    ori     r12, r12, 0xfff8
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    HProlog second_pass_4x4_pre_copy_b
-
-    ;# Load up permutation constants
-    load_c v10, b_0123_b, 0, r9, r12
-    load_c v11, b_4567_b, 0, r9, r12
-
-    hfilter_8 v0, 1
-    hfilter_8 v1, 1
-    hfilter_8 v2, 1
-    hfilter_8 v3, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     store_out_4x4_b
-
-    hfilter_8 v4, 0
-
-    b   second_pass_4x4_b
-
-second_pass_4x4_pre_copy_b:
-    slwi    r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_8  v0, 1
-    load_and_align_8  v1, 1
-    load_and_align_8  v2, 1
-    load_and_align_8  v3, 1
-    load_and_align_8  v4, 1
-
-second_pass_4x4_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-
-store_out_4x4_b:
-
-    stvx    v0, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    stvx    v1, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    stvx    v2, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    stvx    v3, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-
-exit_4x4:
-
-    addi    r1, r1, 32          ;# recover stack
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict8x4_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xf830
-    ori     r12, r12, 0xfff8
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    HProlog second_pass_8x4_pre_copy_b
-
-    ;# Load up permutation constants
-    load_c v10, b_0123_b, 0, r9, r12
-    load_c v11, b_4567_b, 0, r9, r12
-
-    hfilter_8 v0, 1
-    hfilter_8 v1, 1
-    hfilter_8 v2, 1
-    hfilter_8 v3, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     store_out_8x4_b
-
-    hfilter_8 v4, 0
-
-    b   second_pass_8x4_b
-
-second_pass_8x4_pre_copy_b:
-    slwi    r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_8  v0, 1
-    load_and_align_8  v1, 1
-    load_and_align_8  v2, 1
-    load_and_align_8  v3, 1
-    load_and_align_8  v4, 1
-
-second_pass_8x4_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-
-store_out_8x4_b:
-
-    cmpi    cr0, r8, 8
-    beq     cr0, store_aligned_8x4_b
-
-    w_8x8   v0, r7, r0, r8
-    w_8x8   v1, r7, r0, r8
-    w_8x8   v2, r7, r0, r8
-    w_8x8   v3, r7, r0, r8
-
-    b       exit_8x4
-
-store_aligned_8x4_b:
-    load_c v10, b_hilo_b, 0, r9, r10
-
-    vperm   v0, v0, v1, v10
-    vperm   v2, v2, v3, v10
-
-    stvx    v0, 0, r7
-    addi    r7, r7, 16
-    stvx    v2, 0, r7
-
-exit_8x4:
-
-    addi    r1, r1, 32          ;# recover stack
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict8x8_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xfff0
-    ori     r12, r12, 0xffff
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    HProlog second_pass_8x8_pre_copy_b
-
-    ;# Load up permutation constants
-    load_c v10, b_0123_b, 0, r9, r12
-    load_c v11, b_4567_b, 0, r9, r12
-
-    hfilter_8 v0, 1
-    hfilter_8 v1, 1
-    hfilter_8 v2, 1
-    hfilter_8 v3, 1
-    hfilter_8 v4, 1
-    hfilter_8 v5, 1
-    hfilter_8 v6, 1
-    hfilter_8 v7, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     store_out_8x8_b
-
-    hfilter_8 v8, 0
-
-    b   second_pass_8x8_b
-
-second_pass_8x8_pre_copy_b:
-    slwi    r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_8  v0, 1
-    load_and_align_8  v1, 1
-    load_and_align_8  v2, 1
-    load_and_align_8  v3, 1
-    load_and_align_8  v4, 1
-    load_and_align_8  v5, 1
-    load_and_align_8  v6, 1
-    load_and_align_8  v7, 1
-    load_and_align_8  v8, 0
-
-second_pass_8x8_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-    vfilter_16 v4,  v5
-    vfilter_16 v5,  v6
-    vfilter_16 v6,  v7
-    vfilter_16 v7,  v8
-
-store_out_8x8_b:
-
-    cmpi    cr0, r8, 8
-    beq     cr0, store_aligned_8x8_b
-
-    w_8x8   v0, r7, r0, r8
-    w_8x8   v1, r7, r0, r8
-    w_8x8   v2, r7, r0, r8
-    w_8x8   v3, r7, r0, r8
-    w_8x8   v4, r7, r0, r8
-    w_8x8   v5, r7, r0, r8
-    w_8x8   v6, r7, r0, r8
-    w_8x8   v7, r7, r0, r8
-
-    b       exit_8x8
-
-store_aligned_8x8_b:
-    load_c v10, b_hilo_b, 0, r9, r10
-
-    vperm   v0, v0, v1, v10
-    vperm   v2, v2, v3, v10
-    vperm   v4, v4, v5, v10
-    vperm   v6, v6, v7, v10
-
-    stvx    v0, 0, r7
-    addi    r7, r7, 16
-    stvx    v2, 0, r7
-    addi    r7, r7, 16
-    stvx    v4, 0, r7
-    addi    r7, r7, 16
-    stvx    v6, 0, r7
-
-exit_8x8:
-
-    addi    r1, r1, 32          ;# recover stack
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-;# Filters a horizontal line
-;# expects:
-;#  r3  src_ptr
-;#  r4  pitch
-;#  r10 16
-;#  r12 32
-;#  v17 perm input
-;#  v18 rounding
-;#  v19 shift
-;#  v20 filter taps
-;#  v21 tmp
-;#  v22 tmp
-;#  v23 tmp
-;#  v24 tmp
-;#  v25 tmp
-;#  v26 tmp
-;#  v27 tmp
-;#  v28 perm output
-;#
-.macro hfilter_16 V, increment_counter
-
-    lvsl    v17,  0, r3         ;# permutate value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;#  input can span three vectors if not aligned correctly.
-    lvx     v21,   0, r3
-    lvx     v22, r10, r3
-    lvx     v23, r12, r3
-
-.if \increment_counter
-    add     r3, r3, r4
-.endif
-    vperm   v21, v21, v22, v17
-    vperm   v22, v22, v23, v17  ;# v21 v22 = 21 input pixels left-justified
-
-    ;# set 0
-    vmsummbm v24, v20, v21, v18 ;# taps times elements
-
-    ;# set 1
-    vsldoi  v23, v21, v22, 1
-    vmsummbm v25, v20, v23, v18
-
-    ;# set 2
-    vsldoi  v23, v21, v22, 2
-    vmsummbm v26, v20, v23, v18
-
-    ;# set 3
-    vsldoi  v23, v21, v22, 3
-    vmsummbm v27, v20, v23, v18
-
-    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
-    vpkswus v25, v26, v27       ;# v25 = 2 6 A E 3 7 B F
-
-    vsrh    v24, v24, v19       ;# divide v24, v25 by 128
-    vsrh    v25, v25, v19
-
-    vpkuhus \V, v24, v25        ;# \V = scrambled 8-bit result
-    vperm   \V, \V, v0, v28     ;# \V = correctly-ordered result
-.endm
-
-.macro load_and_align_16 V, increment_counter
-    lvsl    v17,  0, r3         ;# permutate value for alignment
-
-    ;# input is 16 bytes wide and can span two vectors
-    ;#  if not aligned correctly.
-    lvx     v21,   0, r3
-    lvx     v22, r10, r3
-
-.if \increment_counter
-    add     r3, r3, r4
-.endif
-
-    vperm   \V, v21, v22, v17
-.endm
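The lvsl/lvx/vperm triple used throughout these macros is the standard AltiVec misaligned-load idiom. With <altivec.h> intrinsics it reads roughly as follows (a sketch; the function name is illustrative):

#include <altivec.h>

/* Misaligned 16-byte load: lvsl mask + two aligned loads + vperm. */
static vector unsigned char load_unaligned(const unsigned char *p) {
    vector unsigned char perm = vec_lvsl(0, p);  /* alignment shuffle mask */
    vector unsigned char lo = vec_ld(0, p);      /* aligned vector holding p */
    vector unsigned char hi = vec_ld(15, p);     /* next aligned vector */
    return vec_perm(lo, hi, perm);               /* 16 bytes, left-justified */
}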
-
-.macro write_16 V, increment_counter
-    stvx    \V,  0, r7
-
-.if \increment_counter
-    add     r7, r7, r8
-.endif
-.endm
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict16x16_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xfff8
-    mtspr   256, r12            ;# set VRSAVE
-
-    HProlog second_pass_16x16_pre_copy_b
-
-    hfilter_16 v0,  1
-    hfilter_16 v1,  1
-    hfilter_16 v2,  1
-    hfilter_16 v3,  1
-    hfilter_16 v4,  1
-    hfilter_16 v5,  1
-    hfilter_16 v6,  1
-    hfilter_16 v7,  1
-    hfilter_16 v8,  1
-    hfilter_16 v9,  1
-    hfilter_16 v10, 1
-    hfilter_16 v11, 1
-    hfilter_16 v12, 1
-    hfilter_16 v13, 1
-    hfilter_16 v14, 1
-    hfilter_16 v15, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     store_out_16x16_b
-
-    hfilter_16 v16, 0
-
-    b   second_pass_16x16_b
-
-second_pass_16x16_pre_copy_b:
-    slwi    r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_16  v0,  1
-    load_and_align_16  v1,  1
-    load_and_align_16  v2,  1
-    load_and_align_16  v3,  1
-    load_and_align_16  v4,  1
-    load_and_align_16  v5,  1
-    load_and_align_16  v6,  1
-    load_and_align_16  v7,  1
-    load_and_align_16  v8,  1
-    load_and_align_16  v9,  1
-    load_and_align_16  v10, 1
-    load_and_align_16  v11, 1
-    load_and_align_16  v12, 1
-    load_and_align_16  v13, 1
-    load_and_align_16  v14, 1
-    load_and_align_16  v15, 1
-    load_and_align_16  v16, 0
-
-second_pass_16x16_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-    vfilter_16 v4,  v5
-    vfilter_16 v5,  v6
-    vfilter_16 v6,  v7
-    vfilter_16 v7,  v8
-    vfilter_16 v8,  v9
-    vfilter_16 v9,  v10
-    vfilter_16 v10, v11
-    vfilter_16 v11, v12
-    vfilter_16 v12, v13
-    vfilter_16 v13, v14
-    vfilter_16 v14, v15
-    vfilter_16 v15, v16
-
-store_out_16x16_b:
-
-    write_16 v0,  1
-    write_16 v1,  1
-    write_16 v2,  1
-    write_16 v3,  1
-    write_16 v4,  1
-    write_16 v5,  1
-    write_16 v6,  1
-    write_16 v7,  1
-    write_16 v8,  1
-    write_16 v9,  1
-    write_16 v10, 1
-    write_16 v11, 1
-    write_16 v12, 1
-    write_16 v13, 1
-    write_16 v14, 1
-    write_16 v15, 0
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .data
-
-    .align 4
-hfilter_b:
-    .byte   128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0
-    .byte   112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0
-    .byte    96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0
-    .byte    80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0
-    .byte    64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0
-    .byte    48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0
-    .byte    32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0
-    .byte    16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0
-
-    .align 4
-vfilter_b:
-    .byte   128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
-    .byte     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
-    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
-    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
-    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
-    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
-    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
-    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
-    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
-    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
-    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
-    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
-    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
-
-    .align 4
-b_hperm_b:
-    .byte     0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
-
-    .align 4
-b_0123_b:
-    .byte     0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
-
-    .align 4
-b_4567_b:
-    .byte     4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
-
-    .align 4
-b_hilo_b:
-    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
--- a/vp8/common/ppc/idctllm_altivec.asm
+++ /dev/null
@@ -1,189 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl short_idct4x4llm_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
-    lis     \R0, \LABEL@ha
-    la      \R1, \LABEL@l(\R0)
-    lvx     \V, \OFF, \R1
-.endm
-
-;# r3 short *input
-;# r4 short *output
-;# r5 int pitch
-    .align 2
-short_idct4x4llm_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xfff8
-    mtspr   256, r12            ;# set VRSAVE
-
-    load_c v8, sinpi8sqrt2, 0, r9, r10
-    load_c v9, cospi8sqrt2minus1, 0, r9, r10
-    load_c v10, hi_hi, 0, r9, r10
-    load_c v11, lo_lo, 0, r9, r10
-    load_c v12, shift_16, 0, r9, r10
-
-    li      r10,  16
-    lvx     v0,   0, r3         ;# input ip[0], ip[ 4]
-    lvx     v1, r10, r3         ;# input ip[8], ip[12]
-
-    ;# first pass
-    vupkhsh v2, v0
-    vupkhsh v3, v1
-    vaddsws v6, v2, v3          ;# a1 = ip[0]+ip[8]
-    vsubsws v7, v2, v3          ;# b1 = ip[0]-ip[8]
-
-    vupklsh v0, v0
-    vmulosh v4, v0, v8
-    vsraw   v4, v4, v12
-    vaddsws v4, v4, v0          ;# ip[ 4] * sin(pi/8) * sqrt(2)
-
-    vupklsh v1, v1
-    vmulosh v5, v1, v9
-    vsraw   v5, v5, v12         ;# ip[12] * cos(pi/8) * sqrt(2)
-    vaddsws v5, v5, v1
-
-    vsubsws v4, v4, v5          ;# c1
-
-    vmulosh v3, v1, v8
-    vsraw   v3, v3, v12
-    vaddsws v3, v3, v1          ;# ip[12] * sin(pi/8) * sqrt(2)
-
-    vmulosh v5, v0, v9
-    vsraw   v5, v5, v12         ;# ip[ 4] * cos(pi/8) * sqrt(2)
-    vaddsws v5, v5, v0
-
-    vaddsws v3, v3, v5          ;# d1
-
-    vaddsws v0, v6, v3          ;# a1 + d1
-    vsubsws v3, v6, v3          ;# a1 - d1
-
-    vaddsws v1, v7, v4          ;# b1 + c1
-    vsubsws v2, v7, v4          ;# b1 - c1
-
-    ;# transpose input
-    vmrghw  v4, v0, v1          ;# a0 b0 a1 b1
-    vmrghw  v5, v2, v3          ;# c0 d0 c1 d1
-
-    vmrglw  v6, v0, v1          ;# a2 b2 a3 b3
-    vmrglw  v7, v2, v3          ;# c2 d2 c3 d3
-
-    vperm   v0, v4, v5, v10     ;# a0 b0 c0 d0
-    vperm   v1, v4, v5, v11     ;# a1 b1 c1 d1
-
-    vperm   v2, v6, v7, v10     ;# a2 b2 c2 d2
-    vperm   v3, v6, v7, v11     ;# a3 b3 c3 d3
-
-    ;# second pass
-    vaddsws v6, v0, v2          ;# a1 = ip[0]+ip[8]
-    vsubsws v7, v0, v2          ;# b1 = ip[0]-ip[8]
-
-    vmulosh v4, v1, v8
-    vsraw   v4, v4, v12
-    vaddsws v4, v4, v1          ;# ip[ 4] * sin(pi/8) * sqrt(2)
-
-    vmulosh v5, v3, v9
-    vsraw   v5, v5, v12         ;# ip[12] * cos(pi/8) * sqrt(2)
-    vaddsws v5, v5, v3
-
-    vsubsws v4, v4, v5          ;# c1
-
-    vmulosh v2, v3, v8
-    vsraw   v2, v2, v12
-    vaddsws v2, v2, v3          ;# ip[12] * sin(pi/8) * sqrt(2)
-
-    vmulosh v5, v1, v9
-    vsraw   v5, v5, v12         ;# ip[ 4] * cos(pi/8) * sqrt(2)
-    vaddsws v5, v5, v1
-
-    vaddsws v3, v2, v5          ;# d1
-
-    vaddsws v0, v6, v3          ;# a1 + d1
-    vsubsws v3, v6, v3          ;# a1 - d1
-
-    vaddsws v1, v7, v4          ;# b1 + c1
-    vsubsws v2, v7, v4          ;# b1 - c1
-
-    vspltish v6, 4
-    vspltish v7, 3
-
-    vpkswss v0, v0, v1
-    vpkswss v1, v2, v3
-
-    vaddshs v0, v0, v6
-    vaddshs v1, v1, v6
-
-    vsrah   v0, v0, v7
-    vsrah   v1, v1, v7
-
-    ;# transpose output
-    vmrghh  v2, v0, v1          ;# a0 c0 a1 c1 a2 c2 a3 c3
-    vmrglh  v3, v0, v1          ;# b0 d0 b1 d1 b2 d2 b3 d3
-
-    vmrghh  v0, v2, v3          ;# a0 b0 c0 d0 a1 b1 c1 d1
-    vmrglh  v1, v2, v3          ;# a2 b2 c2 d2 a3 b3 c3 d3
-
-    stwu    r1,-416(r1)         ;# create space on the stack
-
-    stvx    v0,  0, r1
-    lwz     r6, 0(r1)
-    stw     r6, 0(r4)
-    lwz     r6, 4(r1)
-    stw     r6, 4(r4)
-
-    add     r4, r4, r5
-
-    lwz     r6,  8(r1)
-    stw     r6,  0(r4)
-    lwz     r6, 12(r1)
-    stw     r6,  4(r4)
-
-    add     r4, r4, r5
-
-    stvx    v1,  0, r1
-    lwz     r6, 0(r1)
-    stw     r6, 0(r4)
-    lwz     r6, 4(r1)
-    stw     r6, 4(r4)
-
-    add     r4, r4, r5
-
-    lwz     r6,  8(r1)
-    stw     r6,  0(r4)
-    lwz     r6, 12(r1)
-    stw     r6,  4(r4)
-
-    addi    r1, r1, 416         ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
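The two vector passes above compute the standard VP8 4x4 inverse-transform butterfly per column and then per row. A scalar sketch of one pass (illustrative, not lifted from this file), using the Q16 constants defined below:

#define SINPI8SQRT2       35468   /* sin(pi/8)*sqrt(2) in Q16 */
#define COSPI8SQRT2MINUS1 20091   /* cos(pi/8)*sqrt(2) - 1 in Q16 */

/* One 4-point butterfly over in[0..3] = ip[0], ip[4], ip[8], ip[12]. */
static void idct4_pass(const int in[4], int out[4]) {
    int a1 = in[0] + in[2];
    int b1 = in[0] - in[2];
    int t1 = (in[1] * SINPI8SQRT2) >> 16;
    int t2 = in[3] + ((in[3] * COSPI8SQRT2MINUS1) >> 16);
    int c1 = t1 - t2;
    int d1;
    t1 = in[1] + ((in[1] * COSPI8SQRT2MINUS1) >> 16);
    t2 = (in[3] * SINPI8SQRT2) >> 16;
    d1 = t1 + t2;
    out[0] = a1 + d1;
    out[3] = a1 - d1;
    out[1] = b1 + c1;
    out[2] = b1 - c1;   /* after the second pass, each value is (x+4)>>3 */
}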
-
-    .align 4
-sinpi8sqrt2:
-    .short  35468, 35468, 35468, 35468, 35468, 35468, 35468, 35468
-
-    .align 4
-cospi8sqrt2minus1:
-    .short  20091, 20091, 20091, 20091, 20091, 20091, 20091, 20091
-
-    .align 4
-shift_16:
-    .long      16,    16,    16,    16
-
-    .align 4
-hi_hi:
-    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
-
-    .align 4
-lo_lo:
-    .byte     8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
--- a/vp8/common/ppc/loopfilter_altivec.c
+++ /dev/null
@@ -1,127 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "loopfilter.h"
-#include "onyxc_int.h"
-
-typedef void loop_filter_function_y_ppc
-(
-  unsigned char *s,   // source pointer
-  int p,              // pitch
-  const signed char *flimit,
-  const signed char *limit,
-  const signed char *thresh
-);
-
-typedef void loop_filter_function_uv_ppc
-(
-  unsigned char *u,   // source pointer
-  unsigned char *v,   // source pointer
-  int p,              // pitch
-  const signed char *flimit,
-  const signed char *limit,
-  const signed char *thresh
-);
-
-typedef void loop_filter_function_s_ppc
-(
-  unsigned char *s,   // source pointer
-  int p,              // pitch
-  const signed char *flimit
-);
-
-loop_filter_function_y_ppc mbloop_filter_horizontal_edge_y_ppc;
-loop_filter_function_y_ppc mbloop_filter_vertical_edge_y_ppc;
-loop_filter_function_y_ppc loop_filter_horizontal_edge_y_ppc;
-loop_filter_function_y_ppc loop_filter_vertical_edge_y_ppc;
-
-loop_filter_function_uv_ppc mbloop_filter_horizontal_edge_uv_ppc;
-loop_filter_function_uv_ppc mbloop_filter_vertical_edge_uv_ppc;
-loop_filter_function_uv_ppc loop_filter_horizontal_edge_uv_ppc;
-loop_filter_function_uv_ppc loop_filter_vertical_edge_uv_ppc;
-
-loop_filter_function_s_ppc loop_filter_simple_horizontal_edge_ppc;
-loop_filter_function_s_ppc loop_filter_simple_vertical_edge_ppc;
-
-// Horizontal MB filtering
-void loop_filter_mbh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                         int y_stride, int uv_stride, loop_filter_info *lfi) {
-  mbloop_filter_horizontal_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
-
-  if (u_ptr)
-    mbloop_filter_horizontal_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
-}
-
-void loop_filter_mbhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                          int y_stride, int uv_stride, loop_filter_info *lfi) {
-  (void)u_ptr;
-  (void)v_ptr;
-  (void)uv_stride;
-  loop_filter_simple_horizontal_edge_ppc(y_ptr, y_stride, lfi->mbflim);
-}
-
-// Vertical MB Filtering
-void loop_filter_mbv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                         int y_stride, int uv_stride, loop_filter_info *lfi) {
-  mbloop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
-
-  if (u_ptr)
-    mbloop_filter_vertical_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
-}
-
-void loop_filter_mbvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                          int y_stride, int uv_stride, loop_filter_info *lfi) {
-  (void)u_ptr;
-  (void)v_ptr;
-  (void)uv_stride;
-  loop_filter_simple_vertical_edge_ppc(y_ptr, y_stride, lfi->mbflim);
-}
-
-// Horizontal B Filtering
-void loop_filter_bh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                        int y_stride, int uv_stride, loop_filter_info *lfi) {
-  // These should all be done at once with one call, instead of 3
-  loop_filter_horizontal_edge_y_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
-  loop_filter_horizontal_edge_y_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
-  loop_filter_horizontal_edge_y_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
-
-  if (u_ptr)
-    loop_filter_horizontal_edge_uv_ppc(u_ptr + 4 * uv_stride, v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr);
-}
-
-void loop_filter_bhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                         int y_stride, int uv_stride, loop_filter_info *lfi) {
-  (void)u_ptr;
-  (void)v_ptr;
-  (void)uv_stride;
-  loop_filter_simple_horizontal_edge_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim);
-  loop_filter_simple_horizontal_edge_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim);
-  loop_filter_simple_horizontal_edge_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim);
-}
-
-// Vertical B Filtering
-void loop_filter_bv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                        int y_stride, int uv_stride, loop_filter_info *lfi) {
-  loop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->flim, lfi->lim, lfi->thr);
-
-  if (u_ptr)
-    loop_filter_vertical_edge_uv_ppc(u_ptr + 4, v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr);
-}
-
-void loop_filter_bvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                         int y_stride, int uv_stride, loop_filter_info *lfi) {
-  (void)u_ptr;
-  (void)v_ptr;
-  (void)uv_stride;
-  loop_filter_simple_vertical_edge_ppc(y_ptr + 4,  y_stride, lfi->flim);
-  loop_filter_simple_vertical_edge_ppc(y_ptr + 8,  y_stride, lfi->flim);
-  loop_filter_simple_vertical_edge_ppc(y_ptr + 12, y_stride, lfi->flim);
-}
--- a/vp8/common/ppc/loopfilter_filters_altivec.asm
+++ /dev/null
@@ -1,1253 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl mbloop_filter_horizontal_edge_y_ppc
-    .globl loop_filter_horizontal_edge_y_ppc
-    .globl mbloop_filter_vertical_edge_y_ppc
-    .globl loop_filter_vertical_edge_y_ppc
-
-    .globl mbloop_filter_horizontal_edge_uv_ppc
-    .globl loop_filter_horizontal_edge_uv_ppc
-    .globl mbloop_filter_vertical_edge_uv_ppc
-    .globl loop_filter_vertical_edge_uv_ppc
-
-    .globl loop_filter_simple_horizontal_edge_ppc
-    .globl loop_filter_simple_vertical_edge_ppc
-
-    .text
-;# We often need to perform transposes (and other transpose-like operations)
-;#   on matrices of data.  This is simplified by the fact that we usually
-;#   operate on hunks of data whose dimensions are powers of 2, or at least
-;#   divisible by highish powers of 2.
-;#
-;#   These operations can be very confusing.  They become more straightforward
-;#   when we think of them as permutations of address bits: Concatenate a
-;#   group of vector registers and think of it as occupying a block of
-;#   memory beginning at address zero.  The low four bits 0...3 of the
-;#   address then correspond to position within a register, the higher-order
-;#   address bits select the register.
-;#
-;#   Although register selection, at the code level, is arbitrary, things
-;#   are simpler if we use contiguous ranges of register numbers, simpler
-;#   still if the low-order bits of the register number correspond to
-;#   conceptual address bits.  We do this whenever reasonable.
-;#
-;#   A 16x16 transpose can then be thought of as an operation on
-;#   a 256-element block of memory.  It takes 8 bits 0...7 to address this
-;#   memory and the effect of a transpose is to interchange address bit
-;#   0 with 4, 1 with 5, 2 with 6, and 3 with 7.  Bits 0...3 index the
-;#   column, which is interchanged with the row addressed by bits 4..7.
-;#
-;#   The altivec merge instructions provide a rapid means of effecting
-;#   many of these transforms.  They operate at three widths (8,16,32).
-;#   Writing V(x) for vector register #x, paired merges permute address
-;#   indices as follows.
-;#
-;#   0->1  1->2  2->3  3->(4+d)  (4+s)->0:
-;#
-;#      vmrghb  V( x),          V( y), V( y + (1<<s))
-;#      vmrglb  V( x + (1<<d)), V( y), V( y + (1<<s))
-;#
-;#
-;#   =0=   1->2  2->3  3->(4+d)  (4+s)->1:
-;#
-;#      vmrghh  V( x),          V( y), V( y + (1<<s))
-;#      vmrglh  V( x + (1<<d)), V( y), V( y + (1<<s))
-;#
-;#
-;#   =0=   =1=   2->3  3->(4+d)  (4+s)->2:
-;#
-;#      vmrghw  V( x),          V( y), V( y + (1<<s))
-;#      vmrglw  V( x + (1<<d)), V( y), V( y + (1<<s))
-;#
-;#
-;#   Unfortunately, there is no doubleword merge instruction.
-;#   The following sequence uses "vperm" as a substitute.
-;#   Assuming that the selection masks b_hihi and b_lolo (defined in LFppc.c)
-;#   are in registers Vhihi and Vlolo, we can also effect the permutation
-;#
-;#   =0=   =1=   =2=   3->(4+d)  (4+s)->3   by the sequence:
-;#
-;#      vperm   V( x),          V( y), V( y + (1<<s)), Vhihi
-;#      vperm   V( x + (1<<d)), V( y), V( y + (1<<s)), Vlolo
-;#
-;#
-;#   Except for bits s and d, the other relationships between register
-;#   number (= high-order part of address) bits are at the disposal of
-;#   the programmer.
-;#
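Concretely, the address-bit view says a full 16x16 byte transpose of 16 registers is nothing more than swapping the low (position) nibble of each element's 8-bit address with the high (register) nibble. A throwaway C illustration (sketch):

/* A 16x16 byte transpose is a swap of the two address nibbles. */
static void transpose_by_address_bits(const unsigned char in[256],
                                      unsigned char out[256]) {
    int a;
    for (a = 0; a < 256; a++)
        out[((a & 0x0f) << 4) | (a >> 4)] = in[a];
}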
-
-;# To avoid excess transposes, we filter all 3 vertical luma subblock
-;#   edges together.  This requires a single 16x16 transpose, which, in
-;#   the above language, amounts to the following permutation of address
-;#   indices:  0<->4   1<->5  2<->6  3<->7, which we accomplish by
-;#   4 iterations of the cyclic transform 0->1->2->3->4->5->6->7->0.
-;#
-;#   Except for the fact that the destination registers get written
-;#   before we are done referencing the old contents, the cyclic transform
-;#   is effected by
-;#
-;#      x = 0;  do {
-;#          vmrghb V(2x),   V(x), V(x+8);
-;#          vmrglb V(2x+1), V(x), V(x+8);
-;#      } while( ++x < 8);
-;#
-;#   For clarity, and because we can afford it, we do this transpose
-;#   using all 32 registers, alternating the banks 0..15  and  16 .. 31,
-;#   leaving the final result in 16 .. 31, as the lower registers are
-;#   used in the filtering itself.
-;#
-.macro Tpair A, B, X, Y
-    vmrghb  \A, \X, \Y
-    vmrglb  \B, \X, \Y
-.endm
-
-;# Each step takes 8*2 = 16 instructions
-
-.macro t16_even
-    Tpair v16,v17,  v0,v8
-    Tpair v18,v19,  v1,v9
-    Tpair v20,v21,  v2,v10
-    Tpair v22,v23,  v3,v11
-    Tpair v24,v25,  v4,v12
-    Tpair v26,v27,  v5,v13
-    Tpair v28,v29,  v6,v14
-    Tpair v30,v31,  v7,v15
-.endm
-
-.macro t16_odd
-    Tpair v0,v1, v16,v24
-    Tpair v2,v3, v17,v25
-    Tpair v4,v5, v18,v26
-    Tpair v6,v7, v19,v27
-    Tpair v8,v9, v20,v28
-    Tpair v10,v11, v21,v29
-    Tpair v12,v13, v22,v30
-    Tpair v14,v15, v23,v31
-.endm
-
-;# Whole transpose takes 4*16 = 64 instructions
-
-.macro t16_full
-    t16_odd
-    t16_even
-    t16_odd
-    t16_even
-.endm
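A C model of the scheme (a sketch using the reg[16][16] layout implied above): one Tpair pass rotates the 8 address bits left by one, so four passes rotate by four, i.e. swap the two nibbles and hence transpose. The asm alternates register banks instead of copying, but the data movement is the same:

#include <string.h>

/* One Tpair pass: reg[2x] gets the high-half interleave of reg[x] and
 * reg[x+8] (vmrghb), reg[2x+1] the low-half interleave (vmrglb). */
static void merge_pass(unsigned char reg[16][16]) {
    unsigned char out[16][16];
    int x, j;
    for (x = 0; x < 8; x++)
        for (j = 0; j < 8; j++) {
            out[2 * x][2 * j]         = reg[x][j];       /* vmrghb */
            out[2 * x][2 * j + 1]     = reg[x + 8][j];
            out[2 * x + 1][2 * j]     = reg[x][j + 8];   /* vmrglb */
            out[2 * x + 1][2 * j + 1] = reg[x + 8][j + 8];
        }
    memcpy(reg, out, sizeof(out));
}

/* Four passes rotate by four bits: reg[r][c] ends up as the old reg[c][r]. */
static void t16_full_model(unsigned char reg[16][16]) {
    int i;
    for (i = 0; i < 4; i++)
        merge_pass(reg);
}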
-
-;# Vertical edge filtering requires transposes.  For the simple filter,
-;#   we need to convert 16 rows of 4 pels each into 4 registers of 16 pels
-;#   each.  Writing 0 ... 63 for the pixel indices, the desired result is:
-;#
-;#  v0 =  0  1 ... 14 15
-;#  v1 = 16 17 ... 30 31
-;#  v2 = 32 33 ... 46 47
-;#  v3 = 48 49 ... 62 63
-;#
-;#  In frame-buffer memory, the layout is:
-;#
-;#     0  16  32  48
-;#     1  17  33  49
-;#     ...
-;#    15  31  47  63.
-;#
-;#  We begin by reading the data 32 bits at a time (using scalar operations)
-;#  into a temporary array, reading the rows of the array into vector registers,
-;#  with the following layout:
-;#
-;#  v0 =  0 16 32 48  4 20 36 52  8 24 40 56  12 28 44 60
-;#  v1 =  1 17 33 49  5 21 ...                      45 61
-;#  v2 =  2 18 ...                                  46 62
-;#  v3 =  3 19 ...                                  47 63
-;#
-;#  From the "address-bit" perspective discussed above, we simply need to
-;#  interchange bits 0 <-> 4 and 1 <-> 5, leaving bits 2 and 3 alone.
-;#  In other words, we transpose each of the four 4x4 submatrices.
-;#
-;#  This transformation is its own inverse, and we need to perform it
-;#  again before writing the pixels back into the frame buffer.
-;#
-;#  It acts in place on registers v0...v3, uses v4...v7 as temporaries,
-;#  and assumes that v14/v15 contain the b_hihi/b_lolo selectors
-;#  defined above.  We think of both groups of 4 registers as having
-;#  "addresses" {0,1,2,3} * 16.
-;#
-.macro Transpose4times4x4 Vlo, Vhi
-
-    ;# d=s=0        0->1  1->2  2->3  3->4  4->0  =5=
-
-    vmrghb  v4, v0, v1
-    vmrglb  v5, v0, v1
-    vmrghb  v6, v2, v3
-    vmrglb  v7, v2, v3
-
-    ;# d=0 s=1      =0=   1->2  2->3  3->4  4->5  5->1
-
-    vmrghh  v0, v4, v6
-    vmrglh  v1, v4, v6
-    vmrghh  v2, v5, v7
-    vmrglh  v3, v5, v7
-
-    ;# d=s=0        =0=   =1=   2->3  3->4  4->2  =5=
-
-    vmrghw  v4, v0, v1
-    vmrglw  v5, v0, v1
-    vmrghw  v6, v2, v3
-    vmrglw  v7, v2, v3
-
-    ;# d=0  s=1     =0=   =1=   =2=   3->4  4->5  5->3
-
-    vperm   v0, v4, v6, \Vlo
-    vperm   v1, v4, v6, \Vhi
-    vperm   v2, v5, v7, \Vlo
-    vperm   v3, v5, v7, \Vhi
-.endm
-;# end Transpose4times4x4
-
-
-;# Normal mb vertical edge filter transpose.
-;#
-;#   We read 8 columns of data, initially in the following pattern:
-;#
-;#  (0,0)  (1,0) ... (7,0)  (0,1)  (1,1) ... (7,1)
-;#  (0,2)  (1,2) ... (7,2)  (0,3)  (1,3) ... (7,3)
-;#  ...
-;#  (0,14) (1,14) .. (7,14) (0,15) (1,15) .. (7,15)
-;#
-;#   and wish to convert to:
-;#
-;#  (0,0) ... (0,15)
-;#  (1,0) ... (1,15)
-;#  ...
-;#  (7,0) ... (7,15).
-;#
-;#  In "address bit" language, we wish to map
-;#
-;#  0->4  1->5  2->6  3->0  4->1  5->2  6->3, i.e., I -> (I+4) mod 7.
-;#
-;#  This can be accomplished by 4 iterations of the cyclic transform
-;#
-;#  I -> (I+1) mod 7;
-;#
-;#  each iteration can be realized by (d=0, s=2):
-;#
-;#  x = 0;  do  Tpair( V(2x),V(2x+1),  V(x),V(x+4))  while( ++x < 4);
-;#
-;#  The input/output is in registers v0...v7.  We use v10...v17 as mirrors;
-;#  preserving v8 = sign converter.
-;#
-;#  Inverse transpose is similar, except here I -> (I+3) mod 7 and the
-;#  result lands in the "mirror" registers v10...v17
-;#
-.macro t8x16_odd
-    Tpair v10, v11,  v0, v4
-    Tpair v12, v13,  v1, v5
-    Tpair v14, v15,  v2, v6
-    Tpair v16, v17,  v3, v7
-.endm
-
-.macro t8x16_even
-    Tpair v0, v1,  v10, v14
-    Tpair v2, v3,  v11, v15
-    Tpair v4, v5,  v12, v16
-    Tpair v6, v7,  v13, v17
-.endm
-
-.macro transpose8x16_fwd
-    t8x16_odd
-    t8x16_even
-    t8x16_odd
-    t8x16_even
-.endm
-
-.macro transpose8x16_inv
-    t8x16_odd
-    t8x16_even
-    t8x16_odd
-.endm
-
-.macro Transpose16x16
-    vmrghb  v0, v16, v24
-    vmrglb  v1, v16, v24
-    vmrghb  v2, v17, v25
-    vmrglb  v3, v17, v25
-    vmrghb  v4, v18, v26
-    vmrglb  v5, v18, v26
-    vmrghb  v6, v19, v27
-    vmrglb  v7, v19, v27
-    vmrghb  v8, v20, v28
-    vmrglb  v9, v20, v28
-    vmrghb  v10, v21, v29
-    vmrglb  v11, v21, v29
-    vmrghb  v12, v22, v30
-    vmrglb  v13, v22, v30
-    vmrghb  v14, v23, v31
-    vmrglb  v15, v23, v31
-    vmrghb  v16, v0, v8
-    vmrglb  v17, v0, v8
-    vmrghb  v18, v1, v9
-    vmrglb  v19, v1, v9
-    vmrghb  v20, v2, v10
-    vmrglb  v21, v2, v10
-    vmrghb  v22, v3, v11
-    vmrglb  v23, v3, v11
-    vmrghb  v24, v4, v12
-    vmrglb  v25, v4, v12
-    vmrghb  v26, v5, v13
-    vmrglb  v27, v5, v13
-    vmrghb  v28, v6, v14
-    vmrglb  v29, v6, v14
-    vmrghb  v30, v7, v15
-    vmrglb  v31, v7, v15
-    vmrghb  v0, v16, v24
-    vmrglb  v1, v16, v24
-    vmrghb  v2, v17, v25
-    vmrglb  v3, v17, v25
-    vmrghb  v4, v18, v26
-    vmrglb  v5, v18, v26
-    vmrghb  v6, v19, v27
-    vmrglb  v7, v19, v27
-    vmrghb  v8, v20, v28
-    vmrglb  v9, v20, v28
-    vmrghb  v10, v21, v29
-    vmrglb  v11, v21, v29
-    vmrghb  v12, v22, v30
-    vmrglb  v13, v22, v30
-    vmrghb  v14, v23, v31
-    vmrglb  v15, v23, v31
-    vmrghb  v16, v0, v8
-    vmrglb  v17, v0, v8
-    vmrghb  v18, v1, v9
-    vmrglb  v19, v1, v9
-    vmrghb  v20, v2, v10
-    vmrglb  v21, v2, v10
-    vmrghb  v22, v3, v11
-    vmrglb  v23, v3, v11
-    vmrghb  v24, v4, v12
-    vmrglb  v25, v4, v12
-    vmrghb  v26, v5, v13
-    vmrglb  v27, v5, v13
-    vmrghb  v28, v6, v14
-    vmrglb  v29, v6, v14
-    vmrghb  v30, v7, v15
-    vmrglb  v31, v7, v15
-.endm
-
-;# load_g loads a global vector (whose address is in the local variable Gptr)
-;#   into vector register Vreg.  Trashes r0
-.macro load_g Vreg, Gptr
-    lwz     r0, \Gptr
-    lvx     \Vreg, 0, r0
-.endm
-
-;# Exploit the saturation here: if the difference is negative it
-;# is clamped to 0, and ORing 0 with the positive difference
-;# yields the absolute value.
-;# RES = abs( A-B), trashes TMP
-.macro Abs RES, TMP, A, B
-    vsububs \RES, \A, \B
-    vsububs \TMP, \B, \A
-    vor     \RES, \RES, \TMP
-.endm
-
-;# RES = Max( RES, abs( A-B)), trashes TMP
-.macro max_abs RES, TMP, A, B
-    vsububs \TMP, \A, \B
-    vmaxub  \RES, \RES, \TMP
-    vsububs \TMP, \B, \A
-    vmaxub  \RES, \RES, \TMP
-.endm
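In scalar terms the Abs/max_abs trick is (sketch):

/* Unsigned saturating subtract clamps the negative direction to zero,
 * so OR-ing both directions yields |a - b|. */
static unsigned char sat_sub_u8(unsigned char a, unsigned char b) {
    return (unsigned char)(a > b ? a - b : 0);
}

static unsigned char abs_diff_u8(unsigned char a, unsigned char b) {
    return sat_sub_u8(a, b) | sat_sub_u8(b, a);  /* one side is always 0 */
}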
-
-.macro Masks
-    ;# build masks
-    ;# input is all 8-bit unsigned (0-255).  We need to test
-    ;# abs(vala-valb) > limit, but there is no need to compare each
-    ;# value to the limit: find the max of the absolute differences
-    ;# and compare that to the limit.
-    ;# First hev
-    Abs     v14, v13, v2, v3    ;# |P1 - P0|
-    max_abs  v14, v13, v5, v4    ;# |Q1 - Q0|
-
-    vcmpgtub v10, v14, v10      ;# HEV = true if thresh exceeded
-
-    ;# Next limit
-    max_abs  v14, v13, v0, v1    ;# |P3 - P2|
-    max_abs  v14, v13, v1, v2    ;# |P2 - P1|
-    max_abs  v14, v13, v6, v5    ;# |Q2 - Q1|
-    max_abs  v14, v13, v7, v6    ;# |Q3 - Q2|
-
-    vcmpgtub v9, v14, v9        ;# R = true if limit exceeded
-
-    ;# flimit
-    Abs     v14, v13, v3, v4    ;# |P0 - Q0|
-
-    vcmpgtub v8, v14, v8        ;# X = true if flimit exceeded
-
-    vor     v8, v8, v9          ;# R = true if flimit or limit exceeded
-    ;# done building masks
-.endm
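Per pixel lane, Masks therefore computes the following predicates (a scalar sketch reusing abs_diff_u8 from above; note the sense of the combined mask: where it is true, the later vandc zeroes the filter value, so the edge is left unfiltered):

/* Sketch of the hev/mask computation; names are illustrative. */
static void build_masks(unsigned char p3, unsigned char p2, unsigned char p1,
                        unsigned char p0, unsigned char q0, unsigned char q1,
                        unsigned char q2, unsigned char q3,
                        unsigned char flimit, unsigned char limit,
                        unsigned char thresh,
                        int *hev, int *dont_filter) {
    *hev = abs_diff_u8(p1, p0) > thresh || abs_diff_u8(q1, q0) > thresh;
    *dont_filter =
        abs_diff_u8(p3, p2) > limit || abs_diff_u8(p2, p1) > limit ||
        abs_diff_u8(p1, p0) > limit || abs_diff_u8(q1, q0) > limit ||
        abs_diff_u8(q2, q1) > limit || abs_diff_u8(q3, q2) > limit ||
        abs_diff_u8(p0, q0) > flimit;
}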
-
-.macro build_constants RFL, RLI, RTH, FL, LI, TH
-    ;# build constants
-    lvx     \FL, 0, \RFL        ;# flimit
-    lvx     \LI, 0, \RLI        ;# limit
-    lvx     \TH, 0, \RTH        ;# thresh
-
-    vspltisb v11, 8
-    vspltisb v12, 4
-    vslb    v11, v11, v12       ;# 0x80808080808080808080808080808080
-.endm
-
-.macro load_data_y
-    ;# setup strides/pointers to be able to access
-    ;# all of the data
-    add     r5, r4, r4          ;# r5 = 2 * stride
-    sub     r6, r3, r5          ;# r6 -> 2 rows back
-    neg     r7, r4              ;# r7 = -stride
-
-    ;# load 16 pixels worth of data to work on
-    sub     r0, r6, r5          ;# r0 -> 4 rows back (temp)
-    lvx     v0,  0, r0          ;# P3  (read only)
-    lvx     v1, r7, r6          ;# P2
-    lvx     v2,  0, r6          ;# P1
-    lvx     v3, r7, r3          ;# P0
-    lvx     v4,  0, r3          ;# Q0
-    lvx     v5, r4, r3          ;# Q1
-    lvx     v6, r5, r3          ;# Q2
-    add     r0, r3, r5          ;# r0 -> 2 rows fwd (temp)
-    lvx     v7, r4, r0          ;# Q3  (read only)
-.endm
-
-;# Expects
-;#  v10 == HEV
-;#  v13 == tmp
-;#  v14 == tmp
-.macro common_adjust P0, Q0, P1, Q1, HEV_PRESENT
-    vxor    \P1, \P1, v11       ;# SP1
-    vxor    \P0, \P0, v11       ;# SP0
-    vxor    \Q0, \Q0, v11       ;# SQ0
-    vxor    \Q1, \Q1, v11       ;# SQ1
-
-    vsubsbs v13, \P1, \Q1       ;# f  = c (P1 - Q1)
-.if \HEV_PRESENT
-    vand    v13, v13, v10       ;# f &= hev
-.endif
-    vsubsbs v14, \Q0, \P0       ;# -126 <=  X = Q0-P0  <= +126
-    vaddsbs v13, v13, v14
-    vaddsbs v13, v13, v14
-    vaddsbs v13, v13, v14       ;# A = c( c(P1-Q1) + 3*(Q0-P0))
-
-    vandc   v13, v13, v8        ;# f &= ~mask (mask = limit/flimit exceeded)
-
-    vspltisb v8, 3
-    vspltisb v9, 4
-
-    vaddsbs v14, v13, v9        ;# f1 = c (f+4)
-    vaddsbs v15, v13, v8        ;# f2 = c (f+3)
-
-    vsrab   v13, v14, v8        ;# f1 >>= 3
-    vsrab   v15, v15, v8        ;# f2 >>= 3
-
-    vsubsbs \Q0, \Q0, v13       ;# u1 = c (SQ0 - f1)
-    vaddsbs \P0, \P0, v15       ;# u2 = c (SP0 + f2)
-.endm
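In scalar form, common_adjust is the familiar VP8 inner-edge adjustment (a sketch; pixels are already signed via the XOR with 0x80, the three saturating adds are collapsed into one clamp for brevity, and the "f &= ~mask" step is omitted):

static signed char clamp_s8(int v) {
    return (signed char)(v < -128 ? -128 : v > 127 ? 127 : v);
}

static void common_adjust_scalar(signed char *p0, signed char *q0,
                                 signed char p1, signed char q1,
                                 int use_hev, int hev) {
    int f = clamp_s8(p1 - q1);
    if (use_hev && !hev)
        f = 0;                                         /* f &= hev */
    f = clamp_s8(f + 3 * (*q0 - *p0));
    /* +4/+3 rounding so the two sides round toward each other */
    *q0 = clamp_s8(*q0 - (clamp_s8(f + 4) >> 3));      /* u1 = SQ0 - f1 */
    *p0 = clamp_s8(*p0 + (clamp_s8(f + 3) >> 3));      /* u2 = SP0 + f2 */
}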
-
-.macro vp8_mbfilter
-    Masks
-
-    ;# start the filtering here
-    vxor    v1, v1, v11         ;# SP2
-    vxor    v2, v2, v11         ;# SP1
-    vxor    v3, v3, v11         ;# SP0
-    vxor    v4, v4, v11         ;# SQ0
-    vxor    v5, v5, v11         ;# SQ1
-    vxor    v6, v6, v11         ;# SQ2
-
-    ;# add outer taps if we have high edge variance
-    vsubsbs v13, v2, v5         ;# f  = c (SP1-SQ1)
-
-    vsubsbs v14, v4, v3         ;# SQ0-SP0
-    vaddsbs v13, v13, v14
-    vaddsbs v13, v13, v14
-    vaddsbs v13, v13, v14       ;# f  = c( c(SP1-SQ1) + 3*(SQ0-SP0))
-
-    vandc   v13, v13, v8        ;# f &= ~mask (mask = limit/flimit exceeded)
-    vand    v15, v13, v10       ;# f2 = f & hev
-
-    ;# save bottom 3 bits so that we round one side +4 and the other +3
-    vspltisb v8, 3
-    vspltisb v9, 4
-
-    vaddsbs v14, v15, v9        ;# f1 = c (f+4)
-    vaddsbs v15, v15, v8        ;# f2 = c (f+3)
-
-    vsrab   v14, v14, v8        ;# f1 >>= 3
-    vsrab   v15, v15, v8        ;# f2 >>= 3
-
-    vsubsbs v4, v4, v14         ;# u1 = c (SQ0 - f1)
-    vaddsbs v3, v3, v15         ;# u2 = c (SP0 + f2)
-
-    ;# only apply wider filter if not high edge variance
-    vandc   v13, v13, v10       ;# f &= ~hev
-
-    vspltisb v9, 2
-    vnor    v8, v8, v8
-    vsrb    v9, v8, v9          ;# 0x3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f
-    vupkhsb v9, v9              ;# 0x003f003f003f003f003f003f003f003f
-    vspltisb v8, 9
-
-    ;# roughly 1/7th difference across boundary
-    vspltish v10, 7
-    vmulosb v14, v8, v13        ;# A = c( c(P1-Q1) + 3*(Q0-P0))
-    vmulesb v15, v8, v13
-    vaddshs v14, v14, v9        ;# +=  63
-    vaddshs v15, v15, v9
-    vsrah   v14, v14, v10       ;# >>= 7
-    vsrah   v15, v15, v10
-    vmrglh  v10, v15, v14
-    vmrghh  v15, v15, v14
-
-    vpkshss v10, v15, v10       ;# X = saturated down to bytes
-
-    vsubsbs v6, v6, v10         ;# subtract from Q and add to P
-    vaddsbs v1, v1, v10
-
-    vxor    v6, v6, v11
-    vxor    v1, v1, v11
-
-    ;# roughly 2/7th difference across boundary
-    vspltish v10, 7
-    vaddubm v12, v8, v8
-    vmulosb v14, v12, v13       ;# 18 * f, odd bytes
-    vmulesb v15, v12, v13       ;# 18 * f, even bytes
-    vaddshs v14, v14, v9
-    vaddshs v15, v15, v9
-    vsrah   v14, v14, v10       ;# >>= 7
-    vsrah   v15, v15, v10
-    vmrglh  v10, v15, v14
-    vmrghh  v15, v15, v14
-
-    vpkshss v10, v15, v10       ;# X = saturated down to bytes
-
-    vsubsbs v5, v5, v10         ;# subtract from Q and add to P
-    vaddsbs v2, v2, v10
-
-    vxor    v5, v5, v11
-    vxor    v2, v2, v11
-
-    ;# roughly 3/7th difference across boundary
-    vspltish v10, 7
-    vaddubm v12, v12, v8
-    vmulosb v14, v12, v13       ;# 27 * f, odd bytes
-    vmulesb v15, v12, v13       ;# 27 * f, even bytes
-    vaddshs v14, v14, v9
-    vaddshs v15, v15, v9
-    vsrah   v14, v14, v10       ;# >>= 7
-    vsrah   v15, v15, v10
-    vmrglh  v10, v15, v14
-    vmrghh  v15, v15, v14
-
-    vpkshss v10, v15, v10       ;# X = saturated down to bytes
-
-    vsubsbs v4, v4, v10         ;# subtract from Q and add to P
-    vaddsbs v3, v3, v10
-
-    vxor    v4, v4, v11
-    vxor    v3, v3, v11
-.endm
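The three "roughly k/7th" passes above share one shape: multiply f by 9k in 16 bits, add 63, arithmetic-shift right by 7, saturate back to bytes. Since Q drops by u while P rises by u, the difference across the boundary shrinks by 2u = 2*9k*f/128, which is roughly (k/7)*f; that is where the comments' fractions come from. A hedged scalar sketch, reusing clamp8 from the sketch above (f here is already masked to the non-HEV pels, as in the code):

    /* clamp16 models the saturating halfword add (vaddshs). */
    static int clamp16(int v) { return v < -32768 ? -32768 : v > 32767 ? 32767 : v; }

    /* One wide-filter pass: k = 1 adjusts P2/Q2, k = 2 P1/Q1, k = 3 P0/Q0. */
    static void wide_pass(int *p, int *q, int f, int k) {
      int u = clamp16(9 * k * f + 63) >> 7;  /* vmulosb/vmulesb, vaddshs, vsrah */
      u = clamp8(u);                         /* vpkshss                         */
      *q = clamp8(*q - u);                   /* subtract from Q ...             */
      *p = clamp8(*p + u);                   /* ... and add to P                */
    }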
-
-.macro SBFilter
-    Masks
-
-    common_adjust v3, v4, v2, v5, 1
-
-    ;# outer tap adjustments
-    vspltisb v8, 1
-
-    vaddubm v13, v13, v8        ;# f  += 1
-    vsrab   v13, v13, v8        ;# f >>= 1
-
-    vandc   v13, v13, v10       ;# f &= ~hev
-
-    vsubsbs v5, v5, v13         ;# u1 = c (SQ1 - f)
-    vaddsbs v2, v2, v13         ;# u2 = c (SP1 + f)
-
-    vxor    v2, v2, v11
-    vxor    v3, v3, v11
-    vxor    v4, v4, v11
-    vxor    v5, v5, v11
-.endm
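After common_adjust returns, v13 still holds f1 = c(f+4) >> 3, so the outer-tap step of SBFilter reduces to the following sketch (clamp8 as above; hev again a boolean stand-in for the byte mask):

    /* Outer taps move half of f1, rounded, and only where HEV is false. */
    static void outer_taps(int *p1, int *q1, int f1, int hev) {
      int outer = (f1 + 1) >> 1;          /* vaddubm +1; vsrab >> 1 */
      if (!hev) {
        *q1 = clamp8(*q1 - outer);        /* u1 = c(SQ1 - f)        */
        *p1 = clamp8(*p1 + outer);        /* u2 = c(SP1 + f)        */
      }
    }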
-
-    .align 2
-mbloop_filter_horizontal_edge_y_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    mtspr   256, r12            ;# set VRSAVE
-
-    build_constants r5, r6, r7, v8, v9, v10
-
-    load_data_y
-
-    vp8_mbfilter
-
-    stvx     v1, r7, r6         ;# P2
-    stvx     v2,  0, r6         ;# P1
-    stvx     v3, r7, r3         ;# P0
-    stvx     v4,  0, r3         ;# Q0
-    stvx     v5, r4, r3         ;# Q1
-    stvx     v6, r5, r3         ;# Q2
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;#  r3 unsigned char *s
-;#  r4 int p
-;#  r5 const signed char *flimit
-;#  r6 const signed char *limit
-;#  r7 const signed char *thresh
-loop_filter_horizontal_edge_y_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    mtspr   256, r12            ;# set VRSAVE
-
-    build_constants r5, r6, r7, v8, v9, v10
-
-    load_data_y
-
-    SBFilter
-
-    stvx     v2,  0, r6         ;# P1
-    stvx     v3, r7, r3         ;# P0
-    stvx     v4,  0, r3         ;# Q0
-    stvx     v5, r4, r3         ;# Q1
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-;# Filtering a vertical mb.  Each mb is aligned on a 16 byte boundary,
-;#  so we can read in an entire mb aligned.  However, if we want to filter the
-;#  mb edge we run into problems: the loopfilter requires 4 bytes before the mb
-;#  and 4 after, for a total of 8 bytes.  Reading 16 bytes in order to get 4 is
-;#  a bit of a waste, so this is an even uglier way to get around that.
-;# Using the regular register file, words are read in and then saved back out
-;#  to memory to align and order them.  Only then are they read in through the
-;#  vector register file.  (A scalar C sketch follows the RLVmb/WLVmb macros.)
-.macro RLVmb V, R
-    lwzux   r0, r3, r4
-    stw     r0, 4(\R)
-    lwz     r0,-4(r3)
-    stw     r0, 0(\R)
-    lwzux   r0, r3, r4
-    stw     r0,12(\R)
-    lwz     r0,-4(r3)
-    stw     r0, 8(\R)
-    lvx     \V, 0, \R
-.endm
-
-.macro WLVmb V, R
-    stvx    \V, 0, \R
-    lwz     r0,12(\R)
-    stwux   r0, r3, r4
-    lwz     r0, 8(\R)
-    stw     r0,-4(r3)
-    lwz     r0, 4(\R)
-    stwux   r0, r3, r4
-    lwz     r0, 0(\R)
-    stw     r0,-4(r3)
-.endm
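A scalar C sketch of the staging trick described above: the two unaligned 4-byte reads either side of the edge are parked in an aligned 16-byte buffer so a single aligned lvx can pick up two rows at once. rlv_mb is a hypothetical name for illustration; WLVmb is the exact reverse of this.

    #include <stdint.h>
    #include <string.h>

    /* s points at the edge column of the current row; buf is 16-byte aligned. */
    static void rlv_mb(const uint8_t *s, int pitch, uint8_t buf[16]) {
      memcpy(buf +  0, s - 4,         4);   /* row n, 4 pels before the edge */
      memcpy(buf +  4, s,             4);   /* row n, 4 pels after the edge  */
      memcpy(buf +  8, s + pitch - 4, 4);   /* row n + 1                     */
      memcpy(buf + 12, s + pitch,     4);   /* one lvx then loads all 16     */
    }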
-
-    .align 2
-;#  r3 unsigned char *s
-;#  r4 int p
-;#  r5 const signed char *flimit
-;#  r6 const signed char *limit
-;#  r7 const signed char *thresh
-mbloop_filter_vertical_edge_y_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xc000
-    mtspr   256, r12            ;# set VRSAVE
-
-    la      r9, -48(r1)         ;# temporary space for reading in vectors
-    sub     r3, r3, r4
-
-    RLVmb v0, r9
-    RLVmb v1, r9
-    RLVmb v2, r9
-    RLVmb v3, r9
-    RLVmb v4, r9
-    RLVmb v5, r9
-    RLVmb v6, r9
-    RLVmb v7, r9
-
-    transpose8x16_fwd
-
-    build_constants r5, r6, r7, v8, v9, v10
-
-    vp8_mbfilter
-
-    transpose8x16_inv
-
-    add r3, r3, r4
-    neg r4, r4
-
-    WLVmb v17, r9
-    WLVmb v16, r9
-    WLVmb v15, r9
-    WLVmb v14, r9
-    WLVmb v13, r9
-    WLVmb v12, r9
-    WLVmb v11, r9
-    WLVmb v10, r9
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-.macro RL V, R, P
-    lvx     \V, 0,  \R
-    add     \R, \R, \P
-.endm
-
-.macro WL V, R, P
-    stvx    \V, 0,  \R
-    add     \R, \R, \P
-.endm
-
-.macro Fil P3, P2, P1, P0, Q0, Q1, Q2, Q3
-                                ;# K = |P0-P1| already
-    Abs     v14, v13, \Q0, \Q1  ;# M = |Q0-Q1|
-    vmaxub  v14, v14, v4        ;# M = max( |P0-P1|, |Q0-Q1|)
-    vcmpgtub v10, v14, v0
-
-    Abs     v4, v5, \Q2, \Q3    ;# K = |Q2-Q3| = next |P0-P1|
-
-    max_abs  v14, v13, \Q1, \Q2  ;# M = max( M, |Q1-Q2|)
-    max_abs  v14, v13, \P1, \P2  ;# M = max( M, |P1-P2|)
-    max_abs  v14, v13, \P2, \P3  ;# M = max( M, |P2-P3|)
-
-    vmaxub   v14, v14, v4       ;# M = max interior abs diff
-    vcmpgtub v9, v14, v2        ;# M = true if int_l exceeded
-
-    Abs     v14, v13, \P0, \Q0  ;# X = Abs( P0-Q0)
-    vcmpgtub v8, v14, v3        ;# X = true if edge_l exceeded
-    vor     v8, v8, v9          ;# M = true if edge_l or int_l exceeded
-
-    ;# replace P1,Q1 w/signed versions
-    common_adjust \P0, \Q0, \P1, \Q1, 1
-
-    vaddubm v13, v13, v1        ;# -16 <= M <= 15, saturation irrelevant
-    vsrab   v13, v13, v1
-    vandc   v13, v13, v10       ;# adjust P1,Q1 by (M+1)>>1  if ! hev
-    vsubsbs \Q1, \Q1, v13
-    vaddsbs \P1, \P1, v13
-
-    vxor    \P1, \P1, v11       ;# P1
-    vxor    \P0, \P0, v11       ;# P0
-    vxor    \Q0, \Q0, v11       ;# Q0
-    vxor    \Q1, \Q1, v11       ;# Q1
-.endm
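In boolean terms, the mask Fil rebuilds per group of columns amounts to the model below. This is a hedged scalar reading of this PPC version (flimit, limit and thresh are the function arguments loaded by build_constants), not a bitstream-exact restatement of the VP8 mask:

    #include <stdlib.h>

    /* Nonzero where the edge may be filtered; the asm keeps this as a byte
     * mask and clears f with vandc where it is zero. */
    static int fil_mask(int p3, int p2, int p1, int p0,
                        int q0, int q1, int q2, int q3,
                        int limit, int flimit) {
      int m = abs(p0 - p1);
      if (abs(q0 - q1) > m) m = abs(q0 - q1);
      if (abs(q1 - q2) > m) m = abs(q1 - q2);
      if (abs(p1 - p2) > m) m = abs(p1 - p2);
      if (abs(p2 - p3) > m) m = abs(p2 - p3);
      if (abs(q2 - q3) > m) m = abs(q2 - q3);
      return !(m > limit || abs(p0 - q0) > flimit);
    }

    /* HEV (high edge variance) is the separate, tighter test:
     *   hev = max(|p0 - p1|, |q0 - q1|) > thresh                */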
-
-
-    .align 2
-;#  r3 unsigned char *s
-;#  r4 int p
-;#  r5 const signed char *flimit
-;#  r6 const signed char *limit
-;#  r7 const signed char *thresh
-loop_filter_vertical_edge_y_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xffff
-    mtspr   256, r12            ;# set VRSAVE
-
-    addi    r9, r3, 0
-    RL      v16, r9, r4
-    RL      v17, r9, r4
-    RL      v18, r9, r4
-    RL      v19, r9, r4
-    RL      v20, r9, r4
-    RL      v21, r9, r4
-    RL      v22, r9, r4
-    RL      v23, r9, r4
-    RL      v24, r9, r4
-    RL      v25, r9, r4
-    RL      v26, r9, r4
-    RL      v27, r9, r4
-    RL      v28, r9, r4
-    RL      v29, r9, r4
-    RL      v30, r9, r4
-    lvx     v31, 0, r9
-
-    Transpose16x16
-
-    vspltisb v1, 1
-
-    build_constants r5, r6, r7, v3, v2, v0
-
-    Abs v4, v5, v19, v18                            ;# K(v14) = first |P0-P1|
-
-    Fil v16, v17, v18, v19,  v20, v21, v22, v23
-    Fil v20, v21, v22, v23,  v24, v25, v26, v27
-    Fil v24, v25, v26, v27,  v28, v29, v30, v31
-
-    Transpose16x16
-
-    addi    r9, r3, 0
-    WL      v16, r9, r4
-    WL      v17, r9, r4
-    WL      v18, r9, r4
-    WL      v19, r9, r4
-    WL      v20, r9, r4
-    WL      v21, r9, r4
-    WL      v22, r9, r4
-    WL      v23, r9, r4
-    WL      v24, r9, r4
-    WL      v25, r9, r4
-    WL      v26, r9, r4
-    WL      v27, r9, r4
-    WL      v28, r9, r4
-    WL      v29, r9, r4
-    WL      v30, r9, r4
-    stvx    v31, 0, r9
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-;# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- UV FILTERING -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-.macro active_chroma_sel V
-    andi.   r7, r3, 8       ;# row origin modulo 16
-    add     r7, r7, r7      ;# selects selectors
-    lis     r12, _chromaSelectors@ha
-    la      r0,  _chromaSelectors@l(r12)
-    lwzux   r0, r7, r0      ;# leave selector addr in r7
-
-    lvx     \V, 0, r0       ;# mask to concatenate active U,V pels
-.endm
-
-.macro hread_uv Dest, U, V, Offs, VMask
-    lvx     \U, \Offs, r3
-    lvx     \V, \Offs, r4
-    vperm   \Dest, \U, \V, \VMask   ;# Dest = active part of U then V
-.endm
-
-.macro hwrite_uv New, U, V, Offs, Umask, Vmask
-    vperm   \U, \New, \U, \Umask    ;# Combine new pels with siblings
-    vperm   \V, \New, \V, \Vmask
-    stvx    \U, \Offs, r3           ;# Write to frame buffer
-    stvx    \V, \Offs, r4
-.endm
-
-;# Process U,V in parallel.
-.macro load_chroma_h
-    neg     r9, r5          ;# r9 = -1 * stride
-    add     r8, r9, r9      ;# r8 = -2 * stride
-    add     r10, r5, r5     ;# r10 = 2 * stride
-
-    active_chroma_sel v12
-
-    ;# P3, Q3 are read-only; need not save addresses or sibling pels
-    add     r6, r8, r8      ;# r6 = -4 * stride
-    hread_uv v0, v14, v15, r6, v12
-    add     r6, r10, r5     ;# r6 =  3 * stride
-    hread_uv v7, v14, v15, r6, v12
-
-    ;# Others are read/write; save addresses and sibling pels
-
-    add     r6, r8, r9      ;# r6 = -3 * stride
-    hread_uv v1, v16, v17, r6,  v12
-    hread_uv v2, v18, v19, r8,  v12
-    hread_uv v3, v20, v21, r9,  v12
-    hread_uv v4, v22, v23, 0,   v12
-    hread_uv v5, v24, v25, r5,  v12
-    hread_uv v6, v26, v27, r10, v12
-.endm
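hread_uv leans on vperm semantics: mask byte i selects result byte i from the 32-byte concatenation of the two sources, with indices 0-15 taken from the first operand and 16-31 from the second. The _B_hihi selector ({0..7, 16..23}, defined in the .data section at the end of this file) therefore packs 8 active U pels and 8 active V pels into a single vector, which is what lets both chroma planes go through the filter in one pass. A sketch of that selection, under the assumptions above:

    #include <stdint.h>

    /* vperm model: dest[i] = (u:v)[sel[i]] */
    static void hread_uv_sketch(uint8_t dest[16], const uint8_t u[16],
                                const uint8_t v[16], const uint8_t sel[16]) {
      int i;
      for (i = 0; i < 16; i++)
        dest[i] = sel[i] < 16 ? u[sel[i]] : v[sel[i] - 16];
    }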
-
-.macro uresult_sel V
-    load_g   \V, 4(r7)
-.endm
-
-.macro vresult_sel V
-    load_g   \V, 8(r7)
-.endm
-
-;# always write P1,P0,Q0,Q1
-.macro store_chroma_h
-    uresult_sel v11
-    vresult_sel v12
-    hwrite_uv v2, v18, v19, r8, v11, v12
-    hwrite_uv v3, v20, v21, r9, v11, v12
-    hwrite_uv v4, v22, v23, 0,  v11, v12
-    hwrite_uv v5, v24, v25, r5, v11, v12
-.endm
-
-    .align 2
-;#  r3 unsigned char *u
-;#  r4 unsigned char *v
-;#  r5 int p
-;#  r6 const signed char *flimit
-;#  r7 const signed char *limit
-;#  r8 const signed char *thresh
-mbloop_filter_horizontal_edge_uv_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xffff
-    mtspr   256, r12            ;# set VRSAVE
-
-    build_constants r6, r7, r8, v8, v9, v10
-
-    load_chroma_h
-
-    vp8_mbfilter
-
-    store_chroma_h
-
-    hwrite_uv v1, v16, v17, r6,  v11, v12    ;# v1 == P2
-    hwrite_uv v6, v26, v27, r10, v11, v12    ;# v6 == Q2
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;#  r3 unsigned char *u
-;#  r4 unsigned char *v
-;#  r5 int p
-;#  r6 const signed char *flimit
-;#  r7 const signed char *limit
-;#  r8 const signed char *thresh
-loop_filter_horizontal_edge_uv_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xffff
-    mtspr   256, r12            ;# set VRSAVE
-
-    build_constants r6, r7, r8, v8, v9, v10
-
-    load_chroma_h
-
-    SBFilter
-
-    store_chroma_h
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-.macro R V, R
-    lwzux   r0, r3, r5
-    stw     r0, 4(\R)
-    lwz     r0,-4(r3)
-    stw     r0, 0(\R)
-    lwzux   r0, r4, r5
-    stw     r0,12(\R)
-    lwz     r0,-4(r4)
-    stw     r0, 8(\R)
-    lvx     \V, 0, \R
-.endm
-
-
-.macro W V, R
-    stvx    \V, 0, \R
-    lwz     r0,12(\R)
-    stwux   r0, r4, r5
-    lwz     r0, 8(\R)
-    stw     r0,-4(r4)
-    lwz     r0, 4(\R)
-    stwux   r0, r3, r5
-    lwz     r0, 0(\R)
-    stw     r0,-4(r3)
-.endm
-
-.macro chroma_vread R
-    sub r3, r3, r5          ;# back up one line for simplicity
-    sub r4, r4, r5
-
-    R v0, \R
-    R v1, \R
-    R v2, \R
-    R v3, \R
-    R v4, \R
-    R v5, \R
-    R v6, \R
-    R v7, \R
-
-    transpose8x16_fwd
-.endm
-
-.macro chroma_vwrite R
-
-    transpose8x16_inv
-
-    add     r3, r3, r5
-    add     r4, r4, r5
-    neg     r5, r5          ;# Write rows back in reverse order
-
-    W v17, \R
-    W v16, \R
-    W v15, \R
-    W v14, \R
-    W v13, \R
-    W v12, \R
-    W v11, \R
-    W v10, \R
-.endm
-
-    .align 2
-;#  r3 unsigned char *u
-;#  r4 unsigned char *v
-;#  r5 int p
-;#  r6 const signed char *flimit
-;#  r7 const signed char *limit
-;#  r8 const signed char *thresh
-mbloop_filter_vertical_edge_uv_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xc000
-    mtspr   256, r12            ;# set VRSAVE
-
-    la      r9, -48(r1)         ;# temporary space for reading in vectors
-
-    chroma_vread r9
-
-    build_constants r6, r7, r8, v8, v9, v10
-
-    vp8_mbfilter
-
-    chroma_vwrite r9
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;#  r3 unsigned char *u
-;#  r4 unsigned char *v
-;#  r5 int p
-;#  r6 const signed char *flimit
-;#  r7 const signed char *limit
-;#  r8 const signed char *thresh
-loop_filter_vertical_edge_uv_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xc000
-    mtspr   256, r12            ;# set VRSAVE
-
-    la      r9, -48(r1)         ;# temporary space for reading in vectors
-
-    chroma_vread r9
-
-    build_constants r6, r7, r8, v8, v9, v10
-
-    SBFilter
-
-    chroma_vwrite r9
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-;# -=-=-=-=-=-=-=-=-=-=-=-=-=-= SIMPLE LOOP FILTER =-=-=-=-=-=-=-=-=-=-=-=-=-=-
-
-.macro vp8_simple_filter
-    Abs v14, v13, v1, v2    ;# M = abs( P0 - Q0)
-    vcmpgtub v8, v14, v8    ;# v8 = true if _over_ limit
-
-    ;# preserve unsigned v0 and v3
-    common_adjust v1, v2, v0, v3, 0
-
-    vxor v1, v1, v11
-    vxor v2, v2, v11        ;# cvt Q0, P0 back to pels
-.endm
-
-.macro simple_vertical
-    addi    r8,  0, 16
-    addi    r7, r5, 32
-
-    lvx     v0,  0, r5
-    lvx     v1, r8, r5
-    lvx     v2,  0, r7
-    lvx     v3, r8, r7
-
-    lis     r12, _B_hihi@ha
-    la      r0,  _B_hihi@l(r12)
-    lvx     v16, 0, r0
-
-    lis     r12, _B_lolo@ha
-    la      r0,  _B_lolo@l(r12)
-    lvx     v17, 0, r0
-
-    Transpose4times4x4 v16, v17
-    vp8_simple_filter
-
-    vxor v0, v0, v11
-    vxor v3, v3, v11        ;# cvt P1, Q1 back to pels
-
-    Transpose4times4x4 v16, v17
-
-    stvx    v0,  0, r5
-    stvx    v1, r8, r5
-    stvx    v2,  0, r7
-    stvx    v3, r8, r7
-.endm
-
-    .align 2
-;#  r3 unsigned char *s
-;#  r4 int p
-;#  r5 const signed char *flimit
-loop_filter_simple_horizontal_edge_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    mtspr   256, r12            ;# set VRSAVE
-
-    ;# build constants
-    lvx     v8, 0, r5           ;# flimit
-
-    vspltisb v11, 8
-    vspltisb v12, 4
-    vslb    v11, v11, v12       ;# 0x80808080808080808080808080808080
-
-    neg     r5, r4              ;# r5 = -1 * stride
-    add     r6, r5, r5          ;# r6 = -2 * stride
-
-    lvx     v0, r6, r3          ;# v0 = P1 = 16 pels two rows above edge
-    lvx     v1, r5, r3          ;# v1 = P0 = 16 pels one row  above edge
-    lvx     v2,  0, r3          ;# v2 = Q0 = 16 pels one row  below edge
-    lvx     v3, r4, r3          ;# v3 = Q1 = 16 pels two rows below edge
-
-    vp8_simple_filter
-
-    stvx    v1, r5, r3          ;# store P0
-    stvx    v2,  0, r3          ;# store Q0
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-.macro RLV Offs
-    stw     r0, (\Offs*4)(r5)
-    lwzux   r0, r7, r4
-.endm
-
-.macro WLV Offs
-    lwz     r0, (\Offs*4)(r5)
-    stwux   r0, r7, r4
-.endm
-
-    .align 2
-;#  r3 unsigned char *s
-;#  r4 int p
-;#  r5 const signed char *flimit
-loop_filter_simple_vertical_edge_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xc000
-    mtspr   256, r12            ;# set VRSAVE
-
-    ;# build constants
-    lvx     v8, 0, r5           ;# flimit
-
-    vspltisb v11, 8
-    vspltisb v12, 4
-    vslb    v11, v11, v12       ;# 0x80808080808080808080808080808080
-
-    la r5, -96(r1)              ;# temporary space for reading in vectors
-
-    ;# Store 4 pels at word "Offs" in temp array, then advance r7
-    ;#   to next row and read another 4 pels from the frame buffer.
-
-    subi    r7, r3,  2          ;# r7 -> 2 pels before start
-    lwzx    r0,  0, r7          ;# read first 4 pels
-
-    ;# 16 unaligned word accesses
-    RLV 0
-    RLV 4
-    RLV 8
-    RLV 12
-    RLV 1
-    RLV 5
-    RLV 9
-    RLV 13
-    RLV 2
-    RLV 6
-    RLV 10
-    RLV 14
-    RLV 3
-    RLV 7
-    RLV 11
-
-    stw     r0, (15*4)(r5)      ;# write last 4 pels
-
-    simple_vertical
-
-    ;# Read temp array, write frame buffer.
-    subi    r7, r3,  2          ;# r7 -> 2 pels before start
-    lwzx    r0,  0, r5          ;# read/write first 4 pels
-    stwx    r0,  0, r7
-
-    WLV 4
-    WLV 8
-    WLV 12
-    WLV 1
-    WLV 5
-    WLV 9
-    WLV 13
-    WLV 2
-    WLV 6
-    WLV 10
-    WLV 14
-    WLV 3
-    WLV 7
-    WLV 11
-    WLV 15
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
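The interleaved RLV order above (0, 4, 8, 12, 1, 5, ...) is not arbitrary: the 4 pels straddling the edge in row r land at word slot (r & 3)*4 + (r >> 2) of the temp array, so each of the four vectors that simple_vertical later loads holds rows r, r+4, r+8 and r+12, ready for the Transpose4times4x4 step. A C sketch of the gather under that reading (rlv_gather is a hypothetical name):

    #include <stdint.h>
    #include <string.h>

    static void rlv_gather(const uint8_t *s, int pitch, uint8_t buf[64]) {
      int r;
      for (r = 0; r < 16; r++)   /* 2 pels before the edge, 2 after */
        memcpy(buf + 4 * ((r & 3) * 4 + (r >> 2)), s + r * pitch - 2, 4);
    }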
-
-    .data
-
-_chromaSelectors:
-    .long   _B_hihi
-    .long   _B_Ures0
-    .long   _B_Vres0
-    .long   0
-    .long   _B_lolo
-    .long   _B_Ures8
-    .long   _B_Vres8
-    .long   0
-
-    .align 4
-_B_Vres8:
-    .byte   16, 17, 18, 19, 20, 21, 22, 23,  8,  9, 10, 11, 12, 13, 14, 15
-
-    .align 4
-_B_Ures8:
-    .byte   16, 17, 18, 19, 20, 21, 22, 23,  0,  1,  2,  3,  4,  5,  6,  7
-
-    .align 4
-_B_lolo:
-    .byte    8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
-
-    .align 4
-_B_Vres0:
-    .byte    8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
-    .align 4
-_B_Ures0:
-    .byte    0,  1,  2,  3,  4,  5,  6,  7, 24, 25, 26, 27, 28, 29, 30, 31
-
-    .align 4
-_B_hihi:
-    .byte    0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
--- a/vp8/common/ppc/platform_altivec.asm
+++ /dev/null
@@ -1,59 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl save_platform_context
-    .globl restore_platform_context
-
-.macro W V P
-    stvx    \V,  0, \P
-    addi    \P, \P, 16
-.endm
-
-.macro R V P
-    lvx     \V,  0, \P
-    addi    \P, \P, 16
-.endm
-
-;# r3 context_ptr
-    .align 2
-save_platform_context:
-    W v20, r3
-    W v21, r3
-    W v22, r3
-    W v23, r3
-    W v24, r3
-    W v25, r3
-    W v26, r3
-    W v27, r3
-    W v28, r3
-    W v29, r3
-    W v30, r3
-    W v31, r3
-
-    blr
-
-;# r3 context_ptr
-    .align 2
-restore_platform_context:
-    R v20, r3
-    R v21, r3
-    R v22, r3
-    R v23, r3
-    R v24, r3
-    R v25, r3
-    R v26, r3
-    R v27, r3
-    R v28, r3
-    R v29, r3
-    R v30, r3
-    R v31, r3
-
-    blr
--- a/vp8/common/ppc/recon_altivec.asm
+++ /dev/null
@@ -1,175 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl recon4b_ppc
-    .globl recon2b_ppc
-    .globl recon_b_ppc
-
-.macro row_of16 Diff Pred Dst Stride
-    lvx     v1,  0, \Pred           ;# v1 = pred = p0..p15
-    addi    \Pred, \Pred, 16        ;# next pred
-    vmrghb  v2, v0, v1              ;# v2 = 16-bit p0..p7
-    lvx     v3,  0, \Diff           ;# v3 = d0..d7
-    vaddshs v2, v2, v3              ;# v2 = r0..r7
-    vmrglb  v1, v0, v1              ;# v1 = 16-bit p8..p15
-    lvx     v3, r8, \Diff           ;# v3 = d8..d15
-    addi    \Diff, \Diff, 32        ;# next diff
-    vaddshs v3, v3, v1              ;# v3 = r8..r15
-    vpkshus v2, v2, v3              ;# v2 = 8-bit r0..r15
-    stvx    v2,  0, \Dst            ;# to dst
-    add     \Dst, \Dst, \Stride     ;# next dst
-.endm
-
-    .text
-    .align 2
-;#  r3 = short *diff_ptr,
-;#  r4 = unsigned char *pred_ptr,
-;#  r5 = unsigned char *dst_ptr,
-;#  r6 = int stride
-recon4b_ppc:
-    mfspr   r0, 256                     ;# get old VRSAVE
-    stw     r0, -8(r1)                  ;# save old VRSAVE to stack
-    oris    r0, r0, 0xf000
-    mtspr   256,r0                      ;# set VRSAVE
-
-    vxor    v0, v0, v0
-    li      r8, 16
-
-    row_of16 r3, r4, r5, r6
-    row_of16 r3, r4, r5, r6
-    row_of16 r3, r4, r5, r6
-    row_of16 r3, r4, r5, r6
-
-    lwz     r12, -8(r1)                 ;# restore old VRSAVE from stack
-    mtspr   256, r12                    ;# reset old VRSAVE
-
-    blr
-
-.macro two_rows_of8 Diff Pred Dst Stride write_first_four_pels
-    lvx     v1,  0, \Pred       ;# v1 = pred = p0..p15
-    vmrghb  v2, v0, v1          ;# v2 = 16-bit p0..p7
-    lvx     v3,  0, \Diff       ;# v3 = d0..d7
-    vaddshs v2, v2, v3          ;# v2 = r0..r7
-    vmrglb  v1, v0, v1          ;# v1 = 16-bit p8..p15
-    lvx     v3, r8, \Diff       ;# v3 = d8..d15
-    vaddshs v3, v3, v1          ;# v3 = r8..r15
-    vpkshus v2, v2, v3          ;# v2 = 8-bit r0..r15
-    stvx    v2,  0, r10         ;# 2 rows to dst from buf
-    lwz     r0, 0(r10)
-.if \write_first_four_pels
-    stw     r0, 0(\Dst)
-    .else
-    stwux   r0, \Dst, \Stride
-.endif
-    lwz     r0, 4(r10)
-    stw     r0, 4(\Dst)
-    lwz     r0, 8(r10)
-    stwux   r0, \Dst, \Stride       ;# advance dst to next row
-    lwz     r0, 12(r10)
-    stw     r0, 4(\Dst)
-.endm
-
-    .align 2
-;#  r3 = short *diff_ptr,
-;#  r4 = unsigned char *pred_ptr,
-;#  r5 = unsigned char *dst_ptr,
-;#  r6 = int stride
-
-recon2b_ppc:
-    mfspr   r0, 256                     ;# get old VRSAVE
-    stw     r0, -8(r1)                  ;# save old VRSAVE to stack
-    oris    r0, r0, 0xf000
-    mtspr   256,r0                      ;# set VRSAVE
-
-    vxor    v0, v0, v0
-    li      r8, 16
-
-    la      r10, -48(r1)                ;# buf
-
-    two_rows_of8 r3, r4, r5, r6, 1
-
-    addi    r4, r4, 16                  ;# next pred
-    addi    r3, r3, 32                  ;# next diff
-
-    two_rows_of8 r3, r4, r5, r6, 0
-
-    lwz     r12, -8(r1)                 ;# restore old VRSAVE from stack
-    mtspr   256, r12                    ;# reset old VRSAVE
-
-    blr
-
-.macro get_two_diff_rows
-    stw     r0, 0(r10)
-    lwz     r0, 4(r3)
-    stw     r0, 4(r10)
-    lwzu    r0, 32(r3)
-    stw     r0, 8(r10)
-    lwz     r0, 4(r3)
-    stw     r0, 12(r10)
-    lvx     v3, 0, r10
-.endm
-
-    .align 2
-;#  r3 = short *diff_ptr,
-;#  r4 = unsigned char *pred_ptr,
-;#  r5 = unsigned char *dst_ptr,
-;#  r6 = int stride
-recon_b_ppc:
-    mfspr   r0, 256                     ;# get old VRSAVE
-    stw     r0, -8(r1)                  ;# save old VRSAVE to stack
-    oris    r0, r0, 0xf000
-    mtspr   256,r0                      ;# set VRSAVE
-
-    vxor    v0, v0, v0
-
-    la      r10, -48(r1)    ;# buf
-
-    lwz     r0, 0(r4)
-    stw     r0, 0(r10)
-    lwz     r0, 16(r4)
-    stw     r0, 4(r10)
-    lwz     r0, 32(r4)
-    stw     r0, 8(r10)
-    lwz     r0, 48(r4)
-    stw     r0, 12(r10)
-
-    lvx     v1,  0, r10     ;# v1 = pred = p0..p15
-
-    lwz r0, 0(r3)           ;# v3 = d0..d7
-
-    get_two_diff_rows
-
-    vmrghb  v2, v0, v1      ;# v2 = 16-bit p0..p7
-    vaddshs v2, v2, v3      ;# v2 = r0..r7
-
-    lwzu r0, 32(r3)         ;# v3 = d8..d15
-
-    get_two_diff_rows
-
-    vmrglb  v1, v0, v1      ;# v1 = 16-bit p8..p15
-    vaddshs v3, v3, v1      ;# v3 = r8..r15
-
-    vpkshus v2, v2, v3      ;# v2 = 8-bit r0..r15
-    stvx    v2,  0, r10     ;# 16 pels to dst from buf
-
-    lwz     r0, 0(r10)
-    stw     r0, 0(r5)
-    lwz     r0, 4(r10)
-    stwux   r0, r5, r6
-    lwz     r0, 8(r10)
-    stwux   r0, r5, r6
-    lwz     r0, 12(r10)
-    stwx    r0, r5, r6
-
-    lwz     r12, -8(r1)                 ;# restore old VRSAVE from stack
-    mtspr   256, r12                    ;# reset old VRSAVE
-
-    blr
--- a/vp8/common/ppc/systemdependent.c
+++ /dev/null
@@ -1,167 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "subpixel.h"
-#include "loopfilter.h"
-#include "recon.h"
-#include "idct.h"
-#include "onyxc_int.h"
-
-void (*vp8_short_idct4x4)(short *input, short *output, int pitch);
-void (*vp8_short_idct4x4_1)(short *input, short *output, int pitch);
-void (*vp8_dc_only_idct)(short input_dc, short *output, int pitch);
-
-extern void (*vp9_post_proc_down_and_across)(
-  unsigned char *src_ptr,
-  unsigned char *dst_ptr,
-  int src_pixels_per_line,
-  int dst_pixels_per_line,
-  int rows,
-  int cols,
-  int flimit
-);
-
-extern void (*vp9_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols, int flimit);
-extern void vp9_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit);
-extern void (*vp9_mbpost_proc_across_ip)(unsigned char *src, int pitch, int rows, int cols, int flimit);
-extern void vp9_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit);
-
-extern void vp9_post_proc_down_and_across_c
-(
-  unsigned char *src_ptr,
-  unsigned char *dst_ptr,
-  int src_pixels_per_line,
-  int dst_pixels_per_line,
-  int rows,
-  int cols,
-  int flimit
-);
-void vp9_plane_add_noise_c(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a);
-
-extern copy_mem_block_function *vp9_copy_mem16x16;
-extern copy_mem_block_function *vp9_copy_mem8x8;
-extern copy_mem_block_function *vp9_copy_mem8x4;
-
-// PPC
-extern subpixel_predict_function sixtap_predict_ppc;
-extern subpixel_predict_function sixtap_predict8x4_ppc;
-extern subpixel_predict_function sixtap_predict8x8_ppc;
-extern subpixel_predict_function sixtap_predict16x16_ppc;
-extern subpixel_predict_function bilinear_predict4x4_ppc;
-extern subpixel_predict_function bilinear_predict8x4_ppc;
-extern subpixel_predict_function bilinear_predict8x8_ppc;
-extern subpixel_predict_function bilinear_predict16x16_ppc;
-
-extern copy_mem_block_function copy_mem16x16_ppc;
-
-void recon_b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-void recon2b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-void recon4b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-
-extern void short_idct4x4llm_ppc(short *input, short *output, int pitch);
-
-// Generic C
-extern subpixel_predict_function vp9_sixtap_predict_c;
-extern subpixel_predict_function vp9_sixtap_predict8x4_c;
-extern subpixel_predict_function vp9_sixtap_predict8x8_c;
-extern subpixel_predict_function vp9_sixtap_predict16x16_c;
-extern subpixel_predict_function vp9_bilinear_predict4x4_c;
-extern subpixel_predict_function vp9_bilinear_predict8x4_c;
-extern subpixel_predict_function vp9_bilinear_predict8x8_c;
-extern subpixel_predict_function vp9_bilinear_predict16x16_c;
-
-extern copy_mem_block_function vp9_copy_mem16x16_c;
-extern copy_mem_block_function vp9_copy_mem8x8_c;
-extern copy_mem_block_function vp9_copy_mem8x4_c;
-
-void vp9_recon_b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-void vp9_recon2b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-void vp9_recon4b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-
-extern void vp9_short_idct4x4llm_1_c(short *input, short *output, int pitch);
-extern void vp9_short_idct4x4llm_c(short *input, short *output, int pitch);
-extern void vp8_dc_only_idct_c(short input_dc, short *output, int pitch);
-
-// PPC
-extern loop_filter_block_function loop_filter_mbv_ppc;
-extern loop_filter_block_function loop_filter_bv_ppc;
-extern loop_filter_block_function loop_filter_mbh_ppc;
-extern loop_filter_block_function loop_filter_bh_ppc;
-
-extern loop_filter_block_function loop_filter_mbvs_ppc;
-extern loop_filter_block_function loop_filter_bvs_ppc;
-extern loop_filter_block_function loop_filter_mbhs_ppc;
-extern loop_filter_block_function loop_filter_bhs_ppc;
-
-// Generic C
-extern loop_filter_block_function vp9_loop_filter_mbv_c;
-extern loop_filter_block_function vp9_loop_filter_bv_c;
-extern loop_filter_block_function vp9_loop_filter_mbh_c;
-extern loop_filter_block_function vp9_loop_filter_bh_c;
-
-extern loop_filter_block_function vp9_loop_filter_mbvs_c;
-extern loop_filter_block_function vp9_loop_filter_bvs_c;
-extern loop_filter_block_function vp9_loop_filter_mbhs_c;
-extern loop_filter_block_function vp9_loop_filter_bhs_c;
-
-extern loop_filter_block_function *vp8_lf_mbvfull;
-extern loop_filter_block_function *vp8_lf_mbhfull;
-extern loop_filter_block_function *vp8_lf_bvfull;
-extern loop_filter_block_function *vp8_lf_bhfull;
-
-extern loop_filter_block_function *vp8_lf_mbvsimple;
-extern loop_filter_block_function *vp8_lf_mbhsimple;
-extern loop_filter_block_function *vp8_lf_bvsimple;
-extern loop_filter_block_function *vp8_lf_bhsimple;
-
-void vp9_clear_c(void) {
-}
-
-void vp9_machine_specific_config(void) {
-  // Pure C:
-  vp9_clear_system_state        = vp9_clear_c;
-  vp9_recon_b                   = vp9_recon_b_c;
-  vp9_recon4b                   = vp9_recon4b_c;
-  vp9_recon2b                   = vp9_recon2b_c;
-
-  vp9_bilinear_predict16x16     = bilinear_predict16x16_ppc;
-  vp9_bilinear_predict8x8       = bilinear_predict8x8_ppc;
-  vp9_bilinear_predict8x4       = bilinear_predict8x4_ppc;
-  vp8_bilinear_predict          = bilinear_predict4x4_ppc;
-
-  vp9_sixtap_predict16x16       = sixtap_predict16x16_ppc;
-  vp9_sixtap_predict8x8         = sixtap_predict8x8_ppc;
-  vp9_sixtap_predict8x4         = sixtap_predict8x4_ppc;
-  vp9_sixtap_predict            = sixtap_predict_ppc;
-
-  vp8_short_idct4x4_1           = vp9_short_idct4x4llm_1_c;
-  vp8_short_idct4x4             = short_idct4x4llm_ppc;
-  vp8_dc_only_idct              = vp8_dc_only_idct_c;
-
-  vp8_lf_mbvfull                = loop_filter_mbv_ppc;
-  vp8_lf_bvfull                 = loop_filter_bv_ppc;
-  vp8_lf_mbhfull                = loop_filter_mbh_ppc;
-  vp8_lf_bhfull                 = loop_filter_bh_ppc;
-
-  vp8_lf_mbvsimple              = loop_filter_mbvs_ppc;
-  vp8_lf_bvsimple               = loop_filter_bvs_ppc;
-  vp8_lf_mbhsimple              = loop_filter_mbhs_ppc;
-  vp8_lf_bhsimple               = loop_filter_bhs_ppc;
-
-  vp9_post_proc_down_and_across = vp9_post_proc_down_and_across_c;
-  vp9_mbpost_proc_down          = vp9_mbpost_proc_down_c;
-  vp9_mbpost_proc_across_ip     = vp9_mbpost_proc_across_ip_c;
-  vp9_plane_add_noise           = vp9_plane_add_noise_c;
-
-  vp9_copy_mem16x16             = copy_mem16x16_ppc;
-  vp9_copy_mem8x8               = vp9_copy_mem8x8_c;
-  vp9_copy_mem8x4               = vp9_copy_mem8x4_c;
-
-}
--- a/vp8/common/ppflags.h
+++ /dev/null
@@ -1,38 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_PPFLAGS_H
-#define __INC_PPFLAGS_H
-enum {
-  VP9D_NOFILTERING            = 0,
-  VP9D_DEBLOCK                = 1 << 0,
-  VP9D_DEMACROBLOCK           = 1 << 1,
-  VP9D_ADDNOISE               = 1 << 2,
-  VP9D_DEBUG_TXT_FRAME_INFO   = 1 << 3,
-  VP9D_DEBUG_TXT_MBLK_MODES   = 1 << 4,
-  VP9D_DEBUG_TXT_DC_DIFF      = 1 << 5,
-  VP9D_DEBUG_TXT_RATE_INFO    = 1 << 6,
-  VP9D_DEBUG_DRAW_MV          = 1 << 7,
-  VP9D_DEBUG_CLR_BLK_MODES    = 1 << 8,
-  VP9D_DEBUG_CLR_FRM_REF_BLKS = 1 << 9
-};
-
-typedef struct {
-  int post_proc_flag;
-  int deblocking_level;
-  int noise_level;
-  int display_ref_frame_flag;
-  int display_mb_modes_flag;
-  int display_b_modes_flag;
-  int display_mv_flag;
-} vp9_ppflags_t;
-
-#endif
--- a/vp8/common/pragmas.h
+++ /dev/null
@@ -1,19 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-
-
-#ifdef __INTEL_COMPILER
-#pragma warning(disable:997 1011 170)
-#endif
-#ifdef _MSC_VER
-#pragma warning(disable:4799)
-#endif
--- a/vp8/common/pred_common.c
+++ /dev/null
@@ -1,463 +1,0 @@
-
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vp8/common/pred_common.h"
-#include "vp8/common/seg_common.h"
-
-// TBD prediction functions for various bitstream signals
-
-// Returns a context number for the given MB prediction signal
-unsigned char vp9_get_pred_context(const VP9_COMMON *const cm,
-                                   const MACROBLOCKD *const xd,
-                                   PRED_ID pred_id) {
-  int pred_context;
-  MODE_INFO *m = xd->mode_info_context;
-
-  // Note:
-  // The mode info data structure has a one element border above and to the
-  // left of the entries corresponding to real macroblocks.
-  // The prediction flags in these dummy entries are initialised to 0.
-  switch (pred_id) {
-    case PRED_SEG_ID:
-      pred_context = (m - 1)->mbmi.seg_id_predicted +
-                     (m - cm->mode_info_stride)->mbmi.seg_id_predicted;
-      break;
-
-
-    case PRED_REF:
-      pred_context = (m - 1)->mbmi.ref_predicted +
-                     (m - cm->mode_info_stride)->mbmi.ref_predicted;
-      break;
-
-    case PRED_COMP:
-      // Context based on use of comp pred flag by neighbours
-      // pred_context =
-      //   ((m - 1)->mbmi.second_ref_frame != INTRA_FRAME) +
-      //    ((m - cm->mode_info_stride)->mbmi.second_ref_frame != INTRA_FRAME);
-
-      // Context based on mode and reference frame
-      // if ( m->mbmi.ref_frame == LAST_FRAME )
-      //    pred_context = 0 + (m->mbmi.mode != ZEROMV);
-      // else if ( m->mbmi.ref_frame == GOLDEN_FRAME )
-      //    pred_context = 2 + (m->mbmi.mode != ZEROMV);
-      // else
-      //    pred_context = 4 + (m->mbmi.mode != ZEROMV);
-
-      if (m->mbmi.ref_frame == LAST_FRAME)
-        pred_context = 0;
-      else
-        pred_context = 1;
-
-      break;
-
-    case PRED_MBSKIP:
-      pred_context = (m - 1)->mbmi.mb_skip_coeff +
-                     (m - cm->mode_info_stride)->mbmi.mb_skip_coeff;
-      break;
-
-    case PRED_SWITCHABLE_INTERP:
-      {
-        int left_in_image = (m - 1)->mbmi.mb_in_image;
-        int above_in_image = (m - cm->mode_info_stride)->mbmi.mb_in_image;
-        int left_mode = (m - 1)->mbmi.mode;
-        int above_mode = (m - cm->mode_info_stride)->mbmi.mode;
-        int left_interp, above_interp;
-        if (left_in_image && left_mode >= NEARESTMV && left_mode <= SPLITMV)
-          left_interp = vp9_switchable_interp_map[(m - 1)->mbmi.interp_filter];
-        else
-          left_interp = VP9_SWITCHABLE_FILTERS;
-        if (above_in_image && above_mode >= NEARESTMV && above_mode <= SPLITMV)
-          above_interp = vp9_switchable_interp_map[
-              (m - cm->mode_info_stride)->mbmi.interp_filter];
-        else
-          above_interp = VP9_SWITCHABLE_FILTERS;
-
-        if (left_interp == above_interp)
-          pred_context = left_interp;
-        else if (left_interp == VP9_SWITCHABLE_FILTERS &&
-                 above_interp != VP9_SWITCHABLE_FILTERS)
-          pred_context = above_interp;
-        else if (left_interp != VP9_SWITCHABLE_FILTERS &&
-                 above_interp == VP9_SWITCHABLE_FILTERS)
-          pred_context = left_interp;
-        else
-          pred_context = VP9_SWITCHABLE_FILTERS;
-      }
-      break;
-
-    default:
-      // TODO *** add error trap code.
-      pred_context = 0;
-      break;
-  }
-
-  return pred_context;
-}
-
-// This function returns a context probability for coding a given
-// prediction signal
-vp9_prob vp9_get_pred_prob(const VP9_COMMON *const cm,
-                          const MACROBLOCKD *const xd,
-                          PRED_ID pred_id) {
-  vp9_prob pred_probability;
-  int pred_context;
-
-  // Get the appropriate prediction context
-  pred_context = vp9_get_pred_context(cm, xd, pred_id);
-
-  switch (pred_id) {
-    case PRED_SEG_ID:
-      pred_probability = cm->segment_pred_probs[pred_context];
-      break;
-
-    case PRED_REF:
-      pred_probability = cm->ref_pred_probs[pred_context];
-      break;
-
-    case PRED_COMP:
-      // In keeping with convention elsewhere the probability returned is
-      // the probability of a "0" outcome which in this case means the
-      // probability of comp pred off.
-      pred_probability = cm->prob_comppred[pred_context];
-      break;
-
-    case PRED_MBSKIP:
-      pred_probability = cm->mbskip_pred_probs[pred_context];
-      break;
-
-    default:
-      // TODO *** add error trap code.
-      pred_probability = 128;
-      break;
-  }
-
-  return pred_probability;
-}
-
-// This function returns a context probability ptr for coding a given
-// prediction signal
-const vp9_prob *vp9_get_pred_probs(const VP9_COMMON *const cm,
-                                   const MACROBLOCKD *const xd,
-                                   PRED_ID pred_id) {
-  const vp9_prob *pred_probability;
-  int pred_context;
-
-  // Get the appropriate prediction context
-  pred_context = vp9_get_pred_context(cm, xd, pred_id);
-
-  switch (pred_id) {
-    case PRED_SEG_ID:
-      pred_probability = &cm->segment_pred_probs[pred_context];
-      break;
-
-    case PRED_REF:
-      pred_probability = &cm->ref_pred_probs[pred_context];
-      break;
-
-    case PRED_COMP:
-      // In keeping with convention elsewhere the probability returned is
-      // the probability of a "0" outcome which in this case means the
-      // probability of comp pred off.
-      pred_probability = &cm->prob_comppred[pred_context];
-      break;
-
-    case PRED_MBSKIP:
-      pred_probability = &cm->mbskip_pred_probs[pred_context];
-      break;
-
-    case PRED_SWITCHABLE_INTERP:
-      pred_probability = &cm->fc.switchable_interp_prob[pred_context][0];
-      break;
-
-    default:
-      // TODO *** add error trap code.
-      pred_probability = NULL;
-      break;
-  }
-
-  return pred_probability;
-}
-
-// This function returns the status of the given prediction signal,
-// i.e. whether the predicted value for the given signal is correct.
-unsigned char vp9_get_pred_flag(const MACROBLOCKD *const xd,
-                                PRED_ID pred_id) {
-  unsigned char pred_flag = 0;
-
-  switch (pred_id) {
-    case PRED_SEG_ID:
-      pred_flag = xd->mode_info_context->mbmi.seg_id_predicted;
-      break;
-
-    case PRED_REF:
-      pred_flag = xd->mode_info_context->mbmi.ref_predicted;
-      break;
-
-    case PRED_MBSKIP:
-      pred_flag = xd->mode_info_context->mbmi.mb_skip_coeff;
-      break;
-
-    default:
-      // TODO *** add error trap code.
-      pred_flag = 0;
-      break;
-  }
-
-  return pred_flag;
-}
-
-// This function sets the status of the given prediction signal,
-// i.e. whether the predicted value for the given signal was correct.
-void vp9_set_pred_flag(MACROBLOCKD *const xd,
-                       PRED_ID pred_id,
-                       unsigned char pred_flag) {
-#if CONFIG_SUPERBLOCKS
-  const int mis = xd->mode_info_stride;
-#endif
-
-  switch (pred_id) {
-    case PRED_SEG_ID:
-      xd->mode_info_context->mbmi.seg_id_predicted = pred_flag;
-#if CONFIG_SUPERBLOCKS
-      if (xd->mode_info_context->mbmi.encoded_as_sb) {
-        if (xd->mb_to_right_edge > 0)
-          xd->mode_info_context[1].mbmi.seg_id_predicted = pred_flag;
-        if (xd->mb_to_bottom_edge > 0) {
-          xd->mode_info_context[mis].mbmi.seg_id_predicted = pred_flag;
-          if (xd->mb_to_right_edge > 0)
-            xd->mode_info_context[mis + 1].mbmi.seg_id_predicted = pred_flag;
-        }
-      }
-#endif
-      break;
-
-    case PRED_REF:
-      xd->mode_info_context->mbmi.ref_predicted = pred_flag;
-#if CONFIG_SUPERBLOCKS
-      if (xd->mode_info_context->mbmi.encoded_as_sb) {
-        if (xd->mb_to_right_edge > 0)
-          xd->mode_info_context[1].mbmi.ref_predicted = pred_flag;
-        if (xd->mb_to_bottom_edge > 0) {
-          xd->mode_info_context[mis].mbmi.ref_predicted = pred_flag;
-          if (xd->mb_to_right_edge > 0)
-            xd->mode_info_context[mis + 1].mbmi.ref_predicted = pred_flag;
-        }
-      }
-#endif
-      break;
-
-    case PRED_MBSKIP:
-      xd->mode_info_context->mbmi.mb_skip_coeff = pred_flag;
-#if CONFIG_SUPERBLOCKS
-      if (xd->mode_info_context->mbmi.encoded_as_sb) {
-        if (xd->mb_to_right_edge > 0)
-          xd->mode_info_context[1].mbmi.mb_skip_coeff = pred_flag;
-        if (xd->mb_to_bottom_edge > 0) {
-          xd->mode_info_context[mis].mbmi.mb_skip_coeff = pred_flag;
-          if (xd->mb_to_right_edge > 0)
-            xd->mode_info_context[mis + 1].mbmi.mb_skip_coeff = pred_flag;
-        }
-      }
-#endif
-      break;
-
-    default:
-      // TODO *** add error trap code.
-      break;
-  }
-}
-
-
-// The following functions contain the guts of the prediction code used to
-// predict various bitstream signals.
-
-// Macroblock segment id prediction function
-unsigned char vp9_get_pred_mb_segid(const VP9_COMMON *const cm,
-                                    const MACROBLOCKD *const xd, int MbIndex) {
-  // Currently the prediction for the macroblock segment ID is
-  // the value stored for this macroblock in the previous frame.
-#if CONFIG_SUPERBLOCKS
-  if (!xd->mode_info_context->mbmi.encoded_as_sb) {
-#endif
-    return cm->last_frame_seg_map[MbIndex];
-#if CONFIG_SUPERBLOCKS
-  } else {
-    int seg_id = cm->last_frame_seg_map[MbIndex];
-    int mb_col = MbIndex % cm->mb_cols;
-    int mb_row = MbIndex / cm->mb_cols;
-    if (mb_col + 1 < cm->mb_cols)
-      seg_id = seg_id && cm->last_frame_seg_map[MbIndex + 1];
-    if (mb_row + 1 < cm->mb_rows) {
-      seg_id = seg_id && cm->last_frame_seg_map[MbIndex + cm->mb_cols];
-      if (mb_col + 1 < cm->mb_cols)
-        seg_id = seg_id && cm->last_frame_seg_map[MbIndex + cm->mb_cols + 1];
-    }
-    return seg_id;
-  }
-#endif
-}
-
-MV_REFERENCE_FRAME vp9_get_pred_ref(const VP9_COMMON *const cm,
-                                    const MACROBLOCKD *const xd) {
-  MODE_INFO *m = xd->mode_info_context;
-
-  MV_REFERENCE_FRAME left;
-  MV_REFERENCE_FRAME above;
-  MV_REFERENCE_FRAME above_left;
-  MV_REFERENCE_FRAME pred_ref = LAST_FRAME;
-
-  int segment_id = xd->mode_info_context->mbmi.segment_id;
-  int seg_ref_active;
-  int i;
-
-  unsigned char frame_allowed[MAX_REF_FRAMES] = {1, 1, 1, 1};
-  unsigned char ref_score[MAX_REF_FRAMES];
-  unsigned char best_score = 0;
-  unsigned char left_in_image;
-  unsigned char above_in_image;
-  unsigned char above_left_in_image;
-
-  // Is segment coding enabled?
-  seg_ref_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME);
-
-  // Special case treatment if segment coding is enabled.
-  // Don't allow prediction of a reference frame that the segment
-  // does not allow.
-  if (seg_ref_active) {
-    for (i = 0; i < MAX_REF_FRAMES; i++) {
-      frame_allowed[i] =
-        vp9_check_segref(xd, segment_id, i);
-
-      // Score set to 0 if ref frame not allowed
-      ref_score[i] = cm->ref_scores[i] * frame_allowed[i];
-    }
-  } else
-    vpx_memcpy(ref_score, cm->ref_scores, sizeof(ref_score));
-
-  // Reference frames used by neighbours
-  left = (m - 1)->mbmi.ref_frame;
-  above = (m - cm->mode_info_stride)->mbmi.ref_frame;
-  above_left = (m - 1 - cm->mode_info_stride)->mbmi.ref_frame;
-
-  // Are neighbours in image
-  left_in_image = (m - 1)->mbmi.mb_in_image;
-  above_in_image = (m - cm->mode_info_stride)->mbmi.mb_in_image;
-  above_left_in_image = (m - 1 - cm->mode_info_stride)->mbmi.mb_in_image;
-
-  // Adjust scores for candidate reference frames based on neighbours
-  if (frame_allowed[left] && left_in_image) {
-    ref_score[left] += 16;
-    if (above_left_in_image && (left == above_left))
-      ref_score[left] += 4;
-  }
-  if (frame_allowed[above] && above_in_image) {
-    ref_score[above] += 16;
-    if (above_left_in_image && (above == above_left))
-      ref_score[above] += 4;
-  }
-
-  // Now choose the candidate with the highest score
-  for (i = 0; i < MAX_REF_FRAMES; i++) {
-    if (ref_score[i] > best_score) {
-      pred_ref = i;
-      best_score = ref_score[i];
-    }
-  }
-
-  return pred_ref;
-}
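As a hypothetical worked example: with base scores {INTRA 5, LAST 9, GOLDEN 4, ALTREF 3}, a LAST left neighbour and a GOLDEN above neighbour both in the image, and a LAST above-left neighbour, LAST scores 9 + 16 + 4 = 29 while GOLDEN scores 4 + 16 = 20, so LAST is returned as the predicted reference frame.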
-
-// Function to compute a set of modified reference frame probabilities
-// to use when the prediction of the reference frame value fails.
-void vp9_calc_ref_probs(int *count, vp9_prob *probs) {
-  int tot_count;
-
-  tot_count = count[0] + count[1] + count[2] + count[3];
-  if (tot_count) {
-    probs[0] = (vp9_prob)((count[0] * 255 + (tot_count >> 1)) / tot_count);
-    probs[0] += !probs[0];
-  } else
-    probs[0] = 128;
-
-  tot_count -= count[0];
-  if (tot_count) {
-    probs[1] = (vp9_prob)((count[1] * 255 + (tot_count >> 1)) / tot_count);
-    probs[1] += !probs[1];
-  } else
-    probs[1] = 128;
-
-  tot_count -= count[1];
-  if (tot_count) {
-    probs[2] = (vp9_prob)((count[2] * 255 + (tot_count >> 1)) / tot_count);
-    probs[2] += !probs[2];
-  } else
-    probs[2] = 128;
-
-}
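As a worked example with hypothetical counts {10, 20, 30, 40}: tot_count = 100, so probs[0] = (10*255 + 50)/100 = 26; then tot_count = 90 and probs[1] = (20*255 + 45)/90 = 57; then tot_count = 70 and probs[2] = (30*255 + 35)/70 = 109. Each entry is the probability of taking the first branch at successive nodes of the reference-frame tree, conditioned on the branches before it.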
-
-// Computes a set of modified conditional probabilities for the reference
-// frame. Values will be set to 0 for reference frame options that are not
-// possible because either they were predicted and prediction has failed, or
-// because they are not allowed for a given segment.
-void vp9_compute_mod_refprobs(VP9_COMMON *const cm) {
-  int norm_cnt[MAX_REF_FRAMES];
-  int intra_count;
-  int inter_count;
-  int last_count;
-  int gfarf_count;
-  int gf_count;
-  int arf_count;
-
-  intra_count = cm->prob_intra_coded;
-  inter_count = (255 - intra_count);
-  last_count = (inter_count * cm->prob_last_coded) / 255;
-  gfarf_count = inter_count - last_count;
-  gf_count = (gfarf_count * cm->prob_gf_coded) / 255;
-  arf_count = gfarf_count - gf_count;
-
-  // Work out modified reference frame probabilities to use where prediction
-  // of the reference frame fails
-  norm_cnt[0] = 0;
-  norm_cnt[1] = last_count;
-  norm_cnt[2] = gf_count;
-  norm_cnt[3] = arf_count;
-  vp9_calc_ref_probs(norm_cnt, cm->mod_refprobs[INTRA_FRAME]);
-  cm->mod_refprobs[INTRA_FRAME][0] = 0;    // This branch implicit
-
-  norm_cnt[0] = intra_count;
-  norm_cnt[1] = 0;
-  norm_cnt[2] = gf_count;
-  norm_cnt[3] = arf_count;
-  vp9_calc_ref_probs(norm_cnt, cm->mod_refprobs[LAST_FRAME]);
-  cm->mod_refprobs[LAST_FRAME][1] = 0;    // This branch implicit
-
-  norm_cnt[0] = intra_count;
-  norm_cnt[1] = last_count;
-  norm_cnt[2] = 0;
-  norm_cnt[3] = arf_count;
-  vp9_calc_ref_probs(norm_cnt, cm->mod_refprobs[GOLDEN_FRAME]);
-  cm->mod_refprobs[GOLDEN_FRAME][2] = 0;  // This branch implicit
-
-  norm_cnt[0] = intra_count;
-  norm_cnt[1] = last_count;
-  norm_cnt[2] = gf_count;
-  norm_cnt[3] = 0;
-  vp9_calc_ref_probs(norm_cnt, cm->mod_refprobs[ALTREF_FRAME]);
-  cm->mod_refprobs[ALTREF_FRAME][2] = 0;  // This branch implicit
-
-  // Score the reference frames based on overall frequency.
-  // These scores contribute to the prediction choices.
-  // Max score 17, min 1.
-  cm->ref_scores[INTRA_FRAME] = 1 + (intra_count * 16 / 255);
-  cm->ref_scores[LAST_FRAME] = 1 + (last_count * 16 / 255);
-  cm->ref_scores[GOLDEN_FRAME] = 1 + (gf_count * 16 / 255);
-  cm->ref_scores[ALTREF_FRAME] = 1 + (arf_count * 16 / 255);
-}
--- a/vp8/common/pred_common.h
+++ /dev/null
@@ -1,56 +1,0 @@
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "type_aliases.h"
-#include "onyxc_int.h"
-#include "vp8/common/blockd.h"
-
-#ifndef __INC_PRED_COMMON_H__
-#define __INC_PRED_COMMON_H__ 1
-
-
-// Predicted items
-typedef enum {
-  PRED_SEG_ID = 0,               // Segment identifier
-  PRED_REF = 1,
-  PRED_COMP = 2,
-  PRED_MBSKIP = 3,
-  PRED_SWITCHABLE_INTERP = 4
-} PRED_ID;
-
-extern unsigned char vp9_get_pred_context(const VP9_COMMON *const cm,
-                                          const MACROBLOCKD *const xd,
-                                          PRED_ID pred_id);
-
-extern vp9_prob vp9_get_pred_prob(const VP9_COMMON *const cm,
-                                  const MACROBLOCKD *const xd,
-                                  PRED_ID pred_id);
-
-extern const vp9_prob *vp9_get_pred_probs(const VP9_COMMON *const cm,
-                                          const MACROBLOCKD *const xd,
-                                          PRED_ID pred_id);
-
-extern unsigned char vp9_get_pred_flag(const MACROBLOCKD *const xd,
-                                       PRED_ID pred_id);
-
-extern void vp9_set_pred_flag(MACROBLOCKD *const xd,
-                              PRED_ID pred_id,
-                              unsigned char pred_flag);
-
-
-extern unsigned char vp9_get_pred_mb_segid(const VP9_COMMON *const cm,
-                                           const MACROBLOCKD *const xd,
-                                           int MbIndex);
-
-extern MV_REFERENCE_FRAME vp9_get_pred_ref(const VP9_COMMON *const cm,
-                                       const MACROBLOCKD *const xd);
-extern void vp9_compute_mod_refprobs(VP9_COMMON *const cm);
-
-#endif /* __INC_PRED_COMMON_H__ */
--- a/vp8/common/quant_common.c
+++ /dev/null
@@ -1,125 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "quant_common.h"
-
-static int dc_qlookup[QINDEX_RANGE];
-static int ac_qlookup[QINDEX_RANGE];
-
-#define ACDC_MIN 4
-
-void vp9_init_quant_tables() {
-  int i;
-  int current_val = 4;
-  int last_val = 4;
-  int ac_val;
-
-  for (i = 0; i < QINDEX_RANGE; i++) {
-    ac_qlookup[i] = current_val;
-    current_val = (int)((double)current_val * 1.02);
-    if (current_val == last_val)
-      current_val++;
-    last_val = current_val;
-
-    ac_val = ac_qlookup[i];
-    dc_qlookup[i] = (0.000000305 * ac_val * ac_val * ac_val) +
-                    (-0.00065 * ac_val * ac_val) +
-                    (0.9 * ac_val) + 0.5;
-    if (dc_qlookup[i] < ACDC_MIN)
-      dc_qlookup[i] = ACDC_MIN;
-  }
-}
-
-int vp9_dc_quant(int QIndex, int Delta) {
-  int retval;
-
-  QIndex = QIndex + Delta;
-
-  if (QIndex > MAXQ)
-    QIndex = MAXQ;
-  else if (QIndex < 0)
-    QIndex = 0;
-
-  retval = dc_qlookup[ QIndex ];
-  return retval;
-}
-
-int vp9_dc2quant(int QIndex, int Delta) {
-  int retval;
-
-  QIndex = QIndex + Delta;
-
-  if (QIndex > MAXQ)
-    QIndex = MAXQ;
-  else if (QIndex < 0)
-    QIndex = 0;
-
-  retval = dc_qlookup[ QIndex ];
-
-  return retval;
-
-}
-int vp9_dc_uv_quant(int QIndex, int Delta) {
-  int retval;
-
-  QIndex = QIndex + Delta;
-
-  if (QIndex > MAXQ)
-    QIndex = MAXQ;
-  else if (QIndex < 0)
-    QIndex = 0;
-
-  retval = dc_qlookup[ QIndex ];
-
-  return retval;
-}
-
-int vp9_ac_yquant(int QIndex) {
-  int retval;
-
-  if (QIndex > MAXQ)
-    QIndex = MAXQ;
-  else if (QIndex < 0)
-    QIndex = 0;
-
-  retval = ac_qlookup[ QIndex ];
-  return retval;
-}
-
-int vp9_ac2quant(int QIndex, int Delta) {
-  int retval;
-
-  QIndex = QIndex + Delta;
-
-  if (QIndex > MAXQ)
-    QIndex = MAXQ;
-  else if (QIndex < 0)
-    QIndex = 0;
-
-  retval = (ac_qlookup[ QIndex ] * 775) / 1000;
-  if (retval < 4)
-    retval = 4;
-
-  return retval;
-}
-int vp9_ac_uv_quant(int QIndex, int Delta) {
-  int retval;
-
-  QIndex = QIndex + Delta;
-
-  if (QIndex > MAXQ)
-    QIndex = MAXQ;
-  else if (QIndex < 0)
-    QIndex = 0;
-
-  retval = ac_qlookup[ QIndex ];
-  return retval;
-}
--- a/vp8/common/quant_common.h
+++ /dev/null
@@ -1,22 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "string.h"
-#include "blockd.h"
-#include "onyxc_int.h"
-
-extern void vp9_init_quant_tables();
-extern int vp9_ac_yquant(int QIndex);
-extern int vp9_dc_quant(int QIndex, int Delta);
-extern int vp9_dc2quant(int QIndex, int Delta);
-extern int vp9_ac2quant(int QIndex, int Delta);
-extern int vp9_dc_uv_quant(int QIndex, int Delta);
-extern int vp9_ac_uv_quant(int QIndex, int Delta);
--- a/vp8/common/recon.c
+++ /dev/null
@@ -1,197 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "vpx_rtcd.h"
-#include "blockd.h"
-
-void vp9_recon_b_c
-(
-  unsigned char *pred_ptr,
-  short *diff_ptr,
-  unsigned char *dst_ptr,
-  int stride
-) {
-  int r, c;
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++) {
-      int a = diff_ptr[c] + pred_ptr[c];
-
-      if (a < 0)
-        a = 0;
-
-      if (a > 255)
-        a = 255;
-
-      dst_ptr[c] = (unsigned char) a;
-    }
-
-    dst_ptr += stride;
-    diff_ptr += 16;
-    pred_ptr += 16;
-  }
-}
-
-void vp9_recon_uv_b_c
-(
-  unsigned char *pred_ptr,
-  short *diff_ptr,
-  unsigned char *dst_ptr,
-  int stride
-) {
-  int r, c;
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++) {
-      int a = diff_ptr[c] + pred_ptr[c];
-
-      if (a < 0)
-        a = 0;
-
-      if (a > 255)
-        a = 255;
-
-      dst_ptr[c] = (unsigned char) a;
-    }
-
-    dst_ptr += stride;
-    diff_ptr += 8;
-    pred_ptr += 8;
-  }
-}
-void vp9_recon4b_c
-(
-  unsigned char *pred_ptr,
-  short *diff_ptr,
-  unsigned char *dst_ptr,
-  int stride
-) {
-  int r, c;
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 16; c++) {
-      int a = diff_ptr[c] + pred_ptr[c];
-
-      if (a < 0)
-        a = 0;
-
-      if (a > 255)
-        a = 255;
-
-      dst_ptr[c] = (unsigned char) a;
-    }
-
-    dst_ptr += stride;
-    diff_ptr += 16;
-    pred_ptr += 16;
-  }
-}
-
-void vp9_recon2b_c
-(
-  unsigned char *pred_ptr,
-  short *diff_ptr,
-  unsigned char *dst_ptr,
-  int stride
-) {
-  int r, c;
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 8; c++) {
-      int a = diff_ptr[c] + pred_ptr[c];
-
-      if (a < 0)
-        a = 0;
-
-      if (a > 255)
-        a = 255;
-
-      dst_ptr[c] = (unsigned char) a;
-    }
-
-    dst_ptr += stride;
-    diff_ptr += 8;
-    pred_ptr += 8;
-  }
-}
-
-#if CONFIG_SUPERBLOCKS
-void vp9_recon_mby_s_c(MACROBLOCKD *xd, uint8_t *dst) {
-  int x, y;
-  BLOCKD *b = &xd->block[0];
-  int stride = b->dst_stride;
-  short *diff = b->diff;
-
-  for (y = 0; y < 16; y++) {
-    for (x = 0; x < 16; x++) {
-      int a = dst[x] + diff[x];
-      if (a < 0)
-        a = 0;
-      else if (a > 255)
-        a = 255;
-      dst[x] = a;
-    }
-    dst += stride;
-    diff += 16;
-  }
-}
-
-void vp9_recon_mbuv_s_c(MACROBLOCKD *xd, uint8_t *udst, uint8_t *vdst) {
-  int x, y, i;
-  uint8_t *dst = udst;
-
-  for (i = 0; i < 2; i++, dst = vdst) {
-    BLOCKD *b = &xd->block[16 + 4 * i];
-    int stride = b->dst_stride;
-    short *diff = b->diff;
-
-    for (y = 0; y < 8; y++) {
-      for (x = 0; x < 8; x++) {
-        int a = dst[x] + diff[x];
-        if (a < 0)
-          a = 0;
-        else if (a > 255)
-          a = 255;
-        dst[x] = a;
-      }
-      dst += stride;
-      diff += 8;
-    }
-  }
-}
-#endif
-
-void vp9_recon_mby_c(MACROBLOCKD *xd) {
-  int i;
-
-  for (i = 0; i < 16; i += 4) {
-    BLOCKD *b = &xd->block[i];
-
-    vp9_recon4b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-  }
-}
-
-void vp9_recon_mb_c(MACROBLOCKD *xd) {
-  int i;
-
-  for (i = 0; i < 16; i += 4) {
-    BLOCKD *b = &xd->block[i];
-
-    vp9_recon4b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-  }
-
-  for (i = 16; i < 24; i += 2) {
-    BLOCKD *b = &xd->block[i];
-
-    vp9_recon2b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-  }
-}
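All the recon variants above are the same kernel at different block widths: add the inverse-transformed residual to the prediction and clip each sample to [0, 255]. A self-contained sketch with the size and strides as parameters (purely illustrative, not the codec's own API):

#include <stdint.h>

static uint8_t clip_byte(int v) {
  return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}

static void recon_block(const uint8_t *pred, const int16_t *diff,
                        uint8_t *dst, int width, int height,
                        int diff_stride, int dst_stride) {
  int r, c;
  for (r = 0; r < height; r++) {
    for (c = 0; c < width; c++)
      dst[c] = clip_byte(pred[c] + diff[c]);  /* prediction + residual */
    dst += dst_stride;
    diff += diff_stride;   /* predictor and diff share a row pitch */
    pred += diff_stride;
  }
}

int main(void) {
  uint8_t pred[16] = { 0 }, dst[16];
  int16_t diff[16] = { -5, 300, 128, 0 };  /* values that clip both ways */
  recon_block(pred, diff, dst, 4, 4, 4, 4);
  return (dst[0] == 0 && dst[1] == 255 && dst[2] == 128) ? 0 : 1;
}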
--- a/vp8/common/reconinter.c
+++ /dev/null
@@ -1,1145 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "vpx/vpx_integer.h"
-#include "subpixel.h"
-#include "blockd.h"
-#include "reconinter.h"
-#if CONFIG_RUNTIME_CPU_DETECT
-#include "onyxc_int.h"
-#endif
-
-void vp9_setup_interp_filters(MACROBLOCKD *xd,
-                              INTERPOLATIONFILTERTYPE mcomp_filter_type,
-                              VP9_COMMON *cm) {
-  if (mcomp_filter_type == SIXTAP) {
-    xd->subpixel_predict        = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, sixtap4x4);
-    xd->subpixel_predict8x4     = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, sixtap8x4);
-    xd->subpixel_predict8x8     = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, sixtap8x8);
-    xd->subpixel_predict16x16   = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, sixtap16x16);
-    xd->subpixel_predict_avg    = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, sixtap_avg4x4);
-    xd->subpixel_predict_avg8x8 = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, sixtap_avg8x8);
-    xd->subpixel_predict_avg16x16 = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, sixtap_avg16x16);
-  } else if (mcomp_filter_type == EIGHTTAP || mcomp_filter_type == SWITCHABLE) {
-    xd->subpixel_predict        = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap4x4);
-    xd->subpixel_predict8x4     = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap8x4);
-    xd->subpixel_predict8x8     = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap8x8);
-    xd->subpixel_predict16x16   = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap16x16);
-    xd->subpixel_predict_avg    = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap_avg4x4);
-    xd->subpixel_predict_avg8x8 = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap_avg8x8);
-    xd->subpixel_predict_avg16x16 = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap_avg16x16);
-  } else if (mcomp_filter_type == EIGHTTAP_SHARP) {
-    xd->subpixel_predict        = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap4x4_sharp);
-    xd->subpixel_predict8x4     = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap8x4_sharp);
-    xd->subpixel_predict8x8     = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap8x8_sharp);
-    xd->subpixel_predict16x16   = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap16x16_sharp);
-    xd->subpixel_predict_avg    = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap_avg4x4_sharp);
-    xd->subpixel_predict_avg8x8 = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap_avg8x8_sharp);
-    xd->subpixel_predict_avg16x16 = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap_avg16x16_sharp);
-  } else {
-    xd->subpixel_predict        = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, bilinear4x4);
-    xd->subpixel_predict8x4     = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, bilinear8x4);
-    xd->subpixel_predict8x8     = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, bilinear8x8);
-    xd->subpixel_predict16x16   = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, bilinear16x16);
-    xd->subpixel_predict_avg    = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, bilinear_avg4x4);
-    xd->subpixel_predict_avg8x8 = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, bilinear_avg8x8);
-    xd->subpixel_predict_avg16x16 = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, bilinear_avg16x16);
-  }
-}
-
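vp9_setup_interp_filters() above is a straight function-pointer dispatch: one branch per INTERPOLATIONFILTERTYPE, each wiring up the seven subpel entry points. A hedged sketch of the same selection done as a table indexed by filter type; the enum, struct, and function type here are illustrative stand-ins for the real RTCD machinery:

typedef void (*demo_subpix_fn)(const unsigned char *src, int src_stride,
                               int xoffset, int yoffset,
                               unsigned char *dst, int dst_stride);

enum demo_filter_type { DEMO_SIXTAP, DEMO_EIGHTTAP, DEMO_EIGHTTAP_SHARP,
                        DEMO_BILINEAR, DEMO_FILTER_COUNT };

struct demo_subpix_set {
  demo_subpix_fn predict4x4;
  demo_subpix_fn predict8x8;
  demo_subpix_fn predict16x16;
};

/* One row per filter type; a real implementation would fill this with
 * the RTCD-selected kernels. */
static struct demo_subpix_set demo_filters[DEMO_FILTER_COUNT];

static void demo_setup_interp_filters(struct demo_subpix_set *xd_fns,
                                      enum demo_filter_type t) {
  *xd_fns = demo_filters[t];  /* one indexed copy replaces the if chain */
}

int main(void) {
  struct demo_subpix_set fns;
  demo_setup_interp_filters(&fns, DEMO_EIGHTTAP);
  return fns.predict8x8 == demo_filters[DEMO_EIGHTTAP].predict8x8 ? 0 : 1;
}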
-void vp9_copy_mem16x16_c(unsigned char *src,
-                         int src_stride,
-                         unsigned char *dst,
-                         int dst_stride) {
-  int r;
-
-  for (r = 0; r < 16; r++) {
-#if !(CONFIG_FAST_UNALIGNED)
-    dst[0] = src[0];
-    dst[1] = src[1];
-    dst[2] = src[2];
-    dst[3] = src[3];
-    dst[4] = src[4];
-    dst[5] = src[5];
-    dst[6] = src[6];
-    dst[7] = src[7];
-    dst[8] = src[8];
-    dst[9] = src[9];
-    dst[10] = src[10];
-    dst[11] = src[11];
-    dst[12] = src[12];
-    dst[13] = src[13];
-    dst[14] = src[14];
-    dst[15] = src[15];
-
-#else
-    ((uint32_t *)dst)[0] = ((uint32_t *)src)[0];
-    ((uint32_t *)dst)[1] = ((uint32_t *)src)[1];
-    ((uint32_t *)dst)[2] = ((uint32_t *)src)[2];
-    ((uint32_t *)dst)[3] = ((uint32_t *)src)[3];
-
-#endif
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-void vp9_avg_mem16x16_c(unsigned char *src,
-                        int src_stride,
-                        unsigned char *dst,
-                        int dst_stride) {
-  int r;
-
-  for (r = 0; r < 16; r++) {
-    int n;
-
-    for (n = 0; n < 16; n++) {
-      dst[n] = (dst[n] + src[n] + 1) >> 1;
-    }
-
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-void vp9_copy_mem8x8_c(unsigned char *src,
-                       int src_stride,
-                       unsigned char *dst,
-                       int dst_stride) {
-  int r;
-
-  for (r = 0; r < 8; r++) {
-#if !(CONFIG_FAST_UNALIGNED)
-    dst[0] = src[0];
-    dst[1] = src[1];
-    dst[2] = src[2];
-    dst[3] = src[3];
-    dst[4] = src[4];
-    dst[5] = src[5];
-    dst[6] = src[6];
-    dst[7] = src[7];
-#else
-    ((uint32_t *)dst)[0] = ((uint32_t *)src)[0];
-    ((uint32_t *)dst)[1] = ((uint32_t *)src)[1];
-#endif
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-void vp9_avg_mem8x8_c(unsigned char *src,
-                      int src_stride,
-                      unsigned char *dst,
-                      int dst_stride) {
-  int r;
-
-  for (r = 0; r < 8; r++) {
-    int n;
-
-    for (n = 0; n < 8; n++) {
-      dst[n] = (dst[n] + src[n] + 1) >> 1;
-    }
-
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
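The *_avg_* helpers above blend with (dst + src + 1) >> 1, the round-to-nearest average (ties round up). A tiny check:

#include <assert.h>

static int avg_round(int a, int b) {
  return (a + b + 1) >> 1;   /* average rounded to nearest */
}

int main(void) {
  assert(avg_round(4, 5) == 5);      /* 4.5 rounds up */
  assert(avg_round(4, 4) == 4);
  assert(avg_round(0, 255) == 128);  /* 127.5 rounds up */
  return 0;
}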
-void vp9_copy_mem8x4_c(unsigned char *src,
-                       int src_stride,
-                       unsigned char *dst,
-                       int dst_stride) {
-  int r;
-
-  for (r = 0; r < 4; r++) {
-#if !(CONFIG_FAST_UNALIGNED)
-    dst[0] = src[0];
-    dst[1] = src[1];
-    dst[2] = src[2];
-    dst[3] = src[3];
-    dst[4] = src[4];
-    dst[5] = src[5];
-    dst[6] = src[6];
-    dst[7] = src[7];
-#else
-    ((uint32_t *)dst)[0] = ((uint32_t *)src)[0];
-    ((uint32_t *)dst)[1] = ((uint32_t *)src)[1];
-#endif
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-void vp9_build_inter_predictors_b(BLOCKD *d, int pitch, vp9_subpix_fn_t sppf) {
-  int r;
-  unsigned char *ptr_base;
-  unsigned char *ptr;
-  unsigned char *pred_ptr = d->predictor;
-  int_mv mv;
-
-  ptr_base = *(d->base_pre);
-  mv.as_int = d->bmi.as_mv.first.as_int;
-
-  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
-    ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
-          (mv.as_mv.col >> 3);
-    sppf(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, (mv.as_mv.row & 7) << 1,
-         pred_ptr, pitch);
-  } else {
-    ptr_base += d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
-                (mv.as_mv.col >> 3);
-    ptr = ptr_base;
-
-    for (r = 0; r < 4; r++) {
-#if !(CONFIG_FAST_UNALIGNED)
-      pred_ptr[0]  = ptr[0];
-      pred_ptr[1]  = ptr[1];
-      pred_ptr[2]  = ptr[2];
-      pred_ptr[3]  = ptr[3];
-#else
-      *(uint32_t *)pred_ptr = *(uint32_t *)ptr;
-#endif
-      pred_ptr     += pitch;
-      ptr         += d->pre_stride;
-    }
-  }
-}
-
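Motion vectors here are in eighth-pel units: mv >> 3 selects the whole-pixel source position and mv & 7 the subpel phase, which is then doubled on the way into the subpel predictor (suggesting the filter tables are indexed in sixteenth-pel steps). A sketch of the split; like the code above, it relies on arithmetic right shift for negative components:

#include <stdio.h>

static void split_mv(int mv_q3, int *whole, int *frac) {
  *whole = mv_q3 >> 3;   /* whole pixels (floors toward -inf) */
  *frac  = mv_q3 & 7;    /* 0..7 eighth-pel phase */
}

int main(void) {
  int w, f;
  split_mv(19, &w, &f);          /* 19/8 = 2 pixels + 3/8 */
  printf("%d + %d/8\n", w, f);   /* prints: 2 + 3/8 */
  split_mv(-19, &w, &f);
  printf("%d + %d/8\n", w, f);   /* prints: -3 + 5/8 (= -19/8) */
  return 0;
}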
-/*
- * Similar to vp9_build_inter_predictors_b(), but instead of storing the
- * results in d->predictor, we average the contents of d->predictor (which
- * come from an earlier call to vp9_build_inter_predictors_b()) with the
- * predictor of the second reference frame / motion vector.
- */
-void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch,
-                                      vp9_subpix_fn_t sppf) {
-  int r;
-  unsigned char *ptr_base;
-  unsigned char *ptr;
-  unsigned char *pred_ptr = d->predictor;
-  int_mv mv;
-
-  ptr_base = *(d->base_second_pre);
-  mv.as_int = d->bmi.as_mv.second.as_int;
-
-  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
-    ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
-          (mv.as_mv.col >> 3);
-    sppf(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, (mv.as_mv.row & 7) << 1,
-         pred_ptr, pitch);
-  } else {
-    ptr_base += d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
-                (mv.as_mv.col >> 3);
-    ptr = ptr_base;
-
-    for (r = 0; r < 4; r++) {
-      pred_ptr[0]  = (pred_ptr[0] + ptr[0] + 1) >> 1;
-      pred_ptr[1]  = (pred_ptr[1] + ptr[1] + 1) >> 1;
-      pred_ptr[2]  = (pred_ptr[2] + ptr[2] + 1) >> 1;
-      pred_ptr[3]  = (pred_ptr[3] + ptr[3] + 1) >> 1;
-      pred_ptr    += pitch;
-      ptr         += d->pre_stride;
-    }
-  }
-}
-
-void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {
-  unsigned char *ptr_base;
-  unsigned char *ptr;
-  unsigned char *pred_ptr = d->predictor;
-  int_mv mv;
-
-  ptr_base = *(d->base_pre);
-  mv.as_int = d->bmi.as_mv.first.as_int;
-  ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
-        (mv.as_mv.col >> 3);
-
-  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
-    xd->subpixel_predict8x8(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1,
-                            (mv.as_mv.row & 7) << 1, pred_ptr, pitch);
-  } else {
-    vp9_copy_mem8x8(ptr, d->pre_stride, pred_ptr, pitch);
-  }
-}
-
-/*
- * Similar to build_inter_predictors_4b(), but instead of storing the
- * results in d->predictor, we average the contents of d->predictor (which
- * come from an earlier call to build_inter_predictors_4b()) with the
- * predictor of the second reference frame / motion vector.
- */
-void vp9_build_2nd_inter_predictors4b(MACROBLOCKD *xd,
-                                      BLOCKD *d, int pitch) {
-  unsigned char *ptr_base;
-  unsigned char *ptr;
-  unsigned char *pred_ptr = d->predictor;
-  int_mv mv;
-
-  ptr_base = *(d->base_second_pre);
-  mv.as_int = d->bmi.as_mv.second.as_int;
-  ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
-        (mv.as_mv.col >> 3);
-
-  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
-    xd->subpixel_predict_avg8x8(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1,
-                               (mv.as_mv.row & 7) << 1, pred_ptr, pitch);
-  } else {
-    vp9_avg_mem8x8(ptr, d->pre_stride, pred_ptr, pitch);
-  }
-}
-
-static void build_inter_predictors2b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {
-  unsigned char *ptr_base;
-  unsigned char *ptr;
-  unsigned char *pred_ptr = d->predictor;
-  int_mv mv;
-
-  ptr_base = *(d->base_pre);
-  mv.as_int = d->bmi.as_mv.first.as_int;
-  ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
-        (mv.as_mv.col >> 3);
-
-  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
-    xd->subpixel_predict8x4(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1,
-                           (mv.as_mv.row & 7) << 1, pred_ptr, pitch);
-  } else {
-    vp9_copy_mem8x4(ptr, d->pre_stride, pred_ptr, pitch);
-  }
-}
-
-
-/*encoder only*/
-#if CONFIG_PRED_FILTER
-
-// Select the thresholded or non-thresholded filter
-#define USE_THRESH_FILTER 0
-
-#define PRED_FILT_LEN 5
-
-static const int filt_shift = 4;
-static const int pred_filter[PRED_FILT_LEN] = {1, 2, 10, 2, 1};
-// Alternative filter {1, 1, 4, 1, 1}
-
-#if !USE_THRESH_FILTER
-void filter_mb(unsigned char *src, int src_stride,
-               unsigned char *dst, int dst_stride,
-               int width, int height) {
-  int i, j, k;
-  unsigned int Temp[32 * 32];
-  unsigned int  *pTmp = Temp;
-  unsigned char *pSrc = src - (1 + src_stride) * (PRED_FILT_LEN / 2);
-
-  // Horizontal
-  for (i = 0; i < height + PRED_FILT_LEN - 1; i++) {
-    for (j = 0; j < width; j++) {
-      int sum = 0;
-      for (k = 0; k < PRED_FILT_LEN; k++)
-        sum += pSrc[j + k] * pred_filter[k];
-      pTmp[j] = sum;
-    }
-
-    pSrc += src_stride;
-    pTmp += width;
-  }
-
-  // Vertical
-  pTmp = Temp;
-  for (i = 0; i < width; i++) {
-    unsigned char *pDst = dst + i;
-    for (j = 0; j < height; j++) {
-      int sum = 0;
-      for (k = 0; k < PRED_FILT_LEN; k++)
-        sum += pTmp[(j + k) * width] * pred_filter[k];
-      // Round
-      sum = (sum + ((1 << (filt_shift << 1)) >> 1)) >> (filt_shift << 1);
-      pDst[j * dst_stride] = (sum < 0 ? 0 : sum > 255 ? 255 : sum);
-    }
-    ++pTmp;
-  }
-}
-#else
-// Based on vp9_post_proc_down_and_across_c (postproc.c)
-void filter_mb(unsigned char *src, int src_stride,
-               unsigned char *dst, int dst_stride,
-               int width, int height) {
-  unsigned char *pSrc, *pDst;
-  int row;
-  int col;
-  int i;
-  int v;
-  unsigned char d[8];
-
-  /* TODO flimit should be linked to the quantizer value */
-  int flimit = 7;
-
-  for (row = 0; row < height; row++) {
-    /* post_proc_down for one row */
-    pSrc = src;
-    pDst = dst;
-
-    for (col = 0; col < width; col++) {
-      int kernel = (1 << (filt_shift - 1));
-      int v = pSrc[col];
-
-      for (i = -2; i <= 2; i++) {
-        if (abs(v - pSrc[col + i * src_stride]) > flimit)
-          goto down_skip_convolve;
-
-        kernel += pred_filter[2 + i] * pSrc[col + i * src_stride];
-      }
-
-      v = (kernel >> filt_shift);
-    down_skip_convolve:
-      pDst[col] = v;
-    }
-
-    /* now post_proc_across */
-    pSrc = dst;
-    pDst = dst;
-
-    for (i = 0; i < 8; i++)
-      d[i] = pSrc[i];
-
-    for (col = 0; col < width; col++) {
-      int kernel = (1 << (filt_shift - 1));
-      v = pSrc[col];
-
-      d[col & 7] = v;
-
-      for (i = -2; i <= 2; i++) {
-        if (abs(v - pSrc[col + i]) > flimit)
-          goto across_skip_convolve;
-
-        kernel += pred_filter[2 + i] * pSrc[col + i];
-      }
-
-      d[col & 7] = (kernel >> filt_shift);
-    across_skip_convolve:
-
-      if (col >= 2)
-        pDst[col - 2] = d[(col - 2) & 7];
-    }
-
-    /* handle the last two pixels */
-    pDst[col - 2] = d[(col - 2) & 7];
-    pDst[col - 1] = d[(col - 1) & 7];
-
-    /* next row */
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-#endif  // !USE_THRESH_FILTER
-
-#endif  // CONFIG_PRED_FILTER
-
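filter_mb() above is a separable 5-tap filter: a horizontal pass into a temporary buffer, a vertical pass out of it, and a single rounded shift by 2 * filt_shift at the end. That works because the taps {1, 2, 10, 2, 1} sum to 16 = 1 << filt_shift, so the two passes together scale every sample by exactly 256. A small check of that normalization:

#include <assert.h>

static const int taps[5] = { 1, 2, 10, 2, 1 };

int main(void) {
  int sum = 0, i;
  for (i = 0; i < 5; i++)
    sum += taps[i];
  assert(sum == 16);                      /* one pass: gain of 16    */
  assert(sum * sum == 256);               /* two passes: gain of 256 */
  /* a constant-128 input survives the combined rounded shift exactly: */
  assert(((128 * 256) + 128) >> 8 == 128);
  return 0;
}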
-/*encoder only*/
-void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd) {
-  int i, j;
-  BLOCKD *blockd = xd->block;
-
-  /* build uv mvs */
-  for (i = 0; i < 2; i++) {
-    for (j = 0; j < 2; j++) {
-      int yoffset = i * 8 + j * 2;
-      int uoffset = 16 + i * 2 + j;
-      int voffset = 20 + i * 2 + j;
-      int temp;
-
-      temp = blockd[yoffset  ].bmi.as_mv.first.as_mv.row
-             + blockd[yoffset + 1].bmi.as_mv.first.as_mv.row
-             + blockd[yoffset + 4].bmi.as_mv.first.as_mv.row
-             + blockd[yoffset + 5].bmi.as_mv.first.as_mv.row;
-
-      if (temp < 0) temp -= 4;
-      else temp += 4;
-
-      xd->block[uoffset].bmi.as_mv.first.as_mv.row = (temp / 8) &
-        xd->fullpixel_mask;
-
-      temp = blockd[yoffset  ].bmi.as_mv.first.as_mv.col
-             + blockd[yoffset + 1].bmi.as_mv.first.as_mv.col
-             + blockd[yoffset + 4].bmi.as_mv.first.as_mv.col
-             + blockd[yoffset + 5].bmi.as_mv.first.as_mv.col;
-
-      if (temp < 0) temp -= 4;
-      else temp += 4;
-
-      blockd[uoffset].bmi.as_mv.first.as_mv.col = (temp / 8) &
-        xd->fullpixel_mask;
-
-      blockd[voffset].bmi.as_mv.first.as_mv.row =
-        blockd[uoffset].bmi.as_mv.first.as_mv.row;
-      blockd[voffset].bmi.as_mv.first.as_mv.col =
-        blockd[uoffset].bmi.as_mv.first.as_mv.col;
-
-      if (xd->mode_info_context->mbmi.second_ref_frame) {
-        temp = blockd[yoffset  ].bmi.as_mv.second.as_mv.row
-               + blockd[yoffset + 1].bmi.as_mv.second.as_mv.row
-               + blockd[yoffset + 4].bmi.as_mv.second.as_mv.row
-               + blockd[yoffset + 5].bmi.as_mv.second.as_mv.row;
-
-        if (temp < 0) {
-          temp -= 4;
-        } else {
-          temp += 4;
-        }
-
-        blockd[uoffset].bmi.as_mv.second.as_mv.row = (temp / 8) &
-          xd->fullpixel_mask;
-
-        temp = blockd[yoffset  ].bmi.as_mv.second.as_mv.col
-               + blockd[yoffset + 1].bmi.as_mv.second.as_mv.col
-               + blockd[yoffset + 4].bmi.as_mv.second.as_mv.col
-               + blockd[yoffset + 5].bmi.as_mv.second.as_mv.col;
-
-        if (temp < 0) {
-          temp -= 4;
-        } else {
-          temp += 4;
-        }
-
-        blockd[uoffset].bmi.as_mv.second.as_mv.col = (temp / 8) &
-          xd->fullpixel_mask;
-
-        blockd[voffset].bmi.as_mv.second.as_mv.row =
-          blockd[uoffset].bmi.as_mv.second.as_mv.row;
-        blockd[voffset].bmi.as_mv.second.as_mv.col =
-          blockd[uoffset].bmi.as_mv.second.as_mv.col;
-      }
-    }
-  }
-
-  for (i = 16; i < 24; i += 2) {
-    BLOCKD *d0 = &blockd[i];
-    BLOCKD *d1 = &blockd[i + 1];
-
-    if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
-      build_inter_predictors2b(xd, d0, 8);
-    else {
-      vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict);
-      vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict);
-    }
-
-    if (xd->mode_info_context->mbmi.second_ref_frame) {
-      vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg);
-      vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg);
-    }
-  }
-}
-
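The chroma MVs above are derived, not coded: the four luma MVs covering each 8x8 chroma block are summed and divided by 8 (an average of four, halved again for 4:2:0 subsampling), with a +/-4 bias so the truncating division rounds away from zero; the result is then masked by fullpixel_mask. A small sketch of just the rounding arithmetic, with the masking step omitted:

#include <stdio.h>

static int chroma_mv_component(int y0, int y1, int y2, int y3) {
  int temp = y0 + y1 + y2 + y3;
  if (temp < 0)
    temp -= 4;   /* bias so the truncating /8 rounds away from zero */
  else
    temp += 4;
  return temp / 8;
}

int main(void) {
  printf("%d\n", chroma_mv_component(3, 3, 3, 3));      /* 16/8  -> 2  */
  printf("%d\n", chroma_mv_component(-3, -3, -3, -3));  /* -16/8 -> -2 */
  return 0;
}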
-static void clamp_mv_to_umv_border(MV *mv, const MACROBLOCKD *xd) {
-  /* If the MV points so far into the UMV border that no visible pixels
-   * are used for reconstruction, the subpel part of the MV can be
-   * discarded and the MV limited to 16 pixels with equivalent results.
-   *
-   * This limit kicks in at 19 pixels for the top and left edges, for
-   * the 16 pixels plus 3 taps right of the central pixel when subpel
-   * filtering. The bottom and right edges use 16 pixels plus 2 pixels
-   * left of the central pixel when filtering.
-   */
-  if (mv->col < (xd->mb_to_left_edge - ((16 + INTERP_EXTEND) << 3)))
-    mv->col = xd->mb_to_left_edge - (16 << 3);
-  else if (mv->col > xd->mb_to_right_edge + ((15 + INTERP_EXTEND) << 3))
-    mv->col = xd->mb_to_right_edge + (16 << 3);
-
-  if (mv->row < (xd->mb_to_top_edge - ((16 + INTERP_EXTEND) << 3)))
-    mv->row = xd->mb_to_top_edge - (16 << 3);
-  else if (mv->row > xd->mb_to_bottom_edge + ((15 + INTERP_EXTEND) << 3))
-    mv->row = xd->mb_to_bottom_edge + (16 << 3);
-}
-
-/* A version of the above function for chroma block MVs.*/
-static void clamp_uvmv_to_umv_border(MV *mv, const MACROBLOCKD *xd) {
-  mv->col = (2 * mv->col < (xd->mb_to_left_edge - ((16 + INTERP_EXTEND) << 3))) ?
-            (xd->mb_to_left_edge - (16 << 3)) >> 1 : mv->col;
-  mv->col = (2 * mv->col > xd->mb_to_right_edge + ((15 + INTERP_EXTEND) << 3)) ?
-            (xd->mb_to_right_edge + (16 << 3)) >> 1 : mv->col;
-
-  mv->row = (2 * mv->row < (xd->mb_to_top_edge - ((16 + INTERP_EXTEND) << 3))) ?
-            (xd->mb_to_top_edge - (16 << 3)) >> 1 : mv->row;
-  mv->row = (2 * mv->row > xd->mb_to_bottom_edge + ((15 + INTERP_EXTEND) << 3)) ?
-            (xd->mb_to_bottom_edge + (16 << 3)) >> 1 : mv->row;
-}
-
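clamp_mv_to_umv_border() and its chroma twin above work entirely in eighth-pel units, hence the << 3 on pixel distances: an MV pointing more than 16 + INTERP_EXTEND pixels into the extended border is snapped back to exactly 16 pixels, which the comment notes is visually equivalent. A one-edge, one-component sketch; DEMO_INTERP_EXTEND is a stand-in for the real constant:

#include <stdio.h>

#define DEMO_INTERP_EXTEND 4

static int demo_clamp_left(int mv_col, int mb_to_left_edge) {
  /* past the point where no visible pixel is referenced? */
  if (mv_col < mb_to_left_edge - ((16 + DEMO_INTERP_EXTEND) << 3))
    mv_col = mb_to_left_edge - (16 << 3);  /* snap to a 16-px overhang */
  return mv_col;
}

int main(void) {
  /* an MV reaching 30 px past a left edge at 0 snaps back to 16 px: */
  printf("%d\n", demo_clamp_left(-30 * 8, 0));  /* prints -128 (-16 px) */
  return 0;
}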
-/*encoder only*/
-void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd,
-                                             unsigned char *dst_y,
-                                             int dst_ystride,
-                                             int clamp_mvs) {
-  unsigned char *ptr_base = xd->pre.y_buffer;
-  unsigned char *ptr;
-  int pre_stride = xd->block[0].pre_stride;
-  int_mv ymv;
-
-  ymv.as_int = xd->mode_info_context->mbmi.mv[0].as_int;
-
-  if (clamp_mvs)
-    clamp_mv_to_umv_border(&ymv.as_mv, xd);
-
-  ptr = ptr_base + (ymv.as_mv.row >> 3) * pre_stride + (ymv.as_mv.col >> 3);
-
-#if CONFIG_PRED_FILTER
-  if (xd->mode_info_context->mbmi.pred_filter_enabled) {
-    if ((ymv.as_mv.row | ymv.as_mv.col) & 7) {
-      // Sub-pel filter needs extended input
-      int len = 15 + (INTERP_EXTEND << 1);
-      unsigned char Temp[32 * 32]; // Data required by sub-pel filter
-      unsigned char *pTemp = Temp + (INTERP_EXTEND - 1) * (len + 1);
-
-      // Copy extended MB into Temp array, applying the spatial filter
-      filter_mb(ptr - (INTERP_EXTEND - 1) * (pre_stride + 1), pre_stride,
-                Temp, len, len, len);
-
-      // Sub-pel interpolation
-      xd->subpixel_predict16x16(pTemp, len,
-                                (ymv.as_mv.col & 7) << 1,
-                                (ymv.as_mv.row & 7) << 1,
-                                dst_y, dst_ystride);
-    } else {
-      // Apply spatial filter to create the prediction directly
-      filter_mb(ptr, pre_stride, dst_y, dst_ystride, 16, 16);
-    }
-  } else
-#endif
-    if ((ymv.as_mv.row | ymv.as_mv.col) & 7) {
-      xd->subpixel_predict16x16(ptr, pre_stride,
-                                (ymv.as_mv.col & 7) << 1,
-                                (ymv.as_mv.row & 7) << 1,
-                                dst_y, dst_ystride);
-    } else {
-      vp9_copy_mem16x16(ptr, pre_stride, dst_y, dst_ystride);
-    }
-}
-
-void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
-                                              unsigned char *dst_u,
-                                              unsigned char *dst_v,
-                                              int dst_uvstride) {
-  int offset;
-  unsigned char *uptr, *vptr;
-  int pre_stride = xd->block[0].pre_stride;
-  int_mv _o16x16mv;
-  int_mv _16x16mv;
-
-  _16x16mv.as_int = xd->mode_info_context->mbmi.mv[0].as_int;
-
-  if (xd->mode_info_context->mbmi.need_to_clamp_mvs)
-    clamp_mv_to_umv_border(&_16x16mv.as_mv, xd);
-
-  _o16x16mv = _16x16mv;
-  /* calc uv motion vectors */
-  if (_16x16mv.as_mv.row < 0)
-    _16x16mv.as_mv.row -= 1;
-  else
-    _16x16mv.as_mv.row += 1;
-
-  if (_16x16mv.as_mv.col < 0)
-    _16x16mv.as_mv.col -= 1;
-  else
-    _16x16mv.as_mv.col += 1;
-
-  _16x16mv.as_mv.row /= 2;
-  _16x16mv.as_mv.col /= 2;
-
-  _16x16mv.as_mv.row &= xd->fullpixel_mask;
-  _16x16mv.as_mv.col &= xd->fullpixel_mask;
-
-  pre_stride >>= 1;
-  offset = (_16x16mv.as_mv.row >> 3) * pre_stride + (_16x16mv.as_mv.col >> 3);
-  uptr = xd->pre.u_buffer + offset;
-  vptr = xd->pre.v_buffer + offset;
-
-#if CONFIG_PRED_FILTER
-  if (xd->mode_info_context->mbmi.pred_filter_enabled) {
-    int i;
-    unsigned char *pSrc = uptr;
-    unsigned char *pDst = dst_u;
-    int len = 7 + (INTERP_EXTEND << 1);
-    unsigned char Temp[32 * 32]; // Data required by the sub-pel filter
-    unsigned char *pTemp = Temp + (INTERP_EXTEND - 1) * (len + 1);
-
-    // U & V
-    for (i = 0; i < 2; i++) {
-      if (_o16x16mv.as_int & 0x000f000f) {
-        // Copy extended MB into Temp array, applying the spatial filter
-        filter_mb(pSrc - (INTERP_EXTEND - 1) * (pre_stride + 1), pre_stride,
-                  Temp, len, len, len);
-
-        // Sub-pel filter
-        xd->subpixel_predict8x8(pTemp, len,
-                                _o16x16mv.as_mv.col & 15,
-                                _o16x16mv.as_mv.row & 15,
-                                pDst, dst_uvstride);
-      } else {
-        filter_mb(pSrc, pre_stride, pDst, dst_uvstride, 8, 8);
-      }
-
-      // V
-      pSrc = vptr;
-      pDst = dst_v;
-    }
-  } else
-#endif
-    if (_o16x16mv.as_int & 0x000f000f) {
-      xd->subpixel_predict8x8(uptr, pre_stride, _o16x16mv.as_mv.col & 15,
-                              _o16x16mv.as_mv.row & 15, dst_u, dst_uvstride);
-      xd->subpixel_predict8x8(vptr, pre_stride, _o16x16mv.as_mv.col & 15,
-                              _o16x16mv.as_mv.row & 15, dst_v, dst_uvstride);
-    } else {
-      vp9_copy_mem8x8(uptr, pre_stride, dst_u, dst_uvstride);
-      vp9_copy_mem8x8(vptr, pre_stride, dst_v, dst_uvstride);
-    }
-}
-
-
-void vp9_build_1st_inter16x16_predictors_mb(MACROBLOCKD *xd,
-                                            unsigned char *dst_y,
-                                            unsigned char *dst_u,
-                                            unsigned char *dst_v,
-                                            int dst_ystride, int dst_uvstride) {
-  vp9_build_1st_inter16x16_predictors_mby(xd, dst_y, dst_ystride,
-      xd->mode_info_context->mbmi.need_to_clamp_mvs);
-  vp9_build_1st_inter16x16_predictors_mbuv(xd, dst_u, dst_v, dst_uvstride);
-}
-
-#if CONFIG_SUPERBLOCKS
-void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x,
-                                        unsigned char *dst_y,
-                                        unsigned char *dst_u,
-                                        unsigned char *dst_v,
-                                        int dst_ystride,
-                                        int dst_uvstride) {
-  uint8_t *y1 = x->pre.y_buffer, *u1 = x->pre.u_buffer, *v1 = x->pre.v_buffer;
-  uint8_t *y2 = x->second_pre.y_buffer, *u2 = x->second_pre.u_buffer,
-          *v2 = x->second_pre.v_buffer;
-  int n;
-
-  for (n = 0; n < 4; n++) {
-    const int x_idx = n & 1, y_idx = n >> 1;
-
-    x->pre.y_buffer = y1 + y_idx * 16 * x->pre.y_stride  + x_idx * 16;
-    x->pre.u_buffer = u1 + y_idx *  8 * x->pre.uv_stride + x_idx *  8;
-    x->pre.v_buffer = v1 + y_idx *  8 * x->pre.uv_stride + x_idx *  8;
-
-    vp9_build_1st_inter16x16_predictors_mb(x,
-      dst_y + y_idx * 16 * dst_ystride  + x_idx * 16,
-      dst_u + y_idx *  8 * dst_uvstride + x_idx *  8,
-      dst_v + y_idx *  8 * dst_uvstride + x_idx *  8,
-      dst_ystride, dst_uvstride);
-    if (x->mode_info_context->mbmi.second_ref_frame) {
-      x->second_pre.y_buffer = y2 + y_idx * 16 * x->pre.y_stride  + x_idx * 16;
-      x->second_pre.u_buffer = u2 + y_idx *  8 * x->pre.uv_stride + x_idx *  8;
-      x->second_pre.v_buffer = v2 + y_idx *  8 * x->pre.uv_stride + x_idx *  8;
-
-      vp9_build_2nd_inter16x16_predictors_mb(x,
-        dst_y + y_idx * 16 * dst_ystride  + x_idx * 16,
-        dst_u + y_idx *  8 * dst_uvstride + x_idx *  8,
-        dst_v + y_idx *  8 * dst_uvstride + x_idx *  8,
-        dst_ystride, dst_uvstride);
-    }
-  }
-
-  x->pre.y_buffer = y1;
-  x->pre.u_buffer = u1;
-  x->pre.v_buffer = v1;
-
-  if (x->mode_info_context->mbmi.second_ref_frame) {
-    x->second_pre.y_buffer = y2;
-    x->second_pre.u_buffer = u2;
-    x->second_pre.v_buffer = v2;
-  }
-}
-#endif
-
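The superblock builder above tiles a 32x32 prediction as four 16x16 calls, using bit 0 of the loop counter as the column index and bit 1 as the row index before restoring the saved base pointers. The indexing in isolation:

#include <stdio.h>

int main(void) {
  int n;
  for (n = 0; n < 4; n++) {
    const int x_idx = n & 1, y_idx = n >> 1;  /* walk the 2x2 quadrants */
    printf("quadrant %d: x offset %2d, y offset %2d\n",
           n, x_idx * 16, y_idx * 16);
  }
  return 0;
}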
-/*
- * The following functions should be called after an initial
- * call to vp9_build_1st_inter16x16_predictors_mb() or _mby()/_mbuv().
- * They run a second subpel filter on a (different) ref
- * frame and average the result with the output of the
- * first filter. The second reference frame is stored
- * in x->second_pre (the reference frame index is in
- * x->mode_info_context->mbmi.second_ref_frame). The second
- * motion vector is x->mode_info_context->mbmi.second_mv.
- *
- * This allows blending prediction from two reference frames,
- * which sometimes gives better prediction than a
- * single reference frame.
- */
-void vp9_build_2nd_inter16x16_predictors_mby(MACROBLOCKD *xd,
-                                             unsigned char *dst_y,
-                                             int dst_ystride) {
-  unsigned char *ptr;
-
-  int_mv _16x16mv;
-  int mv_row;
-  int mv_col;
-
-  unsigned char *ptr_base = xd->second_pre.y_buffer;
-  int pre_stride = xd->block[0].pre_stride;
-
-  _16x16mv.as_int = xd->mode_info_context->mbmi.mv[1].as_int;
-
-  if (xd->mode_info_context->mbmi.need_to_clamp_secondmv)
-    clamp_mv_to_umv_border(&_16x16mv.as_mv, xd);
-
-  mv_row = _16x16mv.as_mv.row;
-  mv_col = _16x16mv.as_mv.col;
-
-  ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
-
-#if CONFIG_PRED_FILTER
-  if (xd->mode_info_context->mbmi.pred_filter_enabled) {
-    if ((mv_row | mv_col) & 7) {
-      // Sub-pel filter needs extended input
-      int len = 15 + (INTERP_EXTEND << 1);
-      unsigned char Temp[32 * 32]; // Data required by sub-pel filter
-      unsigned char *pTemp = Temp + (INTERP_EXTEND - 1) * (len + 1);
-
-      // Copy extended MB into Temp array, applying the spatial filter
-      filter_mb(ptr - (INTERP_EXTEND - 1) * (pre_stride + 1), pre_stride,
-                Temp, len, len, len);
-
-      // Sub-pel filter
-      xd->subpixel_predict_avg16x16(pTemp, len, (mv_col & 7) << 1,
-                                    (mv_row & 7) << 1, dst_y, dst_ystride);
-    } else {
-      // TODO Needs to AVERAGE with the dst_y
-      // For now, do not apply the prediction filter in these cases!
-      vp9_avg_mem16x16(ptr, pre_stride, dst_y, dst_ystride);
-    }
-  } else
-#endif  // CONFIG_PRED_FILTER
-  {
-    if ((mv_row | mv_col) & 7) {
-      xd->subpixel_predict_avg16x16(ptr, pre_stride, (mv_col & 7) << 1,
-                                    (mv_row & 7) << 1, dst_y, dst_ystride);
-    } else {
-      vp9_avg_mem16x16(ptr, pre_stride, dst_y, dst_ystride);
-    }
-  }
-}
-
-void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
-                                              unsigned char *dst_u,
-                                              unsigned char *dst_v,
-                                              int dst_uvstride) {
-  int offset;
-  unsigned char *uptr, *vptr;
-
-  int_mv _16x16mv;
-  int mv_row;
-  int mv_col;
-  int omv_row, omv_col;
-
-  int pre_stride = xd->block[0].pre_stride;
-
-  _16x16mv.as_int = xd->mode_info_context->mbmi.mv[1].as_int;
-
-  if (xd->mode_info_context->mbmi.need_to_clamp_secondmv)
-    clamp_mv_to_umv_border(&_16x16mv.as_mv, xd);
-
-  mv_row = _16x16mv.as_mv.row;
-  mv_col = _16x16mv.as_mv.col;
-
-  /* calc uv motion vectors */
-  omv_row = mv_row;
-  omv_col = mv_col;
-  mv_row = (mv_row + (mv_row > 0)) >> 1;
-  mv_col = (mv_col + (mv_col > 0)) >> 1;
-
-  mv_row &= xd->fullpixel_mask;
-  mv_col &= xd->fullpixel_mask;
-
-  pre_stride >>= 1;
-  offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
-  uptr = xd->second_pre.u_buffer + offset;
-  vptr = xd->second_pre.v_buffer + offset;
-
-#if CONFIG_PRED_FILTER
-  if (xd->mode_info_context->mbmi.pred_filter_enabled) {
-    int i;
-    int len = 7 + (INTERP_EXTEND << 1);
-    unsigned char Temp[32 * 32]; // Data required by sub-pel filter
-    unsigned char *pTemp = Temp + (INTERP_EXTEND - 1) * (len + 1);
-    unsigned char *pSrc = uptr;
-    unsigned char *pDst = dst_u;
-
-    // U & V
-    for (i = 0; i < 2; i++) {
-      if ((omv_row | omv_col) & 15) {
-        // Copy extended MB into Temp array, applying the spatial filter
-        filter_mb(pSrc - (INTERP_EXTEND - 1) * (pre_stride + 1), pre_stride,
-                  Temp, len, len, len);
-
-        // Sub-pel filter
-        xd->subpixel_predict_avg8x8(pTemp, len, omv_col & 15,
-                                    omv_row & 15, pDst, dst_uvstride);
-      } else {
-        // TODO Needs to AVERAGE with the dst_[u|v]
-        // For now, do not apply the prediction filter here!
-        vp9_avg_mem8x8(pSrc, pre_stride, pDst, dst_uvstride);
-      }
-
-      // V
-      pSrc = vptr;
-      pDst = dst_v;
-    }
-  } else
-#endif  // CONFIG_PRED_FILTER
-    if ((omv_row | omv_col) & 15) {
-      xd->subpixel_predict_avg8x8(uptr, pre_stride, omv_col & 15,
-                                  omv_row & 15, dst_u, dst_uvstride);
-      xd->subpixel_predict_avg8x8(vptr, pre_stride, omv_col & 15,
-                                  omv_row & 15, dst_v, dst_uvstride);
-    } else {
-      vp9_avg_mem8x8(uptr, pre_stride, dst_u, dst_uvstride);
-      vp9_avg_mem8x8(vptr, pre_stride, dst_v, dst_uvstride);
-    }
-}
-
-void vp9_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *xd,
-                                            unsigned char *dst_y,
-                                            unsigned char *dst_u,
-                                            unsigned char *dst_v,
-                                            int dst_ystride,
-                                            int dst_uvstride) {
-  vp9_build_2nd_inter16x16_predictors_mby(xd, dst_y, dst_ystride);
-  vp9_build_2nd_inter16x16_predictors_mbuv(xd, dst_u, dst_v, dst_uvstride);
-}
-
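Worth noting: the first-predictor UV path halves the luma MV by adding +/-1 and dividing by 2, while the second-predictor path above writes it as (mv + (mv > 0)) >> 1. A small sketch verifying the two idioms are the same round-away-from-zero halving, assuming arithmetic right shift on negative values as the code itself does:

#include <assert.h>

static int halve_v1(int mv) {   /* +/-1 bias, then truncating /2 */
  if (mv < 0) mv -= 1; else mv += 1;
  return mv / 2;
}

static int halve_v2(int mv) {   /* bias by sign bit, then >> 1 */
  return (mv + (mv > 0)) >> 1;
}

int main(void) {
  int mv;
  for (mv = -9; mv <= 9; mv++)
    assert(halve_v1(mv) == halve_v2(mv));
  return 0;
}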
-static void build_inter4x4_predictors_mb(MACROBLOCKD *xd) {
-  int i;
-  MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
-  BLOCKD *blockd = xd->block;
-
-  if (xd->mode_info_context->mbmi.partitioning != PARTITIONING_4X4) {
-    blockd[ 0].bmi = xd->mode_info_context->bmi[ 0];
-    blockd[ 2].bmi = xd->mode_info_context->bmi[ 2];
-    blockd[ 8].bmi = xd->mode_info_context->bmi[ 8];
-    blockd[10].bmi = xd->mode_info_context->bmi[10];
-
-    if (mbmi->need_to_clamp_mvs) {
-      clamp_mv_to_umv_border(&blockd[ 0].bmi.as_mv.first.as_mv, xd);
-      clamp_mv_to_umv_border(&blockd[ 2].bmi.as_mv.first.as_mv, xd);
-      clamp_mv_to_umv_border(&blockd[ 8].bmi.as_mv.first.as_mv, xd);
-      clamp_mv_to_umv_border(&blockd[10].bmi.as_mv.first.as_mv, xd);
-      if (mbmi->second_ref_frame) {
-        clamp_mv_to_umv_border(&blockd[ 0].bmi.as_mv.second.as_mv, xd);
-        clamp_mv_to_umv_border(&blockd[ 2].bmi.as_mv.second.as_mv, xd);
-        clamp_mv_to_umv_border(&blockd[ 8].bmi.as_mv.second.as_mv, xd);
-        clamp_mv_to_umv_border(&blockd[10].bmi.as_mv.second.as_mv, xd);
-      }
-    }
-
-
-    vp9_build_inter_predictors4b(xd, &blockd[ 0], 16);
-    vp9_build_inter_predictors4b(xd, &blockd[ 2], 16);
-    vp9_build_inter_predictors4b(xd, &blockd[ 8], 16);
-    vp9_build_inter_predictors4b(xd, &blockd[10], 16);
-
-    if (mbmi->second_ref_frame) {
-      vp9_build_2nd_inter_predictors4b(xd, &blockd[ 0], 16);
-      vp9_build_2nd_inter_predictors4b(xd, &blockd[ 2], 16);
-      vp9_build_2nd_inter_predictors4b(xd, &blockd[ 8], 16);
-      vp9_build_2nd_inter_predictors4b(xd, &blockd[10], 16);
-    }
-  } else {
-    for (i = 0; i < 16; i += 2) {
-      BLOCKD *d0 = &blockd[i];
-      BLOCKD *d1 = &blockd[i + 1];
-
-      blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0];
-      blockd[i + 1].bmi = xd->mode_info_context->bmi[i + 1];
-
-      if (mbmi->need_to_clamp_mvs) {
-        clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv.first.as_mv, xd);
-        clamp_mv_to_umv_border(&blockd[i + 1].bmi.as_mv.first.as_mv, xd);
-        if (mbmi->second_ref_frame) {
-          clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv.second.as_mv, xd);
-          clamp_mv_to_umv_border(&blockd[i + 1].bmi.as_mv.second.as_mv, xd);
-        }
-      }
-
-      if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
-        build_inter_predictors2b(xd, d0, 16);
-      else {
-        vp9_build_inter_predictors_b(d0, 16, xd->subpixel_predict);
-        vp9_build_inter_predictors_b(d1, 16, xd->subpixel_predict);
-      }
-
-      if (mbmi->second_ref_frame) {
-        vp9_build_2nd_inter_predictors_b(d0, 16, xd->subpixel_predict_avg);
-        vp9_build_2nd_inter_predictors_b(d1, 16, xd->subpixel_predict_avg);
-      }
-    }
-  }
-
-  for (i = 16; i < 24; i += 2) {
-    BLOCKD *d0 = &blockd[i];
-    BLOCKD *d1 = &blockd[i + 1];
-
-    if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
-      build_inter_predictors2b(xd, d0, 8);
-    else {
-      vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict);
-      vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict);
-    }
-
-    if (mbmi->second_ref_frame) {
-      vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg);
-      vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg);
-    }
-  }
-}
-
-static void build_4x4uvmvs(MACROBLOCKD *xd) {
-  int i, j;
-  BLOCKD *blockd = xd->block;
-
-  for (i = 0; i < 2; i++) {
-    for (j = 0; j < 2; j++) {
-      int yoffset = i * 8 + j * 2;
-      int uoffset = 16 + i * 2 + j;
-      int voffset = 20 + i * 2 + j;
-
-      int temp;
-
-      temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.first.as_mv.row
-             + xd->mode_info_context->bmi[yoffset + 1].as_mv.first.as_mv.row
-             + xd->mode_info_context->bmi[yoffset + 4].as_mv.first.as_mv.row
-             + xd->mode_info_context->bmi[yoffset + 5].as_mv.first.as_mv.row;
-
-      if (temp < 0) temp -= 4;
-      else temp += 4;
-
-      blockd[uoffset].bmi.as_mv.first.as_mv.row = (temp / 8) &
-                                                  xd->fullpixel_mask;
-
-      temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.first.as_mv.col
-             + xd->mode_info_context->bmi[yoffset + 1].as_mv.first.as_mv.col
-             + xd->mode_info_context->bmi[yoffset + 4].as_mv.first.as_mv.col
-             + xd->mode_info_context->bmi[yoffset + 5].as_mv.first.as_mv.col;
-
-      if (temp < 0) temp -= 4;
-      else temp += 4;
-
-      blockd[uoffset].bmi.as_mv.first.as_mv.col = (temp / 8) &
-        xd->fullpixel_mask;
-
-      // if (x->mode_info_context->mbmi.need_to_clamp_mvs)
-      clamp_uvmv_to_umv_border(&blockd[uoffset].bmi.as_mv.first.as_mv, xd);
-
-      blockd[voffset].bmi.as_mv.first.as_mv.row =
-        blockd[uoffset].bmi.as_mv.first.as_mv.row;
-      blockd[voffset].bmi.as_mv.first.as_mv.col =
-        blockd[uoffset].bmi.as_mv.first.as_mv.col;
-
-      if (xd->mode_info_context->mbmi.second_ref_frame) {
-        temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.second.as_mv.row
-               + xd->mode_info_context->bmi[yoffset + 1].as_mv.second.as_mv.row
-               + xd->mode_info_context->bmi[yoffset + 4].as_mv.second.as_mv.row
-               + xd->mode_info_context->bmi[yoffset + 5].as_mv.second.as_mv.row;
-
-        if (temp < 0) {
-          temp -= 4;
-        } else {
-          temp += 4;
-        }
-
-        blockd[uoffset].bmi.as_mv.second.as_mv.row = (temp / 8) &
-                                                     xd->fullpixel_mask;
-
-        temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.second.as_mv.col
-               + xd->mode_info_context->bmi[yoffset + 1].as_mv.second.as_mv.col
-               + xd->mode_info_context->bmi[yoffset + 4].as_mv.second.as_mv.col
-               + xd->mode_info_context->bmi[yoffset + 5].as_mv.second.as_mv.col;
-
-        if (temp < 0) {
-          temp -= 4;
-        } else {
-          temp += 4;
-        }
-
-        blockd[uoffset].bmi.as_mv.second.as_mv.col = (temp / 8) &
-                                                     xd->fullpixel_mask;
-
-        // if (mbmi->need_to_clamp_mvs)
-        clamp_uvmv_to_umv_border(
-          &blockd[uoffset].bmi.as_mv.second.as_mv, xd);
-
-        blockd[voffset].bmi.as_mv.second.as_mv.row =
-          blockd[uoffset].bmi.as_mv.second.as_mv.row;
-        blockd[voffset].bmi.as_mv.second.as_mv.col =
-          blockd[uoffset].bmi.as_mv.second.as_mv.col;
-      }
-    }
-  }
-}
-
-void vp9_build_inter_predictors_mb(MACROBLOCKD *xd) {
-  if (xd->mode_info_context->mbmi.mode != SPLITMV) {
-    vp9_build_1st_inter16x16_predictors_mb(xd, xd->predictor,
-                                           &xd->predictor[256],
-                                           &xd->predictor[320], 16, 8);
-
-    if (xd->mode_info_context->mbmi.second_ref_frame) {
-      /* 256 = offset of U plane in Y+U+V buffer;
-       * 320 = offset of V plane in Y+U+V buffer.
-       * (256=16x16, 320=16x16+8x8). */
-      vp9_build_2nd_inter16x16_predictors_mb(xd, xd->predictor,
-                                             &xd->predictor[256],
-                                             &xd->predictor[320], 16, 8);
-    }
-  } else {
-    build_4x4uvmvs(xd);
-    build_inter4x4_predictors_mb(xd);
-  }
-}
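The 256/320 offsets explained in the comment above come straight from the packed predictor layout: a 16x16 Y plane followed by two 8x8 chroma planes in one contiguous buffer. A trivial check of that arithmetic:

#include <assert.h>

int main(void) {
  assert(16 * 16 == 256);           /* U starts after the Y plane  */
  assert(16 * 16 + 8 * 8 == 320);   /* V starts after Y + U        */
  assert(320 + 8 * 8 == 384);       /* total packed predictor size */
  return 0;
}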
--- a/vp8/common/reconinter.h
+++ /dev/null
@@ -1,78 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef __INC_RECONINTER_H
-#define __INC_RECONINTER_H
-
-#include "onyxc_int.h"
-
-extern void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd,
-                                                    unsigned char *dst_y,
-                                                    int dst_ystride,
-                                                    int clamp_mvs);
-
-extern void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
-                                                     unsigned char *dst_u,
-                                                     unsigned char *dst_v,
-                                                     int dst_uvstride);
-
-extern void vp9_build_1st_inter16x16_predictors_mb(MACROBLOCKD *xd,
-                                                   unsigned char *dst_y,
-                                                   unsigned char *dst_u,
-                                                   unsigned char *dst_v,
-                                                   int dst_ystride,
-                                                   int dst_uvstride);
-
-extern void vp9_build_2nd_inter16x16_predictors_mby(MACROBLOCKD *xd,
-                                                    unsigned char *dst_y,
-                                                    int dst_ystride);
-
-extern void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
-                                                     unsigned char *dst_u,
-                                                     unsigned char *dst_v,
-                                                     int dst_uvstride);
-
-extern void vp9_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *xd,
-                                                   unsigned char *dst_y,
-                                                   unsigned char *dst_u,
-                                                   unsigned char *dst_v,
-                                                   int dst_ystride,
-                                                   int dst_uvstride);
-
-#if CONFIG_SUPERBLOCKS
-extern void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x,
-                                               unsigned char *dst_y,
-                                               unsigned char *dst_u,
-                                               unsigned char *dst_v,
-                                               int dst_ystride,
-                                               int dst_uvstride);
-#endif
-
-extern void vp9_build_inter_predictors_mb(MACROBLOCKD *xd);
-
-extern void vp9_build_inter_predictors_b(BLOCKD *d, int pitch,
-                                         vp9_subpix_fn_t sppf);
-
-extern void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch,
-                                             vp9_subpix_fn_t sppf);
-
-extern void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d,
-                                         int pitch);
-
-extern void vp9_build_2nd_inter_predictors4b(MACROBLOCKD *xd,
-                                             BLOCKD *d, int pitch);
-
-extern void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd);
-
-extern void vp9_setup_interp_filters(MACROBLOCKD *xd,
-                                     INTERPOLATIONFILTERTYPE filter,
-                                     VP9_COMMON *cm);
-
-#endif  // __INC_RECONINTER_H
--- a/vp8/common/reconintra.c
+++ /dev/null
@@ -1,490 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stdio.h>
-#include "vpx_ports/config.h"
-#include "vpx_rtcd.h"
-#include "reconintra.h"
-#include "vpx_mem/vpx_mem.h"
-
-/* For skip_recon_mb(), add vp9_build_intra_predictors_mby_s(MACROBLOCKD *xd)
- * and vp9_build_intra_predictors_mbuv_s(MACROBLOCKD *xd).
- */
-
-static void d27_predictor(uint8_t *ypred_ptr, int y_stride, int n,
-                          uint8_t *yabove_row, uint8_t *yleft_col) {
-  int r, c, h, w, v;
-  int a, b;
-  r = 0;
-  for (c = 0; c < n - 2; c++) {
-    if (c & 1)
-      a = yleft_col[r + 1];
-    else
-      a = (yleft_col[r] + yleft_col[r + 1] + 1) >> 1;
-    b = yabove_row[c + 2];
-    ypred_ptr[c] = (2 * a + (c + 1) * b + (c + 3) / 2) / (c + 3);
-  }
-  for (r = 1; r < n / 2 - 1; r++) {
-    for (c = 0; c < n - 2 - 2 * r; c++) {
-      if (c & 1)
-        a = yleft_col[r + 1];
-      else
-        a = (yleft_col[r] + yleft_col[r + 1] + 1) >> 1;
-      b = ypred_ptr[(r - 1) * y_stride + c + 2];
-      ypred_ptr[r * y_stride + c] = (2 * a + (c + 1) * b + (c + 3) / 2) / (c + 3);
-    }
-  }
-  for (; r < n - 1; ++r) {
-    for (c = 0; c < n; c++) {
-      v = (c & 1 ? yleft_col[r + 1] : (yleft_col[r] + yleft_col[r + 1] + 1) >> 1);
-      h = r - c / 2;
-      ypred_ptr[h * y_stride + c] = v;
-    }
-  }
-  c = 0;
-  r = n - 1;
-  ypred_ptr[r * y_stride] = (ypred_ptr[(r - 1) * y_stride] +
-                             yleft_col[r] + 1) >> 1;
-  for (r = n - 2; r >= n / 2; --r) {
-    w = c + (n - 1 - r) * 2;
-    ypred_ptr[r * y_stride + w] = (ypred_ptr[(r - 1) * y_stride + w] +
-                                   ypred_ptr[r * y_stride + w - 1] + 1) >> 1;
-  }
-  for (c = 1; c < n; c++) {
-    for (r = n - 1; r >= n / 2 + c / 2; --r) {
-      w = c + (n - 1 - r) * 2;
-      ypred_ptr[r * y_stride + w] = (ypred_ptr[(r - 1) * y_stride + w] +
-                                     ypred_ptr[r * y_stride + w - 1] + 1) >> 1;
-    }
-  }
-}
-
-static void d63_predictor(uint8_t *ypred_ptr, int y_stride, int n,
-                          uint8_t *yabove_row, uint8_t *yleft_col) {
-  int r, c, h, w, v;
-  int a, b;
-  c = 0;
-  for (r = 0; r < n - 2; r++) {
-    if (r & 1)
-      a = yabove_row[c + 1];
-    else
-      a = (yabove_row[c] + yabove_row[c + 1] + 1) >> 1;
-    b = yleft_col[r + 2];
-    ypred_ptr[r * y_stride] = (2 * a + (r + 1) * b + (r + 3) / 2) / (r + 3);
-  }
-  for (c = 1; c < n / 2 - 1; c++) {
-    for (r = 0; r < n - 2 - 2 * c; r++) {
-      if (r & 1)
-        a = yabove_row[c + 1];
-      else
-        a = (yabove_row[c] + yabove_row[c + 1] + 1) >> 1;
-      b = ypred_ptr[(r + 2) * y_stride + c - 1];
-      ypred_ptr[r * y_stride + c] = (2 * a + (c + 1) * b + (c + 3) / 2) / (c + 3);
-    }
-  }
-  for (; c < n - 1; ++c) {
-    for (r = 0; r < n; r++) {
-      v = (r & 1 ? yabove_row[c + 1] : (yabove_row[c] + yabove_row[c + 1] + 1) >> 1);
-      w = c - r / 2;
-      ypred_ptr[r * y_stride + w] = v;
-    }
-  }
-  r = 0;
-  c = n - 1;
-  ypred_ptr[c] = (ypred_ptr[(c - 1)] + yabove_row[c] + 1) >> 1;
-  for (c = n - 2; c >= n / 2; --c) {
-    h = r + (n - 1 - c) * 2;
-    ypred_ptr[h * y_stride + c] = (ypred_ptr[h * y_stride + c - 1] +
-                                   ypred_ptr[(h - 1) * y_stride + c] + 1) >> 1;
-  }
-  for (r = 1; r < n; r++) {
-    for (c = n - 1; c >= n / 2 + r / 2; --c) {
-      h = r + (n - 1 - c) * 2;
-      ypred_ptr[h * y_stride + c] = (ypred_ptr[h * y_stride + c - 1] +
-                                     ypred_ptr[(h - 1) * y_stride + c] + 1) >> 1;
-    }
-  }
-}
-
-static void d45_predictor(uint8_t *ypred_ptr, int y_stride, int n,
-                          uint8_t *yabove_row, uint8_t *yleft_col) {
-  int r, c;
-  for (r = 0; r < n - 1; ++r) {
-    for (c = 0; c <= r; ++c) {
-      ypred_ptr[(r - c) * y_stride + c] =
-        (yabove_row[r + 1] * (c + 1) +
-         yleft_col[r + 1] * (r - c + 1) + r / 2 + 1) / (r + 2);
-    }
-  }
-  for (c = 0; c <= r; ++c) {
-    int yabove_ext = yabove_row[r]; // 2*yabove_row[r] - yabove_row[r-1];
-    int yleft_ext = yleft_col[r]; // 2*yleft_col[r] - yleft_col[r-1];
-    yabove_ext = (yabove_ext > 255 ? 255 : (yabove_ext < 0 ? 0 : yabove_ext));
-    yleft_ext = (yleft_ext > 255 ? 255 : (yleft_ext < 0 ? 0 : yleft_ext));
-    ypred_ptr[(r - c) * y_stride + c] =
-      (yabove_ext * (c + 1) +
-       yleft_ext * (r - c + 1) + r / 2 + 1) / (r + 2);
-  }
-  for (r = 1; r < n; ++r) {
-    for (c = n - r; c < n; ++c)
-      ypred_ptr[r * y_stride + c] = (ypred_ptr[(r - 1) * y_stride + c] +
-                                     ypred_ptr[r * y_stride + c - 1] + 1) >> 1;
-  }
-}
-
-static void d117_predictor(uint8_t *ypred_ptr, int y_stride, int n,
-                           uint8_t *yabove_row, uint8_t *yleft_col) {
-  int r, c;
-  for (c = 0; c < n; c++)
-    ypred_ptr[c] = (yabove_row[c - 1] + yabove_row[c] + 1) >> 1;
-  ypred_ptr += y_stride;
-  for (c = 0; c < n; c++)
-    ypred_ptr[c] = yabove_row[c - 1];
-  ypred_ptr += y_stride;
-  for (r = 2; r < n; ++r) {
-    ypred_ptr[0] = yleft_col[r - 2];
-    for (c = 1; c < n; c++)
-      ypred_ptr[c] = ypred_ptr[-2 * y_stride + c - 1];
-    ypred_ptr += y_stride;
-  }
-}
-
-static void d135_predictor(uint8_t *ypred_ptr, int y_stride, int n,
-                           uint8_t *yabove_row, uint8_t *yleft_col) {
-  int r, c;
-  ypred_ptr[0] = yabove_row[-1];
-  for (c = 1; c < n; c++)
-    ypred_ptr[c] = yabove_row[c - 1];
-  for (r = 1; r < n; ++r)
-    ypred_ptr[r * y_stride] = yleft_col[r - 1];
-
-  ypred_ptr += y_stride;
-  for (r = 1; r < n; ++r) {
-    for (c = 1; c < n; c++) {
-      ypred_ptr[c] = ypred_ptr[-y_stride + c - 1];
-    }
-    ypred_ptr += y_stride;
-  }
-}
-
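Of the directional predictors above, D135 is the easiest to see whole: every pixel copies its up-left neighbour, so the shifted above row and left column propagate down the 135-degree diagonal. A self-contained 4x4 demo with made-up border samples:

#include <stdio.h>

int main(void) {
  int above[5] = { 9, 1, 2, 3, 4 };  /* above[0] plays yabove_row[-1] */
  int left[4]  = { 5, 6, 7, 8 };     /* plays yleft_col[0..3] */
  int pred[4][4];
  int r, c;

  pred[0][0] = above[0];                             /* top-left corner  */
  for (c = 1; c < 4; c++) pred[0][c] = above[c];     /* shifted top row  */
  for (r = 1; r < 4; r++) pred[r][0] = left[r - 1];  /* shifted left col */
  for (r = 1; r < 4; r++)
    for (c = 1; c < 4; c++)
      pred[r][c] = pred[r - 1][c - 1];               /* copy up-left */

  for (r = 0; r < 4; r++) {
    for (c = 0; c < 4; c++)
      printf("%2d ", pred[r][c]);
    printf("\n");
  }
  return 0;
}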
-static void d153_predictor(uint8_t *ypred_ptr, int y_stride, int n,
-                           uint8_t *yabove_row, uint8_t *yleft_col) {
-  int r, c;
-  ypred_ptr[0] = (yabove_row[-1] + yleft_col[0] + 1) >> 1;
-  for (r = 1; r < n; r++)
-    ypred_ptr[r * y_stride] = (yleft_col[r - 1] + yleft_col[r] + 1) >> 1;
-  ypred_ptr++;
-  ypred_ptr[0] = yabove_row[-1];
-  for (r = 1; r < n; r++)
-    ypred_ptr[r * y_stride] = yleft_col[r - 1];
-  ypred_ptr++;
-
-  for (c = 0; c < n - 2; c++)
-    ypred_ptr[c] = yabove_row[c];
-  ypred_ptr += y_stride;
-  for (r = 1; r < n; ++r) {
-    for (c = 0; c < n - 2; c++)
-      ypred_ptr[c] = ypred_ptr[-y_stride + c - 2];
-    ypred_ptr += y_stride;
-  }
-}
-
-void vp9_recon_intra_mbuv(MACROBLOCKD *xd) {
-  int i;
-
-  for (i = 16; i < 24; i += 2) {
-    BLOCKD *b = &xd->block[i];
-    vp9_recon2b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-  }
-}
-
-void vp9_build_intra_predictors_internal(unsigned char *src, int src_stride,
-                                         unsigned char *ypred_ptr,
-                                         int y_stride, int mode, int bsize,
-                                         int up_available, int left_available) {
-
-  unsigned char *yabove_row = src - src_stride;
-  unsigned char yleft_col[32];
-  unsigned char ytop_left = yabove_row[-1];
-  int r, c, i;
-
-  for (i = 0; i < bsize; i++) {
-    yleft_col[i] = src[i * src_stride - 1];
-  }
-
-  /* for Y */
-  switch (mode) {
-    case DC_PRED: {
-      int expected_dc;
-      int i;
-      int shift;
-      int average = 0;
-      int log2_bsize_minus_1;
-
-      assert(bsize == 4 || bsize == 8 || bsize == 16 || bsize == 32);
-      if (bsize == 4) {
-        log2_bsize_minus_1 = 1;
-      } else if (bsize == 8) {
-        log2_bsize_minus_1 = 2;
-      } else if (bsize == 16) {
-        log2_bsize_minus_1 = 3;
-      } else /* bsize == 32 */ {
-        log2_bsize_minus_1 = 4;
-      }
-
-      if (up_available || left_available) {
-        if (up_available) {
-          for (i = 0; i < bsize; i++) {
-            average += yabove_row[i];
-          }
-        }
-
-        if (left_available) {
-          for (i = 0; i < bsize; i++) {
-            average += yleft_col[i];
-          }
-        }
-        shift = log2_bsize_minus_1 + up_available + left_available;
-        expected_dc = (average + (1 << (shift - 1))) >> shift;
-      } else {
-        expected_dc = 128;
-      }
-
-      for (r = 0; r < bsize; r++) {
-        vpx_memset(ypred_ptr, expected_dc, bsize);
-        ypred_ptr += y_stride;
-      }
-    }
-    break;
-    case V_PRED: {
-      for (r = 0; r < bsize; r++) {
-        memcpy(ypred_ptr, yabove_row, bsize);
-        ypred_ptr += y_stride;
-      }
-    }
-    break;
-    case H_PRED: {
-      for (r = 0; r < bsize; r++) {
-        vpx_memset(ypred_ptr, yleft_col[r], bsize);
-        ypred_ptr += y_stride;
-      }
-    }
-    break;
-    case TM_PRED: {
-      for (r = 0; r < bsize; r++) {
-        for (c = 0; c < bsize; c++) {
-          int pred = yleft_col[r] + yabove_row[c] - ytop_left;
-
-          if (pred < 0)
-            pred = 0;
-
-          if (pred > 255)
-            pred = 255;
-
-          ypred_ptr[c] = pred;
-        }
-
-        ypred_ptr += y_stride;
-      }
-    }
-    break;
-    case D45_PRED: {
-      d45_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
-    }
-    break;
-    case D135_PRED: {
-      d135_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
-    }
-    break;
-    case D117_PRED: {
-      d117_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
-    }
-    break;
-    case D153_PRED: {
-      d153_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
-    }
-    break;
-    case D27_PRED: {
-      d27_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
-    }
-    break;
-    case D63_PRED: {
-      d63_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
-    }
-    break;
-    case I8X8_PRED:
-    case B_PRED:
-    case NEARESTMV:
-    case NEARMV:
-    case ZEROMV:
-    case NEWMV:
-    case SPLITMV:
-    case MB_MODE_COUNT:
-      break;
-  }
-}
-
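The DC_PRED shift arithmetic above is chosen so that 1 << shift equals the number of border samples actually summed: log2(bsize) - 1, plus one for each available edge. A quick check for the 16x16 both-edges case (the values here are illustrative):

#include <assert.h>

int main(void) {
  const int bsize = 16;
  const int log2_bsize_minus_1 = 3;   /* log2(16) - 1 */
  const int up = 1, left = 1;
  const int shift = log2_bsize_minus_1 + up + left;
  const int samples = bsize * (up + left);
  const int sum = 100 * samples;      /* 32 border samples, all 100 */

  assert((1 << shift) == samples);                      /* 32 == 32    */
  assert(((sum + (1 << (shift - 1))) >> shift) == 100); /* rounded avg */
  return 0;
}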
-void vp9_build_intra_predictors_mby(MACROBLOCKD *xd) {
-  vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
-                                      xd->predictor, 16,
-                                      xd->mode_info_context->mbmi.mode, 16,
-                                      xd->up_available, xd->left_available);
-}
-
-void vp9_build_intra_predictors_mby_s(MACROBLOCKD *xd) {
-  vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
-                                      xd->dst.y_buffer, xd->dst.y_stride,
-                                      xd->mode_info_context->mbmi.mode, 16,
-                                      xd->up_available, xd->left_available);
-}
-
-#if CONFIG_SUPERBLOCKS
-void vp9_build_intra_predictors_sby_s(MACROBLOCKD *xd) {
-  vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
-                                      xd->dst.y_buffer, xd->dst.y_stride,
-                                      xd->mode_info_context->mbmi.mode, 32,
-                                      xd->up_available, xd->left_available);
-}
-#endif
-
-#if CONFIG_COMP_INTRA_PRED
-void vp9_build_comp_intra_predictors_mby(MACROBLOCKD *xd) {
-  unsigned char predictor[2][256];
-  int i;
-
-  vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
-                                      predictor[0], 16,
-                                      xd->mode_info_context->mbmi.mode,
-                                      16, xd->up_available,
-                                      xd->left_available);
-  vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
-                                      predictor[1], 16,
-                                      xd->mode_info_context->mbmi.second_mode,
-                                      16, xd->up_available,
-                                      xd->left_available);
-
-  for (i = 0; i < 256; i++) {
-    xd->predictor[i] = (predictor[0][i] + predictor[1][i] + 1) >> 1;
-  }
-}
-#endif
-
-void vp9_build_intra_predictors_mbuv_internal(MACROBLOCKD *xd,
-                                              unsigned char *upred_ptr,
-                                              unsigned char *vpred_ptr,
-                                              int uv_stride,
-                                              int mode, int bsize) {
-  vp9_build_intra_predictors_internal(xd->dst.u_buffer, xd->dst.uv_stride,
-                                      upred_ptr, uv_stride, mode, bsize,
-                                      xd->up_available, xd->left_available);
-  vp9_build_intra_predictors_internal(xd->dst.v_buffer, xd->dst.uv_stride,
-                                      vpred_ptr, uv_stride, mode, bsize,
-                                      xd->up_available, xd->left_available);
-}
-
-void vp9_build_intra_predictors_mbuv(MACROBLOCKD *xd) {
-  vp9_build_intra_predictors_mbuv_internal(xd, &xd->predictor[256],
-                                           &xd->predictor[320], 8,
-                                           xd->mode_info_context->mbmi.uv_mode,
-                                           8);
-}
-
-void vp9_build_intra_predictors_mbuv_s(MACROBLOCKD *xd) {
-  vp9_build_intra_predictors_mbuv_internal(xd, xd->dst.u_buffer,
-                                           xd->dst.v_buffer,
-                                           xd->dst.uv_stride,
-                                           xd->mode_info_context->mbmi.uv_mode,
-                                           8);
-}
-
-#if CONFIG_SUPERBLOCKS
-void vp9_build_intra_predictors_sbuv_s(MACROBLOCKD *xd) {
-  vp9_build_intra_predictors_mbuv_internal(xd, xd->dst.u_buffer,
-                                           xd->dst.v_buffer, xd->dst.uv_stride,
-                                           xd->mode_info_context->mbmi.uv_mode,
-                                           16);
-}
-#endif
-
-#if CONFIG_COMP_INTRA_PRED
-void vp9_build_comp_intra_predictors_mbuv(MACROBLOCKD *xd) {
-  unsigned char predictor[2][2][64];
-  int i;
-
-  vp9_build_intra_predictors_mbuv_internal(
-    xd, predictor[0][0], predictor[1][0], 8,
-    xd->mode_info_context->mbmi.uv_mode, 8);
-  vp9_build_intra_predictors_mbuv_internal(
-    xd, predictor[0][1], predictor[1][1], 8,
-    xd->mode_info_context->mbmi.second_uv_mode, 8);
-  for (i = 0; i < 64; i++) {
-    xd->predictor[256 + i] = (predictor[0][0][i] + predictor[0][1][i] + 1) >> 1;
-    xd->predictor[256 + 64 + i] = (predictor[1][0][i] +
-                                   predictor[1][1][i] + 1) >> 1;
-  }
-}
-#endif
-
-void vp9_intra8x8_predict(BLOCKD *xd,
-                          int mode,
-                          unsigned char *predictor) {
-  vp9_build_intra_predictors_internal(*(xd->base_dst) + xd->dst,
-                                      xd->dst_stride, predictor, 16,
-                                      mode, 8, 1, 1);
-}
-
-#if CONFIG_COMP_INTRA_PRED
-void vp9_comp_intra8x8_predict(BLOCKD *xd,
-                               int mode, int second_mode,
-                               unsigned char *out_predictor) {
-  unsigned char predictor[2][8 * 16];
-  int i, j;
-
-  vp9_intra8x8_predict(xd, mode, predictor[0]);
-  vp9_intra8x8_predict(xd, second_mode, predictor[1]);
-
-  for (i = 0; i < 8 * 16; i += 16) {
-    for (j = i; j < i + 8; j++) {
-      out_predictor[j] = (predictor[0][j] + predictor[1][j] + 1) >> 1;
-    }
-  }
-}
-#endif
-
-void vp9_intra_uv4x4_predict(BLOCKD *xd,
-                             int mode,
-                             unsigned char *predictor) {
-  vp9_build_intra_predictors_internal(*(xd->base_dst) + xd->dst,
-                                      xd->dst_stride, predictor, 8,
-                                      mode, 4, 1, 1);
-}
-
-#if CONFIG_COMP_INTRA_PRED
-void vp9_comp_intra_uv4x4_predict(BLOCKD *xd,
-                                  int mode, int mode2,
-                                  unsigned char *out_predictor) {
-  unsigned char predictor[2][8 * 4];
-  int i, j;
-
-  vp9_intra_uv4x4_predict(xd, mode, predictor[0]);
-  vp9_intra_uv4x4_predict(xd, mode2, predictor[1]);
-
-  for (i = 0; i < 4 * 8; i += 8) {
-    for (j = i; j < i + 4; j++) {
-      out_predictor[j] = (predictor[0][j] + predictor[1][j] + 1) >> 1;
-    }
-  }
-}
-#endif
-
-/* TODO: try different ways of using the Y-UV mode correlation.
- * The current code assumes that a UV 4x4 block uses the same mode
- * as the corresponding Y 8x8 area.
- */
--- a/vp8/common/reconintra.h
+++ /dev/null
@@ -1,18 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef __INC_RECONINTRA_H
-#define __INC_RECONINTRA_H
-
-#include "blockd.h"
-
-extern void init_intra_left_above_pixels(MACROBLOCKD *xd);
-
-#endif  // __INC_RECONINTRA_H
--- a/vp8/common/reconintra4x4.c
+++ /dev/null
@@ -1,321 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "vpx_mem/vpx_mem.h"
-#include "reconintra.h"
-#include "vpx_rtcd.h"
-
-void vp9_intra4x4_predict_c(BLOCKD *x, int b_mode,
-                            unsigned char *predictor) {
-  int i, r, c;
-
-  unsigned char *Above = *(x->base_dst) + x->dst - x->dst_stride;
-  unsigned char Left[4];
-  unsigned char top_left = Above[-1];
-
-  Left[0] = (*(x->base_dst))[x->dst - 1];
-  Left[1] = (*(x->base_dst))[x->dst - 1 + x->dst_stride];
-  Left[2] = (*(x->base_dst))[x->dst - 1 + 2 * x->dst_stride];
-  Left[3] = (*(x->base_dst))[x->dst - 1 + 3 * x->dst_stride];
-
-  switch (b_mode) {
-    case B_DC_PRED: {
-      int expected_dc = 0;
-
-      for (i = 0; i < 4; i++) {
-        expected_dc += Above[i];
-        expected_dc += Left[i];
-      }
-
-      expected_dc = (expected_dc + 4) >> 3;
-
-      for (r = 0; r < 4; r++) {
-        for (c = 0; c < 4; c++) {
-          predictor[c] = expected_dc;
-        }
-
-        predictor += 16;
-      }
-    }
-    break;
-    case B_TM_PRED: {
-      /* prediction similar to true_motion prediction */
-      for (r = 0; r < 4; r++) {
-        for (c = 0; c < 4; c++) {
-          int pred = Above[c] - top_left + Left[r];
-
-          if (pred < 0)
-            pred = 0;
-
-          if (pred > 255)
-            pred = 255;
-
-          predictor[c] = pred;
-        }
-
-        predictor += 16;
-      }
-    }
-    break;
-
-    case B_VE_PRED: {
-      unsigned int ap[4];
-
-      ap[0] = Above[0];
-      ap[1] = Above[1];
-      ap[2] = Above[2];
-      ap[3] = Above[3];
-
-      for (r = 0; r < 4; r++) {
-        for (c = 0; c < 4; c++) {
-          predictor[c] = ap[c];
-        }
-
-        predictor += 16;
-      }
-    }
-    break;
-
-    case B_HE_PRED: {
-
-      unsigned int lp[4];
-      lp[0] = Left[0];
-      lp[1] = Left[1];
-      lp[2] = Left[2];
-      lp[3] = Left[3];
-
-      for (r = 0; r < 4; r++) {
-        for (c = 0; c < 4; c++) {
-          predictor[c] = lp[r];
-        }
-
-        predictor += 16;
-      }
-    }
-    break;
-    case B_LD_PRED: {
-      unsigned char *ptr = Above;
-      predictor[0 * 16 + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2;
-      predictor[0 * 16 + 1] =
-        predictor[1 * 16 + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2;
-      predictor[0 * 16 + 2] =
-        predictor[1 * 16 + 1] =
-          predictor[2 * 16 + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2;
-      predictor[0 * 16 + 3] =
-        predictor[1 * 16 + 2] =
-          predictor[2 * 16 + 1] =
-            predictor[3 * 16 + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2;
-      predictor[1 * 16 + 3] =
-        predictor[2 * 16 + 2] =
-          predictor[3 * 16 + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2;
-      predictor[2 * 16 + 3] =
-        predictor[3 * 16 + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2;
-      predictor[3 * 16 + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2;
-
-    }
-    break;
-    case B_RD_PRED: {
-
-      unsigned char pp[9];
-
-      pp[0] = Left[3];
-      pp[1] = Left[2];
-      pp[2] = Left[1];
-      pp[3] = Left[0];
-      pp[4] = top_left;
-      pp[5] = Above[0];
-      pp[6] = Above[1];
-      pp[7] = Above[2];
-      pp[8] = Above[3];
-
-      predictor[3 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
-      predictor[3 * 16 + 1] =
-        predictor[2 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
-      predictor[3 * 16 + 2] =
-        predictor[2 * 16 + 1] =
-          predictor[1 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
-      predictor[3 * 16 + 3] =
-        predictor[2 * 16 + 2] =
-          predictor[1 * 16 + 1] =
-            predictor[0 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
-      predictor[2 * 16 + 3] =
-        predictor[1 * 16 + 2] =
-          predictor[0 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
-      predictor[1 * 16 + 3] =
-        predictor[0 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
-      predictor[0 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
-
-    }
-    break;
-    case B_VR_PRED: {
-
-      unsigned char pp[9];
-
-      pp[0] = Left[3];
-      pp[1] = Left[2];
-      pp[2] = Left[1];
-      pp[3] = Left[0];
-      pp[4] = top_left;
-      pp[5] = Above[0];
-      pp[6] = Above[1];
-      pp[7] = Above[2];
-      pp[8] = Above[3];
-
-      predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
-      predictor[2 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
-      predictor[3 * 16 + 1] =
-        predictor[1 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
-      predictor[2 * 16 + 1] =
-        predictor[0 * 16 + 0] = (pp[4] + pp[5] + 1) >> 1;
-      predictor[3 * 16 + 2] =
-        predictor[1 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
-      predictor[2 * 16 + 2] =
-        predictor[0 * 16 + 1] = (pp[5] + pp[6] + 1) >> 1;
-      predictor[3 * 16 + 3] =
-        predictor[1 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
-      predictor[2 * 16 + 3] =
-        predictor[0 * 16 + 2] = (pp[6] + pp[7] + 1) >> 1;
-      predictor[1 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
-      predictor[0 * 16 + 3] = (pp[7] + pp[8] + 1) >> 1;
-
-    }
-    break;
-    case B_VL_PRED: {
-
-      unsigned char *pp = Above;
-
-      predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
-      predictor[1 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
-      predictor[2 * 16 + 0] =
-        predictor[0 * 16 + 1] = (pp[1] + pp[2] + 1) >> 1;
-      predictor[1 * 16 + 1] =
-        predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
-      predictor[2 * 16 + 1] =
-        predictor[0 * 16 + 2] = (pp[2] + pp[3] + 1) >> 1;
-      predictor[3 * 16 + 1] =
-        predictor[1 * 16 + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
-      predictor[0 * 16 + 3] =
-        predictor[2 * 16 + 2] = (pp[3] + pp[4] + 1) >> 1;
-      predictor[1 * 16 + 3] =
-        predictor[3 * 16 + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
-      predictor[2 * 16 + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
-      predictor[3 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
-    }
-    break;
-
-    case B_HD_PRED: {
-      unsigned char pp[9];
-      pp[0] = Left[3];
-      pp[1] = Left[2];
-      pp[2] = Left[1];
-      pp[3] = Left[0];
-      pp[4] = top_left;
-      pp[5] = Above[0];
-      pp[6] = Above[1];
-      pp[7] = Above[2];
-      pp[8] = Above[3];
-
-      predictor[3 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
-      predictor[3 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
-      predictor[2 * 16 + 0] =
-        predictor[3 * 16 + 2] = (pp[1] + pp[2] + 1) >> 1;
-      predictor[2 * 16 + 1] =
-        predictor[3 * 16 + 3] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
-      predictor[2 * 16 + 2] =
-        predictor[1 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1;
-      predictor[2 * 16 + 3] =
-        predictor[1 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
-      predictor[1 * 16 + 2] =
-        predictor[0 * 16 + 0] = (pp[3] + pp[4] + 1) >> 1;
-      predictor[1 * 16 + 3] =
-        predictor[0 * 16 + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
-      predictor[0 * 16 + 2] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
-      predictor[0 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
-    }
-    break;
-
-    case B_HU_PRED: {
-      unsigned char *pp = Left;
-      predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
-      predictor[0 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
-      predictor[0 * 16 + 2] =
-        predictor[1 * 16 + 0] = (pp[1] + pp[2] + 1) >> 1;
-      predictor[0 * 16 + 3] =
-        predictor[1 * 16 + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
-      predictor[1 * 16 + 2] =
-        predictor[2 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1;
-      predictor[1 * 16 + 3] =
-        predictor[2 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2;
-      predictor[2 * 16 + 2] =
-        predictor[2 * 16 + 3] =
-          predictor[3 * 16 + 0] =
-            predictor[3 * 16 + 1] =
-              predictor[3 * 16 + 2] =
-                predictor[3 * 16 + 3] = pp[3];
-    }
-    break;
-
-  }
-}
-
-#if CONFIG_COMP_INTRA_PRED
-void vp9_comp_intra4x4_predict_c(BLOCKD *x,
-                               int b_mode, int b_mode2,
-                               unsigned char *out_predictor) {
-  unsigned char predictor[2][4 * 16];
-  int i, j;
-
-  vp9_intra4x4_predict(x, b_mode, predictor[0]);
-  vp9_intra4x4_predict(x, b_mode2, predictor[1]);
-
-  for (i = 0; i < 16 * 4; i += 16) {
-    for (j = i; j < i + 4; j++) {
-      out_predictor[j] = (predictor[0][j] + predictor[1][j] + 1) >> 1;
-    }
-  }
-}
-#endif
-
-/* Copy 4 bytes from above-right down the right edge so that the 4x4
- * prediction modes that use pixels above and to the right have valid
- * pixels to work from.
- */
-void vp9_intra_prediction_down_copy(MACROBLOCKD *xd) {
-  int extend_edge = (xd->mb_to_right_edge == 0 && xd->mb_index < 2);
-  unsigned char *above_right = *(xd->block[0].base_dst) + xd->block[0].dst -
-                               xd->block[0].dst_stride + 16;
-  unsigned int *src_ptr = (unsigned int *)
-      (above_right - (xd->mb_index == 3 ? 16 * xd->block[0].dst_stride : 0));
-
-  unsigned int *dst_ptr0 = (unsigned int *)above_right;
-  unsigned int *dst_ptr1 =
-    (unsigned int *)(above_right + 4 * xd->block[0].dst_stride);
-  unsigned int *dst_ptr2 =
-    (unsigned int *)(above_right + 8 * xd->block[0].dst_stride);
-  unsigned int *dst_ptr3 =
-    (unsigned int *)(above_right + 12 * xd->block[0].dst_stride);
-
-  if (extend_edge) {
-    *src_ptr = ((uint8_t *) src_ptr)[-1] * 0x01010101U;
-  }
-
-  *dst_ptr0 = *src_ptr;
-  *dst_ptr1 = *src_ptr;
-  *dst_ptr2 = *src_ptr;
-  *dst_ptr3 = *src_ptr;
-}
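The word-sized loads and stores above are just a compact way of replicating the same four above-right bytes down the right edge at 4-row intervals. A byte-wise equivalent, shown only to clarify what the unsigned int copies do (down_copy_bytewise is a hypothetical helper, not part of the source):

/* Illustrative scalar equivalent of vp9_intra_prediction_down_copy(). */
static void down_copy_bytewise(const unsigned char *src,
                               unsigned char *above_right, int stride) {
  int row, i;

  for (row = 0; row <= 12; row += 4)   /* rows 0, 4, 8 and 12 */
    for (i = 0; i < 4; i++)
      above_right[row * stride + i] = src[i];
}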
--- a/vp8/common/reconintra4x4.h
+++ /dev/null
@@ -1,17 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_RECONINTRA4x4_H
-#define __INC_RECONINTRA4x4_H
-
-extern void vp9_intra_prediction_down_copy(MACROBLOCKD *xd);
-
-#endif  // __INC_RECONINTRA4x4_H
--- a/vp8/common/rtcd.c
+++ /dev/null
@@ -1,105 +1,0 @@
-/*
- *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-#include "vpx_config.h"
-#define RTCD_C
-#include "vpx_rtcd.h"
-
-#if CONFIG_MULTITHREAD && defined(_WIN32)
-#include <windows.h>
-#include <stdlib.h>
-static void once(void (*func)(void))
-{
-    static CRITICAL_SECTION *lock;
-    static LONG waiters;
-    static int done;
-    void *lock_ptr = &lock;
-
-    /* If the initialization is complete, return early. This isn't just an
-     * optimization; it prevents races on the destruction of the global
-     * lock.
-     */
-    if(done)
-        return;
-
-    InterlockedIncrement(&waiters);
-
-    /* Get a lock. We create one and try to make it the one-true-lock,
-     * throwing it away if we lost the race.
-     */
-
-    {
-        /* Scope to protect access to new_lock */
-        CRITICAL_SECTION *new_lock = malloc(sizeof(CRITICAL_SECTION));
-        InitializeCriticalSection(new_lock);
-        if (InterlockedCompareExchangePointer(lock_ptr, new_lock, NULL) != NULL)
-        {
-            DeleteCriticalSection(new_lock);
-            free(new_lock);
-        }
-    }
-
-    /* At this point, we have a lock that can be synchronized on. We don't
-     * care which thread actually performed the allocation.
-     */
-
-    EnterCriticalSection(lock);
-
-    if (!done)
-    {
-        func();
-        done = 1;
-    }
-
-    LeaveCriticalSection(lock);
-
-    /* Last one out should free resources. The objects being destroyed are
-     * protected by the if(done) check above.
-     */
-    if(!InterlockedDecrement(&waiters))
-    {
-        DeleteCriticalSection(lock);
-        free(lock);
-        lock = NULL;
-    }
-}
-
-
-#elif CONFIG_MULTITHREAD && HAVE_PTHREAD_H
-#include <pthread.h>
-static void once(void (*func)(void))
-{
-    static pthread_once_t lock = PTHREAD_ONCE_INIT;
-    pthread_once(&lock, func);
-}
-
-
-#else
-/* No-op version that performs no synchronization. vpx_rtcd() is idempotent,
- * so as long as your platform provides atomic loads/stores of pointers
- * no synchronization is strictly necessary.
- */
-
-static void once(void (*func)(void))
-{
-    static int done;
-
-    if(!done)
-    {
-        func();
-        done = 1;
-    }
-}
-#endif
-
-
-void vpx_rtcd(void)
-{
-    once(setup_rtcd_internal);
-}
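once() exists to make the lazy initialization in vpx_rtcd() thread-safe: setup_rtcd_internal() is generated by the build system from rtcd_defs.sh (the next file), turning each prototype line into a function pointer and each specialize line into a CPU-feature check. A minimal sketch of the generated shape, assuming the usual x86_simd_caps()/HAS_SSE2 helpers from vpx_ports/x86.h; the real generated header differs in detail:

/* Hypothetical excerpt of what the generator emits for one function. */
void vp9_copy_mem16x16_c(unsigned char *src, int src_pitch,
                         unsigned char *dst, int dst_pitch);
void vp9_copy_mem16x16_sse2(unsigned char *src, int src_pitch,
                            unsigned char *dst, int dst_pitch);

void (*vp9_copy_mem16x16)(unsigned char *src, int src_pitch,
                          unsigned char *dst, int dst_pitch);

static void setup_rtcd_internal(void) {
  int flags = x86_simd_caps();               /* runtime CPUID probe */

  vp9_copy_mem16x16 = vp9_copy_mem16x16_c;   /* portable default */
  if (flags & HAS_SSE2)
    vp9_copy_mem16x16 = vp9_copy_mem16x16_sse2;
}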
--- a/vp8/common/rtcd_defs.sh
+++ /dev/null
@@ -1,482 +1,0 @@
-common_forward_decls() {
-cat <<EOF
-
-struct loop_filter_info;
-struct blockd;
-struct macroblockd;
-
-/* Encoder forward decls */
-struct block;
-struct macroblock;
-struct variance_vtable;
-union int_mv;
-struct yv12_buffer_config;
-EOF
-}
-forward_decls common_forward_decls
-
-prototype void vp9_filter_block2d_4x4_8 "const unsigned char *src_ptr, const unsigned int src_stride, const short *HFilter_aligned16, const short *VFilter_aligned16, unsigned char *dst_ptr, unsigned int dst_stride"
-prototype void vp9_filter_block2d_8x4_8 "const unsigned char *src_ptr, const unsigned int src_stride, const short *HFilter_aligned16, const short *VFilter_aligned16, unsigned char *dst_ptr, unsigned int dst_stride"
-prototype void vp9_filter_block2d_8x8_8 "const unsigned char *src_ptr, const unsigned int src_stride, const short *HFilter_aligned16, const short *VFilter_aligned16, unsigned char *dst_ptr, unsigned int dst_stride"
-prototype void vp9_filter_block2d_16x16_8 "const unsigned char *src_ptr, const unsigned int src_stride, const short *HFilter_aligned16, const short *VFilter_aligned16, unsigned char *dst_ptr, unsigned int dst_stride"
-
-# At the very least, MSVC 2008 has a compiler bug exhibited by this code: the
-# code compiles warning-free, but a disassembly of the generated code shows
-# bugs. To be on the safe side, this is only enabled when compiling with gcc.
-if [ "$CONFIG_GCC" = "yes" ]; then
-    specialize vp9_filter_block2d_4x4_8 sse4_1 sse2
-fi
-specialize vp9_filter_block2d_8x4_8 ssse3 #sse4_1 sse2
-specialize vp9_filter_block2d_8x8_8 ssse3 #sse4_1 sse2
-specialize vp9_filter_block2d_16x16_8 ssse3 #sse4_1 sse2
-
-#
-# Dequant
-#
-prototype void vp9_dequantize_b "struct blockd *x"
-specialize vp9_dequantize_b mmx
-
-prototype void vp9_dequantize_b_2x2 "struct blockd *x"
-specialize vp9_dequantize_b_2x2
-
-prototype void vp9_dequant_dc_idct_add_y_block_8x8 "short *q, short *dq, unsigned char *pre, unsigned char *dst, int stride, char *eobs, short *dc, struct macroblockd *xd"
-specialize vp9_dequant_dc_idct_add_y_block_8x8
-
-prototype void vp9_dequant_idct_add_y_block_8x8 "short *q, short *dq, unsigned char *pre, unsigned char *dst, int stride, char *eobs, struct macroblockd *xd"
-specialize vp9_dequant_idct_add_y_block_8x8
-
-prototype void vp9_dequant_idct_add_uv_block_8x8 "short *q, short *dq, unsigned char *pre, unsigned char *dstu, unsigned char *dstv, int stride, char *eobs, struct macroblockd *xd"
-specialize vp9_dequant_idct_add_uv_block_8x8
-
-prototype void vp9_dequant_idct_add_16x16 "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride"
-specialize vp9_dequant_idct_add_16x16
-
-prototype void vp9_dequant_idct_add "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride"
-specialize vp9_dequant_idct_add
-
-prototype void vp9_dequant_dc_idct_add "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc"
-specialize vp9_dequant_dc_idct_add
-
-prototype void vp9_dequant_dc_idct_add_y_block "short *q, short *dq, unsigned char *pre, unsigned char *dst, int stride, char *eobs, short *dc"
-specialize vp9_dequant_dc_idct_add_y_block mmx
-
-prototype void vp9_dequant_idct_add_y_block "short *q, short *dq, unsigned char *pre, unsigned char *dst, int stride, char *eobs"
-specialize vp9_dequant_idct_add_y_block mmx
-
-prototype void vp9_dequant_idct_add_uv_block "short *q, short *dq, unsigned char *pre, unsigned char *dstu, unsigned char *dstv, int stride, char *eobs"
-specialize vp9_dequant_idct_add_uv_block mmx
-
-#
-# RECON
-#
-prototype void vp9_copy_mem16x16 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
-specialize vp9_copy_mem16x16 mmx sse2 media neon dspr2
-vp9_copy_mem16x16_media=vp9_copy_mem16x16_v6
-
-prototype void vp9_copy_mem8x8 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
-specialize vp9_copy_mem8x8 mmx media neon dspr2
-vp9_copy_mem8x8_media=vp9_copy_mem8x8_v6
-
-prototype void vp9_avg_mem16x16 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
-specialize vp9_avg_mem16x16
-
-prototype void vp9_avg_mem8x8 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
-specialize vp9_avg_mem8x8
-
-prototype void vp9_copy_mem8x4 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
-specialize vp9_copy_mem8x4 mmx media neon dspr2
-vp9_copy_mem8x4_media=vp9_copy_mem8x4_v6
-
-prototype void vp9_recon_b "unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride"
-specialize vp9_recon_b
-
-prototype void vp9_recon_uv_b "unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride"
-specialize vp9_recon_uv_b
-
-prototype void vp9_recon2b "unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride"
-specialize vp9_recon2b sse2
-
-prototype void vp9_recon4b "unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride"
-specialize vp9_recon4b sse2
-
-prototype void vp9_recon_mb "struct macroblockd *x"
-specialize vp9_recon_mb
-
-prototype void vp9_recon_mby "struct macroblockd *x"
-specialize vp9_recon_mby
-
-prototype void vp9_build_intra_predictors_mby_s "struct macroblockd *x"
-specialize vp9_build_intra_predictors_mby_s
-
-prototype void vp9_build_intra_predictors_sby_s "struct macroblockd *x"
-specialize vp9_build_intra_predictors_sby_s
-
-prototype void vp9_build_intra_predictors_sbuv_s "struct macroblockd *x"
-specialize vp9_build_intra_predictors_sbuv_s
-
-prototype void vp9_build_intra_predictors_mby "struct macroblockd *x"
-specialize vp9_build_intra_predictors_mby
-
-prototype void vp9_build_comp_intra_predictors_mby "struct macroblockd *x"
-specialize vp9_build_comp_intra_predictors_mby
-
-prototype void vp9_build_intra_predictors_mbuv "struct macroblockd *x"
-specialize vp9_build_intra_predictors_mbuv
-
-prototype void vp9_build_intra_predictors_mbuv_s "struct macroblockd *x"
-specialize vp9_build_intra_predictors_mbuv_s
-
-prototype void vp9_build_comp_intra_predictors_mbuv "struct macroblockd *x"
-specialize vp9_build_comp_intra_predictors_mbuv
-
-prototype void vp9_intra4x4_predict "struct blockd *x, int b_mode, unsigned char *predictor"
-specialize vp9_intra4x4_predict
-
-prototype void vp9_comp_intra4x4_predict "struct blockd *x, int b_mode, int second_mode, unsigned char *predictor"
-specialize vp9_comp_intra4x4_predict
-
-prototype void vp9_intra8x8_predict "struct blockd *x, int b_mode, unsigned char *predictor"
-specialize vp9_intra8x8_predict
-
-prototype void vp9_comp_intra8x8_predict "struct blockd *x, int b_mode, int second_mode, unsigned char *predictor"
-specialize vp9_comp_intra8x8_predict
-
-prototype void vp9_intra_uv4x4_predict "struct blockd *x, int b_mode, unsigned char *predictor"
-specialize vp9_intra_uv4x4_predict
-
-prototype void vp9_comp_intra_uv4x4_predict "struct blockd *x, int b_mode, int second_mode, unsigned char *predictor"
-specialize vp9_comp_intra_uv4x4_predict
-
-#
-# Loopfilter
-#
-prototype void vp9_loop_filter_mbv "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp9_loop_filter_mbv sse2
-
-prototype void vp9_loop_filter_bv "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp9_loop_filter_bv sse2
-
-prototype void vp9_loop_filter_bv8x8 "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp9_loop_filter_bv8x8 sse2
-
-prototype void vp9_loop_filter_mbh "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp9_loop_filter_mbh sse2
-
-prototype void vp9_loop_filter_bh "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp9_loop_filter_bh sse2
-
-prototype void vp9_loop_filter_bh8x8 "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp9_loop_filter_bh8x8 sse2
-
-prototype void vp9_loop_filter_simple_mbv "unsigned char *y, int ystride, const unsigned char *blimit"
-specialize vp9_loop_filter_simple_mbv mmx sse2 media neon
-vp9_loop_filter_simple_mbv_c=vp9_loop_filter_simple_vertical_edge_c
-vp9_loop_filter_simple_mbv_mmx=vp9_loop_filter_simple_vertical_edge_mmx
-vp9_loop_filter_simple_mbv_sse2=vp9_loop_filter_simple_vertical_edge_sse2
-vp9_loop_filter_simple_mbv_media=vp9_loop_filter_simple_vertical_edge_armv6
-vp9_loop_filter_simple_mbv_neon=vp9_loop_filter_mbvs_neon
-
-prototype void vp9_loop_filter_simple_mbh "unsigned char *y, int ystride, const unsigned char *blimit"
-specialize vp9_loop_filter_simple_mbh mmx sse2 media neon
-vp9_loop_filter_simple_mbh_c=vp9_loop_filter_simple_horizontal_edge_c
-vp9_loop_filter_simple_mbh_mmx=vp9_loop_filter_simple_horizontal_edge_mmx
-vp9_loop_filter_simple_mbh_sse2=vp9_loop_filter_simple_horizontal_edge_sse2
-vp9_loop_filter_simple_mbh_media=vp9_loop_filter_simple_horizontal_edge_armv6
-vp9_loop_filter_simple_mbh_neon=vp9_loop_filter_mbhs_neon
-
-prototype void vp9_loop_filter_simple_bv "unsigned char *y, int ystride, const unsigned char *blimit"
-specialize vp9_loop_filter_simple_bv mmx sse2 media neon
-vp9_loop_filter_simple_bv_c=vp9_loop_filter_bvs_c
-vp9_loop_filter_simple_bv_mmx=vp9_loop_filter_bvs_mmx
-vp9_loop_filter_simple_bv_sse2=vp9_loop_filter_bvs_sse2
-vp9_loop_filter_simple_bv_media=vp9_loop_filter_bvs_armv6
-vp9_loop_filter_simple_bv_neon=vp9_loop_filter_bvs_neon
-
-prototype void vp9_loop_filter_simple_bh "unsigned char *y, int ystride, const unsigned char *blimit"
-specialize vp9_loop_filter_simple_bh mmx sse2 media neon
-vp9_loop_filter_simple_bh_c=vp9_loop_filter_bhs_c
-vp9_loop_filter_simple_bh_mmx=vp9_loop_filter_bhs_mmx
-vp9_loop_filter_simple_bh_sse2=vp9_loop_filter_bhs_sse2
-vp9_loop_filter_simple_bh_media=vp9_loop_filter_bhs_armv6
-vp9_loop_filter_simple_bh_neon=vp9_loop_filter_bhs_neon
-
-#
-# sad 16x3, 3x16
-#
-if [ "$CONFIG_NEWBESTREFMV" = "yes" ]; then
-prototype unsigned int vp9_sad16x3 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int ref_stride, int max_sad"
-specialize vp9_sad16x3 sse2
-
-prototype unsigned int vp9_sad3x16 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int ref_stride, int max_sad"
-specialize vp9_sad3x16 sse2
-fi
-
-#
-# Encoder functions below this point.
-#
-if [ "$CONFIG_VP8_ENCODER" = "yes" ]; then
-
-
-# variance
-[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2
-
-prototype unsigned int vp9_variance32x32 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance32x32
-
-prototype unsigned int vp9_variance16x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance16x16 mmx sse2
-vp9_variance16x16_sse2=vp9_variance16x16_wmt
-
-prototype unsigned int vp9_variance16x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance16x8 mmx sse2
-vp9_variance16x8_sse2=vp9_variance16x8_wmt
-
-prototype unsigned int vp9_variance8x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance8x16 mmx sse2
-vp9_variance8x16_sse2=vp9_variance8x16_wmt
-
-prototype unsigned int vp9_variance8x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance8x8 mmx sse2
-vp9_variance8x8_sse2=vp9_variance8x8_wmt
-
-prototype unsigned int vp9_variance4x4 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance4x4 mmx sse2
-vp9_variance4x4_sse2=vp9_variance4x4_wmt
-
-prototype unsigned int vp9_sub_pixel_variance32x32 "const unsigned char *src_ptr, int source_stride, int xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
-specialize vp9_sub_pixel_variance32x32
-
-prototype unsigned int vp9_sub_pixel_variance16x16 "const unsigned char *src_ptr, int source_stride, int xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
-specialize vp9_sub_pixel_variance16x16 sse2 mmx ssse3
-vp9_sub_pixel_variance16x16_sse2=vp9_sub_pixel_variance16x16_wmt
-
-prototype unsigned int vp9_sub_pixel_variance8x16 "const unsigned char *src_ptr, int source_stride, int xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
-specialize vp9_sub_pixel_variance8x16 sse2 mmx
-vp9_sub_pixel_variance8x16_sse2=vp9_sub_pixel_variance8x16_wmt
-
-prototype unsigned int vp9_sub_pixel_variance16x8 "const unsigned char *src_ptr, int source_stride, int xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
-specialize vp9_sub_pixel_variance16x8 sse2 mmx ssse3
-vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_wmt
-
-prototype unsigned int vp9_sub_pixel_variance8x8 "const unsigned char *src_ptr, int source_stride, int xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
-specialize vp9_sub_pixel_variance8x8 sse2 mmx
-vp9_sub_pixel_variance8x8_sse2=vp9_sub_pixel_variance8x8_wmt
-
-prototype unsigned int vp9_sub_pixel_variance4x4 "const unsigned char *src_ptr, int source_stride, int xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
-specialize vp9_sub_pixel_variance4x4 sse2 mmx
-vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
-
-prototype unsigned int vp9_sad32x32 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int max_sad"
-specialize vp9_sad32x32
-
-prototype unsigned int vp9_sad16x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int max_sad"
-specialize vp9_sad16x16 mmx sse2 sse3
-vp9_sad16x16_sse2=vp9_sad16x16_wmt
-
-prototype unsigned int vp9_sad16x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int max_sad"
-specialize vp9_sad16x8 mmx sse2
-vp9_sad16x8_sse2=vp9_sad16x8_wmt
-
-prototype unsigned int vp9_sad8x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int max_sad"
-specialize vp9_sad8x16 mmx sse2
-vp9_sad8x16_sse2=vp9_sad8x16_wmt
-
-prototype unsigned int vp9_sad8x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int max_sad"
-specialize vp9_sad8x8 mmx sse2
-vp9_sad8x8_sse2=vp9_sad8x8_wmt
-
-prototype unsigned int vp9_sad4x4 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int max_sad"
-specialize vp9_sad4x4 mmx sse2
-vp9_sad4x4_sse2=vp9_sad4x4_wmt
-
-prototype unsigned int vp9_variance_halfpixvar16x16_h "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance_halfpixvar16x16_h mmx sse2
-vp9_variance_halfpixvar16x16_h_sse2=vp9_variance_halfpixvar16x16_h_wmt
-
-prototype unsigned int vp9_variance_halfpixvar16x16_v "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance_halfpixvar16x16_v mmx sse2
-vp9_variance_halfpixvar16x16_v_sse2=vp9_variance_halfpixvar16x16_v_wmt
-
-prototype unsigned int vp9_variance_halfpixvar16x16_hv "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance_halfpixvar16x16_hv mmx sse2
-vp9_variance_halfpixvar16x16_hv_sse2=vp9_variance_halfpixvar16x16_hv_wmt
-
-prototype unsigned int vp9_variance_halfpixvar32x32_h "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance_halfpixvar32x32_h
-
-prototype unsigned int vp9_variance_halfpixvar32x32_v "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance_halfpixvar32x32_v
-
-prototype unsigned int vp9_variance_halfpixvar32x32_hv "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance_halfpixvar32x32_hv
-
-prototype void vp9_sad32x32x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad32x32x3
-
-prototype void vp9_sad16x16x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad16x16x3 sse3 ssse3
-
-prototype void vp9_sad16x8x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad16x8x3 sse3 ssse3
-
-prototype void vp9_sad8x16x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad8x16x3 sse3
-
-prototype void vp9_sad8x8x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad8x8x3 sse3
-
-prototype void vp9_sad4x4x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad4x4x3 sse3
-
-prototype void vp9_sad32x32x8 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
-specialize vp9_sad32x32x8
-
-prototype void vp9_sad16x16x8 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
-specialize vp9_sad16x16x8 sse4
-
-prototype void vp9_sad16x8x8 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
-specialize vp9_sad16x8x8 sse4
-
-prototype void vp9_sad8x16x8 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
-specialize vp9_sad8x16x8 sse4
-
-prototype void vp9_sad8x8x8 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
-specialize vp9_sad8x8x8 sse4
-
-prototype void vp9_sad4x4x8 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
-specialize vp9_sad4x4x8 sse4
-
-prototype void vp9_sad32x32x4d "const unsigned char *src_ptr, int  src_stride, unsigned char *ref_ptr[], int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad32x32x4d
-
-prototype void vp9_sad16x16x4d "const unsigned char *src_ptr, int  src_stride, unsigned char *ref_ptr[], int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad16x16x4d sse3
-
-prototype void vp9_sad16x8x4d "const unsigned char *src_ptr, int  src_stride, unsigned char *ref_ptr[], int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad16x8x4d sse3
-
-prototype void vp9_sad8x16x4d "const unsigned char *src_ptr, int  src_stride, unsigned char *ref_ptr[], int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad8x16x4d sse3
-
-prototype void vp9_sad8x8x4d "const unsigned char *src_ptr, int  src_stride, unsigned char *ref_ptr[], int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad8x8x4d sse3
-
-prototype void vp9_sad4x4x4d "const unsigned char *src_ptr, int  src_stride, unsigned char *ref_ptr[], int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad4x4x4d sse3
-
-#
-# Block copy
-#
-case $arch in
-    x86*)
-    prototype void vp9_copy32xn "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n"
-    specialize vp9_copy32xn sse2 sse3
-    ;;
-esac
-
-prototype unsigned int vp9_sub_pixel_mse16x16 "const unsigned char  *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, const unsigned char *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
-specialize vp9_sub_pixel_mse16x16 sse2 mmx
-vp9_sub_pixel_mse16x16_sse2=vp9_sub_pixel_mse16x16_wmt
-
-prototype unsigned int vp9_mse16x16 "const unsigned char *src_ptr, int  source_stride, const unsigned char *ref_ptr, int  recon_stride, unsigned int *sse"
-specialize vp9_mse16x16 mmx sse2
-vp9_mse16x16_sse2=vp9_mse16x16_wmt
-
-prototype unsigned int vp9_sub_pixel_mse32x32 "const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
-specialize vp9_sub_pixel_mse32x32
-
-prototype unsigned int vp9_get_mb_ss "const short *"
-specialize vp9_get_mb_ss mmx sse2
-
-# ENCODEMB INVOKE
-prototype int vp9_mbblock_error "struct macroblock *mb, int dc"
-specialize vp9_mbblock_error mmx sse2
-vp9_mbblock_error_sse2=vp9_mbblock_error_xmm
-
-prototype int vp9_block_error "short *coeff, short *dqcoeff, int block_size"
-specialize vp9_block_error mmx sse2
-vp9_block_error_sse2=vp9_block_error_xmm
-
-prototype void vp9_subtract_b "struct block *be, struct blockd *bd, int pitch"
-specialize vp9_subtract_b mmx sse2
-
-prototype int vp9_mbuverror "struct macroblock *mb"
-specialize vp9_mbuverror mmx sse2
-vp9_mbuverror_sse2=vp9_mbuverror_xmm
-
-prototype void vp9_subtract_mby "short *diff, unsigned char *src, unsigned char *pred, int stride"
-specialize vp9_subtract_mby mmx sse2
-
-prototype void vp9_subtract_mbuv "short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride"
-specialize vp9_subtract_mbuv mmx sse2
-
-#
-# Structured Similarity (SSIM)
-#
-if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then
-    [ $arch = "x86_64" ] && sse2_on_x86_64=sse2
-
-    prototype void vp9_ssim_parms_8x8 "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"
-    specialize vp9_ssim_parms_8x8 $sse2_on_x86_64
-
-    prototype void vp9_ssim_parms_16x16 "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"
-    specialize vp9_ssim_parms_16x16 $sse2_on_x86_64
-fi
-
-# fdct functions
-prototype void vp9_fht "const short *input, int pitch, short *output, int tx_type, int tx_dim"
-specialize vp9_fht
-
-prototype void vp9_short_fdct8x8 "short *InputData, short *OutputData, int pitch"
-specialize vp9_short_fdct8x8
-
-prototype void vp9_short_fhaar2x2 "short *InputData, short *OutputData, int pitch"
-specialize vp9_short_fhaar2x2
-
-prototype void vp9_short_fdct4x4 "short *InputData, short *OutputData, int pitch"
-specialize vp9_short_fdct4x4
-
-prototype void vp9_short_fdct8x4 "short *InputData, short *OutputData, int pitch"
-specialize vp9_short_fdct8x4
-
-prototype void vp9_short_walsh4x4 "short *InputData, short *OutputData, int pitch"
-specialize vp9_short_walsh4x4
-
-prototype void vp9_short_fdct16x16 "short *InputData, short *OutputData, int pitch"
-specialize vp9_short_fdct16x16
-
-prototype void vp9_short_walsh4x4_lossless "short *InputData, short *OutputData, int pitch"
-specialize vp9_short_walsh4x4_lossless
-
-prototype void vp9_short_walsh4x4_x8 "short *InputData, short *OutputData, int pitch"
-specialize vp9_short_walsh4x4_x8
-
-prototype void vp9_short_walsh8x4_x8 "short *InputData, short *OutputData, int pitch"
-specialize vp9_short_walsh8x4_x8
-
-fi
-# end encoder functions
--- a/vp8/common/sadmxn.h
+++ /dev/null
@@ -1,37 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef __INC_SAD_H
-#define __INC_SAD_H
-
-#include <stdlib.h>  /* abs() */
-
-static __inline
-unsigned int sad_mx_n_c(
-  const unsigned char *src_ptr,
-  int  src_stride,
-  const unsigned char *ref_ptr,
-  int  ref_stride,
-  int m,
-  int n) {
-  int r, c;
-  unsigned int sad = 0;
-
-  for (r = 0; r < n; r++) {
-    for (c = 0; c < m; c++) {
-      sad += abs(src_ptr[c] - ref_ptr[c]);
-    }
-
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-  }
-
-  return sad;
-}
-
-#endif
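Every fixed-size SAD is then a thin wrapper over this generic kernel. As an illustration only — the shipped C implementations live elsewhere — the vp9_sad16x3 declared in rtcd_defs.sh could be expressed on top of it like this:

/* Hypothetical wrapper showing how sad_mx_n_c instantiates one size. */
unsigned int vp9_sad16x3_c(const unsigned char *src_ptr, int src_stride,
                           const unsigned char *ref_ptr, int ref_stride,
                           int max_sad) {
  (void) max_sad;  /* the plain C version ignores the early-out bound */
  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 3);
}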
--- a/vp8/common/seg_common.c
+++ /dev/null
@@ -1,103 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vp8/common/seg_common.h"
-
-static const int seg_feature_data_signed[SEG_LVL_MAX] = { 1, 1, 0, 0, 0, 0 };
-static const int seg_feature_data_bits[SEG_LVL_MAX] =
-    { QINDEX_BITS, 6, 4, 4, 6, 2 };
-
-// These functions provide access to new segment level features.
-// Eventually these function may be "optimized out" but for the moment,
-// the coding mechanism is still subject to change so these provide a
-// convenient single point of change.
-
-int vp9_segfeature_active(const MACROBLOCKD *xd,
-                          int segment_id,
-                          SEG_LVL_FEATURES feature_id) {
-  // Return true if mask bit set and segmentation enabled.
-  return (xd->segmentation_enabled &&
-          (xd->segment_feature_mask[segment_id] &
-           (0x01 << feature_id)));
-}
-
-void vp9_clearall_segfeatures(MACROBLOCKD *xd) {
-  vpx_memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data));
-  vpx_memset(xd->segment_feature_mask, 0, sizeof(xd->segment_feature_mask));
-}
-
-void vp9_enable_segfeature(MACROBLOCKD *xd,
-                           int segment_id,
-                           SEG_LVL_FEATURES feature_id) {
-  xd->segment_feature_mask[segment_id] |= (0x01 << feature_id);
-}
-
-void vp9_disable_segfeature(MACROBLOCKD *xd,
-                            int segment_id,
-                            SEG_LVL_FEATURES feature_id) {
-  xd->segment_feature_mask[segment_id] &= ~(1 << feature_id);
-}
-
-int vp9_seg_feature_data_bits(SEG_LVL_FEATURES feature_id) {
-  return seg_feature_data_bits[feature_id];
-}
-
-int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id) {
-  return seg_feature_data_signed[feature_id];
-}
-
-void vp9_clear_segdata(MACROBLOCKD *xd,
-                       int segment_id,
-                       SEG_LVL_FEATURES feature_id) {
-  xd->segment_feature_data[segment_id][feature_id] = 0;
-}
-
-void vp9_set_segdata(MACROBLOCKD *xd,
-                     int segment_id,
-                     SEG_LVL_FEATURES feature_id,
-                     int seg_data) {
-  xd->segment_feature_data[segment_id][feature_id] = seg_data;
-}
-
-int vp9_get_segdata(const MACROBLOCKD *xd,
-                    int segment_id,
-                    SEG_LVL_FEATURES feature_id) {
-  return xd->segment_feature_data[segment_id][feature_id];
-}
-
-void vp9_clear_segref(MACROBLOCKD *xd, int segment_id) {
-  xd->segment_feature_data[segment_id][SEG_LVL_REF_FRAME] = 0;
-}
-
-void vp9_set_segref(MACROBLOCKD *xd,
-                    int segment_id,
-                    MV_REFERENCE_FRAME ref_frame) {
-  xd->segment_feature_data[segment_id][SEG_LVL_REF_FRAME] |=
-    (1 << ref_frame);
-}
-
-int vp9_check_segref(const MACROBLOCKD *xd,
-                     int segment_id,
-                     MV_REFERENCE_FRAME ref_frame) {
-  return (xd->segment_feature_data[segment_id][SEG_LVL_REF_FRAME] &
-          (1 << ref_frame)) ? 1 : 0;
-}
-
-int vp9_check_segref_inter(MACROBLOCKD *xd, int segment_id) {
-  return (xd->segment_feature_data[segment_id][SEG_LVL_REF_FRAME] &
-          ~(1 << INTRA_FRAME)) ? 1 : 0;
-}
-
-int vp9_get_seg_tx_type(MACROBLOCKD *xd, int segment_id) {
-  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_TRANSFORM))
-    return vp9_get_segdata(xd, segment_id, SEG_LVL_TRANSFORM);
-  else
-    return TX_4X4;
-}
-// TBD? Functions to read and write segment data with range / validity checking
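Together these accessors implement a small bitmask-gated table: segment_feature_mask holds one enable bit per (segment, feature) pair, and segment_feature_data holds the value behind it. A hedged usage sketch — the SEG_LVL_ALT_Q feature id and the delta value are assumptions chosen for illustration:

void example_segment_setup(MACROBLOCKD *xd) {
  xd->segmentation_enabled = 1;
  vp9_clearall_segfeatures(xd);

  /* Give segment 1 an alternate-quantizer delta of -16 (illustrative). */
  vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_Q);
  vp9_set_segdata(xd, 1, SEG_LVL_ALT_Q, -16);

  /* Readers check the mask bit before trusting the data slot. */
  if (vp9_segfeature_active(xd, 1, SEG_LVL_ALT_Q)) {
    int qdelta = vp9_get_segdata(xd, 1, SEG_LVL_ALT_Q);  /* -16 */
    (void) qdelta;
  }
}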
--- a/vp8/common/seg_common.h
+++ /dev/null
@@ -1,64 +1,0 @@
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "type_aliases.h"
-#include "onyxc_int.h"
-#include "vp8/common/blockd.h"
-
-#ifndef __INC_SEG_COMMON_H__
-#define __INC_SEG_COMMON_H__ 1
-
-int vp9_segfeature_active(const MACROBLOCKD *xd,
-                          int segment_id,
-                          SEG_LVL_FEATURES feature_id);
-
-void vp9_clearall_segfeatures(MACROBLOCKD *xd);
-
-void vp9_enable_segfeature(MACROBLOCKD *xd,
-                           int segment_id,
-                           SEG_LVL_FEATURES feature_id);
-
-void vp9_disable_segfeature(MACROBLOCKD *xd,
-                            int segment_id,
-                            SEG_LVL_FEATURES feature_id);
-
-int vp9_seg_feature_data_bits(SEG_LVL_FEATURES feature_id);
-
-int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id);
-
-void vp9_clear_segdata(MACROBLOCKD *xd,
-                       int segment_id,
-                       SEG_LVL_FEATURES feature_id);
-
-void vp9_set_segdata(MACROBLOCKD *xd,
-                     int segment_id,
-                     SEG_LVL_FEATURES feature_id,
-                     int seg_data);
-
-int vp9_get_segdata(const MACROBLOCKD *xd,
-                    int segment_id,
-                    SEG_LVL_FEATURES feature_id);
-
-void vp9_clear_segref(MACROBLOCKD *xd, int segment_id);
-
-void vp9_set_segref(MACROBLOCKD *xd,
-                    int segment_id,
-                    MV_REFERENCE_FRAME ref_frame);
-
-int vp9_check_segref(const MACROBLOCKD *xd,
-                     int segment_id,
-                     MV_REFERENCE_FRAME ref_frame);
-
-int vp9_check_segref_inter(MACROBLOCKD *xd, int segment_id);
-
-int vp9_get_seg_tx_type(MACROBLOCKD *xd, int segment_id);
-
-#endif /* __INC_SEG_COMMON_H__ */
-
--- a/vp8/common/setupintrarecon.c
+++ /dev/null
@@ -1,31 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "setupintrarecon.h"
-#include "vpx_mem/vpx_mem.h"
-
-void vp9_setup_intra_recon(YV12_BUFFER_CONFIG *ybf) {
-  int i;
-
-  /* set up the new frame for intra-coded blocks */
-  vpx_memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5);
-  for (i = 0; i < ybf->y_height; i++)
-    ybf->y_buffer[ybf->y_stride * i - 1] = (unsigned char) 129;
-
-  vpx_memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
-  for (i = 0; i < ybf->uv_height; i++)
-    ybf->u_buffer[ybf->uv_stride * i - 1] = (unsigned char) 129;
-
-  vpx_memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
-  for (i = 0; i < ybf->uv_height; i++)
-    ybf->v_buffer[ybf->uv_stride * i - 1] = (unsigned char) 129;
-}
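The constants follow VP8's convention for intra edges on the frame border: the row above the image is filled with 127 and the column to its left with 129, so blocks on the border always see well-defined neighbors. A sketch of the resulting top-left corner (illustrative comment only):

/*
 *  127 127 127 127 127 ...   <- row at y_buffer - y_stride - 1, value 127
 *  129   p   p   p   p ...   <- left column value 129, p = frame pixels
 *  129   p   p   p   p ...
 */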
--- a/vp8/common/setupintrarecon.h
+++ /dev/null
@@ -1,13 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_scale/yv12config.h"
-extern void vp9_setup_intra_recon(YV12_BUFFER_CONFIG *ybf);
--- a/vp8/common/subpixel.h
+++ /dev/null
@@ -1,204 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef SUBPIXEL_H
-#define SUBPIXEL_H
-
-#define prototype_subpixel_predict(sym) \
-  void sym(unsigned char *src, int src_pitch, int xofst, int yofst, \
-           unsigned char *dst, int dst_pitch)
-
-#if ARCH_X86 || ARCH_X86_64
-#include "x86/subpixel_x86.h"
-#endif
-
-#if ARCH_ARM
-#include "arm/subpixel_arm.h"
-#endif
-
-#ifndef vp9_subpix_sixtap16x16
-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_sixtap16x16);
-
-#ifndef vp9_subpix_sixtap8x8
-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_sixtap8x8);
-
-#ifndef vp9_subpix_sixtap_avg16x16
-#define vp9_subpix_sixtap_avg16x16 vp9_sixtap_predict_avg16x16_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_sixtap_avg16x16);
-
-#ifndef vp9_subpix_sixtap_avg8x8
-#define vp9_subpix_sixtap_avg8x8 vp9_sixtap_predict_avg8x8_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_sixtap_avg8x8);
-
-#ifndef vp9_subpix_sixtap8x4
-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_sixtap8x4);
-
-#ifndef vp9_subpix_sixtap4x4
-#define vp9_subpix_sixtap4x4 vp9_sixtap_predict_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_sixtap4x4);
-
-#ifndef vp9_subpix_sixtap_avg4x4
-#define vp9_subpix_sixtap_avg4x4 vp9_sixtap_predict_avg_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_sixtap_avg4x4);
-
-#ifndef vp9_subpix_eighttap16x16
-#define vp9_subpix_eighttap16x16 vp9_eighttap_predict16x16_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap16x16);
-
-#ifndef vp9_subpix_eighttap8x8
-#define vp9_subpix_eighttap8x8 vp9_eighttap_predict8x8_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap8x8);
-
-#ifndef vp9_subpix_eighttap_avg16x16
-#define vp9_subpix_eighttap_avg16x16 vp9_eighttap_predict_avg16x16_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap_avg16x16);
-
-#ifndef vp9_subpix_eighttap_avg8x8
-#define vp9_subpix_eighttap_avg8x8 vp9_eighttap_predict_avg8x8_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap_avg8x8);
-
-#ifndef vp9_subpix_eighttap8x4
-#define vp9_subpix_eighttap8x4 vp9_eighttap_predict8x4_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap8x4);
-
-#ifndef vp9_subpix_eighttap4x4
-#define vp9_subpix_eighttap4x4 vp9_eighttap_predict_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap4x4);
-
-#ifndef vp9_subpix_eighttap_avg4x4
-#define vp9_subpix_eighttap_avg4x4 vp9_eighttap_predict_avg4x4_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap_avg4x4);
-
-#ifndef vp9_subpix_eighttap16x16_sharp
-#define vp9_subpix_eighttap16x16_sharp vp9_eighttap_predict16x16_sharp_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap16x16_sharp);
-
-#ifndef vp9_subpix_eighttap8x8_sharp
-#define vp9_subpix_eighttap8x8_sharp vp9_eighttap_predict8x8_sharp_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap8x8_sharp);
-
-#ifndef vp9_subpix_eighttap_avg16x16_sharp
-#define vp9_subpix_eighttap_avg16x16_sharp vp9_eighttap_predict_avg16x16_sharp_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap_avg16x16_sharp);
-
-#ifndef vp9_subpix_eighttap_avg8x8_sharp
-#define vp9_subpix_eighttap_avg8x8_sharp vp9_eighttap_predict_avg8x8_sharp_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap_avg8x8_sharp);
-
-#ifndef vp9_subpix_eighttap8x4_sharp
-#define vp9_subpix_eighttap8x4_sharp vp9_eighttap_predict8x4_sharp_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap8x4_sharp);
-
-#ifndef vp9_subpix_eighttap4x4_sharp
-#define vp9_subpix_eighttap4x4_sharp vp9_eighttap_predict_sharp_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap4x4_sharp);
-
-#ifndef vp9_subpix_eighttap_avg4x4_sharp
-#define vp9_subpix_eighttap_avg4x4_sharp vp9_eighttap_predict_avg4x4_sharp_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap_avg4x4_sharp);
-
-#ifndef vp9_subpix_bilinear16x16
-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_bilinear16x16);
-
-#ifndef vp9_subpix_bilinear8x8
-#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_bilinear8x8);
-
-#ifndef vp9_subpix_bilinear_avg16x16
-#define vp9_subpix_bilinear_avg16x16 vp9_bilinear_predict_avg16x16_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_bilinear_avg16x16);
-
-#ifndef vp9_subpix_bilinear_avg8x8
-#define vp9_subpix_bilinear_avg8x8 vp9_bilinear_predict_avg8x8_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_bilinear_avg8x8);
-
-#ifndef vp9_subpix_bilinear8x4
-#define vp9_subpix_bilinear8x4 vp9_bilinear_predict8x4_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_bilinear8x4);
-
-#ifndef vp9_subpix_bilinear4x4
-#define vp9_subpix_bilinear4x4 vp9_bilinear_predict4x4_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_bilinear4x4);
-
-#ifndef vp9_subpix_bilinear_avg4x4
-#define vp9_subpix_bilinear_avg4x4 vp9_bilinear_predict_avg4x4_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_bilinear_avg4x4);
-
-typedef prototype_subpixel_predict((*vp9_subpix_fn_t));
-typedef struct {
-  vp9_subpix_fn_t  eighttap16x16;
-  vp9_subpix_fn_t  eighttap8x8;
-  vp9_subpix_fn_t  eighttap_avg16x16;
-  vp9_subpix_fn_t  eighttap_avg8x8;
-  vp9_subpix_fn_t  eighttap_avg4x4;
-  vp9_subpix_fn_t  eighttap8x4;
-  vp9_subpix_fn_t  eighttap4x4;
-  vp9_subpix_fn_t  eighttap16x16_sharp;
-  vp9_subpix_fn_t  eighttap8x8_sharp;
-  vp9_subpix_fn_t  eighttap_avg16x16_sharp;
-  vp9_subpix_fn_t  eighttap_avg8x8_sharp;
-  vp9_subpix_fn_t  eighttap_avg4x4_sharp;
-  vp9_subpix_fn_t  eighttap8x4_sharp;
-  vp9_subpix_fn_t  eighttap4x4_sharp;
-  vp9_subpix_fn_t  sixtap16x16;
-  vp9_subpix_fn_t  sixtap8x8;
-  vp9_subpix_fn_t  sixtap_avg16x16;
-  vp9_subpix_fn_t  sixtap_avg8x8;
-  vp9_subpix_fn_t  sixtap8x4;
-  vp9_subpix_fn_t  sixtap4x4;
-  vp9_subpix_fn_t  sixtap_avg4x4;
-  vp9_subpix_fn_t  bilinear16x16;
-  vp9_subpix_fn_t  bilinear8x8;
-  vp9_subpix_fn_t  bilinear_avg16x16;
-  vp9_subpix_fn_t  bilinear_avg8x8;
-  vp9_subpix_fn_t  bilinear8x4;
-  vp9_subpix_fn_t  bilinear4x4;
-  vp9_subpix_fn_t  bilinear_avg4x4;
-} vp9_subpix_rtcd_vtable_t;
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define SUBPIX_INVOKE(ctx,fn) (ctx)->fn
-#else
-#define SUBPIX_INVOKE(ctx,fn) vp9_subpix_##fn
-#endif
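-
-/* Illustrative sketch (hypothetical call site, not from this header):
-
-     vp9_subpix_rtcd_vtable_t *subpix = ...;
-     SUBPIX_INVOKE(subpix, sixtap8x8)(src, src_pitch, xofst, yofst,
-                                      dst, dst_pitch);
-
-   With runtime CPU detection this resolves to the sixtap8x8 function
-   pointer in the vtable; without it, it expands to the compile-time
-   vp9_subpix_sixtap8x8 mapping above. */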
-
-#endif
--- a/vp8/common/swapyv12buffer.c
+++ /dev/null
@@ -1,32 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "swapyv12buffer.h"
-
-void vp9_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame,
-                          YV12_BUFFER_CONFIG *last_frame) {
-  unsigned char *temp;
-
-  temp = last_frame->buffer_alloc;
-  last_frame->buffer_alloc = new_frame->buffer_alloc;
-  new_frame->buffer_alloc = temp;
-
-  temp = last_frame->y_buffer;
-  last_frame->y_buffer = new_frame->y_buffer;
-  new_frame->y_buffer = temp;
-
-  temp = last_frame->u_buffer;
-  last_frame->u_buffer = new_frame->u_buffer;
-  new_frame->u_buffer = temp;
-
-  temp = last_frame->v_buffer;
-  last_frame->v_buffer = new_frame->v_buffer;
-  new_frame->v_buffer = temp;
-}
--- a/vp8/common/swapyv12buffer.h
+++ /dev/null
@@ -1,19 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef __SWAPYV12_BUFFER_H
-#define __SWAPYV12_BUFFER_H
-
-#include "vpx_scale/yv12config.h"
-
-void vp9_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame,
-                          YV12_BUFFER_CONFIG *last_frame);
-
-#endif  // __SWAPYV12_BUFFER_H
--- a/vp8/common/systemdependent.h
+++ /dev/null
@@ -1,21 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#if ARCH_X86 || ARCH_X86_64
-void vpx_reset_mmx_state(void);
-#define vp9_clear_system_state() vpx_reset_mmx_state()
-#else
-#define vp9_clear_system_state()
-#endif
-
-struct VP9Common;
-void vp9_machine_specific_config(struct VP9Common *);
--- a/vp8/common/tapify.py
+++ /dev/null
@@ -1,106 +1,0 @@
-"""
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
-"""
-#!/usr/bin/env python
-import sys,string,os,re,math,numpy
-scale = 2**16
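-# Note: dist() returns an inverse-distance weight rather than a distance --
-# 1.0 for coincident points, 1/euclidean distance otherwise.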
-def dist(p1,p2):
-  x1,y1 = p1
-  x2,y2 = p2
-  if x1==x2 and y1==y2 :
-    return 1.0 
-  return 1/ math.sqrt((x1-x2)*(x1-x2)+(y1-y2)*(y1-y2))
-
-def gettaps(p):
-  def l(b):
-    return int(math.floor(b))
-  def h(b):
-    return int(math.ceil(b))
-  def t(b,p,s):
-    return int((scale*dist(b,p)+s/2)/s)
-  r,c = p
-  ul=[l(r),l(c)]
-  ur=[l(r),h(c)]
-  ll=[h(r),l(c)]
-  lr=[h(r),h(c)]
-  sum = dist(ul,p)+dist(ur,p)+dist(ll,p)+dist(lr,p)
-  t4 = scale - t(ul,p,sum) - t(ur,p,sum) - t(ll,p,sum);
-  return [[ul,t(ul,p,sum)],[ur,t(ur,p,sum)],
-          [ll,t(ll,p,sum)],[lr,t4]]
-
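-# For every pixel of a blocksize x blocksize grid, rotate its position by
-# `angle` degrees about the block centre and print the four bilinear taps
-# (row, col, weight) that sample the rotated position; the four weights
-# sum to scale (2**16).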
-def print_mb_taps(angle,blocksize):
-  theta = angle / 57.2957795;
-  affine = [[math.cos(theta),-math.sin(theta)],
-            [math.sin(theta),math.cos(theta)]]
-  radius = (float(blocksize)-1)/2
-  print " // angle of",angle,"degrees"
-  for y in range(blocksize) :
-    for x in range(blocksize) :
-      r,c = numpy.dot(affine,[y-radius, x-radius])
-      tps = gettaps([r+radius,c+radius])
-      for t in tps :
-        p,t = t
-        tr,tc = p
-        print " %2d, %2d, %5d, " % (tr,tc,t,),
-      print " // %2d,%2d " % (y,x)
-
-i=float(sys.argv[1])
-while  i <= float(sys.argv[2]) :
-  print_mb_taps(i,float(sys.argv[4]))
-  i=i+float(sys.argv[3])
-"""
-
-taps = []
-pt=dict()
-ptr=dict()
-for y in range(16) :
-  for x in range(16) :
-    r,c = numpy.dot(affine,[y-7.5, x-7.5])
-    tps = gettaps([r+7.5,c+7.5])
-    j=0
-    for tp in tps : 
-      p,i = tp
-      r,c = p
-      pt[y,x,j]= [p,i]
-      try: 
-        ptr[r,j,c].append([y,x])
-      except:
-        ptr[r,j,c]=[[y,x]]
-      j = j+1 
-
-for key in sorted(pt.keys()) :
-  print key,pt[key]
-
-lr = -99
-lj = -99 
-lc = 0
-
-shuf=""
-mask=""
-for r,j,c in sorted(ptr.keys()) :
-  for y,x in ptr[r,j,c] :
-    if lr != r or lj != j :
-      print "shuf_"+str(lr)+"_"+str(lj)+"_"+shuf.ljust(16,"0"), lc
-      shuf=""
-      lc = 0
-    for i in range(lc,c-1) :
-      shuf = shuf +"0"
-    shuf = shuf + hex(x)[2]
-    lc =c
-    break
-  lr = r
-  lj = j
-#  print r,j,c,ptr[r,j,c]    
-#  print 
-
-for r,j,c in sorted(ptr.keys()) :
-  for y,x in ptr[r,j,c] :
-    print r,j,c,y,x 
-    break
-"""
--- a/vp8/common/textblit.c
+++ /dev/null
@@ -1,116 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stdlib.h>
-
-
-void vp9_blit_text(const char *msg, unsigned char *address, const int pitch) {
-  int letter_bitmap;
-  unsigned char *output_pos = address;
-  int colpos;
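-  /* Each entry below packs one 7-column x 5-row glyph, 5 bits per column:
-     bit (fontcol * 5 + fontrow) lights the pixel at (fontrow, fontcol). */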
-  const int font[] = {
-    0x0, 0x5C00, 0x8020, 0xAFABEA, 0xD7EC0, 0x1111111, 0x1855740, 0x18000,
-    0x45C0, 0x74400, 0x51140, 0x23880, 0xC4000, 0x21080, 0x80000, 0x111110,
-    0xE9D72E, 0x87E40, 0x12AD732, 0xAAD62A, 0x4F94C4, 0x4D6B7, 0x456AA,
-    0x3E8423, 0xAAD6AA, 0xAAD6A2, 0x2800, 0x2A00, 0x8A880, 0x52940, 0x22A20,
-    0x15422, 0x6AD62E, 0x1E4A53E, 0xAAD6BF, 0x8C62E, 0xE8C63F, 0x118D6BF,
-    0x1094BF, 0xCAC62E, 0x1F2109F, 0x118FE31, 0xF8C628, 0x8A89F, 0x108421F,
-    0x1F1105F, 0x1F4105F, 0xE8C62E, 0x2294BF, 0x164C62E, 0x12694BF, 0x8AD6A2,
-    0x10FC21, 0x1F8421F, 0x744107, 0xF8220F, 0x1151151, 0x117041, 0x119D731,
-    0x47E0, 0x1041041, 0xFC400, 0x10440, 0x1084210, 0x820
-  };
-  colpos = 0;
-
-  while (msg[colpos] != 0) {
-    char letter = msg[colpos];
-    int fontcol, fontrow;
-
-    if (letter <= 'Z' && letter >= ' ')
-      letter_bitmap = font[letter - ' '];
-    else if (letter <= 'z' && letter >= 'a')
-      letter_bitmap = font[letter - 'a' + 'A' - ' '];
-    else
-      letter_bitmap = font[0];
-
-    for (fontcol = 6; fontcol >= 0; fontcol--)
-      for (fontrow = 0; fontrow < 5; fontrow++)
-        output_pos[fontrow * pitch + fontcol] =
-          ((letter_bitmap >> (fontcol * 5)) & (1 << fontrow) ? 255 : 0);
-
-    output_pos += 7;
-    colpos++;
-  }
-}
-
-static void plot(const int x, const int y, unsigned char *image, const int pitch) {
-  image [x + y * pitch] ^= 255;
-}
-
-/* Bresenham line algorithm */
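-/* Coordinates are swapped for steep lines (|dy| > |dx|) so the loop always
-   steps along the major axis; plot() XORs pixels, so drawing the same line
-   twice erases it. */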
-void vp9_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, const int pitch) {
-  int steep = abs(y1 - y0) > abs(x1 - x0);
-  int deltax, deltay;
-  int error, ystep, y, x;
-
-  if (steep) {
-    int t;
-    t = x0;
-    x0 = y0;
-    y0 = t;
-
-    t = x1;
-    x1 = y1;
-    y1 = t;
-  }
-
-  if (x0 > x1) {
-    int t;
-    t = x0;
-    x0 = x1;
-    x1 = t;
-
-    t = y0;
-    y0 = y1;
-    y1 = t;
-  }
-
-  deltax = x1 - x0;
-  deltay = abs(y1 - y0);
-  error  = deltax / 2;
-
-  y = y0;
-
-  if (y0 < y1)
-    ystep = 1;
-  else
-    ystep = -1;
-
-  if (steep) {
-    for (x = x0; x <= x1; x++) {
-      plot(y, x, image, pitch);
-
-      error = error - deltay;
-      if (error < 0) {
-        y = y + ystep;
-        error = error + deltax;
-      }
-    }
-  } else {
-    for (x = x0; x <= x1; x++) {
-      plot(x, y, image, pitch);
-
-      error = error - deltay;
-      if (error < 0) {
-        y = y + ystep;
-        error = error + deltax;
-      }
-    }
-  }
-}
--- a/vp8/common/treecoder.c
+++ /dev/null
@@ -1,138 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_config.h"
-
-#if defined(CONFIG_DEBUG) && CONFIG_DEBUG
-#include <assert.h>
-#endif
-#include <stdio.h>
-
-#include "treecoder.h"
-
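-/* Depth-first walk of the tree: v accumulates the bit path and L its
-   length; a non-positive index j terminates a branch and records the
-   (value, Len) pair for token -j. */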
-static void tree2tok(
-  struct vp9_token_struct *const p,
-  vp9_tree t,
-  int i,
-  int v,
-  int L
-) {
-  v += v;
-  ++L;
-
-  do {
-    const vp9_tree_index j = t[i++];
-
-    if (j <= 0) {
-      p[-j].value = v;
-      p[-j].Len = L;
-    } else
-      tree2tok(p, t, j, v, L);
-  } while (++v & 1);
-}
-
-void vp9_tokens_from_tree(struct vp9_token_struct *p, vp9_tree t) {
-  tree2tok(p, t, 0, 0, 0);
-}
-
-void vp9_tokens_from_tree_offset(struct vp9_token_struct *p, vp9_tree t,
-                                 int offset) {
-  tree2tok(p - offset, t, 0, 0, 0);
-}
-
-static void branch_counts(
-  int n,                      /* n = size of alphabet */
-  vp9_token tok               [ /* n */ ],
-  vp9_tree tree,
-  unsigned int branch_ct       [ /* n-1 */ ] [2],
-  const unsigned int num_events[ /* n */ ]
-) {
-  const int tree_len = n - 1;
-  int t = 0;
-
-#if CONFIG_DEBUG
-  assert(tree_len);
-#endif
-
-  do {
-    branch_ct[t][0] = branch_ct[t][1] = 0;
-  } while (++t < tree_len);
-
-  t = 0;
-
-  do {
-    int L = tok[t].Len;
-    const int enc = tok[t].value;
-    const unsigned int ct = num_events[t];
-
-    vp9_tree_index i = 0;
-
-    do {
-      const int b = (enc >> --L) & 1;
-      const int j = i >> 1;
-#if CONFIG_DEBUG
-      assert(j < tree_len  &&  0 <= L);
-#endif
-
-      branch_ct [j] [b] += ct;
-      i = tree[ i + b];
-    } while (i > 0);
-
-#if CONFIG_DEBUG
-    assert(!L);
-#endif
-  } while (++t < n);
-
-}
-
-
-void vp9_tree_probs_from_distribution(
-  int n,                      /* n = size of alphabet */
-  vp9_token tok               [ /* n */ ],
-  vp9_tree tree,
-  vp9_prob probs          [ /* n-1 */ ],
-  unsigned int branch_ct       [ /* n-1 */ ] [2],
-  const unsigned int num_events[ /* n */ ],
-  unsigned int Pfac,
-  int rd
-) {
-  const int tree_len = n - 1;
-  int t = 0;
-
-  branch_counts(n, tok, tree, branch_ct, num_events);
-
-  do {
-    const unsigned int *const c = branch_ct[t];
-    const unsigned int tot = c[0] + c[1];
-
-#if CONFIG_DEBUG
-    assert(tot < (1 << 24));        /* no overflow below */
-#endif
-
-    if (tot) {
-      const unsigned int p = ((c[0] * Pfac) + (rd ? tot >> 1 : 0)) / tot;
-      probs[t] = p < 256 ? (p ? p : 1) : 255; /* agree w/old version for now */
-    } else
-      probs[t] = vp9_prob_half;
-  } while (++t < tree_len);
-}
-
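-/* e.g. counts = {3, 1}: prob = (3 * 255 + 2) / 4 = 191, the probability of
-   a zero bit on the usual 1..255 scale; an empty distribution falls back
-   to the neutral 128. */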
-vp9_prob vp9_bin_prob_from_distribution(const unsigned int counts[2]) {
-  int tot_count = counts[0] + counts[1];
-  vp9_prob prob;
-  if (tot_count) {
-    prob = (counts[0] * 255 + (tot_count >> 1)) / tot_count;
-    prob += !prob;
-  } else {
-    prob = 128;
-  }
-  return prob;
-}
--- a/vp8/common/treecoder.h
+++ /dev/null
@@ -1,75 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_TREECODER_H
-#define __INC_TREECODER_H
-
-typedef unsigned char vp9_prob;
-
-#define vp9_prob_half ( (vp9_prob) 128)
-
-typedef signed char vp9_tree_index;
-struct bool_coder_spec;
-
-typedef struct bool_coder_spec bool_coder_spec;
-typedef struct bool_writer bool_writer;
-typedef struct bool_reader bool_reader;
-
-typedef const bool_coder_spec c_bool_coder_spec;
-typedef const bool_writer c_bool_writer;
-typedef const bool_reader c_bool_reader;
-
-
-
-# define vp9_complement( x) (255 - x)
-
-
-/* We build coding trees compactly in arrays.
-   Each node of the tree is a pair of vp9_tree_indices.
-   Array index often references a corresponding probability table.
-   Index <= 0 means done encoding/decoding and value = -Index,
-   Index > 0 means need another bit, specification at index.
-   Nonnegative indices are always even;  processing begins at node 0. */
-
-typedef const vp9_tree_index vp9_tree[], *vp9_tree_p;
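-
-/* Illustrative sketch (not from the original source): a 3-symbol
-   alphabet {A=0, B=1, C=2} could be laid out as
-
-     static const vp9_tree_index example_tree[4] = {
-       -0, 2,     -- node 0: bit 0 ends with value 0, bit 1 jumps to node 2
-       -1, -2     -- node 2: bit 0 ends with value 1, bit 1 ends with value 2
-     };
-
-   giving the codes A = 0, B = 10, C = 11. */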
-
-
-typedef const struct vp9_token_struct {
-  int value;
-  int Len;
-} vp9_token;
-
-/* Construct encoding array from tree. */
-
-void vp9_tokens_from_tree(struct vp9_token_struct *, vp9_tree);
-void vp9_tokens_from_tree_offset(struct vp9_token_struct *, vp9_tree,
-                                 int offset);
-
-
-/* Convert array of token occurrence counts into a table of probabilities
-   for the associated binary encoding tree.  Also writes count of branches
-   taken for each node on the tree; this facilitates decisions as to
-   probability updates. */
-
-void vp9_tree_probs_from_distribution(
-  int n,                      /* n = size of alphabet */
-  vp9_token tok               [ /* n */ ],
-  vp9_tree tree,
-  vp9_prob probs          [ /* n-1 */ ],
-  unsigned int branch_ct       [ /* n-1 */ ] [2],
-  const unsigned int num_events[ /* n */ ],
-  unsigned int Pfactor,
-  int Round
-);
-
-vp9_prob vp9_bin_prob_from_distribution(const unsigned int counts[2]);
-
-#endif
--- a/vp8/common/type_aliases.h
+++ /dev/null
@@ -1,120 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/****************************************************************************
-*
-*   Module Title :     type_aliases.h
-*
-*   Description  :     Standard type aliases
-*
-****************************************************************************/
-#ifndef __INC_TYPE_ALIASES_H
-#define __INC_TYPE_ALIASES_H
-
-/****************************************************************************
-* Macros
-****************************************************************************/
-#define EXPORT
-#define IMPORT          extern      /* Used to declare imported data & routines */
-#define PRIVATE         static      /* Used to declare & define module-local data */
-#define LOCAL           static      /* Used to define all persistent routine-local data */
-#define STD_IN_PATH     0           /* Standard input path */
-#define STD_OUT_PATH    1           /* Standard output path */
-#define STD_ERR_PATH    2           /* Standard error path */
-#define STD_IN_FILE     stdin       /* Standard input file pointer */
-#define STD_OUT_FILE    stdout      /* Standard output file pointer */
-#define STD_ERR_FILE    stderr      /* Standard error file pointer */
-#define max_int         0x7FFFFFFF
-
-#define __export
-#define _export
-
-#define CCONV
-
-#ifndef NULL
-#ifdef __cplusplus
-#define NULL    0
-#else
-#define NULL    ((void *)0)
-#endif
-#endif
-
-#ifndef FALSE
-#define FALSE   0
-#endif
-
-#ifndef TRUE
-#define TRUE    1
-#endif
-
-/****************************************************************************
-* Typedefs
-****************************************************************************/
-#ifndef TYPE_INT8
-#define TYPE_INT8
-typedef signed char     INT8;
-#endif
-
-#ifndef TYPE_INT16
-/*#define TYPE_INT16*/
-typedef signed short    INT16;
-#endif
-
-#ifndef TYPE_INT32
-/*#define TYPE_INT32*/
-typedef signed int      INT32;
-#endif
-
-#ifndef TYPE_UINT8
-/*#define TYPE_UINT8*/
-typedef unsigned char   UINT8;
-#endif
-
-#ifndef TYPE_UINT32
-/*#define TYPE_UINT32*/
-typedef unsigned int    UINT32;
-#endif
-
-#ifndef TYPE_UINT16
-/*#define TYPE_UINT16*/
-typedef unsigned short  UINT16;
-#endif
-
-#ifndef TYPE_BOOL
-/*#define TYPE_BOOL*/
-typedef int             BOOL;
-#endif
-
-typedef unsigned char   BOOLEAN;
-
-#ifdef _MSC_VER
-typedef __int64 INT64;
-#ifndef INT64_MAX
-#define INT64_MAX LLONG_MAX
-#endif
-#else
-
-#ifndef TYPE_INT64
-#ifdef _TMS320C6X
-/* for now we only have 40bits */
-typedef long INT64;
-#else
-typedef long long INT64;
-#endif
-#endif
-
-#endif
-
-/* Floating point */
-typedef  double         FLOAT64;
-typedef  float          FLOAT32;
-
-#endif
--- a/vp8/common/x86/filter_sse2.c
+++ /dev/null
@@ -1,289 +1,0 @@
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h> // for alignment checks
-#include <emmintrin.h> // SSE2
-#include "vp8/common/filter.h"
-#include "vpx_ports/mem.h" // for DECLARE_ALIGNED
-#include "vpx_rtcd.h"
-
-// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is
-//           just a quick partial snapshot so that others can already use
-//           some speedup.
-// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap
-//           filtering.
-// TODO(cd): Add some comments, better variable naming.
-// TODO(cd): Maybe use _mm_maddubs_epi16 if filter coefficients are smaller
-//           (no sum of positive coefficients above 128), or have higher
-//           precision filter coefficients.
-
-DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = {
-  VP9_FILTER_WEIGHT >> 1,
-  VP9_FILTER_WEIGHT >> 1,
-  VP9_FILTER_WEIGHT >> 1,
-  VP9_FILTER_WEIGHT >> 1,
-};
-
-// Creating a macro to do more than four pixels at once to hide instruction
-// latency is actually slower :-(
-#define DO_FOUR_PIXELS(result, src_ptr, offset)                                \
-  {                                                                            \
-  /* Do shifted loads to achieve the required shuffles through unpacking */   \
-  const __m128i src0  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 0)); \
-  const __m128i src1  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 1)); \
-  const __m128i src2  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 2)); \
-  const __m128i src3  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 3)); \
-  const __m128i src01 = _mm_unpacklo_epi8(src0, src1);                         \
-  const __m128i src01_16 = _mm_unpacklo_epi8(src01, zero);                     \
-  const __m128i src23 = _mm_unpacklo_epi8(src2, src3);                         \
-  const __m128i src23_16 = _mm_unpacklo_epi8(src23, zero);                     \
-  /* Shift by 4 bytes through shuffle to get additional shifted loads */      \
-  const __m128i src4  = _mm_shuffle_epi32(src0, _MM_SHUFFLE(3, 3, 2, 1));      \
-  const __m128i src5  = _mm_shuffle_epi32(src1, _MM_SHUFFLE(3, 3, 2, 1));      \
-  const __m128i src6  = _mm_shuffle_epi32(src2, _MM_SHUFFLE(3, 3, 2, 1));      \
-  const __m128i src7  = _mm_shuffle_epi32(src3, _MM_SHUFFLE(3, 3, 2, 1));      \
-  const __m128i src45 = _mm_unpacklo_epi8(src4, src5);                         \
-  const __m128i src45_16 = _mm_unpacklo_epi8(src45, zero);                     \
-  const __m128i src67 = _mm_unpacklo_epi8(src6, src7);                         \
-  const __m128i src67_16 = _mm_unpacklo_epi8(src67, zero);                     \
-  /* multiply accumulate them */                                               \
-  const __m128i mad01 = _mm_madd_epi16(src01_16, fil01);                       \
-  const __m128i mad23 = _mm_madd_epi16(src23_16, fil23);                       \
-  const __m128i mad45 = _mm_madd_epi16(src45_16, fil45);                       \
-  const __m128i mad67 = _mm_madd_epi16(src67_16, fil67);                       \
-  const __m128i mad0123 = _mm_add_epi32(mad01, mad23);                         \
-  const __m128i mad4567 = _mm_add_epi32(mad45, mad67);                         \
-  __m128i mad_all = _mm_add_epi32(mad0123, mad4567);                           \
-  mad_all = _mm_add_epi32(mad_all, rounding);                                  \
-  result = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);                          \
-  }
-
-void vp9_filter_block2d_4x4_8_sse2
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
-  __m128i intermediateA, intermediateB, intermediateC;
-
-  const int kInterp_Extend = 4;
-
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c);
-
-  // check alignment
-  assert(0 == ((long)HFilter_aligned16)%16);
-  assert(0 == ((long)VFilter_aligned16)%16);
-
-  {
-    __m128i transpose3_0;
-    __m128i transpose3_1;
-    __m128i transpose3_2;
-    __m128i transpose3_3;
-
-    // Horizontal pass (src -> intermediate).
-    {
-      const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16);
-      // get first two columns filter coefficients
-      __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0));
-      __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1));
-      __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2));
-      __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3));
-      src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
-
-      {
-        __m128i mad_all0;
-        __m128i mad_all1;
-        __m128i mad_all2;
-        __m128i mad_all3;
-        DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
-        DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
-        DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
-        DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride)
-        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
-        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
-        intermediateA = _mm_packus_epi16(mad_all0, mad_all2);
-        // --
-        src_ptr += src_stride*4;
-        // --
-        DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
-        DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
-        DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
-        DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride)
-        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
-        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
-        intermediateB = _mm_packus_epi16(mad_all0, mad_all2);
-        // --
-        src_ptr += src_stride*4;
-        // --
-        DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
-        DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
-        DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
-        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
-        mad_all2 = _mm_packs_epi32(mad_all2, mad_all2);
-        intermediateC = _mm_packus_epi16(mad_all0, mad_all2);
-      }
-    }
-
-    // Transpose result (intermediate -> transpose3_x)
-    {
-      // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33
-      // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73
-      // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx
-      const __m128i transpose0_0 = _mm_unpacklo_epi8(intermediateA, intermediateB);
-      const __m128i transpose0_1 = _mm_unpackhi_epi8(intermediateA, intermediateB);
-      const __m128i transpose0_2 = _mm_unpacklo_epi8(intermediateC, intermediateC);
-      const __m128i transpose0_3 = _mm_unpackhi_epi8(intermediateC, intermediateC);
-      // 00 40 01 41 02 42 03 43 10 50 11 51 12 52 13 53
-      // 20 60 21 61 22 62 23 63 30 70 31 71 32 72 33 73
-      // 80 xx 81 xx 82 xx 83 xx 90 xx 91 xx 92 xx 93 xx
-      // A0 xx A1 xx A2 xx A3 xx xx xx xx xx xx xx xx xx
-      const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);
-      const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);
-      const __m128i transpose1_2 = _mm_unpacklo_epi8(transpose0_2, transpose0_3);
-      const __m128i transpose1_3 = _mm_unpackhi_epi8(transpose0_2, transpose0_3);
-      // 00 20 40 60 01 21 41 61 02 22 42 62 03 23 43 63
-      // 10 30 50 70 11 31 51 71 12 32 52 72 13 33 53 73
-      // 80 A0 xx xx 81 A1 xx xx 82 A2 xx xx 83 A3 xx xx
-      // 90 xx xx xx 91 xx xx xx 92 xx xx xx 93 xx xx xx
-      const __m128i transpose2_0 = _mm_unpacklo_epi8(transpose1_0, transpose1_1);
-      const __m128i transpose2_1 = _mm_unpackhi_epi8(transpose1_0, transpose1_1);
-      const __m128i transpose2_2 = _mm_unpacklo_epi8(transpose1_2, transpose1_3);
-      const __m128i transpose2_3 = _mm_unpackhi_epi8(transpose1_2, transpose1_3);
-      // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
-      // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
-      // 80 90 A0 xx xx xx xx xx 81 91 A1 xx xx xx xx xx
-      // 82 92 A2 xx xx xx xx xx 83 93 A3 xx xx xx xx xx
-      transpose3_0 = _mm_castps_si128(
-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
-                                           _mm_castsi128_ps(transpose2_2),
-                                           _MM_SHUFFLE(1, 0, 1, 0)));
-      transpose3_1 = _mm_castps_si128(
-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
-                                           _mm_castsi128_ps(transpose2_2),
-                                           _MM_SHUFFLE(3, 2, 3, 2)));
-      transpose3_2 = _mm_castps_si128(
-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
-                                           _mm_castsi128_ps(transpose2_3),
-                                           _MM_SHUFFLE(1, 0, 1, 0)));
-      transpose3_3 = _mm_castps_si128(
-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
-                                           _mm_castsi128_ps(transpose2_3),
-                                           _MM_SHUFFLE(3, 2, 3, 2)));
-      // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx
-      // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx
-      // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx
-      // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx
-    }
-
-    // Vertical pass (transpose3_x -> dst).
-    {
-      const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16);
-      // get first two columns filter coefficients
-      __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0));
-      __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1));
-      __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2));
-      __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3));
-      __m128i col0, col1, col2, col3;
-      DECLARE_ALIGNED(16, unsigned char, temp[32]);
-      {
-        _mm_store_si128((__m128i *)temp, transpose3_0);
-        DO_FOUR_PIXELS(col0, temp, 0);
-      }
-      {
-        _mm_store_si128((__m128i *)temp, transpose3_1);
-        DO_FOUR_PIXELS(col1, temp, 0);
-      }
-      {
-        _mm_store_si128((__m128i *)temp, transpose3_2);
-        DO_FOUR_PIXELS(col2, temp, 0);
-      }
-      {
-        _mm_store_si128((__m128i *)temp, transpose3_3);
-        DO_FOUR_PIXELS(col3, temp, 0);
-      }
-      // transpose
-      {
-        __m128i T0 = _mm_unpacklo_epi32(col0, col1);
-        __m128i T1 = _mm_unpacklo_epi32(col2, col3);
-        __m128i T2 = _mm_unpackhi_epi32(col0, col1);
-        __m128i T3 = _mm_unpackhi_epi32(col2, col3);
-        col0 = _mm_unpacklo_epi64(T0, T1);
-        col1 = _mm_unpackhi_epi64(T0, T1);
-        col2 = _mm_unpacklo_epi64(T2, T3);
-        col3 = _mm_unpackhi_epi64(T2, T3);
-      }
-      // saturate to 8 bit
-      {
-        col0 = _mm_packs_epi32(col0, col0);
-        col0 = _mm_packus_epi16(col0, col0);
-        col1 = _mm_packs_epi32(col1, col1);
-        col1 = _mm_packus_epi16(col1, col1);
-        col2 = _mm_packs_epi32(col2, col2);
-        col2 = _mm_packus_epi16(col2, col2);
-        col3 = _mm_packs_epi32(col3, col3);
-        col3 = _mm_packus_epi16(col3, col3);
-      }
-      // store
-      {
-        *((unsigned int *)&dst_ptr[dst_stride * 0]) = _mm_cvtsi128_si32(col0);
-        *((unsigned int *)&dst_ptr[dst_stride * 1]) = _mm_cvtsi128_si32(col1);
-        *((unsigned int *)&dst_ptr[dst_stride * 2]) = _mm_cvtsi128_si32(col2);
-        *((unsigned int *)&dst_ptr[dst_stride * 3]) = _mm_cvtsi128_si32(col3);
-      }
-    }
-  }
-}
-
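-// The larger block sizes below simply tile the 4x4 kernel; see the TODO
-// above about committing faster dedicated versions.
-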
-void vp9_filter_block2d_8x4_8_sse2
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
-  int j;
-  for (j=0; j<8; j+=4) {
-    vp9_filter_block2d_4x4_8_sse2(src_ptr + j, src_stride,
-                                  HFilter_aligned16, VFilter_aligned16,
-                                  dst_ptr + j, dst_stride);
-  }
-}
-
-void vp9_filter_block2d_8x8_8_sse2
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
-  int i, j;
-  for (i=0; i<8; i+=4) {
-    for (j=0; j<8; j+=4) {
-      vp9_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride,
-                                    HFilter_aligned16, VFilter_aligned16,
-                                    dst_ptr + j + i*dst_stride, dst_stride);
-    }
-  }
-}
-
-void vp9_filter_block2d_16x16_8_sse2
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
-  int i, j;
-  for (i=0; i<16; i+=4) {
-    for (j=0; j<16; j+=4) {
-      vp9_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride,
-                                    HFilter_aligned16, VFilter_aligned16,
-                                    dst_ptr + j + i*dst_stride, dst_stride);
-    }
-  }
-}
--- a/vp8/common/x86/filter_sse4.c
+++ /dev/null
@@ -1,362 +1,0 @@
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h> // for alignment checks
-#include <smmintrin.h> // SSE4.1
-#include "vp8/common/filter.h"
-#include "vpx_ports/mem.h" // for DECLARE_ALIGNED
-#include "vpx_rtcd.h"
-
-// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is
-//           just a quick partial snapshot so that others can already use
-//           some speedup.
-// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap
-//           filtering.
-// TODO(cd): Reduce source size by using macros instead of current code
-//           duplication.
-// TODO(cd): Add some comments, better variable naming.
-// TODO(cd): Maybe use _mm_maddubs_epi16 if filter coefficients are smaller
-//           (no sum of positive coefficients above 128), or have higher
-//           precision filter coefficients.
-
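-// The shuffle masks below let a single pshufb gather the overlapping source
-// byte pairs that the SSE2 version assembles with four shifted loads and
-// unpacks.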
-DECLARE_ALIGNED(16, static const unsigned char, mask0123_c[16]) = {
-  0x00, 0x01,
-  0x01, 0x02,
-  0x02, 0x03,
-  0x03, 0x04,
-  0x02, 0x03,
-  0x03, 0x04,
-  0x04, 0x05,
-  0x05, 0x06,
-};
-DECLARE_ALIGNED(16, static const unsigned char, mask4567_c[16]) = {
-  0x04, 0x05,
-  0x05, 0x06,
-  0x06, 0x07,
-  0x07, 0x08,
-  0x06, 0x07,
-  0x07, 0x08,
-  0x08, 0x09,
-  0x09, 0x0A,
-};
-DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = {
-  VP9_FILTER_WEIGHT >> 1,
-  VP9_FILTER_WEIGHT >> 1,
-  VP9_FILTER_WEIGHT >> 1,
-  VP9_FILTER_WEIGHT >> 1,
-};
-DECLARE_ALIGNED(16, static const unsigned char, transpose_c[16]) = {
-  0, 4,  8, 12,
-  1, 5,  9, 13,
-  2, 6, 10, 14,
-  3, 7, 11, 15
-};
-
-// Creating a macro to do more than four pixels at once to hide instruction
-// latency is actually slower :-(
-#define DO_FOUR_PIXELS(result, offset)                                         \
-  {                                                                            \
-  /*load pixels*/                                                              \
-  __m128i src  = _mm_loadu_si128((const __m128i *)(src_ptr + offset));         \
-  /* extract the ones used for first column */                                 \
-  __m128i src0123 = _mm_shuffle_epi8(src, mask0123);                           \
-  __m128i src4567 = _mm_shuffle_epi8(src, mask4567);                           \
-  __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);                         \
-  __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);                         \
-  __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);                         \
-  __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);                         \
-  /* multiply accumulate them */                                               \
-  __m128i mad01 = _mm_madd_epi16(src01_16, fil01);                             \
-  __m128i mad23 = _mm_madd_epi16(src23_16, fil23);                             \
-  __m128i mad45 = _mm_madd_epi16(src45_16, fil45);                             \
-  __m128i mad67 = _mm_madd_epi16(src67_16, fil67);                             \
-  __m128i mad0123 = _mm_add_epi32(mad01, mad23);                               \
-  __m128i mad4567 = _mm_add_epi32(mad45, mad67);                               \
-  __m128i mad_all = _mm_add_epi32(mad0123, mad4567);                           \
-  mad_all = _mm_add_epi32(mad_all, rounding);                                  \
-  result = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);                          \
-  }
-
-void vp9_filter_block2d_4x4_8_sse4_1
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
-  __m128i intermediateA, intermediateB, intermediateC;
-
-  const int kInterp_Extend = 4;
-
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i mask0123 = _mm_load_si128((const __m128i *)mask0123_c);
-  const __m128i mask4567 = _mm_load_si128((const __m128i *)mask4567_c);
-  const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c);
-  const __m128i transpose = _mm_load_si128((const __m128i *)transpose_c);
-
-  // check alignment
-  assert(0 == ((long)HFilter_aligned16)%16);
-  assert(0 == ((long)VFilter_aligned16)%16);
-
-  {
-    __m128i transpose3_0;
-    __m128i transpose3_1;
-    __m128i transpose3_2;
-    __m128i transpose3_3;
-
-    // Horizontal pass (src -> intermediate).
-    {
-      const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16);
-      // get first two columns filter coefficients
-      __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0));
-      __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1));
-      __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2));
-      __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3));
-      src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
-
-      {
-        __m128i mad_all0;
-        __m128i mad_all1;
-        __m128i mad_all2;
-        __m128i mad_all3;
-        DO_FOUR_PIXELS(mad_all0, 0*src_stride)
-        DO_FOUR_PIXELS(mad_all1, 1*src_stride)
-        DO_FOUR_PIXELS(mad_all2, 2*src_stride)
-        DO_FOUR_PIXELS(mad_all3, 3*src_stride)
-        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
-        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
-        intermediateA = _mm_packus_epi16(mad_all0, mad_all2);
-        // --
-        src_ptr += src_stride*4;
-        // --
-        DO_FOUR_PIXELS(mad_all0, 0*src_stride)
-        DO_FOUR_PIXELS(mad_all1, 1*src_stride)
-        DO_FOUR_PIXELS(mad_all2, 2*src_stride)
-        DO_FOUR_PIXELS(mad_all3, 3*src_stride)
-        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
-        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
-        intermediateB = _mm_packus_epi16(mad_all0, mad_all2);
-        // --
-        src_ptr += src_stride*4;
-        // --
-        DO_FOUR_PIXELS(mad_all0, 0*src_stride)
-        DO_FOUR_PIXELS(mad_all1, 1*src_stride)
-        DO_FOUR_PIXELS(mad_all2, 2*src_stride)
-        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
-        mad_all2 = _mm_packs_epi32(mad_all2, mad_all2);
-        intermediateC = _mm_packus_epi16(mad_all0, mad_all2);
-      }
-    }
-
-    // Transpose result (intermediate -> transpose3_x)
-    {
-      // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33
-      // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73
-      // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx
-      const __m128i transpose1_0 = _mm_shuffle_epi8(intermediateA, transpose);
-      const __m128i transpose1_1 = _mm_shuffle_epi8(intermediateB, transpose);
-      const __m128i transpose1_2 = _mm_shuffle_epi8(intermediateC, transpose);
-      // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
-      // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
-      // 80 90 A0 xx 81 91 A1 xx 82 92 A2 xx 83 93 A3 xx
-      const __m128i transpose2_0 = _mm_unpacklo_epi32(transpose1_0, transpose1_1);
-      const __m128i transpose2_1 = _mm_unpackhi_epi32(transpose1_0, transpose1_1);
-      // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
-      // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
-      transpose3_0 = _mm_castps_si128(
-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
-                                           _mm_castsi128_ps(transpose1_2),
-                                           _MM_SHUFFLE(0, 0, 1, 0)));
-      transpose3_1 = _mm_castps_si128(
-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
-                                           _mm_castsi128_ps(transpose1_2),
-                                           _MM_SHUFFLE(1, 1, 3, 2)));
-      transpose3_2 = _mm_castps_si128(
-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
-                                           _mm_castsi128_ps(transpose1_2),
-                                           _MM_SHUFFLE(2, 2, 1, 0)));
-      transpose3_3 = _mm_castps_si128(
-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
-                                           _mm_castsi128_ps(transpose1_2),
-                                           _MM_SHUFFLE(3, 3, 3, 2)));
-      // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx
-      // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx
-      // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx
-      // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx
-    }
-
-    // Vertical pass (transpose3_x -> dst).
-    {
-      const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16);
-      // get first two columns filter coefficients
-      __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0));
-      __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1));
-      __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2));
-      __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3));
-      __m128i col0, col1, col2, col3;
-      {
-        //load pixels
-        __m128i src  = transpose3_0;
-        // extract the ones used for first column
-        __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
-        __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
-        __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
-        __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
-        __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
-        __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
-        // multiply accumulate them
-        __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
-        __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
-        __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
-        __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
-        __m128i mad0123 = _mm_add_epi32(mad01, mad23);
-        __m128i mad4567 = _mm_add_epi32(mad45, mad67);
-        __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
-        mad_all = _mm_add_epi32(mad_all, rounding);
-        mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
-        mad_all = _mm_packs_epi32(mad_all, mad_all);
-        col0 = _mm_packus_epi16(mad_all, mad_all);
-      }
-      {
-        //load pixels
-        __m128i src  = transpose3_1;
-        // extract the ones used for first column
-        __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
-        __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
-        __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
-        __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
-        __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
-        __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
-        // multiply accumulate them
-        __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
-        __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
-        __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
-        __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
-        __m128i mad0123 = _mm_add_epi32(mad01, mad23);
-        __m128i mad4567 = _mm_add_epi32(mad45, mad67);
-        __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
-        mad_all = _mm_add_epi32(mad_all, rounding);
-        mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
-        mad_all = _mm_packs_epi32(mad_all, mad_all);
-        col1 = _mm_packus_epi16(mad_all, mad_all);
-      }
-      {
-        //load pixels
-        __m128i src  = transpose3_2;
-        // extract the ones used for first column
-        __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
-        __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
-        __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
-        __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
-        __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
-        __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
-        // multiply accumulate them
-        __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
-        __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
-        __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
-        __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
-        __m128i mad0123 = _mm_add_epi32(mad01, mad23);
-        __m128i mad4567 = _mm_add_epi32(mad45, mad67);
-        __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
-        mad_all = _mm_add_epi32(mad_all, rounding);
-        mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
-        mad_all = _mm_packs_epi32(mad_all, mad_all);
-        col2 = _mm_packus_epi16(mad_all, mad_all);
-      }
-      {
-        //load pixels
-        __m128i src  = transpose3_3;
-        // extract the ones used for first column
-        __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
-        __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
-        __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
-        __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
-        __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
-        __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
-        // multiply accumulate them
-        __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
-        __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
-        __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
-        __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
-        __m128i mad0123 = _mm_add_epi32(mad01, mad23);
-        __m128i mad4567 = _mm_add_epi32(mad45, mad67);
-        __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
-        mad_all = _mm_add_epi32(mad_all, rounding);
-        mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
-        mad_all = _mm_packs_epi32(mad_all, mad_all);
-        col3 = _mm_packus_epi16(mad_all, mad_all);
-      }
-      {
-        __m128i col01 = _mm_unpacklo_epi8(col0, col1);
-        __m128i col23 = _mm_unpacklo_epi8(col2, col3);
-        __m128i col0123 = _mm_unpacklo_epi16(col01, col23);
-        //TODO(cd): look into Ronald's comment:
-        //    Future suggestion: I believe here, too, you can merge the
-        //    packs_epi32() and packus_epi16() for the 4 cols above, so that
-        //    you get the data in a single register, and then use pshufb
-        //    (shuffle_epi8()) instead of the unpacks here. Should be
-        //    2+3+2 instructions faster.
-        *((unsigned int *)&dst_ptr[dst_stride * 0]) =
-            _mm_extract_epi32(col0123, 0);
-        *((unsigned int *)&dst_ptr[dst_stride * 1]) =
-            _mm_extract_epi32(col0123, 1);
-        *((unsigned int *)&dst_ptr[dst_stride * 2]) =
-            _mm_extract_epi32(col0123, 2);
-        *((unsigned int *)&dst_ptr[dst_stride * 3]) =
-            _mm_extract_epi32(col0123, 3);
-      }
-    }
-  }
-}
-
-void vp9_filter_block2d_8x4_8_sse4_1
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
-  int j;
-  for (j=0; j<8; j+=4) {
-    vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j, src_stride,
-                                    HFilter_aligned16, VFilter_aligned16,
-                                    dst_ptr + j, dst_stride);
-  }
-}
-
-void vp9_filter_block2d_8x8_8_sse4_1
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
-  int i, j;
-  for (i=0; i<8; i+=4) {
-    for (j=0; j<8; j+=4) {
-      vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j + i*src_stride, src_stride,
-                                      HFilter_aligned16, VFilter_aligned16,
-                                      dst_ptr + j + i*dst_stride, dst_stride);
-    }
-  }
-}
-
-void vp9_filter_block2d_16x16_8_sse4_1
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
-  int i, j;
-  for (i=0; i<16; i+=4) {
-    for (j=0; j<16; j+=4) {
-      vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j + i*src_stride, src_stride,
-                                      HFilter_aligned16, VFilter_aligned16,
-                                      dst_ptr + j + i*dst_stride, dst_stride);
-    }
-  }
-}
--- a/vp8/common/x86/idct_x86.h
+++ /dev/null
@@ -1,64 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef IDCT_X86_H
-#define IDCT_X86_H
-
-/* Note:
- *
- * This platform is commonly built for runtime CPU detection. If you modify
- * any of the function mappings present in this file, be sure to also update
- * them in the function pointer initialization code
- */
-
-#if HAVE_MMX
-extern prototype_idct(vp9_short_idct4x4llm_1_mmx);
-extern prototype_idct(vp9_short_idct4x4llm_mmx);
-extern prototype_idct_scalar_add(vp9_dc_only_idct_add_mmx);
-
-extern prototype_second_order(vp9_short_inv_walsh4x4_mmx);
-extern prototype_second_order(vp9_short_inv_walsh4x4_1_mmx);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp9_idct_idct1
-#define vp9_idct_idct1 vp9_short_idct4x4llm_1_mmx
-
-#undef  vp9_idct_idct16
-#define vp9_idct_idct16 vp9_short_idct4x4llm_mmx
-
-#undef  vp9_idct_idct1_scalar_add
-#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_mmx
-
-#undef vp9_idct_iwalsh16
-#define vp9_idct_iwalsh16 vp9_short_inv_walsh4x4_mmx
-
-#undef vp9_idct_iwalsh1
-#define vp9_idct_iwalsh1 vp9_short_inv_walsh4x4_1_mmx
-
-#endif
-#endif
-
-#if HAVE_SSE2
-
-extern prototype_second_order(vp9_short_inv_walsh4x4_sse2);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-
-#undef vp9_idct_iwalsh16
-#define vp9_idct_iwalsh16 vp9_short_inv_walsh4x4_sse2
-
-#endif
-
-#endif
-
-
-
-#endif
--- a/vp8/common/x86/idctllm_mmx.asm
+++ /dev/null
@@ -1,241 +1,0 @@
-;
-;  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-align 16
-x_s1sqr2:      times 4 dw 0x8A8C
-align 16
-x_c1sqr2less1: times 4 dw 0x4E7B
-align 16
-pw_16:         times 4 dw 16
-
-SECTION .text
-
-
-; /****************************************************************************
-; * Notes:
-; *
-; * This implementation makes use of 16 bit fixed point version of two multiply
-; * constants:
-; *        1.   sqrt(2) * cos (pi/8)
-; *        2.   sqrt(2) * sin (pi/8)
-; * Because the first constant is bigger than 1, to maintain the same 16 bit
-; * fixed point precision as the second one, we use a trick of
-; *        x * a = x + x*(a-1)
-; * so
-; *        x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
-; *
-; * For the second constant, because its 16 bit version is 35468, which
-; * is bigger than 32768, it becomes a negative number in a signed 16 bit
-; * multiply.
-; *        (x * (unsigned)35468) >> 16  =  ((x * (signed)35468) >> 16) + x
-; *
-; **************************************************************************/
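-
-; A numeric check of the constants in the data section above (illustrative):
-;   sqrt(2) * cos(pi/8) - 1 ~= 0.30656,  round(0.30656 * 65536) = 20091 = 0x4E7B
-;   sqrt(2) * sin(pi/8)     ~= 0.54120,  round(0.54120 * 65536) = 35468 = 0x8A8C
-; i.e. exactly x_c1sqr2less1 and x_s1sqr2.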
-
-INIT_MMX
-
-;void short_idct4x4llm_mmx(short *input, short *output, int pitch)
-cglobal short_idct4x4llm_mmx, 3,3,0, inp, out, pit
-    mova            m0,     [inpq +0]
-    mova            m1,     [inpq +8]
-
-    mova            m2,     [inpq+16]
-    mova            m3,     [inpq+24]
-
-    psubw           m0,      m2             ; b1= 0-2
-    paddw           m2,      m2             ;
-
-    mova            m5,      m1
-    paddw           m2,      m0             ; a1 =0+2
-
-    pmulhw          m5,     [x_s1sqr2]       ;
-    paddw           m5,      m1             ; ip1 * sin(pi/8) * sqrt(2)
-
-    mova            m7,      m3             ;
-    pmulhw          m7,     [x_c1sqr2less1]   ;
-
-    paddw           m7,      m3             ; ip3 * cos(pi/8) * sqrt(2)
-    psubw           m7,      m5             ; c1
-
-    mova            m5,      m1
-    mova            m4,      m3
-
-    pmulhw          m5,     [x_c1sqr2less1]
-    paddw           m5,      m1
-
-    pmulhw          m3,     [x_s1sqr2]
-    paddw           m3,      m4
-
-    paddw           m3,      m5             ; d1
-    mova            m6,      m2             ; a1
-
-    mova            m4,      m0             ; b1
-    paddw           m2,      m3             ;0
-
-    paddw           m4,      m7             ;1
-    psubw           m0,      m7             ;2
-
-    psubw           m6,      m3             ;3
-
-    mova            m1,      m2             ; 03 02 01 00
-    mova            m3,      m4             ; 23 22 21 20
-
-    punpcklwd       m1,      m0             ; 11 01 10 00
-    punpckhwd       m2,      m0             ; 13 03 12 02
-
-    punpcklwd       m3,      m6             ; 31 21 30 20
-    punpckhwd       m4,      m6             ; 33 23 32 22
-
-    mova            m0,      m1             ; 11 01 10 00
-    mova            m5,      m2             ; 13 03 12 02
-
-    punpckldq       m0,      m3             ; 30 20 10 00
-    punpckhdq       m1,      m3             ; 31 21 11 01
-
-    punpckldq       m2,      m4             ; 32 22 12 02
-    punpckhdq       m5,      m4             ; 33 23 13 03
-
-    mova            m3,      m5             ; 33 23 13 03
-
-    psubw           m0,      m2             ; b1= 0-2
-    paddw           m2,      m2             ;
-
-    mova            m5,      m1
-    paddw           m2,      m0             ; a1 =0+2
-
-    pmulhw          m5,     [x_s1sqr2]        ;
-    paddw           m5,      m1             ; ip1 * sin(pi/8) * sqrt(2)
-
-    mova            m7,      m3             ;
-    pmulhw          m7,     [x_c1sqr2less1]   ;
-
-    paddw           m7,      m3             ; ip3 * cos(pi/8) * sqrt(2)
-    psubw           m7,      m5             ; c1
-
-    mova            m5,      m1
-    mova            m4,      m3
-
-    pmulhw          m5,     [x_c1sqr2less1]
-    paddw           m5,      m1
-
-    pmulhw          m3,     [x_s1sqr2]
-    paddw           m3,      m4
-
-    paddw           m3,      m5             ; d1
-    paddw           m0,     [pw_16]
-
-    paddw           m2,     [pw_16]
-    mova            m6,      m2             ; a1
-
-    mova            m4,      m0             ; b1
-    paddw           m2,      m3             ;0
-
-    paddw           m4,      m7             ;1
-    psubw           m0,      m7             ;2
-
-    psubw           m6,      m3             ;3
-    psraw           m2,      5
-
-    psraw           m0,      5
-    psraw           m4,      5
-
-    psraw           m6,      5
-
-    mova            m1,      m2             ; 03 02 01 00
-    mova            m3,      m4             ; 23 22 21 20
-
-    punpcklwd       m1,      m0             ; 11 01 10 00
-    punpckhwd       m2,      m0             ; 13 03 12 02
-
-    punpcklwd       m3,      m6             ; 31 21 30 20
-    punpckhwd       m4,      m6             ; 33 23 32 22
-
-    mova            m0,      m1             ; 11 01 10 00
-    mova            m5,      m2             ; 13 03 12 02
-
-    punpckldq       m0,      m3             ; 30 20 10 00
-    punpckhdq       m1,      m3             ; 31 21 11 01
-
-    punpckldq       m2,      m4             ; 32 22 12 02
-    punpckhdq       m5,      m4             ; 33 23 13 03
-
-    mova        [outq],      m0
-
-    mova   [outq+pitq],      m1
-    mova [outq+pitq*2],      m2
-
-    add           outq,      pitq
-    mova [outq+pitq*2],      m5
-    RET
-
-;void short_idct4x4llm_1_mmx(short *input, short *output, int pitch)
-cglobal short_idct4x4llm_1_mmx,3,3,0,inp,out,pit
-    movh            m0,     [inpq]
-    paddw           m0,     [pw_16]
-    psraw           m0,      5
-    punpcklwd       m0,      m0
-    punpckldq       m0,      m0
-
-    mova        [outq],      m0
-    mova   [outq+pitq],      m0
-
-    mova [outq+pitq*2],      m0
-    add           outq,    pitq
-
-    mova [outq+pitq*2],      m0
-    RET
-
-
-;void dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride)
-cglobal dc_only_idct_add_mmx, 4,5,0,in_dc,pred,dst,pit,stride
-%if ARCH_X86_64
-    movsxd         strideq,      dword stridem
-%else
-    mov            strideq,      stridem
-%endif
-    pxor                m0,      m0
-
-    movh                m5,      in_dcq ; dc
-    paddw               m5,     [pw_16]
-
-    psraw               m5,      5
-
-    punpcklwd           m5,      m5
-    punpckldq           m5,      m5
-
-    movh                m1,     [predq]
-    punpcklbw           m1,      m0
-    paddsw              m1,      m5
-    packuswb            m1,      m0              ; pack and unpack to saturate
-    movh            [dstq],      m1
-
-    movh                m2,     [predq+pitq]
-    punpcklbw           m2,      m0
-    paddsw              m2,      m5
-    packuswb            m2,      m0              ; pack and unpack to saturate
-    movh    [dstq+strideq],      m2
-
-    movh                m3,     [predq+2*pitq]
-    punpcklbw           m3,      m0
-    paddsw              m3,      m5
-    packuswb            m3,      m0              ; pack and unpack to saturate
-    movh  [dstq+2*strideq],      m3
-
-    add               dstq,      strideq
-    add              predq,      pitq
-    movh                m4,     [predq+2*pitq]
-    punpcklbw           m4,      m0
-    paddsw              m4,      m5
-    packuswb            m4,      m0              ; pack and unpack to saturate
-    movh  [dstq+2*strideq],      m4
-    RET
-
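The header comment at the top of idctllm_mmx.asm above describes the
fixed-point trick behind the two constants in the data section
(x_s1sqr2 = 0x8A8C = 35468 and x_c1sqr2less1 = 0x4E7B = 20091). A
minimal C sketch of that arithmetic, matching the pmulhw/paddw pairs in
the MMX code (function names here are illustrative, not from the patch):

    #include <stdint.h>

    /* round((sqrt(2)*cos(pi/8) - 1) * 65536) = 20091 = 0x4E7B */
    static int16_t mul_c1sqr2(int16_t x) {
        /* pmulhw by 0x4E7B gives x*(sqrt(2)*cos(pi/8) - 1); paddw adds
         * x back, so x*sqrt(2)*cos(pi/8) never overflows 16 bits. */
        return (int16_t)(((int32_t)x * 20091) >> 16) + x;
    }

    /* round(sqrt(2)*sin(pi/8) * 65536) = 35468 = 0x8A8C, > 32767 */
    static int16_t mul_s1sqr2(int16_t x) {
        /* pmulhw sees 35468 as the signed value 35468 - 65536, so it
         * computes (x * (35468 - 65536)) >> 16; adding x (x*65536 >> 16)
         * back recovers the intended (x * 35468) >> 16. */
        return (int16_t)(((int32_t)x * (35468 - 65536)) >> 16) + x;
    }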
--- a/vp8/common/x86/idctllm_sse2.asm
+++ /dev/null
@@ -1,712 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_idct_dequant_0_2x_sse2
-; (
-;   short *qcoeff       - 0
-;   short *dequant      - 1
-;   unsigned char *pre  - 2
-;   unsigned char *dst  - 3
-;   int dst_stride      - 4
-;   int blk_stride      - 5
-; )
-
-global sym(vp9_idct_dequant_0_2x_sse2)
-sym(vp9_idct_dequant_0_2x_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    GET_GOT     rbx
-    ; end prolog
-
-        mov         rdx,            arg(1) ; dequant
-        mov         rax,            arg(0) ; qcoeff
-
-        movd        xmm4,           [rax]
-        movd        xmm5,           [rdx]
-
-        pinsrw      xmm4,           [rax+32],   4
-        pinsrw      xmm5,           [rdx],      4
-
-        pmullw      xmm4,           xmm5
-
-    ; Zero out xmm5, for use in unpacking
-        pxor        xmm5,           xmm5
-
-    ; clear coeffs
-        movd        [rax],          xmm5
-        movd        [rax+32],       xmm5
-;pshufb
-        pshuflw     xmm4,           xmm4,       00000000b
-        pshufhw     xmm4,           xmm4,       00000000b
-
-        mov         rax,            arg(2) ; pre
-        paddw       xmm4,           [GLOBAL(fours)]
-
-        movsxd      rcx,            dword ptr arg(5) ; blk_stride
-        psraw       xmm4,           3
-
-        movq        xmm0,           [rax]
-        movq        xmm1,           [rax+rcx]
-        movq        xmm2,           [rax+2*rcx]
-        lea         rcx,            [3*rcx]
-        movq        xmm3,           [rax+rcx]
-
-        punpcklbw   xmm0,           xmm5
-        punpcklbw   xmm1,           xmm5
-        punpcklbw   xmm2,           xmm5
-        punpcklbw   xmm3,           xmm5
-
-        mov         rax,            arg(3) ; dst
-        movsxd      rdx,            dword ptr arg(4) ; dst_stride
-
-    ; Add to predict buffer
-        paddw       xmm0,           xmm4
-        paddw       xmm1,           xmm4
-        paddw       xmm2,           xmm4
-        paddw       xmm3,           xmm4
-
-    ; pack up before storing
-        packuswb    xmm0,           xmm5
-        packuswb    xmm1,           xmm5
-        packuswb    xmm2,           xmm5
-        packuswb    xmm3,           xmm5
-
-    ; store blocks back out
-        movq        [rax],          xmm0
-        movq        [rax + rdx],    xmm1
-
-        lea         rax,            [rax + 2*rdx]
-
-        movq        [rax],          xmm2
-        movq        [rax + rdx],    xmm3
-
-    ; begin epilog
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(vp9_idct_dequant_full_2x_sse2)
-sym(vp9_idct_dequant_full_2x_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ; full inverse transform for two blocks; the whole qcoeff buffer
-    ; is loaded and dequantized below
-        mov         rax,            arg(0) ; qcoeff
-        mov         rsi,            arg(2) ; pre
-        mov         rdi,            arg(3) ; dst
-        movsxd      rcx,            dword ptr arg(5) ; blk_stride
-
-    ; Zero out xmm7, for use in unpacking
-        pxor        xmm7,           xmm7
-
-        mov         rdx,            arg(1)  ; dequant
-
-    ; note the transpose of xmm1 and xmm2, necessary for the shuffle
-    ;   to produce sensible data
-        movdqa      xmm0,           [rax]
-        movdqa      xmm2,           [rax+16]
-        movdqa      xmm1,           [rax+32]
-        movdqa      xmm3,           [rax+48]
-
-    ; Clear out coeffs
-        movdqa      [rax],          xmm7
-        movdqa      [rax+16],       xmm7
-        movdqa      [rax+32],       xmm7
-        movdqa      [rax+48],       xmm7
-
-    ; dequantize qcoeff buffer
-        pmullw      xmm0,           [rdx]
-        pmullw      xmm2,           [rdx+16]
-        pmullw      xmm1,           [rdx]
-        pmullw      xmm3,           [rdx+16]
-
-    ; repack so block 0 row x and block 1 row x are together
-        movdqa      xmm4,           xmm0
-        punpckldq   xmm0,           xmm1
-        punpckhdq   xmm4,           xmm1
-
-        pshufd      xmm0,           xmm0,       11011000b
-        pshufd      xmm1,           xmm4,       11011000b
-
-        movdqa      xmm4,           xmm2
-        punpckldq   xmm2,           xmm3
-        punpckhdq   xmm4,           xmm3
-
-        pshufd      xmm2,           xmm2,       11011000b
-        pshufd      xmm3,           xmm4,       11011000b
-
-    ; first pass
-        psubw       xmm0,           xmm2        ; b1 = 0-2
-        paddw       xmm2,           xmm2        ;
-
-        movdqa      xmm5,           xmm1
-        paddw       xmm2,           xmm0        ; a1 = 0+2
-
-        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
-        paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)
-
-        movdqa      xmm7,           xmm3
-        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
-
-        paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
-        psubw       xmm7,           xmm5        ; c1
-
-        movdqa      xmm5,           xmm1
-        movdqa      xmm4,           xmm3
-
-        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
-        paddw       xmm5,           xmm1
-
-        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
-        paddw       xmm3,           xmm4
-
-        paddw       xmm3,           xmm5        ; d1
-        movdqa      xmm6,           xmm2        ; a1
-
-        movdqa      xmm4,           xmm0        ; b1
-        paddw       xmm2,           xmm3        ;0
-
-        paddw       xmm4,           xmm7        ;1
-        psubw       xmm0,           xmm7        ;2
-
-        psubw       xmm6,           xmm3        ;3
-
-    ; transpose for the second pass
-        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
-        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
-        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
-
-        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
-        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
-        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
-
-
-        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
-        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
-        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
-
-        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
-        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
-        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
-
-
-        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
-        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
-        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
-
-        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
-        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
-        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
-
-        pshufd      xmm0,           xmm2,       11011000b
-        pshufd      xmm2,           xmm1,       11011000b
-
-        pshufd      xmm1,           xmm5,       11011000b
-        pshufd      xmm3,           xmm7,       11011000b
-
-    ; second pass
-        psubw       xmm0,           xmm2            ; b1 = 0-2
-        paddw       xmm2,           xmm2
-
-        movdqa      xmm5,           xmm1
-        paddw       xmm2,           xmm0            ; a1 = 0+2
-
-        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
-        paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)
-
-        movdqa      xmm7,           xmm3
-        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
-
-        paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)
-        psubw       xmm7,           xmm5            ; c1
-
-        movdqa      xmm5,           xmm1
-        movdqa      xmm4,           xmm3
-
-        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
-        paddw       xmm5,           xmm1
-
-        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
-        paddw       xmm3,           xmm4
-
-        paddw       xmm3,           xmm5            ; d1
-        paddw       xmm0,           [GLOBAL(fours)]
-
-        paddw       xmm2,           [GLOBAL(fours)]
-        movdqa      xmm6,           xmm2            ; a1
-
-        movdqa      xmm4,           xmm0            ; b1
-        paddw       xmm2,           xmm3            ;0
-
-        paddw       xmm4,           xmm7            ;1
-        psubw       xmm0,           xmm7            ;2
-
-        psubw       xmm6,           xmm3            ;3
-        psraw       xmm2,           3
-
-        psraw       xmm0,           3
-        psraw       xmm4,           3
-
-        psraw       xmm6,           3
-
-    ; transpose to save
-        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
-        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
-        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
-
-        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
-        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
-        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
-
-
-        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
-        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
-        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
-
-        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
-        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
-        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
-
-
-        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
-        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
-        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
-
-        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
-        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
-        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
-
-        pshufd      xmm0,           xmm2,       11011000b
-        pshufd      xmm2,           xmm1,       11011000b
-
-        pshufd      xmm1,           xmm5,       11011000b
-        pshufd      xmm3,           xmm7,       11011000b
-
-        pxor        xmm7,           xmm7
-
-    ; Load up predict blocks
-        movq        xmm4,           [rsi]
-        movq        xmm5,           [rsi+rcx]
-
-        punpcklbw   xmm4,           xmm7
-        punpcklbw   xmm5,           xmm7
-
-        paddw       xmm0,           xmm4
-        paddw       xmm1,           xmm5
-
-        movq        xmm4,           [rsi+2*rcx]
-        lea         rcx,            [3*rcx]
-        movq        xmm5,           [rsi+rcx]
-
-        punpcklbw   xmm4,           xmm7
-        punpcklbw   xmm5,           xmm7
-
-        paddw       xmm2,           xmm4
-        paddw       xmm3,           xmm5
-
-.finish:
-
-    ; pack up before storing
-        packuswb    xmm0,           xmm7
-        packuswb    xmm1,           xmm7
-        packuswb    xmm2,           xmm7
-        packuswb    xmm3,           xmm7
-
-    ; Load destination stride before writing out,
-    ;   doesn't need to persist
-        movsxd      rdx,            dword ptr arg(4) ; dst_stride
-
-    ; store blocks back out
-        movq        [rdi],          xmm0
-        movq        [rdi + rdx],    xmm1
-
-        lea         rdi,            [rdi + 2*rdx]
-
-        movq        [rdi],          xmm2
-        movq        [rdi + rdx],    xmm3
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_idct_dequant_dc_0_2x_sse2
-; (
-;   short *qcoeff       - 0
-;   short *dequant      - 1
-;   unsigned char *pre  - 2
-;   unsigned char *dst  - 3
-;   int dst_stride      - 4
-;   short *dc           - 5
-; )
-global sym(vp9_idct_dequant_dc_0_2x_sse2)
-sym(vp9_idct_dequant_dc_0_2x_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ; special case when 2 blocks have 0 or 1 coeffs
-    ; dc is set as first coeff, so no need to load qcoeff
-        mov         rax,            arg(0) ; qcoeff
-        mov         rsi,            arg(2) ; pre
-        mov         rdi,            arg(3) ; dst
-        mov         rdx,            arg(5) ; dc
-
-    ; Zero out xmm5, for use in unpacking
-        pxor        xmm5,           xmm5
-
-    ; load up the 2 dc words here: 2*16 bits == one doubleword
-        movd        xmm4,           [rdx]
-
-    ; Load up predict blocks
-        movq        xmm0,           [rsi]
-        movq        xmm1,           [rsi+16]
-        movq        xmm2,           [rsi+32]
-        movq        xmm3,           [rsi+48]
-
-    ; Duplicate and expand dc across
-        punpcklwd   xmm4,           xmm4
-        punpckldq   xmm4,           xmm4
-
-    ; Rounding to dequant and downshift
-        paddw       xmm4,           [GLOBAL(fours)]
-        psraw       xmm4,           3
-
-    ; Predict buffer needs to be expanded from bytes to words
-        punpcklbw   xmm0,           xmm5
-        punpcklbw   xmm1,           xmm5
-        punpcklbw   xmm2,           xmm5
-        punpcklbw   xmm3,           xmm5
-
-    ; Add to predict buffer
-        paddw       xmm0,           xmm4
-        paddw       xmm1,           xmm4
-        paddw       xmm2,           xmm4
-        paddw       xmm3,           xmm4
-
-    ; pack up before storing
-        packuswb    xmm0,           xmm5
-        packuswb    xmm1,           xmm5
-        packuswb    xmm2,           xmm5
-        packuswb    xmm3,           xmm5
-
-    ; Load destination stride before writing out,
-    ;   doesn't need to persist
-        movsxd      rdx,            dword ptr arg(4) ; dst_stride
-
-    ; store blocks back out
-        movq        [rdi],          xmm0
-        movq        [rdi + rdx],    xmm1
-
-        lea         rdi,            [rdi + 2*rdx]
-
-        movq        [rdi],          xmm2
-        movq        [rdi + rdx],    xmm3
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(vp9_idct_dequant_dc_full_2x_sse2)
-sym(vp9_idct_dequant_dc_full_2x_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ; full inverse transform for two blocks; the dc for each block
-    ; arrives separately in arg(5) and is inserted as the first coeff
-        mov         rax,            arg(0) ; qcoeff
-        mov         rsi,            arg(2) ; pre
-        mov         rdi,            arg(3) ; dst
-
-    ; Zero out xmm7, for use in unpacking
-        pxor        xmm7,           xmm7
-
-        mov         rdx,            arg(1)  ; dequant
-
-    ; note the transpose of xmm1 and xmm2, necessary for the shuffle
-    ;   to produce sensible data
-        movdqa      xmm0,           [rax]
-        movdqa      xmm2,           [rax+16]
-        movdqa      xmm1,           [rax+32]
-        movdqa      xmm3,           [rax+48]
-
-    ; Clear out coeffs
-        movdqa      [rax],          xmm7
-        movdqa      [rax+16],       xmm7
-        movdqa      [rax+32],       xmm7
-        movdqa      [rax+48],       xmm7
-
-    ; dequantize qcoeff buffer
-        pmullw      xmm0,           [rdx]
-        pmullw      xmm2,           [rdx+16]
-        pmullw      xmm1,           [rdx]
-        pmullw      xmm3,           [rdx+16]
-
-    ; DC component
-        mov         rdx,            arg(5)
-
-    ; repack so block 0 row x and block 1 row x are together
-        movdqa      xmm4,           xmm0
-        punpckldq   xmm0,           xmm1
-        punpckhdq   xmm4,           xmm1
-
-        pshufd      xmm0,           xmm0,       11011000b
-        pshufd      xmm1,           xmm4,       11011000b
-
-        movdqa      xmm4,           xmm2
-        punpckldq   xmm2,           xmm3
-        punpckhdq   xmm4,           xmm3
-
-        pshufd      xmm2,           xmm2,       11011000b
-        pshufd      xmm3,           xmm4,       11011000b
-
-    ; insert DC component
-        pinsrw      xmm0,           [rdx],      0
-        pinsrw      xmm0,           [rdx+2],    4
-
-    ; first pass
-        psubw       xmm0,           xmm2        ; b1 = 0-2
-        paddw       xmm2,           xmm2        ;
-
-        movdqa      xmm5,           xmm1
-        paddw       xmm2,           xmm0        ; a1 = 0+2
-
-        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
-        paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)
-
-        movdqa      xmm7,           xmm3
-        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
-
-        paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
-        psubw       xmm7,           xmm5        ; c1
-
-        movdqa      xmm5,           xmm1
-        movdqa      xmm4,           xmm3
-
-        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
-        paddw       xmm5,           xmm1
-
-        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
-        paddw       xmm3,           xmm4
-
-        paddw       xmm3,           xmm5        ; d1
-        movdqa      xmm6,           xmm2        ; a1
-
-        movdqa      xmm4,           xmm0        ; b1
-        paddw       xmm2,           xmm3        ;0
-
-        paddw       xmm4,           xmm7        ;1
-        psubw       xmm0,           xmm7        ;2
-
-        psubw       xmm6,           xmm3        ;3
-
-    ; transpose for the second pass
-        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
-        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
-        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
-
-        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
-        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
-        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
-
-
-        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
-        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
-        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
-
-        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
-        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
-        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
-
-
-        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
-        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
-        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
-
-        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
-        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
-        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
-
-        pshufd      xmm0,           xmm2,       11011000b
-        pshufd      xmm2,           xmm1,       11011000b
-
-        pshufd      xmm1,           xmm5,       11011000b
-        pshufd      xmm3,           xmm7,       11011000b
-
-    ; second pass
-        psubw       xmm0,           xmm2            ; b1 = 0-2
-        paddw       xmm2,           xmm2
-
-        movdqa      xmm5,           xmm1
-        paddw       xmm2,           xmm0            ; a1 = 0+2
-
-        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
-        paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)
-
-        movdqa      xmm7,           xmm3
-        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
-
-        paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)
-        psubw       xmm7,           xmm5            ; c1
-
-        movdqa      xmm5,           xmm1
-        movdqa      xmm4,           xmm3
-
-        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
-        paddw       xmm5,           xmm1
-
-        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
-        paddw       xmm3,           xmm4
-
-        paddw       xmm3,           xmm5            ; d1
-        paddw       xmm0,           [GLOBAL(fours)]
-
-        paddw       xmm2,           [GLOBAL(fours)]
-        movdqa      xmm6,           xmm2            ; a1
-
-        movdqa      xmm4,           xmm0            ; b1
-        paddw       xmm2,           xmm3            ;0
-
-        paddw       xmm4,           xmm7            ;1
-        psubw       xmm0,           xmm7            ;2
-
-        psubw       xmm6,           xmm3            ;3
-        psraw       xmm2,           3
-
-        psraw       xmm0,           3
-        psraw       xmm4,           3
-
-        psraw       xmm6,           3
-
-    ; transpose to save
-        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
-        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
-        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
-
-        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
-        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
-        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
-
-
-        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
-        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
-        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
-
-        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
-        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
-        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
-
-
-        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
-        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
-        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
-
-        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
-        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
-        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
-
-        pshufd      xmm0,           xmm2,       11011000b
-        pshufd      xmm2,           xmm1,       11011000b
-
-        pshufd      xmm1,           xmm5,       11011000b
-        pshufd      xmm3,           xmm7,       11011000b
-
-        pxor        xmm7,           xmm7
-
-    ; Load up predict blocks
-        movq        xmm4,           [rsi]
-        movq        xmm5,           [rsi+16]
-
-        punpcklbw   xmm4,           xmm7
-        punpcklbw   xmm5,           xmm7
-
-        paddw       xmm0,           xmm4
-        paddw       xmm1,           xmm5
-
-        movq        xmm4,           [rsi+32]
-        movq        xmm5,           [rsi+48]
-
-        punpcklbw   xmm4,           xmm7
-        punpcklbw   xmm5,           xmm7
-
-        paddw       xmm2,           xmm4
-        paddw       xmm3,           xmm5
-
-.finish:
-
-    ; pack up before storing
-        packuswb    xmm0,           xmm7
-        packuswb    xmm1,           xmm7
-        packuswb    xmm2,           xmm7
-        packuswb    xmm3,           xmm7
-
-    ; Load destination stride before writing out,
-    ;   doesn't need to persist
-        movsxd      rdx,            dword ptr arg(4) ; dst_stride
-
-    ; store blocks back out
-        movq        [rdi],          xmm0
-        movq        [rdi + rdx],    xmm1
-
-        lea         rdi,            [rdi + 2*rdx]
-
-        movq        [rdi],          xmm2
-        movq        [rdi + rdx],    xmm3
-
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-fours:
-    times 8 dw 0x0004
-align 16
-x_s1sqr2:
-    times 8 dw 0x8A8C
-align 16
-x_c1sqr2less1:
-    times 8 dw 0x4E7B
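For reference, the DC-only path that vp9_idct_dequant_0_2x_sse2
vectorizes reduces, per 4x4 block, to one dequantized DC term that is
rounded, shifted and added to the prediction. A hedged scalar sketch of
one block (the SSE2 routine does two side by side; names are
illustrative, not from the patch):

    static unsigned char clamp_255(int v) {            /* packuswb */
        return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    static void idct_dequant_0_ref(short *qcoeff, const short *dequant,
                                   const unsigned char *pre,
                                   unsigned char *dst,
                                   int dst_stride, int blk_stride) {
        int dc = qcoeff[0] * dequant[0];  /* only the DC coeff is set */
        int add = (dc + 4) >> 3;          /* paddw fours; psraw 3 */
        qcoeff[0] = 0;                    /* coeffs cleared, as in the asm */
        for (int r = 0; r < 4; r++)
            for (int c = 0; c < 4; c++)
                dst[r * dst_stride + c] =
                    clamp_255(pre[r * blk_stride + c] + add);
    }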
--- a/vp8/common/x86/iwalsh_mmx.asm
+++ /dev/null
@@ -1,173 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_short_inv_walsh4x4_1_mmx(short *input, short *output)
-global sym(vp9_short_inv_walsh4x4_1_mmx)
-sym(vp9_short_inv_walsh4x4_1_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 2
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov     rsi, arg(0)
-    mov     rax, 3
-
-    mov     rdi, arg(1)
-    add     rax, [rsi]          ;input[0] + 3
-
-    movd    mm0, eax
-
-    punpcklwd mm0, mm0          ;x x val val
-
-    punpckldq mm0, mm0          ;val val val val
-
-    psraw   mm0, 3            ;(input[0] + 3) >> 3
-
-    movq  [rdi + 0], mm0
-    movq  [rdi + 8], mm0
-    movq  [rdi + 16], mm0
-    movq  [rdi + 24], mm0
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_short_inv_walsh4x4_mmx(short *input, short *output)
-global sym(vp9_short_inv_walsh4x4_mmx)
-sym(vp9_short_inv_walsh4x4_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 2
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov     rax, 3
-    mov     rsi, arg(0)
-    mov     rdi, arg(1)
-    shl     rax, 16
-
-    movq    mm0, [rsi + 0]        ;ip[0]
-    movq    mm1, [rsi + 8]        ;ip[4]
-    or      rax, 3            ;00030003h
-
-    movq    mm2, [rsi + 16]       ;ip[8]
-    movq    mm3, [rsi + 24]       ;ip[12]
-
-    movq    mm7, rax
-    movq    mm4, mm0
-
-    punpcklwd mm7, mm7          ;0003000300030003h
-    movq    mm5, mm1
-
-    paddw   mm4, mm3          ;ip[0] + ip[12] aka a1
-    paddw   mm5, mm2          ;ip[4] + ip[8] aka b1
-
-    movq    mm6, mm4          ;temp a1
-
-    paddw   mm4, mm5          ;a1 + b1
-    psubw   mm6, mm5          ;a1 - b1
-
-    psubw   mm0, mm3          ;ip[0] - ip[12] aka d1
-    psubw   mm1, mm2          ;ip[4] - ip[8] aka c1
-
-    movq    mm5, mm0          ;temp d1
-
-    paddw   mm0, mm1          ;d1 + c1
-    psubw   mm5, mm1          ;d1 - c1
-
-    ; 03 02 01 00
-    ; 13 12 11 10
-    ; 23 22 21 20
-    ; 33 32 31 30
-
-    movq    mm3, mm4          ; 03 02 01 00
-    punpcklwd mm4, mm0          ; 11 01 10 00
-    punpckhwd mm3, mm0          ; 13 03 12 02
-
-    movq    mm1, mm6          ; 23 22 21 20
-    punpcklwd mm6, mm5          ; 31 21 30 20
-    punpckhwd mm1, mm5          ; 33 23 32 22
-
-    movq    mm0, mm4          ; 11 01 10 00
-    movq    mm2, mm3          ; 13 03 12 02
-
-    punpckldq mm0, mm6          ; 30 20 10 00 aka ip[0]
-    punpckhdq mm4, mm6          ; 31 21 11 01 aka ip[4]
-
-    punpckldq mm2, mm1          ; 32 22 12 02 aka ip[8]
-    punpckhdq mm3, mm1          ; 33 23 13 03 aka ip[12]
-;~~~~~~~~~~~~~~~~~~~~~
-    movq    mm1, mm0
-    movq    mm5, mm4
-
-    paddw   mm1, mm3          ;ip[0] + ip[12] aka a1
-    paddw   mm5, mm2          ;ip[4] + ip[8] aka b1
-
-    movq    mm6, mm1          ;temp a1
-
-    paddw   mm1, mm5          ;a1 + b1
-    psubw   mm6, mm5          ;a1 - b1
-
-    psubw   mm0, mm3          ;ip[0] - ip[12] aka d1
-    psubw   mm4, mm2          ;ip[4] - ip[8] aka c1
-
-    movq    mm5, mm0          ;temp d1
-
-    paddw   mm0, mm4          ;d1 + c1
-    psubw   mm5, mm4          ;d1 - c1
-;~~~~~~~~~~~~~~~~~~~~~
-    movq    mm3, mm1          ; 03 02 01 00
-    punpcklwd mm1, mm0          ; 11 01 10 00
-    punpckhwd mm3, mm0          ; 13 03 12 02
-
-    movq    mm4, mm6          ; 23 22 21 20
-    punpcklwd mm6, mm5          ; 31 21 30 20
-    punpckhwd mm4, mm5          ; 33 23 32 22
-
-    movq    mm0, mm1          ; 11 01 10 00
-    movq    mm2, mm3          ; 13 03 12 02
-
-    punpckldq mm0, mm6          ; 30 20 10 00 aka ip[0]
-    punpckhdq mm1, mm6          ; 31 21 11 01 aka ip[4]
-
-    punpckldq mm2, mm4          ; 32 22 12 02 aka ip[8]
-    punpckhdq mm3, mm4          ; 33 23 13 03 aka ip[12]
-
-    paddw   mm0, mm7
-    paddw   mm1, mm7
-    paddw   mm2, mm7
-    paddw   mm3, mm7
-
-    psraw   mm0, 3
-    psraw   mm1, 3
-    psraw   mm2, 3
-    psraw   mm3, 3
-
-    movq  [rdi + 0], mm0
-    movq  [rdi + 8], mm1
-    movq  [rdi + 16], mm2
-    movq  [rdi + 24], mm3
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
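The butterfly that the register comments in vp9_short_inv_walsh4x4_mmx
describe (a1 = ip[0]+ip[12], b1 = ip[4]+ip[8], c1 = ip[4]-ip[8],
d1 = ip[0]-ip[12], with (x + 3) >> 3 rounding after the second pass)
corresponds to the scalar sketch below, reconstructed from those
comments rather than taken from the patch:

    static void inv_walsh4x4_ref(const short *ip, short *op) {
        short tmp[16];
        /* first pass: columns, stride 4 */
        for (int i = 0; i < 4; i++) {
            int a1 = ip[i]     + ip[i + 12];
            int b1 = ip[i + 4] + ip[i + 8];
            int c1 = ip[i + 4] - ip[i + 8];
            int d1 = ip[i]     - ip[i + 12];
            tmp[i]      = (short)(a1 + b1);
            tmp[i + 4]  = (short)(d1 + c1);
            tmp[i + 8]  = (short)(a1 - b1);
            tmp[i + 12] = (short)(d1 - c1);
        }
        /* second pass: rows, then round */
        for (int i = 0; i < 4; i++) {
            int a1 = tmp[4 * i]     + tmp[4 * i + 3];
            int b1 = tmp[4 * i + 1] + tmp[4 * i + 2];
            int c1 = tmp[4 * i + 1] - tmp[4 * i + 2];
            int d1 = tmp[4 * i]     - tmp[4 * i + 3];
            op[4 * i]     = (short)((a1 + b1 + 3) >> 3);
            op[4 * i + 1] = (short)((d1 + c1 + 3) >> 3);
            op[4 * i + 2] = (short)((a1 - b1 + 3) >> 3);
            op[4 * i + 3] = (short)((d1 - c1 + 3) >> 3);
        }
    }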
--- a/vp8/common/x86/iwalsh_sse2.asm
+++ /dev/null
@@ -1,119 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_short_inv_walsh4x4_sse2(short *input, short *output)
-global sym(vp9_short_inv_walsh4x4_sse2)
-sym(vp9_short_inv_walsh4x4_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 2
-    SAVE_XMM 6
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov     rsi, arg(0)
-    mov     rdi, arg(1)
-    mov     rax, 3
-
-    movdqa    xmm0, [rsi + 0]       ;ip[4] ip[0]
-    movdqa    xmm1, [rsi + 16]      ;ip[12] ip[8]
-
-    shl     rax, 16
-    or      rax, 3            ;00030003h
-
-    pshufd    xmm2, xmm1, 4eh       ;ip[8] ip[12]
-    movdqa    xmm3, xmm0          ;ip[4] ip[0]
-
-    paddw   xmm0, xmm2          ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
-    psubw   xmm3, xmm2          ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
-
-    movdqa    xmm4, xmm0
-    punpcklqdq  xmm0, xmm3          ;d1 a1
-    punpckhqdq  xmm4, xmm3          ;c1 b1
-    movd    xmm6, eax
-
-    movdqa    xmm1, xmm4          ;c1 b1
-    paddw   xmm4, xmm0          ;d1+c1 a1+b1 aka op[4] op[0]
-    psubw   xmm0, xmm1          ;d1-c1 a1-b1 aka op[12] op[8]
-
-;;;temp output
-;;  movdqu  [rdi + 0], xmm4
-;;  movdqu  [rdi + 16], xmm3
-
-;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    ; 13 12 11 10 03 02 01 00
-    ;
-    ; 33 32 31 30 23 22 21 20
-    ;
-    movdqa    xmm3, xmm4          ; 13 12 11 10 03 02 01 00
-    punpcklwd xmm4, xmm0          ; 23 03 22 02 21 01 20 00
-    punpckhwd xmm3, xmm0          ; 33 13 32 12 31 11 30 10
-    movdqa    xmm1, xmm4          ; 23 03 22 02 21 01 20 00
-    punpcklwd xmm4, xmm3          ; 31 21 11 01 30 20 10 00
-    punpckhwd xmm1, xmm3          ; 33 23 13 03 32 22 12 02
-    ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    pshufd    xmm2, xmm1, 4eh       ;ip[8] ip[12]
-    movdqa    xmm3, xmm4          ;ip[4] ip[0]
-
-    pshufd    xmm6, xmm6, 0       ;03 03 03 03 03 03 03 03
-
-    paddw   xmm4, xmm2          ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
-    psubw   xmm3, xmm2          ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
-
-    movdqa    xmm5, xmm4
-    punpcklqdq  xmm4, xmm3          ;d1 a1
-    punpckhqdq  xmm5, xmm3          ;c1 b1
-
-    movdqa    xmm1, xmm5          ;c1 b1
-    paddw   xmm5, xmm4          ;d1+c1 a1+b1 aka op[4] op[0]
-    psubw   xmm4, xmm1          ;d1-c1 a1-b1 aka op[12] op[8]
-;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    ; 13 12 11 10 03 02 01 00
-    ;
-    ; 33 32 31 30 23 22 21 20
-    ;
-    movdqa    xmm0, xmm5          ; 13 12 11 10 03 02 01 00
-    punpcklwd xmm5, xmm4          ; 23 03 22 02 21 01 20 00
-    punpckhwd xmm0, xmm4          ; 33 13 32 12 31 11 30 10
-    movdqa    xmm1, xmm5          ; 23 03 22 02 21 01 20 00
-    punpcklwd xmm5, xmm0          ; 31 21 11 01 30 20 10 00
-    punpckhwd xmm1, xmm0          ; 33 23 13 03 32 22 12 02
-;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    paddw   xmm5, xmm6
-    paddw   xmm1, xmm6
-
-    psraw   xmm5, 3
-    psraw   xmm1, 3
-
-    movdqa  [rdi + 0], xmm5
-    movdqa  [rdi + 16], xmm1
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-x_s1sqr2:
-    times 4 dw 0x8A8C
-align 16
-x_c1sqr2less1:
-    times 4 dw 0x4E7B
-align 16
-fours:
-    times 4 dw 0x0004
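The SSE2 variant gets both halves of the butterfly from one register
pair by swapping 64-bit halves: pshufd with immediate 4eh (01001110b)
reverses the two qwords, so ip[12..15] lines up under ip[0..3]. A small
intrinsics sketch of that idea (illustrative only, not from the patch):

    #include <emmintrin.h>

    static void walsh_butterfly(const short *ip, __m128i *sum, __m128i *dif) {
        __m128i lo = _mm_loadu_si128((const __m128i *)ip);       /* ip[4] ip[0]  */
        __m128i hi = _mm_loadu_si128((const __m128i *)(ip + 8)); /* ip[12] ip[8] */
        __m128i sw = _mm_shuffle_epi32(hi, 0x4e);                /* ip[8] ip[12] */
        *sum = _mm_add_epi16(lo, sw); /* ip[4]+ip[8] ip[0]+ip[12], aka b1 a1 */
        *dif = _mm_sub_epi16(lo, sw); /* ip[4]-ip[8] ip[0]-ip[12], aka c1 d1 */
    }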
--- a/vp8/common/x86/loopfilter_mmx.asm
+++ /dev/null
@@ -1,969 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-
-;void vp9_loop_filter_horizontal_edge_mmx
-;(
-;    unsigned char *src_ptr,
-;    int src_pixel_step,
-;    const char *blimit,
-;    const char *limit,
-;    const char *thresh,
-;    int  count
-;)
-global sym(vp9_loop_filter_horizontal_edge_mmx)
-sym(vp9_loop_filter_horizontal_edge_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 32                         ; reserve 32 bytes
-    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[8];
-    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[8];
-
-        mov         rsi, arg(0) ;src_ptr
-        movsxd      rax, dword ptr arg(1) ;src_pixel_step (pitch of the frame being filtered)
-
-        movsxd      rcx, dword ptr arg(5) ;count
-.next8_h:
-        mov         rdx, arg(3) ;limit
-        movq        mm7, [rdx]
-        mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
-        add         rdi, rax
-
-        ; calculate breakout conditions
-        movq        mm2, [rdi+2*rax]      ; q3
-        movq        mm1, [rsi+2*rax]      ; q2
-        movq        mm6, mm1              ; q2
-        psubusb     mm1, mm2              ; q2-=q3
-        psubusb     mm2, mm6              ; q3-=q2
-        por         mm1, mm2              ; abs(q3-q2)
-        psubusb     mm1, mm7              ;
-
-
-        movq        mm4, [rsi+rax]        ; q1
-        movq        mm3, mm4              ; q1
-        psubusb     mm4, mm6              ; q1-=q2
-        psubusb     mm6, mm3              ; q2-=q1
-        por         mm4, mm6              ; abs(q2-q1)
-
-        psubusb     mm4, mm7
-        por        mm1, mm4
-
-        movq        mm4, [rsi]            ; q0
-        movq        mm0, mm4              ; q0
-        psubusb     mm4, mm3              ; q0-=q1
-        psubusb     mm3, mm0              ; q1-=q0
-        por         mm4, mm3              ; abs(q0-q1)
-        movq        t0, mm4               ; save to t0
-        psubusb     mm4, mm7
-        por        mm1, mm4
-
-
-        neg         rax                   ; negate pitch to deal with above border
-
-        movq        mm2, [rsi+4*rax]      ; p3
-        movq        mm4, [rdi+4*rax]      ; p2
-        movq        mm5, mm4              ; p2
-        psubusb     mm4, mm2              ; p2-=p3
-        psubusb     mm2, mm5              ; p3-=p2
-        por         mm4, mm2              ; abs(p3 - p2)
-        psubusb     mm4, mm7
-        por        mm1, mm4
-
-
-        movq        mm4, [rsi+2*rax]      ; p1
-        movq        mm3, mm4              ; p1
-        psubusb     mm4, mm5              ; p1-=p2
-        psubusb     mm5, mm3              ; p2-=p1
-        por         mm4, mm5              ; abs(p2 - p1)
-        psubusb     mm4, mm7
-        por        mm1, mm4
-
-        movq        mm2, mm3              ; p1
-
-        movq        mm4, [rsi+rax]        ; p0
-        movq        mm5, mm4              ; p0
-        psubusb     mm4, mm3              ; p0-=p1
-        psubusb     mm3, mm5              ; p1-=p0
-        por         mm4, mm3              ; abs(p1 - p0)
-        movq        t1, mm4               ; save to t1
-        psubusb     mm4, mm7
-        por        mm1, mm4
-
-        movq        mm3, [rdi]            ; q1
-        movq        mm4, mm3              ; q1
-        psubusb     mm3, mm2              ; q1-=p1
-        psubusb     mm2, mm4              ; p1-=q1
-        por         mm2, mm3              ; abs(p1-q1)
-        pand        mm2, [GLOBAL(tfe)]    ; set lsb of each byte to zero
-        psrlw       mm2, 1                ; abs(p1-q1)/2
-
-        movq        mm6, mm5              ; p0
-        movq        mm3, [rsi]            ; q0
-        psubusb     mm5, mm3              ; p0-=q0
-        psubusb     mm3, mm6              ; q0-=p0
-        por         mm5, mm3              ; abs(p0 - q0)
-        paddusb     mm5, mm5              ; abs(p0-q0)*2
-        paddusb     mm5, mm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
-        mov         rdx, arg(2) ;blimit           ; get blimit
-        movq        mm7, [rdx]            ; blimit
-
-        psubusb     mm5,    mm7           ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
-        por         mm1,    mm5
-        pxor        mm5,    mm5
-        pcmpeqb     mm1,    mm5           ; mask mm1
-
-        ; calculate high edge variance
-        mov         rdx, arg(4) ;thresh           ; get thresh
-        movq        mm7, [rdx]            ;
-        movq        mm4, t0               ; get abs (q1 - q0)
-        psubusb     mm4, mm7
-        movq        mm3, t1               ; get abs (p1 - p0)
-        psubusb     mm3, mm7
-        paddb       mm4, mm3              ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
-
-        pcmpeqb     mm4,        mm5
-
-        pcmpeqb     mm5,        mm5
-        pxor        mm4,        mm5
-
-
-        ; start work on filters
-        movq        mm2, [rsi+2*rax]      ; p1
-        movq        mm7, [rdi]            ; q1
-        pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
-        pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
-        psubsb      mm2, mm7              ; p1 - q1
-        pand        mm2, mm4              ; high var mask (hvm)(p1 - q1)
-        pxor        mm6, [GLOBAL(t80)]    ; offset to convert to signed values
-        pxor        mm0, [GLOBAL(t80)]    ; offset to convert to signed values
-        movq        mm3, mm0              ; q0
-        psubsb      mm0, mm6              ; q0 - p0
-        paddsb      mm2, mm0              ; 1 * (q0 - p0) + hvm(p1 - q1)
-        paddsb      mm2, mm0              ; 2 * (q0 - p0) + hvm(p1 - q1)
-        paddsb      mm2, mm0              ; 3 * (q0 - p0) + hvm(p1 - q1)
-        pand        mm1, mm2                  ; mask filter values we don't care about
-        movq        mm2, mm1
-        paddsb      mm1, [GLOBAL(t4)]     ; 3* (q0 - p0) + hvm(p1 - q1) + 4
-        paddsb      mm2, [GLOBAL(t3)]     ; 3* (q0 - p0) + hvm(p1 - q1) + 3
-
-        pxor        mm0, mm0             ;
-        pxor        mm5, mm5
-        punpcklbw   mm0, mm2            ;
-        punpckhbw   mm5, mm2            ;
-        psraw       mm0, 11             ;
-        psraw       mm5, 11
-        packsswb    mm0, mm5
-        movq        mm2, mm0            ;  (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
-
-        pxor        mm0, mm0              ; 0
-        movq        mm5, mm1              ; abcdefgh
-        punpcklbw   mm0, mm1              ; e0f0g0h0
-        psraw       mm0, 11               ; sign extended shift right by 3
-        pxor        mm1, mm1              ; 0
-        punpckhbw   mm1, mm5              ; a0b0c0d0
-        psraw       mm1, 11               ; sign extended shift right by 3
-        movq        mm5, mm0              ; save results
-
-        packsswb    mm0, mm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
-        paddsw      mm5, [GLOBAL(ones)]
-        paddsw      mm1, [GLOBAL(ones)]
-        psraw       mm5, 1                ; partial shifted one more time for 2nd tap
-        psraw       mm1, 1                ; partial shifted one more time for 2nd tap
-        packsswb    mm5, mm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
-        pandn       mm4, mm5              ; high edge variance additive
-
-        paddsb      mm6, mm2              ; p0+= p0 add
-        pxor        mm6, [GLOBAL(t80)]    ; unoffset
-        movq        [rsi+rax], mm6        ; write back
-
-        movq        mm6, [rsi+2*rax]      ; p1
-        pxor        mm6, [GLOBAL(t80)]    ; reoffset
-        paddsb      mm6, mm4              ; p1+= p1 add
-        pxor        mm6, [GLOBAL(t80)]    ; unoffset
-        movq        [rsi+2*rax], mm6      ; write back
-
-        psubsb      mm3, mm0              ; q0-= q0 add
-        pxor        mm3, [GLOBAL(t80)]    ; unoffset
-        movq        [rsi], mm3            ; write back
-
-        psubsb      mm7, mm4              ; q1-= q1 add
-        pxor        mm7, [GLOBAL(t80)]    ; unoffset
-        movq        [rdi], mm7            ; write back
-
-        add         rsi,8
-        neg         rax
-        dec         rcx
-        jnz         .next8_h
-
-    add rsp, 32
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
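The breakout conditions computed above with saturating byte arithmetic
amount to the usual per-edge filter mask. A scalar sketch of the test
(names are illustrative, not from the patch); note that the tfe
constant (bytes of 0xfe) clears each byte's lsb so the word-wide psrlw
can halve all eight abs(p1-q1) bytes at once without bits leaking
between lanes:

    #include <stdlib.h>

    static int edge_needs_filtering(unsigned char blimit, unsigned char limit,
                                    unsigned char p3, unsigned char p2,
                                    unsigned char p1, unsigned char p0,
                                    unsigned char q0, unsigned char q1,
                                    unsigned char q2, unsigned char q3) {
        int skip = 0;                   /* one psubusb/por pair per test */
        skip |= abs(p3 - p2) > limit;
        skip |= abs(p2 - p1) > limit;
        skip |= abs(p1 - p0) > limit;
        skip |= abs(q1 - q0) > limit;
        skip |= abs(q2 - q1) > limit;
        skip |= abs(q3 - q2) > limit;
        skip |= abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit;
        return !skip;  /* pcmpeqb with 0: lanes still zero get filtered */
    }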
-
-;void vp9_loop_filter_vertical_edge_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  src_pixel_step,
-;    const char *blimit,
-;    const char *limit,
-;    const char *thresh,
-;    int count
-;)
-global sym(vp9_loop_filter_vertical_edge_mmx)
-sym(vp9_loop_filter_vertical_edge_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub          rsp, 64      ; reserve 64 bytes
-    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
-    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
-    %define srct [rsp + 32]   ;__declspec(align(16)) char srct[32];
-
-        mov         rsi,        arg(0) ;src_ptr
-        movsxd      rax,        dword ptr arg(1) ;src_pixel_step (pitch of the frame being filtered)
-
-        lea         rsi,        [rsi + rax*4 - 4]
-
-        movsxd      rcx,        dword ptr arg(5) ;count
-.next8_v:
-        mov         rdi,        rsi           ; rdi points to row +1 for indirect addressing
-        add         rdi,        rax
-
-
-        ;transpose
-        movq        mm6,        [rsi+2*rax]                 ; 67 66 65 64 63 62 61 60
-        movq        mm7,        mm6                         ; 77 76 75 74 73 72 71 70
-
-        punpckhbw   mm7,        [rdi+2*rax]                 ; 77 67 76 66 75 65 74 64
-        punpcklbw   mm6,        [rdi+2*rax]                 ; 73 63 72 62 71 61 70 60
-
-        movq        mm4,        [rsi]                       ; 47 46 45 44 43 42 41 40
-        movq        mm5,        mm4                         ; 47 46 45 44 43 42 41 40
-
-        punpckhbw   mm5,        [rsi+rax]                   ; 57 47 56 46 55 45 54 44
-        punpcklbw   mm4,        [rsi+rax]                   ; 53 43 52 42 51 41 50 40
-
-        movq        mm3,        mm5                         ; 57 47 56 46 55 45 54 44
-        punpckhwd   mm5,        mm7                         ; 77 67 57 47 76 66 56 46
-
-        punpcklwd   mm3,        mm7                         ; 75 65 55 45 74 64 54 44
-        movq        mm2,        mm4                         ; 53 43 52 42 51 41 50 40
-
-        punpckhwd   mm4,        mm6                         ; 73 63 53 43 72 62 52 42
-        punpcklwd   mm2,        mm6                         ; 71 61 51 41 70 60 50 40
-
-        neg         rax
-        movq        mm6,        [rsi+rax*2]                 ; 27 26 25 24 23 22 21 20
-
-        movq        mm1,        mm6                         ; 27 26 25 24 23 22 21 20
-        punpckhbw   mm6,        [rsi+rax]                   ; 37 27 36 26 35 25 34 24
-
-        punpcklbw   mm1,        [rsi+rax]                   ; 33 23 32 22 31 21 30 20
-        movq        mm7,        [rsi+rax*4];                ; 07 06 05 04 03 02 01 00
-
-        punpckhbw   mm7,        [rdi+rax*4]                 ; 17 07 16 06 15 05 14 04
-        movq        mm0,        mm7                         ; 17 07 16 06 15 05 14 04
-
-        punpckhwd   mm7,        mm6                         ; 37 27 17 07 36 26 16 06
-        punpcklwd   mm0,        mm6                         ; 35 25 15 05 34 24 14 04
-
-        movq        mm6,        mm7                         ; 37 27 17 07 36 26 16 06
-        punpckhdq   mm7,        mm5                         ; 77 67 57 47 37 27 17 07  = q3
-
-        punpckldq   mm6,        mm5                         ; 76 66 56 46 36 26 16 06  = q2
-
-        movq        mm5,        mm6                         ; 76 66 56 46 36 26 16 06
-        psubusb     mm5,        mm7                         ; q2-q3
-
-        psubusb     mm7,        mm6                         ; q3-q2
-        por         mm7,        mm5;                        ; mm7=abs (q3-q2)
-
-        movq        mm5,        mm0                         ; 35 25 15 05 34 24 14 04
-        punpckhdq   mm5,        mm3                         ; 75 65 55 45 35 25 15 05 = q1
-
-        punpckldq   mm0,        mm3                         ; 74 64 54 44 34 24 14 04 = q0
-        movq        mm3,        mm5                         ; 75 65 55 45 35 25 15 05 = q1
-
-        psubusb     mm3,        mm6                         ; q1-q2
-        psubusb     mm6,        mm5                         ; q2-q1
-
-        por         mm6,        mm3                         ; mm6=abs(q2-q1)
-        lea         rdx,        srct
-
-        movq        [rdx+24],   mm5                         ; save q1
-        movq        [rdx+16],   mm0                         ; save q0
-
-        movq        mm3,        [rsi+rax*4]                 ; 07 06 05 04 03 02 01 00
-        punpcklbw   mm3,        [rdi+rax*4]                 ; 13 03 12 02 11 01 10 00
-
-        movq        mm0,        mm3                         ; 13 03 12 02 11 01 10 00
-        punpcklwd   mm0,        mm1                         ; 31 21 11 01 30 20 10 00
-
-        punpckhwd   mm3,        mm1                         ; 33 23 13 03 32 22 12 02
-        movq        mm1,        mm0                         ; 31 21 11 01 30 20 10 00
-
-        punpckldq   mm0,        mm2                         ; 70 60 50 40 30 20 10 00  =p3
-        punpckhdq   mm1,        mm2                         ; 71 61 51 41 31 21 11 01  =p2
-
-        movq        mm2,        mm1                         ; 71 61 51 41 31 21 11 01  =p2
-        psubusb     mm2,        mm0                         ; p2-p3
-
-        psubusb     mm0,        mm1                         ; p3-p2
-        por         mm0,        mm2                         ; mm0=abs(p3-p2)
-
-        movq        mm2,        mm3                         ; 33 23 13 03 32 22 12 02
-        punpckldq   mm2,        mm4                         ; 72 62 52 42 32 22 12 02 = p1
-
-        punpckhdq   mm3,        mm4                         ; 73 63 53 43 33 23 13 03 = p0
-        movq        [rdx+8],    mm3                         ; save p0
-
-        movq        [rdx],      mm2                         ; save p1
-        movq        mm5,        mm2                         ; mm5 = p1
-
-        psubusb     mm2,        mm1                         ; p1-p2
-        psubusb     mm1,        mm5                         ; p2-p1
-
-        por         mm1,        mm2                         ; mm1=abs(p2-p1)
-        mov         rdx,        arg(3) ;limit
-
-        movq        mm4,        [rdx]                       ; mm4 = limit
-        psubusb     mm7,        mm4
-
-        psubusb     mm0,        mm4
-        psubusb     mm1,        mm4
-
-        psubusb     mm6,        mm4
-        por         mm7,        mm6
-
-        por         mm0,        mm1
-        por         mm0,        mm7                         ;   abs(q3-q2) > limit || abs(p3-p2) > limit || abs(p2-p1) > limit || abs(q2-q1) > limit
-
-        movq        mm1,        mm5                         ; p1
-
-        movq        mm7,        mm3                         ; mm3=mm7=p0
-        psubusb     mm7,        mm5                         ; p0 - p1
-
-        psubusb     mm5,        mm3                         ; p1 - p0
-        por         mm5,        mm7                         ; abs(p1-p0)
-
-        movq        t0,         mm5                         ; save abs(p1-p0)
-        lea         rdx,        srct
-
-        psubusb     mm5,        mm4
-        por         mm0,        mm5                         ; mm0=mask
-
-        movq        mm5,        [rdx+16]                    ; mm5=q0
-        movq        mm7,        [rdx+24]                    ; mm7=q1
-
-        movq        mm6,        mm5                         ; mm6=q0
-        movq        mm2,        mm7                         ; q1
-        psubusb     mm5,        mm7                         ; q0-q1
-
-        psubusb     mm7,        mm6                         ; q1-q0
-        por         mm7,        mm5                         ; abs(q1-q0)
-
-        movq        t1,         mm7                         ; save abs(q1-q0)
-        psubusb     mm7,        mm4
-
-        por         mm0,        mm7                         ; mask
-
-        movq        mm5,        mm2                         ; q1
-        psubusb     mm5,        mm1                         ; q1-=p1
-        psubusb     mm1,        mm2                         ; p1-=q1
-        por         mm5,        mm1                         ; abs(p1-q1)
-        pand        mm5,        [GLOBAL(tfe)]               ; set lsb of each byte to zero
-        psrlw       mm5,        1                           ; abs(p1-q1)/2
-
-        mov         rdx,        arg(2) ;blimit                      ;
-
-        movq        mm4,        [rdx]                       ;blimit
-        movq        mm1,        mm3                         ; mm1=mm3=p0
-
-        movq        mm7,        mm6                         ; mm7=mm6=q0
-        psubusb     mm1,        mm7                         ; p0-q0
-
-        psubusb     mm7,        mm3                         ; q0-p0
-        por         mm1,        mm7                         ; abs(q0-p0)
-        paddusb     mm1,        mm1                         ; abs(q0-p0)*2
-        paddusb     mm1,        mm5                         ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
-        psubusb     mm1,        mm4                         ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
-        por         mm1,        mm0;                        ; mask
-
-        pxor        mm0,        mm0
-        pcmpeqb     mm1,        mm0
-
-        ; calculate high edge variance
-        mov         rdx,        arg(4) ;thresh            ; get thresh
-        movq        mm7,        [rdx]
-        ;
-        movq        mm4,        t0              ; get abs (q1 - q0)
-        psubusb     mm4,        mm7
-
-        movq        mm3,        t1              ; get abs (p1 - p0)
-        psubusb     mm3,        mm7
-
-        por         mm4,        mm3             ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
-        pcmpeqb     mm4,        mm0
-
-        pcmpeqb     mm0,        mm0
-        pxor        mm4,        mm0
-
-
-
-        ; start work on filters
-        lea         rdx,        srct
-
-        movq        mm2,        [rdx]           ; p1
-        movq        mm7,        [rdx+24]        ; q1
-
-        movq        mm6,        [rdx+8]         ; p0
-        movq        mm0,        [rdx+16]        ; q0
-
-        pxor        mm2,        [GLOBAL(t80)]   ; p1 offset to convert to signed values
-        pxor        mm7,        [GLOBAL(t80)]   ; q1 offset to convert to signed values
-
-        psubsb      mm2,        mm7             ; p1 - q1
-        pand        mm2,        mm4             ; high var mask (hvm)(p1 - q1)
-
-        pxor        mm6,        [GLOBAL(t80)]   ; offset to convert to signed values
-        pxor        mm0,        [GLOBAL(t80)]   ; offset to convert to signed values
-
-        movq        mm3,        mm0             ; q0
-        psubsb      mm0,        mm6             ; q0 - p0
-
-        paddsb      mm2,        mm0             ; 1 * (q0 - p0) + hvm(p1 - q1)
-        paddsb      mm2,        mm0             ; 2 * (q0 - p0) + hvm(p1 - q1)
-
-        paddsb      mm2,        mm0             ; 3 * (q0 - p0) + hvm(p1 - q1)
-        pand       mm1,        mm2              ; mask filter values we don't care about
-
-        movq        mm2,        mm1
-        paddsb      mm1,        [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
-
-        paddsb      mm2,        [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3
-        pxor        mm0,        mm0          ;
-
-        pxor        mm5,        mm5
-        punpcklbw   mm0,        mm2         ;
-
-        punpckhbw   mm5,        mm2         ;
-        psraw       mm0,        11              ; sign extended shift right by 3
-
-        psraw       mm5,        11              ; sign extended shift right by 3
-        packsswb    mm0,        mm5
-
-        movq        mm2,        mm0         ;  (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
-
-        pxor        mm0,        mm0           ; 0
-        movq        mm5,        mm1           ; abcdefgh
-
-        punpcklbw   mm0,        mm1           ; e0f0g0h0
-        psraw       mm0,        11                ; sign extended shift right by 3
-
-        pxor        mm1,        mm1           ; 0
-        punpckhbw   mm1,        mm5           ; a0b0c0d0
-
-        psraw       mm1,        11                ; sign extended shift right by 3
-        movq        mm5,        mm0              ; save results
-
-        packsswb    mm0,        mm1           ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
-        paddsw      mm5,        [GLOBAL(ones)]
-
-        paddsw      mm1,        [GLOBAL(ones)]
-        psraw       mm5,        1                 ; partial shifted one more time for 2nd tap
-
-        psraw       mm1,        1                 ; partial shifted one more time for 2nd tap
-        packsswb    mm5,        mm1           ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
-
-        pandn       mm4,        mm5             ; high edge variance additive
-
-        paddsb      mm6,        mm2             ; p0+= p0 add
-        pxor        mm6,        [GLOBAL(t80)]   ; unoffset
-
-        ; mm6=p0                               ;
-        movq        mm1,        [rdx]           ; p1
-        pxor        mm1,        [GLOBAL(t80)]   ; reoffset
-
-        paddsb      mm1,        mm4                 ; p1+= p1 add
-        pxor        mm1,        [GLOBAL(t80)]       ; unoffset
-        ; mm6 = p0 mm1 = p1
-
-        psubsb      mm3,        mm0                 ; q0-= q0 add
-        pxor        mm3,        [GLOBAL(t80)]       ; unoffset
-
-        ; mm3 = q0
-        psubsb      mm7,        mm4                 ; q1-= q1 add
-        pxor        mm7,        [GLOBAL(t80)]       ; unoffset
-        ; mm7 = q1
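-
-        ; Taken together, the tap arithmetic above matches this scalar
-        ; sketch (hypothetical C; clamp() is the saturation to [-128, 127]
-        ; that the signed-byte instructions perform):
-        ;
-        ;   f  = mask ? clamp(3 * (q0 - p0) + (hev ? clamp(p1 - q1) : 0)) : 0;
-        ;   q0 -= clamp(f + 4) >> 3;                  // Filter1
-        ;   p0 += clamp(f + 3) >> 3;                  // Filter2
-        ;   u  = hev ? 0 : ((clamp(f + 4) >> 3) + 1) >> 1;
-        ;   q1 -= u;  p1 += u;                        // outer taps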
-
-        ; transpose and write back
-        ; mm1 =    72 62 52 42 32 22 12 02
-        ; mm6 =    73 63 53 43 33 23 13 03
-        ; mm3 =    74 64 54 44 34 24 14 04
-        ; mm7 =    75 65 55 45 35 25 15 05
-
-        movq        mm2,        mm1             ; 72 62 52 42 32 22 12 02
-        punpcklbw   mm2,        mm6             ; 33 32 23 22 13 12 03 02
-
-        movq        mm4,        mm3             ; 74 64 54 44 34 24 14 04
-        punpckhbw   mm1,        mm6             ; 73 72 63 62 53 52 43 42
-
-        punpcklbw   mm4,        mm7             ; 35 34 25 24 15 14 05 04
-        punpckhbw   mm3,        mm7             ; 75 74 65 64 55 54 45 44
-
-        movq        mm6,        mm2             ; 33 32 23 22 13 12 03 02
-        punpcklwd   mm2,        mm4             ; 15 14 13 12 05 04 03 02
-
-        punpckhwd   mm6,        mm4             ; 35 34 33 32 25 24 23 22
-        movq        mm5,        mm1             ; 73 72 63 62 53 52 43 42
-
-        punpcklwd   mm1,        mm3             ; 55 54 53 52 45 44 43 42
-        punpckhwd   mm5,        mm3             ; 75 74 73 72 65 64 63 62
-
-
-        ; mm2 = 15 14 13 12 05 04 03 02
-        ; mm6 = 35 34 33 32 25 24 23 22
-        ; mm5 = 55 54 53 52 45 44 43 42
-        ; mm1 = 75 74 73 72 65 64 63 62
-
-
-
-        movd        [rsi+rax*4+2], mm2
-        psrlq       mm2,        32
-
-        movd        [rdi+rax*4+2], mm2
-        movd        [rsi+rax*2+2], mm6
-
-        psrlq       mm6,        32
-        movd        [rsi+rax+2],mm6
-
-        movd        [rsi+2],    mm1
-        psrlq       mm1,        32
-
-        movd        [rdi+2],    mm1
-        neg         rax
-
-        movd        [rdi+rax+2],mm5
-        psrlq       mm5,        32
-
-        movd        [rdi+rax*2+2], mm5
-
-        lea         rsi,        [rsi+rax*8]
-        dec         rcx
-        jnz         .next8_v
-
-    add rsp, 64
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_loop_filter_simple_horizontal_edge_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  src_pixel_step,
-;    const char *blimit
-;)
-global sym(vp9_loop_filter_simple_horizontal_edge_mmx)
-sym(vp9_loop_filter_simple_horizontal_edge_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 3
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi, arg(0) ;src_ptr
-        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; source pitch
-
-        mov         rcx, 2                ; count
-.nexts8_h:
-        mov         rdx, arg(2) ;blimit           ; get blimit
-        movq        mm3, [rdx]            ;
-
-        mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
-        add         rdi, rax
-        neg         rax
-
-        ; calculate mask
-        movq        mm1, [rsi+2*rax]      ; p1
-        movq        mm0, [rdi]            ; q1
-        movq        mm2, mm1
-        movq        mm7, mm0
-        movq        mm4, mm0
-        psubusb     mm0, mm1              ; q1-=p1
-        psubusb     mm1, mm4              ; p1-=q1
-        por         mm1, mm0              ; abs(p1-q1)
-        pand        mm1, [GLOBAL(tfe)]    ; set lsb of each byte to zero
-        psrlw       mm1, 1                ; abs(p1-q1)/2
-
-        movq        mm5, [rsi+rax]        ; p0
-        movq        mm4, [rsi]            ; q0
-        movq        mm0, mm4              ; q0
-        movq        mm6, mm5              ; p0
-        psubusb     mm5, mm4              ; p0-=q0
-        psubusb     mm4, mm6              ; q0-=p0
-        por         mm5, mm4              ; abs(p0 - q0)
-        paddusb     mm5, mm5              ; abs(p0-q0)*2
-        paddusb     mm5, mm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
-        psubusb     mm5, mm3              ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
-        pxor        mm3, mm3
-        pcmpeqb     mm5, mm3
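-
-        ; The "simple" filter mask computed above reduces to this scalar
-        ; sketch (hypothetical C, one byte lane):
-        ;
-        ;   mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit) ? 0xff : 0;
-        ;
-        ; The pand with tfe clears each byte's low bit first, so the psrlw
-        ; halving cannot leak a bit across a byte-lane boundary.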
-
-        ; start work on filters
-        pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
-        pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
-        psubsb      mm2, mm7              ; p1 - q1
-
-        pxor        mm6, [GLOBAL(t80)]    ; offset to convert to signed values
-        pxor        mm0, [GLOBAL(t80)]    ; offset to convert to signed values
-        movq        mm3, mm0              ; q0
-        psubsb      mm0, mm6              ; q0 - p0
-        paddsb      mm2, mm0              ; p1 - q1 + 1 * (q0 - p0)
-        paddsb      mm2, mm0              ; p1 - q1 + 2 * (q0 - p0)
-        paddsb      mm2, mm0              ; p1 - q1 + 3 * (q0 - p0)
-        pand        mm5, mm2              ; mask filter values we don't care about
-
-        ; do + 4 side
-        paddsb      mm5, [GLOBAL(t4)]     ; 3* (q0 - p0) + (p1 - q1) + 4
-
-        movq        mm0, mm5              ; get a copy of filters
-        psllw       mm0, 8                ; shift left 8
-        psraw       mm0, 3                ; arithmetic shift right 3
-        psrlw       mm0, 8
-        movq        mm1, mm5              ; get a copy of filters
-        psraw       mm1, 11               ; arithmetic shift right 11
-        psllw       mm1, 8                ; shift left 8 to put it back
-
-        por         mm0, mm1              ; put the two together to get result
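-
-        ; MMX has no per-byte arithmetic shift, so the two word-shift chains
-        ; above emulate one; per 16-bit lane holding bytes hi:lo, roughly
-        ; (hypothetical C):
-        ;
-        ;   lo2  = (uint8_t)(((int8_t)lo) >> 3);   // psllw 8, psraw 3, psrlw 8
-        ;   hi2  = (uint8_t)(((int8_t)hi) >> 3);   // psraw 11, psllw 8
-        ;   lane = (hi2 << 8) | lo2;               // por recombines the halves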
-
-        psubsb      mm3, mm0              ; q0-= q0 add
-        pxor        mm3, [GLOBAL(t80)]    ; unoffset
-        movq        [rsi], mm3            ; write back
-
-
-        ; now do +3 side
-        psubsb      mm5, [GLOBAL(t1s)]     ; +3 instead of +4
-
-        movq        mm0, mm5              ; get a copy of filters
-        psllw       mm0, 8                ; shift left 8
-        psraw       mm0, 3                ; arithmetic shift right 3
-        psrlw       mm0, 8
-        psraw       mm5, 11               ; arithmetic shift right 11
-        psllw       mm5, 8                ; shift left 8 to put it back
-        por         mm0, mm5              ; put the two together to get result
-
-
-        paddsb      mm6, mm0              ; p0+= p0 add
-        pxor        mm6, [GLOBAL(t80)]    ; unoffset
-        movq        [rsi+rax], mm6        ; write back
-
-        add         rsi,8
-        neg         rax
-        dec         rcx
-        jnz         .nexts8_h
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_loop_filter_simple_vertical_edge_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  src_pixel_step,
-;    const char *blimit
-;)
-global sym(vp9_loop_filter_simple_vertical_edge_mmx)
-sym(vp9_loop_filter_simple_vertical_edge_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 3
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub          rsp, 32      ; reserve 32 bytes
-    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
-    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
-
-        mov         rsi, arg(0) ;src_ptr
-        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; source pitch
-
-        lea         rsi,        [rsi + rax*4 - 2]
-        mov         rcx, 2                                      ; count
-.nexts8_v:
-
-        lea         rdi,        [rsi + rax];
-        movd        mm0,        [rdi + rax * 2]                 ; xx xx xx xx 73 72 71 70
-
-        movd        mm6,        [rsi + rax * 2]                 ; xx xx xx xx 63 62 61 60
-        punpcklbw   mm6,        mm0                             ; 73 63 72 62 71 61 70 60
-
-        movd        mm0,        [rsi + rax]                     ; xx xx xx xx 53 52 51 50
-        movd        mm4,        [rsi]                           ; xx xx xx xx 43 42 41 40
-
-        punpcklbw   mm4,        mm0                             ; 53 43 52 42 51 41 50 40
-        movq        mm5,        mm4                             ; 53 43 52 42 51 41 50 40
-
-        punpcklwd   mm4,        mm6                             ; 71 61 51 41 70 60 50 40
-        punpckhwd   mm5,        mm6                             ; 73 63 53 43 72 62 52 42
-
-        neg         rax
-
-        movd        mm7,        [rsi + rax]                     ; xx xx xx xx 33 32 31 30
-        movd        mm6,        [rsi + rax * 2]                 ; xx xx xx xx 23 22 21 20
-
-        punpcklbw   mm6,        mm7                             ; 33 23 32 22 31 21 30 20
-        movd        mm1,        [rdi + rax * 4]                 ; xx xx xx xx 13 12 11 10
-
-        movd        mm0,        [rsi + rax * 4]                 ; xx xx xx xx 03 02 01 00
-        punpcklbw   mm0,        mm1                             ; 13 03 12 02 11 01 10 00
-
-        movq        mm2,        mm0                             ; 13 03 12 02 11 01 10 00
-        punpcklwd   mm0,        mm6                             ; 31 21 11 01 30 20 10 00
-
-        punpckhwd   mm2,        mm6                             ; 33 23 13 03 32 22 12 02
-        movq        mm1,        mm0                             ; 13 03 12 02 11 01 10 00
-
-        punpckldq   mm0,        mm4                             ; 70 60 50 40 30 20 10 00       = p1
-        movq        mm3,        mm2                             ; 33 23 13 03 32 22 12 02
-
-        punpckhdq   mm1,        mm4                             ; 71 61 51 41 31 21 11 01       = p0
-        punpckldq   mm2,        mm5                             ; 72 62 52 42 32 22 12 02       = q0
-
-        punpckhdq   mm3,        mm5                             ; 73 63 53 43 33 23 13 03       = q1
-
-
-        ; calculate mask
-        movq        mm6,        mm0                             ; p1
-        movq        mm7,        mm3                             ; q1
-        psubusb     mm7,        mm6                             ; q1-=p1
-        psubusb     mm6,        mm3                             ; p1-=q1
-        por         mm6,        mm7                             ; abs(p1-q1)
-        pand        mm6,        [GLOBAL(tfe)]                   ; set lsb of each byte to zero
-        psrlw       mm6,        1                               ; abs(p1-q1)/2
-
-        movq        mm5,        mm1                             ; p0
-        movq        mm4,        mm2                             ; q0
-
-        psubusb     mm5,        mm2                             ; p0-=q0
-        psubusb     mm4,        mm1                             ; q0-=p0
-
-        por         mm5,        mm4                             ; abs(p0 - q0)
-        paddusb     mm5,        mm5                             ; abs(p0-q0)*2
-        paddusb     mm5,        mm6                             ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
-        mov         rdx,        arg(2) ;blimit                          ; get blimit
-        movq        mm7,        [rdx]
-
-        psubusb     mm5,        mm7                             ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
-        pxor        mm7,        mm7
-        pcmpeqb     mm5,        mm7                             ; mm5 = mask
-
-        ; start work on filters
-        movq        t0,         mm0
-        movq        t1,         mm3
-
-        pxor        mm0,        [GLOBAL(t80)]                   ; p1 offset to convert to signed values
-        pxor        mm3,        [GLOBAL(t80)]                   ; q1 offset to convert to signed values
-
-        psubsb      mm0,        mm3                             ; p1 - q1
-        movq        mm6,        mm1                             ; p0
-
-        movq        mm7,        mm2                             ; q0
-        pxor        mm6,        [GLOBAL(t80)]                   ; offset to convert to signed values
-
-        pxor        mm7,        [GLOBAL(t80)]                   ; offset to convert to signed values
-        movq        mm3,        mm7                             ; offset q0 copy
-
-        psubsb      mm7,        mm6                             ; q0 - p0
-        paddsb      mm0,        mm7                             ; p1 - q1 + 1 * (q0 - p0)
-
-        paddsb      mm0,        mm7                             ; p1 - q1 + 2 * (q0 - p0)
-        paddsb      mm0,        mm7                             ; p1 - q1 + 3 * (q0 - p0)
-
-        pand        mm5,        mm0                             ; mask filter values we don't care about
-
-        paddsb      mm5,        [GLOBAL(t4)]                    ;  3* (q0 - p0) + (p1 - q1) + 4
-
-        movq        mm0,        mm5                             ; get a copy of filters
-        psllw       mm0,        8                               ; shift left 8
-        psraw       mm0,        3                               ; arithmetic shift right 3
-        psrlw       mm0,        8
-
-        movq        mm7,        mm5                             ; get a copy of filters
-        psraw       mm7,        11                              ; arithmetic shift right 11
-        psllw       mm7,        8                               ; shift left 8 to put it back
-
-        por         mm0,        mm7                             ; put the two together to get result
-
-        psubsb      mm3,        mm0                             ; q0-= q0 add
-        pxor        mm3,        [GLOBAL(t80)]                   ; unoffset
-
-        ; now do +3 side
-        psubsb      mm5, [GLOBAL(t1s)]                          ; +3 instead of +4
-
-        movq        mm0, mm5                                    ; get a copy of filters
-        psllw       mm0, 8                                      ; shift left 8
-        psraw       mm0, 3                                      ; arithmetic shift right 3
-        psrlw       mm0, 8
-
-        psraw       mm5, 11                                     ; arithmetic shift right 11
-        psllw       mm5, 8                                      ; shift left 8 to put it back
-        por         mm0, mm5                                    ; put the two together to get result
-
-        paddsb      mm6, mm0                                    ; p0+= p0 add
-        pxor        mm6, [GLOBAL(t80)]                          ; unoffset
-
-
-        movq        mm0,        t0
-        movq        mm4,        t1
-
-        ; mm0 = 70 60 50 40 30 20 10 00
-        ; mm6 = 71 61 51 41 31 21 11 01
-        ; mm3 = 72 62 52 42 32 22 12 02
-        ; mm4 = 73 63 53 43 33 23 13 03
-        ; transpose back to write out
-
-        movq        mm1,        mm0                         ;
-        punpcklbw   mm0,        mm6                         ; 31 30 21 20 11 10 01 00
-
-        punpckhbw   mm1,        mm6                         ; 71 70 61 60 51 50 41 40
-        movq        mm2,        mm3                         ;
-
-        punpcklbw   mm2,        mm4                         ; 33 32 23 22 13 12 03 02
-        movq        mm5,        mm1                         ; 71 70 61 60 51 50 41 40
-
-        punpckhbw   mm3,        mm4                         ; 73 72 63 62 53 52 43 42
-        movq        mm6,        mm0                         ; 31 30 21 20 11 10 01 00
-
-        punpcklwd   mm0,        mm2                         ; 13 12 11 10 03 02 01 00
-        punpckhwd   mm6,        mm2                         ; 33 32 31 30 23 22 21 20
-
-        movd        [rsi+rax*4], mm0                        ; write 03 02 01 00
-        punpcklwd   mm1,        mm3                         ; 53 52 51 50 43 42 41 40
-
-        psrlq       mm0,        32                          ; xx xx xx xx 13 12 11 10
-        punpckhwd   mm5,        mm3                         ; 73 72 71 70 63 62 61 60
-
-        movd        [rdi+rax*4], mm0                        ; write 13 12 11 10
-        movd        [rsi+rax*2], mm6                        ; write 23 22 21 20
-
-        psrlq       mm6,        32                          ; 33 32 31 30
-        movd        [rsi],      mm1                         ; write 43 42 41 40
-
-        movd        [rsi + rax], mm6                        ; write 33 32 31 30
-        neg         rax
-
-        movd        [rsi + rax*2], mm5                      ; write 63 62 61 60
-        psrlq       mm1,        32                          ; 53 52 51 50
-
-        movd        [rdi],      mm1                         ; write out 53 52 51 50
-        psrlq       mm5,        32                          ; 73 72 71 70
-
-        movd        [rdi + rax*2], mm5                      ; write 73 72 71 70
-
-        lea         rsi,        [rsi+rax*8]                 ; next 8
-
-        dec         rcx
-        jnz         .nexts8_v
-
-    add rsp, 32
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
-;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr,
-;                  int y_stride,
-;                  loop_filter_info *lfi)
-;{
-;
-;
-;    vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
-;    vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
-;    vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
-;}
-
-SECTION_RODATA
-align 16
-tfe:
-    times 8 db 0xfe
-align 16
-t80:
-    times 8 db 0x80
-align 16
-t1s:
-    times 8 db 0x01
-align 16
-t3:
-    times 8 db 0x03
-align 16
-t4:
-    times 8 db 0x04
-align 16
-ones:
-    times 4 dw 0x0001
-align 16
-s27:
-    times 4 dw 0x1b00
-align 16
-s18:
-    times 4 dw 0x1200
-align 16
-s9:
-    times 4 dw 0x0900
-align 16
-s63:
-    times 4 dw 0x003f
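-
-; A note on the constant pool above, as the routines in this file use it:
-;   tfe               - 0xfe per byte: clears each byte's lsb so the psrlw
-;                       halving cannot leak a bit between byte lanes
-;   t80               - 0x80 per byte: xor bias mapping unsigned [0,255]
-;                       pixels onto signed [-128,127] and back
-;   t1s, t3, t4       - per-byte addends for the +3/+4 filter rounding
-;   ones              - word 1: rounding addend for the (Filter1 + 1) >> 1
-;                       outer tap
-;   s27, s18, s9, s63 - word constants (27, 18, 9 in the high byte, and 63),
-;                       presumably the taps of the wider macroblock filter
-;                       defined earlier in this file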
--- a/vp8/common/x86/loopfilter_sse2.asm
+++ /dev/null
@@ -1,1238 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-; Use of pmaxub instead of psubusb to compute the filter mask was seen
-; in ffvp8 (FFmpeg's VP8 decoder).
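-; Where psubusb-only code builds abs(a-b) as (a-b)|(b-a) with saturating
-; subtracts and compares each difference against the limit separately,
-; pmaxub folds all the differences into one running maximum first, so a
-; single subtract/compare decides the mask; roughly (hypothetical C, one
-; byte lane):
-;
-;   m    = max(abs(p3-p2), abs(p2-p1), abs(p1-p0),
-;              abs(q1-q0), abs(q2-q1), abs(q3-q2));
-;   mask = (m <= limit) && (abs(p0-q0)*2 + abs(p1-q1)/2 <= blimit);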
-
-%macro LFH_FILTER_AND_HEV_MASK 1
-%if %1
-        movdqa      xmm2,                   [rdi+2*rax]       ; q3
-        movdqa      xmm1,                   [rsi+2*rax]       ; q2
-        movdqa      xmm4,                   [rsi+rax]         ; q1
-        movdqa      xmm5,                   [rsi]             ; q0
-        neg         rax                     ; negate pitch to deal with above border
-%else
-        movlps      xmm2,                   [rsi + rcx*2]     ; q3
-        movlps      xmm1,                   [rsi + rcx]       ; q2
-        movlps      xmm4,                   [rsi]             ; q1
-        movlps      xmm5,                   [rsi + rax]       ; q0
-
-        movhps      xmm2,                   [rdi + rcx*2]
-        movhps      xmm1,                   [rdi + rcx]
-        movhps      xmm4,                   [rdi]
-        movhps      xmm5,                   [rdi + rax]
-
-        lea         rsi,                    [rsi + rax*4]
-        lea         rdi,                    [rdi + rax*4]
-
-        movdqa      XMMWORD PTR [rsp],      xmm1              ; store q2
-        movdqa      XMMWORD PTR [rsp + 16], xmm4              ; store q1
-%endif
-
-        movdqa      xmm6,                   xmm1              ; q2
-        movdqa      xmm3,                   xmm4              ; q1
-
-        psubusb     xmm1,                   xmm2              ; q2-=q3
-        psubusb     xmm2,                   xmm6              ; q3-=q2
-
-        psubusb     xmm4,                   xmm6              ; q1-=q2
-        psubusb     xmm6,                   xmm3              ; q2-=q1
-
-        por         xmm4,                   xmm6              ; abs(q2-q1)
-        por         xmm1,                   xmm2              ; abs(q3-q2)
-
-        movdqa      xmm0,                   xmm5              ; q0
-        pmaxub      xmm1,                   xmm4
-
-        psubusb     xmm5,                   xmm3              ; q0-=q1
-        psubusb     xmm3,                   xmm0              ; q1-=q0
-
-        por         xmm5,                   xmm3              ; abs(q0-q1)
-        movdqa      t0,                     xmm5              ; save to t0
-
-        pmaxub      xmm1,                   xmm5
-
-%if %1
-        movdqa      xmm2,                   [rsi+4*rax]       ; p3
-        movdqa      xmm4,                   [rdi+4*rax]       ; p2
-        movdqa      xmm6,                   [rsi+2*rax]       ; p1
-%else
-        movlps      xmm2,                   [rsi + rax]       ; p3
-        movlps      xmm4,                   [rsi]             ; p2
-        movlps      xmm6,                   [rsi + rcx]       ; p1
-
-        movhps      xmm2,                   [rdi + rax]
-        movhps      xmm4,                   [rdi]
-        movhps      xmm6,                   [rdi + rcx]
-
-        movdqa      XMMWORD PTR [rsp + 32], xmm4              ; store p2
-        movdqa      XMMWORD PTR [rsp + 48], xmm6              ; store p1
-%endif
-
-        movdqa      xmm5,                   xmm4              ; p2
-        movdqa      xmm3,                   xmm6              ; p1
-
-        psubusb     xmm4,                   xmm2              ; p2-=p3
-        psubusb     xmm2,                   xmm5              ; p3-=p2
-
-        psubusb     xmm3,                   xmm5              ; p1-=p2
-        pmaxub      xmm1,                   xmm4              ; abs(p3 - p2)
-
-        psubusb     xmm5,                   xmm6              ; p2-=p1
-        pmaxub      xmm1,                   xmm2              ; abs(p3 - p2)
-
-        pmaxub      xmm1,                   xmm5              ; abs(p2 - p1)
-        movdqa      xmm2,                   xmm6              ; p1
-
-        pmaxub      xmm1,                   xmm3              ; abs(p2 - p1)
-%if %1
-        movdqa      xmm4,                   [rsi+rax]         ; p0
-        movdqa      xmm3,                   [rdi]             ; q1
-%else
-        movlps      xmm4,                   [rsi + rcx*2]     ; p0
-        movhps      xmm4,                   [rdi + rcx*2]
-        movdqa      xmm3,                   q1                ; q1
-%endif
-
-        movdqa      xmm5,                   xmm4              ; p0
-        psubusb     xmm4,                   xmm6              ; p0-=p1
-
-        psubusb     xmm6,                   xmm5              ; p1-=p0
-
-        por         xmm6,                   xmm4              ; abs(p1 - p0)
-        mov         rdx,                    arg(2)            ; get blimit
-
-        movdqa        t1,                   xmm6              ; save to t1
-
-        movdqa      xmm4,                   xmm3              ; q1
-        pmaxub      xmm1,                   xmm6
-
-        psubusb     xmm3,                   xmm2              ; q1-=p1
-        psubusb     xmm2,                   xmm4              ; p1-=q1
-
-        psubusb     xmm1,                   xmm7              ; max(abs diffs) > limit
-        por         xmm2,                   xmm3              ; abs(p1-q1)
-
-        movdqa      xmm7,                   XMMWORD PTR [rdx] ; blimit
-
-        movdqa      xmm3,                   xmm0              ; q0
-        pand        xmm2,                   [GLOBAL(tfe)]     ; set lsb of each byte to zero
-
-        mov         rdx,                    arg(4)            ; hev get thresh
-
-        movdqa      xmm6,                   xmm5              ; p0
-        psrlw       xmm2,                   1                 ; abs(p1-q1)/2
-
-        psubusb     xmm5,                   xmm3              ; p0-=q0
-
-        psubusb     xmm3,                   xmm6              ; q0-=p0
-        por         xmm5,                   xmm3              ; abs(p0 - q0)
-
-        paddusb     xmm5,                   xmm5              ; abs(p0-q0)*2
-
-        movdqa      xmm4,                   t0                ; hev get abs (q1 - q0)
-
-        movdqa      xmm3,                   t1                ; get abs (p1 - p0)
-
-        paddusb     xmm5,                   xmm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
-        movdqa      xmm2,                   XMMWORD PTR [rdx] ; hev
-
-        psubusb     xmm5,                   xmm7              ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
-        psubusb     xmm4,                   xmm2              ; hev
-
-        psubusb     xmm3,                   xmm2              ; hev
-        por         xmm1,                   xmm5
-
-        pxor        xmm7,                   xmm7
-        paddb       xmm4,                   xmm3              ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
-
-        pcmpeqb     xmm4,                   xmm5              ; hev
-        pcmpeqb     xmm3,                   xmm3              ; hev
-
-        pcmpeqb     xmm1,                   xmm7              ; mask xmm1
-        pxor        xmm4,                   xmm3              ; hev
-%endmacro
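-
-; On exit from LFH_FILTER_AND_HEV_MASK, xmm1 holds the filter mask
-; (all-ones bytes where the combined test above passes) and xmm4 holds the
-; hev mask, valid wherever the filter mask passes; B_FILTER consumes both.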
-
-%macro B_FILTER 1
-%if %1 == 0
-        movdqa      xmm2,                   p1                ; p1
-        movdqa      xmm7,                   q1                ; q1
-%elif %1 == 1
-        movdqa      xmm2,                   [rsi+2*rax]       ; p1
-        movdqa      xmm7,                   [rdi]             ; q1
-%elif %1 == 2
-        lea         rdx,                    srct
-
-        movdqa      xmm2,                   [rdx]             ; p1
-        movdqa      xmm7,                   [rdx+48]          ; q1
-        movdqa      xmm6,                   [rdx+16]          ; p0
-        movdqa      xmm0,                   [rdx+32]          ; q0
-%endif
-
-        pxor        xmm2,                   [GLOBAL(t80)]     ; p1 offset to convert to signed values
-        pxor        xmm7,                   [GLOBAL(t80)]     ; q1 offset to convert to signed values
-
-        psubsb      xmm2,                   xmm7              ; p1 - q1
-        pxor        xmm6,                   [GLOBAL(t80)]     ; offset to convert to signed values
-
-        pand        xmm2,                   xmm4              ; high var mask (hvm)(p1 - q1)
-        pxor        xmm0,                   [GLOBAL(t80)]     ; offset to convert to signed values
-
-        movdqa      xmm3,                   xmm0              ; q0
-        psubsb      xmm0,                   xmm6              ; q0 - p0
-
-        paddsb      xmm2,                   xmm0              ; 1 * (q0 - p0) + hvm(p1 - q1)
-
-        paddsb      xmm2,                   xmm0              ; 2 * (q0 - p0) + hvm(p1 - q1)
-
-        paddsb      xmm2,                   xmm0              ; 3 * (q0 - p0) + hvm(p1 - q1)
-
-        pand        xmm1,                   xmm2              ; mask filter values we don't care about
-
-        movdqa      xmm2,                   xmm1
-
-        paddsb      xmm1,                   [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
-        paddsb      xmm2,                   [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3
-
-        punpckhbw   xmm5,                   xmm2              ; axbxcxdx
-        punpcklbw   xmm2,                   xmm2              ; exfxgxhx
-
-        punpcklbw   xmm0,                   xmm1              ; exfxgxhx
-        psraw       xmm5,                   11                ; sign extended shift right by 3
-
-        punpckhbw   xmm1,                   xmm1              ; axbxcxdx
-        psraw       xmm2,                   11                ; sign extended shift right by 3
-
-        packsswb    xmm2,                   xmm5              ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
-        psraw       xmm0,                   11                ; sign extended shift right by 3
-
-        psraw       xmm1,                   11                ; sign extended shift right by 3
-        movdqa      xmm5,                   xmm0              ; save results
-
-        packsswb    xmm0,                   xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
-        paddsw      xmm5,                   [GLOBAL(ones)]
-
-        paddsw      xmm1,                   [GLOBAL(ones)]
-        psraw       xmm5,                   1                 ; partial shifted one more time for 2nd tap
-
-        psraw       xmm1,                   1                 ; partial shifted one more time for 2nd tap
-
-        paddsb      xmm6,                   xmm2              ; p0+= p0 add
-        packsswb    xmm5,                   xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
-
-%if %1 == 0
-        movdqa      xmm1,                   p1                ; p1
-%elif %1 == 1
-        movdqa      xmm1,                   [rsi+2*rax]       ; p1
-%elif %1 == 2
-        movdqa      xmm1,                   [rdx]             ; p1
-%endif
-        pandn       xmm4,                   xmm5              ; high edge variance additive
-        pxor        xmm6,                   [GLOBAL(t80)]     ; unoffset
-
-        pxor        xmm1,                   [GLOBAL(t80)]     ; reoffset
-        psubsb      xmm3,                   xmm0              ; q0-= q0 add
-
-        paddsb      xmm1,                   xmm4              ; p1+= p1 add
-        pxor        xmm3,                   [GLOBAL(t80)]     ; unoffset
-
-        pxor        xmm1,                   [GLOBAL(t80)]     ; unoffset
-        psubsb      xmm7,                   xmm4              ; q1-= q1 add
-
-        pxor        xmm7,                   [GLOBAL(t80)]     ; unoffset
-%if %1 == 0
-        lea         rsi,                    [rsi + rcx*2]
-        lea         rdi,                    [rdi + rcx*2]
-        movq        MMWORD PTR [rsi],       xmm6              ; p0
-        movhps      MMWORD PTR [rdi],       xmm6
-        movq        MMWORD PTR [rsi + rax], xmm1              ; p1
-        movhps      MMWORD PTR [rdi + rax], xmm1
-        movq        MMWORD PTR [rsi + rcx], xmm3              ; q0
-        movhps      MMWORD PTR [rdi + rcx], xmm3
-        movq        MMWORD PTR [rsi + rcx*2],xmm7             ; q1
-        movhps      MMWORD PTR [rdi + rcx*2],xmm7
-%elif %1 == 1
-        movdqa      [rsi+rax],              xmm6              ; write back
-        movdqa      [rsi+2*rax],            xmm1              ; write back
-        movdqa      [rsi],                  xmm3              ; write back
-        movdqa      [rdi],                  xmm7              ; write back
-%endif
-
-%endmacro
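-
-; B_FILTER applies the same +4/+3 tap arithmetic as the MMX filters above,
-; in three flavours selected by %1: 0 reads p1/q1 from the stack temporaries
-; and writes the rows back split across rsi/rdi (the u and v planes), 1
-; reads and writes the rows in place, and 2 filters the transposed columns
-; parked in srct, leaving the results in registers for BV_TRANSPOSE.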
-
-
-;void vp9_loop_filter_horizontal_edge_sse2
-;(
-;    unsigned char *src_ptr,
-;    int            src_pixel_step,
-;    const char    *blimit,
-;    const char    *limit,
-;    const char    *thresh,
-;    int            count
-;)
-global sym(vp9_loop_filter_horizontal_edge_sse2)
-sym(vp9_loop_filter_horizontal_edge_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 32     ; reserve 32 bytes
-    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[16];
-    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[16];
-
-        mov         rsi,                    arg(0)           ;src_ptr
-        movsxd      rax,                    dword ptr arg(1) ;src_pixel_step
-
-        mov         rdx,                    arg(3)           ;limit
-        movdqa      xmm7,                   XMMWORD PTR [rdx]
-
-        lea         rdi,                    [rsi+rax]        ; rdi points to row +1 for indirect addressing
-
-        ; calculate breakout conditions and high edge variance
-        LFH_FILTER_AND_HEV_MASK 1
-        ; filter and write back the result
-        B_FILTER 1
-
-    add rsp, 32
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_loop_filter_horizontal_edge_uv_sse2
-;(
-;    unsigned char *src_ptr,
-;    int            src_pixel_step,
-;    const char    *blimit,
-;    const char    *limit,
-;    const char    *thresh,
-;    int            count
-;)
-global sym(vp9_loop_filter_horizontal_edge_uv_sse2)
-sym(vp9_loop_filter_horizontal_edge_uv_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 96       ; reserve 96 bytes
-    %define q2  [rsp + 0]     ;__declspec(align(16)) char q2[16];
-    %define q1  [rsp + 16]    ;__declspec(align(16)) char q1[16];
-    %define p2  [rsp + 32]    ;__declspec(align(16)) char p2[16];
-    %define p1  [rsp + 48]    ;__declspec(align(16)) char p1[16];
-    %define t0  [rsp + 64]    ;__declspec(align(16)) char t0[16];
-    %define t1  [rsp + 80]    ;__declspec(align(16)) char t1[16];
-
-        mov         rsi,                    arg(0)             ; u
-        mov         rdi,                    arg(5)             ; v
-        movsxd      rax,                    dword ptr arg(1)   ; src_pixel_step
-        mov         rcx,                    rax
-        neg         rax                     ; negate pitch to deal with above border
-
-        mov         rdx,                    arg(3)             ;limit
-        movdqa      xmm7,                   XMMWORD PTR [rdx]
-
-        lea         rsi,                    [rsi + rcx]
-        lea         rdi,                    [rdi + rcx]
-
-        ; calculate breakout conditions and high edge variance
-        LFH_FILTER_AND_HEV_MASK 0
-        ; filter and write back the result
-        B_FILTER 0
-
-    add rsp, 96
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-%macro TRANSPOSE_16X8 2
-        movq        xmm4,               QWORD PTR [rsi]        ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
-        movq        xmm1,               QWORD PTR [rdi]        ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
-        movq        xmm0,               QWORD PTR [rsi+2*rax]  ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
-        movq        xmm7,               QWORD PTR [rdi+2*rax]  ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
-        movq        xmm5,               QWORD PTR [rsi+4*rax]  ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
-        movq        xmm2,               QWORD PTR [rdi+4*rax]  ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50
-
-        punpcklbw   xmm4,               xmm1            ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
-
-        movq        xmm1,               QWORD PTR [rdi+2*rcx]  ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70
-
-        movdqa      xmm3,               xmm4            ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
-        punpcklbw   xmm0,               xmm7            ; 37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20
-
-        movq        xmm7,               QWORD PTR [rsi+2*rcx]  ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60
-
-        punpcklbw   xmm5,               xmm2            ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
-%if %1
-        lea         rsi,                [rsi+rax*8]
-%else
-        mov         rsi,                arg(5)          ; v_ptr
-%endif
-
-        movdqa      xmm6,               xmm5            ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
-        punpcklbw   xmm7,               xmm1            ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
-
-        punpcklwd   xmm5,               xmm7            ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
-
-        punpckhwd   xmm6,               xmm7            ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
-%if %1
-        lea         rdi,                [rdi+rax*8]
-%else
-        lea         rsi,                [rsi - 4]
-%endif
-
-        punpcklwd   xmm3,               xmm0            ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
-%if %1
-        lea         rdx,                srct
-%else
-        lea         rdi,                [rsi + rax]     ; rdi points to row +1 for indirect addressing
-%endif
-
-        movdqa      xmm2,               xmm3            ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
-        punpckhwd   xmm4,               xmm0            ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
-
-        movdqa      xmm7,               xmm4            ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
-        punpckhdq   xmm3,               xmm5            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
-
-        punpckhdq   xmm7,               xmm6            ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
-
-        punpckldq   xmm4,               xmm6            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
-
-        punpckldq   xmm2,               xmm5            ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
-
-        movdqa      t0,                 xmm2            ; save to free XMM2
-        movq        xmm2,               QWORD PTR [rsi]       ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
-        movq        xmm6,               QWORD PTR [rdi]       ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
-        movq        xmm0,               QWORD PTR [rsi+2*rax] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
-        movq        xmm5,               QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
-        movq        xmm1,               QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0
-
-        punpcklbw   xmm2,               xmm6            ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
-
-        movq        xmm6,               QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0
-
-        punpcklbw   xmm0,               xmm5                  ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0
-
-        movq        xmm5,               QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0
-
-        punpcklbw   xmm1,               xmm6            ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0
-
-        movq        xmm6,               QWORD PTR [rdi+2*rcx] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0
-
-        punpcklbw   xmm5,               xmm6            ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0
-
-        movdqa      xmm6,               xmm1            ;
-        punpckhwd   xmm6,               xmm5            ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4
-
-        punpcklwd   xmm1,               xmm5            ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
-        movdqa      xmm5,               xmm2            ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
-
-        punpcklwd   xmm5,               xmm0            ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
-
-        punpckhwd   xmm2,               xmm0            ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
-
-        movdqa      xmm0,               xmm5
-        punpckldq   xmm0,               xmm1            ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
-
-        punpckhdq   xmm5,               xmm1            ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
-        movdqa      xmm1,               xmm2            ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
-
-        punpckldq   xmm1,               xmm6            ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84
-
-        punpckhdq   xmm2,               xmm6            ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
-        movdqa      xmm6,               xmm7            ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
-
-        punpcklqdq  xmm6,               xmm2            ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
-
-        punpckhqdq  xmm7,               xmm2            ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07
-%if %2
-        movdqa      xmm2,               xmm3            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
-        punpcklqdq  xmm2,               xmm5            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-
-        punpckhqdq  xmm3,               xmm5            ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
-
-        movdqa      [rdx],              xmm2            ; save 2
-
-        movdqa      xmm5,               xmm4            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
-        punpcklqdq  xmm4,               xmm1            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
-
-        movdqa      [rdx+16],           xmm3            ; save 3
-
-        punpckhqdq  xmm5,               xmm1            ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
-
-        movdqa      [rdx+32],           xmm4            ; save 4
-        movdqa      [rdx+48],           xmm5            ; save 5
-        movdqa      xmm1,               t0              ; reload transposed rows 0 and 1 saved in t0
-
-        movdqa      xmm2,               xmm1            ;
-        punpckhqdq  xmm1,               xmm0            ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
-
-        punpcklqdq  xmm2,               xmm0            ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
-%else
-        movdqa      [rdx+112],          xmm7            ; save 7
-
-        movdqa      [rdx+96],           xmm6            ; save 6
-
-        movdqa      xmm2,               xmm3            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
-        punpckhqdq  xmm3,               xmm5            ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
-
-        punpcklqdq  xmm2,               xmm5            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-
-        movdqa      [rdx+32],           xmm2            ; save 2
-
-        movdqa      xmm5,               xmm4            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
-        punpcklqdq  xmm4,               xmm1            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
-
-        movdqa      [rdx+48],           xmm3            ; save 3
-
-        punpckhqdq  xmm5,               xmm1            ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
-
-        movdqa      [rdx+64],           xmm4            ; save 4
-        movdqa      [rdx+80],           xmm5            ; save 5
-        movdqa      xmm1,               t0              ; reload transposed rows 0 and 1 saved in t0
-
-        movdqa      xmm2,               xmm1
-        punpckhqdq  xmm1,               xmm0            ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
-
-        punpcklqdq  xmm2,               xmm0            ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
-
-        movdqa      [rdx+16],           xmm1
-
-        movdqa      [rdx],              xmm2
-%endif
-%endmacro
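-
-; A vertical edge runs across the SIMD lanes, so TRANSPOSE_16X8 first turns
-; a 16-pixel-tall, 8-pixel-wide strip into 8 rows of 16 pixels; the edge
-; then lies between rows and the horizontal-edge arithmetic is reused
-; unchanged.  %1 selects y (advance in place) versus u+v (switch to the v
-; plane) addressing; %2 selects whether only the four middle rows or all
-; eight are spilled to the srct scratch area.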
-
-%macro LFV_FILTER_MASK_HEV_MASK 1
-        movdqa      xmm0,               xmm6            ; q2
-        psubusb     xmm0,               xmm7            ; q2-q3
-
-        psubusb     xmm7,               xmm6            ; q3-q2
-        movdqa      xmm4,               xmm5            ; q1
-
-        por         xmm7,               xmm0            ; abs (q3-q2)
-        psubusb     xmm4,               xmm6            ; q1-q2
-
-        movdqa      xmm0,               xmm1
-        psubusb     xmm6,               xmm5            ; q2-q1
-
-        por         xmm6,               xmm4            ; abs (q2-q1)
-        psubusb     xmm0,               xmm2            ; p2 - p3;
-
-        psubusb     xmm2,               xmm1            ; p3 - p2;
-        por         xmm0,               xmm2            ; abs(p2-p3)
-%if %1
-        movdqa      xmm2,               [rdx]           ; p1
-%else
-        movdqa      xmm2,               [rdx+32]        ; p1
-%endif
-        movdqa      xmm5,               xmm2            ; p1
-        pmaxub      xmm0,               xmm7
-
-        psubusb     xmm5,               xmm1            ; p1-p2
-        psubusb     xmm1,               xmm2            ; p2-p1
-
-        movdqa      xmm7,               xmm3            ; p0
-        psubusb     xmm7,               xmm2            ; p0-p1
-
-        por         xmm1,               xmm5            ; abs(p2-p1)
-        pmaxub      xmm0,               xmm6
-
-        pmaxub      xmm0,               xmm1
-        movdqa      xmm1,               xmm2            ; p1
-
-        psubusb     xmm2,               xmm3            ; p1-p0
-        lea         rdx,                srct
-
-        por         xmm2,               xmm7            ; abs(p1-p0)
-
-        movdqa      t0,                 xmm2            ; save abs(p1-p0)
-
-        pmaxub      xmm0,               xmm2
-
-%if %1
-        movdqa      xmm5,               [rdx+32]        ; q0
-        movdqa      xmm7,               [rdx+48]        ; q1
-%else
-        movdqa      xmm5,               [rdx+64]        ; q0
-        movdqa      xmm7,               [rdx+80]        ; q1
-%endif
-        mov         rdx,                arg(3)          ; limit
-
-        movdqa      xmm6,               xmm5            ; q0
-        movdqa      xmm2,               xmm7            ; q1
-
-        psubusb     xmm5,               xmm7            ; q0-q1
-        psubusb     xmm7,               xmm6            ; q1-q0
-
-        por         xmm7,               xmm5            ; abs(q1-q0)
-
-        movdqa      t1,                 xmm7            ; save abs(q1-q0)
-
-        movdqa      xmm4,               XMMWORD PTR [rdx]; limit
-
-        pmaxub      xmm0,               xmm7
-        mov         rdx,                arg(2)          ; blimit
-
-        psubusb     xmm0,               xmm4
-        movdqa      xmm5,               xmm2            ; q1
-
-        psubusb     xmm5,               xmm1            ; q1-=p1
-        psubusb     xmm1,               xmm2            ; p1-=q1
-
-        por         xmm5,               xmm1            ; abs(p1-q1)
-        movdqa      xmm1,               xmm3            ; p0
-
-        pand        xmm5,               [GLOBAL(tfe)]   ; set lsb of each byte to zero
-        psubusb     xmm1,               xmm6            ; p0-q0
-
-        psrlw       xmm5,               1               ; abs(p1-q1)/2
-        psubusb     xmm6,               xmm3            ; q0-p0
-
-        movdqa      xmm4,               XMMWORD PTR [rdx]; blimit
-
-        mov         rdx,                arg(4)          ; get thresh
-
-        por         xmm1,               xmm6            ; abs(q0-p0)
-
-        movdqa      xmm6,               t0              ; get abs (p1 - p0)
-
-        paddusb     xmm1,               xmm1            ; abs(q0-p0)*2
-
-        movdqa      xmm3,               t1              ; get abs (q1 - q0)
-
-        movdqa      xmm7,               XMMWORD PTR [rdx]
-
-        paddusb     xmm1,               xmm5            ; abs (p0 - q0) *2 + abs(p1-q1)/2
-        psubusb     xmm6,               xmm7            ; abs(p1 - p0) > thresh
-
-        psubusb     xmm3,               xmm7            ; abs(q1 - q0) > thresh
-
-        psubusb     xmm1,               xmm4            ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
-        por         xmm6,               xmm3            ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
-
-        por         xmm1,               xmm0            ; mask
-        pcmpeqb     xmm6,               xmm0
-
-        pxor        xmm0,               xmm0
-        pcmpeqb     xmm4,               xmm4
-
-        pcmpeqb     xmm1,               xmm0
-        pxor        xmm4,               xmm6
-%endmacro
-
-%macro BV_TRANSPOSE 0
-        ; xmm1 =    f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-        ; xmm6 =    f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
-        ; xmm3 =    f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
-        ; xmm7 =    f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
-        movdqa      xmm2,               xmm1            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-        punpcklbw   xmm2,               xmm6            ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
-
-        movdqa      xmm4,               xmm3            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
-        punpckhbw   xmm1,               xmm6            ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
-
-        punpcklbw   xmm4,               xmm7            ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
-
-        punpckhbw   xmm3,               xmm7            ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
-
-        movdqa      xmm6,               xmm2            ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
-        punpcklwd   xmm2,               xmm4            ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
-
-        punpckhwd   xmm6,               xmm4            ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
-        movdqa      xmm5,               xmm1            ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
-
-        punpcklwd   xmm1,               xmm3            ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
-
-        punpckhwd   xmm5,               xmm3            ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
-        ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
-        ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
-        ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
-        ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
-%endmacro
-
-%macro BV_WRITEBACK 2
-        movd        [rsi+2],            %1
-        psrldq      %1,                 4
-
-        movd        [rdi+2],            %1
-        psrldq      %1,                 4
-
-        movd        [rsi+2*rax+2],      %1
-        psrldq      %1,                 4
-
-        movd        [rdi+2*rax+2],      %1
-
-        movd        [rsi+4*rax+2],      %2
-        psrldq      %2,                 4
-
-        movd        [rdi+4*rax+2],      %2
-        psrldq      %2,                 4
-
-        movd        [rsi+2*rcx+2],      %2
-        psrldq      %2,                 4
-
-        movd        [rdi+2*rcx+2],      %2
-%endmacro
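-
-; BV_WRITEBACK scatters the re-transposed results back as columns: each
-; movd stores four filtered pixels at x offset +2, i.e. only the p1 p0 q0 q1
-; columns of the 8-wide strip are rewritten; p3/p2 and q2/q3 are untouched.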
-
-
-;void vp9_loop_filter_vertical_edge_sse2
-;(
-;    unsigned char *src_ptr,
-;    int            src_pixel_step,
-;    const char    *blimit,
-;    const char    *limit,
-;    const char    *thresh,
-;    int            count
-;)
-global sym(vp9_loop_filter_vertical_edge_sse2)
-sym(vp9_loop_filter_vertical_edge_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub             rsp, 96      ; reserve 96 bytes
-    %define t0      [rsp + 0]    ;__declspec(align(16)) char t0[16];
-    %define t1      [rsp + 16]   ;__declspec(align(16)) char t1[16];
-    %define srct    [rsp + 32]   ;__declspec(align(16)) char srct[64];
-
-        mov         rsi,        arg(0)                  ; src_ptr
-        movsxd      rax,        dword ptr arg(1)        ; src_pixel_step
-
-        lea         rsi,        [rsi - 4]
-        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
-        lea         rcx,        [rax*2+rax]
-
-        ;transpose 16x8 to 8x16, and store the 8-line result on stack.
-        TRANSPOSE_16X8 1, 1
-
-        ; calculate filter mask and high edge variance
-        LFV_FILTER_MASK_HEV_MASK 1
-
-        ; start work on filters
-        B_FILTER 2
-
-        ; transpose and write back - only works on q1, q0, p0, p1
-        BV_TRANSPOSE
-        ; store 16-line result
-
-        lea         rdx,        [rax]
-        neg         rdx
-
-        BV_WRITEBACK xmm1, xmm5
-
-        lea         rsi,        [rsi+rdx*8]
-        lea         rdi,        [rdi+rdx*8]
-        BV_WRITEBACK xmm2, xmm6
-
-    add rsp, 96
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_loop_filter_vertical_edge_uv_sse2
-;(
-;    unsigned char *u,
-;    int            src_pixel_step,
-;    const char    *blimit,
-;    const char    *limit,
-;    const char    *thresh,
-;    unsigned char *v
-;)
-global sym(vp9_loop_filter_vertical_edge_uv_sse2)
-sym(vp9_loop_filter_vertical_edge_uv_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub             rsp, 96      ; reserve 96 bytes
-    %define t0      [rsp + 0]    ;__declspec(align(16)) char t0[16];
-    %define t1      [rsp + 16]   ;__declspec(align(16)) char t1[16];
-    %define srct    [rsp + 32]   ;__declspec(align(16)) char srct[64];
-
-        mov         rsi,        arg(0)                  ; u_ptr
-        movsxd      rax,        dword ptr arg(1)        ; src_pixel_step
-
-        lea         rsi,        [rsi - 4]
-        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
-        lea         rcx,        [rax+2*rax]
-
-        lea         rdx,        srct
-
-        ;transpose 16x8 to 8x16, and store the 8-line result on stack.
-        TRANSPOSE_16X8 0, 1
-
-        ; calculate filter mask and high edge variance
-        LFV_FILTER_MASK_HEV_MASK 1
-
-        ; start work on filters
-        B_FILTER 2
-
-        ; transpose and write back - only works on q1, q0, p0, p1
-        BV_TRANSPOSE
-
-        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
-
-        ; store 16-line result
-        BV_WRITEBACK xmm1, xmm5
-
-        mov         rsi,        arg(0)                  ; u_ptr
-        lea         rsi,        [rsi - 4]
-        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
-        BV_WRITEBACK xmm2, xmm6
-
-    add rsp, 96
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_loop_filter_simple_horizontal_edge_sse2
-;(
-;    unsigned char *src_ptr,
-;    int  src_pixel_step,
-;    const char *blimit
-;)
-global sym(vp9_loop_filter_simple_horizontal_edge_sse2)
-sym(vp9_loop_filter_simple_horizontal_edge_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 3
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi, arg(0)             ;src_ptr
-        movsxd      rax, dword ptr arg(1)   ;src_pixel_step (source stride)
-        mov         rdx, arg(2)             ;blimit
-        movdqa      xmm3, XMMWORD PTR [rdx]
-
-        mov         rdi, rsi                ; rdi points to row +1 for indirect addressing
-        add         rdi, rax
-        neg         rax
-
-        ; calculate mask
-        movdqa      xmm1, [rsi+2*rax]       ; p1
-        movdqa      xmm0, [rdi]             ; q1
-        movdqa      xmm2, xmm1
-        movdqa      xmm7, xmm0
-        movdqa      xmm4, xmm0
-        psubusb     xmm0, xmm1              ; q1-=p1
-        psubusb     xmm1, xmm4              ; p1-=q1
-        por         xmm1, xmm0              ; abs(p1-q1)
-        pand        xmm1, [GLOBAL(tfe)]     ; set lsb of each byte to zero
-        psrlw       xmm1, 1                 ; abs(p1-q1)/2
-
-        movdqa      xmm5, [rsi+rax]         ; p0
-        movdqa      xmm4, [rsi]             ; q0
-        movdqa      xmm0, xmm4              ; q0
-        movdqa      xmm6, xmm5              ; p0
-        psubusb     xmm5, xmm4              ; p0-=q0
-        psubusb     xmm4, xmm6              ; q0-=p0
-        por         xmm5, xmm4              ; abs(p0 - q0)
-        paddusb     xmm5, xmm5              ; abs(p0-q0)*2
-        paddusb     xmm5, xmm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
-        psubusb     xmm5, xmm3              ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
-        pxor        xmm3, xmm3
-        pcmpeqb     xmm5, xmm3
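-        ; xmm5 is now 0xff where abs(p0-q0)*2 + abs(p1-q1)/2 <= blimit
-        ; (filtering allowed) and 0x00 where the edge is too strong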
-
-        ; start work on filters
-        pxor        xmm2, [GLOBAL(t80)]     ; p1 offset to convert to signed values
-        pxor        xmm7, [GLOBAL(t80)]     ; q1 offset to convert to signed values
-        psubsb      xmm2, xmm7              ; p1 - q1
-
-        pxor        xmm6, [GLOBAL(t80)]     ; offset to convert to signed values
-        pxor        xmm0, [GLOBAL(t80)]     ; offset to convert to signed values
-        movdqa      xmm3, xmm0              ; q0
-        psubsb      xmm0, xmm6              ; q0 - p0
-        paddsb      xmm2, xmm0              ; p1 - q1 + 1 * (q0 - p0)
-        paddsb      xmm2, xmm0              ; p1 - q1 + 2 * (q0 - p0)
-        paddsb      xmm2, xmm0              ; p1 - q1 + 3 * (q0 - p0)
-        pand        xmm5, xmm2              ; mask filter values we don't care about
-
-        ; do + 4 side
-        paddsb      xmm5, [GLOBAL(t4)]      ; 3* (q0 - p0) + (p1 - q1) + 4
-
-        movdqa      xmm0, xmm5              ; get a copy of filters
-        psllw       xmm0, 8                 ; shift left 8
-        psraw       xmm0, 3                 ; arithmetic shift right 3
-        psrlw       xmm0, 8
-        movdqa      xmm1, xmm5              ; get a copy of filters
-        psraw       xmm1, 11                ; arithmetic shift right 11
-        psllw       xmm1, 8                 ; shift left 8 to put it back
-
-        por         xmm0, xmm1              ; put the two together to get result
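-        ; note: SSE2 has no per-byte arithmetic shift, so the signed >> 3
-        ; above is emulated on 16-bit lanes: low bytes via psllw 8 / psraw 3 /
-        ; psrlw 8, high bytes via psraw 11 / psllw 8, then OR'd back together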
-
-        psubsb      xmm3, xmm0              ; q0-= q0 add
-        pxor        xmm3, [GLOBAL(t80)]     ; unoffset
-        movdqa      [rsi], xmm3             ; write back
-
-        ; now do +3 side
-        psubsb      xmm5, [GLOBAL(t1s)]     ; +3 instead of +4
-
-        movdqa      xmm0, xmm5              ; get a copy of filters
-        psllw       xmm0, 8                 ; shift left 8
-        psraw       xmm0, 3                 ; arithmetic shift right 3
-        psrlw       xmm0, 8
-        psraw       xmm5, 11                ; arithmetic shift right 11
-        psllw       xmm5, 8                 ; shift left 8 to put it back
-        por         xmm0, xmm5              ; put the two together to get result
-
-
-        paddsb      xmm6, xmm0              ; p0+= p0 add
-        pxor        xmm6, [GLOBAL(t80)]     ; unoffset
-        movdqa      [rsi+rax], xmm6         ; write back
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_loop_filter_simple_vertical_edge_sse2
-;(
-;    unsigned char *src_ptr,
-;    int  src_pixel_step,
-;    const char *blimit
-;)
-global sym(vp9_loop_filter_simple_vertical_edge_sse2)
-sym(vp9_loop_filter_simple_vertical_edge_sse2):
-    push        rbp         ; save old base pointer value.
-    mov         rbp, rsp    ; set new base pointer value.
-    SHADOW_ARGS_TO_STACK 3
-    SAVE_XMM 7
-    GET_GOT     rbx         ; save callee-saved reg
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 32                         ; reserve 32 bytes
-    %define t0  [rsp + 0]    ;__declspec(align(16)) char t0[16];
-    %define t1  [rsp + 16]   ;__declspec(align(16)) char t1[16];
-
-        mov         rsi, arg(0) ;src_ptr
-        movsxd      rax, dword ptr arg(1) ;src_pixel_step (source stride)
-
-        lea         rsi,        [rsi - 2]
-        lea         rdi,        [rsi + rax]
-        lea         rdx,        [rsi + rax*4]
-        lea         rcx,        [rdx + rax]
-
-        movd        xmm0,       [rsi]                   ; (high 96 bits unused) 03 02 01 00
-        movd        xmm1,       [rdx]                   ; (high 96 bits unused) 43 42 41 40
-        movd        xmm2,       [rdi]                   ; 13 12 11 10
-        movd        xmm3,       [rcx]                   ; 53 52 51 50
-        punpckldq   xmm0,       xmm1                    ; (high 64 bits unused) 43 42 41 40 03 02 01 00
-        punpckldq   xmm2,       xmm3                    ; 53 52 51 50 13 12 11 10
-
-        movd        xmm4,       [rsi + rax*2]           ; 23 22 21 20
-        movd        xmm5,       [rdx + rax*2]           ; 63 62 61 60
-        movd        xmm6,       [rdi + rax*2]           ; 33 32 31 30
-        movd        xmm7,       [rcx + rax*2]           ; 73 72 71 70
-        punpckldq   xmm4,       xmm5                    ; 63 62 61 60 23 22 21 20
-        punpckldq   xmm6,       xmm7                    ; 73 72 71 70 33 32 31 30
-
-        punpcklbw   xmm0,       xmm2                    ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
-        punpcklbw   xmm4,       xmm6                    ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
-
-        movdqa      xmm1,       xmm0
-        punpcklwd   xmm0,       xmm4                    ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
-        punpckhwd   xmm1,       xmm4                    ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
-
-        movdqa      xmm2,       xmm0
-        punpckldq   xmm0,       xmm1                    ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
-        punpckhdq   xmm2,       xmm1                    ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
-
-        movdqa      t0,         xmm0                    ; save to t0
-        movdqa      t1,         xmm2                    ; save to t1
-
-        lea         rsi,        [rsi + rax*8]
-        lea         rdi,        [rsi + rax]
-        lea         rdx,        [rsi + rax*4]
-        lea         rcx,        [rdx + rax]
-
-        movd        xmm4,       [rsi]                   ; 83 82 81 80
-        movd        xmm1,       [rdx]                   ; c3 c2 c1 c0
-        movd        xmm6,       [rdi]                   ; 93 92 91 90
-        movd        xmm3,       [rcx]                   ; d3 d2 d1 d0
-        punpckldq   xmm4,       xmm1                    ; c3 c2 c1 c0 83 82 81 80
-        punpckldq   xmm6,       xmm3                    ; d3 d2 d1 d0 93 92 91 90
-
-        movd        xmm0,       [rsi + rax*2]           ; a3 a2 a1 a0
-        movd        xmm5,       [rdx + rax*2]           ; e3 e2 e1 e0
-        movd        xmm2,       [rdi + rax*2]           ; b3 b2 b1 b0
-        movd        xmm7,       [rcx + rax*2]           ; f3 f2 f1 f0
-        punpckldq   xmm0,       xmm5                    ; e3 e2 e1 e0 a3 a2 a1 a0
-        punpckldq   xmm2,       xmm7                    ; f3 f2 f1 f0 b3 b2 b1 b0
-
-        punpcklbw   xmm4,       xmm6                    ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
-        punpcklbw   xmm0,       xmm2                    ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0
-
-        movdqa      xmm1,       xmm4
-        punpcklwd   xmm4,       xmm0                    ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
-        punpckhwd   xmm1,       xmm0                    ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
-
-        movdqa      xmm6,       xmm4
-        punpckldq   xmm4,       xmm1                    ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
-        punpckhdq   xmm6,       xmm1                    ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
-
-        movdqa      xmm0,       t0                      ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
-        movdqa      xmm2,       t1                      ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
-        movdqa      xmm1,       xmm0
-        movdqa      xmm3,       xmm2
-
-        punpcklqdq  xmm0,       xmm4                    ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
-        punpckhqdq  xmm1,       xmm4                    ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
-        punpcklqdq  xmm2,       xmm6                    ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-        punpckhqdq  xmm3,       xmm6                    ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
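-        ; after this transpose each register holds one tap position
-        ; (p1, p0, q0, q1) for all 16 rows, so the vertical edge is
-        ; filtered with the same column-wise math as a horizontal one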
-
-        ; calculate mask
-        movdqa      xmm6,       xmm0                            ; p1
-        movdqa      xmm7,       xmm3                            ; q1
-        psubusb     xmm7,       xmm0                            ; q1-=p1
-        psubusb     xmm6,       xmm3                            ; p1-=q1
-        por         xmm6,       xmm7                            ; abs(p1-q1)
-        pand        xmm6,       [GLOBAL(tfe)]                   ; set lsb of each byte to zero
-        psrlw       xmm6,       1                               ; abs(p1-q1)/2
-
-        movdqa      xmm5,       xmm1                            ; p0
-        movdqa      xmm4,       xmm2                            ; q0
-        psubusb     xmm5,       xmm2                            ; p0-=q0
-        psubusb     xmm4,       xmm1                            ; q0-=p0
-        por         xmm5,       xmm4                            ; abs(p0 - q0)
-        paddusb     xmm5,       xmm5                            ; abs(p0-q0)*2
-        paddusb     xmm5,       xmm6                            ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
-        mov         rdx,        arg(2)                          ;blimit
-        movdqa      xmm7, XMMWORD PTR [rdx]
-
-        psubusb     xmm5,        xmm7                           ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
-        pxor        xmm7,        xmm7
-        pcmpeqb     xmm5,        xmm7                           ; xmm5 = mask
-
-        ; start work on filters
-        movdqa        t0,        xmm0
-        movdqa        t1,        xmm3
-
-        pxor        xmm0,        [GLOBAL(t80)]                  ; p1 offset to convert to signed values
-        pxor        xmm3,        [GLOBAL(t80)]                  ; q1 offset to convert to signed values
-
-        psubsb      xmm0,        xmm3                           ; p1 - q1
-        movdqa      xmm6,        xmm1                           ; p0
-
-        movdqa      xmm7,        xmm2                           ; q0
-        pxor        xmm6,        [GLOBAL(t80)]                  ; offset to convert to signed values
-
-        pxor        xmm7,        [GLOBAL(t80)]                  ; offset to convert to signed values
-        movdqa      xmm3,        xmm7                           ; signed-domain copy of q0
-
-        psubsb      xmm7,        xmm6                           ; q0 - p0
-        paddsb      xmm0,        xmm7                           ; p1 - q1 + 1 * (q0 - p0)
-
-        paddsb      xmm0,        xmm7                           ; p1 - q1 + 2 * (q0 - p0)
-        paddsb      xmm0,        xmm7                           ; p1 - q1 + 3 * (q0 - p0)
-
-        pand        xmm5,        xmm0                           ; mask filter values we don't care about
-
-
-        paddsb      xmm5,        [GLOBAL(t4)]                   ;  3* (q0 - p0) + (p1 - q1) + 4
-
-        movdqa      xmm0,        xmm5                           ; get a copy of filters
-        psllw       xmm0,        8                              ; shift left 8
-
-        psraw       xmm0,        3                              ; arithmetic shift right 3
-        psrlw       xmm0,        8
-
-        movdqa      xmm7,        xmm5                           ; get a copy of filters
-        psraw       xmm7,        11                             ; arithmetic shift right 11
-
-        psllw       xmm7,        8                              ; shift left 8 to put it back
-        por         xmm0,        xmm7                           ; put the two together to get result
-
-        psubsb      xmm3,        xmm0                           ; q0 -= filter value (signed domain)
-        pxor        xmm3,        [GLOBAL(t80)]                  ; unoffset   q0
-
-        ; now do +3 side
-        psubsb      xmm5,        [GLOBAL(t1s)]                  ; +3 instead of +4
-        movdqa      xmm0,        xmm5                           ; get a copy of filters
-
-        psllw       xmm0,        8                              ; shift left 8
-        psraw       xmm0,        3                              ; arithmetic shift right 3
-
-        psrlw       xmm0,        8
-        psraw       xmm5,        11                             ; arithmetic shift right 11
-
-        psllw       xmm5,        8                              ; shift left 8 to put it back
-        por         xmm0,        xmm5                           ; put the two together to get result
-
-        paddsb      xmm6,        xmm0                           ; p0+= p0 add
-        pxor        xmm6,        [GLOBAL(t80)]                  ; unoffset   p0
-
-        movdqa      xmm0,        t0                             ; p1
-        movdqa      xmm4,        t1                             ; q1
-
-        ; transpose back to write out
-        ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
-        ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
-        ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-        ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
-        movdqa      xmm1,       xmm0
-        punpcklbw   xmm0,       xmm6                               ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
-        punpckhbw   xmm1,       xmm6                               ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
-
-        movdqa      xmm5,       xmm3
-        punpcklbw   xmm3,       xmm4                               ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
-        punpckhbw   xmm5,       xmm4                               ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
-
-        movdqa      xmm2,       xmm0
-        punpcklwd   xmm0,       xmm3                               ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
-        punpckhwd   xmm2,       xmm3                               ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
-
-        movdqa      xmm3,       xmm1
-        punpcklwd   xmm1,       xmm5                               ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
-        punpckhwd   xmm3,       xmm5                               ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
-
-        ; write out order: xmm0 xmm2 xmm1 xmm3
-        lea         rdx,        [rsi + rax*4]
-
-        movd        [rsi],      xmm1                               ; write the second 8-line result
-        psrldq      xmm1,       4
-        movd        [rdi],      xmm1
-        psrldq      xmm1,       4
-        movd        [rsi + rax*2], xmm1
-        psrldq      xmm1,       4
-        movd        [rdi + rax*2], xmm1
-
-        movd        [rdx],      xmm3
-        psrldq      xmm3,       4
-        movd        [rcx],      xmm3
-        psrldq      xmm3,       4
-        movd        [rdx + rax*2], xmm3
-        psrldq      xmm3,       4
-        movd        [rcx + rax*2], xmm3
-
-        neg         rax
-        lea         rsi,        [rsi + rax*8]
-        neg         rax
-        lea         rdi,        [rsi + rax]
-        lea         rdx,        [rsi + rax*4]
-        lea         rcx,        [rdx + rax]
-
-        movd        [rsi],      xmm0                                ; write the first 8-line result
-        psrldq      xmm0,       4
-        movd        [rdi],      xmm0
-        psrldq      xmm0,       4
-        movd        [rsi + rax*2], xmm0
-        psrldq      xmm0,       4
-        movd        [rdi + rax*2], xmm0
-
-        movd        [rdx],      xmm2
-        psrldq      xmm2,       4
-        movd        [rcx],      xmm2
-        psrldq      xmm2,       4
-        movd        [rdx + rax*2], xmm2
-        psrldq      xmm2,       4
-        movd        [rcx + rax*2], xmm2
-
-    add rsp, 32
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-tfe:
-    times 16 db 0xfe
-align 16
-t80:
-    times 16 db 0x80
-align 16
-t1s:
-    times 16 db 0x01
-align 16
-t3:
-    times 16 db 0x03
-align 16
-t4:
-    times 16 db 0x04
-align 16
-ones:
-    times 8 dw 0x0001
-align 16
-s9:
-    times 8 dw 0x0900
-align 16
-s63:
-    times 8 dw 0x003f
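
For reference, the "simple" loop filter implemented by the two simple routines
above reads more easily in scalar form. Below is a minimal C model of one
pixel (names are illustrative and not part of the renamed files; the asm
additionally saturates each intermediate add, which this sketch folds into a
single clamp):

    #include <stdlib.h>

    static signed char clamp_s8(int v) {
      return (signed char)(v < -128 ? -128 : v > 127 ? 127 : v);
    }

    /* p1,p0 | q0,q1 straddle the edge; blimit is the edge-strength limit. */
    static void simple_filter_scalar(unsigned char *p1, unsigned char *p0,
                                     unsigned char *q0, unsigned char *q1,
                                     int blimit) {
      if (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit)
        return;                           /* mask: edge too strong, skip it */
      {
        /* XOR with 0x80 biases the unsigned bytes into the signed domain */
        const signed char ps1 = (signed char)(*p1 ^ 0x80);
        const signed char ps0 = (signed char)(*p0 ^ 0x80);
        const signed char qs0 = (signed char)(*q0 ^ 0x80);
        const signed char qs1 = (signed char)(*q1 ^ 0x80);
        const int a = clamp_s8(clamp_s8(ps1 - qs1) + 3 * (qs0 - ps0));
        const signed char f1 = (signed char)(clamp_s8(a + 4) >> 3); /* +4 side */
        const signed char f2 = (signed char)(clamp_s8(a + 3) >> 3); /* +3 side */
        *q0 = (unsigned char)(clamp_s8(qs0 - f1) ^ 0x80);
        *p0 = (unsigned char)(clamp_s8(ps0 + f2) ^ 0x80);
      }
    }
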
--- a/vp8/common/x86/loopfilter_x86.c
+++ /dev/null
@@ -1,543 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <emmintrin.h>  // SSE2
-#include "vpx_config.h"
-#include "vp8/common/loopfilter.h"
-
-prototype_loopfilter(vp9_loop_filter_vertical_edge_mmx);
-prototype_loopfilter(vp9_loop_filter_horizontal_edge_mmx);
-
-prototype_loopfilter(vp9_loop_filter_vertical_edge_sse2);
-prototype_loopfilter(vp9_loop_filter_horizontal_edge_sse2);
-
-extern loop_filter_uvfunction vp9_loop_filter_horizontal_edge_uv_sse2;
-extern loop_filter_uvfunction vp9_loop_filter_vertical_edge_uv_sse2;
-
-#if HAVE_MMX
-/* Horizontal MB filtering */
-void vp9_loop_filter_mbh_mmx(unsigned char *y_ptr,
-                             unsigned char *u_ptr, unsigned char *v_ptr,
-                             int y_stride, int uv_stride,
-                             struct loop_filter_info *lfi) {
-}
-
-/* Vertical MB Filtering */
-void vp9_loop_filter_mbv_mmx(unsigned char *y_ptr,
-                             unsigned char *u_ptr, unsigned char *v_ptr,
-                             int y_stride, int uv_stride,
-                             struct loop_filter_info *lfi) {
-}
-
-/* Horizontal B Filtering */
-void vp9_loop_filter_bh_mmx(unsigned char *y_ptr,
-                            unsigned char *u_ptr, unsigned char *v_ptr,
-                            int y_stride, int uv_stride,
-                            struct loop_filter_info *lfi) {
-
-}
-
-void vp9_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride,
-                             const unsigned char *blimit) {
-  vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride,
-                                             y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride,
-                                             y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride,
-                                             y_stride, blimit);
-}
-
-/* Vertical B Filtering */
-void vp9_loop_filter_bv_mmx(unsigned char *y_ptr,
-                            unsigned char *u_ptr, unsigned char *v_ptr,
-                            int y_stride, int uv_stride,
-                            struct loop_filter_info *lfi) {
-  vp9_loop_filter_vertical_edge_mmx(y_ptr + 4, y_stride,
-                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);
-  vp9_loop_filter_vertical_edge_mmx(y_ptr + 8, y_stride,
-                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);
-  vp9_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride,
-                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp9_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride,
-                                      lfi->blim, lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp9_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride,
-                                      lfi->blim, lfi->lim, lfi->hev_thr, 1);
-}
-
-void vp9_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride,
-                             const unsigned char *blimit) {
-  vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, blimit);
-  vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, blimit);
-  vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, blimit);
-}
-#endif
-
-#if HAVE_SSE2
-void vp9_mbloop_filter_horizontal_edge_c_sse2(unsigned char *s,
-                                              int p,
-                                              const unsigned char *_blimit,
-                                              const unsigned char *_limit,
-                                              const unsigned char *_thresh,
-                                              int count) {
-  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
-  __m128i mask, hev, flat;
-  __m128i thresh, limit, blimit;
-  const __m128i zero = _mm_set1_epi16(0);
-  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
-
-  thresh = _mm_shuffle_epi32(_mm_cvtsi32_si128(_thresh[0] * 0x01010101), 0);
-  limit = _mm_shuffle_epi32(_mm_cvtsi32_si128(_limit[0] * 0x01010101), 0);
-  blimit = _mm_shuffle_epi32(_mm_cvtsi32_si128(_blimit[0] * 0x01010101), 0);
-
-  p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
-  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
-  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
-  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
-  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
-  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
-  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
-  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
-  q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
-  {
-    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
-                                          _mm_subs_epu8(p0, p1));
-    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
-                                          _mm_subs_epu8(q0, q1));
-    const __m128i one = _mm_set1_epi8(1);
-    const __m128i fe = _mm_set1_epi8(0xfe);
-    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
-    __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
-                                    _mm_subs_epu8(q0, p0));
-    __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
-                                    _mm_subs_epu8(q1, p1));
-    __m128i work;
-    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, thresh);
-    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
-
-    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
-    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
-    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
-    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-    mask = _mm_max_epu8(flat, mask);
-    // mask |= (abs(p1 - p0) > limit) * -1;
-    // mask |= (abs(q1 - q0) > limit) * -1;
-    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
-                                     _mm_subs_epu8(p1, p2)),
-                         _mm_or_si128(_mm_subs_epu8(p3, p2),
-                                      _mm_subs_epu8(p2, p3)));
-    mask = _mm_max_epu8(work, mask);
-    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
-                                     _mm_subs_epu8(q1, q2)),
-                         _mm_or_si128(_mm_subs_epu8(q3, q2),
-                                      _mm_subs_epu8(q2, q3)));
-    mask = _mm_max_epu8(work, mask);
-    mask = _mm_subs_epu8(mask, limit);
-    mask = _mm_cmpeq_epi8(mask, zero);
-
-    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
-                                     _mm_subs_epu8(p0, p2)),
-                         _mm_or_si128(_mm_subs_epu8(q2, q0),
-                                      _mm_subs_epu8(q0, q2)));
-    flat = _mm_max_epu8(work, flat);
-    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
-                                     _mm_subs_epu8(p0, p3)),
-                         _mm_or_si128(_mm_subs_epu8(q3, q0),
-                                      _mm_subs_epu8(q0, q3)));
-    flat = _mm_max_epu8(work, flat);
-    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
-                                     _mm_subs_epu8(p0, p4)),
-                         _mm_or_si128(_mm_subs_epu8(q4, q0),
-                                      _mm_subs_epu8(q0, q4)));
-    flat = _mm_max_epu8(work, flat);
-    flat = _mm_subs_epu8(flat, one);
-    flat = _mm_cmpeq_epi8(flat, zero);
-    flat = _mm_and_si128(flat, mask);
-  }
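-  // At this point: mask selects pixels where any filtering is allowed, hev
-  // flags high edge variance (|p1-p0| or |q1-q0| above thresh), and flat
-  // flags smooth neighborhoods (all of |p2..p4 - p0| and |q2..q4 - q0| <= 1)
-  // that will take the wide averaged outputs computed next.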
-  {
-    const __m128i four = _mm_set1_epi16(4);
-    unsigned char *src = s;
-    int i = 0;
-    do {
-      __m128i workp_a, workp_b, workp_shft;
-      p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 5 * p)), zero);
-      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
-      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
-      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
-      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
-      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
-      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
-      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
-      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
-      q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 4 * p)), zero);
-
-      workp_a = _mm_add_epi16(_mm_add_epi16(p4, p3), _mm_add_epi16(p2, p1));
-      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
-      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p4);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_op2[i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_op1[i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p4), q2);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_op0[i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_oq0[i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q4);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_oq1[i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q4);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_oq2[i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      src += 8;
-    } while (++i < count);
-  }
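-  // Each flat_* row above is a rounded weighted average of eight sample
-  // terms, (sum + 4) >> 3; the window slides from op2 to oq2 by subtracting
-  // the term that leaves and adding the one that enters (the workp_a /
-  // workp_b updates), instead of recomputing the full sum.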
-  // lp filter
-  {
-    const __m128i t4 = _mm_set1_epi8(4);
-    const __m128i t3 = _mm_set1_epi8(3);
-    const __m128i t80 = _mm_set1_epi8(0x80);
-    const __m128i te0 = _mm_set1_epi8(0xe0);
-    const __m128i t1f = _mm_set1_epi8(0x1f);
-    const __m128i t1 = _mm_set1_epi8(0x1);
-    const __m128i t7f = _mm_set1_epi8(0x7f);
-
-    const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
-                                      t80);
-    const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
-                                      t80);
-    const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
-                                      t80);
-    const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
-                                      t80);
-    __m128i filt;
-    __m128i work_a;
-    __m128i filter1, filter2;
-
-    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
-    work_a = _mm_subs_epi8(qs0, ps0);
-    filt = _mm_adds_epi8(filt, work_a);
-    filt = _mm_adds_epi8(filt, work_a);
-    filt = _mm_adds_epi8(filt, work_a);
-    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
-    filt = _mm_and_si128(filt, mask);
-
-    filter1 = _mm_adds_epi8(filt, t4);
-    filter2 = _mm_adds_epi8(filt, t3);
-
-    /* Filter1 >> 3 */
-    work_a = _mm_cmpgt_epi8(zero, filter1);
-    filter1 = _mm_srli_epi16(filter1, 3);
-    work_a = _mm_and_si128(work_a, te0);
-    filter1 = _mm_and_si128(filter1, t1f);
-    filter1 = _mm_or_si128(filter1, work_a);
-
-    /* Filter2 >> 3 */
-    work_a = _mm_cmpgt_epi8(zero, filter2);
-    filter2 = _mm_srli_epi16(filter2, 3);
-    work_a = _mm_and_si128(work_a, te0);
-    filter2 = _mm_and_si128(filter2, t1f);
-    filter2 = _mm_or_si128(filter2, work_a);
-
-    /* filt >> 1 */
-    filt = _mm_adds_epi8(filter1, t1);
-    work_a = _mm_cmpgt_epi8(zero, filt);
-    filt = _mm_srli_epi16(filt, 1);
-    work_a = _mm_and_si128(work_a, t80);
-    filt = _mm_and_si128(filt, t7f);
-    filt = _mm_or_si128(filt, work_a);
-
-    filt = _mm_andnot_si128(hev, filt);
-
-    work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
-    q0 = _mm_load_si128((__m128i *)flat_oq0);
-    work_a = _mm_andnot_si128(flat, work_a);
-    q0 = _mm_and_si128(flat, q0);
-    q0 = _mm_or_si128(work_a, q0);
-
-    work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
-    q1 = _mm_load_si128((__m128i *)flat_oq1);
-    work_a = _mm_andnot_si128(flat, work_a);
-    q1 = _mm_and_si128(flat, q1);
-    q1 = _mm_or_si128(work_a, q1);
-
-    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
-    q2 = _mm_load_si128((__m128i *)flat_oq2);
-    work_a = _mm_andnot_si128(flat, work_a);
-    q2 = _mm_and_si128(flat, q2);
-    q2 = _mm_or_si128(work_a, q2);
-
-    work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
-    p0 = _mm_load_si128((__m128i *)flat_op0);
-    work_a = _mm_andnot_si128(flat, work_a);
-    p0 = _mm_and_si128(flat, p0);
-    p0 = _mm_or_si128(work_a, p0);
-
-    work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
-    p1 = _mm_load_si128((__m128i *)flat_op1);
-    work_a = _mm_andnot_si128(flat, work_a);
-    p1 = _mm_and_si128(flat, p1);
-    p1 = _mm_or_si128(work_a, p1);
-
-    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
-    p2 = _mm_load_si128((__m128i *)flat_op2);
-    work_a = _mm_andnot_si128(flat, work_a);
-    p2 = _mm_and_si128(flat, p2);
-    p2 = _mm_or_si128(work_a, p2);
-
-    if (count == 1) {
-      _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
-      _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
-      _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
-      _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
-      _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
-      _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
-    } else {
-      _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
-      _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
-      _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
-      _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
-      _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
-      _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
-    }
-  }
-}
-
-static __inline void transpose(unsigned char *src[], int in_p,
-                               unsigned char *dst[], int out_p,
-                               int num_8x8_to_transpose) {
-  int idx8x8 = 0;
-  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
-  do {
-    unsigned char *in = src[idx8x8];
-    unsigned char *out = dst[idx8x8];
-
-    x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p));  // 00 01 02 03 04 05 06 07
-    x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p));  // 10 11 12 13 14 15 16 17
-    x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p));  // 20 21 22 23 24 25 26 27
-    x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p));  // 30 31 32 33 34 35 36 37
-    x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p));  // 40 41 42 43 44 45 46 47
-    x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p));  // 50 51 52 53 54 55 56 57
-    x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p));  // 60 61 62 63 64 65 66 67
-    x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p));  // 70 71 72 73 74 75 76 77
-    // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
-    x0 = _mm_unpacklo_epi8(x0, x1);
-    // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
-    x1 = _mm_unpacklo_epi8(x2, x3);
-    // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
-    x2 = _mm_unpacklo_epi8(x4, x5);
-    // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
-    x3 = _mm_unpacklo_epi8(x6, x7);
-    // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
-    x4 = _mm_unpacklo_epi16(x0, x1);
-    // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
-    x5 = _mm_unpacklo_epi16(x2, x3);
-    // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
-    x6 = _mm_unpacklo_epi32(x4, x5);
-    // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
-    x7 = _mm_unpackhi_epi32(x4, x5);
-
-    _mm_storel_pd((double *)(out + 0*out_p),
-                  _mm_castsi128_pd(x6));  // 00 10 20 30 40 50 60 70
-    _mm_storeh_pd((double *)(out + 1*out_p),
-                  _mm_castsi128_pd(x6));  // 01 11 21 31 41 51 61 71
-    _mm_storel_pd((double *)(out + 2*out_p),
-                  _mm_castsi128_pd(x7));  // 02 12 22 32 42 52 62 72
-    _mm_storeh_pd((double *)(out + 3*out_p),
-                  _mm_castsi128_pd(x7));  // 03 13 23 33 43 53 63 73
-
-    // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
-    x4 = _mm_unpackhi_epi16(x0, x1);
-    // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
-    x5 = _mm_unpackhi_epi16(x2, x3);
-    // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
-    x6 = _mm_unpacklo_epi32(x4, x5);
-    // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
-    x7 = _mm_unpackhi_epi32(x4, x5);
-
-    _mm_storel_pd((double *)(out + 4*out_p),
-                  _mm_castsi128_pd(x6));  // 04 14 24 34 44 54 64 74
-    _mm_storeh_pd((double *)(out + 5*out_p),
-                  _mm_castsi128_pd(x6));  // 05 15 25 35 45 55 65 75
-    _mm_storel_pd((double *)(out + 6*out_p),
-                  _mm_castsi128_pd(x7));  // 06 16 26 36 46 56 66 76
-    _mm_storeh_pd((double *)(out + 7*out_p),
-                  _mm_castsi128_pd(x7));  // 07 17 27 37 47 57 67 77
-  } while (++idx8x8 < num_8x8_to_transpose);
-}
-
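-// Note: vertical edges reuse the horizontal kernel: the pixel neighborhood
-// is transposed into a scratch buffer with transpose() above, filtered as a
-// horizontal edge, then the modified rows are transposed back into place.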
-void vp9_mbloop_filter_vertical_edge_c_sse2(unsigned char *s,
-                                            int p,
-                                            const unsigned char *blimit,
-                                            const unsigned char *limit,
-                                            const unsigned char *thresh,
-                                            int count) {
-  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 16]);
-  unsigned char *src[4];
-  unsigned char *dst[4];
-
-  src[0] = s - 5;
-  src[1] = s - 5 + 8;
-  src[2] = s - 5 + p*8;
-  src[3] = s - 5 + p*8 + 8;
-
-  dst[0] = t_dst;
-  dst[1] = t_dst + 16*8;
-  dst[2] = t_dst + 8;
-  dst[3] = t_dst + 16*8 + 8;
-
-  // 16x16->16x16 or 16x8->8x16
-  transpose(src, p, dst, 16, (1 << count));
-
-  vp9_mbloop_filter_horizontal_edge_c_sse2(t_dst + 5*16, 16, blimit, limit,
-                                           thresh, count);
-
-  dst[0] = s - 5;
-  dst[1] = s - 5 + p*8;
-
-  src[0] = t_dst;
-  src[1] = t_dst + 8;
-
-  // 16x8->8x16 or 8x8->8x8
-  transpose(src, 16, dst, p, (1 << (count - 1)));
-}
-
-/* Horizontal MB filtering */
-void vp9_loop_filter_mbh_sse2(unsigned char *y_ptr,
-                              unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride,
-                              struct loop_filter_info *lfi) {
-  vp9_mbloop_filter_horizontal_edge_c_sse2(y_ptr, y_stride, lfi->mblim,
-                                           lfi->lim, lfi->hev_thr, 2);
-
-  /* TODO: write sse2 version with u,v interleaved */
-  if (u_ptr)
-    vp9_mbloop_filter_horizontal_edge_c_sse2(u_ptr, uv_stride, lfi->mblim,
-                                             lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp9_mbloop_filter_horizontal_edge_c_sse2(v_ptr, uv_stride, lfi->mblim,
-                                             lfi->lim, lfi->hev_thr, 1);
-}
-
-void vp9_loop_filter_bh8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
-                             unsigned char *v_ptr, int y_stride, int uv_stride,
-                             struct loop_filter_info *lfi) {
-  vp9_mbloop_filter_horizontal_edge_c_sse2(
-    y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-}
-
-/* Vertical MB Filtering */
-void vp9_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
-                              unsigned char *v_ptr, int y_stride, int uv_stride,
-                              struct loop_filter_info *lfi) {
-  vp9_mbloop_filter_vertical_edge_c_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim,
-                                         lfi->hev_thr, 2);
-
-  /* TODO: write sse2 version with u,v interleaved */
-  if (u_ptr)
-    vp9_mbloop_filter_vertical_edge_c_sse2(u_ptr, uv_stride, lfi->mblim,
-                                           lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp9_mbloop_filter_vertical_edge_c_sse2(v_ptr, uv_stride, lfi->mblim,
-                                           lfi->lim, lfi->hev_thr, 1);
-}
-
-void vp9_loop_filter_bv8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
-                             unsigned char *v_ptr, int y_stride, int uv_stride,
-                             struct loop_filter_info *lfi) {
-  vp9_mbloop_filter_vertical_edge_c_sse2(
-    y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-}
-
-/* Horizontal B Filtering */
-void vp9_loop_filter_bh_sse2(unsigned char *y_ptr,
-                             unsigned char *u_ptr, unsigned char *v_ptr,
-                             int y_stride, int uv_stride,
-                             struct loop_filter_info *lfi) {
-  vp9_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride,
-                                       lfi->blim, lfi->lim, lfi->hev_thr, 2);
-  vp9_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride,
-                                       lfi->blim, lfi->lim, lfi->hev_thr, 2);
-  vp9_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride,
-                                       lfi->blim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp9_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride,
-                                            lfi->blim, lfi->lim, lfi->hev_thr,
-                                            v_ptr + 4 * uv_stride);
-}
-
-void vp9_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride,
-                              const unsigned char *blimit) {
-  vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride,
-                                              y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride,
-                                              y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride,
-                                              y_stride, blimit);
-}
-
-/* Vertical B Filtering */
-void vp9_loop_filter_bv_sse2(unsigned char *y_ptr,
-                             unsigned char *u_ptr, unsigned char *v_ptr,
-                             int y_stride, int uv_stride,
-                             struct loop_filter_info *lfi) {
-  vp9_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride,
-                                     lfi->blim, lfi->lim, lfi->hev_thr, 2);
-  vp9_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride,
-                                     lfi->blim, lfi->lim, lfi->hev_thr, 2);
-  vp9_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride,
-                                     lfi->blim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp9_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride,
-                                          lfi->blim, lfi->lim, lfi->hev_thr,
-                                          v_ptr + 4);
-}
-
-void vp9_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride,
-                              const unsigned char *blimit) {
-  vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, blimit);
-  vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, blimit);
-  vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, blimit);
-}
-
-#endif
--- a/vp8/common/x86/loopfilter_x86.h
+++ /dev/null
@@ -1,43 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef LOOPFILTER_X86_H
-#define LOOPFILTER_X86_H
-
-/* Note:
- *
- * This platform is commonly built for runtime CPU detection. If you modify
- * any of the function mappings present in this file, be sure to also update
- * them in the function pointer initialization code
- */
-
-#if HAVE_MMX
-extern prototype_loopfilter_block(vp9_loop_filter_mbv_mmx);
-extern prototype_loopfilter_block(vp9_loop_filter_bv_mmx);
-extern prototype_loopfilter_block(vp9_loop_filter_mbh_mmx);
-extern prototype_loopfilter_block(vp9_loop_filter_bh_mmx);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_mmx);
-extern prototype_simple_loopfilter(vp9_loop_filter_bvs_mmx);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_mmx);
-extern prototype_simple_loopfilter(vp9_loop_filter_bhs_mmx);
-#endif
-
-#if HAVE_SSE2
-extern prototype_loopfilter_block(vp9_loop_filter_mbv_sse2);
-extern prototype_loopfilter_block(vp9_loop_filter_bv_sse2);
-extern prototype_loopfilter_block(vp9_loop_filter_mbh_sse2);
-extern prototype_loopfilter_block(vp9_loop_filter_bh_sse2);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_sse2);
-extern prototype_simple_loopfilter(vp9_loop_filter_bvs_sse2);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_sse2);
-extern prototype_simple_loopfilter(vp9_loop_filter_bhs_sse2);
-#endif
-
-#endif  // LOOPFILTER_X86_H
--- a/vp8/common/x86/mask_sse3.asm
+++ /dev/null
@@ -1,484 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp8_makemask_sse3(
-;    unsigned char *y,
-;    unsigned char *u,
-;    unsigned char *v,
-;    unsigned char *ym,
-;    unsigned char *uvm,
-;    int yp,
-;    int uvp,
-;    int ys,
-;    int us,
-;    int vs,
-;    int yt,
-;    int ut,
-;    int vt)
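-;
-; Note: this appears to build a per-pixel color-key mask: an output byte is
-; 0xff where the source pixel is within the given tolerances of the center
-; color (|Y - ys| < yt, |U - us| < ut, |V - vs| < vt) and 0x00 elsewhere,
-; with each u/v sample gating a 2x2 block of y pixels (4:2:0).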
-global sym(vp8_makemask_sse3)
-sym(vp8_makemask_sse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 14
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;y
-        mov             rdi,        arg(1) ;u
-        mov             rcx,        arg(2) ;v
-        mov             rax,        arg(3) ;ym
-        movsxd          rbx,        dword arg(4) ;yp
-        movsxd          rdx,        dword arg(5) ;uvp
-
-        pxor            xmm0,xmm0
-
-        ;make 16 copies of the center y value
-        movd            xmm1, arg(6)
-        pshufb          xmm1, xmm0
-
-        ; make 16 copies of the center u value
-        movd            xmm2, arg(7)
-        pshufb          xmm2, xmm0
-
-        ; make 16 copies of the center v value
-        movd            xmm3, arg(8)
-        pshufb          xmm3, xmm0
-        unpcklpd        xmm2, xmm3
-
-        ;make 16 copies of the y tolerance
-        movd            xmm3, arg(9)
-        pshufb          xmm3, xmm0
-
-        ;make 16 copies of the u tolerance
-        movd            xmm4, arg(10)
-        pshufb          xmm4, xmm0
-
-        ;make 16 copies of the v tolerance
-        movd            xmm5, arg(11)
-        pshufb          xmm5, xmm0
-        unpckhpd        xmm4, xmm5
-
-        mov             r8,8
-
-NextPairOfRows:
-
-        ;grab the y source values
-        movdqu          xmm0, [rsi]
-
-        ;compute abs difference between source and y target
-        movdqa          xmm6, xmm1
-        movdqa          xmm7, xmm0
-        psubusb         xmm0, xmm1
-        psubusb         xmm6, xmm7
-        por             xmm0, xmm6
-
-        ;check whether the abs difference is < the y tolerance
-        movdqa          xmm6, xmm3
-        pcmpgtb         xmm6, xmm0
-
-        ;grab the y source values
-        add             rsi, rbx
-        movdqu          xmm0, [rsi]
-
-        ;compute abs difference between source and y target
-        movdqa          xmm11, xmm1
-        movdqa          xmm7, xmm0
-        psubusb         xmm0, xmm1
-        psubusb         xmm11, xmm7
-        por             xmm0, xmm11
-
-        ;check whether the abs difference is < the y tolerance
-        movdqa          xmm11, xmm3
-        pcmpgtb         xmm11, xmm0
-
-
-        ;grab the u and v source values
-        movdqu          xmm7, [rdi]
-        movdqu          xmm8, [rcx]
-        unpcklpd        xmm7, xmm8
-
-        ;compute abs difference between source and uv targets
-        movdqa          xmm9, xmm2
-        movdqa          xmm10, xmm7
-        psubusb         xmm7, xmm2
-        psubusb         xmm9, xmm10
-        por             xmm7, xmm9
-
-        ;check whether the number is < tolerance
-        movdqa          xmm0, xmm4
-        pcmpgtb         xmm0, xmm7
-
-        ;double  u and v masks
-        movdqa          xmm8, xmm0
-        punpckhbw       xmm0, xmm0
-        punpcklbw       xmm8, xmm8
-
-        ;mask row 0 and output
-        pand            xmm6, xmm8
-        pand            xmm6, xmm0
-        movdqa          [rax],xmm6
-
-        ;mask row 1 and output
-        pand            xmm11, xmm8
-        pand            xmm11, xmm0
-        movdqa          [rax+16],xmm11
-
-
-        ; to the next row or set of rows
-        add             rsi, rbx
-        add             rdi, rdx
-        add             rcx, rdx
-        add             rax,32
-        dec r8
-        jnz NextPairOfRows
-
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;GROW_HORIZ (register for result, source register or mem local)
-; takes source and shifts left and ors with source
-; then shifts right and ors with source
-%macro GROW_HORIZ 2
-    movdqa          %1, %2
-    movdqa          xmm14, %1
-    movdqa          xmm15, %1
-    pslldq          xmm14, 1
-    psrldq          xmm15, 1
-    por             %1,xmm14
-    por             %1,xmm15
-%endmacro
-;GROW_VERT (result, center row, above row, below row)
-%macro GROW_VERT 4
-    movdqa          %1,%2
-    por             %1,%3
-    por             %1,%4
-%endmacro
-
-;GROW_NEXTLINE (new line to grow, new source, line to write)
-%macro GROW_NEXTLINE 3
-    GROW_HORIZ %1, %2
-    GROW_VERT xmm3, xmm0, xmm1, xmm2
-    movdqa %3,xmm3
-%endmacro
-
-
-;void vp8_growmaskmb_sse3(
-;    unsigned char *om,
-;    unsigned char *nm)
-global sym(vp8_growmaskmb_sse3)
-sym(vp8_growmaskmb_sse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 2
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0) ;src
-    mov             rdi,        arg(1) ;dst
-
-    GROW_HORIZ xmm0, [rsi]
-    GROW_HORIZ xmm1, [rsi+16]
-    GROW_HORIZ xmm2, [rsi+32]
-
-    GROW_VERT xmm3, xmm0, xmm1, xmm2
-    por xmm0,xmm1
-    movdqa [rdi], xmm0
-    movdqa [rdi+16],xmm3
-
-    GROW_NEXTLINE xmm0,[rsi+48],[rdi+32]
-    GROW_NEXTLINE xmm1,[rsi+64],[rdi+48]
-    GROW_NEXTLINE xmm2,[rsi+80],[rdi+64]
-    GROW_NEXTLINE xmm0,[rsi+96],[rdi+80]
-    GROW_NEXTLINE xmm1,[rsi+112],[rdi+96]
-    GROW_NEXTLINE xmm2,[rsi+128],[rdi+112]
-    GROW_NEXTLINE xmm0,[rsi+144],[rdi+128]
-    GROW_NEXTLINE xmm1,[rsi+160],[rdi+144]
-    GROW_NEXTLINE xmm2,[rsi+176],[rdi+160]
-    GROW_NEXTLINE xmm0,[rsi+192],[rdi+176]
-    GROW_NEXTLINE xmm1,[rsi+208],[rdi+192]
-    GROW_NEXTLINE xmm2,[rsi+224],[rdi+208]
-    GROW_NEXTLINE xmm0,[rsi+240],[rdi+224]
-
-    por xmm0,xmm2
-    movdqa [rdi+240], xmm0
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
-;unsigned int vp8_sad16x16_masked_wmt(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned char *mask)
-global sym(vp8_sad16x16_masked_wmt)
-sym(vp8_sad16x16_masked_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-    mov             rsi,        arg(0) ;src_ptr
-    mov             rdi,        arg(2) ;ref_ptr
-
-    mov             rbx,        arg(4) ;mask
-    movsxd          rax,        dword ptr arg(1) ;src_stride
-    movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-    mov             rcx,        16
-
-    pxor            xmm3,       xmm3
-
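-    ; AND-ing src and ref with the 0x00/0xff mask zeroes pixels outside the
-    ; mask, so psadbw accumulates the SAD over masked-in pixels only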
-NextSadRow:
-    movdqu          xmm0,       [rsi]
-    movdqu          xmm1,       [rdi]
-    movdqu          xmm2,       [rbx]
-    pand            xmm0,       xmm2
-    pand            xmm1,       xmm2
-
-    psadbw          xmm0,       xmm1
-    paddw           xmm3,       xmm0
-
-    add             rsi, rax
-    add             rdi, rdx
-    add             rbx,  16
-
-    dec rcx
-    jnz NextSadRow
-
-    movdqa          xmm4,       xmm3
-    psrldq          xmm4,       8
-    paddw           xmm3,       xmm4
-    movq            rax,       xmm3
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp8_sad16x16_unmasked_wmt(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned char *mask)
-global sym(vp8_sad16x16_unmasked_wmt)
-sym(vp8_sad16x16_unmasked_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-    mov             rsi,        arg(0) ;src_ptr
-    mov             rdi,        arg(2) ;ref_ptr
-
-    mov             rbx,        arg(4) ;mask
-    movsxd          rax,        dword ptr arg(1) ;src_stride
-    movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-    mov             rcx,        16
-
-    pxor            xmm3,       xmm3
-
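-    ; OR-ing src and ref with the mask forces masked-in pixels to 0xff in
-    ; both rows, so they cancel and psadbw sums only pixels outside the mask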
-next_vp8_sad16x16_unmasked_wmt:
-    movdqu          xmm0,       [rsi]
-    movdqu          xmm1,       [rdi]
-    movdqu          xmm2,       [rbx]
-    por             xmm0,       xmm2
-    por             xmm1,       xmm2
-
-    psadbw          xmm0,       xmm1
-    paddw           xmm3,       xmm0
-
-    add             rsi, rax
-    add             rdi, rdx
-    add             rbx,  16
-
-    dec rcx
-    jnz next_vp8_sad16x16_unmasked_wmt
-
-    movdqa          xmm4,       xmm3
-    psrldq          xmm4,       8
-    paddw           xmm3,       xmm4
-    movq            rax,        xmm3
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp8_masked_predictor_wmt(
-;    unsigned char *masked,
-;    unsigned char *unmasked,
-;    int  src_stride,
-;    unsigned char *dst_ptr,
-;    int  dst_stride,
-;    unsigned char *mask)
-global sym(vp8_masked_predictor_wmt)
-sym(vp8_masked_predictor_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push        rsi
-    push        rdi
-    ; end prolog
-    mov             rsi,        arg(0) ;masked
-    mov             rdi,        arg(1) ;unmasked
-
-    mov             rbx,        arg(5) ;mask
-    movsxd          rax,        dword ptr arg(2) ;src_stride
-    mov             r11,        arg(3) ; destination
-    movsxd          rdx,        dword ptr arg(4) ;dst_stride
-
-    mov             rcx,        16
-
-    pxor            xmm3,       xmm3
-
-next_vp8_masked_predictor_wmt:
-    movdqu          xmm0,       [rsi]
-    movdqu          xmm1,       [rdi]
-    movdqu          xmm2,       [rbx]
-
-    pand            xmm0,       xmm2
-    pandn           xmm2,       xmm1
-    por             xmm0,       xmm2
-    movdqu          [r11],      xmm0
-
-    add             r11, rdx
-    add             rsi, rax
-    add             rdi, rax                ; unmasked advances by src stride (as in the uv variant)
-    add             rbx,  16
-
-    dec rcx
-    jnz next_vp8_masked_predictor_wmt
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;unsigned int vp8_masked_predictor_uv_wmt(
-;    unsigned char *masked,
-;    unsigned char *unmasked,
-;    int  src_stride,
-;    unsigned char *dst_ptr,
-;    int  dst_stride,
-;    unsigned char *mask)
-global sym(vp8_masked_predictor_uv_wmt)
-sym(vp8_masked_predictor_uv_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push        rsi
-    push        rdi
-    ; end prolog
-    mov             rsi,        arg(0) ;masked
-    mov             rdi,        arg(1) ;unmasked
-
-    mov             rbx,        arg(5) ;mask
-    movsxd          rax,        dword ptr arg(2) ;src_stride
-    mov             r11,        arg(3) ; destination
-    movsxd          rdx,        dword ptr arg(4) ;dst_stride
-
-    mov             rcx,        8
-
-    pxor            xmm3,       xmm3
-
-next_vp8_masked_predictor_uv_wmt:
-    movq            xmm0,       [rsi]
-    movq            xmm1,       [rdi]
-    movq            xmm2,       [rbx]
-
-    pand            xmm0,       xmm2
-    pandn           xmm2,       xmm1
-    por             xmm0,       xmm2
-    movq            [r11],      xmm0
-
-    add             r11, rdx
-    add             rsi, rax
-    add             rdi, rax
-    add             rbx,  8
-
-    dec rcx
-    jnz next_vp8_masked_predictor_uv_wmt
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp8_uv_from_y_mask(
-;    unsigned char *ymask,
-;    unsigned char *uvmask)
-global sym(vp8_uv_from_y_mask)
-sym(vp8_uv_from_y_mask):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push        rsi
-    push        rdi
-    ; end prolog
-    mov             rsi,        arg(0) ;src_ptr
-    mov             rdi,        arg(1) ;dst_ptr
-
-
-    mov             rcx,        8
-
-    pxor            xmm3,       xmm3
-
-next_vp8_uv_from_y_mask:
-    movdqu          xmm0,       [rsi]
-    pshufb          xmm0,       [shuf1b] ;[GLOBAL(shuf1b)]
-    movq            [rdi],      xmm0
-    add             rdi,        8
-    add             rsi,        32
-
-    dec rcx
-    jnz next_vp8_uv_from_y_mask
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
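Editor's note: a hedged C sketch of the mask subsampling above (reference only). The pshufb with shuf1b keeps bytes 0, 2, 4, ..., 14 of each 16-byte luma mask row, and the source advances 32 bytes per iteration, i.e. every second row, so the 16x16 luma mask is 2x2-subsampled into an 8x8 chroma mask:

    static void uv_from_y_mask_c(const unsigned char *ymask, unsigned char *uvmask) {
        for (int r = 0; r < 8; r++) {
            for (int c = 0; c < 8; c++)
                uvmask[c] = ymask[2 * c];   /* every second byte of the row */
            uvmask += 8;
            ymask += 32;                    /* skip a whole 16-byte mask row */
        }
    }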
-SECTION_RODATA
-align 16
-shuf1b:
-    db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
-
--- a/vp8/common/x86/postproc_mmx.asm
+++ /dev/null
@@ -1,534 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define VP9_FILTER_WEIGHT 128
-%define VP9_FILTER_SHIFT  7
-
-;void vp9_post_proc_down_and_across_mmx
-;(
-;    unsigned char *src_ptr,
-;    unsigned char *dst_ptr,
-;    int src_pixels_per_line,
-;    int dst_pixels_per_line,
-;    int rows,
-;    int cols,
-;    int flimit
-;)
-global sym(vp9_post_proc_down_and_across_mmx)
-sym(vp9_post_proc_down_and_across_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
-    ; move the global rd onto the stack, since we don't have enough registers
-    ; to do PIC addressing
-    movq        mm0, [GLOBAL(rd)]
-    sub         rsp, 8
-    movq        [rsp], mm0
-%define RD [rsp]
-%else
-%define RD [GLOBAL(rd)]
-%endif
-
-        push        rbx
-        lea         rbx, [GLOBAL(Blur)]
-        movd        mm2, dword ptr arg(6) ;flimit
-        punpcklwd   mm2, mm2
-        punpckldq   mm2, mm2
-
-        mov         rsi,        arg(0) ;src_ptr
-        mov         rdi,        arg(1) ;dst_ptr
-
-        movsxd      rcx, DWORD PTR arg(4) ;rows
-        movsxd      rax, DWORD PTR arg(2) ;src_pixels_per_line
-        pxor        mm0, mm0              ; mm0 = 00000000
-
-.nextrow:
-
-        xor         rdx,        rdx       ; clear out rdx for use as loop counter
-.nextcol:
-
-        pxor        mm7, mm7              ; mm7 = 00000000
-        movq        mm6, [rbx + 32 ]      ; mm6 = kernel 2 taps
-        movq        mm3, [rsi]            ; mm3 = r0 p0..p7
-        punpcklbw   mm3, mm0              ; mm3 = p0..p3
-        movq        mm1, mm3              ; mm1 = p0..p3
-        pmullw      mm3, mm6              ; mm3 *= kernel 2 modifiers
-
-        movq        mm6, [rbx + 48]       ; mm6 = kernel 3 taps
-        movq        mm5, [rsi + rax]      ; mm5 = r1 p0..p7
-        punpcklbw   mm5, mm0              ; mm5 = r1 p0..p3
-        pmullw      mm6, mm5              ; mm6 *= p0..p3 * kernel 3 modifiers
-        paddusw     mm3, mm6              ; mm3 += mm6
-
-        ; thresholding
-        movq        mm7, mm1              ; mm7 = r0 p0..p3
-        psubusw     mm7, mm5              ; mm7 = r0 p0..p3 - r1 p0..p3
-        psubusw     mm5, mm1              ; mm5 = r1 p0..p3 - r0 p0..p3
-        paddusw     mm7, mm5              ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
-        pcmpgtw     mm7, mm2
-
-        movq        mm6, [rbx + 64 ]      ; mm6 = kernel 4 modifiers
-        movq        mm5, [rsi + 2*rax]    ; mm5 = r2 p0..p7
-        punpcklbw   mm5, mm0              ; mm5 = r2 p0..p3
-        pmullw      mm6, mm5              ; mm6 *= kernel 4 modifiers
-        paddusw     mm3, mm6              ; mm3 += mm6
-
-        ; thresholding
-        movq        mm6, mm1              ; mm6 = r0 p0..p3
-        psubusw     mm6, mm5              ; mm6 = r0 p0..p3 - r2 p0..p3
-        psubusw     mm5, mm1              ; mm5 = r2 p0..p3 - r0 p0..p3
-        paddusw     mm6, mm5              ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ; accumulate thresholds
-
-
-        neg         rax
-        movq        mm6, [rbx ]           ; kernel 0 taps
-        movq        mm5, [rsi+2*rax]      ; mm5 = r-2 p0..p7
-        punpcklbw   mm5, mm0              ; mm5 = r-2 p0..p3
-        pmullw      mm6, mm5              ; mm6 *= kernel 0 modifiers
-        paddusw     mm3, mm6              ; mm3 += mm6
-
-        ; thresholding
-        movq        mm6, mm1              ; mm6 = r0 p0..p3
-        psubusw     mm6, mm5              ; mm6 = p0..p3 - r-2 p0..p3
-        psubusw     mm5, mm1              ; mm5 = r-2 p0..p3 - p0..p3
-        paddusw     mm6, mm5              ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ; accumulate thresholds
-
-        movq        mm6, [rbx + 16]       ; kernel 1 taps
-        movq        mm4, [rsi+rax]        ; mm4 = r-1 p0..p7
-        punpcklbw   mm4, mm0              ; mm4 = r-1 p0..p3
-        pmullw      mm6, mm4              ; mm6 *= kernel 1 modifiers
-        paddusw     mm3, mm6              ; mm3 += mm6
-
-        ; thresholding
-        movq        mm6, mm1              ; mm6 = r0 p0..p3
-        psubusw     mm6, mm4              ; mm6 = p0..p3 - r-1 p0..p3
-        psubusw     mm4, mm1              ; mm4 = r-1 p0..p3 - p0..p3
-        paddusw     mm6, mm4              ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ; accumulate thresholds
-
-
-        paddusw     mm3, RD               ; mm3 += round value
-        psraw       mm3, VP9_FILTER_SHIFT     ; mm3 /= 128
-
-        pand        mm1, mm7              ; mm1 select vals > thresh from source
-        pandn       mm7, mm3              ; mm7 select vals < thresh from blurred result
-        paddusw     mm1, mm7              ; combination
-
-        packuswb    mm1, mm0              ; pack to bytes
-
-        movd        [rdi], mm1            ;
-        neg         rax                   ; pitch is positive
-
-
-        add         rsi, 4
-        add         rdi, 4
-        add         rdx, 4
-
-        cmp         edx, dword ptr arg(5) ;cols
-        jl          .nextcol
-        ; done with all the cols, start the across filtering in place
-        sub         rsi, rdx
-        sub         rdi, rdx
-
-
-        push        rax
-        xor         rdx,    rdx
-        mov         rax,    [rdi-4];
-
-.acrossnextcol:
-        pxor        mm7, mm7              ; mm7 = 00000000
-        movq        mm6, [rbx + 32 ]      ;
-        movq        mm4, [rdi+rdx]        ; mm4 = p0..p7
-        movq        mm3, mm4              ; mm3 = p0..p7
-        punpcklbw   mm3, mm0              ; mm3 = p0..p3
-        movq        mm1, mm3              ; mm1 = p0..p3
-        pmullw      mm3, mm6              ; mm3 *= kernel 2 modifiers
-
-        movq        mm6, [rbx + 48]
-        psrlq       mm4, 8                ; mm4 = p1..p7
-        movq        mm5, mm4              ; mm5 = p1..p7
-        punpcklbw   mm5, mm0              ; mm5 = p1..p4
-        pmullw      mm6, mm5              ; mm6 *= p1..p4 * kernel 3 modifiers
-        paddusw     mm3, mm6              ; mm3 += mm6
-
-        ; thresholding
-        movq        mm7, mm1              ; mm7 = p0..p3
-        psubusw     mm7, mm5              ; mm7 = p0..p3 - p1..p4
-        psubusw     mm5, mm1              ; mm5 = p1..p4 - p0..p3
-        paddusw     mm7, mm5              ; mm7 = abs(p0..p3 - p1..p4)
-        pcmpgtw     mm7, mm2
-
-        movq        mm6, [rbx + 64 ]
-        psrlq       mm4, 8                ; mm4 = p2..p7
-        movq        mm5, mm4              ; mm5 = p2..p7
-        punpcklbw   mm5, mm0              ; mm5 = p2..p5
-        pmullw      mm6, mm5              ; mm6 *= kernel 4 modifiers
-        paddusw     mm3, mm6              ; mm3 += mm6
-
-        ; thresholding
-        movq        mm6, mm1              ; mm6 = p0..p3
-        psubusw     mm6, mm5              ; mm6 = p0..p3 - p2..p5
-        psubusw     mm5, mm1              ; mm5 = p2..p5 - p0..p3
-        paddusw     mm6, mm5              ; mm6 = abs(p0..p3 - p2..p5)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ; accumulate thresholds
-
-
-        movq        mm6, [rbx ]
-        movq        mm4, [rdi+rdx-2]      ; mm4 = p-2..p5
-        movq        mm5, mm4              ; mm5 = p-2..p5
-        punpcklbw   mm5, mm0              ; mm5 = p-2..p1
-        pmullw      mm6, mm5              ; mm6 *= kernel 0 modifiers
-        paddusw     mm3, mm6              ; mm3 += mm6
-
-        ; thresholding
-        movq        mm6, mm1              ; mm6 = p0..p3
-        psubusw     mm6, mm5              ; mm6 = p0..p3 - p-2..p1
-        psubusw     mm5, mm1              ; mm5 = p-2..p1 - p0..p3
-        paddusw     mm6, mm5              ; mm6 = abs(p0..p3 - p-2..p1)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ; accumulate thresholds
-
-        movq        mm6, [rbx + 16]
-        psrlq       mm4, 8                ; mm4 = p-1..p5
-        punpcklbw   mm4, mm0              ; mm4 = p-1..p2
-        pmullw      mm6, mm4              ; mm6 *= kernel 1 modifiers
-        paddusw     mm3, mm6              ; mm3 += mm6
-
-        ; thresholding
-        movq        mm6, mm1              ; mm6 = p0..p3
-        psubusw     mm6, mm4              ; mm6 = p0..p3 - p-1..p2
-        psubusw     mm4, mm1              ; mm4 = p-1..p2 - p0..p3
-        paddusw     mm6, mm4              ; mm6 = abs(p0..p3 - p-1..p2)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ; accumulate thresholds
-
-        paddusw     mm3, RD               ; mm3 += round value
-        psraw       mm3, VP9_FILTER_SHIFT     ; mm3 /= 128
-
-        pand        mm1, mm7              ; mm1 select vals > thresh from source
-        pandn       mm7, mm3              ; mm7 select vals < thresh from blurred result
-        paddusw     mm1, mm7              ; combination
-
-        packuswb    mm1, mm0              ; pack to bytes
-        mov         DWORD PTR [rdi+rdx-4],  eax   ; store previous four bytes
-        movd        eax,    mm1
-
-        add         rdx, 4
-        cmp         edx, dword ptr arg(5) ;cols
-        jl          .acrossnextcol;
-
-        mov         DWORD PTR [rdi+rdx-4],  eax
-        pop         rax
-
-        ; done with this row
-        add         rsi,rax               ; next line
-        movsxd      rax, dword ptr arg(3) ;dst_pixels_per_line
-        add         rdi,rax               ; next destination
-        movsxd      rax, dword ptr arg(2) ;src_pixels_per_line
-
-        dec         rcx                   ; decrement count
-        jnz         .nextrow               ; next row
-        pop         rbx
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-%undef RD
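Editor's note: the Blur table at the end of this file gives the down pass the taps [16, 16, 64, 16, 16], which sum to VP9_FILTER_WEIGHT (128). A hedged scalar model of one output pixel (my reference code, not from the tree; needs <stdlib.h> for abs):

    static unsigned char filter_down_c(const unsigned char *p, int pitch, int flimit) {
        int blur = 16 * (p[-2 * pitch] + p[-pitch] + p[pitch] + p[2 * pitch])
                 + 64 * p[0] + 64;                  /* 'rd' rounding constant */
        blur >>= 7;                                  /* VP9_FILTER_SHIFT */
        for (int k = -2; k <= 2; k++)
            if (k != 0 && abs(p[0] - p[k * pitch]) > flimit)
                return p[0];                         /* any edge: keep source */
        return (unsigned char)blur;
    }

The across pass applies the same kernel and threshold horizontally, writing in place with a short store delay (the eax shuffle above) so the window always reads unfiltered pixels.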
-
-
-;void vp9_mbpost_proc_down_mmx(unsigned char *dst,
-;                             int pitch, int rows, int cols,int flimit)
-extern sym(vp9_rv)
-global sym(vp9_mbpost_proc_down_mmx)
-sym(vp9_mbpost_proc_down_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 136
-
-    ; unsigned char d[16][8] at [rsp]
-    ; create flimit2 at [rsp+128]
-    mov         eax, dword ptr arg(4) ;flimit
-    mov         [rsp+128], eax
-    mov         [rsp+128+4], eax
-%define flimit2 [rsp+128]
-
-%if ABI_IS_32BIT=0
-    lea         r8,       [GLOBAL(sym(vp9_rv))]
-%endif
-
-    ;rows +=8;
-    add         dword ptr arg(2), 8
-
-    ;for(c=0; c<cols; c+=4)
-.loop_col:
-            mov         rsi,        arg(0)  ;s
-            pxor        mm0,        mm0     ;
-
-            movsxd      rax,        dword ptr arg(1) ;pitch       ;
-            neg         rax                                     ; rax = -pitch
-
-            lea         rsi,        [rsi + rax*8]           ; rsi = s[-pitch*8]
-            neg         rax
-
-
-            pxor        mm5,        mm5
-            pxor        mm6,        mm6     ;
-
-            pxor        mm7,        mm7     ;
-            mov         rdi,        rsi
-
-            mov         rcx,        15          ;
-
-.loop_initvar:
-            movd        mm1,        DWORD PTR [rdi];
-            punpcklbw   mm1,        mm0     ;
-
-            paddw       mm5,        mm1     ;
-            pmullw      mm1,        mm1     ;
-
-            movq        mm2,        mm1     ;
-            punpcklwd   mm1,        mm0     ;
-
-            punpckhwd   mm2,        mm0     ;
-            paddd       mm6,        mm1     ;
-
-            paddd       mm7,        mm2     ;
-            lea         rdi,        [rdi+rax]   ;
-
-            dec         rcx
-            jne         .loop_initvar
-            ;save the var and sum
-            xor         rdx,        rdx
-.loop_row:
-            movd        mm1,        DWORD PTR [rsi]     ; [s-pitch*8]
-            movd        mm2,        DWORD PTR [rdi]     ; [s+pitch*7]
-
-            punpcklbw   mm1,        mm0
-            punpcklbw   mm2,        mm0
-
-            paddw       mm5,        mm2
-            psubw       mm5,        mm1
-
-            pmullw      mm2,        mm2
-            movq        mm4,        mm2
-
-            punpcklwd   mm2,        mm0
-            punpckhwd   mm4,        mm0
-
-            paddd       mm6,        mm2
-            paddd       mm7,        mm4
-
-            pmullw      mm1,        mm1
-            movq        mm2,        mm1
-
-            punpcklwd   mm1,        mm0
-            psubd       mm6,        mm1
-
-            punpckhwd   mm2,        mm0
-            psubd       mm7,        mm2
-
-
-            movq        mm3,        mm6
-            pslld       mm3,        4
-
-            psubd       mm3,        mm6
-            movq        mm1,        mm5
-
-            movq        mm4,        mm5
-            pmullw      mm1,        mm1
-
-            pmulhw      mm4,        mm4
-            movq        mm2,        mm1
-
-            punpcklwd   mm1,        mm4
-            punpckhwd   mm2,        mm4
-
-            movq        mm4,        mm7
-            pslld       mm4,        4
-
-            psubd       mm4,        mm7
-
-            psubd       mm3,        mm1
-            psubd       mm4,        mm2
-
-            psubd       mm3,        flimit2
-            psubd       mm4,        flimit2
-
-            psrad       mm3,        31
-            psrad       mm4,        31
-
-            packssdw    mm3,        mm4
-            packsswb    mm3,        mm0
-
-            movd        mm1,        DWORD PTR [rsi+rax*8]
-
-            movq        mm2,        mm1
-            punpcklbw   mm1,        mm0
-
-            paddw       mm1,        mm5
-            mov         rcx,        rdx
-
-            and         rcx,        127
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
-            push        rax
-            lea         rax,        [GLOBAL(sym(vp9_rv))]
-            movq        mm4,        [rax + rcx*2] ;vp9_rv[rcx*2]
-            pop         rax
-%elif ABI_IS_32BIT=0
-            movq        mm4,        [r8 + rcx*2] ;vp9_rv[rcx*2]
-%else
-            movq        mm4,        [sym(vp9_rv) + rcx*2]
-%endif
-            paddw       mm1,        mm4
-            ;paddw     xmm1,       eight8s
-            psraw       mm1,        4
-
-            packuswb    mm1,        mm0
-            pand        mm1,        mm3
-
-            pandn       mm3,        mm2
-            por         mm1,        mm3
-
-            and         rcx,        15
-            movd        DWORD PTR   [rsp+rcx*4], mm1 ;d[rcx*4]
-
-            mov         rcx,        rdx
-            sub         rcx,        8
-
-            and         rcx,        15
-            movd        mm1,        DWORD PTR [rsp+rcx*4] ;d[rcx*4]
-
-            movd        [rsi],      mm1
-            lea         rsi,        [rsi+rax]
-
-            lea         rdi,        [rdi+rax]
-            add         rdx,        1
-
-            cmp         edx,        dword arg(2) ;rows
-            jl          .loop_row
-
-
-        add         dword arg(0), 4 ; s += 4
-        sub         dword arg(3), 4 ; cols -= 4
-        cmp         dword arg(3), 0
-        jg          .loop_col
-
-    add         rsp, 136
-    pop         rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-%undef flimit2
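Editor's note: a hedged scalar model of the per-pixel decision in the loop above (reference only; the running-window bookkeeping is simplified away). sum and sumsq accumulate a 16-row vertical window around the pixel, and vp9_rv supplies a small dither value:

    static unsigned char mb_filter_down_c(int sum, int sumsq, unsigned char pixel,
                                          int flimit, int dither /* vp9_rv[] */) {
        if (sumsq * 15 - sum * sum < flimit)         /* low local variance */
            return (unsigned char)((sum + pixel + dither) >> 4);
        return pixel;                                 /* otherwise leave as-is */
    }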
-
-
-;void vp9_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise,
-;                            unsigned char blackclamp[16],
-;                            unsigned char whiteclamp[16],
-;                            unsigned char bothclamp[16],
-;                            unsigned int Width, unsigned int Height, int Pitch)
-extern sym(rand)
-global sym(vp9_plane_add_noise_mmx)
-sym(vp9_plane_add_noise_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-.addnoise_loop:
-    call sym(rand) WRT_PLT
-    mov     rcx, arg(1) ;noise
-    and     rax, 0xff
-    add     rcx, rax
-
-    ; we rely on the fact that the clamping vectors are stored contiguously
-    ; in black/white/both order. Note that we have to reload this here because
-    ; rdx could be trashed by rand()
-    mov     rdx, arg(2) ; blackclamp
-
-
-            mov     rdi, rcx
-            movsxd  rcx, dword arg(5) ;[Width]
-            mov     rsi, arg(0) ;Pos
-            xor         rax,rax
-
-.addnoise_nextset:
-            movq        mm1,[rsi+rax]         ; get the source
-
-            psubusb     mm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
-            paddusb     mm1, [rdx+32] ;bothclamp
-            psubusb     mm1, [rdx+16] ;whiteclamp
-
-            movq        mm2,[rdi+rax]         ; get the noise for this line
-            paddb       mm1,mm2              ; add it in
-            movq        [rsi+rax],mm1         ; store the result
-
-            add         rax,8                 ; move to the next line
-
-            cmp         rax, rcx
-            jl          .addnoise_nextset
-
-    movsxd  rax, dword arg(7) ; Pitch
-    add     arg(0), rax ; Start += Pitch
-    sub     dword arg(6), 1   ; Height -= 1
-    jg      .addnoise_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
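Editor's note: a hedged C model of the per-byte work in .addnoise_nextset above (reference only). The three saturating steps squeeze the pixel into [black, 255 - white] so the final wrap-around add of the noise byte cannot overflow visibly; bothclamp is assumed to hold blackclamp + whiteclamp, as the contiguous black/white/both layout suggests:

    static unsigned char clamp_and_add_noise_c(unsigned char v, unsigned char black,
                                               unsigned char white, unsigned char both,
                                               unsigned char noise) {
        int t = v - black; if (t < 0)   t = 0;       /* psubusb blackclamp */
        t += both;         if (t > 255) t = 255;     /* paddusb bothclamp  */
        t -= white;        if (t < 0)   t = 0;       /* psubusb whiteclamp */
        return (unsigned char)(t + noise);           /* paddb wraps mod 256 */
    }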
-SECTION_RODATA
-align 16
-Blur:
-    times 16 dw 16
-    times  8 dw 64
-    times 16 dw 16
-    times  8 dw  0
-
-rd:
-    times 4 dw 0x40
--- a/vp8/common/x86/postproc_sse2.asm
+++ /dev/null
@@ -1,695 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_post_proc_down_and_across_xmm
-;(
-;    unsigned char *src_ptr,
-;    unsigned char *dst_ptr,
-;    int src_pixels_per_line,
-;    int dst_pixels_per_line,
-;    int rows,
-;    int cols,
-;    int flimit
-;)
-global sym(vp9_post_proc_down_and_across_xmm)
-sym(vp9_post_proc_down_and_across_xmm):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
-    ALIGN_STACK 16, rax
-    ; move the global rd onto the stack, since we don't have enough registers
-    ; to do PIC addressing
-    movdqa      xmm0, [GLOBAL(rd42)]
-    sub         rsp, 16
-    movdqa      [rsp], xmm0
-%define RD42 [rsp]
-%else
-%define RD42 [GLOBAL(rd42)]
-%endif
-
-
-        movd        xmm2,       dword ptr arg(6) ;flimit
-        punpcklwd   xmm2,       xmm2
-        punpckldq   xmm2,       xmm2
-        punpcklqdq  xmm2,       xmm2
-
-        mov         rsi,        arg(0) ;src_ptr
-        mov         rdi,        arg(1) ;dst_ptr
-
-        movsxd      rcx,        DWORD PTR arg(4) ;rows
-        movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line
-        pxor        xmm0,       xmm0              ; xmm0 = 00000000
-
-.nextrow:
-
-        xor         rdx,        rdx       ; clear out rdx for use as loop counter
-.nextcol:
-        movq        xmm3,       QWORD PTR [rsi]         ; xmm3 = r0 p0..p7
-        punpcklbw   xmm3,       xmm0                    ; xmm3 = r0 p0..p3
-        movdqa      xmm1,       xmm3                    ; xmm1 = r0 p0..p3
-        psllw       xmm3,       2                       ;
-
-        movq        xmm5,       QWORD PTR [rsi + rax]   ; xmm5 = r1 p0..p7
-        punpcklbw   xmm5,       xmm0                    ; xmm5 = r1 p0..p3
-        paddusw     xmm3,       xmm5                    ; xmm3 += xmm5
-
-        ; thresholding
-        movdqa      xmm7,       xmm1                    ; mm7 = r0 p0..p3
-        psubusw     xmm7,       xmm5                    ; mm7 = r0 p0..p3 - r1 p0..p3
-        psubusw     xmm5,       xmm1                    ; mm5 = r1 p0..p3 - r0 p0..p3
-        paddusw     xmm7,       xmm5                    ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
-        pcmpgtw     xmm7,       xmm2
-
-        movq        xmm5,       QWORD PTR [rsi + 2*rax] ; xmm5 = r2 p0..p7
-        punpcklbw   xmm5,       xmm0                    ; xmm5 = r2 p0..p3
-        paddusw     xmm3,       xmm5                    ; xmm3 += xmm5
-
-        ; thresholding
-        movdqa      xmm6,       xmm1                    ; mm6 = r0 p0..p3
-        psubusw     xmm6,       xmm5                    ; mm6 = r0 p0..p3 - r2 p0..p3
-        psubusw     xmm5,       xmm1                    ; xmm5 = r2 p0..p3 - r0 p0..p3
-        paddusw     xmm6,       xmm5                    ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
-        pcmpgtw     xmm6,       xmm2
-        por         xmm7,       xmm6                    ; accumulate thresholds
-
-
-        neg         rax
-        movq        xmm5,       QWORD PTR [rsi+2*rax]   ; xmm5 = r-2 p0..p7
-        punpcklbw   xmm5,       xmm0                    ; xmm5 = r-2 p0..p3
-        paddusw     xmm3,       xmm5                    ; xmm3 += xmm5
-
-        ; thresholding
-        movdqa      xmm6,       xmm1                    ; mm6 = r0 p0..p3
-        psubusw     xmm6,       xmm5                    ; mm6 = p0..p3 - r-2 p0..p3
-        psubusw     xmm5,       xmm1                    ; mm5 = r-2 p0..p3 - p0..p3
-        paddusw     xmm6,       xmm5                    ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
-        pcmpgtw     xmm6,       xmm2
-        por         xmm7,       xmm6                    ; accumulate thresholds
-
-        movq        xmm4,       QWORD PTR [rsi+rax]     ; xmm4 = r-1 p0..p7
-        punpcklbw   xmm4,       xmm0                    ; xmm4 = r-1 p0..p3
-        paddusw     xmm3,       xmm4                    ; xmm3 += xmm4
-
-        ; thresholding
-        movdqa      xmm6,       xmm1                    ; mm6 = r0 p0..p3
-        psubusw     xmm6,       xmm4                    ; xmm6 = p0..p3 - r-1 p0..p3
-        psubusw     xmm4,       xmm1                    ; xmm4 = r-1 p0..p3 - p0..p3
-        paddusw     xmm6,       xmm4                    ; xmm6 = abs(r0 p0..p3 - r-1 p0..p3)
-        pcmpgtw     xmm6,       xmm2
-        por         xmm7,       xmm6                    ; accumulate thresholds
-
-
-        paddusw     xmm3,       RD42                    ; mm3 += round value
-        psraw       xmm3,       3                       ; mm3 /= 8
-
-        pand        xmm1,       xmm7                    ; mm1 select vals > thresh from source
-        pandn       xmm7,       xmm3                    ; mm7 select vals < thresh from blurred result
-        paddusw     xmm1,       xmm7                    ; combination
-
-        packuswb    xmm1,       xmm0                    ; pack to bytes
-        movq        QWORD PTR [rdi], xmm1             ;
-
-        neg         rax                   ; pitch is positive
-        add         rsi,        8
-        add         rdi,        8
-
-        add         rdx,        8
-        cmp         edx,        dword arg(5) ;cols
-
-        jl          .nextcol
-
-        ; done with all the cols, start the across filtering in place
-        sub         rsi,        rdx
-        sub         rdi,        rdx
-
-        xor         rdx,        rdx
-        movq        mm0,        QWORD PTR [rdi-8];
-
-.acrossnextcol:
-        movq        xmm7,       QWORD PTR [rdi +rdx -2]
-        movd        xmm4,       DWORD PTR [rdi +rdx +6]
-
-        pslldq      xmm4,       8
-        por         xmm4,       xmm7
-
-        movdqa      xmm3,       xmm4
-        psrldq      xmm3,       2
-        punpcklbw   xmm3,       xmm0              ; mm3 = p0..p3
-        movdqa      xmm1,       xmm3              ; mm1 = p0..p3
-        psllw       xmm3,       2
-
-
-        movdqa      xmm5,       xmm4
-        psrldq      xmm5,       3
-        punpcklbw   xmm5,       xmm0              ; mm5 = p1..p4
-        paddusw     xmm3,       xmm5              ; xmm3 += xmm5
-
-        ; thresholding
-        movdqa      xmm7,       xmm1              ; mm7 = p0..p3
-        psubusw     xmm7,       xmm5              ; mm7 = p0..p3 - p1..p4
-        psubusw     xmm5,       xmm1              ; mm5 = p1..p4 - p0..p3
-        paddusw     xmm7,       xmm5              ; mm7 = abs(p0..p3 - p1..p4)
-        pcmpgtw     xmm7,       xmm2
-
-        movdqa      xmm5,       xmm4
-        psrldq      xmm5,       4
-        punpcklbw   xmm5,       xmm0              ; mm5 = p2..p5
-        paddusw     xmm3,       xmm5              ; mm3 += mm5
-
-        ; thresholding
-        movdqa      xmm6,       xmm1              ; xmm6 = p0..p3
-        psubusw     xmm6,       xmm5              ; xmm6 = p0..p3 - p2..p5
-        psubusw     xmm5,       xmm1              ; xmm5 = p2..p5 - p0..p3
-        paddusw     xmm6,       xmm5              ; xmm6 = abs(p0..p3 - p2..p5)
-        pcmpgtw     xmm6,       xmm2
-        por         xmm7,       xmm6              ; accumulate thresholds
-
-
-        movdqa      xmm5,       xmm4              ; mm5 = p-2..p5
-        punpcklbw   xmm5,       xmm0              ; mm5 = p-2..p1
-        paddusw     xmm3,       xmm5              ; mm3 += mm5
-
-        ; thresholding
-        movdqa      xmm6,       xmm1              ; xmm6 = p0..p3
-        psubusw     xmm6,       xmm5              ; xmm6 = p0..p3 - p-2..p1
-        psubusw     xmm5,       xmm1              ; xmm5 = p-2..p1 - p0..p3
-        paddusw     xmm6,       xmm5              ; xmm6 = abs(p0..p3 - p-2..p1)
-        pcmpgtw     xmm6,       xmm2
-        por         xmm7,       xmm6              ; accumulate thresholds
-
-        psrldq      xmm4,       1                   ; mm4 = p-1..p5
-        punpcklbw   xmm4,       xmm0              ; mm4 = p-1..p2
-        paddusw     xmm3,       xmm4              ; xmm3 += xmm4
-
-        ; thresholding
-        movdqa      xmm6,       xmm1              ; xmm6 = p0..p3
-        psubusw     xmm6,       xmm4              ; xmm6 = p0..p3 - p-1..p2
-        psubusw     xmm4,       xmm1              ; xmm4 = p-1..p2 - p0..p3
-        paddusw     xmm6,       xmm4              ; xmm6 = abs(p0..p3 - p-1..p2)
-        pcmpgtw     xmm6,       xmm2
-        por         xmm7,       xmm6              ; accumulate thresholds
-
-        paddusw     xmm3,       RD42              ; mm3 += round value
-        psraw       xmm3,       3                 ; mm3 /= 8
-
-        pand        xmm1,       xmm7              ; mm1 select vals > thresh from source
-        pandn       xmm7,       xmm3              ; mm7 select vals < thresh from blurred result
-        paddusw     xmm1,       xmm7              ; combination
-
-        packuswb    xmm1,       xmm0              ; pack to bytes
-        movq        QWORD PTR [rdi+rdx-8],  mm0   ; store previous eight bytes
-        movdq2q     mm0,        xmm1
-
-        add         rdx,        8
-        cmp         edx,        dword arg(5) ;cols
-        jl          .acrossnextcol;
-
-        ; last 8 pixels
-        movq        QWORD PTR [rdi+rdx-8],  mm0
-
-        ; done with this row
-        add         rsi,rax               ; next line
-        mov         eax, dword arg(3) ;dst_pixels_per_line
-        add         rdi,rax               ; next destination
-        mov         eax, dword arg(2) ;src_pixels_per_line
-
-        dec         rcx                   ; decrement count
-        jnz         .nextrow              ; next row
-
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
-    add rsp,16
-    pop rsp
-%endif
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-%undef RD42
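Editor's note: unlike the MMX path, the xmm version above hard-codes its kernel: "psllw xmm3, 2" builds the centre tap and rd42 supplies the +4 rounding, i.e. per pixel

    /* blurred = (p[-2] + p[-1] + 4*p[0] + p[1] + p[2] + 4) >> 3 */

which is the same [1, 1, 4, 1, 1]/8 weighting the MMX Blur table encodes as [16, 16, 64, 16, 16]/128.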
-
-
-;void vp9_mbpost_proc_down_xmm(unsigned char *dst,
-;                            int pitch, int rows, int cols,int flimit)
-extern sym(vp9_rv)
-global sym(vp9_mbpost_proc_down_xmm)
-sym(vp9_mbpost_proc_down_xmm):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 128+16
-
-    ; unsigned char d[16][8] at [rsp]
-    ; create flimit2 at [rsp+128]
-    mov         eax, dword ptr arg(4) ;flimit
-    mov         [rsp+128], eax
-    mov         [rsp+128+4], eax
-    mov         [rsp+128+8], eax
-    mov         [rsp+128+12], eax
-%define flimit4 [rsp+128]
-
-%if ABI_IS_32BIT=0
-    lea         r8,       [GLOBAL(sym(vp9_rv))]
-%endif
-
-    ;rows +=8;
-    add         dword arg(2), 8
-
-    ;for(c=0; c<cols; c+=8)
-.loop_col:
-            mov         rsi,        arg(0) ; s
-            pxor        xmm0,       xmm0        ;
-
-            movsxd      rax,        dword ptr arg(1) ;pitch       ;
-            neg         rax                                     ; rax = -pitch
-
-            lea         rsi,        [rsi + rax*8]           ; rsi = s[-pitch*8]
-            neg         rax
-
-
-            pxor        xmm5,       xmm5
-            pxor        xmm6,       xmm6        ;
-
-            pxor        xmm7,       xmm7        ;
-            mov         rdi,        rsi
-
-            mov         rcx,        15          ;
-
-.loop_initvar:
-            movq        xmm1,       QWORD PTR [rdi];
-            punpcklbw   xmm1,       xmm0        ;
-
-            paddw       xmm5,       xmm1        ;
-            pmullw      xmm1,       xmm1        ;
-
-            movdqa      xmm2,       xmm1        ;
-            punpcklwd   xmm1,       xmm0        ;
-
-            punpckhwd   xmm2,       xmm0        ;
-            paddd       xmm6,       xmm1        ;
-
-            paddd       xmm7,       xmm2        ;
-            lea         rdi,        [rdi+rax]   ;
-
-            dec         rcx
-            jne         .loop_initvar
-            ;save the var and sum
-            xor         rdx,        rdx
-.loop_row:
-            movq        xmm1,       QWORD PTR [rsi]     ; [s-pitch*8]
-            movq        xmm2,       QWORD PTR [rdi]     ; [s+pitch*7]
-
-            punpcklbw   xmm1,       xmm0
-            punpcklbw   xmm2,       xmm0
-
-            paddw       xmm5,       xmm2
-            psubw       xmm5,       xmm1
-
-            pmullw      xmm2,       xmm2
-            movdqa      xmm4,       xmm2
-
-            punpcklwd   xmm2,       xmm0
-            punpckhwd   xmm4,       xmm0
-
-            paddd       xmm6,       xmm2
-            paddd       xmm7,       xmm4
-
-            pmullw      xmm1,       xmm1
-            movdqa      xmm2,       xmm1
-
-            punpcklwd   xmm1,       xmm0
-            psubd       xmm6,       xmm1
-
-            punpckhwd   xmm2,       xmm0
-            psubd       xmm7,       xmm2
-
-
-            movdqa      xmm3,       xmm6
-            pslld       xmm3,       4
-
-            psubd       xmm3,       xmm6
-            movdqa      xmm1,       xmm5
-
-            movdqa      xmm4,       xmm5
-            pmullw      xmm1,       xmm1
-
-            pmulhw      xmm4,       xmm4
-            movdqa      xmm2,       xmm1
-
-            punpcklwd   xmm1,       xmm4
-            punpckhwd   xmm2,       xmm4
-
-            movdqa      xmm4,       xmm7
-            pslld       xmm4,       4
-
-            psubd       xmm4,       xmm7
-
-            psubd       xmm3,       xmm1
-            psubd       xmm4,       xmm2
-
-            psubd       xmm3,       flimit4
-            psubd       xmm4,       flimit4
-
-            psrad       xmm3,       31
-            psrad       xmm4,       31
-
-            packssdw    xmm3,       xmm4
-            packsswb    xmm3,       xmm0
-
-            movq        xmm1,       QWORD PTR [rsi+rax*8]
-
-            movq        xmm2,       xmm1
-            punpcklbw   xmm1,       xmm0
-
-            paddw       xmm1,       xmm5
-            mov         rcx,        rdx
-
-            and         rcx,        127
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
-            push        rax
-            lea         rax,        [GLOBAL(sym(vp9_rv))]
-            movdqu      xmm4,       [rax + rcx*2] ;vp9_rv[rcx*2]
-            pop         rax
-%elif ABI_IS_32BIT=0
-            movdqu      xmm4,       [r8 + rcx*2] ;vp9_rv[rcx*2]
-%else
-            movdqu      xmm4,       [sym(vp9_rv) + rcx*2]
-%endif
-
-            paddw       xmm1,       xmm4
-            ;paddw     xmm1,       eight8s
-            psraw       xmm1,       4
-
-            packuswb    xmm1,       xmm0
-            pand        xmm1,       xmm3
-
-            pandn       xmm3,       xmm2
-            por         xmm1,       xmm3
-
-            and         rcx,        15
-            movq        QWORD PTR   [rsp + rcx*8], xmm1 ;d[rcx*8]
-
-            mov         rcx,        rdx
-            sub         rcx,        8
-
-            and         rcx,        15
-            movq        mm0,        [rsp + rcx*8] ;d[rcx*8]
-
-            movq        [rsi],      mm0
-            lea         rsi,        [rsi+rax]
-
-            lea         rdi,        [rdi+rax]
-            add         rdx,        1
-
-            cmp         edx,        dword arg(2) ;rows
-            jl          .loop_row
-
-        add         dword arg(0), 8 ; s += 8
-        sub         dword arg(3), 8 ; cols -= 8
-        cmp         dword arg(3), 0
-        jg          .loop_col
-
-    add         rsp, 128+16
-    pop         rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-%undef flimit4
-
-
-;void vp9_mbpost_proc_across_ip_xmm(unsigned char *src,
-;                                int pitch, int rows, int cols,int flimit)
-global sym(vp9_mbpost_proc_across_ip_xmm)
-sym(vp9_mbpost_proc_across_ip_xmm):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16
-
-    ; create flimit4 at [rsp]
-    mov         eax, dword ptr arg(4) ;flimit
-    mov         [rsp], eax
-    mov         [rsp+4], eax
-    mov         [rsp+8], eax
-    mov         [rsp+12], eax
-%define flimit4 [rsp]
-
-
-    ;for(r=0;r<rows;r++)
-.ip_row_loop:
-
-        xor         rdx,    rdx ;sumsq=0;
-        xor         rcx,    rcx ;sum=0;
-        mov         rsi,    arg(0); s
-        mov         rdi,    -8
-.ip_var_loop:
-        ;for(i=-8;i<=6;i++)
-        ;{
-        ;    sumsq += s[i]*s[i];
-        ;    sum   += s[i];
-        ;}
-        movzx       eax, byte [rsi+rdi]
-        add         ecx, eax
-        mul         al
-        add         edx, eax
-        add         rdi, 1
-        cmp         rdi, 6
-        jle         .ip_var_loop
-
-
-            ;mov         rax,    sumsq
-            ;movd        xmm7,   rax
-            movd        xmm7,   edx
-
-            ;mov         rax,    sum
-            ;movd        xmm6,   rax
-            movd        xmm6,   ecx
-
-            mov         rsi,    arg(0) ;s
-            xor         rcx,    rcx
-
-            movsxd      rdx,    dword arg(3) ;cols
-            add         rdx,    8
-            pxor        mm0,    mm0
-            pxor        mm1,    mm1
-
-            pxor        xmm0,   xmm0
-.nextcol4:
-
-            movd        xmm1,   DWORD PTR [rsi+rcx-8]   ; -8 -7 -6 -5
-            movd        xmm2,   DWORD PTR [rsi+rcx+7]   ; +7 +8 +9 +10
-
-            punpcklbw   xmm1,   xmm0                    ; expanding
-            punpcklbw   xmm2,   xmm0                    ; expanding
-
-            punpcklwd   xmm1,   xmm0                    ; expanding to dwords
-            punpcklwd   xmm2,   xmm0                    ; expanding to dwords
-
-            psubd       xmm2,   xmm1                    ; 7--8   8--7   9--6 10--5
-            paddd       xmm1,   xmm1                    ; -8*2   -7*2   -6*2 -5*2
-
-            paddd       xmm1,   xmm2                    ; 7+-8   8+-7   9+-6 10+-5
-            pmaddwd     xmm1,   xmm2                    ; squared of 7+-8   8+-7   9+-6 10+-5
-
-            paddd       xmm6,   xmm2
-            paddd       xmm7,   xmm1
-
-            pshufd      xmm6,   xmm6,   0               ; duplicate the last ones
-            pshufd      xmm7,   xmm7,   0               ; duplicate the last ones
-
-            psrldq      xmm1,       4                   ; 8--7   9--6 10--5  0000
-            psrldq      xmm2,       4                   ; 8--7   9--6 10--5  0000
-
-            pshufd      xmm3,   xmm1,   3               ; 0000  8--7   8--7   8--7 squared
-            pshufd      xmm4,   xmm2,   3               ; 0000  8--7   8--7   8--7 squared
-
-            paddd       xmm6,   xmm4
-            paddd       xmm7,   xmm3
-
-            pshufd      xmm3,   xmm1,   01011111b       ; 0000  0000   9--6   9--6 squared
-            pshufd      xmm4,   xmm2,   01011111b       ; 0000  0000   9--6   9--6 squared
-
-            paddd       xmm7,   xmm3
-            paddd       xmm6,   xmm4
-
-            pshufd      xmm3,   xmm1,   10111111b       ; 0000  0000   8--7   8--7 squared
-            pshufd      xmm4,   xmm2,   10111111b       ; 0000  0000   8--7   8--7 squared
-
-            paddd       xmm7,   xmm3
-            paddd       xmm6,   xmm4
-
-            movdqa      xmm3,   xmm6
-            pmaddwd     xmm3,   xmm3
-
-            movdqa      xmm5,   xmm7
-            pslld       xmm5,   4
-
-            psubd       xmm5,   xmm7
-            psubd       xmm5,   xmm3
-
-            psubd       xmm5,   flimit4
-            psrad       xmm5,   31
-
-            packssdw    xmm5,   xmm0
-            packsswb    xmm5,   xmm0
-
-            movd        xmm1,   DWORD PTR [rsi+rcx]
-            movq        xmm2,   xmm1
-
-            punpcklbw   xmm1,   xmm0
-            punpcklwd   xmm1,   xmm0
-
-            paddd       xmm1,   xmm6
-            paddd       xmm1,   [GLOBAL(four8s)]
-
-            psrad       xmm1,   4
-            packssdw    xmm1,   xmm0
-
-            packuswb    xmm1,   xmm0
-            pand        xmm1,   xmm5
-
-            pandn       xmm5,   xmm2
-            por         xmm5,   xmm1
-
-            movd        [rsi+rcx-8],  mm0
-            movq        mm0,    mm1
-
-            movdq2q     mm1,    xmm5
-            psrldq      xmm7,   12
-
-            psrldq      xmm6,   12
-            add         rcx,    4
-
-            cmp         rcx,    rdx
-            jl          .nextcol4
-
-        ;s+=pitch;
-        movsxd rax, dword arg(1)
-        add    arg(0), rax
-
-        sub dword arg(2), 1 ;rows-=1
-        cmp dword arg(2), 0
-        jg .ip_row_loop
-
-    add         rsp, 16
-    pop         rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-%undef flimit4
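Editor's note: a hedged scalar model of the across pass above (reference only; border handling is omitted). A 16-pixel horizontal window slides one pixel at a time with incremental sum/sumsq updates; the asm additionally delays each store by eight pixels (the mm0/mm1 shuffle) so the window always reads unfiltered data, a detail the sketch skips:

    static void mb_filter_across_c(unsigned char *s, int cols, int flimit) {
        int sum = 0, sumsq = 0;
        for (int i = -8; i <= 6; i++) {        /* matches .ip_var_loop */
            sum   += s[i];
            sumsq += s[i] * s[i];
        }
        for (int c = 0; c < cols; c++) {
            sum   += s[c + 7] - s[c - 8];      /* slide the window */
            sumsq += s[c + 7] * s[c + 7] - s[c - 8] * s[c - 8];
            if (sumsq * 15 - sum * sum < flimit)
                s[c] = (unsigned char)((sum + s[c] + 8) >> 4);   /* four8s = 8 */
        }
    }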
-
-
-;void vp9_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise,
-;                            unsigned char blackclamp[16],
-;                            unsigned char whiteclamp[16],
-;                            unsigned char bothclamp[16],
-;                            unsigned int Width, unsigned int Height, int Pitch)
-extern sym(rand)
-global sym(vp9_plane_add_noise_wmt)
-sym(vp9_plane_add_noise_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-.addnoise_loop:
-    call sym(rand) WRT_PLT
-    mov     rcx, arg(1) ;noise
-    and     rax, 0xff
-    add     rcx, rax
-
-    ; we rely on the fact that the clamping vectors are stored contiguously
-    ; in black/white/both order. Note that we have to reload this here because
-    ; rdx could be trashed by rand()
-    mov     rdx, arg(2) ; blackclamp
-
-
-            mov     rdi, rcx
-            movsxd  rcx, dword arg(5) ;[Width]
-            mov     rsi, arg(0) ;Pos
-            xor         rax,rax
-
-.addnoise_nextset:
-            movdqu      xmm1,[rsi+rax]         ; get the source
-
-            psubusb     xmm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
-            paddusb     xmm1, [rdx+32] ;bothclamp
-            psubusb     xmm1, [rdx+16] ;whiteclamp
-
-            movdqu      xmm2,[rdi+rax]         ; get the noise for this line
-            paddb       xmm1,xmm2              ; add it in
-            movdqu      [rsi+rax],xmm1         ; store the result
-
-            add         rax,16                 ; move to the next line
-
-            cmp         rax, rcx
-            jl          .addnoise_nextset
-
-    movsxd  rax, dword arg(7) ; Pitch
-    add     arg(0), rax ; Start += Pitch
-    sub     dword arg(6), 1   ; Height -= 1
-    jg      .addnoise_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-SECTION_RODATA
-align 16
-rd42:
-    times 8 dw 0x04
-four8s:
-    times 4 dd 8
--- a/vp8/common/x86/postproc_x86.h
+++ /dev/null
@@ -1,64 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef POSTPROC_X86_H
-#define POSTPROC_X86_H
-
-/* Note:
- *
- * This platform is commonly built for runtime CPU detection. If you modify
- * any of the function mappings present in this file, be sure to also update
- * them in the function pointer initialization code
- */
-
-#if HAVE_MMX
-extern prototype_postproc_inplace(vp9_mbpost_proc_down_mmx);
-extern prototype_postproc(vp9_post_proc_down_and_across_mmx);
-extern prototype_postproc_addnoise(vp9_plane_add_noise_mmx);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp9_postproc_down
-#define vp9_postproc_down vp9_mbpost_proc_down_mmx
-
-#undef  vp9_postproc_downacross
-#define vp9_postproc_downacross vp9_post_proc_down_and_across_mmx
-
-#undef  vp9_postproc_addnoise
-#define vp9_postproc_addnoise vp9_plane_add_noise_mmx
-
-#endif
-#endif
-
-
-#if HAVE_SSE2
-extern prototype_postproc_inplace(vp9_mbpost_proc_down_xmm);
-extern prototype_postproc_inplace(vp9_mbpost_proc_across_ip_xmm);
-extern prototype_postproc(vp9_post_proc_down_and_across_xmm);
-extern prototype_postproc_addnoise(vp9_plane_add_noise_wmt);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp9_postproc_down
-#define vp9_postproc_down vp9_mbpost_proc_down_xmm
-
-#undef  vp9_postproc_across
-#define vp9_postproc_across vp9_mbpost_proc_across_ip_xmm
-
-#undef  vp9_postproc_downacross
-#define vp9_postproc_downacross vp9_post_proc_down_and_across_xmm
-
-#undef  vp9_postproc_addnoise
-#define vp9_postproc_addnoise vp9_plane_add_noise_wmt
-
-
-#endif
-#endif
-
-#endif
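Editor's note: with CONFIG_RUNTIME_CPU_DETECT disabled, the #undef/#define pairs in this header statically rebound the generic vp9_postproc_* hooks to the best compiled-in variant (the SSE2 block winning over MMX when both were built), so calls resolved at compile time with no function-pointer indirection; with runtime detection enabled, the same mappings had to be kept in sync with the function-pointer initialization code, as the note at the top of the file warns.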
--- a/vp8/common/x86/recon_mmx.asm
+++ /dev/null
@@ -1,321 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-;void vp9_recon_b_mmx(unsigned char *s, short *q, unsigned char *d, int stride)
-global sym(vp9_recon_b_mmx)
-sym(vp9_recon_b_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov       rsi, arg(0) ;s
-        mov       rdi, arg(2) ;d
-        mov       rdx, arg(1) ;q
-        movsxd    rax, dword ptr arg(3) ;stride
-        pxor      mm0, mm0
-
-        movd      mm1, [rsi]
-        punpcklbw mm1, mm0
-        paddsw    mm1, [rdx]
-        packuswb  mm1,  mm0              ; pack and unpack to saturate
-        movd      [rdi], mm1
-
-        movd      mm2, [rsi+16]
-        punpcklbw mm2, mm0
-        paddsw    mm2, [rdx+32]
-        packuswb  mm2, mm0              ; pack and unpack to saturate
-        movd      [rdi+rax], mm2
-
-        movd      mm3, [rsi+32]
-        punpcklbw mm3, mm0
-        paddsw    mm3, [rdx+64]
-        packuswb  mm3,  mm0              ; pack and unpack to saturate
-        movd      [rdi+2*rax], mm3
-
-        add       rdi, rax
-        movd      mm4, [rsi+48]
-        punpcklbw mm4, mm0
-        paddsw    mm4, [rdx+96]
-        packuswb  mm4, mm0              ; pack and unpack to saturate
-        movd      [rdi+2*rax], mm4
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
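Editor's note: a hedged C sketch of the 4x4 reconstruction above (reference only). The residual q is added to the predictor s with signed saturation, then packed back to bytes; per the offsets above, predictor rows sit 16 bytes apart and residual rows 16 shorts (32 bytes) apart:

    static void recon_b_c(const unsigned char *s, const short *q,
                          unsigned char *d, int stride) {
        for (int r = 0; r < 4; r++) {
            for (int c = 0; c < 4; c++) {
                int v = s[c] + q[c];
                d[c] = (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
            }
            s += 16;     /* predictor pitch in bytes */
            q += 16;     /* residual pitch in shorts */
            d += stride;
        }
    }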
-;void vp9_copy_mem8x8_mmx(
-;    unsigned char *src,
-;    int src_stride,
-;    unsigned char *dst,
-;    int dst_stride
-;    )
-global sym(vp9_copy_mem8x8_mmx)
-sym(vp9_copy_mem8x8_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi,        arg(0) ;src;
-        movq        mm0,        [rsi]
-
-        movsxd      rax,        dword ptr arg(1) ;src_stride;
-        mov         rdi,        arg(2) ;dst;
-
-        movq        mm1,        [rsi+rax]
-        movq        mm2,        [rsi+rax*2]
-
-        movsxd      rcx,        dword ptr arg(3) ;dst_stride
-        lea         rsi,        [rsi+rax*2]
-
-        movq        [rdi],      mm0
-        add         rsi,        rax
-
-        movq        [rdi+rcx],      mm1
-        movq        [rdi+rcx*2],    mm2
-
-
-        lea         rdi,        [rdi+rcx*2]
-        movq        mm3,        [rsi]
-
-        add         rdi,        rcx
-        movq        mm4,        [rsi+rax]
-
-        movq        mm5,        [rsi+rax*2]
-        movq        [rdi],      mm3
-
-        lea         rsi,        [rsi+rax*2]
-        movq        [rdi+rcx],  mm4
-
-        movq        [rdi+rcx*2],    mm5
-        lea         rdi,        [rdi+rcx*2]
-
-        movq        mm0,        [rsi+rax]
-        movq        mm1,        [rsi+rax*2]
-
-        movq        [rdi+rcx],  mm0
-        movq        [rdi+rcx*2],mm1
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
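Editor's note: the copy_mem routines here (8x8 above, 8x4 and 16x16 below) are plain strided block copies; the asm merely unrolls them and keeps several rows in flight per iteration. A hedged C equivalent (reference only):

    #include <string.h>

    static void copy_mem_c(const unsigned char *src, int src_stride,
                           unsigned char *dst, int dst_stride, int w, int h) {
        for (int r = 0; r < h; r++) {
            memcpy(dst, src, w);    /* w = 8 or 16; h = 4, 8 or 16 */
            src += src_stride;
            dst += dst_stride;
        }
    }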
-;void vp9_copy_mem8x4_mmx(
-;    unsigned char *src,
-;    int src_stride,
-;    unsigned char *dst,
-;    int dst_stride
-;    )
-global sym(vp9_copy_mem8x4_mmx)
-sym(vp9_copy_mem8x4_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi,        arg(0) ;src;
-        movq        mm0,        [rsi]
-
-        movsxd      rax,        dword ptr arg(1) ;src_stride;
-        mov         rdi,        arg(2) ;dst;
-
-        movq        mm1,        [rsi+rax]
-        movq        mm2,        [rsi+rax*2]
-
-        movsxd      rcx,        dword ptr arg(3) ;dst_stride
-        lea         rsi,        [rsi+rax*2]
-
-        movq        [rdi],      mm0
-        movq        [rdi+rcx],      mm1
-
-        movq        [rdi+rcx*2],    mm2
-        lea         rdi,        [rdi+rcx*2]
-
-        movq        mm3,        [rsi+rax]
-        movq        [rdi+rcx],      mm3
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_copy_mem16x16_mmx(
-;    unsigned char *src,
-;    int src_stride,
-;    unsigned char *dst,
-;    int dst_stride
-;    )
-global sym(vp9_copy_mem16x16_mmx)
-sym(vp9_copy_mem16x16_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi,        arg(0) ;src;
-        movsxd      rax,        dword ptr arg(1) ;src_stride;
-
-        mov         rdi,        arg(2) ;dst;
-        movsxd      rcx,        dword ptr arg(3) ;dst_stride
-
-        movq        mm0,            [rsi]
-        movq        mm3,            [rsi+8];
-
-        movq        mm1,            [rsi+rax]
-        movq        mm4,            [rsi+rax+8]
-
-        movq        mm2,            [rsi+rax*2]
-        movq        mm5,            [rsi+rax*2+8]
-
-        lea         rsi,            [rsi+rax*2]
-        add         rsi,            rax
-
-        movq        [rdi],          mm0
-        movq        [rdi+8],        mm3
-
-        movq        [rdi+rcx],      mm1
-        movq        [rdi+rcx+8],    mm4
-
-        movq        [rdi+rcx*2],    mm2
-        movq        [rdi+rcx*2+8],  mm5
-
-        lea         rdi,            [rdi+rcx*2]
-        add         rdi,            rcx
-
-        movq        mm0,            [rsi]
-        movq        mm3,            [rsi+8];
-
-        movq        mm1,            [rsi+rax]
-        movq        mm4,            [rsi+rax+8]
-
-        movq        mm2,            [rsi+rax*2]
-        movq        mm5,            [rsi+rax*2+8]
-
-        lea         rsi,            [rsi+rax*2]
-        add         rsi,            rax
-
-        movq        [rdi],          mm0
-        movq        [rdi+8],        mm3
-
-        movq        [rdi+rcx],      mm1
-        movq        [rdi+rcx+8],    mm4
-
-        movq        [rdi+rcx*2],    mm2
-        movq        [rdi+rcx*2+8],  mm5
-
-        lea         rdi,            [rdi+rcx*2]
-        add         rdi,            rcx
-
-        movq        mm0,            [rsi]
-        movq        mm3,            [rsi+8];
-
-        movq        mm1,            [rsi+rax]
-        movq        mm4,            [rsi+rax+8]
-
-        movq        mm2,            [rsi+rax*2]
-        movq        mm5,            [rsi+rax*2+8]
-
-        lea         rsi,            [rsi+rax*2]
-        add         rsi,            rax
-
-        movq        [rdi],          mm0
-        movq        [rdi+8],        mm3
-
-        movq        [rdi+rcx],      mm1
-        movq        [rdi+rcx+8],    mm4
-
-        movq        [rdi+rcx*2],    mm2
-        movq        [rdi+rcx*2+8],  mm5
-
-        lea         rdi,            [rdi+rcx*2]
-        add         rdi,            rcx
-
-        movq        mm0,            [rsi]
-        movq        mm3,            [rsi+8];
-
-        movq        mm1,            [rsi+rax]
-        movq        mm4,            [rsi+rax+8]
-
-        movq        mm2,            [rsi+rax*2]
-        movq        mm5,            [rsi+rax*2+8]
-
-        lea         rsi,            [rsi+rax*2]
-        add         rsi,            rax
-
-        movq        [rdi],          mm0
-        movq        [rdi+8],        mm3
-
-        movq        [rdi+rcx],      mm1
-        movq        [rdi+rcx+8],    mm4
-
-        movq        [rdi+rcx*2],    mm2
-        movq        [rdi+rcx*2+8],  mm5
-
-        lea         rdi,            [rdi+rcx*2]
-        add         rdi,            rcx
-
-        movq        mm0,            [rsi]
-        movq        mm3,            [rsi+8];
-
-        movq        mm1,            [rsi+rax]
-        movq        mm4,            [rsi+rax+8]
-
-        movq        mm2,            [rsi+rax*2]
-        movq        mm5,            [rsi+rax*2+8]
-
-        lea         rsi,            [rsi+rax*2]
-        add         rsi,            rax
-
-        movq        [rdi],          mm0
-        movq        [rdi+8],        mm3
-
-        movq        [rdi+rcx],      mm1
-        movq        [rdi+rcx+8],    mm4
-
-        movq        [rdi+rcx*2],    mm2
-        movq        [rdi+rcx*2+8],  mm5
-
-        lea         rdi,            [rdi+rcx*2]
-        add         rdi,            rcx
-
-        movq        mm0,            [rsi]
-        movq        mm3,            [rsi+8];
-
-        movq        [rdi],          mm0
-        movq        [rdi+8],        mm3
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
--- a/vp8/common/x86/recon_sse2.asm
+++ /dev/null
@@ -1,688 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-;void vp9_recon2b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
-global sym(vp9_recon2b_sse2)
-sym(vp9_recon2b_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi,        arg(0) ;s
-        mov         rdi,        arg(2) ;d
-        mov         rdx,        arg(1) ;q
-        movsxd      rax,        dword ptr arg(3) ;stride
-        pxor        xmm0,       xmm0
-
-        movq        xmm1,       MMWORD PTR [rsi]
-        punpcklbw   xmm1,       xmm0
-        paddsw      xmm1,       XMMWORD PTR [rdx]
-        packuswb    xmm1,       xmm0              ; pack and unpack to saturate
-        movq        MMWORD PTR [rdi],   xmm1
-
-
-        movq        xmm2,       MMWORD PTR [rsi+8]
-        punpcklbw   xmm2,       xmm0
-        paddsw      xmm2,       XMMWORD PTR [rdx+16]
-        packuswb    xmm2,       xmm0              ; pack and unpack to saturate
-        movq        MMWORD PTR [rdi+rax],   xmm2
-
-
-        movq        xmm3,       MMWORD PTR [rsi+16]
-        punpcklbw   xmm3,       xmm0
-        paddsw      xmm3,       XMMWORD PTR [rdx+32]
-        packuswb    xmm3,       xmm0              ; pack and unpack to saturate
-        movq        MMWORD PTR [rdi+rax*2], xmm3
-
-        add         rdi, rax
-        movq        xmm4,       MMWORD PTR [rsi+24]
-        punpcklbw   xmm4,       xmm0
-        paddsw      xmm4,       XMMWORD PTR [rdx+48]
-        packuswb    xmm4,       xmm0              ; pack and unpack to saturate
-        movq        MMWORD PTR [rdi+rax*2], xmm4
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_recon4b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
-global sym(vp9_recon4b_sse2)
-sym(vp9_recon4b_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi,        arg(0) ;s
-        mov         rdi,        arg(2) ;d
-        mov         rdx,        arg(1) ;q
-        movsxd      rax,        dword ptr arg(3) ;stride
-        pxor        xmm0,       xmm0
-
-        movdqa      xmm1,       XMMWORD PTR [rsi]
-        movdqa      xmm5,       xmm1
-        punpcklbw   xmm1,       xmm0
-        punpckhbw   xmm5,       xmm0
-        paddsw      xmm1,       XMMWORD PTR [rdx]
-        paddsw      xmm5,       XMMWORD PTR [rdx+16]
-        packuswb    xmm1,       xmm5              ; pack and unpack to saturate
-        movdqa      XMMWORD PTR [rdi],  xmm1
-
-
-        movdqa      xmm2,       XMMWORD PTR [rsi+16]
-        movdqa      xmm6,       xmm2
-        punpcklbw   xmm2,       xmm0
-        punpckhbw   xmm6,       xmm0
-        paddsw      xmm2,       XMMWORD PTR [rdx+32]
-        paddsw      xmm6,       XMMWORD PTR [rdx+48]
-        packuswb    xmm2,       xmm6              ; pack and unpack to saturate
-        movdqa      XMMWORD PTR [rdi+rax],  xmm2
-
-
-        movdqa      xmm3,       XMMWORD PTR [rsi+32]
-        movdqa      xmm7,       xmm3
-        punpcklbw   xmm3,       xmm0
-        punpckhbw   xmm7,       xmm0
-        paddsw      xmm3,       XMMWORD PTR [rdx+64]
-        paddsw      xmm7,       XMMWORD PTR [rdx+80]
-        packuswb    xmm3,       xmm7              ; pack and unpack to saturate
-        movdqa      XMMWORD PTR [rdi+rax*2],    xmm3
-
-        add       rdi, rax
-        movdqa      xmm4,       XMMWORD PTR [rsi+48]
-        movdqa      xmm5,       xmm4
-        punpcklbw   xmm4,       xmm0
-        punpckhbw   xmm5,       xmm0
-        paddsw      xmm4,       XMMWORD PTR [rdx+96]
-        paddsw      xmm5,       XMMWORD PTR [rdx+112]
-        packuswb    xmm4,       xmm5              ; pack and unpack to saturate
-        movdqa      XMMWORD PTR [rdi+rax*2],    xmm4
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_copy_mem16x16_sse2(
-;    unsigned char *src,
-;    int src_stride,
-;    unsigned char *dst,
-;    int dst_stride
-;    )
-global sym(vp9_copy_mem16x16_sse2)
-sym(vp9_copy_mem16x16_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi,        arg(0) ;src;
-        movdqu      xmm0,       [rsi]
-
-        movsxd      rax,        dword ptr arg(1) ;src_stride;
-        mov         rdi,        arg(2) ;dst;
-
-        movdqu      xmm1,       [rsi+rax]
-        movdqu      xmm2,       [rsi+rax*2]
-
-        movsxd      rcx,        dword ptr arg(3) ;dst_stride
-        lea         rsi,        [rsi+rax*2]
-
-        movdqa      [rdi],      xmm0
-        add         rsi,        rax
-
-        movdqa      [rdi+rcx],  xmm1
-        movdqa      [rdi+rcx*2],xmm2
-
-        lea         rdi,        [rdi+rcx*2]
-        movdqu      xmm3,       [rsi]
-
-        add         rdi,        rcx
-        movdqu      xmm4,       [rsi+rax]
-
-        movdqu      xmm5,       [rsi+rax*2]
-        lea         rsi,        [rsi+rax*2]
-
-        movdqa      [rdi],  xmm3
-        add         rsi,        rax
-
-        movdqa      [rdi+rcx],  xmm4
-        movdqa      [rdi+rcx*2],xmm5
-
-        lea         rdi,        [rdi+rcx*2]
-        movdqu      xmm0,       [rsi]
-
-        add         rdi,        rcx
-        movdqu      xmm1,       [rsi+rax]
-
-        movdqu      xmm2,       [rsi+rax*2]
-        lea         rsi,        [rsi+rax*2]
-
-        movdqa      [rdi],      xmm0
-        add         rsi,        rax
-
-        movdqa      [rdi+rcx],  xmm1
-
-        movdqa      [rdi+rcx*2],    xmm2
-        movdqu      xmm3,       [rsi]
-
-        movdqu      xmm4,       [rsi+rax]
-        lea         rdi,        [rdi+rcx*2]
-
-        add         rdi,        rcx
-        movdqu      xmm5,       [rsi+rax*2]
-
-        lea         rsi,        [rsi+rax*2]
-        movdqa      [rdi],  xmm3
-
-        add         rsi,        rax
-        movdqa      [rdi+rcx],  xmm4
-
-        movdqa      [rdi+rcx*2],xmm5
-        movdqu      xmm0,       [rsi]
-
-        lea         rdi,        [rdi+rcx*2]
-        movdqu      xmm1,       [rsi+rax]
-
-        add         rdi,        rcx
-        movdqu      xmm2,       [rsi+rax*2]
-
-        lea         rsi,        [rsi+rax*2]
-        movdqa      [rdi],      xmm0
-
-        movdqa      [rdi+rcx],  xmm1
-        movdqa      [rdi+rcx*2],xmm2
-
-        movdqu      xmm3,       [rsi+rax]
-        lea         rdi,        [rdi+rcx*2]
-
-        movdqa      [rdi+rcx],  xmm3
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
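The 16x16 copy above is sixteen rows of one unaligned 16-byte load (movdqu) and
one aligned store (movdqa), software-pipelined across registers. A scalar
equivalent, assuming only that dst is 16-byte aligned as the stores require:

    #include <string.h>

    static void copy_mem16x16_c(const unsigned char *src, int src_stride,
                                unsigned char *dst, int dst_stride) {
      int r;
      for (r = 0; r < 16; r++) {
        memcpy(dst, src, 16);  /* movdqu load + movdqa store per row */
        src += src_stride;
        dst += dst_stride;
      }
    }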
-;void vp9_intra_pred_uv_dc_mmx2(
-;    unsigned char *dst,
-;    int dst_stride,
-;    unsigned char *src,
-;    int src_stride
-;    )
-global sym(vp9_intra_pred_uv_dc_mmx2)
-sym(vp9_intra_pred_uv_dc_mmx2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ; from top
-    mov         rsi,        arg(2) ;src;
-    movsxd      rax,        dword ptr arg(3) ;src_stride;
-    sub         rsi,        rax
-    pxor        mm0,        mm0
-    movq        mm1,        [rsi]
-    psadbw      mm1,        mm0
-
-    ; from left
-    dec         rsi
-    lea         rdi,        [rax*3]
-    movzx       ecx,        byte [rsi+rax]
-    movzx       edx,        byte [rsi+rax*2]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rdi]
-    add         ecx,        edx
-    lea         rsi,        [rsi+rax*4]
-    movzx       edx,        byte [rsi]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rax]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rax*2]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rdi]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rax*4]
-    add         ecx,        edx
-
-    ; add up
-    pextrw      edx,        mm1, 0x0
-    lea         edx,        [edx+ecx+8]
-    sar         edx,        4
-    movd        mm1,        edx
-    pshufw      mm1,        mm1, 0x0
-    packuswb    mm1,        mm1
-
-    ; write out
-    mov         rdi,        arg(0) ;dst;
-    movsxd      rcx,        dword ptr arg(1) ;dst_stride
-    lea         rax,        [rcx*3]
-
-    movq [rdi      ],       mm1
-    movq [rdi+rcx  ],       mm1
-    movq [rdi+rcx*2],       mm1
-    movq [rdi+rax  ],       mm1
-    lea         rdi,        [rdi+rcx*4]
-    movq [rdi      ],       mm1
-    movq [rdi+rcx  ],       mm1
-    movq [rdi+rcx*2],       mm1
-    movq [rdi+rax  ],       mm1
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
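The DC variants here differ only in which neighbors are available: the full
version above averages the 8 top and 8 left pixels as (sum + 8) >> 4, dctop and
dcleft below average a single side as (sum + 4) >> 3, and dc128 falls back to
the constant 128. A scalar sketch of the full case, under the same convention
that src points at the top-left pixel of the block:

    #include <string.h>

    static void intra_pred_uv_dc_c(unsigned char *dst, int dst_stride,
                                   const unsigned char *src, int src_stride) {
      const unsigned char *top = src - src_stride;  /* row above the block */
      int i, sum = 0;
      for (i = 0; i < 8; i++)
        sum += top[i] + src[i * src_stride - 1];    /* top row + left column */
      for (i = 0; i < 8; i++)                       /* fill the 8x8 block */
        memset(dst + i * dst_stride, (sum + 8) >> 4, 8);
    }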
-;void vp9_intra_pred_uv_dctop_mmx2(
-;    unsigned char *dst,
-;    int dst_stride,
-;    unsigned char *src,
-;    int src_stride
-;    )
-global sym(vp9_intra_pred_uv_dctop_mmx2)
-sym(vp9_intra_pred_uv_dctop_mmx2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ; from top
-    mov         rsi,        arg(2) ;src;
-    movsxd      rax,        dword ptr arg(3) ;src_stride;
-    sub         rsi,        rax
-    pxor        mm0,        mm0
-    movq        mm1,        [rsi]
-    psadbw      mm1,        mm0
-
-    ; add up
-    paddw       mm1,        [GLOBAL(dc_4)]
-    psraw       mm1,        3
-    pshufw      mm1,        mm1, 0x0
-    packuswb    mm1,        mm1
-
-    ; write out
-    mov         rdi,        arg(0) ;dst;
-    movsxd      rcx,        dword ptr arg(1) ;dst_stride
-    lea         rax,        [rcx*3]
-
-    movq [rdi      ],       mm1
-    movq [rdi+rcx  ],       mm1
-    movq [rdi+rcx*2],       mm1
-    movq [rdi+rax  ],       mm1
-    lea         rdi,        [rdi+rcx*4]
-    movq [rdi      ],       mm1
-    movq [rdi+rcx  ],       mm1
-    movq [rdi+rcx*2],       mm1
-    movq [rdi+rax  ],       mm1
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_intra_pred_uv_dcleft_mmx2(
-;    unsigned char *dst,
-;    int dst_stride,
-;    unsigned char *src,
-;    int src_stride
-;    )
-global sym(vp9_intra_pred_uv_dcleft_mmx2)
-sym(vp9_intra_pred_uv_dcleft_mmx2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ; from left
-    mov         rsi,        arg(2) ;src;
-    movsxd      rax,        dword ptr arg(3) ;src_stride;
-    dec         rsi
-    lea         rdi,        [rax*3]
-    movzx       ecx,        byte [rsi]
-    movzx       edx,        byte [rsi+rax]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rax*2]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rdi]
-    add         ecx,        edx
-    lea         rsi,        [rsi+rax*4]
-    movzx       edx,        byte [rsi]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rax]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rax*2]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rdi]
-    lea         edx,        [ecx+edx+4]
-
-    ; add up
-    shr         edx,        3
-    movd        mm1,        edx
-    pshufw      mm1,        mm1, 0x0
-    packuswb    mm1,        mm1
-
-    ; write out
-    mov         rdi,        arg(0) ;dst;
-    movsxd      rcx,        dword ptr arg(1) ;dst_stride
-    lea         rax,        [rcx*3]
-
-    movq [rdi      ],       mm1
-    movq [rdi+rcx  ],       mm1
-    movq [rdi+rcx*2],       mm1
-    movq [rdi+rax  ],       mm1
-    lea         rdi,        [rdi+rcx*4]
-    movq [rdi      ],       mm1
-    movq [rdi+rcx  ],       mm1
-    movq [rdi+rcx*2],       mm1
-    movq [rdi+rax  ],       mm1
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_intra_pred_uv_dc128_mmx(
-;    unsigned char *dst,
-;    int dst_stride,
-;    unsigned char *src,
-;    int src_stride
-;    )
-global sym(vp9_intra_pred_uv_dc128_mmx)
-sym(vp9_intra_pred_uv_dc128_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    GET_GOT     rbx
-    ; end prolog
-
-    ; write out
-    movq        mm1,        [GLOBAL(dc_128)]
-    mov         rax,        arg(0) ;dst;
-    movsxd      rdx,        dword ptr arg(1) ;dst_stride
-    lea         rcx,        [rdx*3]
-
-    movq [rax      ],       mm1
-    movq [rax+rdx  ],       mm1
-    movq [rax+rdx*2],       mm1
-    movq [rax+rcx  ],       mm1
-    lea         rax,        [rax+rdx*4]
-    movq [rax      ],       mm1
-    movq [rax+rdx  ],       mm1
-    movq [rax+rdx*2],       mm1
-    movq [rax+rcx  ],       mm1
-
-    ; begin epilog
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_intra_pred_uv_tm_sse2(
-;    unsigned char *dst,
-;    int dst_stride,
-;    unsigned char *src,
-;    int src_stride
-;    )
-%macro vp9_intra_pred_uv_tm 1
-global sym(vp9_intra_pred_uv_tm_%1)
-sym(vp9_intra_pred_uv_tm_%1):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ; read top row
-    mov         edx,        4
-    mov         rsi,        arg(2) ;src;
-    movsxd      rax,        dword ptr arg(3) ;src_stride;
-    sub         rsi,        rax
-    pxor        xmm0,       xmm0
-%ifidn %1, ssse3
-    movdqa      xmm2,       [GLOBAL(dc_1024)]
-%endif
-    movq        xmm1,       [rsi]
-    punpcklbw   xmm1,       xmm0
-
-    ; set up left ptrs and subtract topleft
-    movd        xmm3,       [rsi-1]
-    lea         rsi,        [rsi+rax-1]
-%ifidn %1, sse2
-    punpcklbw   xmm3,       xmm0
-    pshuflw     xmm3,       xmm3, 0x0
-    punpcklqdq  xmm3,       xmm3
-%else
-    pshufb      xmm3,       xmm2
-%endif
-    psubw       xmm1,       xmm3
-
-    ; set up dest ptrs
-    mov         rdi,        arg(0) ;dst;
-    movsxd      rcx,        dword ptr arg(1) ;dst_stride
-
-.vp9_intra_pred_uv_tm_%1_loop:
-    movd        xmm3,       [rsi]
-    movd        xmm5,       [rsi+rax]
-%ifidn %1, sse2
-    punpcklbw   xmm3,       xmm0
-    punpcklbw   xmm5,       xmm0
-    pshuflw     xmm3,       xmm3, 0x0
-    pshuflw     xmm5,       xmm5, 0x0
-    punpcklqdq  xmm3,       xmm3
-    punpcklqdq  xmm5,       xmm5
-%else
-    pshufb      xmm3,       xmm2
-    pshufb      xmm5,       xmm2
-%endif
-    paddw       xmm3,       xmm1
-    paddw       xmm5,       xmm1
-    packuswb    xmm3,       xmm5
-    movq  [rdi    ],        xmm3
-    movhps[rdi+rcx],        xmm3
-    lea         rsi,        [rsi+rax*2]
-    lea         rdi,        [rdi+rcx*2]
-    dec         edx
-    jnz .vp9_intra_pred_uv_tm_%1_loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-%endmacro
-
-vp9_intra_pred_uv_tm sse2
-vp9_intra_pred_uv_tm ssse3
-
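TM ("true motion") prediction computes dst[y][x] = clip(left[y] + top[x] -
topleft). The macro above broadcasts (top[x] - topleft) once into xmm1, then
adds each row's left pixel and lets packuswb do the clipping. A scalar sketch
with illustrative names:

    static void intra_pred_uv_tm_c(unsigned char *dst, int dst_stride,
                                   const unsigned char *src, int src_stride) {
      const unsigned char *top = src - src_stride;
      int x, y;
      for (y = 0; y < 8; y++) {
        int left = src[y * src_stride - 1];
        for (x = 0; x < 8; x++) {
          int v = left + top[x] - top[-1];  /* top[-1] is the top-left pixel */
          dst[y * dst_stride + x] =
              (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
        }
      }
    }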
-;void vp9_intra_pred_uv_ve_mmx(
-;    unsigned char *dst,
-;    int dst_stride,
-;    unsigned char *src,
-;    int src_stride
-;    )
-global sym(vp9_intra_pred_uv_ve_mmx)
-sym(vp9_intra_pred_uv_ve_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    ; end prolog
-
-    ; read from top
-    mov         rax,        arg(2) ;src;
-    movsxd      rdx,        dword ptr arg(3) ;src_stride;
-    sub         rax,        rdx
-    movq        mm1,        [rax]
-
-    ; write out
-    mov         rax,        arg(0) ;dst;
-    movsxd      rdx,        dword ptr arg(1) ;dst_stride
-    lea         rcx,        [rdx*3]
-
-    movq [rax      ],       mm1
-    movq [rax+rdx  ],       mm1
-    movq [rax+rdx*2],       mm1
-    movq [rax+rcx  ],       mm1
-    lea         rax,        [rax+rdx*4]
-    movq [rax      ],       mm1
-    movq [rax+rdx  ],       mm1
-    movq [rax+rdx*2],       mm1
-    movq [rax+rcx  ],       mm1
-
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_intra_pred_uv_ho_mmx2(
-;    unsigned char *dst,
-;    int dst_stride,
-;    unsigned char *src,
-;    int src_stride
-;    )
-%macro vp9_intra_pred_uv_ho 1
-global sym(vp9_intra_pred_uv_ho_%1)
-sym(vp9_intra_pred_uv_ho_%1):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push        rsi
-    push        rdi
-%ifidn %1, ssse3
-%ifndef GET_GOT_SAVE_ARG
-    push        rbx
-%endif
-    GET_GOT     rbx
-%endif
-    ; end prolog
-
-    ; read from left and write out
-%ifidn %1, mmx2
-    mov         edx,        4
-%endif
-    mov         rsi,        arg(2) ;src;
-    movsxd      rax,        dword ptr arg(3) ;src_stride;
-    mov         rdi,        arg(0) ;dst;
-    movsxd      rcx,        dword ptr arg(1) ;dst_stride
-%ifidn %1, ssse3
-    lea         rdx,        [rcx*3]
-    movdqa      xmm2,       [GLOBAL(dc_00001111)]
-    lea         rbx,        [rax*3]
-%endif
-    dec         rsi
-%ifidn %1, mmx2
-.vp9_intra_pred_uv_ho_%1_loop:
-    movd        mm0,        [rsi]
-    movd        mm1,        [rsi+rax]
-    punpcklbw   mm0,        mm0
-    punpcklbw   mm1,        mm1
-    pshufw      mm0,        mm0, 0x0
-    pshufw      mm1,        mm1, 0x0
-    movq  [rdi    ],        mm0
-    movq  [rdi+rcx],        mm1
-    lea         rsi,        [rsi+rax*2]
-    lea         rdi,        [rdi+rcx*2]
-    dec         edx
-    jnz .vp9_intra_pred_uv_ho_%1_loop
-%else
-    movd        xmm0,       [rsi]
-    movd        xmm3,       [rsi+rax]
-    movd        xmm1,       [rsi+rax*2]
-    movd        xmm4,       [rsi+rbx]
-    punpcklbw   xmm0,       xmm3
-    punpcklbw   xmm1,       xmm4
-    pshufb      xmm0,       xmm2
-    pshufb      xmm1,       xmm2
-    movq   [rdi    ],       xmm0
-    movhps [rdi+rcx],       xmm0
-    movq [rdi+rcx*2],       xmm1
-    movhps [rdi+rdx],       xmm1
-    lea         rsi,        [rsi+rax*4]
-    lea         rdi,        [rdi+rcx*4]
-    movd        xmm0,       [rsi]
-    movd        xmm3,       [rsi+rax]
-    movd        xmm1,       [rsi+rax*2]
-    movd        xmm4,       [rsi+rbx]
-    punpcklbw   xmm0,       xmm3
-    punpcklbw   xmm1,       xmm4
-    pshufb      xmm0,       xmm2
-    pshufb      xmm1,       xmm2
-    movq   [rdi    ],       xmm0
-    movhps [rdi+rcx],       xmm0
-    movq [rdi+rcx*2],       xmm1
-    movhps [rdi+rdx],       xmm1
-%endif
-
-    ; begin epilog
-%ifidn %1, ssse3
-    RESTORE_GOT
-%ifndef GET_GOT_SAVE_ARG
-    pop         rbx
-%endif
-%endif
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-%endmacro
-
-vp9_intra_pred_uv_ho mmx2
-vp9_intra_pred_uv_ho ssse3
-
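Horizontal prediction just replicates each left-neighbor pixel across its row;
the mmx2 path broadcasts it with punpcklbw + pshufw, the ssse3 path with a
single pshufb against dc_00001111. In scalar form (memset standing in for the
broadcast):

    #include <string.h>

    static void intra_pred_uv_ho_c(unsigned char *dst, int dst_stride,
                                   const unsigned char *src, int src_stride) {
      int y;
      for (y = 0; y < 8; y++)
        memset(dst + y * dst_stride, src[y * src_stride - 1], 8);
    }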
-SECTION_RODATA
-dc_128:
-    times 8 db 128
-dc_4:
-    times 4 dw 4
-align 16
-dc_1024:
-    times 8 dw 0x400
-align 16
-dc_00001111:
-    times 8 db 0
-    times 8 db 1
--- a/vp8/common/x86/recon_wrapper_sse2.c
+++ /dev/null
@@ -1,101 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_ports/config.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp8/common/blockd.h"
-
-#define build_intra_predictors_mbuv_prototype(sym) \
-  void sym(unsigned char *dst, int dst_stride, \
-           const unsigned char *src, int src_stride)
-typedef build_intra_predictors_mbuv_prototype((*build_intra_pred_mbuv_fn_t));
-
-extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_dc_mmx2);
-extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_dctop_mmx2);
-extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_dcleft_mmx2);
-extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_dc128_mmx);
-extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_ho_mmx2);
-extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_ho_ssse3);
-extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_ve_mmx);
-extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_tm_sse2);
-extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_tm_ssse3);
-
-static void build_intra_predictors_mbuv_x86(MACROBLOCKD *xd,
-                                            unsigned char *dst_u,
-                                            unsigned char *dst_v,
-                                            int dst_stride,
-                                            build_intra_pred_mbuv_fn_t tm_fn,
-                                            build_intra_pred_mbuv_fn_t ho_fn) {
-  int mode = xd->mode_info_context->mbmi.uv_mode;
-  build_intra_pred_mbuv_fn_t fn;
-  int src_stride = xd->dst.uv_stride;
-
-  switch (mode) {
-    case  V_PRED:
-      fn = vp9_intra_pred_uv_ve_mmx;
-      break;
-    case  H_PRED:
-      fn = ho_fn;
-      break;
-    case TM_PRED:
-      fn = tm_fn;
-      break;
-    case DC_PRED:
-      if (xd->up_available) {
-        fn = xd->left_available ? vp9_intra_pred_uv_dc_mmx2
-                                : vp9_intra_pred_uv_dctop_mmx2;
-      } else {
-        fn = xd->left_available ? vp9_intra_pred_uv_dcleft_mmx2
-                                : vp9_intra_pred_uv_dc128_mmx;
-      }
-      break;
-    default:
-      return;
-  }
-
-  fn(dst_u, dst_stride, xd->dst.u_buffer, src_stride);
-  fn(dst_v, dst_stride, xd->dst.v_buffer, src_stride);
-}
-
-void vp9_build_intra_predictors_mbuv_sse2(MACROBLOCKD *xd) {
-  build_intra_predictors_mbuv_x86(xd, &xd->predictor[256],
-                                  &xd->predictor[320], 8,
-                                  vp9_intra_pred_uv_tm_sse2,
-                                  vp9_intra_pred_uv_ho_mmx2);
-}
-
-void vp9_build_intra_predictors_mbuv_ssse3(MACROBLOCKD *xd) {
-  build_intra_predictors_mbuv_x86(xd, &xd->predictor[256],
-                                  &xd->predictor[320], 8,
-                                  vp9_intra_pred_uv_tm_ssse3,
-                                  vp9_intra_pred_uv_ho_ssse3);
-}
-
-void vp9_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *xd) {
-  build_intra_predictors_mbuv_x86(xd, xd->dst.u_buffer,
-                                  xd->dst.v_buffer, xd->dst.uv_stride,
-                                  vp9_intra_pred_uv_tm_sse2,
-                                  vp9_intra_pred_uv_ho_mmx2);
-}
-
-void vp9_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *xd) {
-  build_intra_predictors_mbuv_x86(xd, xd->dst.u_buffer,
-                                  xd->dst.v_buffer, xd->dst.uv_stride,
-                                  vp9_intra_pred_uv_tm_ssse3,
-                                  vp9_intra_pred_uv_ho_ssse3);
-}
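For orientation, the predictor offsets used by the first pair of wrappers
correspond to the packed macroblock prediction buffer, while the "_s" pair
predicts in place into the frame buffers at uv_stride. A sketch of the layout
these offsets imply (an assumption drawn from the offsets, not the struct
definition itself):

    /*
     *  xd->predictor[  0..255]  16x16 Y prediction block
     *  xd->predictor[256..319]   8x8  U prediction block, stride 8
     *  xd->predictor[320..383]   8x8  V prediction block, stride 8
     */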
--- a/vp8/common/x86/sadmxn_x86.c
+++ /dev/null
@@ -1,92 +1,0 @@
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <emmintrin.h>  // SSE2
-#include "./vpx_config.h"
-#include "./vpx_rtcd.h"
-
-
-#if CONFIG_NEWBESTREFMV
-
-
-#if HAVE_SSE2
-unsigned int vp9_sad16x3_sse2(
-  const unsigned char *src_ptr,
-  int  src_stride,
-  const unsigned char *ref_ptr,
-  int  ref_stride,
-  int max_sad) {
-  __m128i s0, s1, s2;
-  __m128i r0, r1, r2;
-  __m128i sad;
-
-  (void)max_sad;
-
-  s0 = _mm_loadu_si128((const __m128i *)(src_ptr + 0 * src_stride));
-  s1 = _mm_loadu_si128((const __m128i *)(src_ptr + 1 * src_stride));
-  s2 = _mm_loadu_si128((const __m128i *)(src_ptr + 2 * src_stride));
-
-  r0 = _mm_loadu_si128((const __m128i *)(ref_ptr + 0 * ref_stride));
-  r1 = _mm_loadu_si128((const __m128i *)(ref_ptr + 1 * ref_stride));
-  r2 = _mm_loadu_si128((const __m128i *)(ref_ptr + 2 * ref_stride));
-
-  sad = _mm_sad_epu8(s0, r0);
-  sad = _mm_add_epi16(sad,  _mm_sad_epu8(s1, r1));
-  sad = _mm_add_epi16(sad,  _mm_sad_epu8(s2, r2));
-  sad = _mm_add_epi16(sad,  _mm_srli_si128(sad, 8));
-
-  return _mm_cvtsi128_si32(sad);
-}
-
-unsigned int vp9_sad3x16_sse2(
-  const unsigned char *src_ptr,
-  int  src_stride,
-  const unsigned char *ref_ptr,
-  int  ref_stride,
-  int max_sad) {
-  int r;
-  __m128i s0, s1, s2, s3;
-  __m128i r0, r1, r2, r3;
-  __m128i sad = _mm_set1_epi16(0);
-  for (r = 0; r < 16; r += 4) {
-    s0 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 0 * src_stride));
-    s1 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 1 * src_stride));
-    s2 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 2 * src_stride));
-    s3 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 3 * src_stride));
-    r0 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 0 * ref_stride));
-    r1 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 1 * ref_stride));
-    r2 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 2 * ref_stride));
-    r3 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 3 * ref_stride));
-
-    s0 = _mm_unpacklo_epi8(s0, s1);
-    r0 = _mm_unpacklo_epi8(r0, r1);
-    s2 = _mm_unpacklo_epi8(s2, s3);
-    r2 = _mm_unpacklo_epi8(r2, r3);
-    s0 = _mm_unpacklo_epi64(s0, s2);
-    r0 = _mm_unpacklo_epi64(r0, r2);
-
-    // throw out byte 3
-    s0 = _mm_slli_epi64(s0, 16);
-    r0 = _mm_slli_epi64(r0, 16);
-
-    sad = _mm_add_epi16(sad, _mm_sad_epu8(s0, r0));
-
-    src_ptr += src_stride*4;
-    ref_ptr += ref_stride*4;
-  }
-
-  sad = _mm_add_epi16(sad,  _mm_srli_si128(sad, 8));
-  return _mm_cvtsi128_si32(sad);
-}
-
-#endif
-
-
-#endif  // CONFIG_NEWBESTREFMV
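Both kernels compute a plain sum of absolute differences; the 3x16 version
masks off the fourth column (the "throw out byte 3" shift) so only three pixels
per row contribute, and max_sad is accepted but unused. A scalar reference
sketch:

    #include <stdlib.h>

    static unsigned int sad_c(const unsigned char *src, int src_stride,
                              const unsigned char *ref, int ref_stride,
                              int width, int height) {
      unsigned int sad = 0;
      int r, c;
      for (r = 0; r < height; r++)
        for (c = 0; c < width; c++)
          sad += abs(src[r * src_stride + c] - ref[r * ref_stride + c]);
      return sad;  /* width=16,height=3 and width=3,height=16 above */
    }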
--- a/vp8/common/x86/subpixel_8t_ssse3.asm
+++ /dev/null
@@ -1,550 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;/************************************************************************************
-; Notes: filter_block1d_h8 applies an 8 tap filter horizontally to the input pixels. The
-; input pixel array has output_height rows. This function handles 8 pixels in the
-; horizontal direction, calculating ONE row each iteration to take advantage of the
-; 128 bit operations.
-;
-; This is an implementation of some of the SSE optimizations first seen in ffvp8.
-;
-;*************************************************************************************/
-
-;void vp9_filter_block1d8_v8_ssse3
-;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    short *filter
-;)
-global sym(vp9_filter_block1d8_v8_ssse3)
-sym(vp9_filter_block1d8_v8_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16*5
-    %define k0k1 [rsp + 16*0]
-    %define k2k3 [rsp + 16*1]
-    %define k4k5 [rsp + 16*2]
-    %define k6k7 [rsp + 16*3]
-    %define krd [rsp + 16*4]
-
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-    mov         rcx, 0x0400040
-
-    movdqa      xmm4, [rdx]                 ;load filters
-    movd        xmm5, rcx
-    packsswb    xmm4, xmm4
-    pshuflw     xmm0, xmm4, 0b              ;k0_k1
-    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
-    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
-    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
-
-    punpcklqdq  xmm0, xmm0
-    punpcklqdq  xmm1, xmm1
-    punpcklqdq  xmm2, xmm2
-    punpcklqdq  xmm3, xmm3
-
-    movdqa      k0k1, xmm0
-    movdqa      k2k3, xmm1
-    pshufd      xmm5, xmm5, 0
-    movdqa      k4k5, xmm2
-    movdqa      k6k7, xmm3
-    movdqa      krd, xmm5
-
-    movsxd      rdx, DWORD PTR arg(1)       ;src_pitch
-
-%if ABI_IS_32BIT=0
-    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
-%endif
-    mov         rax, rsi
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-    add         rax, rdx
-
-    lea         rbx, [rdx + rdx*4]
-    add         rbx, rdx                    ;pitch * 6
-
-.vp9_filter_block1d8_v8_ssse3_loop:
-    movq        xmm0, [rsi]                 ;A
-    movq        xmm1, [rsi + rdx]           ;B
-    movq        xmm2, [rsi + rdx * 2]       ;C
-    movq        xmm3, [rax + rdx * 2]       ;D
-    movq        xmm4, [rsi + rdx * 4]       ;E
-    movq        xmm5, [rax + rdx * 4]       ;F
-
-    punpcklbw   xmm0, xmm1                  ;A B
-    punpcklbw   xmm2, xmm3                  ;C D
-    punpcklbw   xmm4, xmm5                  ;E F
-
-    movq        xmm6, [rsi + rbx]           ;G
-    movq        xmm7, [rax + rbx]           ;H
-
-    pmaddubsw   xmm0, k0k1
-    pmaddubsw   xmm2, k2k3
-    punpcklbw   xmm6, xmm7                  ;G H
-    pmaddubsw   xmm4, k4k5
-    pmaddubsw   xmm6, k6k7
-
-    paddsw      xmm0, xmm2
-    paddsw      xmm0, krd
-    paddsw      xmm4, xmm6
-    paddsw      xmm0, xmm4
-
-    psraw       xmm0, 7
-    packuswb    xmm0, xmm0
-
-    add         rsi,  rdx
-    add         rax,  rdx
-
-    movq        [rdi], xmm0
-
-%if ABI_IS_32BIT
-    add         rdi, DWORD PTR arg(3)       ;out_pitch
-%else
-    add         rdi, r8
-%endif
-    dec         rcx
-    jnz         .vp9_filter_block1d8_v8_ssse3_loop
-
-    add rsp, 16*5
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
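Each output pixel above is an 8-tap convolution down a column: the taps are
packed into the k0k1..k6k7 pairs so pmaddubsw can multiply-accumulate two
neighboring rows at once, krd holds the rounding constant 64, and the sum is
shifted right by 7 (the taps sum to 128). A scalar sketch of one output pixel,
assuming src already points at the first tap's row:

    static unsigned char filter8_v_c(const unsigned char *src, int pitch,
                                     const short *filter) {
      int k, sum = 64;                  /* krd: the 0x40 rounding constant */
      for (k = 0; k < 8; k++)
        sum += src[k * pitch] * filter[k];
      sum >>= 7;
      return (unsigned char)(sum < 0 ? 0 : sum > 255 ? 255 : sum);
    }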
-;void vp9_filter_block1d16_v8_ssse3
-;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    short *filter
-;)
-global sym(vp9_filter_block1d16_v8_ssse3)
-sym(vp9_filter_block1d16_v8_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16*5
-    %define k0k1 [rsp + 16*0]
-    %define k2k3 [rsp + 16*1]
-    %define k4k5 [rsp + 16*2]
-    %define k6k7 [rsp + 16*3]
-    %define krd [rsp + 16*4]
-
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-    mov         rcx, 0x0400040
-
-    movdqa      xmm4, [rdx]                 ;load filters
-    movd        xmm5, rcx
-    packsswb    xmm4, xmm4
-    pshuflw     xmm0, xmm4, 0b              ;k0_k1
-    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
-    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
-    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
-
-    punpcklqdq  xmm0, xmm0
-    punpcklqdq  xmm1, xmm1
-    punpcklqdq  xmm2, xmm2
-    punpcklqdq  xmm3, xmm3
-
-    movdqa      k0k1, xmm0
-    movdqa      k2k3, xmm1
-    pshufd      xmm5, xmm5, 0
-    movdqa      k4k5, xmm2
-    movdqa      k6k7, xmm3
-    movdqa      krd, xmm5
-
-    movsxd      rdx, DWORD PTR arg(1)       ;src_pitch
-
-%if ABI_IS_32BIT=0
-    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
-%endif
-    mov         rax, rsi
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-    add         rax, rdx
-
-    lea         rbx, [rdx + rdx*4]
-    add         rbx, rdx                    ;pitch * 6
-
-.vp9_filter_block1d16_v8_ssse3_loop:
-    movq        xmm0, [rsi]                 ;A
-    movq        xmm1, [rsi + rdx]           ;B
-    movq        xmm2, [rsi + rdx * 2]       ;C
-    movq        xmm3, [rax + rdx * 2]       ;D
-    movq        xmm4, [rsi + rdx * 4]       ;E
-    movq        xmm5, [rax + rdx * 4]       ;F
-
-    punpcklbw   xmm0, xmm1                  ;A B
-    punpcklbw   xmm2, xmm3                  ;C D
-    punpcklbw   xmm4, xmm5                  ;E F
-
-    movq        xmm6, [rsi + rbx]           ;G
-    movq        xmm7, [rax + rbx]           ;H
-
-    pmaddubsw   xmm0, k0k1
-    pmaddubsw   xmm2, k2k3
-    punpcklbw   xmm6, xmm7                  ;G H
-    pmaddubsw   xmm4, k4k5
-    pmaddubsw   xmm6, k6k7
-
-    paddsw      xmm0, xmm2
-    paddsw      xmm0, krd
-    paddsw      xmm4, xmm6
-    paddsw      xmm0, xmm4
-
-    psraw       xmm0, 7
-    packuswb    xmm0, xmm0
-
-    movq        [rdi], xmm0
-
-    movq        xmm0, [rsi + 8]             ;A
-    movq        xmm1, [rsi + rdx + 8]       ;B
-    movq        xmm2, [rsi + rdx * 2 + 8]   ;C
-    movq        xmm3, [rax + rdx * 2 + 8]   ;D
-    movq        xmm4, [rsi + rdx * 4 + 8]   ;E
-    movq        xmm5, [rax + rdx * 4 + 8]   ;F
-
-    punpcklbw   xmm0, xmm1                  ;A B
-    punpcklbw   xmm2, xmm3                  ;C D
-    punpcklbw   xmm4, xmm5                  ;E F
-
-
-    movq        xmm6, [rsi + rbx + 8]       ;G
-    movq        xmm7, [rax + rbx + 8]       ;H
-    punpcklbw   xmm6, xmm7                  ;G H
-
-
-    pmaddubsw   xmm0, k0k1
-    pmaddubsw   xmm2, k2k3
-    pmaddubsw   xmm4, k4k5
-    pmaddubsw   xmm6, k6k7
-
-    paddsw      xmm0, xmm2
-    paddsw      xmm4, xmm6
-    paddsw      xmm0, krd
-    paddsw      xmm0, xmm4
-
-    psraw       xmm0, 7
-    packuswb    xmm0, xmm0
-
-    add         rsi,  rdx
-    add         rax,  rdx
-
-    movq        [rdi+8], xmm0
-
-%if ABI_IS_32BIT
-    add         rdi, DWORD PTR arg(3)       ;out_pitch
-%else
-    add         rdi, r8
-%endif
-    dec         rcx
-    jnz         .vp9_filter_block1d16_v8_ssse3_loop
-
-    add rsp, 16*5
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_filter_block1d8_h8_ssse3
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
-;    short *filter
-;)
-global sym(vp9_filter_block1d8_h8_ssse3)
-sym(vp9_filter_block1d8_h8_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16*5
-    %define k0k1 [rsp + 16*0]
-    %define k2k3 [rsp + 16*1]
-    %define k4k5 [rsp + 16*2]
-    %define k6k7 [rsp + 16*3]
-    %define krd [rsp + 16*4]
-
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-    mov         rcx, 0x0400040
-
-    movdqa      xmm4, [rdx]                 ;load filters
-    movd        xmm5, rcx
-    packsswb    xmm4, xmm4
-    pshuflw     xmm0, xmm4, 0b              ;k0_k1
-    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
-    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
-    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
-
-    punpcklqdq  xmm0, xmm0
-    punpcklqdq  xmm1, xmm1
-    punpcklqdq  xmm2, xmm2
-    punpcklqdq  xmm3, xmm3
-
-    movdqa      k0k1, xmm0
-    movdqa      k2k3, xmm1
-    pshufd      xmm5, xmm5, 0
-    movdqa      k4k5, xmm2
-    movdqa      k6k7, xmm3
-;    movdqa      krd, xmm5                  ; not spilled: krd stays live in xmm5 below
-
-    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
-    movsxd      rdx, dword ptr arg(3)       ;output_pitch
-    movsxd      rcx, dword ptr arg(4)       ;output_height
-
-.filter_block1d8_h8_rowloop_ssse3:
-    movq        xmm0,   [rsi - 3]    ; -3 -2 -1  0  1  2  3  4
-
-;    movq        xmm3,   [rsi + 4]    ; 4  5  6  7  8  9 10 11
-    movq        xmm3,   [rsi + 5]    ; 5  6  7  8  9 10 11 12
-;note: if we create a k0_k7 filter, we can save a pshufb
-;    punpcklbw   xmm0,   xmm3         ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11
-    punpcklqdq  xmm0,   xmm3
-
-    movdqa      xmm1,   xmm0
-    pshufb      xmm0,   [GLOBAL(shuf_t0t1)]
-    pmaddubsw   xmm0,   k0k1
-
-    movdqa      xmm2,   xmm1
-    pshufb      xmm1,   [GLOBAL(shuf_t2t3)]
-    pmaddubsw   xmm1,   k2k3
-
-    movdqa      xmm4,   xmm2
-    pshufb      xmm2,   [GLOBAL(shuf_t4t5)]
-    pmaddubsw   xmm2,   k4k5
-
-    pshufb      xmm4,   [GLOBAL(shuf_t6t7)]
-    pmaddubsw   xmm4,   k6k7
-
-    paddsw      xmm0,   xmm1
-    paddsw      xmm0,   xmm2
-    paddsw      xmm0,   xmm5
-    paddsw      xmm0,   xmm4
-    psraw       xmm0,   7
-    packuswb    xmm0,   xmm0
-
-    lea         rsi,    [rsi + rax]
-    movq        [rdi],  xmm0
-
-    lea         rdi,    [rdi + rdx]
-    dec         rcx
-    jnz         .filter_block1d8_h8_rowloop_ssse3
-
-    add rsp, 16*5
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_filter_block1d16_h8_ssse3
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
-;    short *filter
-;)
-global sym(vp9_filter_block1d16_h8_ssse3)
-sym(vp9_filter_block1d16_h8_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16*5
-    %define k0k1 [rsp + 16*0]
-    %define k2k3 [rsp + 16*1]
-    %define k4k5 [rsp + 16*2]
-    %define k6k7 [rsp + 16*3]
-    %define krd [rsp + 16*4]
-
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-    mov         rcx, 0x0400040
-
-    movdqa      xmm4, [rdx]                 ;load filters
-    movd        xmm5, rcx
-    packsswb    xmm4, xmm4
-    pshuflw     xmm0, xmm4, 0b              ;k0_k1
-    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
-    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
-    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
-
-    punpcklqdq  xmm0, xmm0
-    punpcklqdq  xmm1, xmm1
-    punpcklqdq  xmm2, xmm2
-    punpcklqdq  xmm3, xmm3
-
-    movdqa      k0k1, xmm0
-    movdqa      k2k3, xmm1
-    pshufd      xmm5, xmm5, 0
-    movdqa      k4k5, xmm2
-    movdqa      k6k7, xmm3
-    movdqa      krd, xmm5
-
-    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
-    movsxd      rdx, dword ptr arg(3)       ;output_pitch
-    movsxd      rcx, dword ptr arg(4)       ;output_height
-
-.filter_block1d16_h8_rowloop_ssse3:
-    movq        xmm0,   [rsi - 3]    ; -3 -2 -1  0  1  2  3  4
-
-;    movq        xmm3,   [rsi + 4]    ; 4  5  6  7  8  9 10 11
-    movq        xmm3,   [rsi + 5]    ; 5  6  7  8  9 10 11 12
-;note: if we create a k0_k7 filter, we can save a pshufb
-;    punpcklbw   xmm0,   xmm3         ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11
-    punpcklqdq  xmm0,   xmm3
-
-    movdqa      xmm1,   xmm0
-    pshufb      xmm0,   [GLOBAL(shuf_t0t1)]
-    pmaddubsw   xmm0,   k0k1
-
-    movdqa      xmm2,   xmm1
-    pshufb      xmm1,   [GLOBAL(shuf_t2t3)]
-    pmaddubsw   xmm1,   k2k3
-
-    movdqa      xmm4,   xmm2
-    pshufb      xmm2,   [GLOBAL(shuf_t4t5)]
-    pmaddubsw   xmm2,   k4k5
-
-    pshufb      xmm4,   [GLOBAL(shuf_t6t7)]
-    pmaddubsw   xmm4,   k6k7
-
-    paddsw      xmm0,   xmm1
-    paddsw      xmm0,   xmm4
-    paddsw      xmm0,   xmm2
-    paddsw      xmm0,   krd
-    psraw       xmm0,   7
-    packuswb    xmm0,   xmm0
-
-
-    movq        xmm3,   [rsi +  5]
-;    movq        xmm7,   [rsi + 12]
-    movq        xmm7,   [rsi + 13]
-;note: same as above
-;    punpcklbw   xmm3,   xmm7
-    punpcklqdq  xmm3,   xmm7
-
-    movdqa      xmm1,   xmm3
-    pshufb      xmm3,   [GLOBAL(shuf_t0t1)]
-    pmaddubsw   xmm3,   k0k1
-
-    movdqa      xmm2,   xmm1
-    pshufb      xmm1,   [GLOBAL(shuf_t2t3)]
-    pmaddubsw   xmm1,   k2k3
-
-    movdqa      xmm4,   xmm2
-    pshufb      xmm2,   [GLOBAL(shuf_t4t5)]
-    pmaddubsw   xmm2,   k4k5
-
-    pshufb      xmm4,   [GLOBAL(shuf_t6t7)]
-    pmaddubsw   xmm4,   k6k7
-
-    paddsw      xmm3,   xmm1
-    paddsw      xmm3,   xmm2
-    paddsw      xmm3,   krd
-    paddsw      xmm3,   xmm4
-    psraw       xmm3,   7
-    packuswb    xmm3,   xmm3
-    punpcklqdq  xmm0,   xmm3
-
-    lea         rsi,    [rsi + rax]
-    movdqa      [rdi],  xmm0
-
-    lea         rdi,    [rdi + rdx]
-    dec         rcx
-    jnz         .filter_block1d16_h8_rowloop_ssse3
-
-    add rsp, 16*5
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-SECTION_RODATA
-align 16
-shuf_t0t1:
-    db  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
-align 16
-shuf_t2t3:
-    db  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
-align 16
-shuf_t4t5:
-    db  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
-align 16
-shuf_t6t7:
-    db  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
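The shuf_tNtM tables pair each source byte with its right-hand neighbor
(0,1, 1,2, 2,3, ...) so that one pshufb lines the input up with the duplicated
(k_n, k_n+1) coefficient words that pmaddubsw expects. What one such
shuffle-plus-multiply contributes to output pixel x, in scalar terms:

    /* contribution of taps n and n+1 to output pixel x (sketch) */
    static int tap_pair(const unsigned char *src, const short *filter,
                        int x, int n) {
      return src[x + n] * filter[n] + src[x + n + 1] * filter[n + 1];
    }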
--- a/vp8/common/x86/subpixel_mmx.asm
+++ /dev/null
@@ -1,727 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-
-%define BLOCK_HEIGHT_WIDTH 4
-%define vp9_filter_weight 128
-%define VP9_FILTER_SHIFT  7
-
-
-;void vp9_filter_block1d_h6_mmx
-;(
-;    unsigned char   *src_ptr,
-;    unsigned short  *output_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned int    pixel_step,
-;    unsigned int    output_height,
-;    unsigned int    output_width,
-;    short           * vp9_filter
-;)
-global sym(vp9_filter_block1d_h6_mmx)
-sym(vp9_filter_block1d_h6_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rdx,    arg(6) ;vp9_filter
-
-        movq        mm1,    [rdx + 16]             ; do both the negative taps first!!!
-        movq        mm2,    [rdx + 32]         ;
-        movq        mm6,    [rdx + 48]        ;
-        movq        mm7,    [rdx + 64]        ;
-
-        mov         rdi,    arg(1) ;output_ptr
-        mov         rsi,    arg(0) ;src_ptr
-        movsxd      rcx,    dword ptr arg(4) ;output_height
-        movsxd      rax,    dword ptr arg(5) ;output_width      ; destination pitch?
-        pxor        mm0,    mm0              ; mm0 = 00000000
-
-.nextrow:
-        movq        mm3,    [rsi-2]          ; mm3 = p-2..p5
-        movq        mm4,    mm3              ; mm4 = p-2..p5
-        psrlq       mm3,    8                ; mm3 = p-1..p5
-        punpcklbw   mm3,    mm0              ; mm3 = p-1..p2
-        pmullw      mm3,    mm1              ; mm3 *= kernel 1 modifiers.
-
-        movq        mm5,    mm4              ; mm5 = p-2..p5
-        punpckhbw   mm4,    mm0              ; mm4 = p2..p5
-        pmullw      mm4,    mm7              ; mm4 *= kernel 4 modifiers
-        paddsw      mm3,    mm4              ; mm3 += mm4
-
-        movq        mm4,    mm5              ; mm4 = p-2..p5;
-        psrlq       mm5,    16               ; mm5 = p0..p5;
-        punpcklbw   mm5,    mm0              ; mm5 = p0..p3
-        pmullw      mm5,    mm2              ; mm5 *= kernel 2 modifiers
-        paddsw      mm3,    mm5              ; mm3 += mm5
-
-        movq        mm5,    mm4              ; mm5 = p-2..p5
-        psrlq       mm4,    24               ; mm4 = p1..p5
-        punpcklbw   mm4,    mm0              ; mm4 = p1..p4
-        pmullw      mm4,    mm6              ; mm4 *= kernel 3 modifiers
-        paddsw      mm3,    mm4              ; mm3 += mm4
-
-        ; do outer positive taps
-        movd        mm4,    [rsi+3]
-        punpcklbw   mm4,    mm0              ; mm4 = p3..p6
-        pmullw      mm4,    [rdx+80]         ; mm4 *= kernel 5 modifiers
-        paddsw      mm3,    mm4              ; mm3 += mm4
-
-        punpcklbw   mm5,    mm0              ; mm5 = p-2..p1
-        pmullw      mm5,    [rdx]            ; mm5 *= kernel 0 modifiers
-        paddsw      mm3,    mm5              ; mm3 += mm5
-
-        paddsw      mm3,    [GLOBAL(rd)]              ; mm3 += round value
-        psraw       mm3,    VP9_FILTER_SHIFT     ; mm3 /= 128
-        packuswb    mm3,    mm0              ; pack and unpack to saturate
-        punpcklbw   mm3,    mm0              ;
-
-        movq        [rdi],  mm3              ; store the results in the destination
-
-%if ABI_IS_32BIT
-        add         rsi,    dword ptr arg(2) ;src_pixels_per_line ; next line
-        add         rdi,    rax;
-%else
-        movsxd      r8,     dword ptr arg(2) ;src_pixels_per_line
-        add         rdi,    rax;
-
-        add         rsi,    r8               ; next line
-%endif
-
-        dec         rcx                      ; decrement count
-        jnz         .nextrow                 ; next row
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_filter_block1dc_v6_mmx
-;(
-;   short *src_ptr,
-;   unsigned char *output_ptr,
-;    int output_pitch,
-;   unsigned int pixels_per_line,
-;   unsigned int pixel_step,
-;   unsigned int output_height,
-;   unsigned int output_width,
-;   short * vp9_filter
-;)
-global sym(vp9_filter_block1dc_v6_mmx)
-sym(vp9_filter_block1dc_v6_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        movq      mm5, [GLOBAL(rd)]
-        push        rbx
-        mov         rbx, arg(7) ;vp9_filter
-        movq      mm1, [rbx + 16]             ; do both the negative taps first!!!
-        movq      mm2, [rbx + 32]         ;
-        movq      mm6, [rbx + 48]        ;
-        movq      mm7, [rbx + 64]        ;
-
-        movsxd      rdx, dword ptr arg(3) ;pixels_per_line
-        mov         rdi, arg(1) ;output_ptr
-        mov         rsi, arg(0) ;src_ptr
-        sub         rsi, rdx
-        sub         rsi, rdx
-        movsxd      rcx, DWORD PTR arg(5) ;output_height
-        movsxd      rax, DWORD PTR arg(2) ;output_pitch      ; destination pitch?
-        pxor        mm0, mm0              ; mm0 = 00000000
-
-
-.nextrow_cv:
-        movq        mm3, [rsi+rdx]        ; mm3 = p0..p3  = row -1
-        pmullw      mm3, mm1              ; mm3 *= kernel 1 modifiers.
-
-
-        movq        mm4, [rsi + 4*rdx]      ; mm4 = p0..p3  = row 2
-        pmullw      mm4, mm7              ; mm4 *= kernel 4 modifiers.
-        paddsw      mm3, mm4              ; mm3 += mm4
-
-        movq        mm4, [rsi + 2*rdx]           ; mm4 = p0..p3  = row 0
-        pmullw      mm4, mm2              ; mm4 *= kernel 2 modifiers.
-        paddsw      mm3, mm4              ; mm3 += mm4
-
-        movq        mm4, [rsi]            ; mm4 = p0..p3  = row -2
-        pmullw      mm4, [rbx]            ; mm4 *= kernel 0 modifiers.
-        paddsw      mm3, mm4              ; mm3 += mm4
-
-
-        add         rsi, rdx              ; move source forward 1 line to avoid 3 * pitch
-        movq        mm4, [rsi + 2*rdx]     ; mm4 = p0..p3  = row 1
-        pmullw      mm4, mm6              ; mm4 *= kernel 3 modifiers.
-        paddsw      mm3, mm4              ; mm3 += mm4
-
-        movq        mm4, [rsi + 4*rdx]    ; mm4 = p0..p3  = row 3
-        pmullw      mm4, [rbx +80]        ; mm4 *= kernel 5 modifiers.
-        paddsw      mm3, mm4              ; mm3 += mm4
-
-
-        paddsw      mm3, mm5               ; mm3 += round value
-        psraw       mm3, VP9_FILTER_SHIFT     ; mm3 /= 128
-        packuswb    mm3, mm0              ; pack and saturate
-
-        movd        [rdi],mm3             ; store the results in the destination
-        ; the subsequent iterations repeat 3 out of 4 of these reads.  Since the
-        ; recon block should be in cache, this shouldn't cost much.  It's obviously
-        ; avoidable!
-        lea         rdi,  [rdi+rax] ;
-        dec         rcx                   ; decrement count
-        jnz         .nextrow_cv           ; next row
-
-        pop         rbx
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
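Together these form the classic two-pass 6-tap scheme: the h6 pass filters
bytes horizontally into a 16-bit intermediate buffer (note it writes shorts),
and the v6 pass (note its short *src_ptr) filters that buffer vertically back
down to bytes, each pass rounding by 64 and shifting by VP9_FILTER_SHIFT. A
self-contained scalar sketch of the composition (hypothetical names; sized for
blocks up to 16 wide):

    static unsigned char clamp255(int v) {
      return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    static void sixtap_2pass_c(const unsigned char *src, int src_pitch,
                               unsigned char *dst, int dst_pitch,
                               int w, int h,
                               const short *hf, const short *vf) {
      short tmp[21 * 16];                /* (h + 5) rows of w intermediates */
      int r, c, k;
      src -= 2 * src_pitch;              /* two rows of context above the block */
      for (r = 0; r < h + 5; r++)
        for (c = 0; c < w; c++) {
          int sum = 64;                  /* GLOBAL(rd) rounding value */
          for (k = 0; k < 6; k++)
            sum += src[r * src_pitch + c + k - 2] * hf[k];
          tmp[r * w + c] = clamp255(sum >> 7);  /* h6 saturates to byte range */
        }
      for (r = 0; r < h; r++)
        for (c = 0; c < w; c++) {
          int sum = 64;
          for (k = 0; k < 6; k++)
            sum += tmp[(r + k) * w + c] * vf[k];
          dst[r * dst_pitch + c] = clamp255(sum >> 7);
        }
    }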
-;void vp9_bilinear_predict8x8_mmx
-;(
-;    unsigned char  *src_ptr,
-;    int   src_pixels_per_line,
-;    int  xoffset,
-;    int  yoffset,
-;   unsigned char *dst_ptr,
-;    int dst_pitch
-;)
-global sym(vp9_bilinear_predict8x8_mmx)
-sym(vp9_bilinear_predict8x8_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ;const short *HFilter = bilinear_filters_mmx[xoffset];
-    ;const short *VFilter = bilinear_filters_mmx[yoffset];
-
-        movsxd      rax,        dword ptr arg(2) ;xoffset
-        mov         rdi,        arg(4) ;dst_ptr           ;
-
-        shl         rax,        5 ; offset * 32
-        lea         rcx,        [GLOBAL(sym(vp9_bilinear_filters_8x_mmx))]
-
-        add         rax,        rcx ; HFilter
-        mov         rsi,        arg(0) ;src_ptr              ;
-
-        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
-        movq        mm1,        [rax]               ;
-
-        movq        mm2,        [rax+16]            ;
-        movsxd      rax,        dword ptr arg(3) ;yoffset
-
-        pxor        mm0,        mm0                 ;
-
-        shl         rax,        5 ; offset*32
-        add         rax,        rcx ; VFilter
-
-        lea         rcx,        [rdi+rdx*8]          ;
-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line    ;
-
-
-
-        ; get the first horizontal line done       ;
-        movq        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-        movq        mm4,        mm3                 ; make a copy of current line
-
-        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
-        punpckhbw   mm4,        mm0                 ;
-
-        pmullw      mm3,        mm1                 ;
-        pmullw      mm4,        mm1                 ;
-
-        movq        mm5,        [rsi+1]             ;
-        movq        mm6,        mm5                 ;
-
-        punpcklbw   mm5,        mm0                 ;
-        punpckhbw   mm6,        mm0                 ;
-
-        pmullw      mm5,        mm2                 ;
-        pmullw      mm6,        mm2                 ;
-
-        paddw       mm3,        mm5                 ;
-        paddw       mm4,        mm6                 ;
-
-        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
-        psraw       mm3,        VP9_FILTER_SHIFT        ; mm3 /= 128
-
-        paddw       mm4,        [GLOBAL(rd)]                 ;
-        psraw       mm4,        VP9_FILTER_SHIFT        ;
-
-        movq        mm7,        mm3                 ;
-        packuswb    mm7,        mm4                 ;
-
-        add         rsi,        rdx                 ; next line
-.next_row_8x8:
-        movq        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-        movq        mm4,        mm3                 ; make a copy of current line
-
-        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
-        punpckhbw   mm4,        mm0                 ;
-
-        pmullw      mm3,        mm1                 ;
-        pmullw      mm4,        mm1                 ;
-
-        movq        mm5,        [rsi+1]             ;
-        movq        mm6,        mm5                 ;
-
-        punpcklbw   mm5,        mm0                 ;
-        punpckhbw   mm6,        mm0                 ;
-
-        pmullw      mm5,        mm2                 ;
-        pmullw      mm6,        mm2                 ;
-
-        paddw       mm3,        mm5                 ;
-        paddw       mm4,        mm6                 ;
-
-        movq        mm5,        mm7                 ;
-        movq        mm6,        mm7                 ;
-
-        punpcklbw   mm5,        mm0                 ;
-        punpckhbw   mm6,        mm0
-
-        pmullw      mm5,        [rax]               ;
-        pmullw      mm6,        [rax]               ;
-
-        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
-        psraw       mm3,        VP9_FILTER_SHIFT        ; mm3 /= 128
-
-        paddw       mm4,        [GLOBAL(rd)]                 ;
-        psraw       mm4,        VP9_FILTER_SHIFT        ;
-
-        movq        mm7,        mm3                 ;
-        packuswb    mm7,        mm4                 ;
-
-
-        pmullw      mm3,        [rax+16]            ;
-        pmullw      mm4,        [rax+16]            ;
-
-        paddw       mm3,        mm5                 ;
-        paddw       mm4,        mm6                 ;
-
-
-        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
-        psraw       mm3,        VP9_FILTER_SHIFT        ; mm3 /= 128
-
-        paddw       mm4,        [GLOBAL(rd)]                 ;
-        psraw       mm4,        VP9_FILTER_SHIFT        ;
-
-        packuswb    mm3,        mm4
-
-        movq        [rdi],      mm3                 ; store the results in the destination
-
-%if ABI_IS_32BIT
-        add         rsi,        rdx                 ; next line
-        add         rdi,        dword ptr arg(5) ;dst_pitch                   ;
-%else
-        movsxd      r8,         dword ptr arg(5) ;dst_pitch
-        add         rsi,        rdx                 ; next line
-        add         rdi,        r8                  ;dst_pitch
-%endif
-        cmp         rdi,        rcx                 ;
-        jne         .next_row_8x8
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
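Bilinear prediction is the 2-tap analogue: a horizontal blend of each pixel
with its right neighbor, then a vertical blend of each filtered row with the
previous one (which the loop above keeps packed in mm7), each stage rounding
by 64 and shifting by VP9_FILTER_SHIFT. A scalar sketch with hypothetical
names, where hf/vf are the two taps selected by xoffset/yoffset:

    static void bilinear_predict_c(const unsigned char *src, int src_pitch,
                                   const short *hf, const short *vf,
                                   unsigned char *dst, int dst_pitch,
                                   int w, int h) {
      unsigned short tmp[17 * 16];       /* (h + 1) filtered rows, up to 16x16 */
      int r, c;
      for (r = 0; r < h + 1; r++)        /* first pass: horizontal blend */
        for (c = 0; c < w; c++)
          tmp[r * w + c] = (unsigned short)
              ((src[r * src_pitch + c] * hf[0] +
                src[r * src_pitch + c + 1] * hf[1] + 64) >> 7);
      for (r = 0; r < h; r++)            /* second pass: vertical blend */
        for (c = 0; c < w; c++) {
          int v = (tmp[r * w + c] * vf[0] +
                   tmp[(r + 1) * w + c] * vf[1] + 64) >> 7;
          dst[r * dst_pitch + c] = (unsigned char)(v > 255 ? 255 : v);
        }
    }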
-
-;void vp9_bilinear_predict8x4_mmx
-;(
-;    unsigned char  *src_ptr,
-;    int   src_pixels_per_line,
-;    int  xoffset,
-;    int  yoffset,
-;    unsigned char *dst_ptr,
-;    int dst_pitch
-;)
-global sym(vp9_bilinear_predict8x4_mmx)
-sym(vp9_bilinear_predict8x4_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ;const short *HFilter = bilinear_filters_mmx[xoffset];
-    ;const short *VFilter = bilinear_filters_mmx[yoffset];
-
-        movsxd      rax,        dword ptr arg(2) ;xoffset
-        mov         rdi,        arg(4) ;dst_ptr           ;
-
-        lea         rcx,        [GLOBAL(sym(vp9_bilinear_filters_8x_mmx))]
-        shl         rax,        5
-
-        mov         rsi,        arg(0) ;src_ptr              ;
-        add         rax,        rcx
-
-        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
-        movq        mm1,        [rax]               ;
-
-        movq        mm2,        [rax+16]            ;
-        movsxd      rax,        dword ptr arg(3) ;yoffset
-
-        pxor        mm0,        mm0                 ;
-        shl         rax,        5
-
-        add         rax,        rcx
-        lea         rcx,        [rdi+rdx*4]          ;
-
-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line    ;
-
-        ; get the first horizontal line done       ;
-        movq        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-        movq        mm4,        mm3                 ; make a copy of current line
-
-        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
-        punpckhbw   mm4,        mm0                 ;
-
-        pmullw      mm3,        mm1                 ;
-        pmullw      mm4,        mm1                 ;
-
-        movq        mm5,        [rsi+1]             ;
-        movq        mm6,        mm5                 ;
-
-        punpcklbw   mm5,        mm0                 ;
-        punpckhbw   mm6,        mm0                 ;
-
-        pmullw      mm5,        mm2                 ;
-        pmullw      mm6,        mm2                 ;
-
-        paddw       mm3,        mm5                 ;
-        paddw       mm4,        mm6                 ;
-
-        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
-        psraw       mm3,        VP9_FILTER_SHIFT        ; mm3 /= 128
-
-        paddw       mm4,        [GLOBAL(rd)]                 ;
-        psraw       mm4,        VP9_FILTER_SHIFT        ;
-
-        movq        mm7,        mm3                 ;
-        packuswb    mm7,        mm4                 ;
-
-        add         rsi,        rdx                 ; next line
-.next_row_8x4:
-        movq        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-        movq        mm4,        mm3                 ; make a copy of current line
-
-        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
-        punpckhbw   mm4,        mm0                 ;
-
-        pmullw      mm3,        mm1                 ;
-        pmullw      mm4,        mm1                 ;
-
-        movq        mm5,        [rsi+1]             ;
-        movq        mm6,        mm5                 ;
-
-        punpcklbw   mm5,        mm0                 ;
-        punpckhbw   mm6,        mm0                 ;
-
-        pmullw      mm5,        mm2                 ;
-        pmullw      mm6,        mm2                 ;
-
-        paddw       mm3,        mm5                 ;
-        paddw       mm4,        mm6                 ;
-
-        movq        mm5,        mm7                 ;
-        movq        mm6,        mm7                 ;
-
-        punpcklbw   mm5,        mm0                 ;
-        punpckhbw   mm6,        mm0
-
-        pmullw      mm5,        [rax]               ;
-        pmullw      mm6,        [rax]               ;
-
-        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
-        psraw       mm3,        VP9_FILTER_SHIFT        ; mm3 /= 128
-
-        paddw       mm4,        [GLOBAL(rd)]                 ;
-        psraw       mm4,        VP9_FILTER_SHIFT        ;
-
-        movq        mm7,        mm3                 ;
-        packuswb    mm7,        mm4                 ;
-
-
-        pmullw      mm3,        [rax+16]            ;
-        pmullw      mm4,        [rax+16]            ;
-
-        paddw       mm3,        mm5                 ;
-        paddw       mm4,        mm6                 ;
-
-
-        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
-        psraw       mm3,        VP9_FILTER_SHIFT        ; mm3 /= 128
-
-        paddw       mm4,        [GLOBAL(rd)]                 ;
-        psraw       mm4,        VP9_FILTER_SHIFT        ;
-
-        packuswb    mm3,        mm4
-
-        movq        [rdi],      mm3                 ; store the results in the destination
-
-%if ABI_IS_32BIT
-        add         rsi,        rdx                 ; next line
-        add         rdi,        dword ptr arg(5) ;dst_pitch                   ;
-%else
-        movsxd      r8,         dword ptr arg(5) ;dst_pitch
-        add         rsi,        rdx                 ; next line
-        add         rdi,        r8
-%endif
-        cmp         rdi,        rcx                 ;
-        jne         .next_row_8x4
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_bilinear_predict4x4_mmx
-;(
-;    unsigned char  *src_ptr,
-;    int   src_pixels_per_line,
-;    int  xoffset,
-;    int  yoffset,
-;    unsigned char *dst_ptr,
-;    int dst_pitch
-;)
-global sym(vp9_bilinear_predict4x4_mmx)
-sym(vp9_bilinear_predict4x4_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ;const short *HFilter = bilinear_filters_mmx[xoffset];
-    ;const short *VFilter = bilinear_filters_mmx[yoffset];
-
-        movsxd      rax,        dword ptr arg(2) ;xoffset
-        mov         rdi,        arg(4) ;dst_ptr           ;
-
-        lea         rcx,        [GLOBAL(sym(vp9_bilinear_filters_8x_mmx))]
-        shl         rax,        5
-
-        add         rax,        rcx ; HFilter
-        mov         rsi,        arg(0) ;src_ptr              ;
-
-        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
-        movq        mm1,        [rax]               ;
-
-        movq        mm2,        [rax+16]            ;
-        movsxd      rax,        dword ptr arg(3) ;yoffset
-
-        pxor        mm0,        mm0                 ;
-        shl         rax,        5
-
-        add         rax,        rcx
-        lea         rcx,        [rdi+rdx*4]          ;
-
-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line    ;
-
-        ; get the first horizontal line done       ;
-        movd        mm3,        [rsi]               ; 00 01 02 03
-        punpcklbw   mm3,        mm0                 ; 00 01 02 03
-
-        pmullw      mm3,        mm1                 ;
-        movd        mm5,        [rsi+1]             ;
-
-        punpcklbw   mm5,        mm0                 ;
-        pmullw      mm5,        mm2                 ;
-
-        paddw       mm3,        mm5                 ;
-        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
-
-        psraw       mm3,        VP9_FILTER_SHIFT        ; mm3 /= 128
-
-        movq        mm7,        mm3                 ;
-        packuswb    mm7,        mm0                 ;
-
-        add         rsi,        rdx                 ; next line
-.next_row_4x4:
-        movd        mm3,        [rsi]               ; 00 01 02 03
-        punpcklbw   mm3,        mm0                 ; 00 01 02 03
-
-        pmullw      mm3,        mm1                 ;
-        movd        mm5,        [rsi+1]             ;
-
-        punpcklbw   mm5,        mm0                 ;
-        pmullw      mm5,        mm2                 ;
-
-        paddw       mm3,        mm5                 ;
-
-        movq        mm5,        mm7                 ;
-        punpcklbw   mm5,        mm0                 ;
-
-        pmullw      mm5,        [rax]               ;
-        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
-
-        psraw       mm3,        VP9_FILTER_SHIFT        ; mm3 /= 128
-        movq        mm7,        mm3                 ;
-
-        packuswb    mm7,        mm0                 ;
-
-        pmullw      mm3,        [rax+16]            ;
-        paddw       mm3,        mm5                 ;
-
-
-        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
-        psraw       mm3,        VP9_FILTER_SHIFT        ; mm3 /= 128
-
-        packuswb    mm3,        mm0
-        movd        [rdi],      mm3                 ; store the results in the destination
-
-%if ABI_IS_32BIT
-        add         rsi,        rdx                 ; next line
-        add         rdi,        dword ptr arg(5) ;dst_pitch                   ;
-%else
-        movsxd      r8,         dword ptr arg(5) ;dst_pitch                   ;
-        add         rsi,        rdx                 ; next line
-        add         rdi,        r8
-%endif
-
-        cmp         rdi,        rcx                 ;
-        jne         .next_row_4x4
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
-SECTION_RODATA
-align 16
-rd:
-    times 4 dw 0x40
-
-align 16
-global HIDDEN_DATA(sym(vp9_six_tap_mmx))
-sym(vp9_six_tap_mmx):
-    times 8 dw 0
-    times 8 dw 0
-    times 8 dw 128
-    times 8 dw 0
-    times 8 dw 0
-    times 8 dw 0
-
-    times 8 dw 0
-    times 8 dw -6
-    times 8 dw 123
-    times 8 dw 12
-    times 8 dw -1
-    times 8 dw 0
-
-    times 8 dw 2
-    times 8 dw -11
-    times 8 dw 108
-    times 8 dw 36
-    times 8 dw -8
-    times 8 dw 1
-
-    times 8 dw 0
-    times 8 dw -9
-    times 8 dw 93
-    times 8 dw 50
-    times 8 dw -6
-    times 8 dw 0
-
-    times 8 dw 3
-    times 8 dw -16
-    times 8 dw 77
-    times 8 dw 77
-    times 8 dw -16
-    times 8 dw 3
-
-    times 8 dw 0
-    times 8 dw -6
-    times 8 dw 50
-    times 8 dw 93
-    times 8 dw -9
-    times 8 dw 0
-
-    times 8 dw 1
-    times 8 dw -8
-    times 8 dw 36
-    times 8 dw 108
-    times 8 dw -11
-    times 8 dw 2
-
-    times 8 dw 0
-    times 8 dw -1
-    times 8 dw 12
-    times 8 dw 123
-    times 8 dw -6
-    times 8 dw 0
-
-
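Each six-row group of "times 8 dw" entries above is one 6-tap filter, with every coefficient replicated eight times so a single pmullw can apply it across a whole register; the taps of every filter sum to VP9_FILTER_WEIGHT (128). As a hedged reference, here is a minimal C sketch of applying one such filter row, assuming the replicated layout is flattened to one coefficient per tap (the helper name is illustrative, not from this file):

    #include <stdint.h>

    /* taps: one 6-tap row, e.g. {2, -11, 108, 36, -8, 1}; each row sums to 128 */
    static void six_tap_ref(const uint8_t *src, uint8_t *dst, int width,
                            const int16_t *taps) {
      for (int x = 0; x < width; x++) {
        int sum = 0;
        for (int k = 0; k < 6; k++)
          sum += src[x + k - 2] * taps[k];   /* taps cover x-2 .. x+3 */
        int v = (sum + 64) >> 7;             /* rd rounding, then VP9_FILTER_SHIFT */
        dst[x] = v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
      }
    }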
-align 16
-global HIDDEN_DATA(sym(vp9_bilinear_filters_8x_mmx))
-sym(vp9_bilinear_filters_8x_mmx):
-    times 8 dw 128
-    times 8 dw 0
-
-    times 8 dw 112
-    times 8 dw 16
-
-    times 8 dw 96
-    times 8 dw 32
-
-    times 8 dw 80
-    times 8 dw 48
-
-    times 8 dw 64
-    times 8 dw 64
-
-    times 8 dw 48
-    times 8 dw 80
-
-    times 8 dw 32
-    times 8 dw 96
-
-    times 8 dw 16
-    times 8 dw 112
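The row pairs above are the eight 1/8-pel bilinear kernels; the two weights of every kernel sum to 128, so the rounded shift can never overflow a byte. A hedged sketch of the per-pixel arithmetic the bilinear predict routines perform (names illustrative):

    #include <stdint.h>

    static const int16_t bilinear_filters[8][2] = {
      {128, 0}, {112, 16}, {96, 32}, {80, 48},
      {64, 64}, {48, 80}, {32, 96}, {16, 112}
    };

    /* (a*h0 + b*h1 + 64) >> 7 stays in 0..255 because h0 + h1 == 128 */
    static uint8_t bilinear_blend(uint8_t a, uint8_t b, const int16_t *h) {
      return (uint8_t)((a * h[0] + b * h[1] + 64) >> 7);
    }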
--- a/vp8/common/x86/subpixel_sse2.asm
+++ /dev/null
@@ -1,1372 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define BLOCK_HEIGHT_WIDTH 4
-%define VP9_FILTER_WEIGHT 128
-%define VP9_FILTER_SHIFT  7
-
-
-;/************************************************************************************
-; Notes: filter_block1d_h6 applies a 6-tap filter horizontally to the input pixels. The
-; input pixel array has output_height rows. This routine assumes that output_height is an
-; even number. This function handles 8 pixels in the horizontal direction, calculating ONE
-; row per iteration to take advantage of the 128-bit operations.
-;*************************************************************************************/
-;void vp9_filter_block1d8_h6_sse2
-;(
-;    unsigned char  *src_ptr,
-;    unsigned short *output_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned int    pixel_step,
-;    unsigned int    output_height,
-;    unsigned int    output_width,
-;    short           *vp9_filter
-;)
-global sym(vp9_filter_block1d8_h6_sse2)
-sym(vp9_filter_block1d8_h6_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rdx,        arg(6) ;vp9_filter
-        mov         rsi,        arg(0) ;src_ptr
-
-        mov         rdi,        arg(1) ;output_ptr
-
-        movsxd      rcx,        dword ptr arg(4) ;output_height
-        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(5) ;output_width
-%endif
-        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
-
-.filter_block1d8_h6_rowloop:
-        movq        xmm3,       MMWORD PTR [rsi - 2]
-        movq        xmm1,       MMWORD PTR [rsi + 6]
-
-        prefetcht2  [rsi+rax-2]
-
-        pslldq      xmm1,       8
-        por         xmm1,       xmm3
-
-        movdqa      xmm4,       xmm1
-        movdqa      xmm5,       xmm1
-
-        movdqa      xmm6,       xmm1
-        movdqa      xmm7,       xmm1
-
-        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
-        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
-        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
-        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
-        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
-        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
-
-
-        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
-        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
-        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
-
-        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
-        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
-        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
-
-        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
-        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
-
-        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
-
-        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
-        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
-
-
-        paddsw      xmm4,       xmm7
-        paddsw      xmm4,       xmm5
-
-        paddsw      xmm4,       xmm3
-        paddsw      xmm4,       xmm6
-
-        paddsw      xmm4,       xmm1
-        paddsw      xmm4,       [GLOBAL(rd)]
-
-        psraw       xmm4,       7
-
-        packuswb    xmm4,       xmm0
-        punpcklbw   xmm4,       xmm0
-
-        movdqa      XMMWORD Ptr [rdi],         xmm4
-        lea         rsi,        [rsi + rax]
-
-%if ABI_IS_32BIT
-        add         rdi,        DWORD Ptr arg(5) ;[output_width]
-%else
-        add         rdi,        r8
-%endif
-        dec         rcx
-
-        jnz         .filter_block1d8_h6_rowloop                ; next row
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
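As a hedged C model of the routine above (semantics inferred from the prototype comment; the saturating paddsw chain is modeled with plain int arithmetic, and output_width is assumed to be the element pitch of the 16-bit intermediate buffer):

    #include <stdint.h>

    static void filter_block1d8_h6_ref(const uint8_t *src_ptr,
                                       uint16_t *output_ptr,
                                       unsigned int src_pixels_per_line,
                                       unsigned int output_height,
                                       unsigned int output_width,
                                       const int16_t *taps) {
      for (unsigned int y = 0; y < output_height; y++) {
        for (int x = 0; x < 8; x++) {
          int sum = 0;
          for (int k = 0; k < 6; k++)
            sum += src_ptr[x + k - 2] * taps[k];
          int v = (sum + 64) >> 7;                       /* + rd, >> 7 */
          output_ptr[x] = v < 0 ? 0 : v > 255 ? 255 : v; /* packuswb, punpcklbw */
        }
        src_ptr += src_pixels_per_line;
        output_ptr += output_width;
      }
    }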
-
-;void vp9_filter_block1d16_h6_sse2
-;(
-;    unsigned char  *src_ptr,
-;    unsigned short *output_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned int    pixel_step,
-;    unsigned int    output_height,
-;    unsigned int    output_width,
-;    short           *vp9_filter
-;)
-;/************************************************************************************
-; Notes: filter_block1d_h6 applies a 6-tap filter horizontally to the input pixels. The
-; input pixel array has output_height rows. This routine assumes that output_height is an
-; even number. This function handles 16 pixels in the horizontal direction, calculating ONE
-; row per iteration to take advantage of the 128-bit operations.
-;*************************************************************************************/
-global sym(vp9_filter_block1d16_h6_sse2)
-sym(vp9_filter_block1d16_h6_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rdx,        arg(6) ;vp9_filter
-        mov         rsi,        arg(0) ;src_ptr
-
-        mov         rdi,        arg(1) ;output_ptr
-
-        movsxd      rcx,        dword ptr arg(4) ;output_height
-        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(5) ;output_width
-%endif
-
-        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
-
-.filter_block1d16_h6_sse2_rowloop:
-        movq        xmm3,       MMWORD PTR [rsi - 2]
-        movq        xmm1,       MMWORD PTR [rsi + 6]
-
-        movq        xmm2,       MMWORD PTR [rsi +14]
-        pslldq      xmm2,       8
-
-        por         xmm2,       xmm1
-        prefetcht2  [rsi+rax-2]
-
-        pslldq      xmm1,       8
-        por         xmm1,       xmm3
-
-        movdqa      xmm4,       xmm1
-        movdqa      xmm5,       xmm1
-
-        movdqa      xmm6,       xmm1
-        movdqa      xmm7,       xmm1
-
-        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
-        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
-        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
-        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
-        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
-        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
-
-
-        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
-        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
-        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
-
-        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
-        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
-        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
-
-        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
-        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
-
-        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
-
-        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
-        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
-
-        paddsw      xmm4,       xmm7
-        paddsw      xmm4,       xmm5
-
-        paddsw      xmm4,       xmm3
-        paddsw      xmm4,       xmm6
-
-        paddsw      xmm4,       xmm1
-        paddsw      xmm4,       [GLOBAL(rd)]
-
-        psraw       xmm4,       7
-
-        packuswb    xmm4,       xmm0
-        punpcklbw   xmm4,       xmm0
-
-        movdqa      XMMWORD Ptr [rdi],         xmm4
-
-        movdqa      xmm3,       xmm2
-        movdqa      xmm4,       xmm2
-
-        movdqa      xmm5,       xmm2
-        movdqa      xmm6,       xmm2
-
-        movdqa      xmm7,       xmm2
-
-        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
-        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
-        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
-        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
-        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
-        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
-
-
-        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
-        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
-        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
-
-        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
-        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
-        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
-
-        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
-        psrldq      xmm2,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
-        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
-
-        punpcklbw   xmm2,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
-        pmullw      xmm2,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
-
-
-        paddsw      xmm4,       xmm7
-        paddsw      xmm4,       xmm5
-
-        paddsw      xmm4,       xmm3
-        paddsw      xmm4,       xmm6
-
-        paddsw      xmm4,       xmm2
-        paddsw      xmm4,       [GLOBAL(rd)]
-
-        psraw       xmm4,       7
-
-        packuswb    xmm4,       xmm0
-        punpcklbw   xmm4,       xmm0
-
-        movdqa      XMMWORD Ptr [rdi+16],      xmm4
-
-        lea         rsi,        [rsi + rax]
-%if ABI_IS_32BIT
-        add         rdi,        DWORD Ptr arg(5) ;[output_width]
-%else
-        add         rdi,        r8
-%endif
-
-        dec         rcx
-        jnz         .filter_block1d16_h6_sse2_rowloop                ; next row
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_filter_block1d8_v6_sse2
-;(
-;    short *src_ptr,
-;    unsigned char *output_ptr,
-;    int dst_pitch,
-;    unsigned int pixels_per_line,
-;    unsigned int pixel_step,
-;    unsigned int output_height,
-;    unsigned int output_width,
-;    short * vp9_filter
-;)
-;/************************************************************************************
-; Notes: filter_block1d8_v6 applies a 6-tap filter vertically to the input pixels. The
-; input pixel array has output_height rows.
-;*************************************************************************************/
-global sym(vp9_filter_block1d8_v6_sse2)
-sym(vp9_filter_block1d8_v6_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rax,        arg(7) ;vp9_filter
-        movsxd      rdx,        dword ptr arg(3) ;pixels_per_line
-
-        mov         rdi,        arg(1) ;output_ptr
-        mov         rsi,        arg(0) ;src_ptr
-
-        sub         rsi,        rdx
-        sub         rsi,        rdx
-
-        movsxd      rcx,        DWORD PTR arg(5) ;[output_height]
-        pxor        xmm0,       xmm0                        ; clear xmm0
-
-        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(2) ; dst_pitch
-%endif
-
-.vp9_filter_block1d8_v6_sse2_loop:
-        movdqa      xmm1,       XMMWORD PTR [rsi]
-        pmullw      xmm1,       [rax]
-
-        movdqa      xmm2,       XMMWORD PTR [rsi + rdx]
-        pmullw      xmm2,       [rax + 16]
-
-        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2]
-        pmullw      xmm3,       [rax + 32]
-
-        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4]
-        pmullw      xmm5,       [rax + 64]
-
-        add         rsi,        rdx
-        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2]
-
-        pmullw      xmm4,       [rax + 48]
-        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4]
-
-        pmullw      xmm6,       [rax + 80]
-
-        paddsw      xmm2,       xmm5
-        paddsw      xmm2,       xmm3
-
-        paddsw      xmm2,       xmm1
-        paddsw      xmm2,       xmm4
-
-        paddsw      xmm2,       xmm6
-        paddsw      xmm2,       xmm7
-
-        psraw       xmm2,       7
-        packuswb    xmm2,       xmm0              ; pack and saturate
-
-        movq        QWORD PTR [rdi], xmm2         ; store the results in the destination
-%if ABI_IS_32BIT
-        add         rdi,        DWORD PTR arg(2) ;[dst_pitch]
-%else
-        add         rdi,        r8
-%endif
-        dec         rcx         ; decrement count
-        jnz         .vp9_filter_block1d8_v6_sse2_loop               ; next row
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
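And the matching second pass as a hedged C model: it backs up two rows (the two "sub rsi, rdx" instructions), weights six consecutive rows of the 16-bit intermediate, then rounds, shifts and clamps back to bytes (pixels_per_line is assumed to count int16 elements here):

    #include <stdint.h>

    static void filter_block1d8_v6_ref(const int16_t *src_ptr,
                                       uint8_t *output_ptr,
                                       int dst_pitch,
                                       unsigned int pixels_per_line,
                                       unsigned int output_height,
                                       const int16_t *taps) {
      src_ptr -= 2 * pixels_per_line;            /* start two rows above */
      for (unsigned int y = 0; y < output_height; y++) {
        for (int x = 0; x < 8; x++) {
          int sum = 0;
          for (int k = 0; k < 6; k++)
            sum += src_ptr[k * pixels_per_line + x] * taps[k];
          int v = (sum + 64) >> 7;
          output_ptr[x] = v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
        }
        src_ptr += pixels_per_line;
        output_ptr += dst_pitch;
      }
    }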
-
-;void vp9_filter_block1d16_v6_sse2
-;(
-;    unsigned short *src_ptr,
-;    unsigned char *output_ptr,
-;    int dst_pitch,
-;    unsigned int pixels_per_line,
-;    unsigned int pixel_step,
-;    unsigned int output_height,
-;    unsigned int output_width,
-;    const short    *vp9_filter
-;)
-;/************************************************************************************
-; Notes: filter_block1d16_v6 applies a 6-tap filter vertically to the input pixels. The
-; input pixel array has output_height rows.
-;*************************************************************************************/
-global sym(vp9_filter_block1d16_v6_sse2)
-sym(vp9_filter_block1d16_v6_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rax,        arg(7) ;vp9_filter
-        movsxd      rdx,        dword ptr arg(3) ;pixels_per_line
-
-        mov         rdi,        arg(1) ;output_ptr
-        mov         rsi,        arg(0) ;src_ptr
-
-        sub         rsi,        rdx
-        sub         rsi,        rdx
-
-        movsxd      rcx,        DWORD PTR arg(5) ;[output_height]
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(2) ; dst_pitch
-%endif
-
-.vp9_filter_block1d16_v6_sse2_loop:
-; The 6-tap partial sums are accumulated in the order 2 5 3 1 4 6; read the data in that order.
-        movdqa      xmm1,       XMMWORD PTR [rsi + rdx]       ; line 2
-        movdqa      xmm2,       XMMWORD PTR [rsi + rdx + 16]
-        pmullw      xmm1,       [rax + 16]
-        pmullw      xmm2,       [rax + 16]
-
-        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 4]       ; line 5
-        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 4 + 16]
-        pmullw      xmm3,       [rax + 64]
-        pmullw      xmm4,       [rax + 64]
-
-        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 2]       ; line 3
-        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 2 + 16]
-        pmullw      xmm5,       [rax + 32]
-        pmullw      xmm6,       [rax + 32]
-
-        movdqa      xmm7,       XMMWORD PTR [rsi]       ; line 1
-        movdqa      xmm0,       XMMWORD PTR [rsi + 16]
-        pmullw      xmm7,       [rax]
-        pmullw      xmm0,       [rax]
-
-        paddsw      xmm1,       xmm3
-        paddsw      xmm2,       xmm4
-        paddsw      xmm1,       xmm5
-        paddsw      xmm2,       xmm6
-        paddsw      xmm1,       xmm7
-        paddsw      xmm2,       xmm0
-
-        add         rsi,        rdx
-
-        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2]       ; line 4
-        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2 + 16]
-        pmullw      xmm3,       [rax + 48]
-        pmullw      xmm4,       [rax + 48]
-
-        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4]       ; line 6
-        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4 + 16]
-        pmullw      xmm5,       [rax + 80]
-        pmullw      xmm6,       [rax + 80]
-
-        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]
-        pxor        xmm0,       xmm0                        ; clear xmm0
-
-        paddsw      xmm1,       xmm3
-        paddsw      xmm2,       xmm4
-        paddsw      xmm1,       xmm5
-        paddsw      xmm2,       xmm6
-
-        paddsw      xmm1,       xmm7
-        paddsw      xmm2,       xmm7
-
-        psraw       xmm1,       7
-        psraw       xmm2,       7
-
-        packuswb    xmm1,       xmm2              ; pack and saturate
-        movdqa      XMMWORD PTR [rdi], xmm1       ; store the results in the destination
-%if ABI_IS_32BIT
-        add         rdi,        DWORD PTR arg(2) ;[dst_pitch]
-%else
-        add         rdi,        r8
-%endif
-        dec         rcx         ; decrement count
-        jnz         .vp9_filter_block1d16_v6_sse2_loop              ; next row
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_filter_block1d8_h6_only_sse2
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    int dst_pitch,
-;    unsigned int    output_height,
-;    const short    *vp9_filter
-;)
-; First-pass filter only when yoffset==0
-global sym(vp9_filter_block1d8_h6_only_sse2)
-sym(vp9_filter_block1d8_h6_only_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rdx,        arg(5) ;vp9_filter
-        mov         rsi,        arg(0) ;src_ptr
-
-        mov         rdi,        arg(2) ;output_ptr
-
-        movsxd      rcx,        dword ptr arg(4) ;output_height
-        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line            ; Pitch for Source
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(3) ;dst_pitch
-%endif
-        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
-
-.filter_block1d8_h6_only_rowloop:
-        movq        xmm3,       MMWORD PTR [rsi - 2]
-        movq        xmm1,       MMWORD PTR [rsi + 6]
-
-        prefetcht2  [rsi+rax-2]
-
-        pslldq      xmm1,       8
-        por         xmm1,       xmm3
-
-        movdqa      xmm4,       xmm1
-        movdqa      xmm5,       xmm1
-
-        movdqa      xmm6,       xmm1
-        movdqa      xmm7,       xmm1
-
-        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
-        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
-        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
-        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
-        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
-        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
-
-
-        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
-        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
-        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
-
-        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
-        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
-        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
-
-        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
-        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
-
-        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
-
-        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
-        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
-
-
-        paddsw      xmm4,       xmm7
-        paddsw      xmm4,       xmm5
-
-        paddsw      xmm4,       xmm3
-        paddsw      xmm4,       xmm6
-
-        paddsw      xmm4,       xmm1
-        paddsw      xmm4,       [GLOBAL(rd)]
-
-        psraw       xmm4,       7
-
-        packuswb    xmm4,       xmm0
-
-        movq        QWORD PTR [rdi],   xmm4       ; store the results in the destination
-        lea         rsi,        [rsi + rax]
-
-%if ABI_IS_32BIT
-        add         rdi,        DWORD Ptr arg(3) ;dst_pitch
-%else
-        add         rdi,        r8
-%endif
-        dec         rcx
-
-        jnz         .filter_block1d8_h6_only_rowloop               ; next row
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_filter_block1d16_h6_only_sse2
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    int dst_pitch,
-;    unsigned int    output_height,
-;    const short    *vp9_filter
-;)
-; First-pass filter only when yoffset==0
-global sym(vp9_filter_block1d16_h6_only_sse2)
-sym(vp9_filter_block1d16_h6_only_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rdx,        arg(5) ;vp9_filter
-        mov         rsi,        arg(0) ;src_ptr
-
-        mov         rdi,        arg(2) ;output_ptr
-
-        movsxd      rcx,        dword ptr arg(4) ;output_height
-        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line            ; Pitch for Source
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(3) ;dst_pitch
-%endif
-
-        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
-
-.filter_block1d16_h6_only_sse2_rowloop:
-        movq        xmm3,       MMWORD PTR [rsi - 2]
-        movq        xmm1,       MMWORD PTR [rsi + 6]
-
-        movq        xmm2,       MMWORD PTR [rsi +14]
-        pslldq      xmm2,       8
-
-        por         xmm2,       xmm1
-        prefetcht2  [rsi+rax-2]
-
-        pslldq      xmm1,       8
-        por         xmm1,       xmm3
-
-        movdqa      xmm4,       xmm1
-        movdqa      xmm5,       xmm1
-
-        movdqa      xmm6,       xmm1
-        movdqa      xmm7,       xmm1
-
-        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
-        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
-        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
-        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
-        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
-        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
-
-        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
-        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
-        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
-
-        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
-        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
-        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
-
-        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
-        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
-        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
-
-        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
-        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
-
-        paddsw      xmm4,       xmm7
-        paddsw      xmm4,       xmm5
-
-        paddsw      xmm4,       xmm3
-        paddsw      xmm4,       xmm6
-
-        paddsw      xmm4,       xmm1
-        paddsw      xmm4,       [GLOBAL(rd)]
-
-        psraw       xmm4,       7
-
-        packuswb    xmm4,       xmm0                        ; lower 8 bytes
-
-        movq        QWORD Ptr [rdi],         xmm4           ; store the results in the destination
-
-        movdqa      xmm3,       xmm2
-        movdqa      xmm4,       xmm2
-
-        movdqa      xmm5,       xmm2
-        movdqa      xmm6,       xmm2
-
-        movdqa      xmm7,       xmm2
-
-        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
-        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
-        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
-        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
-        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
-        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
-
-        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
-        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
-        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
-
-        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
-        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
-        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
-
-        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
-        psrldq      xmm2,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
-        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
-
-        punpcklbw   xmm2,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
-        pmullw      xmm2,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
-
-        paddsw      xmm4,       xmm7
-        paddsw      xmm4,       xmm5
-
-        paddsw      xmm4,       xmm3
-        paddsw      xmm4,       xmm6
-
-        paddsw      xmm4,       xmm2
-        paddsw      xmm4,       [GLOBAL(rd)]
-
-        psraw       xmm4,       7
-
-        packuswb    xmm4,       xmm0                        ; higher 8 bytes
-
-        movq        QWORD Ptr [rdi+8],      xmm4            ; store the results in the destination
-
-        lea         rsi,        [rsi + rax]
-%if ABI_IS_32BIT
-        add         rdi,        DWORD Ptr arg(3) ;dst_pitch
-%else
-        add         rdi,        r8
-%endif
-
-        dec         rcx
-        jnz         .filter_block1d16_h6_only_sse2_rowloop               ; next row
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_filter_block1d8_v6_only_sse2
-;(
-;    unsigned char *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char *output_ptr,
-;    int dst_pitch,
-;    unsigned int output_height,
-;    const short    *vp9_filter
-;)
-; Second-pass filter only when xoffset==0
-global sym(vp9_filter_block1d8_v6_only_sse2)
-sym(vp9_filter_block1d8_v6_only_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi,        arg(0) ;src_ptr
-        mov         rdi,        arg(2) ;output_ptr
-
-        movsxd      rcx,        dword ptr arg(4) ;output_height
-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
-
-        mov         rax,        arg(5) ;vp9_filter
-
-        pxor        xmm0,       xmm0                        ; clear xmm0
-
-        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(3) ; dst_pitch
-%endif
-
-.vp9_filter_block1d8_v6_only_sse2_loop:
-        movq        xmm1,       MMWORD PTR [rsi]
-        movq        xmm2,       MMWORD PTR [rsi + rdx]
-        movq        xmm3,       MMWORD PTR [rsi + rdx * 2]
-        movq        xmm5,       MMWORD PTR [rsi + rdx * 4]
-        add         rsi,        rdx
-        movq        xmm4,       MMWORD PTR [rsi + rdx * 2]
-        movq        xmm6,       MMWORD PTR [rsi + rdx * 4]
-
-        punpcklbw   xmm1,       xmm0
-        pmullw      xmm1,       [rax]
-
-        punpcklbw   xmm2,       xmm0
-        pmullw      xmm2,       [rax + 16]
-
-        punpcklbw   xmm3,       xmm0
-        pmullw      xmm3,       [rax + 32]
-
-        punpcklbw   xmm5,       xmm0
-        pmullw      xmm5,       [rax + 64]
-
-        punpcklbw   xmm4,       xmm0
-        pmullw      xmm4,       [rax + 48]
-
-        punpcklbw   xmm6,       xmm0
-        pmullw      xmm6,       [rax + 80]
-
-        paddsw      xmm2,       xmm5
-        paddsw      xmm2,       xmm3
-
-        paddsw      xmm2,       xmm1
-        paddsw      xmm2,       xmm4
-
-        paddsw      xmm2,       xmm6
-        paddsw      xmm2,       xmm7
-
-        psraw       xmm2,       7
-        packuswb    xmm2,       xmm0              ; pack and saturate
-
-        movq        QWORD PTR [rdi], xmm2         ; store the results in the destination
-%if ABI_IS_32BIT
-        add         rdi,        DWORD PTR arg(3) ;[dst_pitch]
-%else
-        add         rdi,        r8
-%endif
-        dec         rcx         ; decrement count
-        jnz         .vp9_filter_block1d8_v6_only_sse2_loop              ; next row
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_unpack_block1d16_h6_sse2
-;(
-;    unsigned char  *src_ptr,
-;    unsigned short *output_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned int    output_height,
-;    unsigned int    output_width
-;)
-global sym(vp9_unpack_block1d16_h6_sse2)
-sym(vp9_unpack_block1d16_h6_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi,        arg(0) ;src_ptr
-        mov         rdi,        arg(1) ;output_ptr
-
-        movsxd      rcx,        dword ptr arg(3) ;output_height
-        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
-
-        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(4) ;output_width            ; Pitch for Source
-%endif
-
-.unpack_block1d16_h6_sse2_rowloop:
-        movq        xmm1,       MMWORD PTR [rsi]            ; 07 06 05 04 03 02 01 00
-        movq        xmm3,       MMWORD PTR [rsi+8]          ; 0f 0e 0d 0c 0b 0a 09 08
-
-        punpcklbw   xmm3,       xmm0                        ; xx0f xx0e xx0d xx0c xx0b xx0a xx09 xx08
-        punpcklbw   xmm1,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
-
-        movdqa      XMMWORD Ptr [rdi],         xmm1
-        movdqa      XMMWORD Ptr [rdi + 16],    xmm3
-
-        lea         rsi,        [rsi + rax]
-%if ABI_IS_32BIT
-        add         rdi,        DWORD Ptr arg(4) ;[output_width]
-%else
-        add         rdi,        r8
-%endif
-        dec         rcx
-        jnz         .unpack_block1d16_h6_sse2_rowloop               ; next row
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
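A hedged sketch of the unpack helper above: it is the degenerate first pass used when no horizontal filtering is wanted, simply widening source bytes into the 16-bit intermediate layout (the punpcklbw against zero in the loop) so the generic vertical pass can consume them:

    #include <stdint.h>

    static void unpack_block1d16_ref(const uint8_t *src_ptr, uint16_t *output_ptr,
                                     unsigned int src_pixels_per_line,
                                     unsigned int output_height,
                                     unsigned int output_width) {
      for (unsigned int y = 0; y < output_height; y++) {
        for (int x = 0; x < 16; x++)
          output_ptr[x] = src_ptr[x];   /* zero-extend byte to word */
        src_ptr += src_pixels_per_line;
        output_ptr += output_width;     /* assumed element pitch */
      }
    }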
-
-;void vp9_bilinear_predict16x16_sse2
-;(
-;    unsigned char  *src_ptr,
-;    int   src_pixels_per_line,
-;    int  xoffset,
-;    int  yoffset,
-;    unsigned char *dst_ptr,
-;    int dst_pitch
-;)
-extern sym(vp9_bilinear_filters_mmx)
-global sym(vp9_bilinear_predict16x16_sse2)
-sym(vp9_bilinear_predict16x16_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ;const short *HFilter = bilinear_filters_mmx[xoffset]
-    ;const short *VFilter = bilinear_filters_mmx[yoffset]
-
-        lea         rcx,        [GLOBAL(sym(vp9_bilinear_filters_mmx))]
-        movsxd      rax,        dword ptr arg(2) ;xoffset
-
-        cmp         rax,        0      ;skip first_pass filter if xoffset=0
-        je          .b16x16_sp_only
-
-        shl         rax,        5
-        add         rax,        rcx    ;HFilter
-
-        mov         rdi,        arg(4) ;dst_ptr
-        mov         rsi,        arg(0) ;src_ptr
-        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
-
-        movdqa      xmm1,       [rax]
-        movdqa      xmm2,       [rax+16]
-
-        movsxd      rax,        dword ptr arg(3) ;yoffset
-
-        cmp         rax,        0      ;skip second_pass filter if yoffset=0
-        je          .b16x16_fp_only
-
-        shl         rax,        5
-        add         rax,        rcx    ;VFilter
-
-        lea         rcx,        [rdi+rdx*8]
-        lea         rcx,        [rcx+rdx*8]
-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
-
-        pxor        xmm0,       xmm0
-
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(5) ;dst_pitch
-%endif
-        ; get the first horizontal line done
-        movdqu      xmm3,       [rsi]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
-        movdqa      xmm4,       xmm3                 ; make a copy of current line
-
-        punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
-        punpckhbw   xmm4,       xmm0                 ; 08 09 10 11 12 13 14 15
-
-        pmullw      xmm3,       xmm1
-        pmullw      xmm4,       xmm1
-
-        movdqu      xmm5,       [rsi+1]
-        movdqa      xmm6,       xmm5
-
-        punpcklbw   xmm5,       xmm0
-        punpckhbw   xmm6,       xmm0
-
-        pmullw      xmm5,       xmm2
-        pmullw      xmm6,       xmm2
-
-        paddw       xmm3,       xmm5
-        paddw       xmm4,       xmm6
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       xmm4,       [GLOBAL(rd)]
-        psraw       xmm4,       VP9_FILTER_SHIFT
-
-        movdqa      xmm7,       xmm3
-        packuswb    xmm7,       xmm4
-
-        add         rsi,        rdx                 ; next line
-.next_row:
-        movdqu      xmm3,       [rsi]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
-        movdqa      xmm4,       xmm3                 ; make a copy of current line
-
-        punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
-        punpckhbw   xmm4,       xmm0                 ; 08 09 10 11 12 13 14 15
-
-        pmullw      xmm3,       xmm1
-        pmullw      xmm4,       xmm1
-
-        movdqu      xmm5,       [rsi+1]
-        movdqa      xmm6,       xmm5
-
-        punpcklbw   xmm5,       xmm0
-        punpckhbw   xmm6,       xmm0
-
-        pmullw      xmm5,       xmm2
-        pmullw      xmm6,       xmm2
-
-        paddw       xmm3,       xmm5
-        paddw       xmm4,       xmm6
-
-        movdqa      xmm5,       xmm7
-        movdqa      xmm6,       xmm7
-
-        punpcklbw   xmm5,       xmm0
-        punpckhbw   xmm6,       xmm0
-
-        pmullw      xmm5,       [rax]
-        pmullw      xmm6,       [rax]
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       xmm4,       [GLOBAL(rd)]
-        psraw       xmm4,       VP9_FILTER_SHIFT
-
-        movdqa      xmm7,       xmm3
-        packuswb    xmm7,       xmm4
-
-        pmullw      xmm3,       [rax+16]
-        pmullw      xmm4,       [rax+16]
-
-        paddw       xmm3,       xmm5
-        paddw       xmm4,       xmm6
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       xmm4,       [GLOBAL(rd)]
-        psraw       xmm4,       VP9_FILTER_SHIFT
-
-        packuswb    xmm3,       xmm4
-        movdqa      [rdi],      xmm3                 ; store the results in the destination
-
-        add         rsi,        rdx                 ; next line
-%if ABI_IS_32BIT
-        add         rdi,        DWORD PTR arg(5) ;dst_pitch
-%else
-        add         rdi,        r8
-%endif
-
-        cmp         rdi,        rcx
-        jne         .next_row
-
-        jmp         .done
-
-.b16x16_sp_only:
-        movsxd      rax,        dword ptr arg(3) ;yoffset
-        shl         rax,        5
-        add         rax,        rcx    ;VFilter
-
-        mov         rdi,        arg(4) ;dst_ptr
-        mov         rsi,        arg(0) ;src_ptr
-        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
-
-        movdqa      xmm1,       [rax]
-        movdqa      xmm2,       [rax+16]
-
-        lea         rcx,        [rdi+rdx*8]
-        lea         rcx,        [rcx+rdx*8]
-        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line
-
-        pxor        xmm0,       xmm0
-
-        ; get the first horizontal line done
-        movdqu      xmm7,       [rsi]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
-
-        add         rsi,        rax                 ; next line
-.next_row_spo:
-        movdqu      xmm3,       [rsi]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
-
-        movdqa      xmm5,       xmm7
-        movdqa      xmm6,       xmm7
-
-        movdqa      xmm4,       xmm3                 ; make a copy of current line
-        movdqa      xmm7,       xmm3
-
-        punpcklbw   xmm5,       xmm0
-        punpckhbw   xmm6,       xmm0
-        punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
-        punpckhbw   xmm4,       xmm0
-
-        pmullw      xmm5,       xmm1
-        pmullw      xmm6,       xmm1
-        pmullw      xmm3,       xmm2
-        pmullw      xmm4,       xmm2
-
-        paddw       xmm3,       xmm5
-        paddw       xmm4,       xmm6
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       xmm4,       [GLOBAL(rd)]
-        psraw       xmm4,       VP9_FILTER_SHIFT
-
-        packuswb    xmm3,       xmm4
-        movdqa      [rdi],      xmm3                 ; store the results in the destination
-
-        add         rsi,        rax                 ; next line
-        add         rdi,        rdx                 ;dst_pitch
-        cmp         rdi,        rcx
-        jne         .next_row_spo
-
-        jmp         .done
-
-.b16x16_fp_only:
-        lea         rcx,        [rdi+rdx*8]
-        lea         rcx,        [rcx+rdx*8]
-        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line
-        pxor        xmm0,       xmm0
-
-.next_row_fpo:
-        movdqu      xmm3,       [rsi]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
-        movdqa      xmm4,       xmm3                 ; make a copy of current line
-
-        punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
-        punpckhbw   xmm4,       xmm0                 ; 08 09 10 11 12 13 14 15
-
-        pmullw      xmm3,       xmm1
-        pmullw      xmm4,       xmm1
-
-        movdqu      xmm5,       [rsi+1]
-        movdqa      xmm6,       xmm5
-
-        punpcklbw   xmm5,       xmm0
-        punpckhbw   xmm6,       xmm0
-
-        pmullw      xmm5,       xmm2
-        pmullw      xmm6,       xmm2
-
-        paddw       xmm3,       xmm5
-        paddw       xmm4,       xmm6
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       xmm4,       [GLOBAL(rd)]
-        psraw       xmm4,       VP9_FILTER_SHIFT
-
-        packuswb    xmm3,       xmm4
-        movdqa      [rdi],      xmm3                 ; store the results in the destination
-
-        add         rsi,        rax                 ; next line
-        add         rdi,        rdx                 ; dst_pitch
-        cmp         rdi,        rcx
-        jne         .next_row_fpo
-
-.done:
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
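A hedged C outline of the control flow above: the full two-pass blend only runs when both offsets are non-zero; xoffset==0 or yoffset==0 falls through to a single-pass variant (the .b16x16_sp_only and .b16x16_fp_only labels). Everything below is illustrative, including the local copy of the filter table:

    #include <stdint.h>

    static const int16_t bf[8][2] = { {128, 0}, {112, 16}, {96, 32}, {80, 48},
                                      {64, 64}, {48, 80}, {32, 96}, {16, 112} };

    static void bilinear_predict16x16_ref(const uint8_t *src, int src_stride,
                                          int xoffset, int yoffset,
                                          uint8_t *dst, int dst_pitch) {
      uint8_t tmp[17 * 16];
      int rows = yoffset ? 17 : 16;     /* the vertical tap needs one extra row */
      /* first pass: horizontal blend into tmp (a copy when xoffset == 0) */
      for (int y = 0; y < rows; y++)
        for (int x = 0; x < 16; x++)
          tmp[y * 16 + x] = xoffset
              ? (uint8_t)((src[y * src_stride + x]     * bf[xoffset][0] +
                           src[y * src_stride + x + 1] * bf[xoffset][1] + 64) >> 7)
              : src[y * src_stride + x];
      /* second pass: vertical blend into dst (a copy when yoffset == 0) */
      for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++)
          dst[y * dst_pitch + x] = yoffset
              ? (uint8_t)((tmp[y * 16 + x]       * bf[yoffset][0] +
                           tmp[(y + 1) * 16 + x] * bf[yoffset][1] + 64) >> 7)
              : tmp[y * 16 + x];
    }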
-
-;void vp9_bilinear_predict8x8_sse2
-;(
-;    unsigned char  *src_ptr,
-;    int   src_pixels_per_line,
-;    int  xoffset,
-;    int  yoffset,
-;    unsigned char *dst_ptr,
-;    int dst_pitch
-;)
-extern sym(vp9_bilinear_filters_mmx)
-global sym(vp9_bilinear_predict8x8_sse2)
-sym(vp9_bilinear_predict8x8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 144                         ; reserve 144 bytes
-
-    ;const short *HFilter = bilinear_filters_mmx[xoffset]
-    ;const short *VFilter = bilinear_filters_mmx[yoffset]
-        lea         rcx,        [GLOBAL(sym(vp9_bilinear_filters_mmx))]
-
-        mov         rsi,        arg(0) ;src_ptr
-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
-
-    ;Read the 9 rows of unaligned source data in and put them on the stack.
-    ;This gives a big performance boost.
-        movdqu      xmm0,       [rsi]
-        lea         rax,        [rdx + rdx*2]
-        movdqu      xmm1,       [rsi+rdx]
-        movdqu      xmm2,       [rsi+rdx*2]
-        add         rsi,        rax
-        movdqu      xmm3,       [rsi]
-        movdqu      xmm4,       [rsi+rdx]
-        movdqu      xmm5,       [rsi+rdx*2]
-        add         rsi,        rax
-        movdqu      xmm6,       [rsi]
-        movdqu      xmm7,       [rsi+rdx]
-
-        movdqa      XMMWORD PTR [rsp],            xmm0
-
-        movdqu      xmm0,       [rsi+rdx*2]
-
-        movdqa      XMMWORD PTR [rsp+16],         xmm1
-        movdqa      XMMWORD PTR [rsp+32],         xmm2
-        movdqa      XMMWORD PTR [rsp+48],         xmm3
-        movdqa      XMMWORD PTR [rsp+64],         xmm4
-        movdqa      XMMWORD PTR [rsp+80],         xmm5
-        movdqa      XMMWORD PTR [rsp+96],         xmm6
-        movdqa      XMMWORD PTR [rsp+112],        xmm7
-        movdqa      XMMWORD PTR [rsp+128],        xmm0
-
-        movsxd      rax,        dword ptr arg(2) ;xoffset
-        shl         rax,        5
-        add         rax,        rcx    ;HFilter
-
-        mov         rdi,        arg(4) ;dst_ptr
-        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
-
-        movdqa      xmm1,       [rax]
-        movdqa      xmm2,       [rax+16]
-
-        movsxd      rax,        dword ptr arg(3) ;yoffset
-        shl         rax,        5
-        add         rax,        rcx    ;VFilter
-
-        lea         rcx,        [rdi+rdx*8]
-
-        movdqa      xmm5,       [rax]
-        movdqa      xmm6,       [rax+16]
-
-        pxor        xmm0,       xmm0
-
-        ; get the first horizontal line done
-        movdqa      xmm3,       XMMWORD PTR [rsp]
-        movdqa      xmm4,       xmm3                 ; make a copy of current line
-        psrldq      xmm4,       1
-
-        punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
-        punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08
-
-        pmullw      xmm3,       xmm1
-        pmullw      xmm4,       xmm2
-
-        paddw       xmm3,       xmm4
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
-
-        movdqa      xmm7,       xmm3
-        add         rsp,        16                 ; next line
-.next_row8x8:
-        movdqa      xmm3,       XMMWORD PTR [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
-        movdqa      xmm4,       xmm3                 ; make a copy of current line
-        psrldq      xmm4,       1
-
-        punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
-        punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08
-
-        pmullw      xmm3,       xmm1
-        pmullw      xmm4,       xmm2
-
-        paddw       xmm3,       xmm4
-        pmullw      xmm7,       xmm5
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
-
-        movdqa      xmm4,       xmm3
-
-        pmullw      xmm3,       xmm6
-        paddw       xmm3,       xmm7
-
-        movdqa      xmm7,       xmm4
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
-
-        packuswb    xmm3,       xmm0
-        movq        [rdi],      xmm3                 ; store the results in the destination
-
-        add         rsp,        16                 ; next line
-        add         rdi,        rdx
-
-        cmp         rdi,        rcx
-        jne         .next_row8x8
-
-    ;add rsp, 144
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
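A note on the 8x8 variant above: the run of movdqu loads at the top copies all nine source rows onto an aligned stack buffer once, so every read inside the filter loop is an aligned movdqa (the comment in the code credits this staging with the speedup). A C analogue of that staging step, with a hypothetical helper name:

    #include <stdint.h>
    #include <string.h>

    /* eight two-pass output rows need nine input rows */
    static void stage_rows_8x8(const uint8_t *src, int src_stride,
                               uint8_t staged[9][16]) {
      for (int y = 0; y < 9; y++)
        memcpy(staged[y], src + y * src_stride, 16);  /* one unaligned read per row */
    }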
-
-SECTION_RODATA
-align 16
-rd:
-    times 8 dw 0x40
--- a/vp8/common/x86/subpixel_ssse3.asm
+++ /dev/null
@@ -1,1515 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define BLOCK_HEIGHT_WIDTH 4
-%define VP9_FILTER_WEIGHT 128
-%define VP9_FILTER_SHIFT  7
-
-
-;/************************************************************************************
-; Notes: filter_block1d_h6 applies a 6-tap filter horizontally to the input pixels. The
-; input pixel array has output_height rows. This routine assumes that output_height is an
-; even number. This function handles 8 pixels in the horizontal direction, calculating ONE
-; row per iteration to take advantage of the 128-bit operations.
-;
-; This is an implementation of some of the SSE optimizations first seen in ffvp8
-;
-;*************************************************************************************/
-;void vp9_filter_block1d8_h6_ssse3
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
-;    unsigned int    vp9_filter_index
-;)
-global sym(vp9_filter_block1d8_h6_ssse3)
-sym(vp9_filter_block1d8_h6_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    movsxd      rdx, DWORD PTR arg(5)   ;table index
-    xor         rsi, rsi
-    shl         rdx, 4
-
-    movdqa      xmm7, [GLOBAL(rd)]
-
-    lea         rax, [GLOBAL(k0_k5)]
-    add         rax, rdx
-    mov         rdi, arg(2)             ;output_ptr
-
-    cmp         esi, DWORD PTR [rax]
-    je          vp9_filter_block1d8_h4_ssse3
-
-    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
-    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
-    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
-
-    mov         rsi, arg(0)             ;src_ptr
-    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
-    movsxd      rcx, dword ptr arg(4)   ;output_height
-
-    movsxd      rdx, dword ptr arg(3)   ;output_pitch
-
-    sub         rdi, rdx
-;xmm3 free
-.filter_block1d8_h6_rowloop_ssse3:
-    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
-
-    movq        xmm2,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
-
-    punpcklbw   xmm0,   xmm2                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
-
-    movdqa      xmm1,   xmm0
-    pmaddubsw   xmm0,   xmm4
-
-    movdqa      xmm2,   xmm1
-    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
-
-    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
-    pmaddubsw   xmm1,   xmm5
-
-    lea         rdi,    [rdi + rdx]
-    pmaddubsw   xmm2,   xmm6
-
-    lea         rsi,    [rsi + rax]
-    dec         rcx
-
-    paddsw      xmm0,   xmm1
-    paddsw      xmm2,   xmm7
-
-    paddsw      xmm0,   xmm2
-
-    psraw       xmm0,   7
-
-    packuswb    xmm0,   xmm0
-
-    movq        MMWORD PTR [rdi], xmm0
-    jnz         .filter_block1d8_h6_rowloop_ssse3
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-vp9_filter_block1d8_h4_ssse3:
-    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
-    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
-
-    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]
-    movdqa      xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]
-
-    mov         rsi, arg(0)             ;src_ptr
-
-    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
-    movsxd      rcx, dword ptr arg(4)   ;output_height
-
-    movsxd      rdx, dword ptr arg(3)   ;output_pitch
-
-    sub         rdi, rdx
-
-.filter_block1d8_h4_rowloop_ssse3:
-    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
-
-    movq        xmm1,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
-
-    punpcklbw   xmm0,   xmm1                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
-
-    movdqa      xmm2,   xmm0
-    pshufb      xmm0,   xmm3
-
-    pshufb      xmm2,   xmm4
-    pmaddubsw   xmm0,   xmm5
-
-    lea         rdi,    [rdi + rdx]
-    pmaddubsw   xmm2,   xmm6
-
-    lea         rsi,    [rsi + rax]
-    dec         rcx
-
-    paddsw      xmm0,   xmm7
-
-    paddsw      xmm0,   xmm2
-
-    psraw       xmm0,   7
-
-    packuswb    xmm0,   xmm0
-
-    movq        MMWORD PTR [rdi], xmm0
-
-    jnz         .filter_block1d8_h4_rowloop_ssse3
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-;void vp9_filter_block1d16_h6_ssse3
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
-;    unsigned int    vp9_filter_index
-;)
-global sym(vp9_filter_block1d16_h6_ssse3)
-sym(vp9_filter_block1d16_h6_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    movsxd      rdx, DWORD PTR arg(5)           ;table index
-    xor         rsi, rsi
-    shl         rdx, 4      ;
-
-    lea         rax, [GLOBAL(k0_k5)]
-    add         rax, rdx
-
-    mov         rdi, arg(2)                     ;output_ptr
-
-    mov         rsi, arg(0)                     ;src_ptr
-
-    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
-    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
-    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
-
-    movsxd      rax, dword ptr arg(1)           ;src_pixels_per_line
-    movsxd      rcx, dword ptr arg(4)           ;output_height
-    movsxd      rdx, dword ptr arg(3)           ;output_pitch
-
-.filter_block1d16_h6_rowloop_ssse3:
-    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
-
-    movq        xmm3,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
-
-    punpcklbw   xmm0,   xmm3                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
-
-    movdqa      xmm1,   xmm0
-    pmaddubsw   xmm0,   xmm4
-
-    movdqa      xmm2,   xmm1
-    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
-
-    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
-    movq        xmm3,   MMWORD PTR [rsi +  6]
-
-    pmaddubsw   xmm1,   xmm5
-    movq        xmm7,   MMWORD PTR [rsi + 11]
-
-    pmaddubsw   xmm2,   xmm6
-    punpcklbw   xmm3,   xmm7
-
-    paddsw      xmm0,   xmm1
-    movdqa      xmm1,   xmm3
-
-    pmaddubsw   xmm3,   xmm4
-    paddsw      xmm0,   xmm2
-
-    movdqa      xmm2,   xmm1
-    paddsw      xmm0,   [GLOBAL(rd)]
-
-    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
-    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
-
-    psraw       xmm0,   7
-    pmaddubsw   xmm1,   xmm5
-
-    pmaddubsw   xmm2,   xmm6
-    packuswb    xmm0,   xmm0
-
-    lea         rsi,    [rsi + rax]
-    paddsw      xmm3,   xmm1
-
-    paddsw      xmm3,   xmm2
-
-    paddsw      xmm3,   [GLOBAL(rd)]
-
-    psraw       xmm3,   7
-
-    packuswb    xmm3,   xmm3
-
-    punpcklqdq  xmm0,   xmm3
-
-    movdqa      XMMWORD PTR [rdi], xmm0
-
-    lea         rdi,    [rdi + rdx]
-    dec         rcx
-    jnz         .filter_block1d16_h6_rowloop_ssse3
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_filter_block1d4_h6_ssse3
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
-;    unsigned int    vp9_filter_index
-;)
-global sym(vp9_filter_block1d4_h6_ssse3)
-sym(vp9_filter_block1d4_h6_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    movsxd      rdx, DWORD PTR arg(5)   ;table index
-    xor         rsi, rsi
-    shl         rdx, 4      ;
-
-    lea         rax, [GLOBAL(k0_k5)]
-    add         rax, rdx
-    movdqa      xmm7, [GLOBAL(rd)]
-
-    cmp         esi, DWORD PTR [rax]
-    je          .vp9_filter_block1d4_h4_ssse3
-
-    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
-    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
-    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
-
-    mov         rsi, arg(0)             ;src_ptr
-    mov         rdi, arg(2)             ;output_ptr
-    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
-    movsxd      rcx, dword ptr arg(4)   ;output_height
-
-    movsxd      rdx, dword ptr arg(3)   ;output_pitch
-
-;xmm3 free
-.filter_block1d4_h6_rowloop_ssse3:
-    movdqu      xmm0,   XMMWORD PTR [rsi - 2]
-
-    movdqa      xmm1, xmm0
-    pshufb      xmm0, [GLOBAL(shuf1b)]
-
-    movdqa      xmm2, xmm1
-    pshufb      xmm1, [GLOBAL(shuf2b)]
-    pmaddubsw   xmm0, xmm4
-    pshufb      xmm2, [GLOBAL(shuf3b)]
-    pmaddubsw   xmm1, xmm5
-
-;--
-    pmaddubsw   xmm2, xmm6
-
-    lea         rsi,    [rsi + rax]
-;--
-    paddsw      xmm0, xmm1
-    paddsw      xmm0, xmm7
-    pxor        xmm1, xmm1
-    paddsw      xmm0, xmm2
-    psraw       xmm0, 7
-    packuswb    xmm0, xmm0
-
-    movd        DWORD PTR [rdi], xmm0
-
-    add         rdi, rdx
-    dec         rcx
-    jnz         .filter_block1d4_h6_rowloop_ssse3
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-.vp9_filter_block1d4_h4_ssse3:
-    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
-    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
-    movdqa      xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
-    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf3b)]
-
-    mov         rsi, arg(0)             ;src_ptr
-    mov         rdi, arg(2)             ;output_ptr
-    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
-    movsxd      rcx, dword ptr arg(4)   ;output_height
-
-    movsxd      rdx, dword ptr arg(3)   ;output_pitch
-
-.filter_block1d4_h4_rowloop_ssse3:
-    movdqu      xmm1,   XMMWORD PTR [rsi - 2]
-
-    movdqa      xmm2, xmm1
-    pshufb      xmm1, xmm0 ;;[GLOBAL(shuf2b)]
-    pshufb      xmm2, xmm3 ;;[GLOBAL(shuf3b)]
-    pmaddubsw   xmm1, xmm5
-
-;--
-    pmaddubsw   xmm2, xmm6
-
-    lea         rsi,    [rsi + rax]
-;--
-    paddsw      xmm1, xmm7
-    paddsw      xmm1, xmm2
-    psraw       xmm1, 7
-    packuswb    xmm1, xmm1
-
-    movd        DWORD PTR [rdi], xmm1
-
-    add         rdi, rdx
-    dec         rcx
-    jnz         .filter_block1d4_h4_rowloop_ssse3
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
-;void vp9_filter_block1d16_v6_ssse3
-;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    unsigned int   vp9_filter_index
-;)
-global sym(vp9_filter_block1d16_v6_ssse3)
-sym(vp9_filter_block1d16_v6_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    movsxd      rdx, DWORD PTR arg(5)   ;table index
-    xor         rsi, rsi
-    shl         rdx, 4      ;
-
-    lea         rax, [GLOBAL(k0_k5)]
-    add         rax, rdx
-
-    cmp         esi, DWORD PTR [rax]
-    je          .vp9_filter_block1d16_v4_ssse3
-
-    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
-    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
-    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
-
-    mov         rsi, arg(0)             ;src_ptr
-    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
-    mov         rdi, arg(2)             ;output_ptr
-
-%if ABI_IS_32BIT=0
-    movsxd      r8, DWORD PTR arg(3)    ;out_pitch
-%endif
-    mov         rax, rsi
-    movsxd      rcx, DWORD PTR arg(4)   ;output_height
-    add         rax, rdx
-
-
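-; Each iteration loads six source rows (A..F), interleaves them into the
-; byte pairs A|F, B|D and C|E, and multiplies each pair against the
-; matching coefficient pair with pmaddubsw, so the full 6-tap vertical
-; sum costs three multiply-adds per 8 pixels.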
-.vp9_filter_block1d16_v6_ssse3_loop:
-    movq        xmm1, MMWORD PTR [rsi]                  ;A
-    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
-    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
-    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
-    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
-
-    punpcklbw   xmm2, xmm4                  ;B D
-    punpcklbw   xmm3, xmm0                  ;C E
-
-    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F
-
-    pmaddubsw   xmm3, xmm6
-    punpcklbw   xmm1, xmm0                  ;A F
-    pmaddubsw   xmm2, xmm7
-    pmaddubsw   xmm1, xmm5
-
-    paddsw      xmm2, xmm3
-    paddsw      xmm2, xmm1
-    paddsw      xmm2, [GLOBAL(rd)]
-    psraw       xmm2, 7
-    packuswb    xmm2, xmm2
-
-    movq        MMWORD PTR [rdi], xmm2          ;store the results
-
-    movq        xmm1, MMWORD PTR [rsi + 8]                  ;A
-    movq        xmm2, MMWORD PTR [rsi + rdx + 8]            ;B
-    movq        xmm3, MMWORD PTR [rsi + rdx * 2 + 8]        ;C
-    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D
-    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E
-
-    punpcklbw   xmm2, xmm4                  ;B D
-    punpcklbw   xmm3, xmm0                  ;C E
-
-    movq        xmm0, MMWORD PTR [rax + rdx * 4 + 8]        ;F
-    pmaddubsw   xmm3, xmm6
-    punpcklbw   xmm1, xmm0                  ;A F
-    pmaddubsw   xmm2, xmm7
-    pmaddubsw   xmm1, xmm5
-
-    add         rsi,  rdx
-    add         rax,  rdx
-;--
-;--
-    paddsw      xmm2, xmm3
-    paddsw      xmm2, xmm1
-    paddsw      xmm2, [GLOBAL(rd)]
-    psraw       xmm2, 7
-    packuswb    xmm2, xmm2
-
-    movq        MMWORD PTR [rdi+8], xmm2
-
-%if ABI_IS_32BIT
-    add         rdi,        DWORD PTR arg(3) ;out_pitch
-%else
-    add         rdi,        r8
-%endif
-    dec         rcx
-    jnz         .vp9_filter_block1d16_v6_ssse3_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-.vp9_filter_block1d16_v4_ssse3:
-    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
-    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
-
-    mov         rsi, arg(0)             ;src_ptr
-    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
-    mov         rdi, arg(2)             ;output_ptr
-
-%if ABI_IS_32BIT=0
-    movsxd      r8, DWORD PTR arg(3)    ;out_pitch
-%endif
-    mov         rax, rsi
-    movsxd      rcx, DWORD PTR arg(4)   ;output_height
-    add         rax, rdx
-
-.vp9_filter_block1d16_v4_ssse3_loop:
-    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
-    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
-    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
-    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
-
-    punpcklbw   xmm2, xmm4                  ;B D
-    punpcklbw   xmm3, xmm0                  ;C E
-
-    pmaddubsw   xmm3, xmm6
-    pmaddubsw   xmm2, xmm7
-    movq        xmm5, MMWORD PTR [rsi + rdx + 8]            ;B
-    movq        xmm1, MMWORD PTR [rsi + rdx * 2 + 8]        ;C
-    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D
-    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E
-
-    paddsw      xmm2, [GLOBAL(rd)]
-    paddsw      xmm2, xmm3
-    psraw       xmm2, 7
-    packuswb    xmm2, xmm2
-
-    punpcklbw   xmm5, xmm4                  ;B D
-    punpcklbw   xmm1, xmm0                  ;C E
-
-    pmaddubsw   xmm1, xmm6
-    pmaddubsw   xmm5, xmm7
-
-    movdqa      xmm4, [GLOBAL(rd)]
-    add         rsi,  rdx
-    add         rax,  rdx
-;--
-;--
-    paddsw      xmm5, xmm1
-    paddsw      xmm5, xmm4
-    psraw       xmm5, 7
-    packuswb    xmm5, xmm5
-
-    punpcklqdq  xmm2, xmm5
-
-    movdqa       XMMWORD PTR [rdi], xmm2
-
-%if ABI_IS_32BIT
-    add         rdi,        DWORD PTR arg(3) ;out_pitch
-%else
-    add         rdi,        r8
-%endif
-    dec         rcx
-    jnz         .vp9_filter_block1d16_v4_ssse3_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_filter_block1d8_v6_ssse3
-;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    unsigned int   vp9_filter_index
-;)
-global sym(vp9_filter_block1d8_v6_ssse3)
-sym(vp9_filter_block1d8_v6_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    movsxd      rdx, DWORD PTR arg(5)   ;table index
-    xor         rsi, rsi
-    shl         rdx, 4      ;
-
-    lea         rax, [GLOBAL(k0_k5)]
-    add         rax, rdx
-
-    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
-    mov         rdi, arg(2)             ;output_ptr
-%if ABI_IS_32BIT=0
-    movsxd      r8, DWORD PTR arg(3)    ; out_pitch
-%endif
-    movsxd      rcx, DWORD PTR arg(4)   ;[output_height]
-
-    cmp         esi, DWORD PTR [rax]
-    je          .vp9_filter_block1d8_v4_ssse3
-
-    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
-    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
-    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
-
-    mov         rsi, arg(0)             ;src_ptr
-
-    mov         rax, rsi
-    add         rax, rdx
-
-.vp9_filter_block1d8_v6_ssse3_loop:
-    movq        xmm1, MMWORD PTR [rsi]                  ;A
-    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
-    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
-    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
-    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
-
-    punpcklbw   xmm2, xmm4                  ;B D
-    punpcklbw   xmm3, xmm0                  ;C E
-
-    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F
-    movdqa      xmm4, [GLOBAL(rd)]
-
-    pmaddubsw   xmm3, xmm6
-    punpcklbw   xmm1, xmm0                  ;A F
-    pmaddubsw   xmm2, xmm7
-    pmaddubsw   xmm1, xmm5
-    add         rsi,  rdx
-    add         rax,  rdx
-;--
-;--
-    paddsw      xmm2, xmm3
-    paddsw      xmm2, xmm1
-    paddsw      xmm2, xmm4
-    psraw       xmm2, 7
-    packuswb    xmm2, xmm2
-
-    movq        MMWORD PTR [rdi], xmm2
-
-%if ABI_IS_32BIT
-    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
-%else
-    add         rdi,        r8
-%endif
-    dec         rcx
-    jnz         .vp9_filter_block1d8_v6_ssse3_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-.vp9_filter_block1d8_v4_ssse3:
-    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
-    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
-    movdqa      xmm5, [GLOBAL(rd)]
-
-    mov         rsi, arg(0)             ;src_ptr
-
-    mov         rax, rsi
-    add         rax, rdx
-
-.vp9_filter_block1d8_v4_ssse3_loop:
-    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
-    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
-    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
-    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
-
-    punpcklbw   xmm2, xmm4                  ;B D
-    punpcklbw   xmm3, xmm0                  ;C E
-
-    pmaddubsw   xmm3, xmm6
-    pmaddubsw   xmm2, xmm7
-    add         rsi,  rdx
-    add         rax,  rdx
-;--
-;--
-    paddsw      xmm2, xmm3
-    paddsw      xmm2, xmm5
-    psraw       xmm2, 7
-    packuswb    xmm2, xmm2
-
-    movq        MMWORD PTR [rdi], xmm2
-
-%if ABI_IS_32BIT
-    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
-%else
-    add         rdi,        r8
-%endif
-    dec         rcx
-    jnz         .vp9_filter_block1d8_v4_ssse3_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-;void vp9_filter_block1d4_v6_ssse3
-;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    unsigned int   vp9_filter_index
-;)
-global sym(vp9_filter_block1d4_v6_ssse3)
-sym(vp9_filter_block1d4_v6_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    movsxd      rdx, DWORD PTR arg(5)   ;table index
-    xor         rsi, rsi
-    shl         rdx, 4      ;
-
-    lea         rax, [GLOBAL(k0_k5)]
-    add         rax, rdx
-
-    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
-    mov         rdi, arg(2)             ;output_ptr
-%if ABI_IS_32BIT=0
-    movsxd      r8, DWORD PTR arg(3)    ; out_pitch
-%endif
-    movsxd      rcx, DWORD PTR arg(4)   ;[output_height]
-
-    cmp         esi, DWORD PTR [rax]
-    je          .vp9_filter_block1d4_v4_ssse3
-
-    movq        mm5, MMWORD PTR [rax]         ;k0_k5
-    movq        mm6, MMWORD PTR [rax+256]     ;k2_k4
-    movq        mm7, MMWORD PTR [rax+128]     ;k1_k3
-
-    mov         rsi, arg(0)             ;src_ptr
-
-    mov         rax, rsi
-    add         rax, rdx
-
-.vp9_filter_block1d4_v6_ssse3_loop:
-    movd        mm1, DWORD PTR [rsi]                  ;A
-    movd        mm2, DWORD PTR [rsi + rdx]            ;B
-    movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C
-    movd        mm4, DWORD PTR [rax + rdx * 2]        ;D
-    movd        mm0, DWORD PTR [rsi + rdx * 4]        ;E
-
-    punpcklbw   mm2, mm4                  ;B D
-    punpcklbw   mm3, mm0                  ;C E
-
-    movd        mm0, DWORD PTR [rax + rdx * 4]        ;F
-
-    movq        mm4, [GLOBAL(rd)]
-
-    pmaddubsw   mm3, mm6
-    punpcklbw   mm1, mm0                  ;A F
-    pmaddubsw   mm2, mm7
-    pmaddubsw   mm1, mm5
-    add         rsi,  rdx
-    add         rax,  rdx
-;--
-;--
-    paddsw      mm2, mm3
-    paddsw      mm2, mm1
-    paddsw      mm2, mm4
-    psraw       mm2, 7
-    packuswb    mm2, mm2
-
-    movd        DWORD PTR [rdi], mm2
-
-%if ABI_IS_32BIT
-    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
-%else
-    add         rdi,        r8
-%endif
-    dec         rcx
-    jnz         .vp9_filter_block1d4_v6_ssse3_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-.vp9_filter_block1d4_v4_ssse3:
-    movq        mm6, MMWORD PTR [rax+256]     ;k2_k4
-    movq        mm7, MMWORD PTR [rax+128]     ;k1_k3
-    movq        mm5, MMWORD PTR [GLOBAL(rd)]
-
-    mov         rsi, arg(0)             ;src_ptr
-
-    mov         rax, rsi
-    add         rax, rdx
-
-.vp9_filter_block1d4_v4_ssse3_loop:
-    movd        mm2, DWORD PTR [rsi + rdx]            ;B
-    movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C
-    movd        mm4, DWORD PTR [rax + rdx * 2]        ;D
-    movd        mm0, DWORD PTR [rsi + rdx * 4]        ;E
-
-    punpcklbw   mm2, mm4                  ;B D
-    punpcklbw   mm3, mm0                  ;C E
-
-    pmaddubsw   mm3, mm6
-    pmaddubsw   mm2, mm7
-    add         rsi,  rdx
-    add         rax,  rdx
-;--
-;--
-    paddsw      mm2, mm3
-    paddsw      mm2, mm5
-    psraw       mm2, 7
-    packuswb    mm2, mm2
-
-    movd        DWORD PTR [rdi], mm2
-
-%if ABI_IS_32BIT
-    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
-%else
-    add         rdi,        r8
-%endif
-    dec         rcx
-    jnz         .vp9_filter_block1d4_v4_ssse3_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_bilinear_predict16x16_ssse3
-;(
-;    unsigned char  *src_ptr,
-;    int   src_pixels_per_line,
-;    int  xoffset,
-;    int  yoffset,
-;    unsigned char *dst_ptr,
-;    int dst_pitch
-;)
-global sym(vp9_bilinear_predict16x16_ssse3)
-sym(vp9_bilinear_predict16x16_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
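-        ; Three paths follow: with both offsets nonzero the horizontal
-        ; pass feeds the vertical pass; with xoffset == 0 only the
-        ; vertical filter runs (.b16x16_sp_only); with yoffset == 0 only
-        ; the horizontal filter runs (.b16x16_fp_only).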
-
-        lea         rcx,        [GLOBAL(bilinear_filters_ssse3)]
-        movsxd      rax,        dword ptr arg(2)    ; xoffset
-
-        cmp         rax,        0                   ; skip first_pass filter if xoffset=0
-        je          .b16x16_sp_only
-
-        shl         rax,        4
-        lea         rax,        [rax + rcx]         ; HFilter
-
-        mov         rdi,        arg(4)              ; dst_ptr
-        mov         rsi,        arg(0)              ; src_ptr
-        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
-
-        movdqa      xmm1,       [rax]
-
-        movsxd      rax,        dword ptr arg(3)    ; yoffset
-
-        cmp         rax,        0                   ; skip second_pass filter if yoffset=0
-        je          .b16x16_fp_only
-
-        shl         rax,        4
-        lea         rax,        [rax + rcx]         ; VFilter
-
-        lea         rcx,        [rdi+rdx*8]
-        lea         rcx,        [rcx+rdx*8]
-        movsxd      rdx,        dword ptr arg(1)    ; src_pixels_per_line
-
-        movdqa      xmm2,       [rax]
-
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(5)    ; dst_pitch
-%endif
-        movq        xmm3,       [rsi]               ; 00 01 02 03 04 05 06 07
-        movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08
-
-        punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
-        movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15
-
-        movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16
-
-        lea         rsi,        [rsi + rdx]         ; next line
-
-        pmaddubsw   xmm3,       xmm1                ; 00 02 04 06 08 10 12 14
-
-        punpcklbw   xmm4,       xmm5                ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
-        pmaddubsw   xmm4,       xmm1                ; 01 03 05 07 09 11 13 15
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP9_FILTER_SHIFT    ; xmm3 /= 128
-
-        paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value
-        psraw       xmm4,       VP9_FILTER_SHIFT    ; xmm4 /= 128
-
-        movdqa      xmm7,       xmm3
-        packuswb    xmm7,       xmm4                ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
-
-.next_row:
-        movq        xmm6,       [rsi]               ; 00 01 02 03 04 05 06 07
-        movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08
-
-        punpcklbw   xmm6,       xmm5
-        movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15
-
-        movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16
-        lea         rsi,        [rsi + rdx]         ; next line
-
-        pmaddubsw   xmm6,       xmm1
-
-        punpcklbw   xmm4,       xmm5
-        pmaddubsw   xmm4,       xmm1
-
-        paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value
-        psraw       xmm6,       VP9_FILTER_SHIFT    ; xmm6 /= 128
-
-        paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value
-        psraw       xmm4,       VP9_FILTER_SHIFT    ; xmm4 /= 128
-
-        packuswb    xmm6,       xmm4
-        movdqa      xmm5,       xmm7
-
-        punpcklbw   xmm5,       xmm6
-        pmaddubsw   xmm5,       xmm2
-
-        punpckhbw   xmm7,       xmm6
-        pmaddubsw   xmm7,       xmm2
-
-        paddw       xmm5,       [GLOBAL(rd)]        ; xmm5 += round value
-        psraw       xmm5,       VP9_FILTER_SHIFT    ; xmm5 /= 128
-
-        paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value
-        psraw       xmm7,       VP9_FILTER_SHIFT    ; xmm7 /= 128
-
-        packuswb    xmm5,       xmm7
-        movdqa      xmm7,       xmm6
-
-        movdqa      [rdi],      xmm5                ; store the results in the destination
-%if ABI_IS_32BIT
-        add         rdi,        DWORD PTR arg(5)    ; dst_pitch
-%else
-        add         rdi,        r8
-%endif
-
-        cmp         rdi,        rcx
-        jne         .next_row
-
-        jmp         .done
-
-.b16x16_sp_only:
-        movsxd      rax,        dword ptr arg(3)    ; yoffset
-        shl         rax,        4
-        lea         rax,        [rax + rcx]         ; VFilter
-
-        mov         rdi,        arg(4)              ; dst_ptr
-        mov         rsi,        arg(0)              ; src_ptr
-        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
-
-        movdqa      xmm1,       [rax]               ; VFilter
-
-        lea         rcx,        [rdi+rdx*8]
-        lea         rcx,        [rcx+rdx*8]
-        movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line
-
-        ; get the first horizontal line done
-        movq        xmm4,       [rsi]               ; load row 0
-        movq        xmm2,       [rsi + 8]           ; load row 0
-
-        lea         rsi,        [rsi + rax]         ; next line
-.next_row_sp:
-        movq        xmm3,       [rsi]               ; load row + 1
-        movq        xmm5,       [rsi + 8]           ; load row + 1
-
-        punpcklbw   xmm4,       xmm3
-        punpcklbw   xmm2,       xmm5
-
-        pmaddubsw   xmm4,       xmm1
-        movq        xmm7,       [rsi + rax]         ; load row + 2
-
-        pmaddubsw   xmm2,       xmm1
-        movq        xmm6,       [rsi + rax + 8]     ; load row + 2
-
-        punpcklbw   xmm3,       xmm7
-        punpcklbw   xmm5,       xmm6
-
-        pmaddubsw   xmm3,       xmm1
-        paddw       xmm4,       [GLOBAL(rd)]
-
-        pmaddubsw   xmm5,       xmm1
-        paddw       xmm2,       [GLOBAL(rd)]
-
-        psraw       xmm4,       VP9_FILTER_SHIFT
-        psraw       xmm2,       VP9_FILTER_SHIFT
-
-        packuswb    xmm4,       xmm2
-        paddw       xmm3,       [GLOBAL(rd)]
-
-        movdqa      [rdi],      xmm4                ; store row 0
-        paddw       xmm5,       [GLOBAL(rd)]
-
-        psraw       xmm3,       VP9_FILTER_SHIFT
-        psraw       xmm5,       VP9_FILTER_SHIFT
-
-        packuswb    xmm3,       xmm5
-        movdqa      xmm4,       xmm7
-
-        movdqa      [rdi + rdx],xmm3                ; store row 1
-        lea         rsi,        [rsi + 2*rax]
-
-        movdqa      xmm2,       xmm6
-        lea         rdi,        [rdi + 2*rdx]
-
-        cmp         rdi,        rcx
-        jne         .next_row_sp
-
-        jmp         .done
-
-.b16x16_fp_only:
-        lea         rcx,        [rdi+rdx*8]
-        lea         rcx,        [rcx+rdx*8]
-        movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line
-
-.next_row_fp:
-        movq        xmm2,       [rsi]               ; 00 01 02 03 04 05 06 07
-        movq        xmm4,       [rsi+1]             ; 01 02 03 04 05 06 07 08
-
-        punpcklbw   xmm2,       xmm4
-        movq        xmm3,       [rsi+8]             ; 08 09 10 11 12 13 14 15
-
-        pmaddubsw   xmm2,       xmm1
-        movq        xmm4,       [rsi+9]             ; 09 10 11 12 13 14 15 16
-
-        lea         rsi,        [rsi + rax]         ; next line
-        punpcklbw   xmm3,       xmm4
-
-        pmaddubsw   xmm3,       xmm1
-        movq        xmm5,       [rsi]
-
-        paddw       xmm2,       [GLOBAL(rd)]
-        movq        xmm7,       [rsi+1]
-
-        movq        xmm6,       [rsi+8]
-        psraw       xmm2,       VP9_FILTER_SHIFT
-
-        punpcklbw   xmm5,       xmm7
-        movq        xmm7,       [rsi+9]
-
-        paddw       xmm3,       [GLOBAL(rd)]
-        pmaddubsw   xmm5,       xmm1
-
-        psraw       xmm3,       VP9_FILTER_SHIFT
-        punpcklbw   xmm6,       xmm7
-
-        packuswb    xmm2,       xmm3
-        pmaddubsw   xmm6,       xmm1
-
-        movdqa      [rdi],      xmm2                ; store the results in the destination
-        paddw       xmm5,       [GLOBAL(rd)]
-
-        lea         rdi,        [rdi + rdx]         ; dst_pitch
-        psraw       xmm5,       VP9_FILTER_SHIFT
-
-        paddw       xmm6,       [GLOBAL(rd)]
-        psraw       xmm6,       VP9_FILTER_SHIFT
-
-        packuswb    xmm5,       xmm6
-        lea         rsi,        [rsi + rax]         ; next line
-
-        movdqa      [rdi],      xmm5                ; store the results in the destination
-        lea         rdi,        [rdi + rdx]         ; dst_pitch
-
-        cmp         rdi,        rcx
-
-        jne         .next_row_fp
-
-.done:
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_bilinear_predict8x8_ssse3
-;(
-;    unsigned char  *src_ptr,
-;    int   src_pixels_per_line,
-;    int  xoffset,
-;    int  yoffset,
-;    unsigned char *dst_ptr,
-;    int dst_pitch
-;)
-global sym(vp9_bilinear_predict8x8_ssse3)
-sym(vp9_bilinear_predict8x8_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 144                         ; reserve 144 bytes
-
-        lea         rcx,        [GLOBAL(bilinear_filters_ssse3)]
-
-        mov         rsi,        arg(0) ;src_ptr
-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
-
-    ;Read 9 lines of unaligned data in and put them on the stack. This
-    ;gives a big performance boost.
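-    ;The movdqu loads below are done once into this aligned scratch area
-    ;so that both filter passes can re-read the rows with aligned movdqa,
-    ;stepping through them by advancing rsp.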
-        movdqu      xmm0,       [rsi]
-        lea         rax,        [rdx + rdx*2]
-        movdqu      xmm1,       [rsi+rdx]
-        movdqu      xmm2,       [rsi+rdx*2]
-        add         rsi,        rax
-        movdqu      xmm3,       [rsi]
-        movdqu      xmm4,       [rsi+rdx]
-        movdqu      xmm5,       [rsi+rdx*2]
-        add         rsi,        rax
-        movdqu      xmm6,       [rsi]
-        movdqu      xmm7,       [rsi+rdx]
-
-        movdqa      XMMWORD PTR [rsp],            xmm0
-
-        movdqu      xmm0,       [rsi+rdx*2]
-
-        movdqa      XMMWORD PTR [rsp+16],         xmm1
-        movdqa      XMMWORD PTR [rsp+32],         xmm2
-        movdqa      XMMWORD PTR [rsp+48],         xmm3
-        movdqa      XMMWORD PTR [rsp+64],         xmm4
-        movdqa      XMMWORD PTR [rsp+80],         xmm5
-        movdqa      XMMWORD PTR [rsp+96],         xmm6
-        movdqa      XMMWORD PTR [rsp+112],        xmm7
-        movdqa      XMMWORD PTR [rsp+128],        xmm0
-
-        movsxd      rax,        dword ptr arg(2)    ; xoffset
-        cmp         rax,        0                   ; skip first_pass filter if xoffset=0
-        je          .b8x8_sp_only
-
-        shl         rax,        4
-        add         rax,        rcx                 ; HFilter
-
-        mov         rdi,        arg(4)              ; dst_ptr
-        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
-
-        movdqa      xmm0,       [rax]
-
-        movsxd      rax,        dword ptr arg(3)    ; yoffset
-        cmp         rax,        0                   ; skip second_pass filter if yoffset=0
-        je          .b8x8_fp_only
-
-        shl         rax,        4
-        lea         rax,        [rax + rcx]         ; VFilter
-
-        lea         rcx,        [rdi+rdx*8]
-
-        movdqa      xmm1,       [rax]
-
-        ; get the first horizontal line done
-        movdqa      xmm3,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
-        movdqa      xmm5,       xmm3
-
-        psrldq      xmm5,       1                   ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx
-        lea         rsp,        [rsp + 16]          ; next line
-
-        punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
-        pmaddubsw   xmm3,       xmm0                ; 00 02 04 06 08 10 12 14
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP9_FILTER_SHIFT    ; xmm3 /= 128
-
-        movdqa      xmm7,       xmm3
-        packuswb    xmm7,       xmm7                ; 00 01 02 03 04 05 06 07, duplicated into both halves
-
-.next_row:
-        movdqa      xmm6,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
-        lea         rsp,        [rsp + 16]          ; next line
-
-        movdqa      xmm5,       xmm6
-
-        psrldq      xmm5,       1
-
-        punpcklbw   xmm6,       xmm5
-        pmaddubsw   xmm6,       xmm0
-
-        paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value
-        psraw       xmm6,       VP9_FILTER_SHIFT    ; xmm6 /= 128
-
-        packuswb    xmm6,       xmm6
-
-        punpcklbw   xmm7,       xmm6
-        pmaddubsw   xmm7,       xmm1
-
-        paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value
-        psraw       xmm7,       VP9_FILTER_SHIFT    ; xmm7 /= 128
-
-        packuswb    xmm7,       xmm7
-
-        movq        [rdi],      xmm7                ; store the results in the destination
-        lea         rdi,        [rdi + rdx]
-
-        movdqa      xmm7,       xmm6
-
-        cmp         rdi,        rcx
-        jne         .next_row
-
-        jmp         .done8x8
-
-.b8x8_sp_only:
-        movsxd      rax,        dword ptr arg(3)    ; yoffset
-        shl         rax,        4
-        lea         rax,        [rax + rcx]         ; VFilter
-
-        mov         rdi,        arg(4) ;dst_ptr
-        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
-
-        movdqa      xmm0,       [rax]               ; VFilter
-
-        movq        xmm1,       XMMWORD PTR [rsp]
-        movq        xmm2,       XMMWORD PTR [rsp+16]
-
-        movq        xmm3,       XMMWORD PTR [rsp+32]
-        punpcklbw   xmm1,       xmm2
-
-        movq        xmm4,       XMMWORD PTR [rsp+48]
-        punpcklbw   xmm2,       xmm3
-
-        movq        xmm5,       XMMWORD PTR [rsp+64]
-        punpcklbw   xmm3,       xmm4
-
-        movq        xmm6,       XMMWORD PTR [rsp+80]
-        punpcklbw   xmm4,       xmm5
-
-        movq        xmm7,       XMMWORD PTR [rsp+96]
-        punpcklbw   xmm5,       xmm6
-
-        pmaddubsw   xmm1,       xmm0
-        pmaddubsw   xmm2,       xmm0
-
-        pmaddubsw   xmm3,       xmm0
-        pmaddubsw   xmm4,       xmm0
-
-        pmaddubsw   xmm5,       xmm0
-        punpcklbw   xmm6,       xmm7
-
-        pmaddubsw   xmm6,       xmm0
-        paddw       xmm1,       [GLOBAL(rd)]
-
-        paddw       xmm2,       [GLOBAL(rd)]
-        psraw       xmm1,       VP9_FILTER_SHIFT
-
-        paddw       xmm3,       [GLOBAL(rd)]
-        psraw       xmm2,       VP9_FILTER_SHIFT
-
-        paddw       xmm4,       [GLOBAL(rd)]
-        psraw       xmm3,       VP9_FILTER_SHIFT
-
-        paddw       xmm5,       [GLOBAL(rd)]
-        psraw       xmm4,       VP9_FILTER_SHIFT
-
-        paddw       xmm6,       [GLOBAL(rd)]
-        psraw       xmm5,       VP9_FILTER_SHIFT
-
-        psraw       xmm6,       VP9_FILTER_SHIFT
-        packuswb    xmm1,       xmm1
-
-        packuswb    xmm2,       xmm2
-        movq        [rdi],      xmm1
-
-        packuswb    xmm3,       xmm3
-        movq        [rdi+rdx],  xmm2
-
-        packuswb    xmm4,       xmm4
-        movq        xmm1,       XMMWORD PTR [rsp+112]
-
-        lea         rdi,        [rdi + 2*rdx]
-        movq        xmm2,       XMMWORD PTR [rsp+128]
-
-        packuswb    xmm5,       xmm5
-        movq        [rdi],      xmm3
-
-        packuswb    xmm6,       xmm6
-        movq        [rdi+rdx],  xmm4
-
-        lea         rdi,        [rdi + 2*rdx]
-        punpcklbw   xmm7,       xmm1
-
-        movq        [rdi],      xmm5
-        pmaddubsw   xmm7,       xmm0
-
-        movq        [rdi+rdx],  xmm6
-        punpcklbw   xmm1,       xmm2
-
-        pmaddubsw   xmm1,       xmm0
-        paddw       xmm7,       [GLOBAL(rd)]
-
-        psraw       xmm7,       VP9_FILTER_SHIFT
-        paddw       xmm1,       [GLOBAL(rd)]
-
-        psraw       xmm1,       VP9_FILTER_SHIFT
-        packuswb    xmm7,       xmm7
-
-        packuswb    xmm1,       xmm1
-        lea         rdi,        [rdi + 2*rdx]
-
-        movq        [rdi],      xmm7
-
-        movq        [rdi+rdx],  xmm1
-        lea         rsp,        [rsp + 144]
-
-        jmp         .done8x8
-
-.b8x8_fp_only:
-        lea         rcx,        [rdi+rdx*8]
-
-.next_row_fp:
-        movdqa      xmm1,       XMMWORD PTR [rsp]
-        movdqa      xmm3,       XMMWORD PTR [rsp+16]
-
-        movdqa      xmm2,       xmm1
-        movdqa      xmm5,       XMMWORD PTR [rsp+32]
-
-        psrldq      xmm2,       1
-        movdqa      xmm7,       XMMWORD PTR [rsp+48]
-
-        movdqa      xmm4,       xmm3
-        psrldq      xmm4,       1
-
-        movdqa      xmm6,       xmm5
-        psrldq      xmm6,       1
-
-        punpcklbw   xmm1,       xmm2
-        pmaddubsw   xmm1,       xmm0
-
-        punpcklbw   xmm3,       xmm4
-        pmaddubsw   xmm3,       xmm0
-
-        punpcklbw   xmm5,       xmm6
-        pmaddubsw   xmm5,       xmm0
-
-        movdqa      xmm2,       xmm7
-        psrldq      xmm2,       1
-
-        punpcklbw   xmm7,       xmm2
-        pmaddubsw   xmm7,       xmm0
-
-        paddw       xmm1,       [GLOBAL(rd)]
-        psraw       xmm1,       VP9_FILTER_SHIFT
-
-        paddw       xmm3,       [GLOBAL(rd)]
-        psraw       xmm3,       VP9_FILTER_SHIFT
-
-        paddw       xmm5,       [GLOBAL(rd)]
-        psraw       xmm5,       VP9_FILTER_SHIFT
-
-        paddw       xmm7,       [GLOBAL(rd)]
-        psraw       xmm7,       VP9_FILTER_SHIFT
-
-        packuswb    xmm1,       xmm1
-        packuswb    xmm3,       xmm3
-
-        packuswb    xmm5,       xmm5
-        movq        [rdi],      xmm1
-
-        packuswb    xmm7,       xmm7
-        movq        [rdi+rdx],  xmm3
-
-        lea         rdi,        [rdi + 2*rdx]
-        movq        [rdi],      xmm5
-
-        lea         rsp,        [rsp + 4*16]
-        movq        [rdi+rdx],  xmm7
-
-        lea         rdi,        [rdi + 2*rdx]
-        cmp         rdi,        rcx
-
-        jne         .next_row_fp
-
-        lea         rsp,        [rsp + 16]
-
-.done8x8:
-    ;add rsp, 144
-    pop         rsp
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-shuf1b:
-    db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
-shuf2b:
-    db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11
-shuf3b:
-    db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10
-
-align 16
-shuf2bfrom1:
-    db  4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13
-align 16
-shuf3bfrom1:
-    db  2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11
-
-align 16
-rd:
-    times 8 dw 0x40
-
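-; The six-tap kernels below are stored as interleaved byte pairs (k0,k5),
-; (k1,k3) and (k2,k4) so that pmaddubsw on pixels interleaved as A|F, B|D
-; and C|E yields the three partial sums directly. Reassembled from the
-; data (an inference, not a comment carried over from the source), row 2
-; is {2, -11, 108, 36, -8, 1} and row 4 is {3, -16, 77, 77, -16, 3}; rows
-; whose k0_k5 pair is zero take the 4-tap fast paths selected by the
-; "cmp esi, DWORD PTR [rax] / je" checks in the code above.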
-align 16
-k0_k5:
-    times 8 db 0, 0             ;placeholder
-    times 8 db 0, 0
-    times 8 db 2, 1
-    times 8 db 0, 0
-    times 8 db 3, 3
-    times 8 db 0, 0
-    times 8 db 1, 2
-    times 8 db 0, 0
-k1_k3:
-    times 8 db  0,    0         ;placeholder
-    times 8 db  -6,  12
-    times 8 db -11,  36
-    times 8 db  -9,  50
-    times 8 db -16,  77
-    times 8 db  -6,  93
-    times 8 db  -8, 108
-    times 8 db  -1, 123
-k2_k4:
-    times 8 db 128,    0        ;placeholder
-    times 8 db 123,   -1
-    times 8 db 108,   -8
-    times 8 db  93,   -6
-    times 8 db  77,  -16
-    times 8 db  50,   -9
-    times 8 db  36,  -11
-    times 8 db  12,   -6
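-; Bilinear weight pairs below are (128 - 8*i, 8*i) for i = 0..15; each
-; pair sums to VP9_FILTER_WEIGHT. For example, offset 4 uses (96, 32),
-; giving out = (96*p0 + 32*p1 + 64) >> 7.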
-align 16
-bilinear_filters_ssse3:
-    times 8 db 128, 0
-    times 8 db 120, 8
-    times 8 db 112, 16
-    times 8 db 104, 24
-    times 8 db 96,  32
-    times 8 db 88,  40
-    times 8 db 80,  48
-    times 8 db 72,  56
-    times 8 db 64,  64
-    times 8 db 56,  72
-    times 8 db 48,  80
-    times 8 db 40,  88
-    times 8 db 32,  96
-    times 8 db 24,  104
-    times 8 db 16,  112
-    times 8 db 8,   120
-
--- a/vp8/common/x86/subpixel_x86.h
+++ /dev/null
@@ -1,122 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef SUBPIXEL_X86_H
-#define SUBPIXEL_X86_H
-
-/* Note:
- *
- * This platform is commonly built for runtime CPU detection. If you modify
- * any of the function mappings present in this file, be sure to also update
- * them in the function pointer initialization code.
- */
-
-#if HAVE_MMX
-extern prototype_subpixel_predict(vp9_sixtap_predict16x16_mmx);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x8_mmx);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x4_mmx);
-extern prototype_subpixel_predict(vp9_sixtap_predict4x4_mmx);
-extern prototype_subpixel_predict(vp9_bilinear_predict16x16_mmx);
-extern prototype_subpixel_predict(vp9_bilinear_predict8x8_mmx);
-extern prototype_subpixel_predict(vp9_bilinear_predict8x4_mmx);
-extern prototype_subpixel_predict(vp9_bilinear_predict4x4_mmx);
-
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp9_subpix_sixtap16x16
-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_mmx
-
-#undef  vp9_subpix_sixtap8x8
-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_mmx
-
-#undef  vp9_subpix_sixtap8x4
-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_mmx
-
-#undef  vp9_subpix_sixtap4x4
-#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_mmx
-
-#undef  vp9_subpix_bilinear16x16
-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_mmx
-
-#undef  vp9_subpix_bilinear8x8
-#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_mmx
-
-#undef  vp9_subpix_bilinear8x4
-#define vp9_subpix_bilinear8x4 vp9_bilinear_predict8x4_mmx
-
-#undef  vp9_subpix_bilinear4x4
-#define vp9_subpix_bilinear4x4 vp9_bilinear_predict4x4_mmx
-
-#endif
-#endif
-
-
-#if HAVE_SSE2
-extern prototype_subpixel_predict(vp9_sixtap_predict16x16_sse2);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x8_sse2);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x4_sse2);
-extern prototype_subpixel_predict(vp9_bilinear_predict16x16_sse2);
-extern prototype_subpixel_predict(vp9_bilinear_predict8x8_sse2);
-
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp9_subpix_sixtap16x16
-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_sse2
-
-#undef  vp9_subpix_sixtap8x8
-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_sse2
-
-#undef  vp9_subpix_sixtap8x4
-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_sse2
-
-#undef  vp9_subpix_bilinear16x16
-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_sse2
-
-#undef  vp9_subpix_bilinear8x8
-#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_sse2
-
-#endif
-#endif
-
-#if HAVE_SSSE3
-extern prototype_subpixel_predict(vp9_sixtap_predict16x16_ssse3);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x8_ssse3);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x4_ssse3);
-extern prototype_subpixel_predict(vp9_sixtap_predict4x4_ssse3);
-extern prototype_subpixel_predict(vp9_bilinear_predict16x16_ssse3);
-extern prototype_subpixel_predict(vp9_bilinear_predict8x8_ssse3);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp9_subpix_sixtap16x16
-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_ssse3
-
-#undef  vp9_subpix_sixtap8x8
-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_ssse3
-
-#undef  vp9_subpix_sixtap8x4
-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_ssse3
-
-#undef  vp9_subpix_sixtap4x4
-#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_ssse3
-
-
-#undef  vp9_subpix_bilinear16x16
-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_ssse3
-
-#undef  vp9_subpix_bilinear8x8
-#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_ssse3
-
-#endif
-#endif
-
-
-
-#endif
--- a/vp8/common/x86/vp8_asm_stubs.c
+++ /dev/null
@@ -1,602 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "vpx_ports/mem.h"
-#include "vp8/common/subpixel.h"
-
-extern const short vp9_six_tap_mmx[16][6 * 8];
-
-extern const short vp9_bilinear_filters_8x_mmx[16][2 * 8];
-
-extern void vp9_filter_block1d_h6_mmx(unsigned char   *src_ptr,
-                                      unsigned short  *output_ptr,
-                                      unsigned int     src_pixels_per_line,
-                                      unsigned int     pixel_step,
-                                      unsigned int     output_height,
-                                      unsigned int     output_width,
-                                      const short     *vp9_filter);
-
-extern void vp9_filter_block1dc_v6_mmx(unsigned short *src_ptr,
-                                       unsigned char  *output_ptr,
-                                       int             output_pitch,
-                                       unsigned int    pixels_per_line,
-                                       unsigned int    pixel_step,
-                                       unsigned int    output_height,
-                                       unsigned int    output_width,
-                                       const short    *vp9_filter);
-
-extern void vp9_filter_block1d8_h6_sse2(unsigned char  *src_ptr,
-                                        unsigned short *output_ptr,
-                                        unsigned int    src_pixels_per_line,
-                                        unsigned int    pixel_step,
-                                        unsigned int    output_height,
-                                        unsigned int    output_width,
-                                        const short    *vp9_filter);
-
-extern void vp9_filter_block1d16_h6_sse2(unsigned char  *src_ptr,
-                                         unsigned short *output_ptr,
-                                         unsigned int    src_pixels_per_line,
-                                         unsigned int    pixel_step,
-                                         unsigned int    output_height,
-                                         unsigned int    output_width,
-                                         const short    *vp9_filter);
-
-extern void vp9_filter_block1d8_v6_sse2(unsigned short *src_ptr,
-                                        unsigned char *output_ptr,
-                                        int dst_pitch,
-                                        unsigned int pixels_per_line,
-                                        unsigned int pixel_step,
-                                        unsigned int output_height,
-                                        unsigned int output_width,
-                                        const short    *vp9_filter);
-
-extern void vp9_filter_block1d16_v6_sse2(unsigned short *src_ptr,
-                                         unsigned char *output_ptr,
-                                         int dst_pitch,
-                                         unsigned int pixels_per_line,
-                                         unsigned int pixel_step,
-                                         unsigned int output_height,
-                                         unsigned int output_width,
-                                         const short    *vp9_filter);
-
-extern void vp9_unpack_block1d16_h6_sse2(unsigned char  *src_ptr,
-                                         unsigned short *output_ptr,
-                                         unsigned int    src_pixels_per_line,
-                                         unsigned int    output_height,
-                                         unsigned int    output_width);
-
-extern void vp9_filter_block1d8_h6_only_sse2(unsigned char *src_ptr,
-                                             unsigned int   src_pixels_per_line,
-                                             unsigned char *output_ptr,
-                                             int            dst_pitch,
-                                             unsigned int   output_height,
-                                             const short   *vp9_filter);
-
-extern void vp9_filter_block1d16_h6_only_sse2(unsigned char *src_ptr,
-                                              unsigned int   src_pixels_per_line,
-                                              unsigned char *output_ptr,
-                                              int            dst_pitch,
-                                              unsigned int   output_height,
-                                              const short   *vp9_filter);
-
-extern void vp9_filter_block1d8_v6_only_sse2(unsigned char *src_ptr,
-                                             unsigned int   src_pixels_per_line,
-                                             unsigned char *output_ptr,
-                                             int            dst_pitch,
-                                             unsigned int   output_height,
-                                             const short   *vp9_filter);
-
-extern prototype_subpixel_predict(vp9_bilinear_predict8x8_mmx);
-
-#if HAVE_MMX
-void vp9_sixtap_predict4x4_mmx(unsigned char  *src_ptr,
-                               int  src_pixels_per_line,
-                               int  xoffset,
-                               int  yoffset,
-                               unsigned char *dst_ptr,
-                               int  dst_pitch) {
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict4x4_mmx\n");
-#endif
-  /* Temp data buffer used in filtering */
-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 16 * 16);
-  const short *hfilter, *vfilter;
-  hfilter = vp9_six_tap_mmx[xoffset];
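-  /* First pass (a reading of the calls below, not an original comment):
-   * nine rows, 4 output rows plus 5 rows of context for the 6-tap
-   * vertical pass, are filtered horizontally into the temp buffer with a
-   * pitch of 8, starting two lines above the block; the vertical pass
-   * then reads from temp row 1 (fdata2 + 8). */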
-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), fdata2,
-                            src_pixels_per_line, 1, 9, 8, hfilter);
-  vfilter = vp9_six_tap_mmx[yoffset];
-  vp9_filter_block1dc_v6_mmx(fdata2 + 8, dst_ptr, dst_pitch,
-                             8, 4, 4, 4, vfilter);
-}
-
-void vp9_sixtap_predict16x16_mmx(unsigned char  *src_ptr,
-                                 int  src_pixels_per_line,
-                                 int  xoffset,
-                                 int  yoffset,
-                                 unsigned char *dst_ptr,
-                                 int dst_pitch) {
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict16x16_mmx\n");
-#endif
-  /* Temp data buffer used in filtering */
-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24);
-  const short *hfilter, *vfilter;
-
-  hfilter = vp9_six_tap_mmx[xoffset];
-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
-                            fdata2,   src_pixels_per_line, 1, 21, 32,
-                            hfilter);
-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
-                            fdata2 + 4, src_pixels_per_line, 1, 21, 32,
-                            hfilter);
-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8,
-                            fdata2 + 8, src_pixels_per_line, 1, 21, 32,
-                            hfilter);
-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12,
-                            fdata2 + 12, src_pixels_per_line, 1, 21, 32,
-                            hfilter);
-
-  vfilter = vp9_six_tap_mmx[yoffset];
-  vp9_filter_block1dc_v6_mmx(fdata2 + 32, dst_ptr,      dst_pitch,
-                             32, 16, 16, 16, vfilter);
-  vp9_filter_block1dc_v6_mmx(fdata2 + 36, dst_ptr + 4,  dst_pitch,
-                             32, 16, 16, 16, vfilter);
-  vp9_filter_block1dc_v6_mmx(fdata2 + 40, dst_ptr + 8,  dst_pitch,
-                             32, 16, 16, 16, vfilter);
-  vp9_filter_block1dc_v6_mmx(fdata2 + 44, dst_ptr + 12, dst_pitch,
-                             32, 16, 16, 16, vfilter);
-}
-
-void vp9_sixtap_predict8x8_mmx(unsigned char  *src_ptr,
-                               int  src_pixels_per_line,
-                               int  xoffset,
-                               int  yoffset,
-                               unsigned char *dst_ptr,
-                               int  dst_pitch) {
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict8x8_mmx\n");
-#endif
-  /* Temp data buffer used in filtering */
-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
-  const short *hfilter, *vfilter;
-
-  hfilter = vp9_six_tap_mmx[xoffset];
-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
-                            fdata2,   src_pixels_per_line, 1, 13, 16,
-                            hfilter);
-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
-                            fdata2 + 4, src_pixels_per_line, 1, 13, 16,
-                            hfilter);
-
-  vfilter = vp9_six_tap_mmx[yoffset];
-  vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr,     dst_pitch,
-                             16, 8, 8, 8, vfilter);
-  vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch,
-                             16, 8, 8, 8, vfilter);
-}
-
-void vp9_sixtap_predict8x4_mmx(unsigned char  *src_ptr,
-                               int  src_pixels_per_line,
-                               int  xoffset,
-                               int  yoffset,
-                               unsigned char *dst_ptr,
-                               int  dst_pitch) {
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict8x4_mmx\n");
-#endif
-  /* Temp data buffer used in filtering */
-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
-  const short *hfilter, *vfilter;
-
-  hfilter = vp9_six_tap_mmx[xoffset];
-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
-                            fdata2,   src_pixels_per_line, 1, 9, 16, hfilter);
-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
-                            fdata2 + 4, src_pixels_per_line, 1, 9, 16, hfilter);
-
-  vfilter = vp9_six_tap_mmx[yoffset];
-  vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr,     dst_pitch,
-                             16, 8, 4, 8, vfilter);
-  vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch,
-                             16, 8, 4, 8, vfilter);
-}
-
-void vp9_bilinear_predict16x16_mmx(unsigned char  *src_ptr,
-                                   int  src_pixels_per_line,
-                                   int  xoffset,
-                                   int  yoffset,
-                                   unsigned char *dst_ptr,
-                                   int  dst_pitch) {
-  vp9_bilinear_predict8x8_mmx(src_ptr,
-                              src_pixels_per_line, xoffset, yoffset,
-                              dst_ptr, dst_pitch);
-  vp9_bilinear_predict8x8_mmx(src_ptr + 8,
-                              src_pixels_per_line, xoffset, yoffset,
-                              dst_ptr + 8, dst_pitch);
-  vp9_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line,
-                              src_pixels_per_line, xoffset, yoffset,
-                              dst_ptr + dst_pitch * 8, dst_pitch);
-  vp9_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line + 8,
-                              src_pixels_per_line, xoffset, yoffset,
-                              dst_ptr + dst_pitch * 8 + 8, dst_pitch);
-}
-#endif
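The 16x16 bilinear predictor above simply tiles the 8x8 MMX kernel over the four quadrants, offsetting source and destination by 8 columns and 8 rows. A sketch of the general pattern, assuming any predictor with the same argument order:

static void tile_16x16_from_8x8(unsigned char *src, int src_stride,
                                int xoff, int yoff,
                                unsigned char *dst, int dst_stride,
                                void (*p8x8)(unsigned char *, int, int, int,
                                             unsigned char *, int)) {
  int qx, qy;
  for (qy = 0; qy < 2; qy++)
    for (qx = 0; qx < 2; qx++)
      p8x8(src + qy * 8 * src_stride + qx * 8, src_stride, xoff, yoff,
           dst + qy * 8 * dst_stride + qx * 8, dst_stride);
}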
-
-#if HAVE_SSE2
-void vp9_sixtap_predict16x16_sse2(unsigned char  *src_ptr,
-                                  int  src_pixels_per_line,
-                                  int  xoffset,
-                                  int  yoffset,
-                                  unsigned char *dst_ptr,
-                                  int  dst_pitch) {
-  /* Temp data buffer used in filtering */
-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24);
-  const short *hfilter, *vfilter;
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict16x16_sse2\n");
-#endif
-
-  if (xoffset) {
-    if (yoffset) {
-      hfilter = vp9_six_tap_mmx[xoffset];
-      vp9_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
-                                   src_pixels_per_line, 1, 21, 32, hfilter);
-      vfilter = vp9_six_tap_mmx[yoffset];
-      vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch,
-                                   32, 16, 16, dst_pitch, vfilter);
-    } else {
-      /* First-pass only */
-      hfilter = vp9_six_tap_mmx[xoffset];
-      vp9_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line,
-                                        dst_ptr, dst_pitch, 16, hfilter);
-    }
-  } else {
-    /* Second-pass only */
-    vfilter = vp9_six_tap_mmx[yoffset];
-    vp9_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
-                                 src_pixels_per_line, 21, 32);
-    vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch,
-                                 32, 16, 16, dst_pitch, vfilter);
-  }
-}
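The SSE2 entry points above all share one dispatch: a zero offset selects an identity filter in that direction, so that pass is skipped, and the intermediate buffer only comes into play when both offsets are nonzero. A scalar model of the full two-pass case, illustrative only (libvpx's first pass keeps higher-precision intermediates, while this sketch rounds both passes):

static void sixtap_two_pass_sketch(const unsigned char *src, int src_stride,
                                   unsigned char *dst, int dst_stride,
                                   int w, int h,
                                   const short *hf, const short *vf) {
  int tmp[(16 + 5) * 16];  /* w, h <= 16; h + 5 rows of context */
  int r, c, k;

  /* first pass: horizontal six-tap, starting two rows above the block */
  for (r = 0; r < h + 5; r++)
    for (c = 0; c < w; c++) {
      int sum = 0;
      for (k = 0; k < 6; k++)
        sum += hf[k] * src[(r - 2) * src_stride + (c + k - 2)];
      tmp[r * w + c] = (sum + 64) >> 7;  /* Q7 filters sum to 128 */
    }

  /* second pass: vertical six-tap over the intermediate rows */
  for (r = 0; r < h; r++)
    for (c = 0; c < w; c++) {
      int sum = 0;
      for (k = 0; k < 6; k++)
        sum += vf[k] * tmp[(r + k) * w + c];
      sum = (sum + 64) >> 7;
      dst[r * dst_stride + c] = sum < 0 ? 0 : sum > 255 ? 255 : sum;
    }
}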
-
-void vp9_sixtap_predict8x8_sse2(unsigned char  *src_ptr,
-                                int  src_pixels_per_line,
-                                int  xoffset,
-                                int  yoffset,
-                                unsigned char *dst_ptr,
-                                int  dst_pitch) {
-  /* Temp data buffer used in filtering */
-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
-  const short *hfilter, *vfilter;
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict8x8_sse2\n");
-#endif
-
-  if (xoffset) {
-    if (yoffset) {
-      hfilter = vp9_six_tap_mmx[xoffset];
-      vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
-                                  src_pixels_per_line, 1, 13, 16, hfilter);
-      vfilter = vp9_six_tap_mmx[yoffset];
-      vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch,
-                                  16, 8, 8, dst_pitch, vfilter);
-    } else {
-      /* First-pass only */
-      hfilter = vp9_six_tap_mmx[xoffset];
-      vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line,
-                                       dst_ptr, dst_pitch, 8, hfilter);
-    }
-  } else {
-    /* Second-pass only */
-    vfilter = vp9_six_tap_mmx[yoffset];
-    vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
-                                     src_pixels_per_line,
-                                     dst_ptr, dst_pitch, 8, vfilter);
-  }
-}
-
-void vp9_sixtap_predict8x4_sse2(unsigned char  *src_ptr,
-                                int  src_pixels_per_line,
-                                int  xoffset,
-                                int  yoffset,
-                                unsigned char *dst_ptr,
-                                int  dst_pitch) {
-  /* Temp data buffer used in filtering */
-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
-  const short *hfilter, *vfilter;
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict8x4_sse2\n");
-#endif
-
-  if (xoffset) {
-    if (yoffset) {
-      hfilter = vp9_six_tap_mmx[xoffset];
-      vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
-                                  src_pixels_per_line, 1, 9, 16, hfilter);
-      vfilter = vp9_six_tap_mmx[yoffset];
-      vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch,
-                                  16, 8, 4, dst_pitch, vfilter);
-    } else {
-      /* First-pass only */
-      hfilter = vp9_six_tap_mmx[xoffset];
-      vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line,
-                                       dst_ptr, dst_pitch, 4, hfilter);
-    }
-  } else {
-    /* Second-pass only */
-    vfilter = vp9_six_tap_mmx[yoffset];
-    vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
-                                     src_pixels_per_line,
-                                     dst_ptr, dst_pitch, 4, vfilter);
-  }
-}
-#endif
-
-#if HAVE_SSSE3
-extern void vp9_filter_block1d8_h6_ssse3(unsigned char  *src_ptr,
-                                         unsigned int    src_pixels_per_line,
-                                         unsigned char  *output_ptr,
-                                         unsigned int    output_pitch,
-                                         unsigned int    output_height,
-                                         unsigned int    vp9_filter_index);
-
-extern void vp9_filter_block1d16_h6_ssse3(unsigned char  *src_ptr,
-                                          unsigned int    src_pixels_per_line,
-                                          unsigned char  *output_ptr,
-                                          unsigned int    output_pitch,
-                                          unsigned int    output_height,
-                                          unsigned int    vp9_filter_index);
-
-extern void vp9_filter_block1d16_v6_ssse3(unsigned char *src_ptr,
-                                          unsigned int   src_pitch,
-                                          unsigned char *output_ptr,
-                                          unsigned int   out_pitch,
-                                          unsigned int   output_height,
-                                          unsigned int   vp9_filter_index);
-
-extern void vp9_filter_block1d8_v6_ssse3(unsigned char *src_ptr,
-                                         unsigned int   src_pitch,
-                                         unsigned char *output_ptr,
-                                         unsigned int   out_pitch,
-                                         unsigned int   output_height,
-                                         unsigned int   vp9_filter_index);
-
-extern void vp9_filter_block1d4_h6_ssse3(unsigned char  *src_ptr,
-                                         unsigned int    src_pixels_per_line,
-                                         unsigned char  *output_ptr,
-                                         unsigned int    output_pitch,
-                                         unsigned int    output_height,
-                                         unsigned int    vp9_filter_index);
-
-extern void vp9_filter_block1d4_v6_ssse3(unsigned char *src_ptr,
-                                         unsigned int   src_pitch,
-                                         unsigned char *output_ptr,
-                                         unsigned int   out_pitch,
-                                         unsigned int   output_height,
-                                         unsigned int   vp9_filter_index);
-
-void vp9_sixtap_predict16x16_ssse3(unsigned char  *src_ptr,
-                                   int  src_pixels_per_line,
-                                   int  xoffset,
-                                   int  yoffset,
-                                   unsigned char *dst_ptr,
-                                   int  dst_pitch) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 24 * 24);
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict16x16_ssse3\n");
-#endif
-
-  if (xoffset) {
-    if (yoffset) {
-      vp9_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
-                                    src_pixels_per_line,
-                                    fdata2, 16, 21, xoffset);
-      vp9_filter_block1d16_v6_ssse3(fdata2, 16, dst_ptr, dst_pitch,
-                                    16, yoffset);
-    } else {
-      /* First-pass only */
-      vp9_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line,
-                                    dst_ptr, dst_pitch, 16, xoffset);
-    }
-  } else {
-    /* Second-pass only */
-    vp9_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
-                                  src_pixels_per_line,
-                                  dst_ptr, dst_pitch, 16, yoffset);
-  }
-}
-
-void vp9_sixtap_predict8x8_ssse3(unsigned char  *src_ptr,
-                                 int  src_pixels_per_line,
-                                 int  xoffset,
-                                 int  yoffset,
-                                 unsigned char *dst_ptr,
-                                 int  dst_pitch) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256);
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict8x8_ssse3\n");
-#endif
-
-  if (xoffset) {
-    if (yoffset) {
-      vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
-                                   src_pixels_per_line, fdata2, 8, 13, xoffset);
-      vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 8, yoffset);
-    } else {
-      vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
-                                   dst_ptr, dst_pitch, 8, xoffset);
-    }
-  } else {
-    /* Second-pass only */
-    vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
-                                 src_pixels_per_line,
-                                 dst_ptr, dst_pitch, 8, yoffset);
-  }
-}
-
-void vp9_sixtap_predict8x4_ssse3(unsigned char  *src_ptr,
-                                 int  src_pixels_per_line,
-                                 int  xoffset,
-                                 int  yoffset,
-                                 unsigned char *dst_ptr,
-                                 int  dst_pitch) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256);
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict8x4_ssse3\n");
-#endif
-
-  if (xoffset) {
-    if (yoffset) {
-      vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
-                                   src_pixels_per_line, fdata2, 8, 9, xoffset);
-      vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 4, yoffset);
-    } else {
-      /* First-pass only */
-      vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
-                                   dst_ptr, dst_pitch, 4, xoffset);
-    }
-  } else {
-    /* Second-pass only */
-    vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
-                                 src_pixels_per_line,
-                                 dst_ptr, dst_pitch, 4, yoffset);
-  }
-}
-
-void vp9_sixtap_predict4x4_ssse3(unsigned char  *src_ptr,
-                                 int   src_pixels_per_line,
-                                 int  xoffset,
-                                 int  yoffset,
-                                 unsigned char *dst_ptr,
-                                 int dst_pitch) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 4 * 9);
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict4x4_ssse3\n");
-#endif
-
-  if (xoffset) {
-    if (yoffset) {
-      vp9_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
-                                   src_pixels_per_line, fdata2, 4, 9, xoffset);
-      vp9_filter_block1d4_v6_ssse3(fdata2, 4, dst_ptr, dst_pitch, 4, yoffset);
-    } else {
-      vp9_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line,
-                                   dst_ptr, dst_pitch, 4, xoffset);
-    }
-  } else {
-    vp9_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
-                                 src_pixels_per_line,
-                                 dst_ptr, dst_pitch, 4, yoffset);
-  }
-}
-
-void vp9_filter_block1d16_v8_ssse3(const unsigned char *src_ptr,
-                                   const unsigned int src_pitch,
-                                   unsigned char *output_ptr,
-                                   unsigned int out_pitch,
-                                   unsigned int output_height,
-                                   const short *filter);
-
-void vp9_filter_block1d16_h8_ssse3(const unsigned char *src_ptr,
-                                   const unsigned int src_pitch,
-                                   unsigned char *output_ptr,
-                                   unsigned int out_pitch,
-                                   unsigned int output_height,
-                                   const short *filter);
-
-void vp9_filter_block2d_16x16_8_ssse3(const unsigned char *src_ptr,
-                                      const unsigned int src_stride,
-                                      const short *hfilter_aligned16,
-                                      const short *vfilter_aligned16,
-                                      unsigned char *dst_ptr,
-                                      unsigned int dst_stride) {
-  if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) {
-    DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
-
-    vp9_filter_block1d16_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
-                                  fdata2, 16, 23, hfilter_aligned16);
-    vp9_filter_block1d16_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 16,
-                                  vfilter_aligned16);
-  } else {
-    if (hfilter_aligned16[3] != 128) {
-      vp9_filter_block1d16_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride,
-                                    16, hfilter_aligned16);
-    } else {
-      vp9_filter_block1d16_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
-                                    dst_ptr, dst_stride, 16, vfilter_aligned16);
-    }
-  }
-}
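The hfilter_aligned16[3] != 128 tests above are identity checks: the eight-tap filters are Q7 fixed point and sum to 128, so a center tap of 128 forces every other tap to zero and makes that pass a plain copy, which is why it can be skipped. A one-line scalar equivalent, assuming (as the branches imply) that the center coefficient sits at index 3:

static int is_identity_8tap(const short *filter) {
  return filter[3] == 128;  /* 1.0 in Q7; remaining taps must be zero */
}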
-
-void vp9_filter_block1d8_v8_ssse3(const unsigned char *src_ptr,
-                                  const unsigned int src_pitch,
-                                  unsigned char *output_ptr,
-                                  unsigned int out_pitch,
-                                  unsigned int output_height,
-                                  const short *filter);
-
-void vp9_filter_block1d8_h8_ssse3(const unsigned char *src_ptr,
-                                  const unsigned int src_pitch,
-                                  unsigned char *output_ptr,
-                                  unsigned int out_pitch,
-                                  unsigned int output_height,
-                                  const short *filter);
-
-void vp9_filter_block2d_8x8_8_ssse3(const unsigned char *src_ptr,
-                                    const unsigned int src_stride,
-                                    const short *hfilter_aligned16,
-                                    const short *vfilter_aligned16,
-                                    unsigned char *dst_ptr,
-                                    unsigned int dst_stride) {
-  if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) {
-    DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
-
-    vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
-                                 fdata2, 16, 15, hfilter_aligned16);
-    vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 8,
-                                 vfilter_aligned16);
-  } else {
-    if (hfilter_aligned16[3] != 128) {
-      vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 8,
-                                   hfilter_aligned16);
-    } else {
-      vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
-                                   dst_ptr, dst_stride, 8, vfilter_aligned16);
-    }
-  }
-}
-
-void vp9_filter_block2d_8x4_8_ssse3(const unsigned char *src_ptr,
-                                    const unsigned int src_stride,
-                                    const short *hfilter_aligned16,
-                                    const short *vfilter_aligned16,
-                                    unsigned char *dst_ptr,
-                                    unsigned int dst_stride) {
-  if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) {
-    DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
-
-    vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
-                                 fdata2, 16, 11, hfilter_aligned16);
-    vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 4,
-                                 vfilter_aligned16);
-  } else {
-    if (hfilter_aligned16[3] != 128) {
-      vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 4,
-                                   hfilter_aligned16);
-    } else {
-      vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
-                                   dst_ptr, dst_stride, 4, vfilter_aligned16);
-    }
-  }
-}
-#endif
--- a/vp8/common/x86/x86_systemdependent.c
+++ /dev/null
@@ -1,108 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "vpx_ports/x86.h"
-#include "vp8/common/subpixel.h"
-#include "vp8/common/loopfilter.h"
-#include "vp8/common/idct.h"
-#include "vp8/common/pragmas.h"
-#include "vp8/common/onyxc_int.h"
-
-void vp9_arch_x86_common_init(VP9_COMMON *ctx) {
-#if CONFIG_RUNTIME_CPU_DETECT
-  VP9_COMMON_RTCD *rtcd = &ctx->rtcd;
-  int flags = x86_simd_caps();
-
-  /* Note:
-   *
-   * This platform can be built without runtime CPU detection as well. If
-   * you modify any of the function mappings present in this file, be sure
-   * to also update them in the static mappings (<arch>/filename_<arch>.h)
-   */
-
-  /* Override default functions with fastest ones for this CPU. */
-#if HAVE_MMX
-// The commented-out functions need to be rewritten for vpx.
-  if (flags & HAS_MMX) {
-    rtcd->idct.idct1        = vp9_short_idct4x4llm_1_mmx;
-    rtcd->idct.idct16       = vp9_short_idct4x4llm_mmx;
-    rtcd->idct.idct1_scalar_add = vp9_dc_only_idct_add_mmx;
-    // rtcd->idct.iwalsh16     = vp9_short_inv_walsh4x4_mmx;
-    // rtcd->idct.iwalsh1     = vp9_short_inv_walsh4x4_1_mmx;
-
-    /* Disabled due to unsupported enhanced interpolation/high_prec mv
-    rtcd->subpix.sixtap16x16   = vp9_sixtap_predict16x16_mmx;
-    rtcd->subpix.sixtap8x8     = vp9_sixtap_predict8x8_mmx;
-    rtcd->subpix.sixtap8x4     = vp9_sixtap_predict8x4_mmx;
-    rtcd->subpix.sixtap4x4     = vp9_sixtap_predict4x4_mmx;
-    */
-    rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_mmx;
-    rtcd->subpix.bilinear8x8   = vp9_bilinear_predict8x8_mmx;
-    rtcd->subpix.bilinear8x4   = vp9_bilinear_predict8x4_mmx;
-    rtcd->subpix.bilinear4x4   = vp9_bilinear_predict4x4_mmx;
-
-#if CONFIG_POSTPROC
-    rtcd->postproc.down        = vp9_mbpost_proc_down_mmx;
-    /*rtcd->postproc.across      = vp9_mbpost_proc_across_ip_c;*/
-    rtcd->postproc.downacross  = vp9_post_proc_down_and_across_mmx;
-    rtcd->postproc.addnoise    = vp9_plane_add_noise_mmx;
-#endif
-  }
-
-#endif
-#if HAVE_SSE2
-
-  if (flags & HAS_SSE2) {
-
-
-    // rtcd->idct.iwalsh16     = vp9_short_inv_walsh4x4_sse2;
-
-    /* Disabled due to unsupported enhanced interpolation/high_prec mv
-    rtcd->subpix.sixtap16x16   = vp9_sixtap_predict16x16_sse2;
-    rtcd->subpix.sixtap8x8     = vp9_sixtap_predict8x8_sse2;
-    rtcd->subpix.sixtap8x4     = vp9_sixtap_predict8x4_sse2;
-    */
-    rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_sse2;
-    rtcd->subpix.bilinear8x8   = vp9_bilinear_predict8x8_sse2;
-
-#if CONFIG_POSTPROC
-    rtcd->postproc.down        = vp9_mbpost_proc_down_xmm;
-    rtcd->postproc.across      = vp9_mbpost_proc_across_ip_xmm;
-    rtcd->postproc.downacross  = vp9_post_proc_down_and_across_xmm;
-    rtcd->postproc.addnoise    = vp9_plane_add_noise_wmt;
-#endif
-  }
-
-#endif
-
-#if HAVE_SSSE3
-
-  if (flags & HAS_SSSE3) {
-    /* Disabled due to unsupported enhanced interpolation/high_prec mv
-    rtcd->subpix.sixtap16x16   = vp9_sixtap_predict16x16_ssse3;
-    rtcd->subpix.sixtap8x8     = vp9_sixtap_predict8x8_ssse3;
-    rtcd->subpix.sixtap8x4     = vp9_sixtap_predict8x4_ssse3;
-    rtcd->subpix.sixtap4x4     = vp9_sixtap_predict4x4_ssse3;
-    rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_ssse3;
-    rtcd->subpix.bilinear8x8   = vp9_bilinear_predict8x8_ssse3;
-    */
-
-    /* these are disabled because of unsupported diagonal pred modes
-    rtcd->recon.build_intra_predictors_mbuv =
-      vp9_build_intra_predictors_mbuv_ssse3;
-    rtcd->recon.build_intra_predictors_mbuv_s =
-      vp9_build_intra_predictors_mbuv_s_ssse3;
-      */
-  }
-#endif
-
-#endif
-}
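The file above is the runtime half of the RTCD scheme: portable C defaults are wired up elsewhere, and this init routine overwrites individual function pointers once x86_simd_caps() has been probed. The pattern in miniature, with hypothetical names throughout:

typedef void (*predict_fn)(unsigned char *, int, unsigned char *, int);

/* stand-ins for a C fallback and an MMX specialization */
extern void bilinear8x8_c(unsigned char *, int, unsigned char *, int);
extern void bilinear8x8_mmx(unsigned char *, int, unsigned char *, int);

struct subpix_table { predict_fn bilinear8x8; };

#define SKETCH_HAS_MMX 0x01  /* illustrative; not the real HAS_MMX flag */

static void arch_init_sketch(struct subpix_table *t, int flags) {
  t->bilinear8x8 = bilinear8x8_c;        /* safe portable default */
  if (flags & SKETCH_HAS_MMX)
    t->bilinear8x8 = bilinear8x8_mmx;    /* fastest supported override */
}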
--- a/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm
+++ /dev/null
@@ -1,218 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-    EXPORT |vp8_dequant_dc_idct_add_v6|
-
-    AREA |.text|, CODE, READONLY
-
-;void vp8_dequant_dc_idct_add_v6(short *input, short *dq, unsigned char *pred,
-; unsigned char *dest, int pitch, int stride, int Dc)
-; r0 = input
-; r1 = dq
-; r2 = pred
-; r3 = dest
-; sp + 36 = pitch  ; +4 = 40
-; sp + 40 = stride  ; +4 = 44
-; sp + 44 = Dc  ; +4 = 48
-
-
-|vp8_dequant_dc_idct_add_v6| PROC
-    stmdb   sp!, {r4-r11, lr}
-
-    ldr     r6, [sp, #44]
-
-    ldr     r4, [r0]                ;input
-    ldr     r5, [r1], #4            ;dq
-
-    sub     sp, sp, #4
-    str     r3, [sp]
-
-    smultt  r7, r4, r5
-
-    ldr     r4, [r0, #4]            ;input
-    ldr     r5, [r1], #4            ;dq
-
-    strh    r6, [r0], #2
-    strh    r7, [r0], #2
-
-    smulbb  r6, r4, r5
-    smultt  r7, r4, r5
-
-    ldr     r4, [r0, #4]            ;input
-    ldr     r5, [r1], #4            ;dq
-
-    strh    r6, [r0], #2
-    strh    r7, [r0], #2
-
-    mov     r12, #3
-
-vp8_dequant_dc_add_loop
-    smulbb  r6, r4, r5
-    smultt  r7, r4, r5
-
-    ldr     r4, [r0, #4]            ;input
-    ldr     r5, [r1], #4            ;dq
-
-    strh    r6, [r0], #2
-    strh    r7, [r0], #2
-
-    smulbb  r6, r4, r5
-    smultt  r7, r4, r5
-
-    subs    r12, r12, #1
-
-    ldrne   r4, [r0, #4]
-    ldrne   r5, [r1], #4
-
-    strh    r6, [r0], #2
-    strh    r7, [r0], #2
-
-    bne     vp8_dequant_dc_add_loop
-
-    sub     r0, r0, #32
-    mov     r1, r0
-
-; short_idct4x4llm_v6_dual
-    ldr     r3, cospi8sqrt2minus1
-    ldr     r4, sinpi8sqrt2
-    ldr     r6, [r0, #8]
-    mov     r5, #2
-vp8_dequant_dc_idct_loop1_v6
-    ldr     r12, [r0, #24]
-    ldr     r14, [r0, #16]
-    smulwt  r9, r3, r6
-    smulwb  r7, r3, r6
-    smulwt  r10, r4, r6
-    smulwb  r8, r4, r6
-    pkhbt   r7, r7, r9, lsl #16
-    smulwt  r11, r3, r12
-    pkhbt   r8, r8, r10, lsl #16
-    uadd16  r6, r6, r7
-    smulwt  r7, r4, r12
-    smulwb  r9, r3, r12
-    smulwb  r10, r4, r12
-    subs    r5, r5, #1
-    pkhbt   r9, r9, r11, lsl #16
-    ldr     r11, [r0], #4
-    pkhbt   r10, r10, r7, lsl #16
-    uadd16  r7, r12, r9
-    usub16  r7, r8, r7
-    uadd16  r6, r6, r10
-    uadd16  r10, r11, r14
-    usub16  r8, r11, r14
-    uadd16  r9, r10, r6
-    usub16  r10, r10, r6
-    uadd16  r6, r8, r7
-    usub16  r7, r8, r7
-    str     r6, [r1, #8]
-    ldrne   r6, [r0, #8]
-    str     r7, [r1, #16]
-    str     r10, [r1, #24]
-    str     r9, [r1], #4
-    bne     vp8_dequant_dc_idct_loop1_v6
-
-    mov     r5, #2
-    sub     r0, r1, #8
-vp8_dequant_dc_idct_loop2_v6
-    ldr     r6, [r0], #4
-    ldr     r7, [r0], #4
-    ldr     r8, [r0], #4
-    ldr     r9, [r0], #4
-    smulwt  r1, r3, r6
-    smulwt  r12, r4, r6
-    smulwt  lr, r3, r8
-    smulwt  r10, r4, r8
-    pkhbt   r11, r8, r6, lsl #16
-    pkhbt   r1, lr, r1, lsl #16
-    pkhbt   r12, r10, r12, lsl #16
-    pkhtb   r6, r6, r8, asr #16
-    uadd16  r6, r1, r6
-    pkhbt   lr, r9, r7, lsl #16
-    uadd16  r10, r11, lr
-    usub16  lr, r11, lr
-    pkhtb   r8, r7, r9, asr #16
-    subs    r5, r5, #1
-    smulwt  r1, r3, r8
-    smulwb  r7, r3, r8
-    smulwt  r11, r4, r8
-    smulwb  r9, r4, r8
-    pkhbt   r1, r7, r1, lsl #16
-    uadd16  r8, r1, r8
-    pkhbt   r11, r9, r11, lsl #16
-    usub16  r1, r12, r8
-    uadd16  r8, r11, r6
-    ldr     r9, c0x00040004
-    ldr     r12, [sp, #40]
-    uadd16  r6, r10, r8
-    usub16  r7, r10, r8
-    uadd16  r7, r7, r9
-    uadd16  r6, r6, r9
-    uadd16  r10, r14, r1
-    usub16  r1, r14, r1
-    uadd16  r10, r10, r9
-    uadd16  r1, r1, r9
-    ldr     r11, [r2], r12
-    mov     r8, r7, asr #3
-    pkhtb   r9, r8, r10, asr #19
-    mov     r8, r1, asr #3
-    pkhtb   r8, r8, r6, asr #19
-    uxtb16  lr, r11, ror #8
-    qadd16  r9, r9, lr
-    uxtb16  lr, r11
-    qadd16  r8, r8, lr
-    usat16  r9, #8, r9
-    usat16  r8, #8, r8
-    orr     r9, r8, r9, lsl #8
-    ldr     r11, [r2], r12
-    ldr     lr, [sp]
-    ldr     r12, [sp, #44]
-    mov     r7, r7, lsl #16
-    mov     r1, r1, lsl #16
-    mov     r10, r10, lsl #16
-    mov     r6, r6, lsl #16
-    mov     r7, r7, asr #3
-    pkhtb   r7, r7, r10, asr #19
-    mov     r1, r1, asr #3
-    pkhtb   r1, r1, r6, asr #19
-    uxtb16  r8, r11, ror #8
-    qadd16  r7, r7, r8
-    uxtb16  r8, r11
-    qadd16  r1, r1, r8
-    usat16  r7, #8, r7
-    usat16  r1, #8, r1
-    orr     r1, r1, r7, lsl #8
-    str     r9, [lr], r12
-    str     r1, [lr], r12
-    str     lr, [sp]
-    bne     vp8_dequant_dc_idct_loop2_v6
-
-; vpx_memset
-    sub     r0, r0, #32
-    add     sp, sp, #4
-
-    mov     r12, #0
-    str     r12, [r0]
-    str     r12, [r0, #4]
-    str     r12, [r0, #8]
-    str     r12, [r0, #12]
-    str     r12, [r0, #16]
-    str     r12, [r0, #20]
-    str     r12, [r0, #24]
-    str     r12, [r0, #28]
-
-    ldmia   sp!, {r4 - r11, pc}
-    ENDP    ; |vp8_dequant_dc_idct_add_v6|
-
-; Constant Pool
-cospi8sqrt2minus1 DCD 0x00004E7B
-sinpi8sqrt2       DCD 0x00008A8C
-c0x00040004       DCD 0x00040004
-
-    END
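The constant pool values are Q16 encodings of the 4x4 IDCT rotation factors, sqrt(2)cos(pi/8) - 1 and sqrt(2)sin(pi/8). A quick standalone check of the hex values (illustrative only; assumes a POSIX math.h for M_PI):

#include <math.h>
#include <stdio.h>

int main(void) {
  printf("%#x\n", (int)floor((sqrt(2.0) * cos(M_PI / 8) - 1) * 65536 + 0.5));
  printf("%#x\n", (int)floor(sqrt(2.0) * sin(M_PI / 8) * 65536 + 0.5));
  return 0;  /* prints 0x4e7b and 0x8a8c */
}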
--- a/vp8/decoder/arm/armv6/dequant_idct_v6.asm
+++ /dev/null
@@ -1,196 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-    EXPORT |vp8_dequant_idct_add_v6|
-
-    AREA |.text|, CODE, READONLY
-;void vp8_dequant_idct_add_v6(short *input, short *dq, unsigned char *pred,
-; unsigned char *dest, int pitch, int stride)
-; r0 = input
-; r1 = dq
-; r2 = pred
-; r3 = dest
-; sp + 36 = pitch  ; +4 = 40
-; sp + 40 = stride  ; +4 = 44
-
-
-|vp8_dequant_idct_add_v6| PROC
-    stmdb   sp!, {r4-r11, lr}
-
-    ldr     r4, [r0]                ;input
-    ldr     r5, [r1], #4            ;dq
-
-    sub     sp, sp, #4
-    str     r3, [sp]
-
-    mov     r12, #4
-
-vp8_dequant_add_loop
-    smulbb  r6, r4, r5
-    smultt  r7, r4, r5
-
-    ldr     r4, [r0, #4]            ;input
-    ldr     r5, [r1], #4            ;dq
-
-    strh    r6, [r0], #2
-    strh    r7, [r0], #2
-
-    smulbb  r6, r4, r5
-    smultt  r7, r4, r5
-
-    subs    r12, r12, #1
-
-    ldrne   r4, [r0, #4]
-    ldrne   r5, [r1], #4
-
-    strh    r6, [r0], #2
-    strh    r7, [r0], #2
-
-    bne     vp8_dequant_add_loop
-
-    sub     r0, r0, #32
-    mov     r1, r0
-
-; short_idct4x4llm_v6_dual
-    ldr     r3, cospi8sqrt2minus1
-    ldr     r4, sinpi8sqrt2
-    ldr     r6, [r0, #8]
-    mov     r5, #2
-vp8_dequant_idct_loop1_v6
-    ldr     r12, [r0, #24]
-    ldr     r14, [r0, #16]
-    smulwt  r9, r3, r6
-    smulwb  r7, r3, r6
-    smulwt  r10, r4, r6
-    smulwb  r8, r4, r6
-    pkhbt   r7, r7, r9, lsl #16
-    smulwt  r11, r3, r12
-    pkhbt   r8, r8, r10, lsl #16
-    uadd16  r6, r6, r7
-    smulwt  r7, r4, r12
-    smulwb  r9, r3, r12
-    smulwb  r10, r4, r12
-    subs    r5, r5, #1
-    pkhbt   r9, r9, r11, lsl #16
-    ldr     r11, [r0], #4
-    pkhbt   r10, r10, r7, lsl #16
-    uadd16  r7, r12, r9
-    usub16  r7, r8, r7
-    uadd16  r6, r6, r10
-    uadd16  r10, r11, r14
-    usub16  r8, r11, r14
-    uadd16  r9, r10, r6
-    usub16  r10, r10, r6
-    uadd16  r6, r8, r7
-    usub16  r7, r8, r7
-    str     r6, [r1, #8]
-    ldrne   r6, [r0, #8]
-    str     r7, [r1, #16]
-    str     r10, [r1, #24]
-    str     r9, [r1], #4
-    bne     vp8_dequant_idct_loop1_v6
-
-    mov     r5, #2
-    sub     r0, r1, #8
-vp8_dequant_idct_loop2_v6
-    ldr     r6, [r0], #4
-    ldr     r7, [r0], #4
-    ldr     r8, [r0], #4
-    ldr     r9, [r0], #4
-    smulwt  r1, r3, r6
-    smulwt  r12, r4, r6
-    smulwt  lr, r3, r8
-    smulwt  r10, r4, r8
-    pkhbt   r11, r8, r6, lsl #16
-    pkhbt   r1, lr, r1, lsl #16
-    pkhbt   r12, r10, r12, lsl #16
-    pkhtb   r6, r6, r8, asr #16
-    uadd16  r6, r1, r6
-    pkhbt   lr, r9, r7, lsl #16
-    uadd16  r10, r11, lr
-    usub16  lr, r11, lr
-    pkhtb   r8, r7, r9, asr #16
-    subs    r5, r5, #1
-    smulwt  r1, r3, r8
-    smulwb  r7, r3, r8
-    smulwt  r11, r4, r8
-    smulwb  r9, r4, r8
-    pkhbt   r1, r7, r1, lsl #16
-    uadd16  r8, r1, r8
-    pkhbt   r11, r9, r11, lsl #16
-    usub16  r1, r12, r8
-    uadd16  r8, r11, r6
-    ldr     r9, c0x00040004
-    ldr     r12, [sp, #40]
-    uadd16  r6, r10, r8
-    usub16  r7, r10, r8
-    uadd16  r7, r7, r9
-    uadd16  r6, r6, r9
-    uadd16  r10, r14, r1
-    usub16  r1, r14, r1
-    uadd16  r10, r10, r9
-    uadd16  r1, r1, r9
-    ldr     r11, [r2], r12
-    mov     r8, r7, asr #3
-    pkhtb   r9, r8, r10, asr #19
-    mov     r8, r1, asr #3
-    pkhtb   r8, r8, r6, asr #19
-    uxtb16  lr, r11, ror #8
-    qadd16  r9, r9, lr
-    uxtb16  lr, r11
-    qadd16  r8, r8, lr
-    usat16  r9, #8, r9
-    usat16  r8, #8, r8
-    orr     r9, r8, r9, lsl #8
-    ldr     r11, [r2], r12
-    ldr     lr, [sp]
-    ldr     r12, [sp, #44]
-    mov     r7, r7, lsl #16
-    mov     r1, r1, lsl #16
-    mov     r10, r10, lsl #16
-    mov     r6, r6, lsl #16
-    mov     r7, r7, asr #3
-    pkhtb   r7, r7, r10, asr #19
-    mov     r1, r1, asr #3
-    pkhtb   r1, r1, r6, asr #19
-    uxtb16  r8, r11, ror #8
-    qadd16  r7, r7, r8
-    uxtb16  r8, r11
-    qadd16  r1, r1, r8
-    usat16  r7, #8, r7
-    usat16  r1, #8, r1
-    orr     r1, r1, r7, lsl #8
-    str     r9, [lr], r12
-    str     r1, [lr], r12
-    str     lr, [sp]
-    bne     vp8_dequant_idct_loop2_v6
-
-; vpx_memset
-    sub     r0, r0, #32
-    add     sp, sp, #4
-
-    mov     r12, #0
-    str     r12, [r0]
-    str     r12, [r0, #4]
-    str     r12, [r0, #8]
-    str     r12, [r0, #12]
-    str     r12, [r0, #16]
-    str     r12, [r0, #20]
-    str     r12, [r0, #24]
-    str     r12, [r0, #28]
-
-    ldmia   sp!, {r4 - r11, pc}
-    ENDP    ; |vp8_dequant_idct_add_v6|
-
-; Constant Pool
-cospi8sqrt2minus1 DCD 0x00004E7B
-sinpi8sqrt2       DCD 0x00008A8C
-c0x00040004       DCD 0x00040004
-
-    END
--- a/vp8/decoder/arm/armv6/dequantize_v6.asm
+++ /dev/null
@@ -1,69 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_dequantize_b_loop_v6|
-
-    AREA    |.text|, CODE, READONLY  ; name this block of code
-;-------------------------------
-;void   vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
-; r0    short *Q,
-; r1    short *DQC
-; r2    short *DQ
-|vp8_dequantize_b_loop_v6| PROC
-    stmdb   sp!, {r4-r9, lr}
-
-    ldr     r3, [r0]                ;load Q
-    ldr     r4, [r1]                ;load DQC
-    ldr     r5, [r0, #4]
-    ldr     r6, [r1, #4]
-
-    mov     r12, #2                 ;loop counter
-
-dequant_loop
-    smulbb  r7, r3, r4              ;multiply
-    smultt  r8, r3, r4
-    smulbb  r9, r5, r6
-    smultt  lr, r5, r6
-
-    ldr     r3, [r0, #8]
-    ldr     r4, [r1, #8]
-    ldr     r5, [r0, #12]
-    ldr     r6, [r1, #12]
-
-    strh    r7, [r2], #2            ;store result
-    smulbb  r7, r3, r4              ;multiply
-    strh    r8, [r2], #2
-    smultt  r8, r3, r4
-    strh    r9, [r2], #2
-    smulbb  r9, r5, r6
-    strh    lr, [r2], #2
-    smultt  lr, r5, r6
-
-    subs    r12, r12, #1
-
-    add     r0, r0, #16
-    add     r1, r1, #16
-
-    ldrne       r3, [r0]
-    strh    r7, [r2], #2            ;store result
-    ldrne       r4, [r1]
-    strh    r8, [r2], #2
-    ldrne       r5, [r0, #4]
-    strh    r9, [r2], #2
-    ldrne       r6, [r1, #4]
-    strh    lr, [r2], #2
-
-    bne     dequant_loop
-
-    ldmia   sp!, {r4-r9, pc}
-    ENDP    ;|vp8_dequantize_b_loop_v6|
-
-    END
--- a/vp8/decoder/arm/armv6/idct_blk_v6.c
+++ /dev/null
@@ -1,136 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_ports/config.h"
-#include "vp8/common/idct.h"
-#include "vp8/decoder/dequantize.h"
-
-void vp8_dequant_dc_idct_add_y_block_v6
-(short *q, short *dq, unsigned char *pre,
- unsigned char *dst, int stride, char *eobs, short *dc) {
-  int i;
-
-  for (i = 0; i < 4; i++) {
-    if (eobs[0] > 1)
-      vp8_dequant_dc_idct_add_v6(q, dq, pre, dst, 16, stride, dc[0]);
-    else
-      vp8_dc_only_idct_add_v6(dc[0], pre, dst, 16, stride);
-
-    if (eobs[1] > 1)
-      vp8_dequant_dc_idct_add_v6(q + 16, dq, pre + 4, dst + 4, 16, stride, dc[1]);
-    else
-      vp8_dc_only_idct_add_v6(dc[1], pre + 4, dst + 4, 16, stride);
-
-    if (eobs[2] > 1)
-      vp8_dequant_dc_idct_add_v6(q + 32, dq, pre + 8, dst + 8, 16, stride, dc[2]);
-    else
-      vp8_dc_only_idct_add_v6(dc[2], pre + 8, dst + 8, 16, stride);
-
-    if (eobs[3] > 1)
-      vp8_dequant_dc_idct_add_v6(q + 48, dq, pre + 12, dst + 12, 16, stride, dc[3]);
-    else
-      vp8_dc_only_idct_add_v6(dc[3], pre + 12, dst + 12, 16, stride);
-
-    q    += 64;
-    dc   += 4;
-    pre  += 64;
-    dst  += 4 * stride;
-    eobs += 4;
-  }
-}
-
-void vp8_dequant_idct_add_y_block_v6
-(short *q, short *dq, unsigned char *pre,
- unsigned char *dst, int stride, char *eobs) {
-  int i;
-
-  for (i = 0; i < 4; i++) {
-    if (eobs[0] > 1)
-      vp8_dequant_idct_add_v6(q, dq, pre, dst, 16, stride);
-    else {
-      vp8_dc_only_idct_add_v6(q[0]*dq[0], pre, dst, 16, stride);
-      ((int *)q)[0] = 0;
-    }
-
-    if (eobs[1] > 1)
-      vp8_dequant_idct_add_v6(q + 16, dq, pre + 4, dst + 4, 16, stride);
-    else {
-      vp8_dc_only_idct_add_v6(q[16]*dq[0], pre + 4, dst + 4, 16, stride);
-      ((int *)(q + 16))[0] = 0;
-    }
-
-    if (eobs[2] > 1)
-      vp8_dequant_idct_add_v6(q + 32, dq, pre + 8, dst + 8, 16, stride);
-    else {
-      vp8_dc_only_idct_add_v6(q[32]*dq[0], pre + 8, dst + 8, 16, stride);
-      ((int *)(q + 32))[0] = 0;
-    }
-
-    if (eobs[3] > 1)
-      vp8_dequant_idct_add_v6(q + 48, dq, pre + 12, dst + 12, 16, stride);
-    else {
-      vp8_dc_only_idct_add_v6(q[48]*dq[0], pre + 12, dst + 12, 16, stride);
-      ((int *)(q + 48))[0] = 0;
-    }
-
-    q    += 64;
-    pre  += 64;
-    dst  += 4 * stride;
-    eobs += 4;
-  }
-}
-
-void vp8_dequant_idct_add_uv_block_v6
-(short *q, short *dq, unsigned char *pre,
- unsigned char *dstu, unsigned char *dstv, int stride, char *eobs) {
-  int i;
-
-  for (i = 0; i < 2; i++) {
-    if (eobs[0] > 1)
-      vp8_dequant_idct_add_v6(q, dq, pre, dstu, 8, stride);
-    else {
-      vp8_dc_only_idct_add_v6(q[0]*dq[0], pre, dstu, 8, stride);
-      ((int *)q)[0] = 0;
-    }
-
-    if (eobs[1] > 1)
-      vp8_dequant_idct_add_v6(q + 16, dq, pre + 4, dstu + 4, 8, stride);
-    else {
-      vp8_dc_only_idct_add_v6(q[16]*dq[0], pre + 4, dstu + 4, 8, stride);
-      ((int *)(q + 16))[0] = 0;
-    }
-
-    q    += 32;
-    pre  += 32;
-    dstu += 4 * stride;
-    eobs += 2;
-  }
-
-  for (i = 0; i < 2; i++) {
-    if (eobs[0] > 1)
-      vp8_dequant_idct_add_v6(q, dq, pre, dstv, 8, stride);
-    else {
-      vp8_dc_only_idct_add_v6(q[0]*dq[0], pre, dstv, 8, stride);
-      ((int *)q)[0] = 0;
-    }
-
-    if (eobs[1] > 1)
-      vp8_dequant_idct_add_v6(q + 16, dq, pre + 4, dstv + 4, 8, stride);
-    else {
-      vp8_dc_only_idct_add_v6(q[16]*dq[0], pre + 4, dstv + 4, 8, stride);
-      ((int *)(q + 16))[0] = 0;
-    }
-
-    q    += 32;
-    pre  += 32;
-    dstv += 4 * stride;
-    eobs += 2;
-  }
-}
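Each eobs[i] > 1 branch above exploits zig-zag coefficient order: an end-of-block value of 0 or 1 means at most the DC coefficient is nonzero, so the whole 4x4 inverse transform collapses to adding one rounded constant to the prediction. A scalar model of the DC-only kernel, matching the (dc + 4) >> 3 rounding used by the full transform:

static void dc_only_idct_add_sketch(int dc, const unsigned char *pred,
                                    unsigned char *dst, int pitch,
                                    int stride) {
  const int offset = (dc + 4) >> 3;  /* dc is already dequantized */
  int r, c;
  for (r = 0; r < 4; r++)
    for (c = 0; c < 4; c++) {
      int v = pred[r * pitch + c] + offset;
      dst[r * stride + c] = v < 0 ? 0 : v > 255 ? 255 : v;
    }
}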
--- a/vp8/decoder/arm/dequantize_arm.c
+++ /dev/null
@@ -1,44 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "vp8/decoder/dequantize.h"
-#include "vp8/common/idct.h"
-#include "vpx_mem/vpx_mem.h"
-
-#if HAVE_ARMV7
-extern void vp9_dequantize_b_loop_neon(short *Q, short *DQC, short *DQ);
-#endif
-
-#if HAVE_ARMV6
-extern void vp9_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
-#endif
-
-#if HAVE_ARMV7
-
-void vp9_dequantize_b_neon(BLOCKD *d) {
-  short *DQ  = d->dqcoeff;
-  short *Q   = d->qcoeff;
-  short *DQC = d->dequant;
-
-  vp9_dequantize_b_loop_neon(Q, DQC, DQ);
-}
-#endif
-
-#if HAVE_ARMV6
-void vp9_dequantize_b_v6(BLOCKD *d) {
-  short *DQ  = d->dqcoeff;
-  short *Q   = d->qcoeff;
-  short *DQC = d->dequant;
-
-  vp9_dequantize_b_loop_v6(Q, DQC, DQ);
-}
-#endif
--- a/vp8/decoder/arm/neon/dequant_idct_neon.asm
+++ /dev/null
@@ -1,129 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_dequant_idct_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp8_dequant_idct_add_neon(short *input, short *dq, unsigned char *pred,
-;                               unsigned char *dest, int pitch, int stride)
-; r0    short *input,
-; r1    short *dq,
-; r2    unsigned char *pred
-; r3    unsigned char *dest
-; sp    int pitch
-; sp+4  int stride
-
-|vp8_dequant_idct_add_neon| PROC
-    vld1.16         {q3, q4}, [r0]
-    vld1.16         {q5, q6}, [r1]
-    ldr             r1, [sp]                ; pitch
-    vld1.32         {d14[0]}, [r2], r1
-    vld1.32         {d14[1]}, [r2], r1
-    vld1.32         {d15[0]}, [r2], r1
-    vld1.32         {d15[1]}, [r2]
-
-    ldr             r1, [sp, #4]            ; stride
-
-    adr             r12, cospi8sqrt2minus1  ; pointer to the first constant
-
-    vmul.i16        q1, q3, q5              ;input for short_idct4x4llm_neon
-    vmul.i16        q2, q4, q6
-
-;|short_idct4x4llm_neon| PROC
-    vld1.16         {d0}, [r12]
-    vswp            d3, d4                  ;q2(vp[4] vp[12])
-
-    vqdmulh.s16     q3, q2, d0[2]
-    vqdmulh.s16     q4, q2, d0[0]
-
-    vqadd.s16       d12, d2, d3             ;a1
-    vqsub.s16       d13, d2, d3             ;b1
-
-    vshr.s16        q3, q3, #1
-    vshr.s16        q4, q4, #1
-
-    vqadd.s16       q3, q3, q2
-    vqadd.s16       q4, q4, q2
-
-    vqsub.s16       d10, d6, d9             ;c1
-    vqadd.s16       d11, d7, d8             ;d1
-
-    vqadd.s16       d2, d12, d11
-    vqadd.s16       d3, d13, d10
-    vqsub.s16       d4, d13, d10
-    vqsub.s16       d5, d12, d11
-
-    vtrn.32         d2, d4
-    vtrn.32         d3, d5
-    vtrn.16         d2, d3
-    vtrn.16         d4, d5
-
-; memset(input, 0, 32) -- 32 bytes
-    vmov.i16        q14, #0
-
-    vswp            d3, d4
-    vqdmulh.s16     q3, q2, d0[2]
-    vqdmulh.s16     q4, q2, d0[0]
-
-    vqadd.s16       d12, d2, d3             ;a1
-    vqsub.s16       d13, d2, d3             ;b1
-
-    vmov            q15, q14
-
-    vshr.s16        q3, q3, #1
-    vshr.s16        q4, q4, #1
-
-    vqadd.s16       q3, q3, q2
-    vqadd.s16       q4, q4, q2
-
-    vqsub.s16       d10, d6, d9             ;c1
-    vqadd.s16       d11, d7, d8             ;d1
-
-    vqadd.s16       d2, d12, d11
-    vqadd.s16       d3, d13, d10
-    vqsub.s16       d4, d13, d10
-    vqsub.s16       d5, d12, d11
-
-    vst1.16         {q14, q15}, [r0]
-
-    vrshr.s16       d2, d2, #3
-    vrshr.s16       d3, d3, #3
-    vrshr.s16       d4, d4, #3
-    vrshr.s16       d5, d5, #3
-
-    vtrn.32         d2, d4
-    vtrn.32         d3, d5
-    vtrn.16         d2, d3
-    vtrn.16         d4, d5
-
-    vaddw.u8        q1, q1, d14
-    vaddw.u8        q2, q2, d15
-
-    vqmovun.s16     d0, q1
-    vqmovun.s16     d1, q2
-
-    vst1.32         {d0[0]}, [r3], r1
-    vst1.32         {d0[1]}, [r3], r1
-    vst1.32         {d1[0]}, [r3], r1
-    vst1.32         {d1[1]}, [r3]
-
-    bx             lr
-
-    ENDP           ; |vp8_dequant_idct_add_neon|
-
-; Constant Pool
-cospi8sqrt2minus1 DCD 0x4e7b4e7b
-sinpi8sqrt2       DCD 0x8a8c8a8c
-
-    END
--- a/vp8/decoder/arm/neon/dequantizeb_neon.asm
+++ /dev/null
@@ -1,34 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_dequantize_b_loop_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0    short *Q,
-; r1    short *DQC
-; r2    short *DQ
-|vp8_dequantize_b_loop_neon| PROC
-    vld1.16         {q0, q1}, [r0]
-    vld1.16         {q2, q3}, [r1]
-
-    vmul.i16        q4, q0, q2
-    vmul.i16        q5, q1, q3
-
-    vst1.16         {q4, q5}, [r2]
-
-    bx             lr
-
-    ENDP
-
-    END
--- a/vp8/decoder/arm/neon/idct_blk_neon.c
+++ /dev/null
@@ -1,110 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_ports/config.h"
-#include "vp8/common/idct.h"
-#include "vp8/decoder/dequantize.h"
-
-/* place these declarations here because we don't want to maintain them
- * outside of this scope
- */
-void idct_dequant_dc_full_2x_neon
-(short *input, short *dq, unsigned char *pre, unsigned char *dst,
- int stride, short *dc);
-void idct_dequant_dc_0_2x_neon
-(short *dc, unsigned char *pre, unsigned char *dst, int stride);
-void idct_dequant_full_2x_neon
-(short *q, short *dq, unsigned char *pre, unsigned char *dst,
- int pitch, int stride);
-void idct_dequant_0_2x_neon
-(short *q, short dq, unsigned char *pre, int pitch,
- unsigned char *dst, int stride);
-
-void vp8_dequant_dc_idct_add_y_block_neon
-(short *q, short *dq, unsigned char *pre,
- unsigned char *dst, int stride, char *eobs, short *dc) {
-  int i;
-
-  for (i = 0; i < 4; i++) {
-    if (((short *)eobs)[0] & 0xfefe)
-      idct_dequant_dc_full_2x_neon(q, dq, pre, dst, stride, dc);
-    else
-      idct_dequant_dc_0_2x_neon(dc, pre, dst, stride);
-
-    if (((short *)eobs)[1] & 0xfefe)
-      idct_dequant_dc_full_2x_neon(q + 32, dq, pre + 8, dst + 8, stride, dc + 2);
-    else
-      idct_dequant_dc_0_2x_neon(dc + 2, pre + 8, dst + 8, stride);
-
-    q    += 64;
-    dc   += 4;
-    pre  += 64;
-    dst  += 4 * stride;
-    eobs += 4;
-  }
-}
-
-void vp8_dequant_idct_add_y_block_neon
-(short *q, short *dq, unsigned char *pre,
- unsigned char *dst, int stride, char *eobs) {
-  int i;
-
-  for (i = 0; i < 4; i++) {
-    if (((short *)eobs)[0] & 0xfefe)
-      idct_dequant_full_2x_neon(q, dq, pre, dst, 16, stride);
-    else
-      idct_dequant_0_2x_neon(q, dq[0], pre, 16, dst, stride);
-
-    if (((short *)eobs)[1] & 0xfefe)
-      idct_dequant_full_2x_neon(q + 32, dq, pre + 8, dst + 8, 16, stride);
-    else
-      idct_dequant_0_2x_neon(q + 32, dq[0], pre + 8, 16, dst + 8, stride);
-
-    q    += 64;
-    pre  += 64;
-    dst  += 4 * stride;
-    eobs += 4;
-  }
-}
-
-void vp8_dequant_idct_add_uv_block_neon
-(short *q, short *dq, unsigned char *pre,
- unsigned char *dstu, unsigned char *dstv, int stride, char *eobs) {
-  if (((short *)eobs)[0] & 0xfefe)
-    idct_dequant_full_2x_neon(q, dq, pre, dstu, 8, stride);
-  else
-    idct_dequant_0_2x_neon(q, dq[0], pre, 8, dstu, stride);
-
-  q    += 32;
-  pre  += 32;
-  dstu += 4 * stride;
-
-  if (((short *)eobs)[1] & 0xfefe)
-    idct_dequant_full_2x_neon(q, dq, pre, dstu, 8, stride);
-  else
-    idct_dequant_0_2x_neon(q, dq[0], pre, 8, dstu, stride);
-
-  q += 32;
-  pre += 32;
-
-  if (((short *)eobs)[2] & 0xfefe)
-    idct_dequant_full_2x_neon(q, dq, pre, dstv, 8, stride);
-  else
-    idct_dequant_0_2x_neon(q, dq[0], pre, 8, dstv, stride);
-
-  q    += 32;
-  pre  += 32;
-  dstv += 4 * stride;
-
-  if (((short *)eobs)[3] & 0xfefe)
-    idct_dequant_full_2x_neon(q, dq, pre, dstv, 8, stride);
-  else
-    idct_dequant_0_2x_neon(q, dq[0], pre, 8, dstv, stride);
-}
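The ((short *)eobs)[i] & 0xfefe tests above fold two per-block checks into one 16-bit load covering a pair of adjacent 4x4 blocks: clearing bit 0 of each byte leaves the value nonzero exactly when either block's eob exceeds 1. The trick is endian-safe because the mask is symmetric, though it does assume the eobs array is 2-byte aligned. A scalar equivalent of the test:

static int pair_needs_full_idct(const char *eobs) {
  return eobs[0] > 1 || eobs[1] > 1;  /* same as (pair & 0xfefe) != 0 */
}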
--- a/vp8/decoder/arm/neon/idct_dequant_0_2x_neon.asm
+++ /dev/null
@@ -1,79 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-    EXPORT  |idct_dequant_0_2x_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void idct_dequant_0_2x_neon(short *q, short dq, unsigned char *pre,
-;                            int pitch, unsigned char *dst, int stride);
-; r0   *q
-; r1   dq
-; r2   *pre
-; r3   pitch
-; sp   *dst
-; sp+4 stride
-|idct_dequant_0_2x_neon| PROC
-    add             r12, r2, #4
-    vld1.32         {d2[0]}, [r2], r3
-    vld1.32         {d2[1]}, [r2], r3
-    vld1.32         {d4[0]}, [r2], r3
-    vld1.32         {d4[1]}, [r2]
-    vld1.32         {d8[0]}, [r12], r3
-    vld1.32         {d8[1]}, [r12], r3
-    vld1.32         {d10[0]}, [r12], r3
-    vld1.32         {d10[1]}, [r12]
-
-    ldrh            r12, [r0]               ; lo q
-    ldrh            r2, [r0, #32]           ; hi q
-    mov             r3, #0
-    strh            r3, [r0]
-    strh            r3, [r0, #32]
-
-    sxth            r12, r12                ; lo
-    mul             r0, r12, r1
-    add             r0, r0, #4
-    asr             r0, r0, #3
-    vdup.16         q0, r0
-    sxth            r2, r2                  ; hi
-    mul             r0, r2, r1
-    add             r0, r0, #4
-    asr             r0, r0, #3
-    vdup.16         q3, r0
-
-    vaddw.u8        q1, q0, d2              ; lo
-    vaddw.u8        q2, q0, d4
-    vaddw.u8        q4, q3, d8              ; hi
-    vaddw.u8        q5, q3, d10
-
-    ldr             r2, [sp]                ; dst
-    ldr             r3, [sp, #4]            ; stride
-
-    vqmovun.s16     d2, q1                  ; lo
-    vqmovun.s16     d4, q2
-    vqmovun.s16     d8, q4                  ; hi
-    vqmovun.s16     d10, q5
-
-    add             r0, r2, #4
-    vst1.32         {d2[0]}, [r2], r3       ; lo
-    vst1.32         {d2[1]}, [r2], r3
-    vst1.32         {d4[0]}, [r2], r3
-    vst1.32         {d4[1]}, [r2]
-    vst1.32         {d8[0]}, [r0], r3       ; hi
-    vst1.32         {d8[1]}, [r0], r3
-    vst1.32         {d10[0]}, [r0], r3
-    vst1.32         {d10[1]}, [r0]
-
-    bx             lr
-
-    ENDP           ; |idct_dequant_0_2x_neon|
-    END
--- a/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm
+++ /dev/null
@@ -1,69 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-    EXPORT  |idct_dequant_dc_0_2x_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void idct_dequant_dc_0_2x_neon(short *dc, unsigned char *pre,
-;                               unsigned char *dst, int stride);
-; r0  *dc
-; r1  *pre
-; r2  *dst
-; r3  stride
-|idct_dequant_dc_0_2x_neon| PROC
-    ldr             r0, [r0]                ; *dc
-    mov             r12, #16
-
-    vld1.32         {d2[0]}, [r1], r12      ; lo
-    vld1.32         {d2[1]}, [r1], r12
-    vld1.32         {d4[0]}, [r1], r12
-    vld1.32         {d4[1]}, [r1]
-    sub             r1, r1, #44
-    vld1.32         {d8[0]}, [r1], r12      ; hi
-    vld1.32         {d8[1]}, [r1], r12
-    vld1.32         {d10[0]}, [r1], r12
-    vld1.32         {d10[1]}, [r1]
-
-    sxth            r1, r0                  ; lo *dc
-    add             r1, r1, #4
-    asr             r1, r1, #3
-    vdup.16         q0, r1
-    sxth            r0, r0, ror #16         ; hi *dc
-    add             r0, r0, #4
-    asr             r0, r0, #3
-    vdup.16         q3, r0
-
-    vaddw.u8        q1, q0, d2              ; lo
-    vaddw.u8        q2, q0, d4
-    vaddw.u8        q4, q3, d8              ; hi
-    vaddw.u8        q5, q3, d10
-
-    vqmovun.s16     d2, q1                  ; lo
-    vqmovun.s16     d4, q2
-    vqmovun.s16     d8, q4                  ; hi
-    vqmovun.s16     d10, q5
-
-    add             r0, r2, #4
-    vst1.32         {d2[0]}, [r2], r3       ; lo
-    vst1.32         {d2[1]}, [r2], r3
-    vst1.32         {d4[0]}, [r2], r3
-    vst1.32         {d4[1]}, [r2]
-    vst1.32         {d8[0]}, [r0], r3       ; hi
-    vst1.32         {d8[1]}, [r0], r3
-    vst1.32         {d10[0]}, [r0], r3
-    vst1.32         {d10[1]}, [r0]
-
-    bx             lr
-
-    ENDP           ;|idct_dequant_dc_0_2x_neon|
-    END
--- a/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm
+++ /dev/null
@@ -1,205 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |idct_dequant_dc_full_2x_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void idct_dequant_dc_full_2x_neon(short *q, short *dq, unsigned char *pre,
-;                                  unsigned char *dst, int stride, short *dc);
-; r0    *q,
-; r1    *dq,
-; r2    *pre
-; r3    *dst
-; sp    stride
-; sp+4  *dc
-|idct_dequant_dc_full_2x_neon| PROC
-    vld1.16         {q0, q1}, [r1]          ; dq (same l/r)
-    vld1.16         {q2, q3}, [r0]          ; l q
-    mov             r1, #16                 ; pitch
-    add             r0, r0, #32
-    vld1.16         {q4, q5}, [r0]          ; r q
-    add             r12, r2, #4
-    ; interleave the predictors
-    vld1.32         {d28[0]}, [r2], r1      ; l pre
-    vld1.32         {d28[1]}, [r12], r1     ; r pre
-    vld1.32         {d29[0]}, [r2], r1
-    vld1.32         {d29[1]}, [r12], r1
-    vld1.32         {d30[0]}, [r2], r1
-    vld1.32         {d30[1]}, [r12], r1
-    vld1.32         {d31[0]}, [r2]
-    ldr             r1, [sp, #4]
-    vld1.32         {d31[1]}, [r12]
-
-    adr             r2, cospi8sqrt2minus1   ; pointer to the first constant
-
-    ldrh            r12, [r1], #2           ; lo *dc
-    ldrh            r1, [r1]                ; hi *dc
-
-    ; dequant: q[i] = q[i] * dq[i]
-    vmul.i16        q2, q2, q0
-    vmul.i16        q3, q3, q1
-    vmul.i16        q4, q4, q0
-    vmul.i16        q5, q5, q1
-
-    ; move dc up to neon and overwrite first element
-    vmov.16         d4[0], r12
-    vmov.16         d8[0], r1
-
-    vld1.16         {d0}, [r2]
-
-    ; q2: l0r0  q3: l8r8
-    ; q4: l4r4  q5: l12r12
-    vswp            d5, d8
-    vswp            d7, d10
-
-    ; _CONSTANTS_ * 4,12 >> 16
-    ; q6:  4 * sinpi : c1/temp1
-    ; q7: 12 * sinpi : d1/temp2
-    ; q8:  4 * cospi
-    ; q9: 12 * cospi
-    vqdmulh.s16     q6, q4, d0[2]           ; sinpi8sqrt2
-    vqdmulh.s16     q7, q5, d0[2]
-    vqdmulh.s16     q8, q4, d0[0]           ; cospi8sqrt2minus1
-    vqdmulh.s16     q9, q5, d0[0]
-
-    vqadd.s16       q10, q2, q3             ; a1 = 0 + 8
-    vqsub.s16       q11, q2, q3             ; b1 = 0 - 8
-
-    ; vqdmulh only accepts signed values. this was a problem because our
-    ; constant had the high bit set, and so was treated as a negative value.
-    ; vqdmulh also doubles the value before it shifts by 16; we need to
-    ; compensate for this. in the case of sinpi8sqrt2 the lowest bit is 0,
-    ; so we can pre-shift the constant without losing precision. this avoids
-    ; having to shift again afterward, and also avoids the sign issue. win win!
-    ; for cospi8sqrt2minus1 the lowest bit is 1, so we would lose precision
-    ; if we pre-shifted it
-    vshr.s16        q8, q8, #1
-    vshr.s16        q9, q9, #1
-
-    ; q4:  4 +  4 * cospi : d1/temp1
-    ; q5: 12 + 12 * cospi : c1/temp2
-    vqadd.s16       q4, q4, q8
-    vqadd.s16       q5, q5, q9
-
-    ; c1 = temp1 - temp2
-    ; d1 = temp1 + temp2
-    vqsub.s16       q2, q6, q5
-    vqadd.s16       q3, q4, q7
-
-    ; [0]: a1+d1
-    ; [1]: b1+c1
-    ; [2]: b1-c1
-    ; [3]: a1-d1
-    vqadd.s16       q4, q10, q3
-    vqadd.s16       q5, q11, q2
-    vqsub.s16       q6, q11, q2
-    vqsub.s16       q7, q10, q3
-
-    ; rotate
-    vtrn.32         q4, q6
-    vtrn.32         q5, q7
-    vtrn.16         q4, q5
-    vtrn.16         q6, q7
-    ; idct loop 2
-    ; q4: l 0, 4, 8,12 r 0, 4, 8,12
-    ; q5: l 1, 5, 9,13 r 1, 5, 9,13
-    ; q6: l 2, 6,10,14 r 2, 6,10,14
-    ; q7: l 3, 7,11,15 r 3, 7,11,15
-
-    ; q8:  1 * sinpi : c1/temp1
-    ; q9:  3 * sinpi : d1/temp2
-    ; q10: 1 * cospi
-    ; q11: 3 * cospi
-    vqdmulh.s16     q8, q5, d0[2]           ; sinpi8sqrt2
-    vqdmulh.s16     q9, q7, d0[2]
-    vqdmulh.s16     q10, q5, d0[0]          ; cospi8sqrt2minus1
-    vqdmulh.s16     q11, q7, d0[0]
-
-    vqadd.s16       q2, q4, q6             ; a1 = 0 + 2
-    vqsub.s16       q3, q4, q6             ; b1 = 0 - 2
-
-    ; see note on shifting above
-    vshr.s16        q10, q10, #1
-    vshr.s16        q11, q11, #1
-
-    ; q10: 1 + 1 * cospi : d1/temp1
-    ; q11: 3 + 3 * cospi : c1/temp2
-    vqadd.s16       q10, q5, q10
-    vqadd.s16       q11, q7, q11
-
-    ; q8: c1 = temp1 - temp2
-    ; q9: d1 = temp1 + temp2
-    vqsub.s16       q8, q8, q11
-    vqadd.s16       q9, q10, q9
-
-    ; a1+d1
-    ; b1+c1
-    ; b1-c1
-    ; a1-d1
-    vqadd.s16       q4, q2, q9
-    vqadd.s16       q5, q3, q8
-    vqsub.s16       q6, q3, q8
-    vqsub.s16       q7, q2, q9
-
-    ; +4 >> 3 (rounding)
-    vrshr.s16       q4, q4, #3              ; lo
-    vrshr.s16       q5, q5, #3
-    vrshr.s16       q6, q6, #3              ; hi
-    vrshr.s16       q7, q7, #3
-
-    vtrn.32         q4, q6
-    vtrn.32         q5, q7
-    vtrn.16         q4, q5
-    vtrn.16         q6, q7
-
-    ; adding pre
-    ; input is still packed. pre was read interleaved
-    vaddw.u8        q4, q4, d28
-    vaddw.u8        q5, q5, d29
-    vaddw.u8        q6, q6, d30
-    vaddw.u8        q7, q7, d31
-
-    vmov.i16        q14, #0
-    vmov            q15, q14
-    vst1.16         {q14, q15}, [r0]        ; write over high input
-    sub             r0, r0, #32
-    vst1.16         {q14, q15}, [r0]        ; write over low input
-
-    ;saturate and narrow
-    vqmovun.s16     d0, q4                  ; lo
-    vqmovun.s16     d1, q5
-    vqmovun.s16     d2, q6                  ; hi
-    vqmovun.s16     d3, q7
-
-    ldr             r1, [sp]                ; stride
-    add             r2, r3, #4              ; hi
-    vst1.32         {d0[0]}, [r3], r1       ; lo
-    vst1.32         {d0[1]}, [r2], r1       ; hi
-    vst1.32         {d1[0]}, [r3], r1
-    vst1.32         {d1[1]}, [r2], r1
-    vst1.32         {d2[0]}, [r3], r1
-    vst1.32         {d2[1]}, [r2], r1
-    vst1.32         {d3[0]}, [r3]
-    vst1.32         {d3[1]}, [r2]
-
-    bx             lr
-
-    ENDP           ; |idct_dequant_dc_full_2x_neon|
-
-; Constant Pool
-cospi8sqrt2minus1 DCD 0x4e7b
-; because the lowest bit in 0x8a8c is 0, we can pre-shift this
-sinpi8sqrt2       DCD 0x4546
-
-    END
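The pre-shift comments above are easiest to check in scalar form. Below is a
rough C model of vqdmulh.s16 and of the two constant treatments (a sketch,
not part of this patch):

    #include <stdint.h>

    /* vqdmulh.s16 computes saturate((a * b * 2) >> 16). */
    static int16_t vqdmulh_s16(int16_t a, int16_t b) {
      int32_t p = ((int32_t)a * b) >> 15;
      return (int16_t)(p > 32767 ? 32767 : p);
    }

    /* sinpi8sqrt2 = 0x8a8c would read as negative in s16, so the pool
       stores 0x4546 = 0x8a8c >> 1 (lossless, the low bit is 0); the
       implicit doubling in vqdmulh restores the full constant. */
    static int16_t mul_sinpi(int16_t a) {
      return vqdmulh_s16(a, 0x4546);            /* == (a * 0x8a8c) >> 16 */
    }

    /* cospi8sqrt2minus1 = 0x4e7b has its low bit set and cannot be
       pre-shifted losslessly; the doubled product is halved afterwards,
       matching the vshr.s16 #1 in the code above. */
    static int16_t mul_cospi_minus1(int16_t a) {
      return (int16_t)(vqdmulh_s16(a, 0x4e7b) >> 1);
    }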
--- a/vp8/decoder/arm/neon/idct_dequant_full_2x_neon.asm
+++ /dev/null
@@ -1,197 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |idct_dequant_full_2x_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void idct_dequant_full_2x_neon(short *q, short *dq, unsigned char *pre,
-;                               unsigned char *dst, int pitch, int stride);
-; r0    *q,
-; r1    *dq,
-; r2    *pre
-; r3    *dst
-; sp    pitch
-; sp+4  stride
-|idct_dequant_full_2x_neon| PROC
-    vld1.16         {q0, q1}, [r1]          ; dq (same l/r)
-    vld1.16         {q2, q3}, [r0]          ; l q
-    ldr             r1, [sp]                ; pitch
-    add             r0, r0, #32
-    vld1.16         {q4, q5}, [r0]          ; r q
-    add             r12, r2, #4
-    ; interleave the predictors
-    vld1.32         {d28[0]}, [r2], r1      ; l pre
-    vld1.32         {d28[1]}, [r12], r1     ; r pre
-    vld1.32         {d29[0]}, [r2], r1
-    vld1.32         {d29[1]}, [r12], r1
-    vld1.32         {d30[0]}, [r2], r1
-    vld1.32         {d30[1]}, [r12], r1
-    vld1.32         {d31[0]}, [r2]
-    vld1.32         {d31[1]}, [r12]
-
-    adr             r2, cospi8sqrt2minus1   ; pointer to the first constant
-
-    ; dequant: q[i] = q[i] * dq[i]
-    vmul.i16        q2, q2, q0
-    vmul.i16        q3, q3, q1
-    vmul.i16        q4, q4, q0
-    vmul.i16        q5, q5, q1
-
-    vld1.16         {d0}, [r2]
-
-    ; q2: l0r0  q3: l8r8
-    ; q4: l4r4  q5: l12r12
-    vswp            d5, d8
-    vswp            d7, d10
-
-    ; _CONSTANTS_ * 4,12 >> 16
-    ; q6:  4 * sinpi : c1/temp1
-    ; q7: 12 * sinpi : d1/temp2
-    ; q8:  4 * cospi
-    ; q9: 12 * cospi
-    vqdmulh.s16     q6, q4, d0[2]           ; sinpi8sqrt2
-    vqdmulh.s16     q7, q5, d0[2]
-    vqdmulh.s16     q8, q4, d0[0]           ; cospi8sqrt2minus1
-    vqdmulh.s16     q9, q5, d0[0]
-
-    vqadd.s16       q10, q2, q3             ; a1 = 0 + 8
-    vqsub.s16       q11, q2, q3             ; b1 = 0 - 8
-
-    ; vqdmulh only accepts signed values. this was a problem because
-    ; our constant had the high bit set, and was treated as a negative value.
-    ; vqdmulh also doubles the value before it shifts by 16. we need to
-    ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,
-    ; so we can shift the constant without losing precision. this avoids
-; shifting again afterward, and also avoids the sign issue. win win!
-    ; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we
-    ; pre-shift it
-    vshr.s16        q8, q8, #1
-    vshr.s16        q9, q9, #1
-
-    ; q4:  4 +  4 * cospi : d1/temp1
-    ; q5: 12 + 12 * cospi : c1/temp2
-    vqadd.s16       q4, q4, q8
-    vqadd.s16       q5, q5, q9
-
-    ; c1 = temp1 - temp2
-    ; d1 = temp1 + temp2
-    vqsub.s16       q2, q6, q5
-    vqadd.s16       q3, q4, q7
-
-    ; [0]: a1+d1
-    ; [1]: b1+c1
-    ; [2]: b1-c1
-    ; [3]: a1-d1
-    vqadd.s16       q4, q10, q3
-    vqadd.s16       q5, q11, q2
-    vqsub.s16       q6, q11, q2
-    vqsub.s16       q7, q10, q3
-
-    ; rotate
-    vtrn.32         q4, q6
-    vtrn.32         q5, q7
-    vtrn.16         q4, q5
-    vtrn.16         q6, q7
-    ; idct loop 2
-    ; q4: l 0, 4, 8,12 r 0, 4, 8,12
-    ; q5: l 1, 5, 9,13 r 1, 5, 9,13
-    ; q6: l 2, 6,10,14 r 2, 6,10,14
-    ; q7: l 3, 7,11,15 r 3, 7,11,15
-
-    ; q8:  1 * sinpi : c1/temp1
-    ; q9:  3 * sinpi : d1/temp2
-    ; q10: 1 * cospi
-    ; q11: 3 * cospi
-    vqdmulh.s16     q8, q5, d0[2]           ; sinpi8sqrt2
-    vqdmulh.s16     q9, q7, d0[2]
-    vqdmulh.s16     q10, q5, d0[0]          ; cospi8sqrt2minus1
-    vqdmulh.s16     q11, q7, d0[0]
-
-    vqadd.s16       q2, q4, q6             ; a1 = 0 + 2
-    vqsub.s16       q3, q4, q6             ; b1 = 0 - 2
-
-    ; see note on shifting above
-    vshr.s16        q10, q10, #1
-    vshr.s16        q11, q11, #1
-
-    ; q10: 1 + 1 * cospi : d1/temp1
-    ; q11: 3 + 3 * cospi : c1/temp2
-    vqadd.s16       q10, q5, q10
-    vqadd.s16       q11, q7, q11
-
-    ; q8: c1 = temp1 - temp2
-    ; q9: d1 = temp1 + temp2
-    vqsub.s16       q8, q8, q11
-    vqadd.s16       q9, q10, q9
-
-    ; a1+d1
-    ; b1+c1
-    ; b1-c1
-    ; a1-d1
-    vqadd.s16       q4, q2, q9
-    vqadd.s16       q5, q3, q8
-    vqsub.s16       q6, q3, q8
-    vqsub.s16       q7, q2, q9
-
-    ; +4 >> 3 (rounding)
-    vrshr.s16       q4, q4, #3              ; lo
-    vrshr.s16       q5, q5, #3
-    vrshr.s16       q6, q6, #3              ; hi
-    vrshr.s16       q7, q7, #3
-
-    vtrn.32         q4, q6
-    vtrn.32         q5, q7
-    vtrn.16         q4, q5
-    vtrn.16         q6, q7
-
-    ; adding pre
-    ; input is still packed. pre was read interleaved
-    vaddw.u8        q4, q4, d28
-    vaddw.u8        q5, q5, d29
-    vaddw.u8        q6, q6, d30
-    vaddw.u8        q7, q7, d31
-
-    vmov.i16        q14, #0
-    vmov            q15, q14
-    vst1.16         {q14, q15}, [r0]        ; write over high input
-    sub             r0, r0, #32
-    vst1.16         {q14, q15}, [r0]        ; write over low input
-
-    ;saturate and narrow
-    vqmovun.s16     d0, q4                  ; lo
-    vqmovun.s16     d1, q5
-    vqmovun.s16     d2, q6                  ; hi
-    vqmovun.s16     d3, q7
-
-    ldr             r1, [sp, #4]            ; stride
-    add             r2, r3, #4              ; hi
-    vst1.32         {d0[0]}, [r3], r1       ; lo
-    vst1.32         {d0[1]}, [r2], r1       ; hi
-    vst1.32         {d1[0]}, [r3], r1
-    vst1.32         {d1[1]}, [r2], r1
-    vst1.32         {d2[0]}, [r3], r1
-    vst1.32         {d2[1]}, [r2], r1
-    vst1.32         {d3[0]}, [r3]
-    vst1.32         {d3[1]}, [r2]
-
-    bx             lr
-
-    ENDP           ; |idct_dequant_full_2x_neon|
-
-; Constant Pool
-cospi8sqrt2minus1 DCD 0x4e7b
-; because the lowest bit in 0x8a8c is 0, we can pre-shift this
-sinpi8sqrt2       DCD 0x4546
-
-    END
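Both NEON routines vectorize the same 4-point transform that the a1/b1/c1/d1
comments trace. A scalar sketch of one column pass, with the pool constants
written out in Q16 form (illustrative only, not part of this patch):

    /* ip/op hold a 4x4 block of shorts in raster order; one column pass. */
    static void idct4_col(const short *ip, short *op) {
      int a1 = ip[0] + ip[8];                      /* a1 = 0 + 8          */
      int b1 = ip[0] - ip[8];                      /* b1 = 0 - 8          */
      int t1 = (ip[4] * 35468) >> 16;              /*  4 * sinpi          */
      int t2 = ip[12] + ((ip[12] * 20091) >> 16);  /* 12 + 12 * cospi     */
      int c1 = t1 - t2;
      int t3 = ip[4] + ((ip[4] * 20091) >> 16);    /*  4 +  4 * cospi     */
      int t4 = (ip[12] * 35468) >> 16;             /* 12 * sinpi          */
      int d1 = t3 + t4;
      op[0]  = (short)(a1 + d1);
      op[4]  = (short)(b1 + c1);
      op[8]  = (short)(b1 - c1);
      op[12] = (short)(a1 - d1);
    }

The assembly runs this twice (columns, then rows, with the vtrn transposes in
between) and folds the final (x + 4) >> 3 rounding into vrshr.s16 #3.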
--- a/vp8/decoder/asm_dec_offsets.c
+++ /dev/null
@@ -1,39 +1,0 @@
-/*
- *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/asm_offsets.h"
-#include "onyxd_int.h"
-
-BEGIN
-
-DEFINE(detok_scan,                              offsetof(DETOK, scan));
-DEFINE(detok_ptr_block2leftabove,               offsetof(DETOK, ptr_block2leftabove));
-DEFINE(detok_coef_tree_ptr,                     offsetof(DETOK, vp9_coef_tree_ptr));
-DEFINE(detok_norm_ptr,                          offsetof(DETOK, norm_ptr));
-DEFINE(detok_ptr_coef_bands_x,                  offsetof(DETOK, ptr_coef_bands_x));
-
-DEFINE(detok_A,                                 offsetof(DETOK, A));
-DEFINE(detok_L,                                 offsetof(DETOK, L));
-
-DEFINE(detok_qcoeff_start_ptr,                  offsetof(DETOK, qcoeff_start_ptr));
-DEFINE(detok_coef_probs,                        offsetof(DETOK, coef_probs));
-DEFINE(detok_eob,                               offsetof(DETOK, eob));
-
-DEFINE(bool_decoder_user_buffer_end,            offsetof(BOOL_DECODER, user_buffer_end));
-DEFINE(bool_decoder_user_buffer,                offsetof(BOOL_DECODER, user_buffer));
-DEFINE(bool_decoder_value,                      offsetof(BOOL_DECODER, value));
-DEFINE(bool_decoder_count,                      offsetof(BOOL_DECODER, count));
-DEFINE(bool_decoder_range,                      offsetof(BOOL_DECODER, range));
-
-END
-
-/* add asserts for any offset that is not supported by assembly code */
-/* add asserts for any size that is not supported by assembly code */
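This file produces nothing useful at runtime; it exists so the build can
scrape structure offsets for the hand-written assembly. The general technique
(sketched with a hypothetical EMIT macro; the real macros live in
vpx_ports/asm_offsets.h and are platform-specific) looks like:

    #include <stddef.h>

    /* Emit a marker line into the generated .s file for a build script
       to scrape; this is a sketch, not the actual vpx macro. */
    #define EMIT(name, value) \
      __asm__ volatile("\n->" #name " %0" : : "i"(value))

    struct example { int a; char pad[12]; int b; };

    void emit_offsets(void) {
      EMIT(example_a, offsetof(struct example, a));   /* -> example_a 0  */
      EMIT(example_b, offsetof(struct example, b));   /* -> example_b 16 */
    }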
--- a/vp8/decoder/dboolhuff.c
+++ /dev/null
@@ -1,100 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "dboolhuff.h"
-#include "vpx_ports/mem.h"
-#include "vpx_mem/vpx_mem.h"
-
-int vp9_start_decode(BOOL_DECODER *br,
-                     const unsigned char *source,
-                     unsigned int source_sz) {
-  br->user_buffer_end = source + source_sz;
-  br->user_buffer     = source;
-  br->value    = 0;
-  br->count    = -8;
-  br->range    = 255;
-
-  if (source_sz && !source)
-    return 1;
-
-  /* Populate the buffer */
-  vp9_bool_decoder_fill(br);
-
-  return 0;
-}
-
-
-void vp9_bool_decoder_fill(BOOL_DECODER *br) {
-  const unsigned char *bufptr;
-  const unsigned char *bufend;
-  VP9_BD_VALUE         value;
-  int                  count;
-  bufend = br->user_buffer_end;
-  bufptr = br->user_buffer;
-  value = br->value;
-  count = br->count;
-
-  VP9DX_BOOL_DECODER_FILL(count, value, bufptr, bufend);
-
-  br->user_buffer = bufptr;
-  br->value = value;
-  br->count = count;
-}
-
-
-static int get_unsigned_bits(unsigned num_values) {
-  int cat = 0;
-  if ((num_values--) <= 1) return 0;
-  while (num_values > 0) {
-    cat++;
-    num_values >>= 1;
-  }
-  return cat;
-}
-
-int vp9_inv_recenter_nonneg(int v, int m) {
-  if (v > (m << 1)) return v;
-  else if ((v & 1) == 0) return (v >> 1) + m;
-  else return m - ((v + 1) >> 1);
-}
-
-int vp9_decode_uniform(BOOL_DECODER *br, int n) {
-  int v;
-  int l = get_unsigned_bits(n);
-  int m = (1 << l) - n;
-  if (!l) return 0;
-  v = decode_value(br, l - 1);
-  if (v < m)
-    return v;
-  else
-    return (v << 1) - m + decode_value(br, 1);
-}
-
-int vp9_decode_term_subexp(BOOL_DECODER *br, int k, int num_syms) {
-  int i = 0, mk = 0, word;
-  while (1) {
-    int b = (i ? k + i - 1 : k);
-    int a = (1 << b);
-    if (num_syms <= mk + 3 * a) {
-      word = vp9_decode_uniform(br, num_syms - mk) + mk;
-      break;
-    } else {
-      if (decode_value(br, 1)) {
-        i++;
-        mk += a;
-      } else {
-        word = decode_value(br, b) + mk;
-        break;
-      }
-    }
-  }
-  return word;
-}
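vp9_decode_uniform() above reads a truncated binary code: of n symbols, the
first m = 2^l - n get l - 1 bits and the remainder get l bits. A standalone
check of that split (a sketch, not part of this patch):

    #include <stdio.h>

    static int bits_for(unsigned num_values) {  /* mirrors get_unsigned_bits() */
      int cat = 0;
      if (num_values-- <= 1) return 0;
      while (num_values > 0) { cat++; num_values >>= 1; }
      return cat;
    }

    int main(void) {
      int n = 5, l = bits_for(n), m = (1 << l) - n;  /* l = 3, m = 3 */
      /* Symbols 0..m-1 take l-1 bits; symbols m..n-1 take l bits. */
      printf("n=%d: %d short (%d-bit) codes, %d long (%d-bit) codes\n",
             n, m, l - 1, n - m, l);
      return 0;
    }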
--- a/vp8/decoder/dboolhuff.h
+++ /dev/null
@@ -1,153 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef DBOOLHUFF_H
-#define DBOOLHUFF_H
-#include <stddef.h>
-#include <limits.h>
-#include "vpx_ports/config.h"
-#include "vpx_ports/mem.h"
-#include "vpx/vpx_integer.h"
-
-typedef size_t VP9_BD_VALUE;
-
-# define VP9_BD_VALUE_SIZE ((int)sizeof(VP9_BD_VALUE)*CHAR_BIT)
-/*This is meant to be a large, positive constant that can still be efficiently
-   loaded as an immediate (on platforms like ARM, for example).
-  Even relatively modest values like 100 would work fine.*/
-# define VP9_LOTS_OF_BITS (0x40000000)
-
-typedef struct {
-  const unsigned char *user_buffer_end;
-  const unsigned char *user_buffer;
-  VP9_BD_VALUE         value;
-  int                  count;
-  unsigned int         range;
-} BOOL_DECODER;
-
-DECLARE_ALIGNED(16, extern const unsigned char, vp9_norm[256]);
-
-int vp9_start_decode(BOOL_DECODER *br,
-                     const unsigned char *source,
-                     unsigned int source_sz);
-
-void vp9_bool_decoder_fill(BOOL_DECODER *br);
-
-int vp9_decode_uniform(BOOL_DECODER *br, int n);
-int vp9_decode_term_subexp(BOOL_DECODER *br, int k, int num_syms);
-int vp9_inv_recenter_nonneg(int v, int m);
-
-/*The refill loop is used in several places, so define it in a macro to make
-   sure they're all consistent.
-  An inline function would be cleaner, but has a significant penalty, because
-   multiple BOOL_DECODER fields must be modified, and the compiler is not smart
-   enough to eliminate the stores to those fields and the subsequent reloads
-   from them when inlining the function.*/
-#define VP9DX_BOOL_DECODER_FILL(_count,_value,_bufptr,_bufend) \
-  do \
-  { \
-    int shift = VP9_BD_VALUE_SIZE - 8 - ((_count) + 8); \
-    int loop_end, x; \
-    size_t bits_left = ((_bufend)-(_bufptr))*CHAR_BIT; \
-    \
-    x = shift + CHAR_BIT - bits_left; \
-    loop_end = 0; \
-    if(x >= 0) \
-    { \
-      (_count) += VP9_LOTS_OF_BITS; \
-      loop_end = x; \
-      if(!bits_left) break; \
-    } \
-    while(shift >= loop_end) \
-    { \
-      (_count) += CHAR_BIT; \
-      (_value) |= (VP9_BD_VALUE)*(_bufptr)++ << shift; \
-      shift -= CHAR_BIT; \
-    } \
-  } \
-  while(0) \
-
-
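For readability, here is the same refill logic as a plain function (a sketch
only; the shipped code keeps the macro form for the store/reload reason given
in the comment above):

    static void bool_decoder_refill(BOOL_DECODER *br) {
      int shift = VP9_BD_VALUE_SIZE - 8 - (br->count + 8);
      size_t bits_left =
          (size_t)(br->user_buffer_end - br->user_buffer) * CHAR_BIT;
      int x = (int)(shift + CHAR_BIT - bits_left);
      int loop_end = 0;

      if (x >= 0) {
        /* Source exhausted: credit LOTS_OF_BITS so bool_error() can later
           distinguish real data from end-of-stream padding. */
        br->count += VP9_LOTS_OF_BITS;
        loop_end = x;
        if (!bits_left) return;
      }
      while (shift >= loop_end) {  /* pack whole bytes, MSB first */
        br->count += CHAR_BIT;
        br->value |= (VP9_BD_VALUE)*br->user_buffer++ << shift;
        shift -= CHAR_BIT;
      }
    }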
-static int decode_bool(BOOL_DECODER *br, int probability) {
-  unsigned int bit = 0;
-  VP9_BD_VALUE value;
-  unsigned int split;
-  VP9_BD_VALUE bigsplit;
-  int count;
-  unsigned int range;
-
-  split = 1 + (((br->range - 1) * probability) >> 8);
-
-  if (br->count < 0)
-    vp9_bool_decoder_fill(br);
-
-  value = br->value;
-  count = br->count;
-
-  bigsplit = (VP9_BD_VALUE)split << (VP9_BD_VALUE_SIZE - 8);
-
-  range = split;
-
-  if (value >= bigsplit) {
-    range = br->range - split;
-    value = value - bigsplit;
-    bit = 1;
-  }
-
-  {
-    register unsigned int shift = vp9_norm[range];
-    range <<= shift;
-    value <<= shift;
-    count -= shift;
-  }
-  br->value = value;
-  br->count = count;
-  br->range = range;
-
-  return bit;
-}
-
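A worked pass through decode_bool() above, starting from the common state
range = 255 with probability = 128 (illustrative, not part of this patch):

    /*   split    = 1 + (((255 - 1) * 128) >> 8) = 128
     *   bigsplit = (VP9_BD_VALUE)128 << (VP9_BD_VALUE_SIZE - 8)
     *   value <  bigsplit: bit = 0, range = 128; vp9_norm[128] == 0,
     *                      so no renormalization is needed.
     *   value >= bigsplit: bit = 1, range = 255 - 128 = 127;
     *                      vp9_norm[127] == 1, so range and value shift
     *                      left once to bring range back into [128, 255].
     */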
-static int decode_value(BOOL_DECODER *br, int bits) {
-  int z = 0;
-  int bit;
-
-  for (bit = bits - 1; bit >= 0; bit--) {
-    z |= (decode_bool(br, 0x80) << bit);
-  }
-
-  return z;
-}
-
-static int bool_error(BOOL_DECODER *br) {
-  /* Check if we have reached the end of the buffer.
-   *
-   * Variable 'count' stores the number of bits in the 'value' buffer, minus
-   * 8. The top byte is part of the algorithm, and the remainder is buffered
-   * to be shifted into it. So if count == 8, the top 16 bits of 'value' are
-   * occupied, 8 for the algorithm and 8 in the buffer.
-   *
-   * When reading a byte from the user's buffer, count is filled with 8 and
-   * one byte is filled into the value buffer. When we reach the end of the
-   * data, count is additionally filled with VP9_LOTS_OF_BITS. So when
-   * count == VP9_LOTS_OF_BITS - 1, the user's data has been exhausted.
-   */
-  if ((br->count > VP9_BD_VALUE_SIZE) && (br->count < VP9_LOTS_OF_BITS)) {
-    /* We have tried to decode bits after the end of
-     * stream was encountered.
-     */
-    return 1;
-  }
-
-  /* No error. */
-  return 0;
-}
-
-#endif
--- a/vp8/decoder/decodemv.c
+++ /dev/null
@@ -1,1199 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "treereader.h"
-#include "vp8/common/entropymv.h"
-#include "vp8/common/entropymode.h"
-#include "onyxd_int.h"
-#include "vp8/common/findnearmv.h"
-
-#include "vp8/common/seg_common.h"
-#include "vp8/common/pred_common.h"
-#include "vp8/common/entropy.h"
-#include "vp8/decoder/decodemv.h"
-#if CONFIG_DEBUG
-#include <assert.h>
-#endif
-
-// #define DEBUG_DEC_MV
-#ifdef DEBUG_DEC_MV
-int dec_mvcount = 0;
-#endif
-
-static int read_bmode(vp9_reader *bc, const vp9_prob *p) {
-  return treed_read(bc, vp9_bmode_tree, p);
-}
-
-static int read_ymode(vp9_reader *bc, const vp9_prob *p) {
-  return treed_read(bc, vp9_ymode_tree, p);
-}
-
-#if CONFIG_SUPERBLOCKS
-static int read_kf_sb_ymode(vp9_reader *bc, const vp9_prob *p) {
-  return treed_read(bc, vp9_uv_mode_tree, p);
-}
-#endif
-
-static int read_kf_mb_ymode(vp9_reader *bc, const vp9_prob *p) {
-  return treed_read(bc, vp9_kf_ymode_tree, p);
-}
-
-static int read_i8x8_mode(vp9_reader *bc, const vp9_prob *p) {
-  return treed_read(bc, vp9_i8x8_mode_tree, p);
-}
-
-static int read_uv_mode(vp9_reader *bc, const vp9_prob *p) {
-  return treed_read(bc, vp9_uv_mode_tree, p);
-}
-
-// This function reads the current macroblock's segment id from the bitstream.
-// It should only be called if a segment map update is indicated.
-static void read_mb_segid(vp9_reader *r, MB_MODE_INFO *mi,
-                          MACROBLOCKD *xd) {
-  /* Is segmentation enabled */
-  if (xd->segmentation_enabled && xd->update_mb_segmentation_map) {
-    /* If so then read the segment id. */
-    if (vp9_read(r, xd->mb_segment_tree_probs[0]))
-      mi->segment_id =
-        (unsigned char)(2 + vp9_read(r, xd->mb_segment_tree_probs[2]));
-    else
-      mi->segment_id =
-        (unsigned char)(vp9_read(r, xd->mb_segment_tree_probs[1]));
-  }
-}
-
-#if CONFIG_NEW_MVREF
-int vp9_read_mv_ref_id(vp9_reader *r,
-                       vp9_prob * ref_id_probs) {
-  int ref_index = 0;
-
-  if (vp9_read(r, ref_id_probs[0])) {
-    ref_index++;
-    if (vp9_read(r, ref_id_probs[1])) {
-      ref_index++;
-      if (vp9_read(r, ref_id_probs[2]))
-        ref_index++;
-    }
-  }
-  return ref_index;
-}
-#endif
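vp9_read_mv_ref_id() above decodes a unary code over the three per-position
probabilities (a summary of the control flow, not part of this patch):

    /*  bits read      index
     *  0          ->  0
     *  1 0        ->  1
     *  1 1 0      ->  2
     *  1 1 1      ->  3
     */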
-
-extern const int vp9_i8x8_block[4];
-static void kfread_modes(VP9D_COMP *pbi,
-                         MODE_INFO *m,
-                         int mb_row,
-                         int mb_col,
-                         BOOL_DECODER* const bc) {
-  VP9_COMMON *const cm = &pbi->common;
-  const int mis = pbi->common.mode_info_stride;
-  int map_index = mb_row * pbi->common.mb_cols + mb_col;
-  MB_PREDICTION_MODE y_mode;
-
-  // Read the Macroblock segmentation map if it is being updated explicitly
-  // this frame (reset to 0 by default).
-  m->mbmi.segment_id = 0;
-  if (pbi->mb.update_mb_segmentation_map) {
-    read_mb_segid(bc, &m->mbmi, &pbi->mb);
-    pbi->common.last_frame_seg_map[map_index] = m->mbmi.segment_id;
-  }
-
-  m->mbmi.mb_skip_coeff = 0;
-  if (pbi->common.mb_no_coeff_skip &&
-      (!vp9_segfeature_active(&pbi->mb,
-                              m->mbmi.segment_id, SEG_LVL_EOB) ||
-       (vp9_get_segdata(&pbi->mb,
-                        m->mbmi.segment_id, SEG_LVL_EOB) != 0))) {
-    MACROBLOCKD *const xd  = &pbi->mb;
-    m->mbmi.mb_skip_coeff =
-      vp9_read(bc, vp9_get_pred_prob(cm, xd, PRED_MBSKIP));
-  } else {
-    if (vp9_segfeature_active(&pbi->mb,
-                              m->mbmi.segment_id, SEG_LVL_EOB) &&
-        (vp9_get_segdata(&pbi->mb,
-                         m->mbmi.segment_id, SEG_LVL_EOB) == 0)) {
-      m->mbmi.mb_skip_coeff = 1;
-    } else
-      m->mbmi.mb_skip_coeff = 0;
-  }
-
-#if CONFIG_SUPERBLOCKS
-  if (m->mbmi.encoded_as_sb) {
-    y_mode = (MB_PREDICTION_MODE) read_kf_sb_ymode(bc,
-      pbi->common.sb_kf_ymode_prob[pbi->common.kf_ymode_probs_index]);
-  } else
-#endif
-  y_mode = (MB_PREDICTION_MODE) read_kf_mb_ymode(bc,
-    pbi->common.kf_ymode_prob[pbi->common.kf_ymode_probs_index]);
-#if CONFIG_COMP_INTRA_PRED
-  m->mbmi.second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
-
-  m->mbmi.ref_frame = INTRA_FRAME;
-
-  if ((m->mbmi.mode = y_mode) == B_PRED) {
-    int i = 0;
-#if CONFIG_COMP_INTRA_PRED
-    int use_comp_pred = vp9_read(bc, 128);
-#endif
-    do {
-      const B_PREDICTION_MODE A = above_block_mode(m, i, mis);
-      const B_PREDICTION_MODE L = left_block_mode(m, i);
-
-      m->bmi[i].as_mode.first =
-        (B_PREDICTION_MODE) read_bmode(
-          bc, pbi->common.kf_bmode_prob [A] [L]);
-#if CONFIG_COMP_INTRA_PRED
-      if (use_comp_pred) {
-        m->bmi[i].as_mode.second =
-          (B_PREDICTION_MODE) read_bmode(
-            bc, pbi->common.kf_bmode_prob [A] [L]);
-      } else {
-        m->bmi[i].as_mode.second = (B_PREDICTION_MODE)(B_DC_PRED - 1);
-      }
-#endif
-    } while (++i < 16);
-  }
-  if (m->mbmi.mode == I8X8_PRED) {
-    int i;
-    int mode8x8;
-    for (i = 0; i < 4; i++) {
-      int ib = vp9_i8x8_block[i];
-      mode8x8 = read_i8x8_mode(bc, pbi->common.fc.i8x8_mode_prob);
-      m->bmi[ib + 0].as_mode.first = mode8x8;
-      m->bmi[ib + 1].as_mode.first = mode8x8;
-      m->bmi[ib + 4].as_mode.first = mode8x8;
-      m->bmi[ib + 5].as_mode.first = mode8x8;
-#if CONFIG_COMP_INTRA_PRED
-      m->bmi[ib + 0].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
-      m->bmi[ib + 1].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
-      m->bmi[ib + 4].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
-      m->bmi[ib + 5].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
-    }
-  } else
-    m->mbmi.uv_mode = (MB_PREDICTION_MODE)read_uv_mode(bc,
-                                                       pbi->common.kf_uv_mode_prob[m->mbmi.mode]);
-#if CONFIG_COMP_INTRA_PRED
-  m->mbmi.second_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
-
-#if CONFIG_SUPERBLOCKS
-  if (m->mbmi.encoded_as_sb)
-    m->mbmi.txfm_size = TX_8X8;
-  else
-#endif
-  if (cm->txfm_mode == TX_MODE_SELECT && m->mbmi.mb_skip_coeff == 0 &&
-      m->mbmi.mode <= I8X8_PRED) {
-    // FIXME(rbultje) code ternary symbol once all experiments are merged
-    m->mbmi.txfm_size = vp9_read(bc, cm->prob_tx[0]);
-    if (m->mbmi.txfm_size != TX_4X4 && m->mbmi.mode != I8X8_PRED)
-      m->mbmi.txfm_size += vp9_read(bc, cm->prob_tx[1]);
-  } else if (cm->txfm_mode >= ALLOW_16X16 && m->mbmi.mode <= TM_PRED) {
-    m->mbmi.txfm_size = TX_16X16;
-  } else if (cm->txfm_mode >= ALLOW_8X8 && m->mbmi.mode != B_PRED) {
-    m->mbmi.txfm_size = TX_8X8;
-  } else {
-    m->mbmi.txfm_size = TX_4X4;
-  }
-}
-
-static int read_nmv_component(vp9_reader *r,
-                              int rv,
-                              const nmv_component *mvcomp) {
-  int v, s, z, c, o, d;
-  s = vp9_read(r, mvcomp->sign);
-  c = treed_read(r, vp9_mv_class_tree, mvcomp->classes);
-  if (c == MV_CLASS_0) {
-    d = treed_read(r, vp9_mv_class0_tree, mvcomp->class0);
-  } else {
-    int i, b;
-    d = 0;
-    b = c + CLASS0_BITS - 1;  /* number of bits */
-    for (i = 0; i < b; ++i)
-      d |= (vp9_read(r, mvcomp->bits[i]) << i);
-  }
-  o = d << 3;
-
-  z = vp9_get_mv_mag(c, o);
-  v = (s ? -(z + 8) : (z + 8));
-  return v;
-}
-
-static int read_nmv_component_fp(vp9_reader *r,
-                                 int v,
-                                 int rv,
-                                 const nmv_component *mvcomp,
-                                 int usehp) {
-  int s, z, c, o, d, e, f;
-  s = v < 0;
-  z = (s ? -v : v) - 1;       /* magnitude - 1 */
-  z &= ~7;
-
-  c = vp9_get_mv_class(z, &o);
-  d = o >> 3;
-
-  if (c == MV_CLASS_0) {
-    f = treed_read(r, vp9_mv_fp_tree, mvcomp->class0_fp[d]);
-  } else {
-    f = treed_read(r, vp9_mv_fp_tree, mvcomp->fp);
-  }
-  o += (f << 1);
-
-  if (usehp) {
-    if (c == MV_CLASS_0) {
-      e = vp9_read(r, mvcomp->class0_hp);
-    } else {
-      e = vp9_read(r, mvcomp->hp);
-    }
-    o += e;
-  } else {
-    ++o;  /* Note if hp is not used, the default value of the hp bit is 1 */
-  }
-  z = vp9_get_mv_mag(c, o);
-  v = (s ? -(z + 1) : (z + 1));
-  return v;
-}
-
-static void read_nmv(vp9_reader *r, MV *mv, const MV *ref,
-                     const nmv_context *mvctx) {
-  MV_JOINT_TYPE j = treed_read(r, vp9_mv_joint_tree, mvctx->joints);
-  mv->row = mv->col = 0;
-  if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
-    mv->row = read_nmv_component(r, ref->row, &mvctx->comps[0]);
-  }
-  if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {
-    mv->col = read_nmv_component(r, ref->col, &mvctx->comps[1]);
-  }
-}
-
-static void read_nmv_fp(vp9_reader *r, MV *mv, const MV *ref,
-                        const nmv_context *mvctx, int usehp) {
-  MV_JOINT_TYPE j = vp9_get_mv_joint(*mv);
-  usehp = usehp && vp9_use_nmv_hp(ref);
-  if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
-    mv->row = read_nmv_component_fp(r, mv->row, ref->row, &mvctx->comps[0],
-                                    usehp);
-  }
-  if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {
-    mv->col = read_nmv_component_fp(r, mv->col, ref->col, &mvctx->comps[1],
-                                    usehp);
-  }
-  //printf("  %d: %d %d ref: %d %d\n", usehp, mv->row, mv-> col, ref->row, ref->col);
-}
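Taken together, the two component readers reconstruct one MV component in
eighth-pel units (a summary of the code above, not part of this patch):

    /*  coded as: sign s | class c | integer offset d | fraction f | hp e
     *  o  = d << 3;   integer part, fraction still zero
     *  o += f << 1;   quarter-pel position from the fp tree
     *  o += e;        eighth-pel bit when high precision is allowed;
     *                 otherwise the hp bit defaults to 1 (the ++o)
     *  magnitude = vp9_get_mv_mag(c, o), then +8 (integer pass) or
     *  +1 (fractional pass), negated when s is set.
     */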
-
-static void update_nmv(vp9_reader *bc, vp9_prob *const p,
-                       const vp9_prob upd_p) {
-  if (vp9_read(bc, upd_p)) {
-#ifdef LOW_PRECISION_MV_UPDATE
-    *p = (vp9_read_literal(bc, 7) << 1) | 1;
-#else
-    *p = (vp9_read_literal(bc, 8));
-#endif
-  }
-}
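With LOW_PRECISION_MV_UPDATE, an updated probability is sent as a 7-bit
payload and reconstructed as (payload << 1) | 1, so updated values are always
odd and one bit per probability is saved in the header:

    /* e.g. payload 0x40 -> *p = (0x40 << 1) | 1 = 0x81 = 129 */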
-
-static void read_nmvprobs(vp9_reader *bc, nmv_context *mvctx,
-                          int usehp) {
-  int i, j, k;
-#ifdef MV_GROUP_UPDATE
-  if (!vp9_read_bit(bc)) return;
-#endif
-  for (j = 0; j < MV_JOINTS - 1; ++j) {
-    update_nmv(bc, &mvctx->joints[j],
-               VP9_NMV_UPDATE_PROB);
-  }
-  for (i = 0; i < 2; ++i) {
-    update_nmv(bc, &mvctx->comps[i].sign,
-               VP9_NMV_UPDATE_PROB);
-    for (j = 0; j < MV_CLASSES - 1; ++j) {
-      update_nmv(bc, &mvctx->comps[i].classes[j],
-                 VP9_NMV_UPDATE_PROB);
-    }
-    for (j = 0; j < CLASS0_SIZE - 1; ++j) {
-      update_nmv(bc, &mvctx->comps[i].class0[j],
-                 VP9_NMV_UPDATE_PROB);
-    }
-    for (j = 0; j < MV_OFFSET_BITS; ++j) {
-      update_nmv(bc, &mvctx->comps[i].bits[j],
-                 VP9_NMV_UPDATE_PROB);
-    }
-  }
-
-  for (i = 0; i < 2; ++i) {
-    for (j = 0; j < CLASS0_SIZE; ++j) {
-      for (k = 0; k < 3; ++k)
-        update_nmv(bc, &mvctx->comps[i].class0_fp[j][k],
-                   VP9_NMV_UPDATE_PROB);
-    }
-    for (j = 0; j < 3; ++j) {
-      update_nmv(bc, &mvctx->comps[i].fp[j],
-                 VP9_NMV_UPDATE_PROB);
-    }
-  }
-
-  if (usehp) {
-    for (i = 0; i < 2; ++i) {
-      update_nmv(bc, &mvctx->comps[i].class0_hp,
-                 VP9_NMV_UPDATE_PROB);
-      update_nmv(bc, &mvctx->comps[i].hp,
-                 VP9_NMV_UPDATE_PROB);
-    }
-  }
-}
-
-// Read the reference frame
-static MV_REFERENCE_FRAME read_ref_frame(VP9D_COMP *pbi,
-                                         vp9_reader *const bc,
-                                         unsigned char segment_id) {
-  MV_REFERENCE_FRAME ref_frame;
-  int seg_ref_active;
-  int seg_ref_count = 0;
-
-  VP9_COMMON *const cm = &pbi->common;
-  MACROBLOCKD *const xd = &pbi->mb;
-
-  seg_ref_active = vp9_segfeature_active(xd,
-                                         segment_id,
-                                         SEG_LVL_REF_FRAME);
-
-  // If segment coding is enabled, does the segment allow for more than one
-  // possible reference frame?
-  if (seg_ref_active) {
-    seg_ref_count = vp9_check_segref(xd, segment_id, INTRA_FRAME) +
-                    vp9_check_segref(xd, segment_id, LAST_FRAME) +
-                    vp9_check_segref(xd, segment_id, GOLDEN_FRAME) +
-                    vp9_check_segref(xd, segment_id, ALTREF_FRAME);
-  }
-
-  // Segment reference frame features are not available, or allow for
-  // multiple reference frame options.
-  if (!seg_ref_active || (seg_ref_count > 1)) {
-    // Values used in prediction model coding
-    unsigned char prediction_flag;
-    vp9_prob pred_prob;
-    MV_REFERENCE_FRAME pred_ref;
-
-    // Get the context probability for the prediction flag
-    pred_prob = vp9_get_pred_prob(cm, xd, PRED_REF);
-
-    // Read the prediction status flag
-    prediction_flag = (unsigned char)vp9_read(bc, pred_prob);
-
-    // Store the prediction flag.
-    vp9_set_pred_flag(xd, PRED_REF, prediction_flag);
-
-    // Get the predicted reference frame.
-    pred_ref = vp9_get_pred_ref(cm, xd);
-
-    // If correctly predicted then use the predicted value
-    if (prediction_flag) {
-      ref_frame = pred_ref;
-    }
-    // else decode the explicitly coded value
-    else {
-      vp9_prob mod_refprobs[PREDICTION_PROBS];
-      vpx_memcpy(mod_refprobs,
-                 cm->mod_refprobs[pred_ref], sizeof(mod_refprobs));
-
-      // If segment coding is enabled, blank out options that can't occur by
-      // setting the branch probability to 0.
-      if (seg_ref_active) {
-        mod_refprobs[INTRA_FRAME] *=
-          vp9_check_segref(xd, segment_id, INTRA_FRAME);
-        mod_refprobs[LAST_FRAME] *=
-          vp9_check_segref(xd, segment_id, LAST_FRAME);
-        mod_refprobs[GOLDEN_FRAME] *=
-          (vp9_check_segref(xd, segment_id, GOLDEN_FRAME) *
-           vp9_check_segref(xd, segment_id, ALTREF_FRAME));
-      }
-
-      // Default to INTRA_FRAME (value 0)
-      ref_frame = INTRA_FRAME;
-
-      // Do we need to decode the Intra/Inter branch
-      if (mod_refprobs[0])
-        ref_frame = (MV_REFERENCE_FRAME) vp9_read(bc, mod_refprobs[0]);
-      else
-        ref_frame++;
-
-      if (ref_frame) {
-        // Do we need to decode the Last/Gf_Arf branch
-        if (mod_refprobs[1])
-          ref_frame += vp9_read(bc, mod_refprobs[1]);
-        else
-          ref_frame++;
-
-        if (ref_frame > 1) {
-          // Do we need to decode the GF/Arf branch
-          if (mod_refprobs[2])
-            ref_frame += vp9_read(bc, mod_refprobs[2]);
-          else {
-            if (seg_ref_active) {
-              if ((pred_ref == GOLDEN_FRAME) ||
-                  !vp9_check_segref(xd, segment_id, GOLDEN_FRAME)) {
-                ref_frame = ALTREF_FRAME;
-              } else
-                ref_frame = GOLDEN_FRAME;
-            } else
-              ref_frame = (pred_ref == GOLDEN_FRAME)
-                          ? ALTREF_FRAME : GOLDEN_FRAME;
-          }
-        }
-      }
-    }
-  }
-
-  // Segment reference frame features are enabled
-  else {
-    // The reference frame for the mb is considered correctly predicted
-    // if it is signaled at the segment level for the purposes of the
-    // common prediction model
-    vp9_set_pred_flag(xd, PRED_REF, 1);
-    ref_frame = vp9_get_pred_ref(cm, xd);
-  }
-
-  return (MV_REFERENCE_FRAME)ref_frame;
-}
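When the prediction flag is 0, the explicit value is decoded over three
binary branches (a summary of the code above, not part of this patch):

    /*  intra/inter -> last/(gf or arf) -> golden/altref
     *  A branch whose mod_refprobs[] entry was zeroed out by the segment
     *  feature is never read from the bitstream; its outcome is inferred,
     *  which is why ref_frame is simply incremented in those cases.
     */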
-
-#if CONFIG_SUPERBLOCKS
-static MB_PREDICTION_MODE read_sb_mv_ref(vp9_reader *bc, const vp9_prob *p) {
-  return (MB_PREDICTION_MODE) treed_read(bc, vp9_sb_mv_ref_tree, p);
-}
-#endif
-
-static MB_PREDICTION_MODE read_mv_ref(vp9_reader *bc, const vp9_prob *p) {
-  return (MB_PREDICTION_MODE) treed_read(bc, vp9_mv_ref_tree, p);
-}
-
-static B_PREDICTION_MODE sub_mv_ref(vp9_reader *bc, const vp9_prob *p) {
-  return (B_PREDICTION_MODE) treed_read(bc, vp9_sub_mv_ref_tree, p);
-}
-
-#ifdef VPX_MODE_COUNT
-unsigned int vp9_mv_cont_count[5][4] = {
-  { 0, 0, 0, 0 },
-  { 0, 0, 0, 0 },
-  { 0, 0, 0, 0 },
-  { 0, 0, 0, 0 },
-  { 0, 0, 0, 0 }
-};
-#endif
-
-static const unsigned char mbsplit_fill_count[4] = {8, 8, 4, 1};
-static const unsigned char mbsplit_fill_offset[4][16] = {
-  { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15},
-  { 0,  1,  4,  5,  8,  9, 12, 13,  2,  3,   6,  7, 10, 11, 14, 15},
-  { 0,  1,  4,  5,  2,  3,  6,  7,  8,  9,  12, 13, 10, 11, 14, 15},
-  { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15}
-};
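The SPLITMV fill tables above map a split type s to its partitions' 4x4 block
indices (a summary, not part of this patch):

    /*  s = 0: two 16x8 halves   -> 2 partitions of 8 blocks each
     *  s = 1: two 8x16 halves   -> 2 partitions of 8 blocks each
     *  s = 2: four 8x8 quarters -> 4 partitions of 4 blocks each
     *  s = 3: sixteen 4x4       -> 16 partitions of 1 block each
     *  mbsplit_fill_offset[s] lists the 16 block indices grouped by
     *  partition; mbsplit_fill_count[s] is the per-partition count
     *  consumed by the fill loop in read_mb_modes_mv().
     */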
-
-static void read_switchable_interp_probs(VP9D_COMP* const pbi,
-                                         BOOL_DECODER* const bc) {
-  VP9_COMMON *const cm = &pbi->common;
-  int i, j;
-  for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) {
-    for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) {
-      cm->fc.switchable_interp_prob[j][i] = vp9_read_literal(bc, 8);
-    }
-  }
-  //printf("DECODER: %d %d\n", cm->fc.switchable_interp_prob[0],
-  //cm->fc.switchable_interp_prob[1]);
-}
-
-static void mb_mode_mv_init(VP9D_COMP *pbi, vp9_reader *bc) {
-  VP9_COMMON *const cm = &pbi->common;
-  nmv_context *const nmvc = &pbi->common.fc.nmvc;
-  MACROBLOCKD *const xd  = &pbi->mb;
-
-  if (cm->frame_type == KEY_FRAME) {
-    if (!cm->kf_ymode_probs_update)
-      cm->kf_ymode_probs_index = vp9_read_literal(bc, 3);
-  } else {
-#if CONFIG_PRED_FILTER
-    cm->pred_filter_mode = (vp9_prob)vp9_read_literal(bc, 2);
-
-    if (cm->pred_filter_mode == 2)
-      cm->prob_pred_filter_off = (vp9_prob)vp9_read_literal(bc, 8);
-#endif
-    if (cm->mcomp_filter_type == SWITCHABLE)
-      read_switchable_interp_probs(pbi, bc);
-    // Decode the baseline probabilities for decoding reference frame
-    cm->prob_intra_coded = (vp9_prob)vp9_read_literal(bc, 8);
-    cm->prob_last_coded  = (vp9_prob)vp9_read_literal(bc, 8);
-    cm->prob_gf_coded    = (vp9_prob)vp9_read_literal(bc, 8);
-
-    // Computes a modified set of probabilities for use when reference
-    // frame prediction fails.
-    vp9_compute_mod_refprobs(cm);
-
-    pbi->common.comp_pred_mode = vp9_read(bc, 128);
-    if (cm->comp_pred_mode)
-      cm->comp_pred_mode += vp9_read(bc, 128);
-    if (cm->comp_pred_mode == HYBRID_PREDICTION) {
-      int i;
-      for (i = 0; i < COMP_PRED_CONTEXTS; i++)
-        cm->prob_comppred[i] = (vp9_prob)vp9_read_literal(bc, 8);
-    }
-
-    if (vp9_read_bit(bc)) {
-      int i = 0;
-
-      do {
-        cm->fc.ymode_prob[i] = (vp9_prob) vp9_read_literal(bc, 8);
-      } while (++i < VP9_YMODES - 1);
-    }
-
-#if CONFIG_NEW_MVREF
-    // Temporary default probabilities for decoding the MV ref id signal
-    vpx_memset(xd->mb_mv_ref_id_probs, 192, sizeof(xd->mb_mv_ref_id_probs));
-#endif
-
-    read_nmvprobs(bc, nmvc, xd->allow_high_precision_mv);
-  }
-}
-
-// This function either reads the segment id for the current macroblock from
-// the bitstream or, if the value is temporally predicted, uses the
-// predicted value.
-static void read_mb_segment_id(VP9D_COMP *pbi,
-                               int mb_row, int mb_col,
-                               BOOL_DECODER* const bc) {
-  VP9_COMMON *const cm = &pbi->common;
-  MACROBLOCKD *const xd  = &pbi->mb;
-  MODE_INFO *mi = xd->mode_info_context;
-  MB_MODE_INFO *mbmi = &mi->mbmi;
-  int index = mb_row * pbi->common.mb_cols + mb_col;
-
-  if (xd->segmentation_enabled) {
-    if (xd->update_mb_segmentation_map) {
-      // Is temporal coding of the segment id for this mb enabled.
-      if (cm->temporal_update) {
-        // Get the context based probability for reading the
-        // prediction status flag
-        vp9_prob pred_prob =
-          vp9_get_pred_prob(cm, xd, PRED_SEG_ID);
-
-        // Read the prediction status flag
-        unsigned char seg_pred_flag =
-          (unsigned char)vp9_read(bc, pred_prob);
-
-        // Store the prediction flag.
-        vp9_set_pred_flag(xd, PRED_SEG_ID, seg_pred_flag);
-
-        // If the value is flagged as correctly predicted
-        // then use the predicted value
-        if (seg_pred_flag) {
-          mbmi->segment_id = vp9_get_pred_mb_segid(cm, xd, index);
-        }
-        // Else .... decode it explicitly
-        else {
-          read_mb_segid(bc, mbmi, xd);
-        }
-      }
-      // Normal unpredicted coding mode
-      else {
-        read_mb_segid(bc, mbmi, xd);
-      }
-#if CONFIG_SUPERBLOCKS
-      if (mbmi->encoded_as_sb) {
-        cm->last_frame_seg_map[index] = mbmi->segment_id;
-        if (mb_col + 1 < cm->mb_cols)
-          cm->last_frame_seg_map[index + 1] = mbmi->segment_id;
-        if (mb_row + 1 < cm->mb_rows) {
-          cm->last_frame_seg_map[index + cm->mb_cols] = mbmi->segment_id;
-          if (mb_col + 1 < cm->mb_cols)
-            cm->last_frame_seg_map[index + cm->mb_cols + 1] = mbmi->segment_id;
-        }
-      } else
-#endif
-      {
-        cm->last_frame_seg_map[index] = mbmi->segment_id;
-      }
-    } else {
-#if CONFIG_SUPERBLOCKS
-      if (mbmi->encoded_as_sb) {
-        mbmi->segment_id = cm->last_frame_seg_map[index];
-        if (mb_col < cm->mb_cols - 1)
-          mbmi->segment_id = mbmi->segment_id &&
-                             cm->last_frame_seg_map[index + 1];
-        if (mb_row < cm->mb_rows - 1) {
-          mbmi->segment_id = mbmi->segment_id &&
-                             cm->last_frame_seg_map[index + cm->mb_cols];
-          if (mb_col < cm->mb_cols - 1)
-            mbmi->segment_id = mbmi->segment_id &&
-                               cm->last_frame_seg_map[index + cm->mb_cols + 1];
-        }
-      } else
-#endif
-      {
-        mbmi->segment_id = cm->last_frame_seg_map[index];
-      }
-    }
-  } else {
-    // The encoder explicitly sets the segment_id to 0
-    // when segmentation is disabled
-    mbmi->segment_id = 0;
-  }
-}
-
-static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
-                             MODE_INFO *prev_mi,
-                             int mb_row, int mb_col,
-                             BOOL_DECODER* const bc) {
-  VP9_COMMON *const cm = &pbi->common;
-  nmv_context *const nmvc = &pbi->common.fc.nmvc;
-  const int mis = pbi->common.mode_info_stride;
-  MACROBLOCKD *const xd  = &pbi->mb;
-
-  int_mv *const mv = &mbmi->mv;
-  int mb_to_left_edge;
-  int mb_to_right_edge;
-  int mb_to_top_edge;
-  int mb_to_bottom_edge;
-
-  mb_to_top_edge = xd->mb_to_top_edge;
-  mb_to_bottom_edge = xd->mb_to_bottom_edge;
-  mb_to_top_edge -= LEFT_TOP_MARGIN;
-  mb_to_bottom_edge += RIGHT_BOTTOM_MARGIN;
-  mbmi->need_to_clamp_mvs = 0;
-  mbmi->need_to_clamp_secondmv = 0;
-  mbmi->second_ref_frame = 0;
-  /* Distance of Mb to the various image edges.
-   * These are specified to 1/8th pel as they are always compared to MV values that are in 1/8th pel units.
-   */
-  xd->mb_to_left_edge =
-    mb_to_left_edge = -((mb_col * 16) << 3);
-  mb_to_left_edge -= LEFT_TOP_MARGIN;
-
-  xd->mb_to_right_edge =
-    mb_to_right_edge = ((pbi->common.mb_cols - 1 - mb_col) * 16) << 3;
-  mb_to_right_edge += RIGHT_BOTTOM_MARGIN;
-
-  // Make sure the MACROBLOCKD mode info pointer is pointed at the
-  // correct entry for the current macroblock.
-  xd->mode_info_context = mi;
-  xd->prev_mode_info_context = prev_mi;
-
-  // Read the macroblock segment id.
-  read_mb_segment_id(pbi, mb_row, mb_col, bc);
-
-  if (pbi->common.mb_no_coeff_skip &&
-      (!vp9_segfeature_active(xd,
-                              mbmi->segment_id, SEG_LVL_EOB) ||
-       (vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_EOB) != 0))) {
-    // Read the macroblock coeff skip flag if this feature is in use,
-    // else default to 0
-    mbmi->mb_skip_coeff = vp9_read(bc, vp9_get_pred_prob(cm, xd, PRED_MBSKIP));
-  } else {
-    if (vp9_segfeature_active(xd,
-                              mbmi->segment_id, SEG_LVL_EOB) &&
-        (vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_EOB) == 0)) {
-      mbmi->mb_skip_coeff = 1;
-    } else
-      mbmi->mb_skip_coeff = 0;
-  }
-
-  // Read the reference frame
-  mbmi->ref_frame = read_ref_frame(pbi, bc, mbmi->segment_id);
-
-  // If reference frame is an Inter frame
-  if (mbmi->ref_frame) {
-    int rct[4];
-    int_mv nearest, nearby, best_mv;
-    int_mv nearest_second, nearby_second, best_mv_second;
-    vp9_prob mv_ref_p [VP9_MVREFS - 1];
-
-#if CONFIG_NEWBESTREFMV
-    int recon_y_stride, recon_yoffset;
-    int recon_uv_stride, recon_uvoffset;
-#endif
-
-    vp9_find_near_mvs(xd, mi,
-                      prev_mi,
-                      &nearest, &nearby, &best_mv, rct,
-                      mbmi->ref_frame, cm->ref_frame_sign_bias);
-
-#if CONFIG_NEWBESTREFMV
-    {
-      int ref_fb_idx;
-      MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame;
-
-      /* Select the appropriate reference frame for this MB */
-      if (ref_frame == LAST_FRAME)
-        ref_fb_idx = cm->lst_fb_idx;
-      else if (ref_frame == GOLDEN_FRAME)
-        ref_fb_idx = cm->gld_fb_idx;
-      else
-        ref_fb_idx = cm->alt_fb_idx;
-
-      recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
-      recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
-
-      recon_yoffset = (mb_row * recon_y_stride * 16) + (mb_col * 16);
-      recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col * 8);
-
-      xd->pre.y_buffer = cm->yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
-      xd->pre.u_buffer = cm->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
-      xd->pre.v_buffer = cm->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
-
-      vp9_find_mv_refs(xd, mi, prev_mi,
-                       ref_frame, mbmi->ref_mvs[ref_frame],
-                       cm->ref_frame_sign_bias);
-
-      vp9_find_best_ref_mvs(xd,
-                            xd->pre.y_buffer,
-                            recon_y_stride,
-                            mbmi->ref_mvs[ref_frame],
-                            &best_mv, &nearest, &nearby);
-    }
-#endif
-
-    vp9_mv_ref_probs(&pbi->common, mv_ref_p, rct);
-
-    // Is the segment level mode feature enabled for this segment
-    if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_MODE)) {
-      mbmi->mode =
-        vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_MODE);
-    } else {
-#if CONFIG_SUPERBLOCKS
-      if (mbmi->encoded_as_sb) {
-        mbmi->mode = read_sb_mv_ref(bc, mv_ref_p);
-      } else
-#endif
-      mbmi->mode = read_mv_ref(bc, mv_ref_p);
-
-      vp9_accum_mv_refs(&pbi->common, mbmi->mode, rct);
-    }
-
-#if CONFIG_PRED_FILTER
-    if (mbmi->mode >= NEARESTMV && mbmi->mode < SPLITMV) {
-      // Is the prediction filter enabled
-      if (cm->pred_filter_mode == 2)
-        mbmi->pred_filter_enabled =
-          vp9_read(bc, cm->prob_pred_filter_off);
-      else
-        mbmi->pred_filter_enabled = cm->pred_filter_mode;
-    }
-#endif
-    if (mbmi->mode >= NEARESTMV && mbmi->mode <= SPLITMV) {
-      if (cm->mcomp_filter_type == SWITCHABLE) {
-        mbmi->interp_filter = vp9_switchable_interp[
-            treed_read(bc, vp9_switchable_interp_tree,
-                       vp9_get_pred_probs(cm, xd, PRED_SWITCHABLE_INTERP))];
-      } else {
-        mbmi->interp_filter = cm->mcomp_filter_type;
-      }
-    }
-
-    if (cm->comp_pred_mode == COMP_PREDICTION_ONLY ||
-        (cm->comp_pred_mode == HYBRID_PREDICTION &&
-         vp9_read(bc, vp9_get_pred_prob(cm, xd, PRED_COMP)))) {
-      /* Since we have 3 reference frames, we can only have 3 unique
-       * combinations of 2 different reference frames
-       * (A-G, G-L or A-L). In the bitstream, we use this to simply
-       * derive the second reference frame from the first reference
-       * frame, by saying it's the next one in the enumerator, and
-       * if that's > n_refs, then the second reference frame is the
-       * first one in the enumerator. */
-      mbmi->second_ref_frame = mbmi->ref_frame + 1;
-      if (mbmi->second_ref_frame == 4)
-        mbmi->second_ref_frame = 1;
-#if CONFIG_NEWBESTREFMV
-      if (mbmi->second_ref_frame) {
-        int second_ref_fb_idx;
-        /* Select the appropriate reference frame for this MB */
-        if (mbmi->second_ref_frame == LAST_FRAME)
-          second_ref_fb_idx = cm->lst_fb_idx;
-        else if (mbmi->second_ref_frame ==
-          GOLDEN_FRAME)
-          second_ref_fb_idx = cm->gld_fb_idx;
-        else
-          second_ref_fb_idx = cm->alt_fb_idx;
-
-        xd->second_pre.y_buffer =
-          cm->yv12_fb[second_ref_fb_idx].y_buffer + recon_yoffset;
-        xd->second_pre.u_buffer =
-          cm->yv12_fb[second_ref_fb_idx].u_buffer + recon_uvoffset;
-        xd->second_pre.v_buffer =
-          cm->yv12_fb[second_ref_fb_idx].v_buffer + recon_uvoffset;
-        vp9_find_near_mvs(xd, mi, prev_mi,
-                          &nearest_second, &nearby_second, &best_mv_second,
-                          rct,
-                          mbmi->second_ref_frame,
-                          cm->ref_frame_sign_bias);
-
-        vp9_find_mv_refs(xd, mi, prev_mi,
-                         mbmi->second_ref_frame,
-                         mbmi->ref_mvs[mbmi->second_ref_frame],
-                         cm->ref_frame_sign_bias);
-
-        vp9_find_best_ref_mvs(xd,
-                              xd->second_pre.y_buffer,
-                              recon_y_stride,
-                              mbmi->ref_mvs[mbmi->second_ref_frame],
-                              &best_mv_second,
-                              &nearest_second,
-                              &nearby_second);
-      }
-#else
-      vp9_find_near_mvs(xd, mi, prev_mi,
-                        &nearest_second, &nearby_second, &best_mv_second,
-                        rct,
-                        mbmi->second_ref_frame,
-                        pbi->common.ref_frame_sign_bias);
-#endif
-    } else {
-      mbmi->second_ref_frame = 0;
-    }
-
-    mbmi->uv_mode = DC_PRED;
-    switch (mbmi->mode) {
-      case SPLITMV: {
-        const int s = mbmi->partitioning =
-                        treed_read(bc, vp9_mbsplit_tree, cm->fc.mbsplit_prob);
-        const int num_p = vp9_mbsplit_count [s];
-        int j = 0;
-        cm->fc.mbsplit_counts[s]++;
-
-        mbmi->need_to_clamp_mvs = 0;
-        do { /* for each subset j */
-          int_mv leftmv, abovemv, second_leftmv, second_abovemv;
-          int_mv blockmv, secondmv;
-          int k;  /* first block in subset j */
-          int mv_contz;
-          int blockmode;
-
-          k = vp9_mbsplit_offset[s][j];
-
-          leftmv.as_int = left_block_mv(mi, k);
-          abovemv.as_int = above_block_mv(mi, k, mis);
-          if (mbmi->second_ref_frame) {
-            second_leftmv.as_int = left_block_second_mv(mi, k);
-            second_abovemv.as_int = above_block_second_mv(mi, k, mis);
-          }
-          mv_contz = vp9_mv_cont(&leftmv, &abovemv);
-          blockmode = sub_mv_ref(bc, cm->fc.sub_mv_ref_prob [mv_contz]);
-          cm->fc.sub_mv_ref_counts[mv_contz][blockmode - LEFT4X4]++;
-
-          switch (blockmode) {
-            case NEW4X4:
-              read_nmv(bc, &blockmv.as_mv, &best_mv.as_mv, nmvc);
-              read_nmv_fp(bc, &blockmv.as_mv, &best_mv.as_mv, nmvc,
-                          xd->allow_high_precision_mv);
-              vp9_increment_nmv(&blockmv.as_mv, &best_mv.as_mv,
-                                &cm->fc.NMVcount, xd->allow_high_precision_mv);
-              blockmv.as_mv.row += best_mv.as_mv.row;
-              blockmv.as_mv.col += best_mv.as_mv.col;
-
-              if (mbmi->second_ref_frame) {
-                read_nmv(bc, &secondmv.as_mv, &best_mv_second.as_mv, nmvc);
-                read_nmv_fp(bc, &secondmv.as_mv, &best_mv_second.as_mv, nmvc,
-                            xd->allow_high_precision_mv);
-                vp9_increment_nmv(&secondmv.as_mv, &best_mv_second.as_mv,
-                                  &cm->fc.NMVcount, xd->allow_high_precision_mv);
-                secondmv.as_mv.row += best_mv_second.as_mv.row;
-                secondmv.as_mv.col += best_mv_second.as_mv.col;
-              }
-#ifdef VPX_MODE_COUNT
-              vp9_mv_cont_count[mv_contz][3]++;
-#endif
-              break;
-            case LEFT4X4:
-              blockmv.as_int = leftmv.as_int;
-              if (mbmi->second_ref_frame)
-                secondmv.as_int = second_leftmv.as_int;
-#ifdef VPX_MODE_COUNT
-              vp9_mv_cont_count[mv_contz][0]++;
-#endif
-              break;
-            case ABOVE4X4:
-              blockmv.as_int = abovemv.as_int;
-              if (mbmi->second_ref_frame)
-                secondmv.as_int = second_abovemv.as_int;
-#ifdef VPX_MODE_COUNT
-              vp9_mv_cont_count[mv_contz][1]++;
-#endif
-              break;
-            case ZERO4X4:
-              blockmv.as_int = 0;
-              if (mbmi->second_ref_frame)
-                secondmv.as_int = 0;
-#ifdef VPX_MODE_COUNT
-              vp9_mv_cont_count[mv_contz][2]++;
-#endif
-              break;
-            default:
-              break;
-          }
-
-          mbmi->need_to_clamp_mvs |= check_mv_bounds(&blockmv,
-                                                     mb_to_left_edge,
-                                                     mb_to_right_edge,
-                                                     mb_to_top_edge,
-                                                     mb_to_bottom_edge);
-          if (mbmi->second_ref_frame) {
-            mbmi->need_to_clamp_mvs |= check_mv_bounds(&secondmv,
-                                                       mb_to_left_edge,
-                                                       mb_to_right_edge,
-                                                       mb_to_top_edge,
-                                                       mb_to_bottom_edge);
-          }
-
-          {
-            /* Fill (uniform) modes, mvs of jth subset.
-             Must do it here because ensuing subsets can
-             refer back to us via "left" or "above". */
-            const unsigned char *fill_offset;
-            unsigned int fill_count = mbsplit_fill_count[s];
-
-            fill_offset = &mbsplit_fill_offset[s][(unsigned char)j * mbsplit_fill_count[s]];
-
-            do {
-              mi->bmi[ *fill_offset].as_mv.first.as_int = blockmv.as_int;
-              if (mbmi->second_ref_frame)
-                mi->bmi[ *fill_offset].as_mv.second.as_int = secondmv.as_int;
-              fill_offset++;
-            } while (--fill_count);
-          }
-
-        } while (++j < num_p);
-      }
-
-      mv->as_int = mi->bmi[15].as_mv.first.as_int;
-      mbmi->mv[1].as_int = mi->bmi[15].as_mv.second.as_int;
-
-      break;  /* done with SPLITMV */
-
-      case NEARMV:
-        mv->as_int = nearby.as_int;
-        /* Clip "next_nearest" so that it does not extend to far out of image */
-        clamp_mv(mv, mb_to_left_edge, mb_to_right_edge,
-                 mb_to_top_edge, mb_to_bottom_edge);
-        if (mbmi->second_ref_frame) {
-          mbmi->mv[1].as_int = nearby_second.as_int;
-          clamp_mv(&mbmi->mv[1], mb_to_left_edge, mb_to_right_edge,
-                   mb_to_top_edge, mb_to_bottom_edge);
-        }
-        break;
-
-      case NEARESTMV:
-        mv->as_int = nearest.as_int;
-        /* Clip "next_nearest" so that it does not extend to far out of image */
-        clamp_mv(mv, mb_to_left_edge, mb_to_right_edge,
-                 mb_to_top_edge, mb_to_bottom_edge);
-        if (mbmi->second_ref_frame) {
-          mbmi->mv[1].as_int = nearest_second.as_int;
-          clamp_mv(&mbmi->mv[1], mb_to_left_edge, mb_to_right_edge,
-                   mb_to_top_edge, mb_to_bottom_edge);
-        }
-        break;
-
-      case ZEROMV:
-        mv->as_int = 0;
-        if (mbmi->second_ref_frame)
-          mbmi->mv[1].as_int = 0;
-        break;
-
-      case NEWMV:
-
-#if CONFIG_NEW_MVREF
-        {
-          int best_index;
-          MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame;
-
-          // Decode the index of the choice.
-          best_index =
-            vp9_read_mv_ref_id(bc, xd->mb_mv_ref_id_probs[ref_frame]);
-
-          best_mv.as_int = mbmi->ref_mvs[ref_frame][best_index].as_int;
-        }
-#endif
-
-        read_nmv(bc, &mv->as_mv, &best_mv.as_mv, nmvc);
-        read_nmv_fp(bc, &mv->as_mv, &best_mv.as_mv, nmvc,
-                    xd->allow_high_precision_mv);
-        vp9_increment_nmv(&mv->as_mv, &best_mv.as_mv, &cm->fc.NMVcount,
-                          xd->allow_high_precision_mv);
-
-        mv->as_mv.row += best_mv.as_mv.row;
-        mv->as_mv.col += best_mv.as_mv.col;
-
-        /* Don't need to check this on NEARMV and NEARESTMV modes
-         * since those modes clamp the MV. The NEWMV mode does not,
-         * so signal to the prediction stage whether special
-         * handling may be required.
-         */
-        mbmi->need_to_clamp_mvs = check_mv_bounds(mv,
-                                                  mb_to_left_edge,
-                                                  mb_to_right_edge,
-                                                  mb_to_top_edge,
-                                                  mb_to_bottom_edge);
-
-        if (mbmi->second_ref_frame) {
-#if CONFIG_NEW_MVREF
-        {
-          int best_index;
-          MV_REFERENCE_FRAME ref_frame = mbmi->second_ref_frame;
-
-          // Decode the index of the choice.
-          best_index =
-            vp9_read_mv_ref_id(bc, xd->mb_mv_ref_id_probs[ref_frame]);
-          best_mv_second.as_int = mbmi->ref_mvs[ref_frame][best_index].as_int;
-        }
-#endif
-
-          read_nmv(bc, &mbmi->mv[1].as_mv, &best_mv_second.as_mv, nmvc);
-          read_nmv_fp(bc, &mbmi->mv[1].as_mv, &best_mv_second.as_mv, nmvc,
-                      xd->allow_high_precision_mv);
-          vp9_increment_nmv(&mbmi->mv[1].as_mv, &best_mv_second.as_mv,
-                            &cm->fc.NMVcount, xd->allow_high_precision_mv);
-          mbmi->mv[1].as_mv.row += best_mv_second.as_mv.row;
-          mbmi->mv[1].as_mv.col += best_mv_second.as_mv.col;
-          mbmi->need_to_clamp_secondmv |=
-            check_mv_bounds(&mbmi->mv[1],
-                            mb_to_left_edge, mb_to_right_edge,
-                            mb_to_top_edge, mb_to_bottom_edge);
-        }
-        break;
-      default:
-;
-#if CONFIG_DEBUG
-        assert(0);
-#endif
-    }
-  } else {
-    /* required for left and above block mv */
-    mbmi->mv[0].as_int = 0;
-
-    if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_MODE))
-      mbmi->mode = (MB_PREDICTION_MODE)
-                   vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_MODE);
-    else {
-      // FIXME write using SB mode tree
-      mbmi->mode = (MB_PREDICTION_MODE)
-                   read_ymode(bc, pbi->common.fc.ymode_prob);
-      pbi->common.fc.ymode_counts[mbmi->mode]++;
-    }
-#if CONFIG_COMP_INTRA_PRED
-    mbmi->second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
-
-    // If MB mode is BPRED read the block modes
-    if (mbmi->mode == B_PRED) {
-      int j = 0;
-#if CONFIG_COMP_INTRA_PRED
-      int use_comp_pred = vp9_read(bc, 128);
-#endif
-      do {
-        mi->bmi[j].as_mode.first = (B_PREDICTION_MODE)read_bmode(bc, pbi->common.fc.bmode_prob);
-        /*
-        {
-          int p;
-          for (p = 0; p < VP9_BINTRAMODES - 1; ++p)
-            printf(" %d", pbi->common.fc.bmode_prob[p]);
-          printf("\nbmode[%d][%d]: %d\n", pbi->common.current_video_frame, j, mi->bmi[j].as_mode.first);
-        }
-        */
-        pbi->common.fc.bmode_counts[mi->bmi[j].as_mode.first]++;
-#if CONFIG_COMP_INTRA_PRED
-        if (use_comp_pred) {
-          mi->bmi[j].as_mode.second = (B_PREDICTION_MODE)read_bmode(bc, pbi->common.fc.bmode_prob);
-        } else {
-          mi->bmi[j].as_mode.second = (B_PREDICTION_MODE)(B_DC_PRED - 1);
-        }
-#endif
-      } while (++j < 16);
-    }
-
-    if (mbmi->mode == I8X8_PRED) {
-      int i;
-      int mode8x8;
-      for (i = 0; i < 4; i++) {
-        int ib = vp9_i8x8_block[i];
-        mode8x8 = read_i8x8_mode(bc, pbi->common.fc.i8x8_mode_prob);
-        mi->bmi[ib + 0].as_mode.first = mode8x8;
-        mi->bmi[ib + 1].as_mode.first = mode8x8;
-        mi->bmi[ib + 4].as_mode.first = mode8x8;
-        mi->bmi[ib + 5].as_mode.first = mode8x8;
-        pbi->common.fc.i8x8_mode_counts[mode8x8]++;
-#if CONFIG_COMP_INTRA_PRED
-        mi->bmi[ib + 0].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
-        mi->bmi[ib + 1].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
-        mi->bmi[ib + 4].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
-        mi->bmi[ib + 5].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
-      }
-    } else {
-      mbmi->uv_mode = (MB_PREDICTION_MODE)read_uv_mode(
-        bc, pbi->common.fc.uv_mode_prob[mbmi->mode]);
-      pbi->common.fc.uv_mode_counts[mbmi->mode][mbmi->uv_mode]++;
-    }
-
-#if CONFIG_COMP_INTRA_PRED
-    mbmi->second_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
-  }
-
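-  // Transform size selection: under TX_MODE_SELECT one coded bit separates
-  // TX_4X4 from the larger sizes and, for modes where 16x16 is legal, a
-  // second bit chooses between TX_8X8 and TX_16X16; otherwise the frame's
-  // txfm_mode fixes the size directly.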
-#if CONFIG_SUPERBLOCKS
-  if (mbmi->encoded_as_sb)
-    mbmi->txfm_size = TX_8X8;
-  else
-#endif
-  if (cm->txfm_mode == TX_MODE_SELECT && mbmi->mb_skip_coeff == 0 &&
-      ((mbmi->ref_frame == INTRA_FRAME && mbmi->mode <= I8X8_PRED) ||
-       (mbmi->ref_frame != INTRA_FRAME && !(mbmi->mode == SPLITMV &&
-                           mbmi->partitioning == PARTITIONING_4X4)))) {
-    // FIXME(rbultje) code ternary symbol once all experiments are merged
-    mbmi->txfm_size = vp9_read(bc, cm->prob_tx[0]);
-    if (mbmi->txfm_size != TX_4X4 && mbmi->mode != I8X8_PRED &&
-        mbmi->mode != SPLITMV)
-      mbmi->txfm_size += vp9_read(bc, cm->prob_tx[1]);
-  } else if (cm->txfm_mode >= ALLOW_16X16 &&
-      ((mbmi->ref_frame == INTRA_FRAME && mbmi->mode <= TM_PRED) ||
-       (mbmi->ref_frame != INTRA_FRAME && mbmi->mode != SPLITMV))) {
-    mbmi->txfm_size = TX_16X16;
-  } else if (cm->txfm_mode >= ALLOW_8X8 &&
-      (!(mbmi->ref_frame == INTRA_FRAME && mbmi->mode == B_PRED) &&
-       !(mbmi->ref_frame != INTRA_FRAME && mbmi->mode == SPLITMV &&
-         mbmi->partitioning == PARTITIONING_4X4))) {
-    mbmi->txfm_size = TX_8X8;
-  } else {
-    mbmi->txfm_size = TX_4X4;
-  }
-}
-
-void vp9_decode_mode_mvs_init(VP9D_COMP *pbi, BOOL_DECODER* const bc) {
-  VP9_COMMON *cm = &pbi->common;
-
-  vpx_memset(cm->mbskip_pred_probs, 0, sizeof(cm->mbskip_pred_probs));
-  if (pbi->common.mb_no_coeff_skip) {
-    int k;
-    for (k = 0; k < MBSKIP_CONTEXTS; ++k)
-      cm->mbskip_pred_probs[k] = (vp9_prob)vp9_read_literal(bc, 8);
-  }
-
-  mb_mode_mv_init(pbi, bc);
-}
-
-void vp9_decode_mb_mode_mv(VP9D_COMP *pbi,
-                           MACROBLOCKD *xd,
-                           int mb_row,
-                           int mb_col,
-                           BOOL_DECODER* const bc) {
-  MODE_INFO *mi = xd->mode_info_context;
-  MODE_INFO *prev_mi = xd->prev_mode_info_context;
-
-  if (pbi->common.frame_type == KEY_FRAME)
-    kfread_modes(pbi, mi, mb_row, mb_col, bc);
-  else
-    read_mb_modes_mv(pbi, mi, &mi->mbmi, prev_mi, mb_row, mb_col, bc);
-}
--- a/vp8/decoder/decodemv.h
+++ /dev/null
@@ -1,19 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "onyxd_int.h"
-
-void vp9_decode_mb_mode_mv(VP9D_COMP* const pbi,
-                           MACROBLOCKD* const xd,
-                           int mb_row,
-                           int mb_col,
-                           BOOL_DECODER* const bc);
-void vp9_decode_mode_mvs_init(VP9D_COMP* const pbi, BOOL_DECODER* const bc);
--- a/vp8/decoder/decodframe.c
+++ /dev/null
@@ -1,1337 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "onyxd_int.h"
-#include "vp8/common/header.h"
-#include "vp8/common/reconintra.h"
-#include "vp8/common/reconintra4x4.h"
-#include "vp8/common/reconinter.h"
-#include "detokenize.h"
-#include "vp8/common/invtrans.h"
-#include "vp8/common/alloccommon.h"
-#include "vp8/common/entropymode.h"
-#include "vp8/common/quant_common.h"
-#include "vpx_scale/vpxscale.h"
-#include "vpx_scale/yv12extend.h"
-#include "vp8/common/setupintrarecon.h"
-
-#include "decodemv.h"
-#include "vp8/common/extend.h"
-#include "vp8/common/modecont.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp8/common/idct.h"
-#include "dboolhuff.h"
-
-#include "vp8/common/seg_common.h"
-#include "vp8/common/entropy.h"
-#include "vpx_rtcd.h"
-
-#include <assert.h>
-#include <stdio.h>
-
-
-#define COEFCOUNT_TESTING
-
-static int merge_index(int v, int n, int modulus) {
-  int max1 = (n - 1 - modulus / 2) / modulus + 1;
-  if (v < max1) v = v * modulus + modulus / 2;
-  else {
-    int w;
-    v -= max1;
-    w = v;
-    v += (v + modulus - modulus / 2) / modulus;
-    while (v % modulus == modulus / 2 ||
-           w != v - (v + modulus - modulus / 2) / modulus) v++;
-  }
-  return v;
-}
-
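-/* Inverse of the encoder-side probability remapping: merge_index() undoes
- * the modulus-based folding of the coded index, after which
- * inv_remap_prob() recovers the new probability relative to the previous
- * value m via vp9_inv_recenter_nonneg().
- */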
-static int inv_remap_prob(int v, int m) {
-  const int n = 256;
-  const int modulus = MODULUS_PARAM;
-  int i;
-  v = merge_index(v, n - 1, modulus);
-  if ((m << 1) <= n) {
-    i = vp9_inv_recenter_nonneg(v + 1, m);
-  } else {
-    i = n - 1 - vp9_inv_recenter_nonneg(v + 1, n - 1 - m);
-  }
-  return i;
-}
-
-static vp9_prob read_prob_diff_update(vp9_reader *const bc, int oldp) {
-  int delp = vp9_decode_term_subexp(bc, SUBEXP_PARAM, 255);
-  return (vp9_prob)inv_remap_prob(delp, oldp);
-}
-
-void vp9_init_de_quantizer(VP9D_COMP *pbi) {
-  int i;
-  int Q;
-  VP9_COMMON *const pc = &pbi->common;
-
-  for (Q = 0; Q < QINDEX_RANGE; Q++) {
-    pc->Y1dequant[Q][0] = (short)vp9_dc_quant(Q, pc->y1dc_delta_q);
-    pc->Y2dequant[Q][0] = (short)vp9_dc2quant(Q, pc->y2dc_delta_q);
-    pc->UVdequant[Q][0] = (short)vp9_dc_uv_quant(Q, pc->uvdc_delta_q);
-
-    /* set all the AC dequant values */
-    for (i = 1; i < 16; i++) {
-      int rc = vp9_default_zig_zag1d[i];
-
-      pc->Y1dequant[Q][rc] = (short)vp9_ac_yquant(Q);
-      pc->Y2dequant[Q][rc] = (short)vp9_ac2quant(Q, pc->y2ac_delta_q);
-      pc->UVdequant[Q][rc] = (short)vp9_ac_uv_quant(Q, pc->uvac_delta_q);
-    }
-  }
-}
-
-static void mb_init_dequantizer(VP9D_COMP *pbi, MACROBLOCKD *xd) {
-  int i;
-  int QIndex;
-  VP9_COMMON *const pc = &pbi->common;
-  int segment_id = xd->mode_info_context->mbmi.segment_id;
-
-  // Set the Q baseline allowing for any segment level adjustment
-  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_ALT_Q)) {
-    /* Abs Value */
-    if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA)
-      QIndex = vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q);
-
-    /* Delta Value */
-    else {
-      QIndex = pc->base_qindex +
-               vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q);
-      QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0;    /* Clamp to valid range */
-    }
-  } else
-    QIndex = pc->base_qindex;
-  xd->q_index = QIndex;
-
-  /* Set up the block level dequant pointers */
-  for (i = 0; i < 16; i++) {
-    xd->block[i].dequant = pc->Y1dequant[QIndex];
-  }
-
-#if CONFIG_LOSSLESS
-  if (!QIndex) {
-    pbi->common.rtcd.idct.idct1        = vp9_short_inv_walsh4x4_1_x8_c;
-    pbi->common.rtcd.idct.idct16       = vp9_short_inv_walsh4x4_x8_c;
-    pbi->common.rtcd.idct.idct1_scalar_add  = vp9_dc_only_inv_walsh_add_c;
-    pbi->common.rtcd.idct.iwalsh1      = vp9_short_inv_walsh4x4_1_lossless_c;
-    pbi->common.rtcd.idct.iwalsh16     = vp9_short_inv_walsh4x4_lossless_c;
-    pbi->idct_add            = vp9_dequant_idct_add_lossless_c;
-    pbi->dc_idct_add         = vp9_dequant_dc_idct_add_lossless_c;
-    pbi->dc_idct_add_y_block = vp9_dequant_dc_idct_add_y_block_lossless_c;
-    pbi->idct_add_y_block    = vp9_dequant_idct_add_y_block_lossless_c;
-    pbi->idct_add_uv_block   = vp9_dequant_idct_add_uv_block_lossless_c;
-  } else {
-    pbi->common.rtcd.idct.idct1        = vp9_short_idct4x4llm_1_c;
-    pbi->common.rtcd.idct.idct16       = vp9_short_idct4x4llm_c;
-    pbi->common.rtcd.idct.idct1_scalar_add  = vp9_dc_only_idct_add_c;
-    pbi->common.rtcd.idct.iwalsh1      = vp9_short_inv_walsh4x4_1_c;
-    pbi->common.rtcd.idct.iwalsh16     = vp9_short_inv_walsh4x4_c;
-    pbi->idct_add            = vp9_dequant_idct_add;
-    pbi->dc_idct_add         = vp9_dequant_dc_idct_add;
-    pbi->dc_idct_add_y_block = vp9_dequant_dc_idct_add_y_block;
-    pbi->idct_add_y_block    = vp9_dequant_idct_add_y_block;
-    pbi->idct_add_uv_block   = vp9_dequant_idct_add_uv_block;
-  }
-#else
-  pbi->idct_add            = vp9_dequant_idct_add;
-  pbi->dc_idct_add         = vp9_dequant_dc_idct_add;
-  pbi->dc_idct_add_y_block = vp9_dequant_dc_idct_add_y_block;
-  pbi->idct_add_y_block    = vp9_dequant_idct_add_y_block;
-  pbi->idct_add_uv_block   = vp9_dequant_idct_add_uv_block;
-#endif
-
-  for (i = 16; i < 24; i++) {
-    xd->block[i].dequant = pc->UVdequant[QIndex];
-  }
-
-  xd->block[24].dequant = pc->Y2dequant[QIndex];
-}
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define RTCD_VTABLE(x) (&(pbi)->common.rtcd.x)
-#else
-#define RTCD_VTABLE(x) NULL
-#endif
-
-/* skip_recon_mb() writes the prediction result directly to the dst buffer
- * instead of writing it to the predictor buffer and then copying it across,
- * which eliminates an unnecessary copy.
- */
-static void skip_recon_mb(VP9D_COMP *pbi, MACROBLOCKD *xd) {
-  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
-#if CONFIG_SUPERBLOCKS
-    if (xd->mode_info_context->mbmi.encoded_as_sb) {
-      vp9_build_intra_predictors_sbuv_s(xd);
-      vp9_build_intra_predictors_sby_s(xd);
-    } else {
-#endif
-    vp9_build_intra_predictors_mbuv_s(xd);
-    vp9_build_intra_predictors_mby_s(xd);
-#if CONFIG_SUPERBLOCKS
-    }
-#endif
-  } else {
-#if CONFIG_SUPERBLOCKS
-    if (xd->mode_info_context->mbmi.encoded_as_sb) {
-      vp9_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,
-                                         xd->dst.u_buffer, xd->dst.v_buffer,
-                                         xd->dst.y_stride, xd->dst.uv_stride);
-    } else {
-#endif
-    vp9_build_1st_inter16x16_predictors_mb(xd, xd->dst.y_buffer,
-                                           xd->dst.u_buffer, xd->dst.v_buffer,
-                                           xd->dst.y_stride, xd->dst.uv_stride);
-
-    if (xd->mode_info_context->mbmi.second_ref_frame) {
-      vp9_build_2nd_inter16x16_predictors_mb(xd, xd->dst.y_buffer,
-                                             xd->dst.u_buffer, xd->dst.v_buffer,
-                                             xd->dst.y_stride, xd->dst.uv_stride);
-    }
-#if CONFIG_SUPERBLOCKS
-    }
-#endif
-  }
-}
-
-static void decode_macroblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
-                              int mb_row, unsigned int mb_col,
-                              BOOL_DECODER* const bc) {
-  int eobtotal = 0;
-  MB_PREDICTION_MODE mode;
-  int i;
-  int tx_size;
-  TX_TYPE tx_type;
-  VP9_COMMON *pc = &pbi->common;
-#if CONFIG_SUPERBLOCKS
-  int orig_skip_flag = xd->mode_info_context->mbmi.mb_skip_coeff;
-#endif
-
-  // re-initialize macroblock dequantizer before detokenization
-  if (xd->segmentation_enabled)
-    mb_init_dequantizer(pbi, xd);
-
-  tx_size = xd->mode_info_context->mbmi.txfm_size;
-  mode = xd->mode_info_context->mbmi.mode;
-
-  if (xd->mode_info_context->mbmi.mb_skip_coeff) {
-    vp9_reset_mb_tokens_context(xd);
-#if CONFIG_SUPERBLOCKS
-    if (xd->mode_info_context->mbmi.encoded_as_sb &&
-        (mb_col < pc->mb_cols - 1 || mb_row < pc->mb_rows - 1)) {
-      if (mb_col < pc->mb_cols - 1)
-        xd->above_context++;
-      if (mb_row < pc->mb_rows - 1)
-        xd->left_context++;
-      vp9_reset_mb_tokens_context(xd);
-      if (mb_col < pc->mb_cols - 1)
-        xd->above_context--;
-      if (mb_row < pc->mb_rows - 1)
-        xd->left_context--;
-    }
-#endif
-  } else if (!bool_error(bc)) {
-    for (i = 0; i < 25; i++) {
-      xd->block[i].eob = 0;
-      xd->eobs[i] = 0;
-    }
-    if (tx_size == TX_16X16) {
-      eobtotal = vp9_decode_mb_tokens_16x16(pbi, xd, bc);
-    } else if (tx_size == TX_8X8) {
-      eobtotal = vp9_decode_mb_tokens_8x8(pbi, xd, bc);
-    } else {
-      eobtotal = vp9_decode_mb_tokens(pbi, xd, bc);
-    }
-  }
-
-  //mode = xd->mode_info_context->mbmi.mode;
-  if (pbi->common.frame_type != KEY_FRAME)
-    vp9_setup_interp_filters(xd, xd->mode_info_context->mbmi.interp_filter,
-                             &pbi->common);
-
-  if (eobtotal == 0 && mode != B_PRED && mode != SPLITMV
-      && mode != I8X8_PRED
-      && !bool_error(bc)) {
-    /* Special case: force the loopfilter to skip when eobtotal and
-     * mb_skip_coeff are zero.
-     */
-    xd->mode_info_context->mbmi.mb_skip_coeff = 1;
-
-#if CONFIG_SUPERBLOCKS
-    if (!xd->mode_info_context->mbmi.encoded_as_sb || orig_skip_flag)
-#endif
-    {
-      skip_recon_mb(pbi, xd);
-      return;
-    }
-  }
-
-  // moved to be performed before detokenization
-//  if (xd->segmentation_enabled)
-//    mb_init_dequantizer(pbi, xd);
-
-  /* do prediction */
-  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
-#if CONFIG_SUPERBLOCKS
-    if (xd->mode_info_context->mbmi.encoded_as_sb) {
-      vp9_build_intra_predictors_sby_s(xd);
-      vp9_build_intra_predictors_sbuv_s(xd);
-    } else
-#endif
-    if (mode != I8X8_PRED) {
-      vp9_build_intra_predictors_mbuv(xd);
-      if (mode != B_PRED) {
-        vp9_build_intra_predictors_mby(xd);
-      }
-    }
-  } else {
-#if CONFIG_SUPERBLOCKS
-    if (xd->mode_info_context->mbmi.encoded_as_sb) {
-      vp9_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,
-                                         xd->dst.u_buffer, xd->dst.v_buffer,
-                                         xd->dst.y_stride, xd->dst.uv_stride);
-    } else
-#endif
-    vp9_build_inter_predictors_mb(xd);
-  }
-
-  /* dequantization and idct */
-  if (mode == I8X8_PRED) {
-    for (i = 0; i < 4; i++) {
-      int ib = vp9_i8x8_block[i];
-      const int iblock[4] = {0, 1, 4, 5};
-      int j;
-      int i8x8mode;
-      BLOCKD *b;
-
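-      // Map the 8x8 block's raster 4x4 index (0, 2, 8 or 10) to the start
-      // of its 64 coefficients in qcoeff (4x4-block units 0, 4, 8 and 12).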
-      int idx = (ib & 0x02) ? (ib + 2) : ib;
-
-      short *q  = xd->block[idx].qcoeff;
-      short *dq = xd->block[0].dequant;
-      unsigned char *pre = xd->block[ib].predictor;
-      unsigned char *dst = *(xd->block[ib].base_dst) + xd->block[ib].dst;
-      int stride = xd->dst.y_stride;
-
-      b = &xd->block[ib];
-      i8x8mode = b->bmi.as_mode.first;
-      vp9_intra8x8_predict(b, i8x8mode, b->predictor);
-
-      if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
-        tx_type = get_tx_type(xd, &xd->block[idx]);
-        if (tx_type != DCT_DCT) {
-          vp9_ht_dequant_idct_add_8x8_c(tx_type,
-                                        q, dq, pre, dst, 16, stride);
-        } else {
-          vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride);
-        }
-        q += 64;
-      } else {
-        for (j = 0; j < 4; j++) {
-          b = &xd->block[ib + iblock[j]];
-          vp9_dequant_idct_add(b->qcoeff, b->dequant, b->predictor,
-                                 *(b->base_dst) + b->dst, 16, b->dst_stride);
-        }
-      }
-      b = &xd->block[16 + i];
-      vp9_intra_uv4x4_predict(b, i8x8mode, b->predictor);
-      pbi->idct_add(b->qcoeff, b->dequant, b->predictor,
-                    *(b->base_dst) + b->dst, 8, b->dst_stride);
-      b = &xd->block[20 + i];
-      vp9_intra_uv4x4_predict(b, i8x8mode, b->predictor);
-      pbi->idct_add(b->qcoeff, b->dequant, b->predictor,
-                    *(b->base_dst) + b->dst, 8, b->dst_stride);
-    }
-  } else if (mode == B_PRED) {
-    for (i = 0; i < 16; i++) {
-      BLOCKD *b = &xd->block[i];
-      int b_mode = xd->mode_info_context->bmi[i].as_mode.first;
-#if CONFIG_COMP_INTRA_PRED
-      int b_mode2 = xd->mode_info_context->bmi[i].as_mode.second;
-
-      if (b_mode2 == (B_PREDICTION_MODE)(B_DC_PRED - 1)) {
-#endif
-        vp9_intra4x4_predict(b, b_mode, b->predictor);
-#if CONFIG_COMP_INTRA_PRED
-      } else {
-        vp9_comp_intra4x4_predict(b, b_mode, b_mode2, b->predictor);
-      }
-#endif
-
-      tx_type = get_tx_type(xd, b);
-      if (tx_type != DCT_DCT) {
-        vp9_ht_dequant_idct_add_c(tx_type, b->qcoeff,
-                                  b->dequant, b->predictor,
-                                  *(b->base_dst) + b->dst, 16, b->dst_stride);
-      } else {
-        vp9_dequant_idct_add(b->qcoeff, b->dequant, b->predictor,
-                               *(b->base_dst) + b->dst, 16, b->dst_stride);
-      }
-    }
-  } else if (mode == SPLITMV) {
-    if (tx_size == TX_8X8) {
-      vp9_dequant_idct_add_y_block_8x8(xd->qcoeff, xd->block[0].dequant,
-                                         xd->predictor, xd->dst.y_buffer,
-                                         xd->dst.y_stride, xd->eobs, xd);
-    } else {
-      pbi->idct_add_y_block(xd->qcoeff, xd->block[0].dequant,
-                                       xd->predictor, xd->dst.y_buffer,
-                                       xd->dst.y_stride, xd->eobs);
-    }
-  } else {
-    BLOCKD *b = &xd->block[24];
-
-    if (tx_size == TX_16X16) {
-      BLOCKD *bd = &xd->block[0];
-      tx_type = get_tx_type(xd, bd);
-      if (tx_type != DCT_DCT) {
-        vp9_ht_dequant_idct_add_16x16_c(tx_type, xd->qcoeff,
-                                        xd->block[0].dequant, xd->predictor,
-                                        xd->dst.y_buffer, 16, xd->dst.y_stride);
-      } else {
-        vp9_dequant_idct_add_16x16(xd->qcoeff, xd->block[0].dequant,
-                                     xd->predictor, xd->dst.y_buffer,
-                                     16, xd->dst.y_stride);
-      }
-    } else if (tx_size == TX_8X8) {
-#if CONFIG_SUPERBLOCKS
-      void *orig = xd->mode_info_context;
-      int n, num = xd->mode_info_context->mbmi.encoded_as_sb ? 4 : 1;
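-      // For a superblock, decode the four 16x16 MBs of the 2x2 grid in
-      // raster order, re-pointing the entropy contexts and mode info for
-      // each before its tokens are read.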
-      for (n = 0; n < num; n++) {
-        int x_idx = n & 1, y_idx = n >> 1;
-        if (num == 4 && (mb_col + x_idx >= pc->mb_cols ||
-                         mb_row + y_idx >= pc->mb_rows))
-          continue;
-
-        if (n != 0) {
-          for (i = 0; i < 25; i++) {
-            xd->block[i].eob = 0;
-            xd->eobs[i] = 0;
-          }
-          xd->above_context = pc->above_context + mb_col + (n & 1);
-          xd->left_context = pc->left_context + (n >> 1);
-          xd->mode_info_context = orig;
-          xd->mode_info_context += (n & 1);
-          xd->mode_info_context += (n >> 1) * pc->mode_info_stride;
-          if (!orig_skip_flag) {
-            eobtotal = vp9_decode_mb_tokens_8x8(pbi, xd, bc);
-            if (eobtotal == 0) // skip loopfilter
-              xd->mode_info_context->mbmi.mb_skip_coeff = 1;
-          } else {
-            vp9_reset_mb_tokens_context(xd);
-          }
-        }
-
-        if (xd->mode_info_context->mbmi.mb_skip_coeff)
-          continue; // only happens for SBs, which are already in dest buffer
-#endif
-      vp9_dequantize_b_2x2(b);
-      IDCT_INVOKE(RTCD_VTABLE(idct), ihaar2)(&b->dqcoeff[0], b->diff, 8);
-      // the 2nd-order block is zeroed after the inverse transform
-      ((int *)b->qcoeff)[0] = 0;
-      ((int *)b->qcoeff)[1] = 0;
-      ((int *)b->qcoeff)[2] = 0;
-      ((int *)b->qcoeff)[3] = 0;
-      ((int *)b->qcoeff)[4] = 0;
-      ((int *)b->qcoeff)[5] = 0;
-      ((int *)b->qcoeff)[6] = 0;
-      ((int *)b->qcoeff)[7] = 0;
-#if CONFIG_SUPERBLOCKS
-      if (xd->mode_info_context->mbmi.encoded_as_sb) {
-        vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(xd->qcoeff,
-          xd->block[0].dequant,
-          xd->dst.y_buffer + (n >> 1) * 16 * xd->dst.y_stride + (n & 1) * 16,
-          xd->dst.y_stride, xd->eobs, xd->block[24].diff, xd);
-        // do UV inline also
-        vp9_dequant_idct_add_uv_block_8x8_inplace_c(xd->qcoeff + 16 * 16,
-          xd->block[16].dequant,
-          xd->dst.u_buffer + (n >> 1) * 8 * xd->dst.uv_stride + (n & 1) * 8,
-          xd->dst.v_buffer + (n >> 1) * 8 * xd->dst.uv_stride + (n & 1) * 8,
-          xd->dst.uv_stride, xd->eobs + 16, xd);
-      } else
-#endif
-        vp9_dequant_dc_idct_add_y_block_8x8(xd->qcoeff,
-          xd->block[0].dequant, xd->predictor, xd->dst.y_buffer,
-          xd->dst.y_stride, xd->eobs, xd->block[24].diff, xd);
-#if CONFIG_SUPERBLOCKS
-      }
-      xd->mode_info_context = orig;
-#endif
-    } else {
-      vp9_dequantize_b(b);
-      if (xd->eobs[24] > 1) {
-        IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff);
-        ((int *)b->qcoeff)[0] = 0;
-        ((int *)b->qcoeff)[1] = 0;
-        ((int *)b->qcoeff)[2] = 0;
-        ((int *)b->qcoeff)[3] = 0;
-        ((int *)b->qcoeff)[4] = 0;
-        ((int *)b->qcoeff)[5] = 0;
-        ((int *)b->qcoeff)[6] = 0;
-        ((int *)b->qcoeff)[7] = 0;
-      } else {
-        IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff);
-        ((int *)b->qcoeff)[0] = 0;
-      }
-
-      pbi->dc_idct_add_y_block(xd->qcoeff, xd->block[0].dequant, xd->predictor,
-                               xd->dst.y_buffer, xd->dst.y_stride, xd->eobs,
-                               xd->block[24].diff);
-    }
-  }
-
-#if CONFIG_SUPERBLOCKS
-  if (!xd->mode_info_context->mbmi.encoded_as_sb) {
-#endif
-    if ((tx_size == TX_8X8 &&
-         xd->mode_info_context->mbmi.mode != I8X8_PRED &&
-         xd->mode_info_context->mbmi.mode != SPLITMV)
-        || tx_size == TX_16X16
-       )
-      vp9_dequant_idct_add_uv_block_8x8
-          (xd->qcoeff + 16 * 16, xd->block[16].dequant,
-           xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer,
-           xd->dst.uv_stride, xd->eobs + 16, xd);
-    else if (xd->mode_info_context->mbmi.mode != I8X8_PRED)
-      pbi->idct_add_uv_block(xd->qcoeff + 16 * 16, xd->block[16].dequant,
-           xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer,
-           xd->dst.uv_stride, xd->eobs + 16);
-#if CONFIG_SUPERBLOCKS
-  }
-#endif
-}
-
-
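-/* Delta-Q values are coded as an update flag, a 4-bit magnitude and a sign
- * bit; *q_update is raised when the decoded value differs from the previous
- * one so that the dequantizer tables get rebuilt.
- */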
-static int get_delta_q(vp9_reader *bc, int prev, int *q_update) {
-  int ret_val = 0;
-
-  if (vp9_read_bit(bc)) {
-    ret_val = vp9_read_literal(bc, 4);
-
-    if (vp9_read_bit(bc))
-      ret_val = -ret_val;
-  }
-
-  /* Trigger a quantizer update if the delta-q value has changed */
-  if (ret_val != prev)
-    *q_update = 1;
-
-  return ret_val;
-}
-
-#ifdef PACKET_TESTING
-#include <stdio.h>
-FILE *vpxlog = 0;
-#endif
-
-/* Decode a row of Superblocks (2x2 region of MBs) */
-static void
-decode_sb_row(VP9D_COMP *pbi, VP9_COMMON *pc, int mbrow, MACROBLOCKD *xd,
-              BOOL_DECODER* const bc) {
-  int i;
-  int sb_col;
-  int mb_row, mb_col;
-  int recon_yoffset, recon_uvoffset;
-  int ref_fb_idx = pc->lst_fb_idx;
-  int dst_fb_idx = pc->new_fb_idx;
-  int recon_y_stride = pc->yv12_fb[ref_fb_idx].y_stride;
-  int recon_uv_stride = pc->yv12_fb[ref_fb_idx].uv_stride;
-  int row_delta[4] = {  0, +1,  0, -1 };
-  int col_delta[4] = { +1, -1, +1, +1 };
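-  // After each MB is handled, the delta pair at the matching index advances
-  // to the next MB of the SB: right, then down-left, then right again, and
-  // finally up-right into the next SB.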
-  int sb_cols = (pc->mb_cols + 1) >> 1;
-
-  // For a SB there are 2 left contexts, each pertaining to one MB row
-  // within the SB
-  vpx_memset(pc->left_context, 0, sizeof(pc->left_context));
-
-  mb_row = mbrow;
-  mb_col = 0;
-
-  for (sb_col = 0; sb_col < sb_cols; sb_col++) {
-    MODE_INFO *mi = xd->mode_info_context;
-
-#if CONFIG_SUPERBLOCKS
-    mi->mbmi.encoded_as_sb = vp9_read(bc, pc->sb_coded);
-#endif
-
-    // Process the 4 MBs within the SB in the order:
-    // top-left, top-right, bottom-left, bottom-right
-    for (i = 0; i < 4; i++) {
-      int dy = row_delta[i];
-      int dx = col_delta[i];
-      int offset_extended = dy * xd->mode_info_stride + dx;
-
-      xd->mb_index = i;
-
-      mi = xd->mode_info_context;
-      if ((mb_row >= pc->mb_rows) || (mb_col >= pc->mb_cols)) {
-        // MB lies outside frame, skip on to next
-        mb_row += dy;
-        mb_col += dx;
-        xd->mode_info_context += offset_extended;
-        xd->prev_mode_info_context += offset_extended;
-        continue;
-      }
-
-      // Set above context pointer
-      xd->above_context = pc->above_context + mb_col;
-      xd->left_context = pc->left_context + (i >> 1);
-
-      /* Distance of Mb to the various image edges.
-       * These are specified to 8th pel as they are always compared to
-       * values that are in 1/8th pel units
-       */
-      xd->mb_to_top_edge = -((mb_row * 16) << 3);
-      xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
-
-      xd->mb_to_left_edge = -((mb_col * 16) << 3);
-      xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
-
-      xd->up_available = (mb_row != 0);
-      xd->left_available = (mb_col != 0);
-
-
-      recon_yoffset = (mb_row * recon_y_stride * 16) + (mb_col * 16);
-      recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col * 8);
-
-      xd->dst.y_buffer = pc->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
-      xd->dst.u_buffer = pc->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
-      xd->dst.v_buffer = pc->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
-
-#if CONFIG_SUPERBLOCKS
-      if (i)
-        mi->mbmi.encoded_as_sb = 0;
-#endif
-      vp9_decode_mb_mode_mv(pbi, xd, mb_row, mb_col, bc);
-
-      update_blockd_bmi(xd);
-
-      /* Select the appropriate reference frame for this MB */
-      if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
-        ref_fb_idx = pc->lst_fb_idx;
-      else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
-        ref_fb_idx = pc->gld_fb_idx;
-      else
-        ref_fb_idx = pc->alt_fb_idx;
-
-      xd->pre.y_buffer = pc->yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
-      xd->pre.u_buffer = pc->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
-      xd->pre.v_buffer = pc->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
-
-      if (xd->mode_info_context->mbmi.second_ref_frame) {
-        int second_ref_fb_idx;
-
-        /* Select the appropriate reference frame for this MB */
-        if (xd->mode_info_context->mbmi.second_ref_frame == LAST_FRAME)
-          second_ref_fb_idx = pc->lst_fb_idx;
-        else if (xd->mode_info_context->mbmi.second_ref_frame ==
-                 GOLDEN_FRAME)
-          second_ref_fb_idx = pc->gld_fb_idx;
-        else
-          second_ref_fb_idx = pc->alt_fb_idx;
-
-        xd->second_pre.y_buffer =
-          pc->yv12_fb[second_ref_fb_idx].y_buffer + recon_yoffset;
-        xd->second_pre.u_buffer =
-          pc->yv12_fb[second_ref_fb_idx].u_buffer + recon_uvoffset;
-        xd->second_pre.v_buffer =
-          pc->yv12_fb[second_ref_fb_idx].v_buffer + recon_uvoffset;
-      }
-
-      if (xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME) {
-        /* propagate errors from reference frames */
-        xd->corrupted |= pc->yv12_fb[ref_fb_idx].corrupted;
-      }
-
-#if CONFIG_SUPERBLOCKS
-      if (xd->mode_info_context->mbmi.encoded_as_sb) {
-        if (mb_col < pc->mb_cols - 1)
-          mi[1] = mi[0];
-        if (mb_row < pc->mb_rows - 1) {
-          mi[pc->mode_info_stride] = mi[0];
-          if (mb_col < pc->mb_cols - 1)
-            mi[pc->mode_info_stride + 1] = mi[0];
-        }
-      }
-#endif
-      vp9_intra_prediction_down_copy(xd);
-      decode_macroblock(pbi, xd, mb_row, mb_col, bc);
-
-      /* check if the boolean decoder has suffered an error */
-      xd->corrupted |= bool_error(bc);
-
-#if CONFIG_SUPERBLOCKS
-      if (mi->mbmi.encoded_as_sb) {
-        assert(!i);
-        mb_col += 2;
-        xd->mode_info_context += 2;
-        xd->prev_mode_info_context += 2;
-        break;
-      }
-#endif
-
-      // skip to next MB
-      xd->mode_info_context += offset_extended;
-      xd->prev_mode_info_context += offset_extended;
-      mb_row += dy;
-      mb_col += dx;
-    }
-  }
-
-  /* skip prediction column */
-  xd->mode_info_context += 1 - (pc->mb_cols & 0x1) + xd->mode_info_stride;
-  xd->prev_mode_info_context += 1 - (pc->mb_cols & 0x1) + xd->mode_info_stride;
-}
-
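-/* Partition sizes are stored in the bitstream as 24-bit little-endian
- * values, e.g. the bytes {0x34, 0x12, 0x00} decode to 0x001234.
- */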
-static unsigned int read_partition_size(const unsigned char *cx_size) {
-  const unsigned int size =
-    cx_size[0] + (cx_size[1] << 8) + (cx_size[2] << 16);
-  return size;
-}
-
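-/* Returns nonzero if the range [start, start + len) lies within the buffer
- * ending at end; the first comparison also rejects empty and wrapping
- * ranges.
- */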
-static int read_is_valid(const unsigned char *start,
-                         size_t               len,
-                         const unsigned char *end) {
-  return (start + len > start && start + len <= end);
-}
-
-
-static void setup_token_decoder(VP9D_COMP *pbi,
-                                const unsigned char *cx_data,
-                                BOOL_DECODER* const bool_decoder) {
-  VP9_COMMON          *pc = &pbi->common;
-  const unsigned char *user_data_end = pbi->Source + pbi->source_sz;
-  const unsigned char *partition;
-
-  ptrdiff_t            partition_size;
-  ptrdiff_t            bytes_left;
-
-  // Set up pointers to token partition
-  partition = cx_data;
-  bytes_left = user_data_end - partition;
-  partition_size = bytes_left;
-
-  /* Validate the calculated partition length. If the buffer
-   * described by the partition can't be fully read, then restrict
-   * it to the portion that can be (for EC mode) or throw an error.
-   */
-  if (!read_is_valid(partition, partition_size, user_data_end)) {
-    vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
-                       "Truncated packet or corrupt partition "
-                       "%d length", 1);
-  }
-
-  if (vp9_start_decode(bool_decoder, partition, partition_size))
-    vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
-                       "Failed to allocate bool decoder %d", 1);
-}
-
-static void init_frame(VP9D_COMP *pbi) {
-  VP9_COMMON *const pc = &pbi->common;
-  MACROBLOCKD *const xd  = &pbi->mb;
-
-  if (pc->frame_type == KEY_FRAME) {
-    /* Various keyframe initializations */
-    vp9_init_mv_probs(pc);
-
-    vp9_init_mbmode_probs(pc);
-    vp9_default_bmode_probs(pc->fc.bmode_prob);
-
-    vp9_default_coef_probs(pc);
-    vp9_kf_default_bmode_probs(pc->kf_bmode_prob);
-
-    // Reset the segment feature data to the default stats:
-    // Features disabled, 0, with delta coding (Default state).
-    vp9_clearall_segfeatures(xd);
-
-    xd->mb_segment_abs_delta = SEGMENT_DELTADATA;
-
-    /* reset the mode ref deltas for the loop filter */
-    vpx_memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas));
-    vpx_memset(xd->mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas));
-
-    /* All buffers are implicitly updated on key frames. */
-    pc->refresh_golden_frame = 1;
-    pc->refresh_alt_ref_frame = 1;
-    pc->copy_buffer_to_gf = 0;
-    pc->copy_buffer_to_arf = 0;
-
-    /* Note that Golden and Altref modes cannot be used on a key frame so
-     * ref_frame_sign_bias[] is undefined and meaningless
-     */
-    pc->ref_frame_sign_bias[GOLDEN_FRAME] = 0;
-    pc->ref_frame_sign_bias[ALTREF_FRAME] = 0;
-
-    vp9_init_mode_contexts(&pbi->common);
-    vpx_memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc));
-    vpx_memcpy(&pc->lfc_a, &pc->fc, sizeof(pc->fc));
-
-    vpx_memcpy(pbi->common.fc.vp8_mode_contexts,
-               pbi->common.fc.mode_context,
-               sizeof(pbi->common.fc.mode_context));
-    vpx_memset(pc->prev_mip, 0,
-               (pc->mb_cols + 1) * (pc->mb_rows + 1) * sizeof(MODE_INFO));
-    vpx_memset(pc->mip, 0,
-               (pc->mb_cols + 1) * (pc->mb_rows + 1) * sizeof(MODE_INFO));
-
-    vp9_update_mode_info_border(pc, pc->mip);
-    vp9_update_mode_info_in_image(pc, pc->mi);
-
-  } else {
-
-    if (!pc->use_bilinear_mc_filter)
-      pc->mcomp_filter_type = EIGHTTAP;
-    else
-      pc->mcomp_filter_type = BILINEAR;
-
-    /* To enable choice of different interpolation filters */
-    vp9_setup_interp_filters(xd, pc->mcomp_filter_type, pc);
-  }
-
-  xd->mode_info_context = pc->mi;
-  xd->prev_mode_info_context = pc->prev_mi;
-  xd->frame_type = pc->frame_type;
-  xd->mode_info_context->mbmi.mode = DC_PRED;
-  xd->mode_info_stride = pc->mode_info_stride;
-  xd->corrupted = 0; /* init without corruption */
-
-  xd->fullpixel_mask = 0xffffffff;
-  if (pc->full_pixel)
-    xd->fullpixel_mask = 0xfffffff8;
-}
-
-#if 0
-static void read_coef_probs2(VP9D_COMP *pbi) {
-  const vp9_prob grpupd = 192;
-  int i, j, k, l;
-  vp9_reader *const bc = &pbi->bc;
-  VP9_COMMON *const pc = &pbi->common;
-  for (l = 0; l < ENTROPY_NODES; l++) {
-    if (vp9_read(bc, grpupd)) {
-      // printf("Decoding %d\n", l);
-      for (i = 0; i < BLOCK_TYPES; i++)
-        for (j = !i; j < COEF_BANDS; j++)
-          for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
-            if (k >= 3 && ((i == 0 && j == 1) ||
-                           (i > 0 && j == 0)))
-              continue;
-            {
-              vp9_prob *const p = pc->fc.coef_probs [i][j][k] + l;
-              int u = vp9_read(bc, COEF_UPDATE_PROB);
-              if (u) *p = read_prob_diff_update(bc, *p);
-            }
-          }
-    }
-  }
-  if (pbi->common.txfm_mode == ALLOW_8X8) {
-    for (l = 0; l < ENTROPY_NODES; l++) {
-      if (vp9_read(bc, grpupd)) {
-        for (i = 0; i < BLOCK_TYPES_8X8; i++)
-          for (j = !i; j < COEF_BANDS; j++)
-            for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
-              if (k >= 3 && ((i == 0 && j == 1) ||
-                             (i > 0 && j == 0)))
-                continue;
-              {
-                vp9_prob *const p = pc->fc.coef_probs_8x8 [i][j][k] + l;
-
-                int u = vp9_read(bc, COEF_UPDATE_PROB_8X8);
-                if (u) *p = read_prob_diff_update(bc, *p);
-              }
-            }
-      }
-    }
-  }
-}
-#endif
-
-static void read_coef_probs_common(
-    BOOL_DECODER* const bc,
-    vp9_prob coef_probs[BLOCK_TYPES][COEF_BANDS]
-                       [PREV_COEF_CONTEXTS][ENTROPY_NODES]) {
-  int i, j, k, l;
-
-  if (vp9_read_bit(bc)) {
-    for (i = 0; i < BLOCK_TYPES; i++) {
-      for (j = !i; j < COEF_BANDS; j++) {
-        /* NB: This j loop starts from 1 on block type i == 0 */
-        for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
-          if (k >= 3 && ((i == 0 && j == 1) ||
-                         (i > 0 && j == 0)))
-            continue;
-          for (l = 0; l < ENTROPY_NODES; l++) {
-            vp9_prob *const p = coef_probs[i][j][k] + l;
-
-            if (vp9_read(bc, COEF_UPDATE_PROB)) {
-              *p = read_prob_diff_update(bc, *p);
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
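-/* Per-frame coefficient probability updates: the 8x8 and 16x16 tables (and
- * their hybrid-transform counterparts) are only coded when the frame's
- * txfm_mode permits those transform sizes.
- */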
-static void read_coef_probs(VP9D_COMP *pbi, BOOL_DECODER* const bc) {
-  VP9_COMMON *const pc = &pbi->common;
-
-  read_coef_probs_common(bc, pc->fc.coef_probs);
-  read_coef_probs_common(bc, pc->fc.hybrid_coef_probs);
-
-  if (pbi->common.txfm_mode != ONLY_4X4) {
-    read_coef_probs_common(bc, pc->fc.coef_probs_8x8);
-    read_coef_probs_common(bc, pc->fc.hybrid_coef_probs_8x8);
-  }
-  if (pbi->common.txfm_mode > ALLOW_8X8) {
-    read_coef_probs_common(bc, pc->fc.coef_probs_16x16);
-    read_coef_probs_common(bc, pc->fc.hybrid_coef_probs_16x16);
-  }
-}
-
-int vp9_decode_frame(VP9D_COMP *pbi) {
-  BOOL_DECODER header_bc, residual_bc;
-  VP9_COMMON *const pc = &pbi->common;
-  MACROBLOCKD *const xd  = &pbi->mb;
-  const unsigned char *data = (const unsigned char *)pbi->Source;
-  const unsigned char *data_end = data + pbi->source_sz;
-  ptrdiff_t first_partition_length_in_bytes = 0;
-
-  int mb_row;
-  int i, j;
-  int corrupt_tokens = 0;
-
-  /* start with no corruption of current frame */
-  xd->corrupted = 0;
-  pc->yv12_fb[pc->new_fb_idx].corrupted = 0;
-
-  if (data_end - data < 3) {
-    vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
-                       "Truncated packet");
-  } else {
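-    /* The three-byte uncompressed chunk packs, LSB first: the frame type
-     * (1 bit), the codec version (3 bits), the show_frame flag (1 bit) and
-     * the 19-bit length of the first (header) partition.
-     */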
-    pc->last_frame_type = pc->frame_type;
-    pc->frame_type = (FRAME_TYPE)(data[0] & 1);
-    pc->version = (data[0] >> 1) & 7;
-    pc->show_frame = (data[0] >> 4) & 1;
-    first_partition_length_in_bytes =
-      (data[0] | (data[1] << 8) | (data[2] << 16)) >> 5;
-
-    if ((data + first_partition_length_in_bytes > data_end
-         || data + first_partition_length_in_bytes < data))
-      vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
-                         "Truncated packet or corrupt partition 0 length");
-
-    data += 3;
-
-    vp9_setup_version(pc);
-
-    if (pc->frame_type == KEY_FRAME) {
-      const int Width = pc->Width;
-      const int Height = pc->Height;
-
-      /* vet via sync code */
-      /* When error concealment is enabled we should only check the sync
-       * code if we have enough bits available
-       */
-      if (data + 3 < data_end) {
-        if (data[0] != 0x9d || data[1] != 0x01 || data[2] != 0x2a)
-          vpx_internal_error(&pc->error, VPX_CODEC_UNSUP_BITSTREAM,
-                             "Invalid frame sync code");
-      }
-
-      /* If error concealment is enabled we should only parse the new size
-       * if we have enough data. Otherwise we will end up with the wrong
-       * size.
-       */
-      if (data + 6 < data_end) {
-        pc->Width = (data[3] | (data[4] << 8)) & 0x3fff;
-        pc->horiz_scale = data[4] >> 6;
-        pc->Height = (data[5] | (data[6] << 8)) & 0x3fff;
-        pc->vert_scale = data[6] >> 6;
-      }
-      data += 7;
-
-      if (Width != pc->Width  ||  Height != pc->Height) {
-        if (pc->Width <= 0) {
-          pc->Width = Width;
-          vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
-                             "Invalid frame width");
-        }
-
-        if (pc->Height <= 0) {
-          pc->Height = Height;
-          vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
-                             "Invalid frame height");
-        }
-
-        if (vp9_alloc_frame_buffers(pc, pc->Width, pc->Height))
-          vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
-                             "Failed to allocate frame buffers");
-      }
-    }
-  }
-
-  if ((!pbi->decoded_key_frame && pc->frame_type != KEY_FRAME) ||
-      pc->Width == 0 || pc->Height == 0) {
-    return -1;
-  }
-
-  init_frame(pbi);
-
-  if (vp9_start_decode(&header_bc, data, first_partition_length_in_bytes))
-    vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
-                       "Failed to allocate bool decoder 0");
-  if (pc->frame_type == KEY_FRAME) {
-    pc->clr_type    = (YUV_TYPE)vp9_read_bit(&header_bc);
-    pc->clamp_type  = (CLAMP_TYPE)vp9_read_bit(&header_bc);
-  }
-
-  /* Is segmentation enabled */
-  xd->segmentation_enabled = (unsigned char)vp9_read_bit(&header_bc);
-
-  if (xd->segmentation_enabled) {
-    // Read whether or not the segmentation map is being explicitly
-    // updated this frame.
-    xd->update_mb_segmentation_map = (unsigned char)vp9_read_bit(&header_bc);
-
-    // If so what method will be used.
-    if (xd->update_mb_segmentation_map) {
-      // Which macro block level features are enabled
-
-      // Read the probs used to decode the segment id for each macro
-      // block.
-      for (i = 0; i < MB_FEATURE_TREE_PROBS; i++) {
-          xd->mb_segment_tree_probs[i] = vp9_read_bit(&header_bc) ?
-              (vp9_prob)vp9_read_literal(&header_bc, 8) : 255;
-      }
-
-      // Read the prediction probs needed to decode the segment id
-      pc->temporal_update = (unsigned char)vp9_read_bit(&header_bc);
-      for (i = 0; i < PREDICTION_PROBS; i++) {
-        if (pc->temporal_update) {
-          pc->segment_pred_probs[i] = vp9_read_bit(&header_bc) ?
-              (vp9_prob)vp9_read_literal(&header_bc, 8) : 255;
-        } else {
-          pc->segment_pred_probs[i] = 255;
-        }
-      }
-    }
-    // Is the segment data being updated
-    xd->update_mb_segmentation_data = (unsigned char)vp9_read_bit(&header_bc);
-
-    if (xd->update_mb_segmentation_data) {
-      int data;
-
-      xd->mb_segment_abs_delta = (unsigned char)vp9_read_bit(&header_bc);
-
-      vp9_clearall_segfeatures(xd);
-
-      // For each segment...
-      for (i = 0; i < MAX_MB_SEGMENTS; i++) {
-        // For each of the segment's features...
-        for (j = 0; j < SEG_LVL_MAX; j++) {
-          // Is the feature enabled
-          if (vp9_read_bit(&header_bc)) {
-            // Update the feature data and mask
-            vp9_enable_segfeature(xd, i, j);
-
-            data = (signed char)vp9_read_literal(
-                     &header_bc, vp9_seg_feature_data_bits(j));
-
-            // Is the segment data signed?
-            if (vp9_is_segfeature_signed(j)) {
-              if (vp9_read_bit(&header_bc))
-                data = -data;
-            }
-          } else
-            data = 0;
-
-          vp9_set_segdata(xd, i, j, data);
-        }
-      }
-    }
-  }
-
-  // Read common prediction model status flag probability updates for the
-  // reference frame
-  if (pc->frame_type == KEY_FRAME) {
-    // Set the prediction probabilities to defaults
-    pc->ref_pred_probs[0] = 120;
-    pc->ref_pred_probs[1] = 80;
-    pc->ref_pred_probs[2] = 40;
-  } else {
-    for (i = 0; i < PREDICTION_PROBS; i++) {
-      if (vp9_read_bit(&header_bc))
-        pc->ref_pred_probs[i] = (vp9_prob)vp9_read_literal(&header_bc, 8);
-    }
-  }
-
-#if CONFIG_SUPERBLOCKS
-  pc->sb_coded = vp9_read_literal(&header_bc, 8);
-#endif
-
-  /* Read the per-frame transform coding mode */
-  pc->txfm_mode = vp9_read_literal(&header_bc, 2);
-  if (pc->txfm_mode == TX_MODE_SELECT) {
-    pc->prob_tx[0] = vp9_read_literal(&header_bc, 8);
-    pc->prob_tx[1] = vp9_read_literal(&header_bc, 8);
-  }
-
-  /* Read the loop filter level and type */
-  pc->filter_type = (LOOPFILTERTYPE) vp9_read_bit(&header_bc);
-  pc->filter_level = vp9_read_literal(&header_bc, 6);
-  pc->sharpness_level = vp9_read_literal(&header_bc, 3);
-
-  /* Read in loop filter deltas applied at the MB level based on mode or ref frame. */
-  xd->mode_ref_lf_delta_update = 0;
-  xd->mode_ref_lf_delta_enabled = (unsigned char)vp9_read_bit(&header_bc);
-
-  if (xd->mode_ref_lf_delta_enabled) {
-    /* Do the deltas need to be updated */
-    xd->mode_ref_lf_delta_update = (unsigned char)vp9_read_bit(&header_bc);
-
-    if (xd->mode_ref_lf_delta_update) {
-      /* Send update */
-      for (i = 0; i < MAX_REF_LF_DELTAS; i++) {
-        if (vp9_read_bit(&header_bc)) {
-          /*sign = vp9_read_bit( &header_bc );*/
-          xd->ref_lf_deltas[i] = (signed char)vp9_read_literal(&header_bc, 6);
-
-          if (vp9_read_bit(&header_bc))        /* Apply sign */
-            xd->ref_lf_deltas[i] = xd->ref_lf_deltas[i] * -1;
-        }
-      }
-
-      /* Send update */
-      for (i = 0; i < MAX_MODE_LF_DELTAS; i++) {
-        if (vp9_read_bit(&header_bc)) {
-          /*sign = vp9_read_bit( &header_bc );*/
-          xd->mode_lf_deltas[i] = (signed char)vp9_read_literal(&header_bc, 6);
-
-          if (vp9_read_bit(&header_bc))        /* Apply sign */
-            xd->mode_lf_deltas[i] = xd->mode_lf_deltas[i] * -1;
-        }
-      }
-    }
-  }
-
-  // Dummy read for now
-  vp9_read_literal(&header_bc, 2);
-
-  setup_token_decoder(pbi, data + first_partition_length_in_bytes,
-                      &residual_bc);
-
-  /* Read the default quantizers. */
-  {
-    int Q, q_update;
-
-    Q = vp9_read_literal(&header_bc, QINDEX_BITS);
-    pc->base_qindex = Q;
-    q_update = 0;
-    /* AC 1st order Q = default */
-    pc->y1dc_delta_q = get_delta_q(&header_bc, pc->y1dc_delta_q, &q_update);
-    pc->y2dc_delta_q = get_delta_q(&header_bc, pc->y2dc_delta_q, &q_update);
-    pc->y2ac_delta_q = get_delta_q(&header_bc, pc->y2ac_delta_q, &q_update);
-    pc->uvdc_delta_q = get_delta_q(&header_bc, pc->uvdc_delta_q, &q_update);
-    pc->uvac_delta_q = get_delta_q(&header_bc, pc->uvac_delta_q, &q_update);
-
-    if (q_update)
-      vp9_init_de_quantizer(pbi);
-
-    /* MB level dequantizer setup */
-    mb_init_dequantizer(pbi, &pbi->mb);
-  }
-
-  /* Determine if the golden frame or ARF buffer should be updated and how.
-   * For all non key frames the GF and ARF refresh flags and sign bias
-   * flags must be set explicitly.
-   */
-  if (pc->frame_type != KEY_FRAME) {
-    /* Should the GF or ARF be updated from the current frame */
-    pc->refresh_golden_frame = vp9_read_bit(&header_bc);
-    pc->refresh_alt_ref_frame = vp9_read_bit(&header_bc);
-
-    if (pc->refresh_alt_ref_frame) {
-      vpx_memcpy(&pc->fc, &pc->lfc_a, sizeof(pc->fc));
-      vpx_memcpy(pc->fc.vp8_mode_contexts,
-                 pc->fc.mode_context_a,
-                 sizeof(pc->fc.vp8_mode_contexts));
-    } else {
-      vpx_memcpy(&pc->fc, &pc->lfc, sizeof(pc->fc));
-      vpx_memcpy(pc->fc.vp8_mode_contexts,
-                 pc->fc.mode_context,
-                 sizeof(pc->fc.vp8_mode_contexts));
-    }
-
-    /* Buffer to buffer copy flags. */
-    pc->copy_buffer_to_gf = 0;
-
-    if (!pc->refresh_golden_frame)
-      pc->copy_buffer_to_gf = vp9_read_literal(&header_bc, 2);
-
-    pc->copy_buffer_to_arf = 0;
-
-    if (!pc->refresh_alt_ref_frame)
-      pc->copy_buffer_to_arf = vp9_read_literal(&header_bc, 2);
-
-    pc->ref_frame_sign_bias[GOLDEN_FRAME] = vp9_read_bit(&header_bc);
-    pc->ref_frame_sign_bias[ALTREF_FRAME] = vp9_read_bit(&header_bc);
-
-    /* Is high precision mv allowed */
-    xd->allow_high_precision_mv = (unsigned char)vp9_read_bit(&header_bc);
-    // Read the type of subpel filter to use
-    if (vp9_read_bit(&header_bc)) {
-      pc->mcomp_filter_type = SWITCHABLE;
-    } else {
-      pc->mcomp_filter_type = vp9_read_literal(&header_bc, 2);
-    }
-    /* To enable choice of different interpolation filters */
-    vp9_setup_interp_filters(xd, pc->mcomp_filter_type, pc);
-  }
-
-  pc->refresh_entropy_probs = vp9_read_bit(&header_bc);
-  if (pc->refresh_entropy_probs == 0) {
-    vpx_memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc));
-  }
-
-  pc->refresh_last_frame = (pc->frame_type == KEY_FRAME)
-                           || vp9_read_bit(&header_bc);
-
-  if (0) {
-    FILE *z = fopen("decodestats.stt", "a");
-    fprintf(z, "%6d F:%d,G:%d,A:%d,L:%d,Q:%d\n",
-            pc->current_video_frame,
-            pc->frame_type,
-            pc->refresh_golden_frame,
-            pc->refresh_alt_ref_frame,
-            pc->refresh_last_frame,
-            pc->base_qindex);
-    fclose(z);
-  }
-
-  vp9_copy(pbi->common.fc.pre_coef_probs,
-           pbi->common.fc.coef_probs);
-  vp9_copy(pbi->common.fc.pre_hybrid_coef_probs,
-           pbi->common.fc.hybrid_coef_probs);
-  vp9_copy(pbi->common.fc.pre_coef_probs_8x8,
-           pbi->common.fc.coef_probs_8x8);
-  vp9_copy(pbi->common.fc.pre_hybrid_coef_probs_8x8,
-           pbi->common.fc.hybrid_coef_probs_8x8);
-  vp9_copy(pbi->common.fc.pre_coef_probs_16x16,
-           pbi->common.fc.coef_probs_16x16);
-  vp9_copy(pbi->common.fc.pre_hybrid_coef_probs_16x16,
-           pbi->common.fc.hybrid_coef_probs_16x16);
-  vp9_copy(pbi->common.fc.pre_ymode_prob, pbi->common.fc.ymode_prob);
-  vp9_copy(pbi->common.fc.pre_uv_mode_prob, pbi->common.fc.uv_mode_prob);
-  vp9_copy(pbi->common.fc.pre_bmode_prob, pbi->common.fc.bmode_prob);
-  vp9_copy(pbi->common.fc.pre_i8x8_mode_prob, pbi->common.fc.i8x8_mode_prob);
-  vp9_copy(pbi->common.fc.pre_sub_mv_ref_prob, pbi->common.fc.sub_mv_ref_prob);
-  vp9_copy(pbi->common.fc.pre_mbsplit_prob, pbi->common.fc.mbsplit_prob);
-  pbi->common.fc.pre_nmvc = pbi->common.fc.nmvc;
-  vp9_zero(pbi->common.fc.coef_counts);
-  vp9_zero(pbi->common.fc.hybrid_coef_counts);
-  vp9_zero(pbi->common.fc.coef_counts_8x8);
-  vp9_zero(pbi->common.fc.hybrid_coef_counts_8x8);
-  vp9_zero(pbi->common.fc.coef_counts_16x16);
-  vp9_zero(pbi->common.fc.hybrid_coef_counts_16x16);
-  vp9_zero(pbi->common.fc.ymode_counts);
-  vp9_zero(pbi->common.fc.uv_mode_counts);
-  vp9_zero(pbi->common.fc.bmode_counts);
-  vp9_zero(pbi->common.fc.i8x8_mode_counts);
-  vp9_zero(pbi->common.fc.sub_mv_ref_counts);
-  vp9_zero(pbi->common.fc.mbsplit_counts);
-  vp9_zero(pbi->common.fc.NMVcount);
-  vp9_zero(pbi->common.fc.mv_ref_ct);
-  vp9_zero(pbi->common.fc.mv_ref_ct_a);
-
-  read_coef_probs(pbi, &header_bc);
-
-  vpx_memcpy(&xd->pre, &pc->yv12_fb[pc->lst_fb_idx], sizeof(YV12_BUFFER_CONFIG));
-  vpx_memcpy(&xd->dst, &pc->yv12_fb[pc->new_fb_idx], sizeof(YV12_BUFFER_CONFIG));
-
-  // Create the segmentation map structure and set to 0
-  if (!pc->last_frame_seg_map)
-    CHECK_MEM_ERROR(pc->last_frame_seg_map,
-                    vpx_calloc((pc->mb_rows * pc->mb_cols), 1));
-
-  /* set up the new frame for intra coded blocks */
-  vp9_setup_intra_recon(&pc->yv12_fb[pc->new_fb_idx]);
-
-  vp9_setup_block_dptrs(xd);
-
-  vp9_build_block_doffsets(xd);
-
-  /* clear out the coeff buffer */
-  vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
-
-  /* Read the mb_no_coeff_skip flag */
-  pc->mb_no_coeff_skip = (int)vp9_read_bit(&header_bc);
-
-  vp9_decode_mode_mvs_init(pbi, &header_bc);
-
-  vpx_memset(pc->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * pc->mb_cols);
-
-  // Reset the macroblock mode info context to the start of the list
-  xd->mode_info_context = pc->mi;
-  xd->prev_mode_info_context = pc->prev_mi;
-
-  /* Decode a row of superblocks */
-  for (mb_row = 0; mb_row < pc->mb_rows; mb_row += 2) {
-    decode_sb_row(pbi, pc, mb_row, xd, &residual_bc);
-  }
-  corrupt_tokens |= xd->corrupted;
-
-  /* Collect information about decoder corruption. */
-  /* 1. Check first boolean decoder for errors. */
-  pc->yv12_fb[pc->new_fb_idx].corrupted = bool_error(&header_bc);
-  /* 2. Check the macroblock information */
-  pc->yv12_fb[pc->new_fb_idx].corrupted |= corrupt_tokens;
-
-  if (!pbi->decoded_key_frame) {
-    if (pc->frame_type == KEY_FRAME &&
-        !pc->yv12_fb[pc->new_fb_idx].corrupted)
-      pbi->decoded_key_frame = 1;
-    else
-      vpx_internal_error(&pbi->common.error, VPX_CODEC_CORRUPT_FRAME,
-                         "A stream must start with a complete key frame");
-  }
-
-  vp9_adapt_coef_probs(pc);
-  if (pc->frame_type != KEY_FRAME) {
-    vp9_adapt_mode_probs(pc);
-    vp9_adapt_nmv_probs(pc, xd->allow_high_precision_mv);
-    vp9_update_mode_context(&pbi->common);
-  }
-
-  /* If this was a kf or GF, note the Q used */
-  if ((pc->frame_type == KEY_FRAME) ||
-      pc->refresh_golden_frame || pc->refresh_alt_ref_frame) {
-    pc->last_kf_gf_q = pc->base_qindex;
-  }
-  if (pc->refresh_entropy_probs) {
-    if (pc->refresh_alt_ref_frame)
-      vpx_memcpy(&pc->lfc_a, &pc->fc, sizeof(pc->fc));
-    else
-      vpx_memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc));
-  }
-
-#ifdef PACKET_TESTING
-  {
-    FILE *f = fopen("decompressor.VP8", "ab");
-    unsigned int size = residual_bc.pos + header_bc.pos + 8;
-    fwrite((void *) &size, 4, 1, f);
-    fwrite((void *) pbi->Source, size, 1, f);
-    fclose(f);
-  }
-#endif
-  // printf("Frame %d Done\n", frame_count++);
-
-  return 0;
-}
--- a/vp8/decoder/dequantize.c
+++ /dev/null
@@ -1,543 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "dequantize.h"
-#include "vp8/common/idct.h"
-#include "vpx_mem/vpx_mem.h"
-#include "onyxd_int.h"
-
-extern void vp9_short_idct4x4llm_c(short *input, short *output, int pitch);
-extern void vp9_short_idct4x4llm_1_c(short *input, short *output, int pitch);
-extern void vp9_short_idct8x8_c(short *input, short *output, int pitch);
-extern void vp9_short_idct8x8_1_c(short *input, short *output, int pitch);
-
-#if CONFIG_LOSSLESS
-extern void vp9_short_inv_walsh4x4_x8_c(short *input, short *output,
-                                        int pitch);
-extern void vp9_short_inv_walsh4x4_1_x8_c(short *input, short *output,
-                                          int pitch);
-#endif
-
-#ifdef DEC_DEBUG
-extern int dec_debug;
-#endif
-
-void vp9_dequantize_b_c(BLOCKD *d) {
-
-  int i;
-  short *DQ  = d->dqcoeff;
-  short *Q   = d->qcoeff;
-  short *DQC = d->dequant;
-
-  for (i = 0; i < 16; i++) {
-    DQ[i] = Q[i] * DQC[i];
-  }
-}
-
-
-void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, short *input, short *dq,
-                               unsigned char *pred, unsigned char *dest,
-                               int pitch, int stride) {
-  short output[16];
-  short *diff_ptr = output;
-  int r, c;
-  int i;
-
-  for (i = 0; i < 16; i++) {
-    input[i] = dq[i] * input[i];
-  }
-
-  vp9_ihtllm_c(input, output, 4 << 1, tx_type, 4);
-
-  vpx_memset(input, 0, 32);
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++) {
-      int a = diff_ptr[c] + pred[c];
-
-      if (a < 0)
-        a = 0;
-
-      if (a > 255)
-        a = 255;
-
-      dest[c] = (unsigned char) a;
-    }
-
-    dest += stride;
-    diff_ptr += 4;
-    pred += pitch;
-  }
-}
-
-void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, short *input, short *dq,
-                                   unsigned char *pred, unsigned char *dest,
-                                   int pitch, int stride) {
-  short output[64];
-  short *diff_ptr = output;
-  int b, r, c;
-  int i;
-  unsigned char *origdest = dest;
-  unsigned char *origpred = pred;
-
-  input[0] = dq[0] * input[0];
-  for (i = 1; i < 64; i++) {
-    input[i] = dq[1] * input[i];
-  }
-
-  vp9_ihtllm_c(input, output, 16, tx_type, 8);
-
-  vpx_memset(input, 0, 128);
-
-  for (b = 0; b < 4; b++) {
-    for (r = 0; r < 4; r++) {
-      for (c = 0; c < 4; c++) {
-        int a = diff_ptr[c] + pred[c];
-
-        if (a < 0)
-          a = 0;
-
-        if (a > 255)
-          a = 255;
-
-        dest[c] = (unsigned char) a;
-      }
-
-      dest += stride;
-      diff_ptr += 8;
-      pred += pitch;
-    }
-    // shift buffer pointers to next 4x4 block in the submacroblock
-    diff_ptr = output + (b + 1) / 2 * 4 * 8 + ((b + 1) % 2) * 4;
-    dest = origdest + (b + 1) / 2 * 4 * stride + ((b + 1) % 2) * 4;
-    pred = origpred + (b + 1) / 2 * 4 * pitch + ((b + 1) % 2) * 4;
-  }
-}
-
-void vp9_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,
-                            unsigned char *dest, int pitch, int stride) {
-  short output[16];
-  short *diff_ptr = output;
-  int r, c;
-  int i;
-
-  for (i = 0; i < 16; i++) {
-    input[i] = dq[i] * input[i];
-  }
-
-  /* the idct halves ( >> 1) the pitch */
-  vp9_short_idct4x4llm_c(input, output, 4 << 1);
-
-  vpx_memset(input, 0, 32);
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++) {
-      int a = diff_ptr[c] + pred[c];
-
-      if (a < 0)
-        a = 0;
-
-      if (a > 255)
-        a = 255;
-
-      dest[c] = (unsigned char) a;
-    }
-
-    dest += stride;
-    diff_ptr += 4;
-    pred += pitch;
-  }
-}
-
-void vp9_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,
-                               unsigned char *dest, int pitch, int stride,
-                               int Dc) {
-  int i;
-  short output[16];
-  short *diff_ptr = output;
-  int r, c;
-
-  input[0] = (short)Dc;
-
-  for (i = 1; i < 16; i++) {
-    input[i] = dq[i] * input[i];
-  }
-
-  /* the idct halves ( >> 1) the pitch */
-  vp9_short_idct4x4llm_c(input, output, 4 << 1);
-
-  vpx_memset(input, 0, 32);
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++) {
-      int a = diff_ptr[c] + pred[c];
-
-      if (a < 0)
-        a = 0;
-
-      if (a > 255)
-        a = 255;
-
-      dest[c] = (unsigned char) a;
-    }
-
-    dest += stride;
-    diff_ptr += 4;
-    pred += pitch;
-  }
-}
-
-#if CONFIG_LOSSLESS
-void vp9_dequant_idct_add_lossless_c(short *input, short *dq,
-                                     unsigned char *pred, unsigned char *dest,
-                                     int pitch, int stride) {
-  short output[16];
-  short *diff_ptr = output;
-  int r, c;
-  int i;
-
-  for (i = 0; i < 16; i++) {
-    input[i] = dq[i] * input[i];
-  }
-
-  vp9_short_inv_walsh4x4_x8_c(input, output, 4 << 1);
-
-  vpx_memset(input, 0, 32);
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++) {
-      int a = diff_ptr[c] + pred[c];
-
-      if (a < 0)
-        a = 0;
-
-      if (a > 255)
-        a = 255;
-
-      dest[c] = (unsigned char) a;
-    }
-
-    dest += stride;
-    diff_ptr += 4;
-    pred += pitch;
-  }
-}
-
-void vp9_dequant_dc_idct_add_lossless_c(short *input, short *dq,
-                                        unsigned char *pred,
-                                        unsigned char *dest,
-                                        int pitch, int stride, int dc) {
-  int i;
-  short output[16];
-  short *diff_ptr = output;
-  int r, c;
-
-  input[0] = (short)dc;
-
-  for (i = 1; i < 16; i++) {
-    input[i] = dq[i] * input[i];
-  }
-
-  vp9_short_inv_walsh4x4_x8_c(input, output, 4 << 1);
-  vpx_memset(input, 0, 32);
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++) {
-      int a = diff_ptr[c] + pred[c];
-
-      if (a < 0)
-        a = 0;
-
-      if (a > 255)
-        a = 255;
-
-      dest[c] = (unsigned char) a;
-    }
-
-    dest += stride;
-    diff_ptr += 4;
-    pred += pitch;
-  }
-}
-#endif
-
-void vp9_dequantize_b_2x2_c(BLOCKD *d) {
-  int i;
-  short *DQ  = d->dqcoeff;
-  short *Q   = d->qcoeff;
-  short *DQC = d->dequant;
-
-  for (i = 0; i < 16; i++) {
-    DQ[i] = (short)(Q[i] * DQC[i]);
-  }
-#ifdef DEC_DEBUG
-  if (dec_debug) {
-    int j;
-    printf("Dequantize 2x2\n");
-    for (j = 0; j < 16; j++) printf("%d ", Q[j]);
-    printf("\n");
-    for (j = 0; j < 16; j++) printf("%d ", DQ[j]);
-    printf("\n");
-  }
-#endif
-}
-
-void vp9_dequant_idct_add_8x8_c(short *input, short *dq, unsigned char *pred,
-                                unsigned char *dest, int pitch, int stride) {
-  short output[64];
-  short *diff_ptr = output;
-  int r, c, b;
-  int i;
-  unsigned char *origdest = dest;
-  unsigned char *origpred = pred;
-
-#ifdef DEC_DEBUG
-  if (dec_debug) {
-    int j;
-    printf("Input 8x8\n");
-    for (j = 0; j < 64; j++) {
-      printf("%d ", input[j]);
-      if (j % 8 == 7) printf("\n");
-    }
-  }
-#endif
-
-  input[0] = input[0] * dq[0];
-
-  // recover the AC coefficients; they all share the dq[1] dequantizer
-  for (i = 1; i < 64; i++) {
-    input[i] = input[i] * dq[1];
-  }
-#ifdef DEC_DEBUG
-  if (dec_debug) {
-    int j;
-    printf("Input DQ 8x8\n");
-    for (j = 0; j < 64; j++) {
-      printf("%d ", input[j]);
-      if (j % 8 == 7) printf("\n");
-    }
-  }
-#endif
-
-  // the idct halves ( >> 1) the pitch
-  vp9_short_idct8x8_c(input, output, 16);
-#ifdef DEC_DEBUG
-  if (dec_debug) {
-    int j;
-    printf("Output 8x8\n");
-    for (j = 0; j < 64; j++) {
-      printf("%d ", output[j]);
-      if (j % 8 == 7) printf("\n");
-    }
-  }
-#endif
-
-  vpx_memset(input, 0, 128);  // clear all 64 short coefficients
-
-  for (b = 0; b < 4; b++) {
-    for (r = 0; r < 4; r++) {
-      for (c = 0; c < 4; c++) {
-        int a = diff_ptr[c] + pred[c];
-
-        if (a < 0)
-          a = 0;
-
-        if (a > 255)
-          a = 255;
-
-        dest[c] = (unsigned char) a;
-      }
-
-      dest += stride;
-      diff_ptr += 8;
-      pred += pitch;
-    }
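-    // shift buffer pointers to the next 4x4 block in the submacroblock:
-    // b = 0..3 walks the top-left, top-right, bottom-left and bottom-right
-    // quadrants of the 8x8 block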
-    diff_ptr = output + (b + 1) / 2 * 4 * 8 + (b + 1) % 2 * 4;
-    dest = origdest + (b + 1) / 2 * 4 * stride + (b + 1) % 2 * 4;
-    pred = origpred + (b + 1) / 2 * 4 * pitch + (b + 1) % 2 * 4;
-  }
-#ifdef DEC_DEBUG
-  if (dec_debug) {
-    int k, j;
-    printf("Final 8x8\n");
-    for (j = 0; j < 8; j++) {
-      for (k = 0; k < 8; k++) {
-        printf("%d ", origdest[k]);
-      }
-      printf("\n");
-      origdest += stride;
-    }
-  }
-#endif
-}
-
-void vp9_dequant_dc_idct_add_8x8_c(short *input, short *dq, unsigned char *pred,
-                                   unsigned char *dest, int pitch, int stride,
-                                   int Dc) { // Dc supplies the first-order transform's DC term in some rare cases
-  short output[64];
-  short *diff_ptr = output;
-  int r, c, b;
-  int i;
-  unsigned char *origdest = dest;
-  unsigned char *origpred = pred;
-
-  input[0] = (short)Dc;  // Dc is already the reconstructed DC value, so it needs no dequantization
-#ifdef DEC_DEBUG
-  if (dec_debug) {
-    int j;
-    printf("Input 8x8\n");
-    for (j = 0; j < 64; j++) {
-      printf("%d ", input[j]);
-      if (j % 8 == 7) printf("\n");
-    }
-  }
-#endif
-  for (i = 1; i < 64; i++) {
-    input[i] = input[i] * dq[1];
-  }
-
-#ifdef DEC_DEBUG
-  if (dec_debug) {
-    int j;
-    printf("Input DQ 8x8\n");
-    for (j = 0; j < 64; j++) {
-      printf("%d ", input[j]);
-      if (j % 8 == 7) printf("\n");
-    }
-  }
-#endif
-
-  // the idct halves ( >> 1) the pitch
-  vp9_short_idct8x8_c(input, output, 16);
-#ifdef DEC_DEBUG
-  if (dec_debug) {
-    int j;
-    printf("Output 8x8\n");
-    for (j = 0; j < 64; j++) {
-      printf("%d ", output[j]);
-      if (j % 8 == 7) printf("\n");
-    }
-  }
-#endif
-  vpx_memset(input, 0, 128);
-
-  for (b = 0; b < 4; b++) {
-    for (r = 0; r < 4; r++) {
-      for (c = 0; c < 4; c++) {
-        int a = diff_ptr[c] + pred[c];
-
-        if (a < 0)
-          a = 0;
-
-        if (a > 255)
-          a = 255;
-
-        dest[c] = (unsigned char) a;
-      }
-
-      dest += stride;
-      diff_ptr += 8;
-      pred += pitch;
-    }
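-    // shift buffer pointers to the next 4x4 block in the submacroblock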
-    diff_ptr = output + (b + 1) / 2 * 4 * 8 + (b + 1) % 2 * 4;
-    dest = origdest + (b + 1) / 2 * 4 * stride + (b + 1) % 2 * 4;
-    pred = origpred + (b + 1) / 2 * 4 * pitch + (b + 1) % 2 * 4;
-  }
-#ifdef DEC_DEBUG
-  if (dec_debug) {
-    int k, j;
-    printf("Final 8x8\n");
-    for (j = 0; j < 8; j++) {
-      for (k = 0; k < 8; k++) {
-        printf("%d ", origdest[k]);
-      }
-      printf("\n");
-      origdest += stride;
-    }
-  }
-#endif
-}
-
-void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, short *input, short *dq,
-                                     unsigned char *pred, unsigned char *dest,
-                                     int pitch, int stride) {
-  short output[256];
-  short *diff_ptr = output;
-  int r, c, i;
-
-  input[0] = input[0] * dq[0];
-
-  // recover the AC coefficients; they all share the dq[1] dequantizer
-  for (i = 1; i < 256; i++)
-    input[i] = input[i] * dq[1];
-
-  // inverse hybrid transform
-  vp9_ihtllm_c(input, output, 32, tx_type, 16);
-
-  // the idct halves ( >> 1) the pitch
-  // vp9_short_idct16x16_c(input, output, 32);
-
-  vpx_memset(input, 0, 512);
-
-  for (r = 0; r < 16; r++) {
-    for (c = 0; c < 16; c++) {
-      int a = diff_ptr[c] + pred[c];
-
-      if (a < 0)
-        a = 0;
-      else if (a > 255)
-        a = 255;
-
-      dest[c] = (unsigned char) a;
-    }
-
-    dest += stride;
-    diff_ptr += 16;
-    pred += pitch;
-  }
-}
-
-void vp9_dequant_idct_add_16x16_c(short *input, short *dq, unsigned char *pred,
-                                  unsigned char *dest, int pitch, int stride) {
-  short output[256];
-  short *diff_ptr = output;
-  int r, c, i;
-
-  input[0] = input[0] * dq[0];
-
-  // recover the AC coefficients; they all share the dq[1] dequantizer
-  for (i = 1; i < 256; i++)
-    input[i] = input[i] * dq[1];
-
-  // the idct halves ( >> 1) the pitch
-  vp9_short_idct16x16_c(input, output, 32);
-
-  vpx_memset(input, 0, 512);
-
-  for (r = 0; r < 16; r++) {
-    for (c = 0; c < 16; c++) {
-      int a = diff_ptr[c] + pred[c];
-
-      if (a < 0)
-        a = 0;
-      else if (a > 255)
-        a = 255;
-
-      dest[c] = (unsigned char) a;
-    }
-
-    dest += stride;
-    diff_ptr += 16;
-    pred += pitch;
-  }
-}
--- a/vp8/decoder/dequantize.h
+++ /dev/null
@@ -1,78 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef DEQUANTIZE_H
-#define DEQUANTIZE_H
-#include "vp8/common/blockd.h"
-
-#if CONFIG_LOSSLESS
-extern void vp9_dequant_idct_add_lossless_c(short *input, short *dq,
-                                            unsigned char *pred,
-                                            unsigned char *output,
-                                            int pitch, int stride);
-extern void vp9_dequant_dc_idct_add_lossless_c(short *input, short *dq,
-                                               unsigned char *pred,
-                                               unsigned char *output,
-                                               int pitch, int stride, int dc);
-extern void vp9_dequant_dc_idct_add_y_block_lossless_c(short *q, short *dq,
-                                                       unsigned char *pre,
-                                                       unsigned char *dst,
-                                                       int stride, char *eobs,
-                                                       short *dc);
-extern void vp9_dequant_idct_add_y_block_lossless_c(short *q, short *dq,
-                                                    unsigned char *pre,
-                                                    unsigned char *dst,
-                                                    int stride, char *eobs);
-extern void vp9_dequant_idct_add_uv_block_lossless_c(short *q, short *dq,
-                                                     unsigned char *pre,
-                                                     unsigned char *dst_u,
-                                                     unsigned char *dst_v,
-                                                     int stride, char *eobs);
-#endif
-
-typedef void (*vp9_dequant_idct_add_fn_t)(short *input, short *dq,
-    unsigned char *pred, unsigned char *output, int pitch, int stride);
-typedef void (*vp9_dequant_dc_idct_add_fn_t)(short *input, short *dq,
-    unsigned char *pred, unsigned char *output, int pitch, int stride, int dc);
-
-typedef void (*vp9_dequant_dc_idct_add_y_block_fn_t)(short *q, short *dq,
-    unsigned char *pre, unsigned char *dst, int stride, char *eobs, short *dc);
-typedef void (*vp9_dequant_idct_add_y_block_fn_t)(short *q, short *dq,
-    unsigned char *pre, unsigned char *dst, int stride, char *eobs);
-typedef void (*vp9_dequant_idct_add_uv_block_fn_t)(short *q, short *dq,
-    unsigned char *pre, unsigned char *dst_u, unsigned char *dst_v, int stride,
-    char *eobs);
-
-void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, short *input, short *dq,
-                               unsigned char *pred, unsigned char *dest,
-                               int pitch, int stride);
-
-void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, short *input, short *dq,
-                                   unsigned char *pred, unsigned char *dest,
-                                   int pitch, int stride);
-
-void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, short *input, short *dq,
-                                     unsigned char *pred, unsigned char *dest,
-                                     int pitch, int stride);
-
-#if CONFIG_SUPERBLOCKS
-void vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(short *q, short *dq,
-                                                   unsigned char *dst,
-                                                   int stride, char *eobs,
-                                                   short *dc, MACROBLOCKD *xd);
-void vp9_dequant_idct_add_uv_block_8x8_inplace_c(short *q, short *dq,
-                                                 unsigned char *dstu,
-                                                 unsigned char *dstv,
-                                                 int stride, char *eobs,
-                                                 MACROBLOCKD *xd);
-#endif
-
-#endif
--- a/vp8/decoder/detokenize.c
+++ /dev/null
@@ -1,640 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/common/type_aliases.h"
-#include "vp8/common/blockd.h"
-#include "onyxd_int.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vpx_ports/mem.h"
-#include "detokenize.h"
-
-#include "vp8/common/seg_common.h"
-
-#define BOOL_DATA UINT8
-
-#define OCB_X (PREV_COEF_CONTEXTS * ENTROPY_NODES)
-
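-/* Coefficient-band lookup tables, pre-multiplied by OCB_X so that each entry
-   is a direct offset into the flattened per-band probability array used by
-   decode_coefs(). */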
-DECLARE_ALIGNED(16, static const int, coef_bands_x[16]) = {
-  0 * OCB_X, 1 * OCB_X, 2 * OCB_X, 3 * OCB_X,
-  6 * OCB_X, 4 * OCB_X, 5 * OCB_X, 6 * OCB_X,
-  6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X,
-  6 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X
-};
-DECLARE_ALIGNED(16, static const int, coef_bands_x_8x8[64]) = {
-  0 * OCB_X, 1 * OCB_X, 2 * OCB_X, 3 * OCB_X, 5 * OCB_X, 4 * OCB_X, 4 * OCB_X, 5 * OCB_X,
-  5 * OCB_X, 3 * OCB_X, 6 * OCB_X, 3 * OCB_X, 5 * OCB_X, 4 * OCB_X, 6 * OCB_X, 6 * OCB_X,
-  6 * OCB_X, 5 * OCB_X, 5 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X,
-  6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X,
-  6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
-  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
-  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
-  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
-};
-
-DECLARE_ALIGNED(16, static const int, coef_bands_x_16x16[256]) = {
-  0 * OCB_X, 1 * OCB_X, 2 * OCB_X, 3 * OCB_X, 5 * OCB_X, 4 * OCB_X, 4 * OCB_X, 5 * OCB_X, 5 * OCB_X, 3 * OCB_X, 6 * OCB_X, 3 * OCB_X, 5 * OCB_X, 4 * OCB_X, 6 * OCB_X, 6 * OCB_X,
-  6 * OCB_X, 5 * OCB_X, 5 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X,
-  6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
-  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
-  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
-  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
-  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
-  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
-  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
-  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
-  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
-  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
-  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
-  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
-  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
-  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X
-};
-
-#define EOB_CONTEXT_NODE            0
-#define ZERO_CONTEXT_NODE           1
-#define ONE_CONTEXT_NODE            2
-#define LOW_VAL_CONTEXT_NODE        3
-#define TWO_CONTEXT_NODE            4
-#define THREE_CONTEXT_NODE          5
-#define HIGH_LOW_CONTEXT_NODE       6
-#define CAT_ONE_CONTEXT_NODE        7
-#define CAT_THREEFOUR_CONTEXT_NODE  8
-#define CAT_THREE_CONTEXT_NODE      9
-#define CAT_FIVE_CONTEXT_NODE       10
-
-#define CAT1_MIN_VAL    5
-#define CAT2_MIN_VAL    7
-#define CAT3_MIN_VAL   11
-#define CAT4_MIN_VAL   19
-#define CAT5_MIN_VAL   35
-#define CAT6_MIN_VAL   67
-#define CAT1_PROB0    159
-#define CAT2_PROB0    145
-#define CAT2_PROB1    165
-
-#define CAT3_PROB0 140
-#define CAT3_PROB1 148
-#define CAT3_PROB2 173
-
-#define CAT4_PROB0 135
-#define CAT4_PROB1 140
-#define CAT4_PROB2 155
-#define CAT4_PROB3 176
-
-#define CAT5_PROB0 130
-#define CAT5_PROB1 134
-#define CAT5_PROB2 141
-#define CAT5_PROB3 157
-#define CAT5_PROB4 180
-
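-/* Extra-bit probabilities for category-6 tokens; the bits are decoded MSB
-   first and the trailing zero terminates the loop in decode_coefs(). */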
-static const unsigned char cat6_prob[14] =
-{ 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0 };
-
-void vp9_reset_mb_tokens_context(MACROBLOCKD *xd) {
-  /* Clear entropy contexts for Y2 blocks */
-  if ((xd->mode_info_context->mbmi.mode != B_PRED &&
-      xd->mode_info_context->mbmi.mode != I8X8_PRED &&
-      xd->mode_info_context->mbmi.mode != SPLITMV)
-      || xd->mode_info_context->mbmi.txfm_size == TX_16X16
-      ) {
-    vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
-  } else {
-    vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) - 1);
-    vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) - 1);
-  }
-}
-
-DECLARE_ALIGNED(16, extern const unsigned char, vp9_norm[256]);
-
-// #define PREV_CONTEXT_INC(val) (2+((val)>2))
-// #define PREV_CONTEXT_INC(val) (vp9_prev_token_class[(val)])
-#define PREV_CONTEXT_INC(val) (vp9_prev_token_class[(val) > 10 ? 10 : (val)])
-
-static int get_token(int v) {
-  if (v < 0) v = -v;
-  if (v == 0) return ZERO_TOKEN;
-  else if (v == 1) return ONE_TOKEN;
-  else if (v == 2) return TWO_TOKEN;
-  else if (v == 3) return THREE_TOKEN;
-  else if (v == 4) return FOUR_TOKEN;
-  else if (v <= 6) return DCT_VAL_CATEGORY1;
-  else if (v <= 10) return DCT_VAL_CATEGORY2;
-  else if (v <= 18) return DCT_VAL_CATEGORY3;
-  else if (v <= 34) return DCT_VAL_CATEGORY4;
-  else if (v <= 66) return DCT_VAL_CATEGORY5;
-  else return DCT_VAL_CATEGORY6;
-}
-
-static void count_tokens_adaptive_scan(const MACROBLOCKD *xd, INT16 *qcoeff_ptr,
-                                       int block, PLANE_TYPE type,
-                                       TX_TYPE tx_type,
-                                       ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
-                                       int eob, int seg_eob,
-                                       FRAME_CONTEXT *fc) {
-  int c, pt, token, band;
-  const int *scan;
-
-  switch(tx_type) {
-    case ADST_DCT :
-      scan = vp9_row_scan;
-      break;
-
-    case DCT_ADST :
-      scan = vp9_col_scan;
-      break;
-
-    default :
-      scan = vp9_default_zig_zag1d;
-      break;
-  }
-
-  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
-  for (c = !type; c < eob; ++c) {
-    int rc = scan[c];
-    int v = qcoeff_ptr[rc];
-    band = vp9_coef_bands[c];
-    token = get_token(v);
-    if (tx_type != DCT_DCT)
-      fc->hybrid_coef_counts[type][band][pt][token]++;
-    else
-      fc->coef_counts[type][band][pt][token]++;
-    pt = vp9_prev_token_class[token];
-  }
-
-  if (eob < seg_eob) {
-    band = vp9_coef_bands[c];
-    if (tx_type != DCT_DCT)
-      fc->hybrid_coef_counts[type][band][pt][DCT_EOB_TOKEN]++;
-    else
-      fc->coef_counts[type][band][pt][DCT_EOB_TOKEN]++;
-  }
-}
-
-static void count_tokens(INT16 *qcoeff_ptr, int block, PLANE_TYPE type,
-                         ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
-                         int eob, int seg_eob, FRAME_CONTEXT *const fc) {
-  int c, pt, token, band;
-  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
-  for (c = !type; c < eob; ++c) {
-    int rc = vp9_default_zig_zag1d[c];
-    int v = qcoeff_ptr[rc];
-    band = vp9_coef_bands[c];
-    token = get_token(v);
-    fc->coef_counts[type][band][pt][token]++;
-    pt = vp9_prev_token_class[token];
-  }
-  if (eob < seg_eob) {
-    band = vp9_coef_bands[c];
-    fc->coef_counts[type][band][pt][DCT_EOB_TOKEN]++;
-  }
-}
-
-static void count_tokens_8x8(INT16 *qcoeff_ptr, int block, PLANE_TYPE type,
-                             TX_TYPE tx_type,
-                             ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
-                             int eob, int seg_eob, FRAME_CONTEXT *fc) {
-  int c, pt, token, band;
-  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
-  for (c = !type; c < eob; ++c) {
-    int rc = (type == 1 ? vp9_default_zig_zag1d[c] : vp9_default_zig_zag1d_8x8[c]);
-    int v = qcoeff_ptr[rc];
-    band = (type == 1 ? vp9_coef_bands[c] : vp9_coef_bands_8x8[c]);
-    token = get_token(v);
-    if (tx_type != DCT_DCT)
-      fc->hybrid_coef_counts_8x8[type][band][pt][token]++;
-    else
-      fc->coef_counts_8x8[type][band][pt][token]++;
-    pt = vp9_prev_token_class[token];
-  }
-  if (eob < seg_eob) {
-    band = (type == 1 ? vp9_coef_bands[c] : vp9_coef_bands_8x8[c]);
-    if (tx_type != DCT_DCT)
-      fc->hybrid_coef_counts_8x8[type][band][pt][DCT_EOB_TOKEN]++;
-    else
-      fc->coef_counts_8x8[type][band][pt][DCT_EOB_TOKEN]++;
-  }
-}
-
-static void count_tokens_16x16(INT16 *qcoeff_ptr, int block, PLANE_TYPE type,
-                               TX_TYPE tx_type,
-                               ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
-                               int eob, int seg_eob, FRAME_CONTEXT *fc) {
-  int c, pt, token;
-  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
-  for (c = !type; c < eob; ++c) {
-    int rc = vp9_default_zig_zag1d_16x16[c];
-    int v = qcoeff_ptr[rc];
-    int band = vp9_coef_bands_16x16[c];
-    token = get_token(v);
-    if (tx_type != DCT_DCT)
-      fc->hybrid_coef_counts_16x16[type][band][pt][token]++;
-    else
-      fc->coef_counts_16x16[type][band][pt][token]++;
-    pt = vp9_prev_token_class[token];
-  }
-  if (eob < seg_eob) {
-    int band = vp9_coef_bands_16x16[c];
-    if (tx_type != DCT_DCT)
-      fc->hybrid_coef_counts_16x16[type][band][pt][DCT_EOB_TOKEN]++;
-    else
-      fc->coef_counts_16x16[type][band][pt][DCT_EOB_TOKEN]++;
-  }
-}
-
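-/* Decode one sign bit at probability one half (split at mid-range) and
-   return value_to_sign with that sign applied. */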
-static int get_signed(BOOL_DECODER *br, int value_to_sign) {
-  const int split = (br->range + 1) >> 1;
-  const VP9_BD_VALUE bigsplit = (VP9_BD_VALUE)split << (VP9_BD_VALUE_SIZE - 8);
-  int v;
-
-  if (br->count < 0)
-    vp9_bool_decoder_fill(br);
-
-  if (br->value < bigsplit) {
-    br->range = split;
-    v = value_to_sign;
-  } else {
-    br->range = br->range - split;
-    br->value = br->value - bigsplit;
-    v = -value_to_sign;
-  }
-  br->range += br->range;
-  br->value += br->value;
-  --br->count;
-
-  return v;
-}
-
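-/* Write the signed coefficient at the current scan position, reselect the
-   probability set from the decoded magnitude, and continue with the next
-   coefficient. */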
-#define WRITE_COEF_CONTINUE(val)                              \
-  {                                                           \
-    prob = coef_probs + (ENTROPY_NODES*PREV_CONTEXT_INC(val));\
-    qcoeff_ptr[scan[c]] = (INT16) get_signed(br, val);        \
-    c++;                                                      \
-    continue;                                                 \
-  }
-
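-/* Decode one extra magnitude bit and, if set, add 2^bits_count to val. */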
-#define ADJUST_COEF(prob, bits_count)  \
-  do {                                 \
-    if (vp9_read(br, prob))            \
-      val += (UINT16)(1 << bits_count);\
-  } while (0);
-
-static int decode_coefs(VP9D_COMP *dx, const MACROBLOCKD *xd,
-                        BOOL_DECODER* const br,
-                        ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
-                        PLANE_TYPE type,
-                        TX_TYPE tx_type,
-                        int seg_eob, INT16 *qcoeff_ptr, int i,
-                        const int *const scan, int block_type,
-                        const int *coef_bands) {
-  FRAME_CONTEXT *const fc = &dx->common.fc;
-  int tmp, c = (type == PLANE_TYPE_Y_NO_DC);
-  const vp9_prob *prob, *coef_probs;
-
-  switch (block_type) {
-    default:
-    case TX_4X4:
-      coef_probs =
-        tx_type != DCT_DCT ? fc->hybrid_coef_probs[type][0][0] :
-        fc->coef_probs[type][0][0];
-      break;
-    case TX_8X8:
-      coef_probs =
-        tx_type != DCT_DCT ? fc->hybrid_coef_probs_8x8[type][0][0] :
-        fc->coef_probs_8x8[type][0][0];
-      break;
-    case TX_16X16:
-      coef_probs =
-        tx_type != DCT_DCT ? fc->hybrid_coef_probs_16x16[type][0][0] :
-        fc->coef_probs_16x16[type][0][0];
-      break;
-  }
-
-  VP9_COMBINEENTROPYCONTEXTS(tmp, *a, *l);
-  prob = coef_probs + tmp * ENTROPY_NODES;
-
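-  // Token-tree walk, one iteration per coefficient: test EOB first, skip
-  // runs of zeros, then resolve the magnitude category and its extra bits.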
-  while (1) {
-    int val;
-    const uint8_t *cat6 = cat6_prob;
-    if (c == seg_eob) break;
-    prob += coef_bands[c];
-    if (!vp9_read(br, prob[EOB_CONTEXT_NODE]))
-      break;
-SKIP_START:
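-    // zero-run fast path: consume zeros without re-testing the EOB node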
-    if (c == seg_eob) break;
-    if (!vp9_read(br, prob[ZERO_CONTEXT_NODE])) {
-      ++c;
-      prob = coef_probs + coef_bands[c];
-      goto SKIP_START;
-    }
-    // ONE_CONTEXT_NODE_0_
-    if (!vp9_read(br, prob[ONE_CONTEXT_NODE])) {
-      prob = coef_probs + ENTROPY_NODES;
-      qcoeff_ptr[scan[c]] = (INT16) get_signed(br, 1);
-      ++c;
-      continue;
-    }
-    // LOW_VAL_CONTEXT_NODE_0_
-    if (!vp9_read(br, prob[LOW_VAL_CONTEXT_NODE])) {
-      if (!vp9_read(br, prob[TWO_CONTEXT_NODE])) {
-        WRITE_COEF_CONTINUE(2);
-      }
-      if (!vp9_read(br, prob[THREE_CONTEXT_NODE])) {
-        WRITE_COEF_CONTINUE(3);
-      }
-      WRITE_COEF_CONTINUE(4);
-    }
-    // HIGH_LOW_CONTEXT_NODE_0_
-    if (!vp9_read(br, prob[HIGH_LOW_CONTEXT_NODE])) {
-      if (!vp9_read(br, prob[CAT_ONE_CONTEXT_NODE])) {
-        val = CAT1_MIN_VAL;
-        ADJUST_COEF(CAT1_PROB0, 0);
-        WRITE_COEF_CONTINUE(val);
-      }
-      val = CAT2_MIN_VAL;
-      ADJUST_COEF(CAT2_PROB1, 1);
-      ADJUST_COEF(CAT2_PROB0, 0);
-      WRITE_COEF_CONTINUE(val);
-    }
-    // CAT_THREEFOUR_CONTEXT_NODE_0_
-    if (!vp9_read(br, prob[CAT_THREEFOUR_CONTEXT_NODE])) {
-      if (!vp9_read(br, prob[CAT_THREE_CONTEXT_NODE])) {
-        val = CAT3_MIN_VAL;
-        ADJUST_COEF(CAT3_PROB2, 2);
-        ADJUST_COEF(CAT3_PROB1, 1);
-        ADJUST_COEF(CAT3_PROB0, 0);
-        WRITE_COEF_CONTINUE(val);
-      }
-      val = CAT4_MIN_VAL;
-      ADJUST_COEF(CAT4_PROB3, 3);
-      ADJUST_COEF(CAT4_PROB2, 2);
-      ADJUST_COEF(CAT4_PROB1, 1);
-      ADJUST_COEF(CAT4_PROB0, 0);
-      WRITE_COEF_CONTINUE(val);
-    }
-    // CAT_FIVE_CONTEXT_NODE_0_:
-    if (!vp9_read(br, prob[CAT_FIVE_CONTEXT_NODE])) {
-      val = CAT5_MIN_VAL;
-      ADJUST_COEF(CAT5_PROB4, 4);
-      ADJUST_COEF(CAT5_PROB3, 3);
-      ADJUST_COEF(CAT5_PROB2, 2);
-      ADJUST_COEF(CAT5_PROB1, 1);
-      ADJUST_COEF(CAT5_PROB0, 0);
-      WRITE_COEF_CONTINUE(val);
-    }
-    val = 0;
-    while (*cat6) {
-      val = (val << 1) | vp9_read(br, *cat6++);
-    }
-    val += CAT6_MIN_VAL;
-    WRITE_COEF_CONTINUE(val);
-  }
-
-  if (block_type == TX_4X4) {
-    count_tokens_adaptive_scan(xd, qcoeff_ptr, i, type,
-                               tx_type,
-                               a, l, c, seg_eob, fc);
-  }
-  else if (block_type == TX_8X8)
-    count_tokens_8x8(qcoeff_ptr, i, type,
-                     tx_type,
-                     a, l, c, seg_eob, fc);
-  else
-    count_tokens_16x16(qcoeff_ptr, i, type,
-                       tx_type,
-                       a, l, c, seg_eob, fc);
-  return c;
-}
-
-int vp9_decode_mb_tokens_16x16(VP9D_COMP *pbi, MACROBLOCKD *xd,
-                               BOOL_DECODER* const bc) {
-  ENTROPY_CONTEXT* const A = (ENTROPY_CONTEXT *)xd->above_context;
-  ENTROPY_CONTEXT* const L = (ENTROPY_CONTEXT *)xd->left_context;
-
-  char* const eobs = xd->eobs;
-  PLANE_TYPE type;
-  int c, i, eobtotal = 0, seg_eob;
-  const int segment_id = xd->mode_info_context->mbmi.segment_id;
-  const int seg_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB);
-  INT16 *qcoeff_ptr = &xd->qcoeff[0];
-  TX_TYPE tx_type = get_tx_type(xd, &xd->block[0]);
-
-  type = PLANE_TYPE_Y_WITH_DC;
-
-  if (seg_active)
-    seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-  else
-    seg_eob = 256;
-
-  // Luma block
-  {
-    const int* const scan = vp9_default_zig_zag1d_16x16;
-    c = decode_coefs(pbi, xd, bc, A, L, type,
-                     tx_type,
-                     seg_eob, qcoeff_ptr,
-                     0, scan, TX_16X16, coef_bands_x_16x16);
-    eobs[0] = c;
-    A[0] = L[0] = (c != !type);
-    A[1] = A[2] = A[3] = A[0];
-    L[1] = L[2] = L[3] = L[0];
-    eobtotal += c;
-  }
-
-  // 8x8 chroma blocks
-  qcoeff_ptr += 256;
-  type = PLANE_TYPE_UV;
-  tx_type = DCT_DCT;
-  if (seg_active)
-    seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-  else
-    seg_eob = 64;
-  for (i = 16; i < 24; i += 4) {
-    ENTROPY_CONTEXT* const a = A + vp9_block2above_8x8[i];
-    ENTROPY_CONTEXT* const l = L + vp9_block2left_8x8[i];
-    const int* const scan = vp9_default_zig_zag1d_8x8;
-
-    c = decode_coefs(pbi, xd, bc, a, l, type,
-                     tx_type,
-                     seg_eob, qcoeff_ptr,
-                     i, scan, TX_8X8, coef_bands_x_8x8);
-    a[0] = l[0] = ((eobs[i] = c) != !type);
-    a[1] = a[0];
-    l[1] = l[0];
-
-    eobtotal += c;
-    qcoeff_ptr += 64;
-  }
-  vpx_memset(&A[8], 0, sizeof(A[8]));
-  vpx_memset(&L[8], 0, sizeof(L[8]));
-  return eobtotal;
-}
-
-int vp9_decode_mb_tokens_8x8(VP9D_COMP *pbi, MACROBLOCKD *xd,
-                             BOOL_DECODER* const bc) {
-  ENTROPY_CONTEXT *const A = (ENTROPY_CONTEXT *)xd->above_context;
-  ENTROPY_CONTEXT *const L = (ENTROPY_CONTEXT *)xd->left_context;
-
-  char *const eobs = xd->eobs;
-  PLANE_TYPE type;
-  int c, i, eobtotal = 0, seg_eob;
-  const int segment_id = xd->mode_info_context->mbmi.segment_id;
-  const int seg_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB);
-  INT16 *qcoeff_ptr = &xd->qcoeff[0];
-  TX_TYPE tx_type = DCT_DCT;
-
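-  // With I8X8_PRED/SPLITMV only the 16 luma blocks take the 8x8 path
-  // (threshold 16) and chroma is decoded below with 4x4 transforms;
-  // otherwise all 24 blocks go through the 8x8 loop.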
-  int bufthred = (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
-                  xd->mode_info_context->mbmi.mode == SPLITMV) ? 16 : 24;
-  if (xd->mode_info_context->mbmi.mode != B_PRED &&
-      xd->mode_info_context->mbmi.mode != SPLITMV &&
-      xd->mode_info_context->mbmi.mode != I8X8_PRED) {
-    ENTROPY_CONTEXT *const a = A + vp9_block2above_8x8[24];
-    ENTROPY_CONTEXT *const l = L + vp9_block2left_8x8[24];
-    const int *const scan = vp9_default_zig_zag1d;
-    type = PLANE_TYPE_Y2;
-
-    if (seg_active)
-      seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-    else
-      seg_eob = 4;
-    c = decode_coefs(pbi, xd, bc, a, l, type,
-                     tx_type,
-                     seg_eob, qcoeff_ptr + 24 * 16,
-                     24, scan, TX_8X8, coef_bands_x);
-    a[0] = l[0] = ((eobs[24] = c) != !type);
-
-    eobtotal += c - 4;
-
-    type = PLANE_TYPE_Y_NO_DC;
-  } else
-    type = PLANE_TYPE_Y_WITH_DC;
-
-  if (seg_active)
-    seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-  else
-    seg_eob = 64;
-
-  for (i = 0; i < bufthred ; i += 4) {
-    ENTROPY_CONTEXT *const a = A + vp9_block2above_8x8[i];
-    ENTROPY_CONTEXT *const l = L + vp9_block2left_8x8[i];
-    const int *const scan = vp9_default_zig_zag1d_8x8;
-    tx_type = DCT_DCT;
-
-    if (i == 16)
-      type = PLANE_TYPE_UV;
-    if (type == PLANE_TYPE_Y_WITH_DC) {
-      tx_type = get_tx_type(xd, xd->block + i);
-    }
-
-    c = decode_coefs(pbi, xd, bc, a, l, type,
-                     tx_type,
-                     seg_eob, qcoeff_ptr,
-                     i, scan, TX_8X8, coef_bands_x_8x8);
-    a[0] = l[0] = ((eobs[i] = c) != !type);
-    a[1] = a[0];
-    l[1] = l[0];
-
-    eobtotal += c;
-    qcoeff_ptr += 64;
-  }
-
-  if (bufthred == 16) {
-    type = PLANE_TYPE_UV;
-    tx_type = DCT_DCT;
-    seg_eob = 16;
-
-    // use 4x4 transform for U, V components in I8X8 prediction mode
-    for (i = 16; i < 24; i++) {
-      ENTROPY_CONTEXT *const a = A + vp9_block2above[i];
-      ENTROPY_CONTEXT *const l = L + vp9_block2left[i];
-      const int *scan = vp9_default_zig_zag1d;
-
-      c = decode_coefs(pbi, xd, bc, a, l, type,
-                       tx_type,
-                       seg_eob, qcoeff_ptr,
-                       i, scan, TX_4X4, coef_bands_x);
-      a[0] = l[0] = ((eobs[i] = c) != !type);
-
-      eobtotal += c;
-      qcoeff_ptr += 16;
-    }
-  }
-
-  return eobtotal;
-}
-
-
-int vp9_decode_mb_tokens(VP9D_COMP *dx, MACROBLOCKD *xd,
-                         BOOL_DECODER* const bc) {
-  ENTROPY_CONTEXT *const A = (ENTROPY_CONTEXT *)xd->above_context;
-  ENTROPY_CONTEXT *const L = (ENTROPY_CONTEXT *)xd->left_context;
-
-  char *const eobs = xd->eobs;
-  const int *scan = vp9_default_zig_zag1d;
-  PLANE_TYPE type;
-  int c, i, eobtotal = 0, seg_eob = 16;
-  INT16 *qcoeff_ptr = &xd->qcoeff[0];
-
-  int segment_id = xd->mode_info_context->mbmi.segment_id;
-  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB))
-    seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-
-  if (xd->mode_info_context->mbmi.mode != B_PRED &&
-      xd->mode_info_context->mbmi.mode != I8X8_PRED &&
-      xd->mode_info_context->mbmi.mode != SPLITMV) {
-    ENTROPY_CONTEXT *const a = A + vp9_block2above[24];
-    ENTROPY_CONTEXT *const l = L + vp9_block2left[24];
-    type = PLANE_TYPE_Y2;
-
-    c = decode_coefs(dx, xd, bc, a, l, type,
-                     DCT_DCT,
-                     seg_eob, qcoeff_ptr + 24 * 16, 24,
-                     scan, TX_4X4, coef_bands_x);
-    a[0] = l[0] = ((eobs[24] = c) != !type);
-    eobtotal += c - 16;
-
-    type = PLANE_TYPE_Y_NO_DC;
-  } else {
-    type = PLANE_TYPE_Y_WITH_DC;
-  }
-
-  for (i = 0; i < 24; ++i) {
-    ENTROPY_CONTEXT *const a = A + vp9_block2above[i];
-    ENTROPY_CONTEXT *const l = L + vp9_block2left[i];
-    TX_TYPE tx_type = DCT_DCT;
-    if (i == 16)
-      type = PLANE_TYPE_UV;
-
-    tx_type = get_tx_type(xd, &xd->block[i]);
-    switch(tx_type) {
-      case ADST_DCT :
-        scan = vp9_row_scan;
-        break;
-
-      case DCT_ADST :
-        scan = vp9_col_scan;
-        break;
-
-      default :
-        scan = vp9_default_zig_zag1d;
-        break;
-    }
-
-    c = decode_coefs(dx, xd, bc, a, l, type, tx_type,
-                     seg_eob, qcoeff_ptr,
-                     i, scan, TX_4X4, coef_bands_x);
-    a[0] = l[0] = ((eobs[i] = c) != !type);
-
-    eobtotal += c;
-    qcoeff_ptr += 16;
-  }
-
-  return eobtotal;
-}
--- a/vp8/decoder/detokenize.h
+++ /dev/null
@@ -1,25 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef DETOKENIZE_H
-#define DETOKENIZE_H
-
-#include "onyxd_int.h"
-
-void vp9_reset_mb_tokens_context(MACROBLOCKD* const);
-int vp9_decode_mb_tokens(VP9D_COMP* const, MACROBLOCKD* const,
-                         BOOL_DECODER* const);
-int vp9_decode_mb_tokens_8x8(VP9D_COMP* const, MACROBLOCKD* const,
-                             BOOL_DECODER* const);
-int vp9_decode_mb_tokens_16x16(VP9D_COMP* const, MACROBLOCKD* const,
-                               BOOL_DECODER* const);
-
-#endif /* DETOKENIZE_H */
--- a/vp8/decoder/idct_blk.c
+++ /dev/null
@@ -1,292 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_ports/config.h"
-#include "vp8/common/idct.h"
-#include "dequantize.h"
-
-void vp9_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,
-                               unsigned char *dest, int pitch, int stride,
-                               int Dc);
-void vp9_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,
-                            unsigned char *dest, int pitch, int stride);
-void vp9_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
-                            unsigned char *dst_ptr, int pitch, int stride);
-#if CONFIG_LOSSLESS
-void vp9_dequant_idct_add_lossless_c(short *input, short *dq,
-                                     unsigned char *pred, unsigned char *dest,
-                                     int pitch, int stride);
-void vp9_dc_only_idct_add_lossless_c(short input_dc, unsigned char *pred_ptr,
-                                     unsigned char *dst_ptr,
-                                     int pitch, int stride);
-#endif
-
-void vp9_dequant_dc_idct_add_y_block_c(short *q, short *dq,
-                                       unsigned char *pre,
-                                       unsigned char *dst,
-                                       int stride, char *eobs,
-                                       short *dc) {
-  int i, j;
-
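-  // Per 4x4 block: an eob above 1 means there are AC coefficients and the
-  // full dequant+IDCT runs; otherwise only the DC term is added.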
-  for (i = 0; i < 4; i++) {
-    for (j = 0; j < 4; j++) {
-      if (*eobs++ > 1)
-        vp9_dequant_dc_idct_add_c(q, dq, pre, dst, 16, stride, dc[0]);
-      else
-        vp9_dc_only_idct_add_c(dc[0], pre, dst, 16, stride);
-
-      q   += 16;
-      pre += 4;
-      dst += 4;
-      dc++;
-    }
-
-    pre += 64 - 16;
-    dst += 4 * stride - 16;
-  }
-}
-
-void vp9_dequant_idct_add_y_block_c(short *q, short *dq,
-                                    unsigned char *pre,
-                                    unsigned char *dst,
-                                    int stride, char *eobs) {
-  int i, j;
-
-  for (i = 0; i < 4; i++) {
-    for (j = 0; j < 4; j++) {
-      if (*eobs++ > 1)
-        vp9_dequant_idct_add_c(q, dq, pre, dst, 16, stride);
-      else {
-        vp9_dc_only_idct_add_c(q[0]*dq[0], pre, dst, 16, stride);
-        ((int *)q)[0] = 0;
-      }
-
-      q   += 16;
-      pre += 4;
-      dst += 4;
-    }
-
-    pre += 64 - 16;
-    dst += 4 * stride - 16;
-  }
-}
-
-void vp9_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *pre,
-                                     unsigned char *dstu, unsigned char *dstv,
-                                     int stride, char *eobs) {
-  int i, j;
-
-  for (i = 0; i < 2; i++) {
-    for (j = 0; j < 2; j++) {
-      if (*eobs++ > 1)
-        vp9_dequant_idct_add_c(q, dq, pre, dstu, 8, stride);
-      else {
-        vp9_dc_only_idct_add_c(q[0]*dq[0], pre, dstu, 8, stride);
-        ((int *)q)[0] = 0;
-      }
-
-      q    += 16;
-      pre  += 4;
-      dstu += 4;
-    }
-
-    pre  += 32 - 8;
-    dstu += 4 * stride - 8;
-  }
-
-  for (i = 0; i < 2; i++) {
-    for (j = 0; j < 2; j++) {
-      if (*eobs++ > 1)
-        vp9_dequant_idct_add_c(q, dq, pre, dstv, 8, stride);
-      else {
-        vp9_dc_only_idct_add_c(q[0]*dq[0], pre, dstv, 8, stride);
-        ((int *)q)[0] = 0;
-      }
-
-      q    += 16;
-      pre  += 4;
-      dstv += 4;
-    }
-
-    pre  += 32 - 8;
-    dstv += 4 * stride - 8;
-  }
-}
-
-
-void vp9_dequant_dc_idct_add_y_block_8x8_c(short *q, short *dq,
-                                           unsigned char *pre,
-                                           unsigned char *dst,
-                                           int stride, char *eobs, short *dc,
-                                           MACROBLOCKD *xd) {
-  vp9_dequant_dc_idct_add_8x8_c(q, dq, pre, dst, 16, stride, dc[0]);
-  vp9_dequant_dc_idct_add_8x8_c(&q[64], dq, pre + 8, dst + 8, 16, stride, dc[1]);
-  vp9_dequant_dc_idct_add_8x8_c(&q[128], dq, pre + 8 * 16,
-                                dst + 8 * stride, 16, stride, dc[4]);
-  vp9_dequant_dc_idct_add_8x8_c(&q[192], dq, pre + 8 * 16 + 8,
-                                dst + 8 * stride + 8, 16, stride, dc[8]);
-}
-
-#if CONFIG_SUPERBLOCKS
-void vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(short *q, short *dq,
-                                                   unsigned char *dst,
-                                                   int stride, char *eobs,
-                                                   short *dc, MACROBLOCKD *xd) {
-  vp9_dequant_dc_idct_add_8x8_c(q, dq, dst, dst, stride, stride, dc[0]);
-  vp9_dequant_dc_idct_add_8x8_c(&q[64], dq, dst + 8,
-                                dst + 8, stride, stride, dc[1]);
-  vp9_dequant_dc_idct_add_8x8_c(&q[128], dq, dst + 8 * stride,
-                                dst + 8 * stride, stride, stride, dc[4]);
-  vp9_dequant_dc_idct_add_8x8_c(&q[192], dq, dst + 8 * stride + 8,
-                                dst + 8 * stride + 8, stride, stride, dc[8]);
-}
-#endif
-
-void vp9_dequant_idct_add_y_block_8x8_c(short *q, short *dq,
-                                        unsigned char *pre,
-                                        unsigned char *dst,
-                                        int stride, char *eobs,
-                                        MACROBLOCKD *xd) {
-  unsigned char *origdest = dst;
-  unsigned char *origpred = pre;
-
-  vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride);
-  vp9_dequant_idct_add_8x8_c(&q[64], dq, origpred + 8,
-                             origdest + 8, 16, stride);
-  vp9_dequant_idct_add_8x8_c(&q[128], dq, origpred + 8 * 16,
-                             origdest + 8 * stride, 16, stride);
-  vp9_dequant_idct_add_8x8_c(&q[192], dq, origpred + 8 * 16 + 8,
-                             origdest + 8 * stride + 8, 16, stride);
-}
-
-void vp9_dequant_idct_add_uv_block_8x8_c(short *q, short *dq,
-                                         unsigned char *pre,
-                                         unsigned char *dstu,
-                                         unsigned char *dstv,
-                                         int stride, char *eobs,
-                                         MACROBLOCKD *xd) {
-  vp9_dequant_idct_add_8x8_c(q, dq, pre, dstu, 8, stride);
-
-  q    += 64;
-  pre  += 64;
-
-  vp9_dequant_idct_add_8x8_c(q, dq, pre, dstv, 8, stride);
-}
-
-#if CONFIG_SUPERBLOCKS
-void vp9_dequant_idct_add_uv_block_8x8_inplace_c(short *q, short *dq,
-                                                 unsigned char *dstu,
-                                                 unsigned char *dstv,
-                                                 int stride, char *eobs,
-                                                 MACROBLOCKD *xd) {
-  vp9_dequant_idct_add_8x8_c(q, dq, dstu, dstu, stride, stride);
-
-  q    += 64;
-
-  vp9_dequant_idct_add_8x8_c(q, dq, dstv, dstv, stride, stride);
-}
-#endif
-
-#if CONFIG_LOSSLESS
-void vp9_dequant_dc_idct_add_y_block_lossless_c(short *q, short *dq,
-                                                unsigned char *pre,
-                                                unsigned char *dst,
-                                                int stride, char *eobs,
-                                                short *dc) {
-  int i, j;
-
-  for (i = 0; i < 4; i++) {
-    for (j = 0; j < 4; j++) {
-      if (*eobs++ > 1)
-        vp9_dequant_dc_idct_add_lossless_c(q, dq, pre, dst, 16, stride, dc[0]);
-      else
-        vp9_dc_only_inv_walsh_add_c(dc[0], pre, dst, 16, stride);
-
-      q   += 16;
-      pre += 4;
-      dst += 4;
-      dc++;
-    }
-
-    pre += 64 - 16;
-    dst += 4 * stride - 16;
-  }
-}
-
-void vp9_dequant_idct_add_y_block_lossless_c(short *q, short *dq,
-                                             unsigned char *pre,
-                                             unsigned char *dst,
-                                             int stride, char *eobs) {
-  int i, j;
-
-  for (i = 0; i < 4; i++) {
-    for (j = 0; j < 4; j++) {
-      if (*eobs++ > 1)
-        vp9_dequant_idct_add_lossless_c(q, dq, pre, dst, 16, stride);
-      else {
-        vp9_dc_only_inv_walsh_add_c(q[0]*dq[0], pre, dst, 16, stride);
-        ((int *)q)[0] = 0;
-      }
-
-      q   += 16;
-      pre += 4;
-      dst += 4;
-    }
-
-    pre += 64 - 16;
-    dst += 4 * stride - 16;
-  }
-}
-
-void vp9_dequant_idct_add_uv_block_lossless_c(short *q, short *dq,
-                                              unsigned char *pre,
-                                              unsigned char *dstu,
-                                              unsigned char *dstv,
-                                              int stride, char *eobs) {
-  int i, j;
-
-  for (i = 0; i < 2; i++) {
-    for (j = 0; j < 2; j++) {
-      if (*eobs++ > 1)
-        vp9_dequant_idct_add_lossless_c(q, dq, pre, dstu, 8, stride);
-      else {
-        vp9_dc_only_inv_walsh_add_c(q[0]*dq[0], pre, dstu, 8, stride);
-        ((int *)q)[0] = 0;
-      }
-
-      q    += 16;
-      pre  += 4;
-      dstu += 4;
-    }
-
-    pre  += 32 - 8;
-    dstu += 4 * stride - 8;
-  }
-
-  for (i = 0; i < 2; i++) {
-    for (j = 0; j < 2; j++) {
-      if (*eobs++ > 1)
-        vp9_dequant_idct_add_lossless_c(q, dq, pre, dstv, 8, stride);
-      else {
-        vp9_dc_only_inv_walsh_add_c(q[0]*dq[0], pre, dstv, 8, stride);
-        ((int *)q)[0] = 0;
-      }
-
-      q    += 16;
-      pre  += 4;
-      dstv += 4;
-    }
-
-    pre  += 32 - 8;
-    dstv += 4 * stride - 8;
-  }
-}
-#endif
-
--- a/vp8/decoder/onyxd_if.c
+++ /dev/null
@@ -1,506 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/common/onyxc_int.h"
-#if CONFIG_POSTPROC
-#include "vp8/common/postproc.h"
-#endif
-#include "vp8/common/onyxd.h"
-#include "onyxd_int.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp8/common/alloccommon.h"
-#include "vpx_scale/yv12extend.h"
-#include "vp8/common/loopfilter.h"
-#include "vp8/common/swapyv12buffer.h"
-#include <stdio.h>
-#include <assert.h>
-
-#include "vp8/common/quant_common.h"
-#include "vpx_scale/vpxscale.h"
-#include "vp8/common/systemdependent.h"
-#include "vpx_ports/vpx_timer.h"
-#include "detokenize.h"
-#if ARCH_ARM
-#include "vpx_ports/arm.h"
-#endif
-
-extern void vp9_init_de_quantizer(VP9D_COMP *pbi);
-static int get_free_fb(VP9_COMMON *cm);
-static void ref_cnt_fb(int *buf, int *idx, int new_idx);
-
-#if CONFIG_DEBUG
-static void recon_write_yuv_frame(char *name, YV12_BUFFER_CONFIG *s) {
-  FILE *yuv_file = fopen(name, "ab");
-  unsigned char *src = s->y_buffer;
-  int h = s->y_height;
-
-  do {
-    fwrite(src, s->y_width, 1,  yuv_file);
-    src += s->y_stride;
-  } while (--h);
-
-  src = s->u_buffer;
-  h = s->uv_height;
-
-  do {
-    fwrite(src, s->uv_width, 1,  yuv_file);
-    src += s->uv_stride;
-  } while (--h);
-
-  src = s->v_buffer;
-  h = s->uv_height;
-
-  do {
-    fwrite(src, s->uv_width, 1, yuv_file);
-    src += s->uv_stride;
-  } while (--h);
-
-  fclose(yuv_file);
-}
-#endif
-#define WRITE_RECON_BUFFER 0
-#if WRITE_RECON_BUFFER
-void write_dx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
-
-  // write the frame
-  FILE *yframe;
-  int i;
-  char filename[255];
-
-  sprintf(filename, "dx\\y%04d.raw", this_frame);
-  yframe = fopen(filename, "wb");
-
-  for (i = 0; i < frame->y_height; i++)
-    fwrite(frame->y_buffer + i * frame->y_stride,
-           frame->y_width, 1, yframe);
-
-  fclose(yframe);
-  sprintf(filename, "dx\\u%04d.raw", this_frame);
-  yframe = fopen(filename, "wb");
-
-  for (i = 0; i < frame->uv_height; i++)
-    fwrite(frame->u_buffer + i * frame->uv_stride,
-           frame->uv_width, 1, yframe);
-
-  fclose(yframe);
-  sprintf(filename, "dx\\v%04d.raw", this_frame);
-  yframe = fopen(filename, "wb");
-
-  for (i = 0; i < frame->uv_height; i++)
-    fwrite(frame->v_buffer + i * frame->uv_stride,
-           frame->uv_width, 1, yframe);
-
-  fclose(yframe);
-}
-#endif
-
-void vp9_initialize_dec(void) {
-  static int init_done = 0;
-
-  if (!init_done) {
-    vp9_initialize_common();
-    vp9_init_quant_tables();
-    vp8_scale_machine_specific_config();
-    init_done = 1;
-  }
-}
-
-VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf) {
-  VP9D_COMP *pbi = vpx_memalign(32, sizeof(VP9D_COMP));
-
-  if (!pbi)
-    return NULL;
-
-  vpx_memset(pbi, 0, sizeof(VP9D_COMP));
-
-  if (setjmp(pbi->common.error.jmp)) {
-    pbi->common.error.setjmp = 0;
-    vp9_remove_decompressor(pbi);
-    return 0;
-  }
-
-  pbi->common.error.setjmp = 1;
-  vp9_initialize_dec();
-
-  vp9_create_common(&pbi->common);
-
-  pbi->common.current_video_frame = 0;
-  pbi->ready_for_new_data = 1;
-
-  /* vp9_init_de_quantizer() is first called here. Add check in
-   * frame_init_dequantizer() to avoid unnecessary calling of
-   * vp9_init_de_quantizer() for every frame.
-   */
-  vp9_init_de_quantizer(pbi);
-
-  vp9_loop_filter_init(&pbi->common);
-
-  pbi->common.error.setjmp = 0;
-
-  pbi->decoded_key_frame = 0;
-
-  return (VP9D_PTR) pbi;
-}
-
-void vp9_remove_decompressor(VP9D_PTR ptr) {
-  VP9D_COMP *pbi = (VP9D_COMP *) ptr;
-
-  if (!pbi)
-    return;
-
-  // Delete the segmentation map
-  if (pbi->common.last_frame_seg_map != 0)
-    vpx_free(pbi->common.last_frame_seg_map);
-
-  vp9_remove_common(&pbi->common);
-  vpx_free(pbi->mbc);
-  vpx_free(pbi);
-}
-
-
-vpx_codec_err_t vp9_get_reference_dec(VP9D_PTR ptr, VP9_REFFRAME ref_frame_flag,
-                                      YV12_BUFFER_CONFIG *sd) {
-  VP9D_COMP *pbi = (VP9D_COMP *) ptr;
-  VP9_COMMON *cm = &pbi->common;
-  int ref_fb_idx;
-
-  if (ref_frame_flag == VP9_LAST_FLAG)
-    ref_fb_idx = cm->lst_fb_idx;
-  else if (ref_frame_flag == VP9_GOLD_FLAG)
-    ref_fb_idx = cm->gld_fb_idx;
-  else if (ref_frame_flag == VP9_ALT_FLAG)
-    ref_fb_idx = cm->alt_fb_idx;
-  else {
-    vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
-                       "Invalid reference frame");
-    return pbi->common.error.error_code;
-  }
-
-  if (cm->yv12_fb[ref_fb_idx].y_height != sd->y_height ||
-      cm->yv12_fb[ref_fb_idx].y_width != sd->y_width ||
-      cm->yv12_fb[ref_fb_idx].uv_height != sd->uv_height ||
-      cm->yv12_fb[ref_fb_idx].uv_width != sd->uv_width) {
-    vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
-                       "Incorrect buffer dimensions");
-  } else
-    vp8_yv12_copy_frame_ptr(&cm->yv12_fb[ref_fb_idx], sd);
-
-  return pbi->common.error.error_code;
-}
-
-
-vpx_codec_err_t vp9_set_reference_dec(VP9D_PTR ptr, VP9_REFFRAME ref_frame_flag,
-                                      YV12_BUFFER_CONFIG *sd) {
-  VP9D_COMP *pbi = (VP9D_COMP *) ptr;
-  VP9_COMMON *cm = &pbi->common;
-  int *ref_fb_ptr = NULL;
-  int free_fb;
-
-  if (ref_frame_flag == VP9_LAST_FLAG)
-    ref_fb_ptr = &cm->lst_fb_idx;
-  else if (ref_frame_flag == VP9_GOLD_FLAG)
-    ref_fb_ptr = &cm->gld_fb_idx;
-  else if (ref_frame_flag == VP9_ALT_FLAG)
-    ref_fb_ptr = &cm->alt_fb_idx;
-  else {
-    vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
-                       "Invalid reference frame");
-    return pbi->common.error.error_code;
-  }
-
-  if (cm->yv12_fb[*ref_fb_ptr].y_height != sd->y_height ||
-      cm->yv12_fb[*ref_fb_ptr].y_width != sd->y_width ||
-      cm->yv12_fb[*ref_fb_ptr].uv_height != sd->uv_height ||
-      cm->yv12_fb[*ref_fb_ptr].uv_width != sd->uv_width) {
-    vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
-                       "Incorrect buffer dimensions");
-  } else {
-    /* Find an empty frame buffer. */
-    free_fb = get_free_fb(cm);
-    /* Decrease fb_idx_ref_cnt since it will be increased again in
-     * ref_cnt_fb() below. */
-    cm->fb_idx_ref_cnt[free_fb]--;
-
-    /* Manage the reference counters and copy image. */
-    ref_cnt_fb(cm->fb_idx_ref_cnt, ref_fb_ptr, free_fb);
-    vp8_yv12_copy_frame_ptr(sd, &cm->yv12_fb[*ref_fb_ptr]);
-  }
-
-  return pbi->common.error.error_code;
-}
-
-/* For ARM NEON, d8-d15 are callee-saved registers and need to be saved by us. */
-#if HAVE_ARMV7
-extern void vp9_push_neon(int64_t *store);
-extern void vp9_pop_neon(int64_t *store);
-#endif
-
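-/* Return the index of the first frame buffer with a zero reference count and
-   take a reference on it. */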
-static int get_free_fb(VP9_COMMON *cm) {
-  int i;
-  for (i = 0; i < NUM_YV12_BUFFERS; i++)
-    if (cm->fb_idx_ref_cnt[i] == 0)
-      break;
-
-  assert(i < NUM_YV12_BUFFERS);
-  cm->fb_idx_ref_cnt[i] = 1;
-  return i;
-}
-
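-/* Drop the reference held through *idx, repoint *idx at new_idx, and take a
-   reference on the new buffer. */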
-static void ref_cnt_fb(int *buf, int *idx, int new_idx) {
-  if (buf[*idx] > 0)
-    buf[*idx]--;
-
-  *idx = new_idx;
-
-  buf[new_idx]++;
-}
-
-/* If any buffer copy / swapping is signalled it should be done here. */
-static int swap_frame_buffers(VP9_COMMON *cm) {
-  int err = 0;
-
-  /* The alternate reference frame or golden frame can be updated
-   *  using the new, last, or golden/alt ref frame.  If it
-   *  is updated using the newly decoded frame it is a refresh.
-   *  An update using the last or golden/alt ref frame is a copy.
-   */
-  if (cm->copy_buffer_to_arf) {
-    int new_fb = 0;
-
-    if (cm->copy_buffer_to_arf == 1)
-      new_fb = cm->lst_fb_idx;
-    else if (cm->copy_buffer_to_arf == 2)
-      new_fb = cm->gld_fb_idx;
-    else
-      err = -1;
-
-    ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->alt_fb_idx, new_fb);
-  }
-
-  if (cm->copy_buffer_to_gf) {
-    int new_fb = 0;
-
-    if (cm->copy_buffer_to_gf == 1)
-      new_fb = cm->lst_fb_idx;
-    else if (cm->copy_buffer_to_gf == 2)
-      new_fb = cm->alt_fb_idx;
-    else
-      err = -1;
-
-    ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->gld_fb_idx, new_fb);
-  }
-
-  if (cm->refresh_golden_frame)
-    ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->gld_fb_idx, cm->new_fb_idx);
-
-  if (cm->refresh_alt_ref_frame)
-    ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->alt_fb_idx, cm->new_fb_idx);
-
-  if (cm->refresh_last_frame) {
-    ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->lst_fb_idx, cm->new_fb_idx);
-
-    cm->frame_to_show = &cm->yv12_fb[cm->lst_fb_idx];
-  } else
-    cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];
-
-  cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
-
-  return err;
-}
-
-int vp9_receive_compressed_data(VP9D_PTR ptr, unsigned long size,
-                                const unsigned char *source,
-                                int64_t time_stamp) {
-#if HAVE_ARMV7
-  int64_t dx_store_reg[8];
-#endif
-  VP9D_COMP *pbi = (VP9D_COMP *) ptr;
-  VP9_COMMON *cm = &pbi->common;
-  int retcode = 0;
-
-  /*if(pbi->ready_for_new_data == 0)
-      return -1;*/
-
-  if (ptr == 0) {
-    return -1;
-  }
-
-  pbi->common.error.error_code = VPX_CODEC_OK;
-
-  pbi->Source = source;
-  pbi->source_sz = size;
-
-  if (pbi->source_sz == 0) {
-    /* This is used to signal that we are missing frames.
-     * We do not know if the missing frame(s) were supposed to update
-     * any of the reference buffers, but we act conservatively and
-     * mark only the last buffer as corrupted.
-     */
-    cm->yv12_fb[cm->lst_fb_idx].corrupted = 1;
-  }
-
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
-  if (cm->rtcd.flags & HAS_NEON)
-#endif
-  {
-    vp9_push_neon(dx_store_reg);
-  }
-#endif
-
-  cm->new_fb_idx = get_free_fb(cm);
-
-  if (setjmp(pbi->common.error.jmp)) {
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
-    if (cm->rtcd.flags & HAS_NEON)
-#endif
-    {
-      vp9_pop_neon(dx_store_reg);
-    }
-#endif
-    pbi->common.error.setjmp = 0;
-
-    /* We do not know if the missing frame(s) were supposed to update
-     * any of the reference buffers, but we act conservatively and
-     * mark only the last buffer as corrupted.
-     */
-    cm->yv12_fb[cm->lst_fb_idx].corrupted = 1;
-
-    if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0)
-      cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
-    return -1;
-  }
-
-  pbi->common.error.setjmp = 1;
-
-  retcode = vp9_decode_frame(pbi);
-
-  if (retcode < 0) {
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
-    if (cm->rtcd.flags & HAS_NEON)
-#endif
-    {
-      vp9_pop_neon(dx_store_reg);
-    }
-#endif
-    pbi->common.error.error_code = VPX_CODEC_ERROR;
-    pbi->common.error.setjmp = 0;
-    if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0)
-      cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
-    return retcode;
-  }
-
-  {
-    if (swap_frame_buffers(cm)) {
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
-      if (cm->rtcd.flags & HAS_NEON)
-#endif
-      {
-        vp9_pop_neon(dx_store_reg);
-      }
-#endif
-      pbi->common.error.error_code = VPX_CODEC_ERROR;
-      pbi->common.error.setjmp = 0;
-      return -1;
-    }
-
-#if WRITE_RECON_BUFFER
-    if (cm->show_frame)
-      write_dx_frame_to_file(cm->frame_to_show,
-                             cm->current_video_frame);
-    else
-      write_dx_frame_to_file(cm->frame_to_show,
-                             cm->current_video_frame + 1000);
-#endif
-
-    if (cm->filter_level) {
-      /* Apply the loop filter if appropriate. */
-      vp9_loop_filter_frame(cm, &pbi->mb);
-    }
-    vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);
-  }
-
-#if CONFIG_DEBUG
-  if (cm->show_frame)
-    recon_write_yuv_frame("recon.yuv", cm->frame_to_show);
-#endif
-
-  vp9_clear_system_state();
-
-  if (cm->show_frame) {
-    vpx_memcpy(cm->prev_mip, cm->mip,
-               (cm->mb_cols + 1) * (cm->mb_rows + 1) * sizeof(MODE_INFO));
-  } else {
-    vpx_memset(cm->prev_mip, 0,
-               (cm->mb_cols + 1) * (cm->mb_rows + 1) * sizeof(MODE_INFO));
-  }
-
-  /*vp9_print_modes_and_motion_vectors(cm->mi, cm->mb_rows,cm->mb_cols,
-                                       cm->current_video_frame);*/
-
-  if (cm->show_frame)
-    cm->current_video_frame++;
-
-  pbi->ready_for_new_data = 0;
-  pbi->last_time_stamp = time_stamp;
-  pbi->source_sz = 0;
-
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
-  if (cm->rtcd.flags & HAS_NEON)
-#endif
-  {
-    vp9_pop_neon(dx_store_reg);
-  }
-#endif
-  pbi->common.error.setjmp = 0;
-  return retcode;
-}
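
The setjmp in the middle of this function is the decoder's error boundary:
vpx_internal_error longjmps back here from anywhere inside vp9_decode_frame,
so a corrupt bitstream unwinds to a clean failure instead of crashing. A
minimal standalone sketch of the pattern (names are illustrative):

    #include <setjmp.h>

    static jmp_buf decode_jmp;

    static void parse_something(int corrupt) {
      if (corrupt)
        longjmp(decode_jmp, 1);   /* deep in the parser: bail out */
    }

    static int decode_one_frame(int corrupt) {
      if (setjmp(decode_jmp))
        return -1;                /* longjmp lands here */
      parse_something(corrupt);
      return 0;                   /* normal completion */
    }
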
-
-int vp9_get_raw_frame(VP9D_PTR ptr, YV12_BUFFER_CONFIG *sd,
-                      int64_t *time_stamp, int64_t *time_end_stamp,
-                      vp9_ppflags_t *flags) {
-  int ret = -1;
-  VP9D_COMP *pbi = (VP9D_COMP *) ptr;
-
-  if (pbi->ready_for_new_data == 1)
-    return ret;
-
-  /* i.e. no raw frame to show */
-  if (pbi->common.show_frame == 0)
-    return ret;
-
-  pbi->ready_for_new_data = 1;
-  *time_stamp = pbi->last_time_stamp;
-  *time_end_stamp = 0;
-
-  sd->clrtype = pbi->common.clr_type;
-#if CONFIG_POSTPROC
-  ret = vp9_post_proc_frame(&pbi->common, sd, flags);
-#else
-
-  if (pbi->common.frame_to_show) {
-    *sd = *pbi->common.frame_to_show;
-    sd->y_width = pbi->common.Width;
-    sd->y_height = pbi->common.Height;
-    sd->uv_height = pbi->common.Height / 2;
-    ret = 0;
-  } else {
-    ret = -1;
-  }
-
-#endif /*!CONFIG_POSTPROC*/
-  vp9_clear_system_state();
-  return ret;
-}
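
Together with vp9_receive_compressed_data above, this gives the decoder a
two-call pull interface: feed one compressed frame, then ask for the raw
frame. A hedged usage sketch (VP9D_PTR creation and error handling omitted):

    /* Assumes an initialized VP9D_PTR `dec` and one coded frame in buf. */
    void decode_and_fetch(VP9D_PTR dec, const unsigned char *buf,
                          unsigned long size, int64_t pts) {
      YV12_BUFFER_CONFIG raw;
      int64_t t0, t1;
      vp9_ppflags_t flags = {0};

      if (vp9_receive_compressed_data(dec, size, buf, pts) == 0 &&
          vp9_get_raw_frame(dec, &raw, &t0, &t1, &flags) == 0) {
        /* raw.y_buffer / u_buffer / v_buffer now point at the shown frame,
         * valid until the next receive call */
      }
    }
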
--- a/vp8/decoder/onyxd_int.h
+++ /dev/null
@@ -1,106 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_ONYXD_INT_H
-#define __INC_ONYXD_INT_H
-#include "vpx_ports/config.h"
-#include "vp8/common/onyxd.h"
-#include "treereader.h"
-#include "vp8/common/onyxc_int.h"
-#include "dequantize.h"
-
-// #define DEC_DEBUG
-
-typedef struct {
-  int ithread;
-  void *ptr1;
-  void *ptr2;
-} DECODETHREAD_DATA;
-
-typedef struct {
-  MACROBLOCKD  mbd;
-  int mb_row;
-  int current_mb_col;
-  short *coef_ptr;
-} MB_ROW_DEC;
-
-typedef struct {
-  int const *scan;
-  int const *scan_8x8;
-  UINT8 const *ptr_block2leftabove;
-  vp9_tree_index const *vp9_coef_tree_ptr;
-  unsigned char *norm_ptr;
-  UINT8 *ptr_coef_bands_x;
-  UINT8 *ptr_coef_bands_x_8x8;
-
-  ENTROPY_CONTEXT_PLANES *A;
-  ENTROPY_CONTEXT_PLANES *L;
-
-  INT16 *qcoeff_start_ptr;
-
-  vp9_prob const *coef_probs[BLOCK_TYPES];
-  vp9_prob const *coef_probs_8x8[BLOCK_TYPES_8X8];
-  vp9_prob const *coef_probs_16X16[BLOCK_TYPES_16X16];
-
-  UINT8 eob[25];
-
-} DETOK;
-
-typedef struct VP9Decompressor {
-  DECLARE_ALIGNED(16, MACROBLOCKD, mb);
-
-  DECLARE_ALIGNED(16, VP9_COMMON, common);
-
-  VP9D_CONFIG oxcf;
-
-
-  const unsigned char *Source;
-  unsigned int   source_sz;
-
-  vp9_reader *mbc;
-  int64_t last_time_stamp;
-  int   ready_for_new_data;
-
-  DETOK detoken;
-
-  vp9_dequant_idct_add_fn_t            idct_add;
-  vp9_dequant_dc_idct_add_fn_t         dc_idct_add;
-  vp9_dequant_dc_idct_add_y_block_fn_t dc_idct_add_y_block;
-  vp9_dequant_idct_add_y_block_fn_t    idct_add_y_block;
-  vp9_dequant_idct_add_uv_block_fn_t   idct_add_uv_block;
-
-  vp9_prob prob_skip_false;
-
-  int decoded_key_frame;
-
-} VP9D_COMP;
-
-int vp9_decode_frame(VP9D_COMP *cpi);
-
-
-#if CONFIG_DEBUG
-#define CHECK_MEM_ERROR(lval,expr) do {\
-    lval = (expr); \
-    if(!lval) \
-      vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,\
-                         "Failed to allocate "#lval" at %s:%d", \
-                         __FILE__,__LINE__);\
-  } while(0)
-#else
-#define CHECK_MEM_ERROR(lval,expr) do {\
-    lval = (expr); \
-    if(!lval) \
-      vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,\
-                         "Failed to allocate "#lval);\
-  } while(0)
-#endif
-
-#endif  // __INC_ONYXD_INT_H
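
CHECK_MEM_ERROR expands against a VP9D_COMP *pbi that must be in scope at the
call site, and aborts decoding through vpx_internal_error (which longjmps to
the decoder's error handler) on allocation failure. A hedged usage sketch
(the buffer name and size are illustrative, not from the codec):

    /* Inside a decoder function with VP9D_COMP *pbi in scope: */
    short *row_coeffs;
    CHECK_MEM_ERROR(row_coeffs,
                    vpx_memalign(16, mb_cols * 64 * sizeof(*row_coeffs)));
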
--- a/vp8/decoder/reconintra_mt.h
+++ /dev/null
@@ -1,15 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_RECONINTRA_MT_H
-#define __INC_RECONINTRA_MT_H
-
-#endif
--- a/vp8/decoder/treereader.h
+++ /dev/null
@@ -1,37 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef tree_reader_h
-#define tree_reader_h 1
-
-#include "vp8/common/treecoder.h"
-
-#include "dboolhuff.h"
-
-typedef BOOL_DECODER vp9_reader;
-
-#define vp9_read decode_bool
-#define vp9_read_literal decode_value
-#define vp9_read_bit(R) vp9_read(R, vp9_prob_half)
-
-/* Intent of tree data structure is to make decoding trivial. */
-
-static int treed_read(vp9_reader *const r, /* !!! must return a 0 or 1 !!! */
-                      vp9_tree t,
-                      const vp9_prob *const p) {
-  register vp9_tree_index i = 0;
-
-  while ((i = t[i + vp9_read(r, p[i >> 1])]) > 0);
-
-  return -i;
-}
-
-#endif /* tree_reader_h */
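
Here a vp9_tree is a flat array of signed indices: a non-negative entry is the
offset of the next left/right child pair, and a negative entry is the negated
symbol value, which is why treed_read loops while the index stays positive. A
minimal sketch for a hypothetical 3-symbol alphabet:

    typedef signed char tree_index_t;   /* stands in for vp9_tree_index */

    /* Symbol 0 needs one bit; symbols 1 and 2 need a second bit. */
    static const tree_index_t toy_tree[4] = { 0 /* -0: symbol 0 */, 2, -1, -2 };

    /* Scalar mirror of treed_read; read_bit(prob) stands in for vp9_read. */
    static int toy_treed_read(int (*read_bit)(unsigned char),
                              const unsigned char *probs) {
      tree_index_t i = 0;
      while ((i = toy_tree[i + read_bit(probs[i >> 1])]) > 0)
        ;
      return -i;   /* leaves store the negated symbol */
    }
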
--- a/vp8/decoder/x86/dequantize_mmx.asm
+++ /dev/null
@@ -1,406 +1,0 @@
-;
-;  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-align 16
-x_s1sqr2:      times 4 dw 0x8A8C
-align 16
-x_c1sqr2less1: times 4 dw 0x4E7B
-align 16
-pw_16:         times 4 dw 16
-
-SECTION .text
-
-INIT_MMX
-
-
-;void dequantize_b_impl_mmx(short *sq, short *dq, short *q)
-cglobal dequantize_b_impl_mmx, 3,3,0,sq,dq,arg3
-    mova       m1, [sqq]
-    pmullw     m1, [arg3q+0]            ; multiply coeffs 0..3 by dequant factors
-    mova [dqq+ 0], m1
-
-    mova       m1, [sqq+8]
-    pmullw     m1, [arg3q+8]            ; multiply coeffs 4..7 by dequant factors
-    mova [dqq+ 8], m1
-
-    mova       m1, [sqq+16]
-    pmullw     m1, [arg3q+16]           ; multiply coeffs 8..11 by dequant factors
-    mova [dqq+16], m1
-
-    mova       m1, [sqq+24]
-    pmullw     m1, [arg3q+24]           ; multiply coeffs 12..15 by dequant factors
-    mova [dqq+24], m1
-    RET
-
-
-;void dequant_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride)
-cglobal dequant_idct_add_mmx, 4,6,0,inp,dq,pred,dest,pit,stride
-
-%if ARCH_X86_64
-    movsxd              strideq,  dword stridem
-    movsxd              pitq,     dword pitm
-%else
-    mov                 strideq,  stridem
-    mov                 pitq,     pitm
-%endif
-
-    mova                m0,       [inpq+ 0]
-    pmullw              m0,       [dqq]
-
-    mova                m1,       [inpq+ 8]
-    pmullw              m1,       [dqq+ 8]
-
-    mova                m2,       [inpq+16]
-    pmullw              m2,       [dqq+16]
-
-    mova                m3,       [inpq+24]
-    pmullw              m3,       [dqq+24]
-
-    pxor                m7,        m7
-    mova            [inpq],        m7
-    mova          [inpq+8],        m7
-    mova         [inpq+16],        m7
-    mova         [inpq+24],        m7
-
-
-    psubw               m0,        m2             ; b1= 0-2
-    paddw               m2,        m2             ;
-
-    mova                m5,        m1
-    paddw               m2,        m0             ; a1 =0+2
-
-    pmulhw              m5,       [x_s1sqr2];
-    paddw               m5,        m1             ; ip1 * sin(pi/8) * sqrt(2)
-
-    mova                m7,        m3             ;
-    pmulhw              m7,       [x_c1sqr2less1];
-
-    paddw               m7,        m3             ; ip3 * cos(pi/8) * sqrt(2)
-    psubw               m7,        m5             ; c1
-
-    mova                m5,        m1
-    mova                m4,        m3
-
-    pmulhw              m5,       [x_c1sqr2less1]
-    paddw               m5,        m1
-
-    pmulhw              m3,       [x_s1sqr2]
-    paddw               m3,        m4
-
-    paddw               m3,        m5             ; d1
-    mova                m6,        m2             ; a1
-
-    mova                m4,        m0             ; b1
-    paddw               m2,        m3             ;0
-
-    paddw               m4,        m7             ;1
-    psubw               m0,        m7             ;2
-
-    psubw               m6,        m3             ;3
-
-    mova                m1,        m2             ; 03 02 01 00
-    mova                m3,        m4             ; 23 22 21 20
-
-    punpcklwd           m1,        m0             ; 11 01 10 00
-    punpckhwd           m2,        m0             ; 13 03 12 02
-
-    punpcklwd           m3,        m6             ; 31 21 30 20
-    punpckhwd           m4,        m6             ; 33 23 32 22
-
-    mova                m0,        m1             ; 11 01 10 00
-    mova                m5,        m2             ; 13 03 12 02
-
-    punpckldq           m0,        m3             ; 30 20 10 00
-    punpckhdq           m1,        m3             ; 31 21 11 01
-
-    punpckldq           m2,        m4             ; 32 22 12 02
-    punpckhdq           m5,        m4             ; 33 23 13 03
-
-    mova                m3,        m5             ; 33 23 13 03
-
-    psubw               m0,        m2             ; b1= 0-2
-    paddw               m2,        m2             ;
-
-    mova                m5,        m1
-    paddw               m2,        m0             ; a1 =0+2
-
-    pmulhw              m5,       [x_s1sqr2];
-    paddw               m5,        m1             ; ip1 * sin(pi/8) * sqrt(2)
-
-    mova                m7,        m3             ;
-    pmulhw              m7,       [x_c1sqr2less1];
-
-    paddw               m7,        m3             ; ip3 * cos(pi/8) * sqrt(2)
-    psubw               m7,        m5             ; c1
-
-    mova                m5,        m1
-    mova                m4,        m3
-
-    pmulhw              m5,       [x_c1sqr2less1]
-    paddw               m5,        m1
-
-    pmulhw              m3,       [x_s1sqr2]
-    paddw               m3,        m4
-
-    paddw               m3,        m5             ; d1
-    paddw               m0,       [pw_16]
-
-    paddw               m2,       [pw_16]
-    mova                m6,        m2             ; a1
-
-    mova                m4,        m0             ; b1
-    paddw               m2,        m3             ;0
-
-    paddw               m4,        m7             ;1
-    psubw               m0,        m7             ;2
-
-    psubw               m6,        m3             ;3
-    psraw               m2,        5
-
-    psraw               m0,        5
-    psraw               m4,        5
-
-    psraw               m6,        5
-
-    mova                m1,        m2             ; 03 02 01 00
-    mova                m3,        m4             ; 23 22 21 20
-
-    punpcklwd           m1,        m0             ; 11 01 10 00
-    punpckhwd           m2,        m0             ; 13 03 12 02
-
-    punpcklwd           m3,        m6             ; 31 21 30 20
-    punpckhwd           m4,        m6             ; 33 23 32 22
-
-    mova                m0,        m1             ; 11 01 10 00
-    mova                m5,        m2             ; 13 03 12 02
-
-    punpckldq           m0,        m3             ; 30 20 10 00
-    punpckhdq           m1,        m3             ; 31 21 11 01
-
-    punpckldq           m2,        m4             ; 32 22 12 02
-    punpckhdq           m5,        m4             ; 33 23 13 03
-
-    pxor                m7,        m7
-
-    movh                m4,       [predq]
-    punpcklbw           m4,        m7
-    paddsw              m0,        m4
-    packuswb            m0,        m7
-    movh           [destq],      m0
-
-    movh                m4,       [predq+pitq]
-    punpcklbw           m4,        m7
-    paddsw              m1,        m4
-    packuswb            m1,        m7
-    movh   [destq+strideq],        m1
-
-    movh                m4,       [predq+2*pitq]
-    punpcklbw           m4,        m7
-    paddsw              m2,        m4
-    packuswb            m2,        m7
-    movh [destq+strideq*2],        m2
-
-    add              destq,        strideq
-    add              predq,        pitq
-
-    movh                m4,       [predq+2*pitq]
-    punpcklbw           m4,        m7
-    paddsw              m5,        m4
-    packuswb            m5,        m7
-    movh [destq+strideq*2],        m5
-    RET
-
-
-;void dequant_dc_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc)
-cglobal dequant_dc_idct_add_mmx, 4,7,0,inp,dq,pred,dest,pit,stride,Dc
-
-%if ARCH_X86_64
-    movsxd              strideq,   dword stridem
-    movsxd              pitq,      dword pitm
-%else
-    mov                 strideq,   stridem
-    mov                 pitq,      pitm
-%endif
-
-    mov                 Dcq, Dcm
-    mova                m0,       [inpq+ 0]
-    pmullw              m0,       [dqq+ 0]
-
-    mova                m1,       [inpq+ 8]
-    pmullw              m1,       [dqq+ 8]
-
-    mova                m2,       [inpq+16]
-    pmullw              m2,       [dqq+16]
-
-    mova                m3,       [inpq+24]
-    pmullw              m3,       [dqq+24]
-
-    pxor                m7,        m7
-    mova         [inpq+ 0],        m7
-    mova         [inpq+ 8],        m7
-    mova         [inpq+16],        m7
-    mova         [inpq+24],        m7
-
-    ; move lower word of Dc to lower word of m0
-    psrlq               m0,        16
-    psllq               m0,        16
-    and                Dcq,        0xFFFF         ; If Dc < 0, we don't want the full dword precision.
-    movh                m7,        Dcq
-    por                 m0,        m7
-    psubw               m0,        m2             ; b1= 0-2
-    paddw               m2,        m2             ;
-
-    mova                m5,        m1
-    paddw               m2,        m0             ; a1 =0+2
-
-    pmulhw              m5,       [x_s1sqr2];
-    paddw               m5,        m1             ; ip1 * sin(pi/8) * sqrt(2)
-
-    mova                m7,        m3             ;
-    pmulhw              m7,       [x_c1sqr2less1];
-
-    paddw               m7,        m3             ; ip3 * cos(pi/8) * sqrt(2)
-    psubw               m7,        m5             ; c1
-
-    mova                m5,        m1
-    mova                m4,        m3
-
-    pmulhw              m5,       [x_c1sqr2less1]
-    paddw               m5,        m1
-
-    pmulhw              m3,       [x_s1sqr2]
-    paddw               m3,        m4
-
-    paddw               m3,        m5             ; d1
-    mova                m6,        m2             ; a1
-
-    mova                m4,        m0             ; b1
-    paddw               m2,        m3             ;0
-
-    paddw               m4,        m7             ;1
-    psubw               m0,        m7             ;2
-
-    psubw               m6,        m3             ;3
-
-    mova                m1,        m2             ; 03 02 01 00
-    mova                m3,        m4             ; 23 22 21 20
-
-    punpcklwd           m1,        m0             ; 11 01 10 00
-    punpckhwd           m2,        m0             ; 13 03 12 02
-
-    punpcklwd           m3,        m6             ; 31 21 30 20
-    punpckhwd           m4,        m6             ; 33 23 32 22
-
-    mova                m0,        m1             ; 11 01 10 00
-    mova                m5,        m2             ; 13 03 12 02
-
-    punpckldq           m0,        m3             ; 30 20 10 00
-    punpckhdq           m1,        m3             ; 31 21 11 01
-
-    punpckldq           m2,        m4             ; 32 22 12 02
-    punpckhdq           m5,        m4             ; 33 23 13 03
-
-    mova                m3,        m5             ; 33 23 13 03
-
-    psubw               m0,        m2             ; b1= 0-2
-    paddw               m2,        m2             ;
-
-    mova                m5,        m1
-    paddw               m2,        m0             ; a1 =0+2
-
-    pmulhw              m5,       [x_s1sqr2];
-    paddw               m5,        m1             ; ip1 * sin(pi/8) * sqrt(2)
-
-    mova                m7,        m3             ;
-    pmulhw              m7,       [x_c1sqr2less1];
-
-    paddw               m7,        m3             ; ip3 * cos(pi/8) * sqrt(2)
-    psubw               m7,        m5             ; c1
-
-    mova                m5,        m1
-    mova                m4,        m3
-
-    pmulhw              m5,       [x_c1sqr2less1]
-    paddw               m5,        m1
-
-    pmulhw              m3,       [x_s1sqr2]
-    paddw               m3,        m4
-
-    paddw               m3,        m5             ; d1
-    paddw               m0,       [pw_16]
-
-    paddw               m2,       [pw_16]
-    mova                m6,        m2             ; a1
-
-    mova                m4,        m0             ; b1
-    paddw               m2,        m3             ;0
-
-    paddw               m4,        m7             ;1
-    psubw               m0,        m7             ;2
-
-    psubw               m6,        m3             ;3
-    psraw               m2,        5
-
-    psraw               m0,        5
-    psraw               m4,        5
-
-    psraw               m6,        5
-
-    mova                m1,        m2             ; 03 02 01 00
-    mova                m3,        m4             ; 23 22 21 20
-
-    punpcklwd           m1,        m0             ; 11 01 10 00
-    punpckhwd           m2,        m0             ; 13 03 12 02
-
-    punpcklwd           m3,        m6             ; 31 21 30 20
-    punpckhwd           m4,        m6             ; 33 23 32 22
-
-    mova                m0,        m1             ; 11 01 10 00
-    mova                m5,        m2             ; 13 03 12 02
-
-    punpckldq           m0,        m3             ; 30 20 10 00
-    punpckhdq           m1,        m3             ; 31 21 11 01
-
-    punpckldq           m2,        m4             ; 32 22 12 02
-    punpckhdq           m5,        m4             ; 33 23 13 03
-
-    pxor                m7,        m7
-
-    movh                m4,       [predq]
-    punpcklbw           m4,        m7
-    paddsw              m0,        m4
-    packuswb            m0,        m7
-    movh           [destq],        m0
-
-    movh                m4,       [predq+pitq]
-    punpcklbw           m4,        m7
-    paddsw              m1,        m4
-    packuswb            m1,        m7
-    movh   [destq+strideq],        m1
-
-    movh                m4,       [predq+2*pitq]
-    punpcklbw           m4,        m7
-    paddsw              m2,        m4
-    packuswb            m2,        m7
-    movh [destq+strideq*2],        m2
-
-    add              destq,        strideq
-    add              predq,        pitq
-
-    movh                m4,       [predq+2*pitq]
-    punpcklbw           m4,        m7
-    paddsw              m5,        m4
-    packuswb            m5,        m7
-    movh [destq+strideq*2],        m5
-    RET
-
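
The scalar operation the MMX routine vectorizes is just an elementwise
multiply over the 16 coefficients of one 4x4 block. A plain-C equivalent of
dequantize_b_impl_mmx:

    /* C reference: dq[i] = sq[i] * q[i] for one 4x4 block. */
    static void dequantize_b_c(const short *sq, short *dq, const short *q) {
      int i;
      for (i = 0; i < 16; i++)
        dq[i] = sq[i] * q[i];
    }
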
--- a/vp8/decoder/x86/idct_blk_mmx.c
+++ /dev/null
@@ -1,143 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_ports/config.h"
-#include "vp8/common/idct.h"
-#include "vp8/decoder/dequantize.h"
-
-void vp9_dequant_dc_idct_add_y_block_mmx(short *q, short *dq,
-                                         unsigned char *pre,
-                                         unsigned char *dst,
-                                         int stride, char *eobs, short *dc) {
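-  /* eobs[j] is the coefficient count for 4x4 block j: 0 or 1 means at most
-   * the DC term is present, so the cheap DC-only path suffices. */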
-  int i;
-
-  for (i = 0; i < 4; i++) {
-    if (eobs[0] > 1)
-      vp9_dequant_dc_idct_add_mmx(q, dq, pre, dst, 16, stride, dc[0]);
-    else
-      vp9_dc_only_idct_add_mmx(dc[0], pre, dst, 16, stride);
-
-    if (eobs[1] > 1)
-      vp9_dequant_dc_idct_add_mmx(q + 16, dq, pre + 4,
-                                  dst + 4, 16, stride, dc[1]);
-    else
-      vp9_dc_only_idct_add_mmx(dc[1], pre + 4, dst + 4, 16, stride);
-
-    if (eobs[2] > 1)
-      vp9_dequant_dc_idct_add_mmx(q + 32, dq, pre + 8,
-                                  dst + 8, 16, stride, dc[2]);
-    else
-      vp9_dc_only_idct_add_mmx(dc[2], pre + 8, dst + 8, 16, stride);
-
-    if (eobs[3] > 1)
-      vp9_dequant_dc_idct_add_mmx(q + 48, dq, pre + 12,
-                                  dst + 12, 16, stride, dc[3]);
-    else
-      vp9_dc_only_idct_add_mmx(dc[3], pre + 12, dst + 12, 16, stride);
-
-    q    += 64;
-    dc   += 4;
-    pre  += 64;
-    dst  += 4 * stride;
-    eobs += 4;
-  }
-}
-
-void vp9_dequant_idct_add_y_block_mmx(short *q, short *dq,
-                                      unsigned char *pre,
-                                      unsigned char *dst,
-                                      int stride, char *eobs) {
-  int i;
-
-  for (i = 0; i < 4; i++) {
-    if (eobs[0] > 1)
-      vp9_dequant_idct_add_mmx(q, dq, pre, dst, 16, stride);
-    else {
-      vp9_dc_only_idct_add_mmx(q[0]*dq[0], pre, dst, 16, stride);
-      ((int *)q)[0] = 0;
-    }
-
-    if (eobs[1] > 1)
-      vp9_dequant_idct_add_mmx(q + 16, dq, pre + 4, dst + 4, 16, stride);
-    else {
-      vp9_dc_only_idct_add_mmx(q[16]*dq[0], pre + 4, dst + 4, 16, stride);
-      ((int *)(q + 16))[0] = 0;
-    }
-
-    if (eobs[2] > 1)
-      vp9_dequant_idct_add_mmx(q + 32, dq, pre + 8, dst + 8, 16, stride);
-    else {
-      vp9_dc_only_idct_add_mmx(q[32]*dq[0], pre + 8, dst + 8, 16, stride);
-      ((int *)(q + 32))[0] = 0;
-    }
-
-    if (eobs[3] > 1)
-      vp9_dequant_idct_add_mmx(q + 48, dq, pre + 12, dst + 12, 16, stride);
-    else {
-      vp9_dc_only_idct_add_mmx(q[48]*dq[0], pre + 12, dst + 12, 16, stride);
-      ((int *)(q + 48))[0] = 0;
-    }
-
-    q    += 64;
-    pre  += 64;
-    dst  += 4 * stride;
-    eobs += 4;
-  }
-}
-
-void vp9_dequant_idct_add_uv_block_mmx(short *q, short *dq,
-                                       unsigned char *pre,
-                                       unsigned char *dstu,
-                                       unsigned char *dstv,
-                                       int stride, char *eobs) {
-  int i;
-
-  for (i = 0; i < 2; i++) {
-    if (eobs[0] > 1)
-      vp9_dequant_idct_add_mmx(q, dq, pre, dstu, 8, stride);
-    else {
-      vp9_dc_only_idct_add_mmx(q[0]*dq[0], pre, dstu, 8, stride);
-      ((int *)q)[0] = 0;
-    }
-
-    if (eobs[1] > 1)
-      vp9_dequant_idct_add_mmx(q + 16, dq, pre + 4, dstu + 4, 8, stride);
-    else {
-      vp9_dc_only_idct_add_mmx(q[16]*dq[0], pre + 4, dstu + 4, 8, stride);
-      ((int *)(q + 16))[0] = 0;
-    }
-
-    q    += 32;
-    pre  += 32;
-    dstu += 4 * stride;
-    eobs += 2;
-  }
-
-  for (i = 0; i < 2; i++) {
-    if (eobs[0] > 1)
-      vp9_dequant_idct_add_mmx(q, dq, pre, dstv, 8, stride);
-    else {
-      vp9_dc_only_idct_add_mmx(q[0]*dq[0], pre, dstv, 8, stride);
-      ((int *)q)[0] = 0;
-    }
-
-    if (eobs[1] > 1)
-      vp9_dequant_idct_add_mmx(q + 16, dq, pre + 4, dstv + 4, 8, stride);
-    else {
-      vp9_dc_only_idct_add_mmx(q[16]*dq[0], pre + 4, dstv + 4, 8, stride);
-      ((int *)(q + 16))[0] = 0;
-    }
-
-    q    += 32;
-    pre  += 32;
-    dstv += 4 * stride;
-    eobs += 2;
-  }
-}
--- a/vp8/decoder/x86/idct_blk_sse2.c
+++ /dev/null
@@ -1,116 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_ports/config.h"
-#include "vp8/common/idct.h"
-#include "vp8/decoder/dequantize.h"
-
-void vp9_idct_dequant_dc_0_2x_sse2(short *q, short *dq,
-                                   unsigned char *pre, unsigned char *dst,
-                                   int dst_stride, short *dc);
-
-void vp9_idct_dequant_dc_full_2x_sse2(short *q, short *dq,
-                                      unsigned char *pre, unsigned char *dst,
-                                      int dst_stride, short *dc);
-
-void vp9_idct_dequant_0_2x_sse2(short *q, short *dq,
-                                unsigned char *pre, unsigned char *dst,
-                                int dst_stride, int blk_stride);
-
-void vp9_idct_dequant_full_2x_sse2(short *q, short *dq,
-                                   unsigned char *pre, unsigned char *dst,
-                                   int dst_stride, int blk_stride);
-
-void vp9_dequant_dc_idct_add_y_block_sse2(short *q, short *dq,
-                                          unsigned char *pre,
-                                          unsigned char *dst,
-                                          int stride, char *eobs, short *dc) {
-  int i;
-
-  for (i = 0; i < 4; i++) {
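-    /* Two adjacent char eobs are read as one short below; the 0xfefe mask
-     * is non-zero iff either byte exceeds 1, i.e. either of the paired
-     * 4x4 blocks has coefficients beyond DC. */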
-    if (((short *)(eobs))[0] & 0xfefe)
-      vp9_idct_dequant_dc_full_2x_sse2(q, dq, pre, dst, stride, dc);
-    else
-      vp9_idct_dequant_dc_0_2x_sse2(q, dq, pre, dst, stride, dc);
-
-    if (((short *)(eobs))[1] & 0xfefe)
-      vp9_idct_dequant_dc_full_2x_sse2(q + 32, dq, pre + 8, dst + 8,
-                                       stride, dc + 2);
-    else
-      vp9_idct_dequant_dc_0_2x_sse2(q + 32, dq, pre + 8, dst + 8,
-                                    stride, dc + 2);
-
-    q    += 64;
-    dc   += 4;
-    pre  += 64;
-    dst  += stride * 4;
-    eobs += 4;
-  }
-}
-
-void vp9_dequant_idct_add_y_block_sse2(short *q, short *dq,
-                                       unsigned char *pre, unsigned char *dst,
-                                       int stride, char *eobs) {
-  int i;
-
-  for (i = 0; i < 4; i++) {
-    if (((short *)(eobs))[0] & 0xfefe)
-      vp9_idct_dequant_full_2x_sse2(q, dq, pre, dst, stride, 16);
-    else
-      vp9_idct_dequant_0_2x_sse2(q, dq, pre, dst, stride, 16);
-
-    if (((short *)(eobs))[1] & 0xfefe)
-      vp9_idct_dequant_full_2x_sse2(q + 32, dq, pre + 8, dst + 8, stride, 16);
-    else
-      vp9_idct_dequant_0_2x_sse2(q + 32, dq, pre + 8, dst + 8, stride, 16);
-
-    q    += 64;
-    pre  += 64;
-    dst  += stride * 4;
-    eobs += 4;
-  }
-}
-
-void vp9_dequant_idct_add_uv_block_sse2(short *q, short *dq,
-                                        unsigned char *pre,
-                                        unsigned char *dstu,
-                                        unsigned char *dstv,
-                                        int stride, char *eobs) {
-  if (((short *)(eobs))[0] & 0xfefe)
-    vp9_idct_dequant_full_2x_sse2(q, dq, pre, dstu, stride, 8);
-  else
-    vp9_idct_dequant_0_2x_sse2(q, dq, pre, dstu, stride, 8);
-
-  q    += 32;
-  pre  += 32;
-  dstu += stride * 4;
-
-  if (((short *)(eobs))[1] & 0xfefe)
-    vp9_idct_dequant_full_2x_sse2(q, dq, pre, dstu, stride, 8);
-  else
-    vp9_idct_dequant_0_2x_sse2(q, dq, pre, dstu, stride, 8);
-
-  q    += 32;
-  pre  += 32;
-
-  if (((short *)(eobs))[2] & 0xfefe)
-    vp9_idct_dequant_full_2x_sse2(q, dq, pre, dstv, stride, 8);
-  else
-    vp9_idct_dequant_0_2x_sse2(q, dq, pre, dstv, stride, 8);
-
-  q    += 32;
-  pre  += 32;
-  dstv += stride * 4;
-
-  if (((short *)(eobs))[3] & 0xfefe)
-    vp9_idct_dequant_full_2x_sse2(q, dq, pre, dstv, stride, 8);
-  else
-    vp9_idct_dequant_0_2x_sse2(q, dq, pre, dstv, stride, 8);
-}
--- a/vp8/decoder/x86/x86_dsystemdependent.c
+++ /dev/null
@@ -1,26 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_ports/config.h"
-#include "vpx_ports/x86.h"
-#include "vp8/decoder/onyxd_int.h"
-
-#if HAVE_MMX
-void vp9_dequantize_b_impl_mmx(short *sq, short *dq, short *q);
-
-void vp9_dequantize_b_mmx(BLOCKD *d) {
-  short *sq = (short *) d->qcoeff;
-  short *dq = (short *) d->dqcoeff;
-  short *q = (short *) d->dequant;
-  vp9_dequantize_b_impl_mmx(sq, dq, q);
-}
-#endif
-
-
--- a/vp8/encoder/arm/arm_csystemdependent.c
+++ /dev/null
@@ -1,129 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "vpx_ports/arm.h"
-#include "vp8/encoder/variance.h"
-#include "vp8/encoder/onyx_int.h"
-
-extern void (*vp9_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
-extern void vp9_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
-extern void vpxyv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
-
-void vp9_arch_arm_encoder_init(VP9_COMP *cpi) {
-#if CONFIG_RUNTIME_CPU_DETECT
-  int flags = cpi->common.rtcd.flags;
-
-#if HAVE_ARMV5TE
-  if (flags & HAS_EDSP) {
-  }
-#endif
-
-#if HAVE_ARMV6
-  if (flags & HAS_MEDIA) {
-    cpi->rtcd.variance.sad16x16              = vp9_sad16x16_armv6;
-    /*cpi->rtcd.variance.sad16x8               = vp9_sad16x8_c;
-    cpi->rtcd.variance.sad8x16               = vp9_sad8x16_c;
-    cpi->rtcd.variance.sad8x8                = vp9_sad8x8_c;
-    cpi->rtcd.variance.sad4x4                = vp9_sad4x4_c;*/
-
-    /*cpi->rtcd.variance.var4x4                = vp9_variance4x4_c;*/
-    cpi->rtcd.variance.var8x8                = vp9_variance8x8_armv6;
-    /*cpi->rtcd.variance.var8x16               = vp9_variance8x16_c;
-    cpi->rtcd.variance.var16x8               = vp9_variance16x8_c;*/
-    cpi->rtcd.variance.var16x16              = vp9_variance16x16_armv6;
-
-    /*cpi->rtcd.variance.subpixvar4x4          = vp9_sub_pixel_variance4x4_c;*/
-    cpi->rtcd.variance.subpixvar8x8          = vp9_sub_pixel_variance8x8_armv6;
-    /*cpi->rtcd.variance.subpixvar8x16         = vp9_sub_pixel_variance8x16_c;
-    cpi->rtcd.variance.subpixvar16x8         = vp9_sub_pixel_variance16x8_c;*/
-    cpi->rtcd.variance.subpixvar16x16        = vp9_sub_pixel_variance16x16_armv6;
-    cpi->rtcd.variance.halfpixvar16x16_h     = vp9_variance_halfpixvar16x16_h_armv6;
-    cpi->rtcd.variance.halfpixvar16x16_v     = vp9_variance_halfpixvar16x16_v_armv6;
-    cpi->rtcd.variance.halfpixvar16x16_hv    = vp9_variance_halfpixvar16x16_hv_armv6;
-
-    cpi->rtcd.variance.mse16x16              = vp9_mse16x16_armv6;
-    /*cpi->rtcd.variance.getmbss               = vp9_get_mb_ss_c;*/
-
-    cpi->rtcd.fdct.short4x4                  = vp9_short_fdct4x4_armv6;
-    cpi->rtcd.fdct.short8x4                  = vp9_short_fdct8x4_armv6;
-    cpi->rtcd.fdct.fast4x4                   = vp9_short_fdct4x4_armv6;
-    cpi->rtcd.fdct.fast8x4                   = vp9_short_fdct8x4_armv6;
-    cpi->rtcd.fdct.walsh_short4x4            = vp9_short_walsh4x4_armv6;
-
-    /*cpi->rtcd.encodemb.berr                  = vp9_block_error_c;
-    cpi->rtcd.encodemb.mberr                 = vp9_mbblock_error_c;
-    cpi->rtcd.encodemb.mbuverr               = vp9_mbuverror_c;*/
-    cpi->rtcd.encodemb.subb                  = vp9_subtract_b_armv6;
-    cpi->rtcd.encodemb.submby                = vp9_subtract_mby_armv6;
-    cpi->rtcd.encodemb.submbuv               = vp9_subtract_mbuv_armv6;
-
-    /*cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b;*/
-    cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_armv6;
-  }
-#endif
-
-#if HAVE_ARMV7
-  if (flags & HAS_NEON) {
-    cpi->rtcd.variance.sad16x16              = vp9_sad16x16_neon;
-    cpi->rtcd.variance.sad16x8               = vp9_sad16x8_neon;
-    cpi->rtcd.variance.sad8x16               = vp9_sad8x16_neon;
-    cpi->rtcd.variance.sad8x8                = vp9_sad8x8_neon;
-    cpi->rtcd.variance.sad4x4                = vp9_sad4x4_neon;
-
-    /*cpi->rtcd.variance.var4x4                = vp9_variance4x4_c;*/
-    cpi->rtcd.variance.var8x8                = vp9_variance8x8_neon;
-    cpi->rtcd.variance.var8x16               = vp9_variance8x16_neon;
-    cpi->rtcd.variance.var16x8               = vp9_variance16x8_neon;
-    cpi->rtcd.variance.var16x16              = vp9_variance16x16_neon;
-
-    /*cpi->rtcd.variance.subpixvar4x4          = vp9_sub_pixel_variance4x4_c;*/
-    cpi->rtcd.variance.subpixvar8x8          = vp9_sub_pixel_variance8x8_neon;
-    /*cpi->rtcd.variance.subpixvar8x16         = vp9_sub_pixel_variance8x16_c;
-    cpi->rtcd.variance.subpixvar16x8         = vp9_sub_pixel_variance16x8_c;*/
-    cpi->rtcd.variance.subpixvar16x16        = vp9_sub_pixel_variance16x16_neon;
-    cpi->rtcd.variance.halfpixvar16x16_h     = vp9_variance_halfpixvar16x16_h_neon;
-    cpi->rtcd.variance.halfpixvar16x16_v     = vp9_variance_halfpixvar16x16_v_neon;
-    cpi->rtcd.variance.halfpixvar16x16_hv    = vp9_variance_halfpixvar16x16_hv_neon;
-
-    cpi->rtcd.variance.mse16x16              = vp9_mse16x16_neon;
-    /*cpi->rtcd.variance.getmbss               = vp9_get_mb_ss_c;*/
-
-    cpi->rtcd.fdct.short4x4                  = vp9_short_fdct4x4_neon;
-    cpi->rtcd.fdct.short8x4                  = vp9_short_fdct8x4_neon;
-    cpi->rtcd.fdct.fast4x4                   = vp9_short_fdct4x4_neon;
-    cpi->rtcd.fdct.fast8x4                   = vp9_short_fdct8x4_neon;
-    cpi->rtcd.fdct.walsh_short4x4            = vp9_short_walsh4x4_neon;
-
-    /*cpi->rtcd.encodemb.berr                  = vp9_block_error_c;
-    cpi->rtcd.encodemb.mberr                 = vp9_mbblock_error_c;
-    cpi->rtcd.encodemb.mbuverr               = vp9_mbuverror_c;*/
-    cpi->rtcd.encodemb.subb                  = vp9_subtract_b_neon;
-    cpi->rtcd.encodemb.submby                = vp9_subtract_mby_neon;
-    cpi->rtcd.encodemb.submbuv               = vp9_subtract_mbuv_neon;
-
-    /*cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b;
-    cpi->rtcd.quantize.quantb_pair           = vp8_regular_quantize_b_pair;*/
-    cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_neon;
-    cpi->rtcd.quantize.fastquantb_pair       = vp8_fast_quantize_b_pair_neon;
-  }
-#endif
-
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
-  if (flags & HAS_NEON)
-#endif
-  {
-    vp9_yv12_copy_partial_frame_ptr = vpxyv12_copy_partial_frame_neon;
-  }
-#endif
-#endif
-}
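
The pattern here is probe once, dispatch forever: CPU feature flags are read
at init time and the rtcd table is patched with the fastest available
implementation, so hot loops pay only an indirect call. A stripped-down sketch
of the same idea (the flag value and function names are illustrative):

    #define HAS_NEON 0x04   /* illustrative flag bit */

    typedef unsigned (*sad16x16_fn)(const unsigned char *a,
                                    const unsigned char *b);

    static unsigned sad16x16_c(const unsigned char *a, const unsigned char *b) {
      unsigned s = 0;
      int i;
      for (i = 0; i < 256; i++)   /* 16x16 block, laid out contiguously here */
        s += a[i] > b[i] ? a[i] - b[i] : b[i] - a[i];
      return s;
    }

    /* A real build would point this at a NEON routine; a stub keeps the
     * sketch self-contained. */
    static unsigned sad16x16_neon(const unsigned char *a,
                                  const unsigned char *b) {
      return sad16x16_c(a, b);
    }

    struct rtcd { sad16x16_fn sad16x16; };

    static void rtcd_init(struct rtcd *t, int cpu_flags) {
      t->sad16x16 = sad16x16_c;        /* safe default */
      if (cpu_flags & HAS_NEON)
        t->sad16x16 = sad16x16_neon;   /* override when the unit is present */
    }
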
--- a/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm
+++ /dev/null
@@ -1,286 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT |vp8_start_encode|
-    EXPORT |vp9_encode_bool|
-    EXPORT |vp8_stop_encode|
-    EXPORT |vp8_encode_value|
-
-    INCLUDE asm_enc_offsets.asm
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA    |.text|, CODE, READONLY
-
-; r0 BOOL_CODER *br
-; r1 unsigned char *source
-
-|vp8_start_encode| PROC
-    mov     r12, #0
-    mov     r3,  #255
-    mvn     r2,  #23
-    str     r12, [r0, #vp9_writer_lowvalue]
-    str     r3,  [r0, #vp9_writer_range]
-    str     r12, [r0, #vp9_writer_value]
-    str     r2,  [r0, #vp9_writer_count]
-    str     r12, [r0, #vp9_writer_pos]
-    str     r1,  [r0, #vp9_writer_buffer]
-    bx      lr
-    ENDP
-
-; r0 BOOL_CODER *br
-; r1 int bit
-; r2 int probability
-|vp9_encode_bool| PROC
-    push    {r4-r9, lr}
-
-    mov     r4, r2
-
-    ldr     r2, [r0, #vp9_writer_lowvalue]
-    ldr     r5, [r0, #vp9_writer_range]
-    ldr     r3, [r0, #vp9_writer_count]
-
-    sub     r7, r5, #1                  ; range-1
-
-    cmp     r1, #0
-    mul     r6, r4, r7                  ; ((range-1) * probability)
-
-    mov     r7, #1
-    add     r4, r7, r6, lsr #8          ; 1 + (((range-1) * probability) >> 8)
-
-    addne   r2, r2, r4                  ; if  (bit) lowvalue += split
-    subne   r4, r5, r4                  ; if  (bit) range = range-split
-
-    ; Counting the leading zeros is used to normalize range.
-    clz     r6, r4
-    sub     r6, r6, #24                 ; shift
-
-    ; The adds instruction sets the flags from the new count; the
-    ; sign flag is used below to test whether count >= 0
-    adds    r3, r3, r6                  ; count += shift
-    lsl     r5, r4, r6                  ; range <<= shift
-    bmi     token_count_lt_zero         ; if(count >= 0)
-
-    sub     r6, r6, r3                  ; offset = shift - count
-    sub     r4, r6, #1                  ; offset-1
-    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
-    bpl     token_high_bit_not_set
-
-    ldr     r4, [r0, #vp9_writer_pos]   ; x
-    sub     r4, r4, #1                  ; x = w->pos-1
-    b       token_zero_while_start
-token_zero_while_loop
-    mov     r9, #0
-    strb    r9, [r7, r4]                ; w->buffer[x] =(unsigned char)0
-    sub     r4, r4, #1                  ; x--
-token_zero_while_start
-    cmp     r4, #0
-    ldrge   r7, [r0, #vp9_writer_buffer]
-    ldrb    r1, [r7, r4]
-    cmpge   r1, #0xff
-    beq     token_zero_while_loop
-
-    ldr     r7, [r0, #vp9_writer_buffer]
-    ldrb    r9, [r7, r4]                ; w->buffer[x]
-    add     r9, r9, #1
-    strb    r9, [r7, r4]                ; w->buffer[x] + 1
-token_high_bit_not_set
-    rsb     r4, r6, #24                 ; 24-offset
-    ldr     r9, [r0, #vp9_writer_buffer]
-    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
-    ldr     r4, [r0, #vp9_writer_pos]   ; w->pos
-    lsl     r2, r2, r6                  ; lowvalue <<= offset
-    mov     r6, r3                      ; shift = count
-    add     r1, r4, #1                  ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r1, [r0, #vp9_writer_pos]
-    sub     r3, r3, #8                  ; count -= 8
-    strb    r7, [r9, r4]                ; w->buffer[w->pos++]
-
-token_count_lt_zero
-    lsl     r2, r2, r6                  ; lowvalue <<= shift
-
-    str     r2, [r0, #vp9_writer_lowvalue]
-    str     r5, [r0, #vp9_writer_range]
-    str     r3, [r0, #vp9_writer_count]
-    pop     {r4-r9, pc}
-    ENDP
-
-; r0 BOOL_CODER *br
-|vp8_stop_encode| PROC
-    push    {r4-r10, lr}
-
-    ldr     r2, [r0, #vp9_writer_lowvalue]
-    ldr     r5, [r0, #vp9_writer_range]
-    ldr     r3, [r0, #vp9_writer_count]
-
-    mov     r10, #32
-
-stop_encode_loop
-    sub     r7, r5, #1                  ; range-1
-
-    mov     r4, r7, lsl #7              ; ((range-1) * 128)
-
-    mov     r7, #1
-    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * 128) >> 8)
-
-    ; Counting the leading zeros is used to normalize range.
-    clz     r6, r4
-    sub     r6, r6, #24                 ; shift
-
-    ; The adds instruction sets the flags from the new count; the
-    ; sign flag is used below to test whether count >= 0
-    adds    r3, r3, r6                  ; count += shift
-    lsl     r5, r4, r6                  ; range <<= shift
-    bmi     token_count_lt_zero_se      ; if(count >= 0)
-
-    sub     r6, r6, r3                  ; offset = shift - count
-    sub     r4, r6, #1                  ; offset-1
-    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
-    bpl     token_high_bit_not_set_se
-
-    ldr     r4, [r0, #vp9_writer_pos]   ; x
-    sub     r4, r4, #1                  ; x = w->pos-1
-    b       token_zero_while_start_se
-token_zero_while_loop_se
-    mov     r9, #0
-    strb    r9, [r7, r4]                ; w->buffer[x] =(unsigned char)0
-    sub     r4, r4, #1                  ; x--
-token_zero_while_start_se
-    cmp     r4, #0
-    ldrge   r7, [r0, #vp9_writer_buffer]
-    ldrb    r1, [r7, r4]
-    cmpge   r1, #0xff
-    beq     token_zero_while_loop_se
-
-    ldr     r7, [r0, #vp9_writer_buffer]
-    ldrb    r9, [r7, r4]                ; w->buffer[x]
-    add     r9, r9, #1
-    strb    r9, [r7, r4]                ; w->buffer[x] + 1
-token_high_bit_not_set_se
-    rsb     r4, r6, #24                 ; 24-offset
-    ldr     r9, [r0, #vp9_writer_buffer]
-    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
-    ldr     r4, [r0, #vp9_writer_pos]   ; w->pos
-    lsl     r2, r2, r6                  ; lowvalue <<= offset
-    mov     r6, r3                      ; shift = count
-    add     r1, r4, #1                  ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r1, [r0, #vp9_writer_pos]
-    sub     r3, r3, #8                  ; count -= 8
-    strb    r7, [r9, r4]                ; w->buffer[w->pos++]
-
-token_count_lt_zero_se
-    lsl     r2, r2, r6                  ; lowvalue <<= shift
-
-    subs    r10, r10, #1
-    bne     stop_encode_loop
-
-    str     r2, [r0, #vp9_writer_lowvalue]
-    str     r5, [r0, #vp9_writer_range]
-    str     r3, [r0, #vp9_writer_count]
-    pop     {r4-r10, pc}
-
-    ENDP
-
-; r0 BOOL_CODER *br
-; r1 int data
-; r2 int bits
-|vp8_encode_value| PROC
-    push    {r4-r11, lr}
-
-    mov     r10, r2
-
-    ldr     r2, [r0, #vp9_writer_lowvalue]
-    ldr     r5, [r0, #vp9_writer_range]
-    ldr     r3, [r0, #vp9_writer_count]
-
-    rsb     r4, r10, #32                 ; 32-n
-
-    ; v is kept in r1 during the token pack loop
-    lsl     r1, r1, r4                  ; r1 = v << 32 - n
-
-encode_value_loop
-    sub     r7, r5, #1                  ; range-1
-
-    ; Decisions are made based on the bit value shifted
-    ; off of v, so set a flag here based on this.
-    ; This value is referred to as "bb"
-    lsls    r1, r1, #1                  ; bit = v >> n
-    mov     r4, r7, lsl #7              ; ((range-1) * 128)
-
-    mov     r7, #1
-    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * 128) >> 8)
-
-    addcs   r2, r2, r4                  ; if  (bit) lowvalue += split
-    subcs   r4, r5, r4                  ; if  (bit) range = range-split
-
-    ; Counting the leading zeros is used to normalize range.
-    clz     r6, r4
-    sub     r6, r6, #24                 ; shift
-
-    ; The adds instruction sets the flags from the new count; the
-    ; sign flag is used below to test whether count >= 0
-    adds    r3, r3, r6                  ; count += shift
-    lsl     r5, r4, r6                  ; range <<= shift
-    bmi     token_count_lt_zero_ev      ; if(count >= 0)
-
-    sub     r6, r6, r3                  ; offset = shift - count
-    sub     r4, r6, #1                  ; offset-1
-    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
-    bpl     token_high_bit_not_set_ev
-
-    ldr     r4, [r0, #vp9_writer_pos]   ; x
-    sub     r4, r4, #1                  ; x = w->pos-1
-    b       token_zero_while_start_ev
-token_zero_while_loop_ev
-    mov     r9, #0
-    strb    r9, [r7, r4]                ; w->buffer[x] =(unsigned char)0
-    sub     r4, r4, #1                  ; x--
-token_zero_while_start_ev
-    cmp     r4, #0
-    ldrge   r7, [r0, #vp9_writer_buffer]
-    ldrb    r11, [r7, r4]
-    cmpge   r11, #0xff
-    beq     token_zero_while_loop_ev
-
-    ldr     r7, [r0, #vp9_writer_buffer]
-    ldrb    r9, [r7, r4]                ; w->buffer[x]
-    add     r9, r9, #1
-    strb    r9, [r7, r4]                ; w->buffer[x] + 1
-token_high_bit_not_set_ev
-    rsb     r4, r6, #24                 ; 24-offset
-    ldr     r9, [r0, #vp9_writer_buffer]
-    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
-    ldr     r4, [r0, #vp9_writer_pos]   ; w->pos
-    lsl     r2, r2, r6                  ; lowvalue <<= offset
-    mov     r6, r3                      ; shift = count
-    add     r11, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r11, [r0, #vp9_writer_pos]
-    sub     r3, r3, #8                  ; count -= 8
-    strb    r7, [r9, r4]                ; w->buffer[w->pos++]
-
-token_count_lt_zero_ev
-    lsl     r2, r2, r6                  ; lowvalue <<= shift
-
-    subs    r10, r10, #1
-    bne     encode_value_loop
-
-    str     r2, [r0, #vp9_writer_lowvalue]
-    str     r5, [r0, #vp9_writer_range]
-    str     r3, [r0, #vp9_writer_count]
-    pop     {r4-r11, pc}
-    ENDP
-
-    END
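
Behind the register scheduling above sits a short arithmetic core: split the
current range in proportion to the bit's probability, keep one half, then
shift range back into [128, 255] (the step the clz computes in a single
instruction). A scalar sketch of the encoder, with the byte-output and carry
walk (the 0xff loops above) elided:

    typedef struct {
      unsigned lowvalue;
      unsigned range;   /* kept in [128, 255] between calls */
      int count;        /* bits until the next byte of lowvalue is flushed */
    } bool_writer_t;

    /* Encode one bit; prob/256 is the probability of the bit being 0. */
    static void encode_bool_sketch(bool_writer_t *w, int bit, int prob) {
      unsigned split = 1 + (((w->range - 1) * prob) >> 8);

      if (bit) {
        w->lowvalue += split;   /* take the upper sub-interval */
        w->range    -= split;
      } else {
        w->range = split;       /* take the lower sub-interval */
      }

      while (w->range < 128) {  /* renormalize */
        w->range    <<= 1;
        w->lowvalue <<= 1;
        if (++w->count == 0) {
          /* a byte of lowvalue would be emitted here, with carry
           * propagation into already-written bytes */
          w->count = -8;
          w->lowvalue &= 0xffffff;
        }
      }
    }
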
--- a/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm
+++ /dev/null
@@ -1,291 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT |vp8cx_pack_tokens_armv5|
-
-    INCLUDE asm_enc_offsets.asm
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA    |.text|, CODE, READONLY
-
-; r0 vp9_writer *w
-; r1 const TOKENEXTRA *p
-; r2 int xcount
-; r3 vp8_coef_encodings
-; s0 vp8_extra_bits
-; s1 vp8_coef_tree
-|vp8cx_pack_tokens_armv5| PROC
-    push    {r4-r11, lr}
-
-    ; Add xcount * sizeof (TOKENEXTRA) to p to get stop
-    ;  sizeof (TOKENEXTRA) is 8
-    sub     sp, sp, #12
-    add     r2, r1, r2, lsl #3          ; stop = p + xcount*sizeof(TOKENEXTRA)
-    str     r2, [sp, #0]
-    str     r3, [sp, #8]                ; save vp8_coef_encodings
-    ldr     r2, [r0, #vp9_writer_lowvalue]
-    ldr     r5, [r0, #vp9_writer_range]
-    ldr     r3, [r0, #vp9_writer_count]
-    b       check_p_lt_stop
-
-while_p_lt_stop
-    ldrb    r6, [r1, #tokenextra_token] ; t
-    ldr     r4, [sp, #8]                ; vp8_coef_encodings
-    mov     lr, #0
-    add     r4, r4, r6, lsl #3          ; a = vp8_coef_encodings + t
-    ldr     r9, [r1, #tokenextra_context_tree]   ; pp
-
-    ldrb    r7, [r1, #tokenextra_skip_eob_node]
-
-    ldr     r6, [r4, #vp9_token_value]  ; v
-    ldr     r8, [r4, #vp9_token_len]    ; n
-
-    ; vp8 specific skip_eob_node
-    cmp     r7, #0
-    movne   lr, #2                      ; i = 2
-    subne   r8, r8, #1                  ; --n
-
-    rsb     r4, r8, #32                 ; 32-n
-    ldr     r10, [sp, #52]              ; vp8_coef_tree
-
-    ; v is kept in r12 during the token pack loop
-    lsl     r12, r6, r4                ; r12 = v << 32 - n
-
-; loop start
-token_loop
-    ldrb    r4, [r9, lr, asr #1]        ; pp [i>>1]
-    sub     r7, r5, #1                  ; range-1
-
-    ; Decisions are made based on the bit value shifted
-    ; off of v, so set a flag here based on this.
-    ; This value is referred to as "bb"
-    lsls    r12, r12, #1                ; bb = v >> n
-    mul     r6, r4, r7                  ; ((range-1) * pp[i>>1]))
-
-    ; bb can only be 0 or 1.  So only execute this statement
-    ; if bb == 1, otherwise it will act like i + 0
-    addcs   lr, lr, #1                  ; i + bb
-
-    mov     r7, #1
-    ldrsb   lr, [r10, lr]               ; i = vp8_coef_tree[i+bb]
-    add     r4, r7, r6, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)
-
-    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
-    subcs   r4, r5, r4                  ; if  (bb) range = range-split
-
-    ; Counting the leading zeros is used to normalize range.
-    clz     r6, r4
-    sub     r6, r6, #24                 ; shift
-
-    ; The adds instruction sets the flags from the new count; the
-    ; sign flag is used below to test whether count >= 0
-    adds    r3, r3, r6                  ; count += shift
-    lsl     r5, r4, r6                  ; range <<= shift
-    bmi     token_count_lt_zero         ; if(count >= 0)
-
-    sub     r6, r6, r3                  ; offset = shift - count
-    sub     r4, r6, #1                  ; offset-1
-    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
-    bpl     token_high_bit_not_set
-
-    ldr     r4, [r0, #vp9_writer_pos]   ; x
-    sub     r4, r4, #1                  ; x = w->pos-1
-    b       token_zero_while_start
-token_zero_while_loop
-    mov     r10, #0
-    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
-    sub     r4, r4, #1                  ; x--
-token_zero_while_start
-    cmp     r4, #0
-    ldrge   r7, [r0, #vp9_writer_buffer]
-    ldrb    r11, [r7, r4]
-    cmpge   r11, #0xff
-    beq     token_zero_while_loop
-
-    ldr     r7, [r0, #vp9_writer_buffer]
-    ldrb    r10, [r7, r4]               ; w->buffer[x]
-    add     r10, r10, #1
-    strb    r10, [r7, r4]               ; w->buffer[x] + 1
-token_high_bit_not_set
-    rsb     r4, r6, #24                 ; 24-offset
-    ldr     r10, [r0, #vp9_writer_buffer]
-    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
-    ldr     r4, [r0, #vp9_writer_pos]   ; w->pos
-    lsl     r2, r2, r6                  ; lowvalue <<= offset
-    mov     r6, r3                      ; shift = count
-    add     r11, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r11, [r0, #vp9_writer_pos]
-    sub     r3, r3, #8                  ; count -= 8
-    strb    r7, [r10, r4]               ; w->buffer[w->pos++]
-
-    ; r10 normally holds vp8_coef_tree, but was reused as a temp
-    ; variable above, so reload vp8_coef_tree into r10 here
-    ldr     r10, [sp, #52]              ; vp8_coef_tree
-
-token_count_lt_zero
-    lsl     r2, r2, r6                  ; lowvalue <<= shift
-
-    subs    r8, r8, #1                  ; --n
-    bne     token_loop
-
-    ldrb    r6, [r1, #tokenextra_token] ; t
-    ldr     r7, [sp, #48]               ; vp8_extra_bits
-    ; Add t * sizeof (vp9_extra_bit_struct) to get the desired
-    ;  element.  Here sizeof (vp9_extra_bit_struct) == 16
-    add     r12, r7, r6, lsl #4         ; b = vp8_extra_bits + t
-
-    ldr     r4, [r12, #vp9_extra_bit_struct_base_val]
-    cmp     r4, #0
-    beq     skip_extra_bits
-
-;   if( b->base_val)
-    ldr     r8, [r12, #vp9_extra_bit_struct_len] ; L
-    ldrsh   lr, [r1, #tokenextra_extra] ; e = p->Extra
-    cmp     r8, #0                      ; if( L)
-    beq     no_extra_bits
-
-    ldr     r9, [r12, #vp9_extra_bit_struct_prob]
-    asr     r7, lr, #1                  ; v=e>>1
-
-    ldr     r10, [r12, #vp9_extra_bit_struct_tree]
-    str     r10, [sp, #4]               ; b->tree
-
-    rsb     r4, r8, #32
-    lsl     r12, r7, r4
-
-    mov     lr, #0                      ; i = 0
-
-extra_bits_loop
-    ldrb    r4, [r9, lr, asr #1]            ; pp[i>>1]
-    sub     r7, r5, #1                  ; range-1
-    lsls    r12, r12, #1                ; v >> n
-    mul     r6, r4, r7                  ; (range-1) * pp[i>>1]
-    addcs   lr, lr, #1                  ; i + bb
-
-    mov     r7, #1
-    ldrsb   lr, [r10, lr]               ; i = b->tree[i+bb]
-    add     r4, r7, r6, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)
-
-    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
-    subcs   r4, r5, r4                  ; if  (bb) range = range-split
-
-    clz     r6, r4
-    sub     r6, r6, #24
-
-    adds    r3, r3, r6                  ; count += shift
-    lsl     r5, r4, r6                  ; range <<= shift
-    bmi     extra_count_lt_zero         ; if(count >= 0)
-
-    sub     r6, r6, r3                  ; offset= shift - count
-    sub     r4, r6, #1                  ; offset-1
-    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
-    bpl     extra_high_bit_not_set
-
-    ldr     r4, [r0, #vp9_writer_pos]   ; x
-    sub     r4, r4, #1                  ; x = w->pos - 1
-    b       extra_zero_while_start
-extra_zero_while_loop
-    mov     r10, #0
-    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
-    sub     r4, r4, #1                  ; x--
-extra_zero_while_start
-    cmp     r4, #0
-    ldrge   r7, [r0, #vp9_writer_buffer]
-    ldrb    r11, [r7, r4]
-    cmpge   r11, #0xff
-    beq     extra_zero_while_loop
-
-    ldr     r7, [r0, #vp9_writer_buffer]
-    ldrb    r10, [r7, r4]
-    add     r10, r10, #1
-    strb    r10, [r7, r4]
-extra_high_bit_not_set
-    rsb     r4, r6, #24                 ; 24-offset
-    ldr     r10, [r0, #vp9_writer_buffer]
-    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
-    ldr     r4, [r0, #vp9_writer_pos]
-    lsl     r2, r2, r6                  ; lowvalue <<= offset
-    mov     r6, r3                      ; shift = count
-    add     r11, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r11, [r0, #vp9_writer_pos]
-    sub     r3, r3, #8                  ; count -= 8
-    strb    r7, [r10, r4]               ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
-    ldr     r10, [sp, #4]               ; b->tree
-extra_count_lt_zero
-    lsl     r2, r2, r6
-
-    subs    r8, r8, #1                  ; --n
-    bne     extra_bits_loop             ; while (n)
-
-no_extra_bits
-    ldr     lr, [r1, #4]                ; e = p->Extra
-    add     r4, r5, #1                  ; range + 1
-    tst     lr, #1
-    lsr     r4, r4, #1                  ; split = (range + 1) >> 1
-    addne   r2, r2, r4                  ; lowvalue += split
-    subne   r4, r5, r4                  ; range = range-split
-    tst     r2, #0x80000000             ; lowvalue & 0x80000000
-    lsl     r5, r4, #1                  ; range <<= 1
-    beq     end_high_bit_not_set
-
-    ldr     r4, [r0, #vp9_writer_pos]
-    mov     r7, #0
-    sub     r4, r4, #1
-    b       end_zero_while_start
-end_zero_while_loop
-    strb    r7, [r6, r4]
-    sub     r4, r4, #1                  ; x--
-end_zero_while_start
-    cmp     r4, #0
-    ldrge   r6, [r0, #vp9_writer_buffer]
-    ldrb    r12, [r6, r4]
-    cmpge   r12, #0xff
-    beq     end_zero_while_loop
-
-    ldr     r6, [r0, #vp9_writer_buffer]
-    ldrb    r7, [r6, r4]
-    add     r7, r7, #1
-    strb    r7, [r6, r4]
-end_high_bit_not_set
-    adds    r3, r3, #1                  ; ++count
-    lsl     r2, r2, #1                  ; lowvalue  <<= 1
-    bne     end_count_zero
-
-    ldr     r4, [r0, #vp9_writer_pos]
-    mvn     r3, #7
-    ldr     r7, [r0, #vp9_writer_buffer]
-    lsr     r6, r2, #24                 ; lowvalue >> 24
-    add     r12, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r12, [r0, #0x10]            ; w->pos
-    strb    r6, [r7, r4]
-end_count_zero
-skip_extra_bits
-    add     r1, r1, #TOKENEXTRA_SZ      ; ++p
-check_p_lt_stop
-    ldr     r4, [sp, #0]                ; stop
-    cmp     r1, r4                      ; while( p < stop)
-    bcc     while_p_lt_stop
-
-    str     r2, [r0, #vp9_writer_lowvalue]
-    str     r5, [r0, #vp9_writer_range]
-    str     r3, [r0, #vp9_writer_count]
-    add     sp, sp, #12
-    pop     {r4-r11, pc}
-    ENDP
-
-    END
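
The token-packing code above is an unrolled ARM implementation of VP8's
boolean (range) encoder. A rough C sketch of the per-bit step it repeats --
split computation, renormalization via a leading-zero count, byte output,
and carry propagation back through any 0xff bytes -- is given below. The
struct fields follow the vp9_writer_* offsets referenced in the assembly,
but the names and layout here are illustrative stand-ins, not the generated
ones:

    typedef struct {
        unsigned int lowvalue;   /* vp9_writer_lowvalue */
        unsigned int range;      /* vp9_writer_range */
        int count;               /* vp9_writer_count, starts at -24 */
        unsigned int pos;        /* vp9_writer_pos */
        unsigned char *buffer;   /* vp9_writer_buffer */
    } writer_sketch;

    static void encode_bool_sketch(writer_sketch *w, int bit, int prob)
    {
        unsigned int split = 1 + (((w->range - 1) * prob) >> 8);
        unsigned int lowvalue = w->lowvalue;
        unsigned int range = bit ? w->range - split : split;
        int shift = __builtin_clz(range) - 24;  /* clz r6, r4; sub #24 */

        if (bit)
            lowvalue += split;                  /* addcs r2, r2, r4 */

        range <<= shift;                        /* renormalize range */
        w->count += shift;

        if (w->count >= 0) {                    /* a byte is ready */
            int offset = shift - w->count;

            if ((lowvalue << (offset - 1)) & 0x80000000) {
                /* carry: walk back over 0xff bytes; like the assembly,
                 * this assumes the carry never runs past buffer[0] */
                int x = (int)w->pos - 1;
                while (x >= 0 && w->buffer[x] == 0xff)
                    w->buffer[x--] = 0;
                w->buffer[x] += 1;
            }

            w->buffer[w->pos++] = (unsigned char)(lowvalue >> (24 - offset));
            lowvalue = (lowvalue << offset) & 0xffffff;
            shift = w->count;
            w->count -= 8;
        }

        w->lowvalue = lowvalue << shift;        /* lsl r2, r2, r6 */
        w->range = range;
    }

The unrolled copies in the file (token bits, extra bits, and the final
even/odd bit at probability 128) all specialize this one step.
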
--- a/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
+++ /dev/null
@@ -1,327 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT |vp8cx_pack_mb_row_tokens_armv5|
-
-    INCLUDE asm_enc_offsets.asm
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA    |.text|, CODE, READONLY
-
-; r0 VP8_COMP *cpi
-; r1 vp9_writer *w
-; r2 vp8_coef_encodings
-; r3 vp8_extra_bits
-; s0 vp8_coef_tree
-
-|vp8cx_pack_mb_row_tokens_armv5| PROC
-    push    {r4-r11, lr}
-    sub     sp, sp, #24
-
-    ; Compute address of cpi->common.mb_rows
-    ldr     r4, _VP8_COMP_common_
-    ldr     r6, _VP8_COMMON_MBrows_
-    add     r4, r0, r4
-
-    ldr     r5, [r4, r6]                ; load up mb_rows
-
-    str     r2, [sp, #20]               ; save vp8_coef_encodings
-    str     r5, [sp, #12]               ; save mb_rows
-    str     r3, [sp, #8]                ; save vp8_extra_bits
-
-    ldr     r4, _VP8_COMP_tplist_
-    add     r4, r0, r4
-    ldr     r7, [r4, #0]                ; dereference cpi->tp_list
-
-    mov     r0, r1                      ; keep same as other loops
-
-    ldr     r2, [r0, #vp9_writer_lowvalue]
-    ldr     r5, [r0, #vp9_writer_range]
-    ldr     r3, [r0, #vp9_writer_count]
-
-mb_row_loop
-
-    ldr     r1, [r7, #tokenlist_start]
-    ldr     r9, [r7, #tokenlist_stop]
-    str     r9, [sp, #0]                ; save stop for later comparison
-    str     r7, [sp, #16]               ; tokenlist address for next time
-
-    b       check_p_lt_stop
-
-    ; actual work gets done here!
-
-while_p_lt_stop
-    ldrb    r6, [r1, #tokenextra_token] ; t
-    ldr     r4, [sp, #20]               ; vp8_coef_encodings
-    mov     lr, #0
-    add     r4, r4, r6, lsl #3          ; a = vp8_coef_encodings + t
-    ldr     r9, [r1, #tokenextra_context_tree]   ; pp
-
-    ldrb    r7, [r1, #tokenextra_skip_eob_node]
-
-    ldr     r6, [r4, #vp9_token_value]  ; v
-    ldr     r8, [r4, #vp9_token_len]    ; n
-
-    ; vp8 specific skip_eob_node
-    cmp     r7, #0
-    movne   lr, #2                      ; i = 2
-    subne   r8, r8, #1                  ; --n
-
-    rsb     r4, r8, #32                 ; 32-n
-    ldr     r10, [sp, #60]              ; vp8_coef_tree
-
-    ; v is kept in r12 during the token pack loop
-    lsl     r12, r6, r4                 ; r12 = v << 32 - n
-
-; loop start
-token_loop
-    ldrb    r4, [r9, lr, asr #1]        ; pp [i>>1]
-    sub     r7, r5, #1                  ; range-1
-
-    ; Decisions are made based on the bit value shifted
-    ; off of v, so set a flag here based on this.
-    ; This value is referred to as "bb"
-    lsls    r12, r12, #1                ; bb = v >> n
-    mul     r6, r4, r7                  ; ((range-1) * pp[i>>1]))
-
-    ; bb can only be 0 or 1.  So only execute this statement
-    ; if bb == 1, otherwise it will act like i + 0
-    addcs   lr, lr, #1                  ; i + bb
-
-    mov     r7, #1
-    ldrsb   lr, [r10, lr]               ; i = vp8_coef_tree[i+bb]
-    add     r4, r7, r6, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)
-
-    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
-    subcs   r4, r5, r4                  ; if  (bb) range = range-split
-
-    ; Counting the leading zeros is used to normalize range.
-    clz     r6, r4
-    sub     r6, r6, #24                 ; shift
-
-    ; Flag is set on the sum of count.  This flag is used later
-    ; to determine if count >= 0
-    adds    r3, r3, r6                  ; count += shift
-    lsl     r5, r4, r6                  ; range <<= shift
-    bmi     token_count_lt_zero         ; if(count >= 0)
-
-    sub     r6, r6, r3                  ; offset = shift - count
-    sub     r4, r6, #1                  ; offset-1
-    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
-    bpl     token_high_bit_not_set
-
-    ldr     r4, [r0, #vp9_writer_pos]   ; x
-    sub     r4, r4, #1                  ; x = w->pos-1
-    b       token_zero_while_start
-token_zero_while_loop
-    mov     r10, #0
-    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
-    sub     r4, r4, #1                  ; x--
-token_zero_while_start
-    cmp     r4, #0
-    ldrge   r7, [r0, #vp9_writer_buffer]
-    ldrb    r11, [r7, r4]
-    cmpge   r11, #0xff
-    beq     token_zero_while_loop
-
-    ldr     r7, [r0, #vp9_writer_buffer]
-    ldrb    r10, [r7, r4]               ; w->buffer[x]
-    add     r10, r10, #1
-    strb    r10, [r7, r4]               ; w->buffer[x] + 1
-token_high_bit_not_set
-    rsb     r4, r6, #24                 ; 24-offset
-    ldr     r10, [r0, #vp9_writer_buffer]
-    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
-    ldr     r4, [r0, #vp9_writer_pos]   ; w->pos
-    lsl     r2, r2, r6                  ; lowvalue <<= offset
-    mov     r6, r3                      ; shift = count
-    add     r11, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r11, [r0, #vp9_writer_pos]
-    sub     r3, r3, #8                  ; count -= 8
-    strb    r7, [r10, r4]               ; w->buffer[w->pos++]
-
-    ; r10 holds vp8_coef_tree earlier in the loop, but is
-    ; used as a temporary here, so reload vp8_coef_tree
-    ; into r10 afterwards
-    ldr     r10, [sp, #60]              ; vp8_coef_tree
-
-token_count_lt_zero
-    lsl     r2, r2, r6                  ; lowvalue <<= shift
-
-    subs    r8, r8, #1                  ; --n
-    bne     token_loop
-
-    ldrb    r6, [r1, #tokenextra_token] ; t
-    ldr     r7, [sp, #8]                ; vp8_extra_bits
-    ; Add t * sizeof (vp9_extra_bit_struct) to get the desired
-    ;  element.  Here sizeof(vp9_extra_bit_struct) == 16
-    add     r12, r7, r6, lsl #4         ; b = vp8_extra_bits + t
-
-    ldr     r4, [r12, #vp9_extra_bit_struct_base_val]
-    cmp     r4, #0
-    beq     skip_extra_bits
-
-;   if( b->base_val)
-    ldr     r8, [r12, #vp9_extra_bit_struct_len] ; L
-    ldrsh   lr, [r1, #tokenextra_extra] ; e = p->Extra
-    cmp     r8, #0                      ; if( L)
-    beq     no_extra_bits
-
-    ldr     r9, [r12, #vp9_extra_bit_struct_prob]
-    asr     r7, lr, #1                  ; v=e>>1
-
-    ldr     r10, [r12, #vp9_extra_bit_struct_tree]
-    str     r10, [sp, #4]               ; b->tree
-
-    rsb     r4, r8, #32
-    lsl     r12, r7, r4
-
-    mov     lr, #0                      ; i = 0
-
-extra_bits_loop
-    ldrb    r4, [r9, lr, asr #1]            ; pp[i>>1]
-    sub     r7, r5, #1                  ; range-1
-    lsls    r12, r12, #1                ; v >> n
-    mul     r6, r4, r7                  ; (range-1) * pp[i>>1]
-    addcs   lr, lr, #1                  ; i + bb
-
-    mov     r7, #1
-    ldrsb   lr, [r10, lr]               ; i = b->tree[i+bb]
-    add     r4, r7, r6, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)
-
-    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
-    subcs   r4, r5, r4                  ; if  (bb) range = range-split
-
-    clz     r6, r4
-    sub     r6, r6, #24
-
-    adds    r3, r3, r6                  ; count += shift
-    lsl     r5, r4, r6                  ; range <<= shift
-    bmi     extra_count_lt_zero         ; if(count >= 0)
-
-    sub     r6, r6, r3                  ; offset= shift - count
-    sub     r4, r6, #1                  ; offset-1
-    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
-    bpl     extra_high_bit_not_set
-
-    ldr     r4, [r0, #vp9_writer_pos]   ; x
-    sub     r4, r4, #1                  ; x = w->pos - 1
-    b       extra_zero_while_start
-extra_zero_while_loop
-    mov     r10, #0
-    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
-    sub     r4, r4, #1                  ; x--
-extra_zero_while_start
-    cmp     r4, #0
-    ldrge   r7, [r0, #vp9_writer_buffer]
-    ldrb    r11, [r7, r4]
-    cmpge   r11, #0xff
-    beq     extra_zero_while_loop
-
-    ldr     r7, [r0, #vp9_writer_buffer]
-    ldrb    r10, [r7, r4]
-    add     r10, r10, #1
-    strb    r10, [r7, r4]
-extra_high_bit_not_set
-    rsb     r4, r6, #24                 ; 24-offset
-    ldr     r10, [r0, #vp9_writer_buffer]
-    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
-    ldr     r4, [r0, #vp9_writer_pos]
-    lsl     r2, r2, r6                  ; lowvalue <<= offset
-    mov     r6, r3                      ; shift = count
-    add     r11, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r11, [r0, #vp9_writer_pos]
-    sub     r3, r3, #8                  ; count -= 8
-    strb    r7, [r10, r4]               ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
-    ldr     r10, [sp, #4]               ; b->tree
-extra_count_lt_zero
-    lsl     r2, r2, r6
-
-    subs    r8, r8, #1                  ; --n
-    bne     extra_bits_loop             ; while (n)
-
-no_extra_bits
-    ldr     lr, [r1, #4]                ; e = p->Extra
-    add     r4, r5, #1                  ; range + 1
-    tst     lr, #1
-    lsr     r4, r4, #1                  ; split = (range + 1) >> 1
-    addne   r2, r2, r4                  ; lowvalue += split
-    subne   r4, r5, r4                  ; range = range-split
-    tst     r2, #0x80000000             ; lowvalue & 0x80000000
-    lsl     r5, r4, #1                  ; range <<= 1
-    beq     end_high_bit_not_set
-
-    ldr     r4, [r0, #vp9_writer_pos]
-    mov     r7, #0
-    sub     r4, r4, #1
-    b       end_zero_while_start
-end_zero_while_loop
-    strb    r7, [r6, r4]
-    sub     r4, r4, #1                  ; x--
-end_zero_while_start
-    cmp     r4, #0
-    ldrge   r6, [r0, #vp9_writer_buffer]
-    ldrb    r12, [r6, r4]
-    cmpge   r12, #0xff
-    beq     end_zero_while_loop
-
-    ldr     r6, [r0, #vp9_writer_buffer]
-    ldrb    r7, [r6, r4]
-    add     r7, r7, #1
-    strb    r7, [r6, r4]
-end_high_bit_not_set
-    adds    r3, r3, #1                  ; ++count
-    lsl     r2, r2, #1                  ; lowvalue  <<= 1
-    bne     end_count_zero
-
-    ldr     r4, [r0, #vp9_writer_pos]
-    mvn     r3, #7
-    ldr     r7, [r0, #vp9_writer_buffer]
-    lsr     r6, r2, #24                 ; lowvalue >> 24
-    add     r12, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r12, [r0, #0x10]            ; w->pos
-    strb    r6, [r7, r4]
-end_count_zero
-skip_extra_bits
-    add     r1, r1, #TOKENEXTRA_SZ      ; ++p
-check_p_lt_stop
-    ldr     r4, [sp, #0]                ; stop
-    cmp     r1, r4                      ; while( p < stop)
-    bcc     while_p_lt_stop
-
-    ldr     r6, [sp, #12]               ; mb_rows
-    ldr     r7, [sp, #16]               ; tokenlist address
-    subs    r6, r6, #1
-    add     r7, r7, #TOKENLIST_SZ       ; next element in the array
-    str     r6, [sp, #12]
-    bne     mb_row_loop
-
-    str     r2, [r0, #vp9_writer_lowvalue]
-    str     r5, [r0, #vp9_writer_range]
-    str     r3, [r0, #vp9_writer_count]
-    add     sp, sp, #24
-    pop     {r4-r11, pc}
-    ENDP
-
-_VP8_COMP_common_
-    DCD     vp8_comp_common
-_VP8_COMMON_MBrows_
-    DCD     vp8_common_mb_rows
-_VP8_COMP_tplist_
-    DCD     vp8_comp_tplist
-
-    END
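
The mb-row variant deleted above wraps the same per-token loop in an outer
walk over cpi->tplist. In rough C terms (stand-in types; the inner body,
elided to a comment, is the token/extra-bit packing sketched earlier):

    typedef struct {
        int token;                 /* stand-in for the real TOKENEXTRA */
        int extra;
    } TOKENEXTRA_SKETCH;

    typedef struct {
        TOKENEXTRA_SKETCH *start;  /* tokenlist_start */
        TOKENEXTRA_SKETCH *stop;   /* tokenlist_stop */
    } TOKENLIST_SKETCH;

    static void pack_mb_row_tokens_sketch(writer_sketch *w,
                                          const TOKENLIST_SKETCH *tplist,
                                          int mb_rows)
    {
        int row;

        for (row = 0; row < mb_rows; row++) {
            const TOKENEXTRA_SKETCH *p = tplist[row].start;
            const TOKENEXTRA_SKETCH *stop = tplist[row].stop;

            while (p < stop) {
                /* pack p->token and its extra bits with
                 * encode_bool_sketch(w, ...), as in the earlier sketch */
                p++;
            }
        }
    }
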
--- a/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
+++ /dev/null
@@ -1,465 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT |vp8cx_pack_tokens_into_partitions_armv5|
-
-    INCLUDE asm_enc_offsets.asm
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA    |.text|, CODE, READONLY
-
-; r0 VP8_COMP *cpi
-; r1 unsigned char *cx_data
-; r2 int num_part
-; r3 *size
-; s0 vp8_coef_encodings
-; s1 vp8_extra_bits,
-; s2 const vp9_tree_index *,
-
-|vp8cx_pack_tokens_into_partitions_armv5| PROC
-    push    {r4-r11, lr}
-    sub     sp, sp, #44
-
-    ; Compute address of cpi->common.mb_rows
-    ldr     r4, _VP8_COMP_common_
-    ldr     r6, _VP8_COMMON_MBrows_
-    add     r4, r0, r4
-
-    ldr     r5, [r4, r6]                ; load up mb_rows
-
-    str     r5, [sp, #36]               ; save mb_rows
-    str     r1, [sp, #24]               ; save cx_data
-    str     r2, [sp, #20]               ; save num_part
-    str     r3, [sp, #8]                ; save *size
-
-    ; *size = 3 * (num_part - 1);
-    sub     r2, r2, #1                  ; num_part - 1
-    add     r2, r2, r2, lsl #1          ; 3*(num_part - 1)
-    str     r2, [r3]
-
-    add     r2, r2, r1                  ; cx_data + *size
-    str     r2, [sp, #40]               ; ptr
-
-    ldr     r4, _VP8_COMP_tplist_
-    add     r4, r0, r4
-    ldr     r7, [r4, #0]                ; dereference cpi->tp_list
-    str     r7, [sp, #32]               ; store start of cpi->tp_list
-
-    ldr     r11, _VP8_COMP_bc2_         ; load up vp9_writer out of cpi
-    add     r0, r0, r11
-
-    mov     r11, #0
-    str     r11, [sp, #28]              ; i
-
-numparts_loop
-    ldr     r10, [sp, #40]              ; ptr
-    ldr     r5,  [sp, #36]              ; reload mb_rows as the row counter
-    sub     r5, r5, r11                 ; partition i starts at row i, so
-                                        ; count down from mb_rows - i
-    str     r5,  [sp, #12]
-
-    ; Reset all of the VP8 Writer data for each partition that
-    ; is processed.
-    ; start_encode
-    mov     r2, #0                      ; vp9_writer_lowvalue
-    mov     r5, #255                    ; vp9_writer_range
-    mvn     r3, #23                     ; vp9_writer_count
-
-    str     r2,  [r0, #vp9_writer_value]
-    str     r2,  [r0, #vp9_writer_pos]
-    str     r10, [r0, #vp9_writer_buffer]
-
-mb_row_loop
-
-    ldr     r1, [r7, #tokenlist_start]
-    ldr     r9, [r7, #tokenlist_stop]
-    str     r9, [sp, #0]                ; save stop for later comparison
-    str     r7, [sp, #16]               ; tokenlist address for next time
-
-    b       check_p_lt_stop
-
-    ; actual work gets done here!
-
-while_p_lt_stop
-    ldrb    r6, [r1, #tokenextra_token] ; t
-    ldr     r4, [sp, #80]               ; vp8_coef_encodings
-    mov     lr, #0
-    add     r4, r4, r6, lsl #3          ; a = vp8_coef_encodings + t
-    ldr     r9, [r1, #tokenextra_context_tree]   ; pp
-
-    ldrb    r7, [r1, #tokenextra_skip_eob_node]
-
-    ldr     r6, [r4, #vp9_token_value]  ; v
-    ldr     r8, [r4, #vp9_token_len]    ; n
-
-    ; vp8 specific skip_eob_node
-    cmp     r7, #0
-    movne   lr, #2                      ; i = 2
-    subne   r8, r8, #1                  ; --n
-
-    rsb     r4, r8, #32                 ; 32-n
-    ldr     r10, [sp, #88]              ; vp8_coef_tree
-
-    ; v is kept in r12 during the token pack loop
-    lsl     r12, r6, r4                 ; r12 = v << 32 - n
-
-; loop start
-token_loop
-    ldrb    r4, [r9, lr, asr #1]        ; pp [i>>1]
-    sub     r7, r5, #1                  ; range-1
-
-    ; Decisions are made based on the bit value shifted
-    ; off of v, so set a flag here based on this.
-    ; This value is referred to as "bb"
-    lsls    r12, r12, #1                ; bb = v >> n
-    mul     r6, r4, r7                  ; ((range-1) * pp[i>>1]))
-
-    ; bb can only be 0 or 1.  So only execute this statement
-    ; if bb == 1, otherwise it will act like i + 0
-    addcs   lr, lr, #1                  ; i + bb
-
-    mov     r7, #1
-    ldrsb   lr, [r10, lr]               ; i = vp8_coef_tree[i+bb]
-    add     r4, r7, r6, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)
-
-    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
-    subcs   r4, r5, r4                  ; if  (bb) range = range-split
-
-    ; Counting the leading zeros is used to normalize range.
-    clz     r6, r4
-    sub     r6, r6, #24                 ; shift
-
-    ; Flag is set on the sum of count.  This flag is used later
-    ; to determine if count >= 0
-    adds    r3, r3, r6                  ; count += shift
-    lsl     r5, r4, r6                  ; range <<= shift
-    bmi     token_count_lt_zero         ; if(count >= 0)
-
-    sub     r6, r6, r3                  ; offset = shift - count
-    sub     r4, r6, #1                  ; offset-1
-    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
-    bpl     token_high_bit_not_set
-
-    ldr     r4, [r0, #vp9_writer_pos]   ; x
-    sub     r4, r4, #1                  ; x = w->pos-1
-    b       token_zero_while_start
-token_zero_while_loop
-    mov     r10, #0
-    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
-    sub     r4, r4, #1                  ; x--
-token_zero_while_start
-    cmp     r4, #0
-    ldrge   r7, [r0, #vp9_writer_buffer]
-    ldrb    r11, [r7, r4]
-    cmpge   r11, #0xff
-    beq     token_zero_while_loop
-
-    ldr     r7, [r0, #vp9_writer_buffer]
-    ldrb    r10, [r7, r4]               ; w->buffer[x]
-    add     r10, r10, #1
-    strb    r10, [r7, r4]               ; w->buffer[x] + 1
-token_high_bit_not_set
-    rsb     r4, r6, #24                 ; 24-offset
-    ldr     r10, [r0, #vp9_writer_buffer]
-    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
-    ldr     r4, [r0, #vp9_writer_pos]   ; w->pos
-    lsl     r2, r2, r6                  ; lowvalue <<= offset
-    mov     r6, r3                      ; shift = count
-    add     r11, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r11, [r0, #vp9_writer_pos]
-    sub     r3, r3, #8                  ; count -= 8
-    strb    r7, [r10, r4]               ; w->buffer[w->pos++]
-
-    ; r10 holds vp8_coef_tree earlier in the loop, but is
-    ; used as a temporary here, so reload vp8_coef_tree
-    ; into r10 afterwards
-    ldr     r10, [sp, #88]              ; vp8_coef_tree
-
-token_count_lt_zero
-    lsl     r2, r2, r6                  ; lowvalue <<= shift
-
-    subs    r8, r8, #1                  ; --n
-    bne     token_loop
-
-    ldrb    r6, [r1, #tokenextra_token] ; t
-    ldr     r7, [sp, #84]                ; vp8_extra_bits
-    ; Add t * sizeof (vp9_extra_bit_struct) to get the desired
-    ;  element.  Here sizeof(vp9_extra_bit_struct) == 16
-    add     r12, r7, r6, lsl #4         ; b = vp8_extra_bits + t
-
-    ldr     r4, [r12, #vp9_extra_bit_struct_base_val]
-    cmp     r4, #0
-    beq     skip_extra_bits
-
-;   if( b->base_val)
-    ldr     r8, [r12, #vp9_extra_bit_struct_len] ; L
-    ldrsh   lr, [r1, #tokenextra_extra] ; e = p->Extra
-    cmp     r8, #0                      ; if( L)
-    beq     no_extra_bits
-
-    ldr     r9, [r12, #vp9_extra_bit_struct_prob]
-    asr     r7, lr, #1                  ; v=e>>1
-
-    ldr     r10, [r12, #vp9_extra_bit_struct_tree]
-    str     r10, [sp, #4]               ; b->tree
-
-    rsb     r4, r8, #32
-    lsl     r12, r7, r4
-
-    mov     lr, #0                      ; i = 0
-
-extra_bits_loop
-    ldrb    r4, [r9, lr, asr #1]        ; pp[i>>1]
-    sub     r7, r5, #1                  ; range-1
-    lsls    r12, r12, #1                ; v >> n
-    mul     r6, r4, r7                  ; (range-1) * pp[i>>1]
-    addcs   lr, lr, #1                  ; i + bb
-
-    mov     r7, #1
-    ldrsb   lr, [r10, lr]               ; i = b->tree[i+bb]
-    add     r4, r7, r6, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)
-
-    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
-    subcs   r4, r5, r4                  ; if  (bb) range = range-split
-
-    clz     r6, r4
-    sub     r6, r6, #24
-
-    adds    r3, r3, r6                  ; count += shift
-    lsl     r5, r4, r6                  ; range <<= shift
-    bmi     extra_count_lt_zero         ; if(count >= 0)
-
-    sub     r6, r6, r3                  ; offset= shift - count
-    sub     r4, r6, #1                  ; offset-1
-    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
-    bpl     extra_high_bit_not_set
-
-    ldr     r4, [r0, #vp9_writer_pos]   ; x
-    sub     r4, r4, #1                  ; x = w->pos - 1
-    b       extra_zero_while_start
-extra_zero_while_loop
-    mov     r10, #0
-    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
-    sub     r4, r4, #1                  ; x--
-extra_zero_while_start
-    cmp     r4, #0
-    ldrge   r7, [r0, #vp9_writer_buffer]
-    ldrb    r11, [r7, r4]
-    cmpge   r11, #0xff
-    beq     extra_zero_while_loop
-
-    ldr     r7, [r0, #vp9_writer_buffer]
-    ldrb    r10, [r7, r4]
-    add     r10, r10, #1
-    strb    r10, [r7, r4]
-extra_high_bit_not_set
-    rsb     r4, r6, #24                 ; 24-offset
-    ldr     r10, [r0, #vp9_writer_buffer]
-    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
-    ldr     r4, [r0, #vp9_writer_pos]
-    lsl     r2, r2, r6                  ; lowvalue <<= offset
-    mov     r6, r3                      ; shift = count
-    add     r11, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r11, [r0, #vp9_writer_pos]
-    sub     r3, r3, #8                  ; count -= 8
-    strb    r7, [r10, r4]               ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
-    ldr     r10, [sp, #4]               ; b->tree
-extra_count_lt_zero
-    lsl     r2, r2, r6
-
-    subs    r8, r8, #1                  ; --n
-    bne     extra_bits_loop             ; while (n)
-
-no_extra_bits
-    ldr     lr, [r1, #4]                ; e = p->Extra
-    add     r4, r5, #1                  ; range + 1
-    tst     lr, #1
-    lsr     r4, r4, #1                  ; split = (range + 1) >> 1
-    addne   r2, r2, r4                  ; lowvalue += split
-    subne   r4, r5, r4                  ; range = range-split
-    tst     r2, #0x80000000             ; lowvalue & 0x80000000
-    lsl     r5, r4, #1                  ; range <<= 1
-    beq     end_high_bit_not_set
-
-    ldr     r4, [r0, #vp9_writer_pos]
-    mov     r7, #0
-    sub     r4, r4, #1
-    b       end_zero_while_start
-end_zero_while_loop
-    strb    r7, [r6, r4]
-    sub     r4, r4, #1                  ; x--
-end_zero_while_start
-    cmp     r4, #0
-    ldrge   r6, [r0, #vp9_writer_buffer]
-    ldrb    r12, [r6, r4]
-    cmpge   r12, #0xff
-    beq     end_zero_while_loop
-
-    ldr     r6, [r0, #vp9_writer_buffer]
-    ldrb    r7, [r6, r4]
-    add     r7, r7, #1
-    strb    r7, [r6, r4]
-end_high_bit_not_set
-    adds    r3, r3, #1                  ; ++count
-    lsl     r2, r2, #1                  ; lowvalue  <<= 1
-    bne     end_count_zero
-
-    ldr     r4, [r0, #vp9_writer_pos]
-    mvn     r3, #7
-    ldr     r7, [r0, #vp9_writer_buffer]
-    lsr     r6, r2, #24                 ; lowvalue >> 24
-    add     r12, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r12, [r0, #0x10]            ; w->pos
-    strb    r6, [r7, r4]
-end_count_zero
-skip_extra_bits
-    add     r1, r1, #TOKENEXTRA_SZ      ; ++p
-check_p_lt_stop
-    ldr     r4, [sp, #0]                ; stop
-    cmp     r1, r4                      ; while( p < stop)
-    bcc     while_p_lt_stop
-
-    ldr     r10, [sp, #20]              ; num_part
-    mov     r1, #TOKENLIST_SZ
-    mul     r1, r10, r1
-
-    ldr     r6, [sp, #12]               ; mb_rows
-    ldr     r7, [sp, #16]               ; tokenlist address
-    subs    r6, r6, r10
-    add     r7, r7, r1                  ; next element in the array
-    str     r6, [sp, #12]
-    bgt     mb_row_loop
-
-    mov     r12, #32
-
-stop_encode_loop
-    sub     r7, r5, #1                  ; range-1
-
-    mov     r4, r7, lsl #7              ; ((range-1) * 128)
-
-    mov     r7, #1
-    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * 128) >> 8)
-
-    ; Counting the leading zeros is used to normalize range.
-    clz     r6, r4
-    sub     r6, r6, #24                 ; shift
-
-    ; Flag is set on the sum of count.  This flag is used later
-    ; to determine if count >= 0
-    adds    r3, r3, r6                  ; count += shift
-    lsl     r5, r4, r6                  ; range <<= shift
-    bmi     token_count_lt_zero_se      ; if(count >= 0)
-
-    sub     r6, r6, r3                  ; offset = shift - count
-    sub     r4, r6, #1                  ; offset-1
-    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
-    bpl     token_high_bit_not_set_se
-
-    ldr     r4, [r0, #vp9_writer_pos]   ; x
-    sub     r4, r4, #1                  ; x = w->pos-1
-    b       token_zero_while_start_se
-token_zero_while_loop_se
-    mov     r10, #0
-    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
-    sub     r4, r4, #1                  ; x--
-token_zero_while_start_se
-    cmp     r4, #0
-    ldrge   r7, [r0, #vp9_writer_buffer]
-    ldrb    r11, [r7, r4]
-    cmpge   r11, #0xff
-    beq     token_zero_while_loop_se
-
-    ldr     r7, [r0, #vp9_writer_buffer]
-    ldrb    r10, [r7, r4]               ; w->buffer[x]
-    add     r10, r10, #1
-    strb    r10, [r7, r4]               ; w->buffer[x] + 1
-token_high_bit_not_set_se
-    rsb     r4, r6, #24                 ; 24-offset
-    ldr     r10, [r0, #vp9_writer_buffer]
-    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
-    ldr     r4, [r0, #vp9_writer_pos]   ; w->pos
-    lsl     r2, r2, r6                  ; lowvalue <<= offset
-    mov     r6, r3                      ; shift = count
-    add     r11, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r11, [r0, #vp9_writer_pos]
-    sub     r3, r3, #8                  ; count -= 8
-    strb    r7, [r10, r4]               ; w->buffer[w->pos++]
-
-token_count_lt_zero_se
-    lsl     r2, r2, r6                  ; lowvalue <<= shift
-
-    subs    r12, r12, #1
-    bne     stop_encode_loop
-
-    ldr     r10, [sp, #8]               ; *size
-    ldr     r11, [r10]
-    ldr     r4,  [r0, #vp9_writer_pos]  ; w->pos
-    add     r11, r11, r4                ; *size += w->pos
-    str     r11, [r10]
-
-    ldr     r9, [sp, #20]               ; num_part
-    sub     r9, r9, #1
-    ldr     r10, [sp, #28]              ; i
-    cmp     r10, r9                     ; if(i<(num_part - 1))
-    bge     skip_write_partition
-
-    ldr     r12, [sp, #40]              ; ptr
-    add     r12, r12, r4                ; ptr += w->pos
-    str     r12, [sp, #40]
-
-    ldr     r9, [sp, #24]               ; cx_data
-    mov     r8, r4, asr #8
-    strb    r4, [r9, #0]
-    strb    r8, [r9, #1]
-    mov     r4, r4, asr #16
-    strb    r4, [r9, #2]
-
-    add     r9, r9, #3                  ; cx_data += 3
-    str     r9, [sp, #24]
-
-skip_write_partition
-
-    ldr     r11, [sp, #28]              ; i
-    ldr     r10, [sp, #20]              ; num_part
-
-    add     r11, r11, #1                ; i++
-    str     r11, [sp, #28]
-
-    ldr     r7, [sp, #32]               ; cpi->tp_list[i]
-    mov     r1, #TOKENLIST_SZ
-    add     r7, r7, r1                  ; next element in cpi->tp_list
-    str     r7, [sp, #32]               ; cpi->tp_list[i+1]
-
-    cmp     r10, r11
-    bgt     numparts_loop
-
-
-    add     sp, sp, #44
-    pop     {r4-r11, pc}
-    ENDP
-
-_VP8_COMP_common_
-    DCD     vp8_comp_common
-_VP8_COMMON_MBrows_
-    DCD     vp8_common_mb_rows
-_VP8_COMP_tplist_
-    DCD     vp8_comp_tplist
-_VP8_COMP_bc2_
-    DCD     vp8_comp_bc2
-
-    END
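
The partition packer deleted above adds one more level: each partition gets
a freshly reset writer, packs every num_part-th macroblock row, is closed
with a 32-bit flush (the stop_encode_loop), and -- for all partitions but
the last -- records its byte size as a 3-byte little-endian field at the
start of cx_data. A C sketch, reusing the writer_sketch, encode_bool_sketch
and TOKENLIST_SKETCH stand-ins from the earlier sketches:

    static void pack_partitions_sketch(writer_sketch *w,
                                       unsigned char *cx_data,
                                       int num_part, unsigned int *size,
                                       const TOKENLIST_SKETCH *tplist,
                                       int mb_rows)
    {
        unsigned char *ptr = cx_data + 3 * (num_part - 1);
        int i, row, k;

        *size = 3 * (num_part - 1);     /* room for the size fields */

        for (i = 0; i < num_part; i++) {
            /* start_encode: point the writer at this partition */
            w->lowvalue = 0;
            w->range = 255;
            w->count = -24;
            w->pos = 0;
            w->buffer = ptr;

            /* partition i packs rows i, i + num_part, i + 2*num_part, ... */
            for (row = i; row < mb_rows; row += num_part) {
                /* token/extra-bit packing for tplist[row], as above */
            }

            /* stop_encode: flush 32 zero bits at probability 128 */
            for (k = 0; k < 32; k++)
                encode_bool_sketch(w, 0, 128);

            *size += w->pos;

            if (i < num_part - 1) {
                ptr += w->pos;                        /* next partition */
                cx_data[0] = (unsigned char)(w->pos);         /* 3-byte  */
                cx_data[1] = (unsigned char)(w->pos >> 8);    /* little- */
                cx_data[2] = (unsigned char)(w->pos >> 16);   /* endian  */
                cx_data += 3;
            }
        }
    }
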
--- a/vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm
+++ /dev/null
@@ -1,224 +1,0 @@
-;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_fast_quantize_b_armv6|
-
-    INCLUDE asm_enc_offsets.asm
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    BLOCK *b
-; r1    BLOCKD *d
-|vp8_fast_quantize_b_armv6| PROC
-    stmfd   sp!, {r1, r4-r11, lr}
-
-    ldr     r3, [r0, #vp8_block_coeff]      ; coeff
-    ldr     r4, [r0, #vp8_block_quant_fast] ; quant_fast
-    ldr     r5, [r0, #vp8_block_round]      ; round
-    ldr     r6, [r1, #vp8_blockd_qcoeff]    ; qcoeff
-    ldr     r7, [r1, #vp8_blockd_dqcoeff]   ; dqcoeff
-    ldr     r8, [r1, #vp8_blockd_dequant]   ; dequant
-
-    ldr     r2, loop_count          ; loop_count=0x1000000. 'lsls' instruction
-                                    ; is used to update the counter so that
-                                    ; it can be used to mark nonzero
-                                    ; quantized coefficient pairs.
-
-    mov     r1, #0                  ; flags for quantized coeffs
-
-    ; PART 1: quantization and dequantization loop
-loop
-    ldr     r9, [r3], #4            ; [z1 | z0]
-    ldr     r10, [r5], #4           ; [r1 | r0]
-    ldr     r11, [r4], #4           ; [q1 | q0]
-
-    ssat16  lr, #1, r9              ; [sz1 | sz0]
-    eor     r9, r9, lr              ; [z1 ^ sz1 | z0 ^ sz0]
-    ssub16  r9, r9, lr              ; x = (z ^ sz) - sz
-    sadd16  r9, r9, r10             ; [x1+r1 | x0+r0]
-
-    ldr     r12, [r3], #4           ; [z3 | z2]
-
-    smulbb  r0, r9, r11             ; [(x0+r0)*q0]
-    smultt  r9, r9, r11             ; [(x1+r1)*q1]
-
-    ldr     r10, [r5], #4           ; [r3 | r2]
-
-    ssat16  r11, #1, r12            ; [sz3 | sz2]
-    eor     r12, r12, r11           ; [z3 ^ sz3 | z2 ^ sz2]
-    pkhtb   r0, r9, r0, asr #16     ; [y1 | y0]
-    ldr     r9, [r4], #4            ; [q3 | q2]
-    ssub16  r12, r12, r11           ; x = (z ^ sz) - sz
-
-    sadd16  r12, r12, r10           ; [x3+r3 | x2+r2]
-
-    eor     r0, r0, lr              ; [(y1 ^ sz1) | (y0 ^ sz0)]
-
-    smulbb  r10, r12, r9            ; [(x2+r2)*q2]
-    smultt  r12, r12, r9            ; [(x3+r3)*q3]
-
-    ssub16  r0, r0, lr              ; x = (y ^ sz) - sz
-
-    cmp     r0, #0                  ; check if zero
-    orrne   r1, r1, r2, lsr #24     ; add flag for nonzero coeffs
-
-    str     r0, [r6], #4            ; *qcoeff++ = x
-    ldr     r9, [r8], #4            ; [dq1 | dq0]
-
-    pkhtb   r10, r12, r10, asr #16  ; [y3 | y2]
-    eor     r10, r10, r11           ; [(y3 ^ sz3) | (y2 ^ sz2)]
-    ssub16  r10, r10, r11           ; x = (y ^ sz) - sz
-
-    cmp     r10, #0                 ; check if zero
-    orrne   r1, r1, r2, lsr #23     ; add flag for nonzero coeffs
-
-    str     r10, [r6], #4           ; *qcoeff++ = x
-    ldr     r11, [r8], #4           ; [dq3 | dq2]
-
-    smulbb  r12, r0, r9             ; [x0*dq0]
-    smultt  r0, r0, r9              ; [x1*dq1]
-
-    smulbb  r9, r10, r11            ; [x2*dq2]
-    smultt  r10, r10, r11           ; [x3*dq3]
-
-    lsls    r2, r2, #2              ; update loop counter
-    strh    r12, [r7, #0]           ; dqcoeff[0] = [x0*dq0]
-    strh    r0, [r7, #2]            ; dqcoeff[1] = [x1*dq1]
-    strh    r9, [r7, #4]            ; dqcoeff[2] = [x2*dq2]
-    strh    r10, [r7, #6]           ; dqcoeff[3] = [x3*dq3]
-    add     r7, r7, #8              ; dqcoeff += 8
-    bne     loop
-
-    ; PART 2: check position for eob...
-    mov     lr, #0                  ; init eob
-    cmp     r1, #0                  ; coeffs after quantization?
-    ldr     r11, [sp, #0]           ; restore BLOCKD pointer
-    beq     end                     ; skip eob calculations if all zero
-
-    ldr     r0, [r11, #vp8_blockd_qcoeff]
-
-    ; check shortcut for nonzero qcoeffs
-    tst    r1, #0x80
-    bne    quant_coeff_15_14
-    tst    r1, #0x20
-    bne    quant_coeff_13_11
-    tst    r1, #0x8
-    bne    quant_coeff_12_7
-    tst    r1, #0x40
-    bne    quant_coeff_10_9
-    tst    r1, #0x10
-    bne    quant_coeff_8_3
-    tst    r1, #0x2
-    bne    quant_coeff_6_5
-    tst    r1, #0x4
-    bne    quant_coeff_4_2
-    b      quant_coeff_1_0
-
-quant_coeff_15_14
-    ldrh    r2, [r0, #30]       ; rc=15, i=15
-    mov     lr, #16
-    cmp     r2, #0
-    bne     end
-
-    ldrh    r3, [r0, #28]       ; rc=14, i=14
-    mov     lr, #15
-    cmp     r3, #0
-    bne     end
-
-quant_coeff_13_11
-    ldrh    r2, [r0, #22]       ; rc=11, i=13
-    mov     lr, #14
-    cmp     r2, #0
-    bne     end
-
-quant_coeff_12_7
-    ldrh    r3, [r0, #14]       ; rc=7,  i=12
-    mov     lr, #13
-    cmp     r3, #0
-    bne     end
-
-    ldrh    r2, [r0, #20]       ; rc=10, i=11
-    mov     lr, #12
-    cmp     r2, #0
-    bne     end
-
-quant_coeff_10_9
-    ldrh    r3, [r0, #26]       ; rc=13, i=10
-    mov     lr, #11
-    cmp     r3, #0
-    bne     end
-
-    ldrh    r2, [r0, #24]       ; rc=12, i=9
-    mov     lr, #10
-    cmp     r2, #0
-    bne     end
-
-quant_coeff_8_3
-    ldrh    r3, [r0, #18]       ; rc=9,  i=8
-    mov     lr, #9
-    cmp     r3, #0
-    bne     end
-
-    ldrh    r2, [r0, #12]       ; rc=6,  i=7
-    mov     lr, #8
-    cmp     r2, #0
-    bne     end
-
-quant_coeff_6_5
-    ldrh    r3, [r0, #6]        ; rc=3,  i=6
-    mov     lr, #7
-    cmp     r3, #0
-    bne     end
-
-    ldrh    r2, [r0, #4]        ; rc=2,  i=5
-    mov     lr, #6
-    cmp     r2, #0
-    bne     end
-
-quant_coeff_4_2
-    ldrh    r3, [r0, #10]       ; rc=5,  i=4
-    mov     lr, #5
-    cmp     r3, #0
-    bne     end
-
-    ldrh    r2, [r0, #16]       ; rc=8,  i=3
-    mov     lr, #4
-    cmp     r2, #0
-    bne     end
-
-    ldrh    r3, [r0, #8]        ; rc=4,  i=2
-    mov     lr, #3
-    cmp     r3, #0
-    bne     end
-
-quant_coeff_1_0
-    ldrh    r2, [r0, #2]        ; rc=1,  i=1
-    mov     lr, #2
-    cmp     r2, #0
-    bne     end
-
-    mov     lr, #1              ; rc=0,  i=0
-
-end
-    str     lr, [r11, #vp8_blockd_eob]
-    ldmfd   sp!, {r1, r4-r11, pc}
-
-    ENDP
-
-loop_count
-    DCD     0x1000000
-
-    END
-
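
In C terms, the quantizer deleted above computes x = abs(z), then
y = ((x + round) * quant_fast) >> 16, restores the sign, dequantizes, and
records the end-of-block position in zig-zag order. The assembly does the
arithmetic four coefficients at a time and finds eob with the reverse,
flag-driven search in PART 2; a straight-line sketch (the zig-zag table
matches the rc/i pairs annotated above):

    static int fast_quantize_b_sketch(const short *coeff, const short *round,
                                      const short *quant_fast,
                                      const short *dequant,
                                      short *qcoeff, short *dqcoeff)
    {
        static const int zigzag[16] = {
            0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
        };
        int i, eob = 0;

        for (i = 0; i < 16; i++) {
            int rc = zigzag[i];
            int z  = coeff[rc];
            int sz = z >> 31;               /* the ssat16 sign trick */
            int x  = (z ^ sz) - sz;         /* x = abs(z) */
            int y  = ((x + round[rc]) * quant_fast[rc]) >> 16;

            x = (y ^ sz) - sz;              /* restore the sign */
            qcoeff[rc]  = (short)x;
            dqcoeff[rc] = (short)(x * dequant[rc]);

            if (y)
                eob = i + 1;                /* one past the last nonzero */
        }

        return eob;                         /* stored to vp8_blockd_eob */
    }
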
--- a/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm
+++ /dev/null
@@ -1,138 +1,0 @@
-;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_mse16x16_armv6|
-
-    ARM
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char *src_ptr
-; r1    int source_stride
-; r2    unsigned char *ref_ptr
-; r3    int  recon_stride
-; stack unsigned int *sse
-;
-; note: Based on vp9_variance16x16_armv6. In this function the sum is
-;       never used, so that part of the calculation is removed.
-
-|vp8_mse16x16_armv6| PROC
-
-    push    {r4-r9, lr}
-
-    pld     [r0, r1, lsl #0]
-    pld     [r2, r3, lsl #0]
-
-    mov     r12, #16            ; set loop counter to 16 (=block height)
-    mov     r4, #0              ; initialize sse = 0
-
-loop
-    ; 1st 4 pixels
-    ldr     r5, [r0, #0x0]      ; load 4 src pixels
-    ldr     r6, [r2, #0x0]      ; load 4 ref pixels
-
-    mov     lr, #0              ; constant zero
-
-    usub8   r8, r5, r6          ; calculate difference
-    pld     [r0, r1, lsl #1]
-    sel     r7, r8, lr          ; select bytes with positive difference
-    usub8   r9, r6, r5          ; calculate difference with reversed operands
-    pld     [r2, r3, lsl #1]
-    sel     r8, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r5, r7, lr          ; calculate sum of positive differences
-    usad8   r6, r8, lr          ; calculate sum of negative differences
-    orr     r8, r8, r7          ; differences of all 4 pixels
-
-    ldr     r5, [r0, #0x4]      ; load 4 src pixels
-
-    ; calculate sse
-    uxtb16  r6, r8              ; byte (two pixels) to halfwords
-    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
-    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
-
-    ; 2nd 4 pixels
-    ldr     r6, [r2, #0x4]      ; load 4 ref pixels
-    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
-
-    usub8   r8, r5, r6          ; calculate difference
-    sel     r7, r8, lr          ; select bytes with positive difference
-    usub8   r9, r6, r5          ; calculate difference with reversed operands
-    sel     r8, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r5, r7, lr          ; calculate sum of positive differences
-    usad8   r6, r8, lr          ; calculate sum of negative differences
-    orr     r8, r8, r7          ; differences of all 4 pixels
-    ldr     r5, [r0, #0x8]      ; load 4 src pixels
-    ; calculate sse
-    uxtb16  r6, r8              ; byte (two pixels) to halfwords
-    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
-    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
-
-    ; 3rd 4 pixels
-    ldr     r6, [r2, #0x8]      ; load 4 ref pixels
-    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
-
-    usub8   r8, r5, r6          ; calculate difference
-    sel     r7, r8, lr          ; select bytes with positive difference
-    usub8   r9, r6, r5          ; calculate difference with reversed operands
-    sel     r8, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r5, r7, lr          ; calculate sum of positive differences
-    usad8   r6, r8, lr          ; calculate sum of negative differences
-    orr     r8, r8, r7          ; differences of all 4 pixels
-
-    ldr     r5, [r0, #0xc]      ; load 4 src pixels
-
-    ; calculate sse
-    uxtb16  r6, r8              ; byte (two pixels) to halfwords
-    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
-    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
-
-    ; 4th 4 pixels
-    ldr     r6, [r2, #0xc]      ; load 4 ref pixels
-    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
-
-    usub8   r8, r5, r6          ; calculate difference
-    add     r0, r0, r1          ; set src_ptr to next row
-    sel     r7, r8, lr          ; select bytes with positive difference
-    usub8   r9, r6, r5          ; calculate difference with reversed operands
-    add     r2, r2, r3          ; set dst_ptr to next row
-    sel     r8, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r5, r7, lr          ; calculate sum of positive differences
-    usad8   r6, r8, lr          ; calculate sum of negative differences
-    orr     r8, r8, r7          ; differences of all 4 pixels
-
-    subs    r12, r12, #1        ; next row
-
-    ; calculate sse
-    uxtb16  r6, r8              ; byte (two pixels) to halfwords
-    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
-    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
-    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
-
-    bne     loop
-
-    ; store sse and return it in r0
-    ldr     r1, [sp, #28]       ; get address of sse
-    mov     r0, r4              ; return sse
-    str     r4, [r1]            ; store sse
-
-    pop     {r4-r9, pc}
-
-    ENDP
-
-    END
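
What the loop above computes, in plain C: the sum of squared differences
over a 16x16 block, returned both in r0 and through the sse pointer (the
assembly works on four pixels per instruction group):

    static unsigned int mse16x16_sketch(const unsigned char *src,
                                        int src_stride,
                                        const unsigned char *ref,
                                        int ref_stride, unsigned int *sse)
    {
        unsigned int total = 0;
        int r, c;

        for (r = 0; r < 16; r++) {
            for (c = 0; c < 16; c++) {
                int d = src[c] - ref[c];
                total += (unsigned int)(d * d);
            }
            src += src_stride;
            ref += ref_stride;
        }

        *sse = total;
        return total;
    }
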
--- a/vp8/encoder/arm/armv6/vp8_sad16x16_armv6.asm
+++ /dev/null
@@ -1,96 +1,0 @@
-;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_sad16x16_armv6|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    const unsigned char *src_ptr
-; r1    int  src_stride
-; r2    const unsigned char *ref_ptr
-; r3    int  ref_stride
-; stack max_sad (not used)
-|vp8_sad16x16_armv6| PROC
-    stmfd   sp!, {r4-r12, lr}
-
-    pld     [r0, r1, lsl #0]
-    pld     [r2, r3, lsl #0]
-    pld     [r0, r1, lsl #1]
-    pld     [r2, r3, lsl #1]
-
-    mov     r4, #0              ; sad = 0;
-    mov     r5, #8              ; loop count
-
-loop
-    ; 1st row
-    ldr     r6, [r0, #0x0]      ; load 4 src pixels (1A)
-    ldr     r8, [r2, #0x0]      ; load 4 ref pixels (1A)
-    ldr     r7, [r0, #0x4]      ; load 4 src pixels (1A)
-    ldr     r9, [r2, #0x4]      ; load 4 ref pixels (1A)
-    ldr     r10, [r0, #0x8]     ; load 4 src pixels (1B)
-    ldr     r11, [r0, #0xC]     ; load 4 src pixels (1B)
-
-    usada8  r4, r8, r6, r4      ; calculate sad for 4 pixels
-    usad8   r8, r7, r9          ; calculate sad for 4 pixels
-
-    ldr     r12, [r2, #0x8]     ; load 4 ref pixels (1B)
-    ldr     lr, [r2, #0xC]      ; load 4 ref pixels (1B)
-
-    add     r0, r0, r1          ; set src pointer to next row
-    add     r2, r2, r3          ; set dst pointer to next row
-
-    pld     [r0, r1, lsl #1]
-    pld     [r2, r3, lsl #1]
-
-    usada8  r4, r10, r12, r4    ; calculate sad for 4 pixels
-    usada8  r8, r11, lr, r8     ; calculate sad for 4 pixels
-
-    ldr     r6, [r0, #0x0]      ; load 4 src pixels (2A)
-    ldr     r7, [r0, #0x4]      ; load 4 src pixels (2A)
-    add     r4, r4, r8          ; add partial sad values
-
-    ; 2nd row
-    ldr     r8, [r2, #0x0]      ; load 4 ref pixels (2A)
-    ldr     r9, [r2, #0x4]      ; load 4 ref pixels (2A)
-    ldr     r10, [r0, #0x8]     ; load 4 src pixels (2B)
-    ldr     r11, [r0, #0xC]     ; load 4 src pixels (2B)
-
-    usada8  r4, r6, r8, r4      ; calculate sad for 4 pixels
-    usad8   r8, r7, r9          ; calculate sad for 4 pixels
-
-    ldr     r12, [r2, #0x8]     ; load 4 ref pixels (2B)
-    ldr     lr, [r2, #0xC]      ; load 4 ref pixels (2B)
-
-    add     r0, r0, r1          ; set src pointer to next row
-    add     r2, r2, r3          ; set dst pointer to next row
-
-    usada8  r4, r10, r12, r4    ; calculate sad for 4 pixels
-    usada8  r8, r11, lr, r8     ; calculate sad for 4 pixels
-
-    pld     [r0, r1, lsl #1]
-    pld     [r2, r3, lsl #1]
-
-    subs    r5, r5, #1          ; decrement loop counter
-    add     r4, r4, r8          ; add partial sad values
-
-    bne     loop
-
-    mov     r0, r4              ; return sad
-    ldmfd   sp!, {r4-r12, pc}
-
-    ENDP
-
-    END
-
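
The C equivalent is a plain sum of absolute differences over the 16x16
block; the usada8-based assembly accumulates four byte differences per
instruction and two rows per loop iteration, and ignores the max_sad
stack argument:

    static unsigned int sad16x16_sketch(const unsigned char *src,
                                        int src_stride,
                                        const unsigned char *ref,
                                        int ref_stride)
    {
        unsigned int sad = 0;
        int r, c;

        for (r = 0; r < 16; r++) {
            for (c = 0; c < 16; c++) {
                int d = src[c] - ref[c];
                sad += (unsigned int)(d < 0 ? -d : d);
            }
            src += src_stride;
            ref += ref_stride;
        }

        return sad;
    }
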
--- a/vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm
+++ /dev/null
@@ -1,262 +1,0 @@
-;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-    EXPORT |vp8_short_fdct4x4_armv6|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA    |.text|, CODE, READONLY
-; void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
-|vp8_short_fdct4x4_armv6| PROC
-
-    stmfd       sp!, {r4 - r12, lr}
-
-    ; PART 1
-
-    ; coeffs 0-3
-    ldrd        r4, r5, [r0]        ; [i1 | i0] [i3 | i2]
-
-    ldr         r10, c7500
-    ldr         r11, c14500
-    ldr         r12, c0x22a453a0    ; [2217*4 | 5352*4]
-    ldr         lr, c0x00080008
-    ror         r5, r5, #16         ; [i2 | i3]
-
-    qadd16      r6, r4, r5          ; [i1+i2 | i0+i3] = [b1 | a1] without shift
-    qsub16      r7, r4, r5          ; [i1-i2 | i0-i3] = [c1 | d1] without shift
-
-    add         r0, r0, r2          ; update input pointer
-
-    qadd16      r7, r7, r7          ; 2*[c1|d1] --> we can use smlad and smlsd
-                                    ; with 2217*4 and 5352*4 without losing the
-                                    ; sign bit (overflow)
-
-    smuad       r4, r6, lr          ; o0 = (i1+i2)*8 + (i0+i3)*8
-    smusd       r5, r6, lr          ; o2 = (i0+i3)*8 - (i1+i2)*8
-
-    smlad       r6, r7, r12, r11    ; o1 = (c1 * 2217 + d1 * 5352 +  14500)
-    smlsdx      r7, r7, r12, r10    ; o3 = (d1 * 2217 - c1 * 5352 +   7500)
-
-    ldrd        r8, r9, [r0]        ; [i5 | i4] [i7 | i6]
-
-    pkhbt       r3, r4, r6, lsl #4  ; [o1 | o0], keep in register for PART 2
-    pkhbt       r6, r5, r7, lsl #4  ; [o3 | o2]
-
-    str         r6, [r1, #4]
-
-    ; coeffs 4-7
-    ror         r9, r9, #16         ; [i6 | i7]
-
-    qadd16      r6, r8, r9          ; [i5+i6 | i4+i7] = [b1 | a1] without shift
-    qsub16      r7, r8, r9          ; [i5-i6 | i4-i7] = [c1 | d1] without shift
-
-    add         r0, r0, r2          ; update input pointer
-
-    qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd
-                                    ; with 2217*4 and 5352*4 without losing the
-                                    ; sign bit (overflow)
-
-    smuad       r9, r6, lr          ; o4 = (i5+i6)*8 + (i4+i7)*8
-    smusd       r8, r6, lr          ; o6 = (i4+i7)*8 - (i5+i6)*8
-
-    smlad       r6, r7, r12, r11    ; o5 = (c1 * 2217 + d1 * 5352 +  14500)
-    smlsdx      r7, r7, r12, r10    ; o7 = (d1 * 2217 - c1 * 5352 +   7500)
-
-    ldrd        r4, r5, [r0]        ; [i9 | i8] [i11 | i10]
-
-    pkhbt       r9, r9, r6, lsl #4  ; [o5 | o4], keep in register for PART 2
-    pkhbt       r6, r8, r7, lsl #4  ; [o7 | o6]
-
-    str         r6, [r1, #12]
-
-    ; coeffs 8-11
-    ror         r5, r5, #16         ; [i10 | i11]
-
-    qadd16      r6, r4, r5          ; [i9+i10 | i8+i11]=[b1 | a1] without shift
-    qsub16      r7, r4, r5          ; [i9-i10 | i8-i11]=[c1 | d1] without shift
-
-    add         r0, r0, r2          ; update input pointer
-
-    qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd
-                                    ; with 2217*4 and 5352*4 without losing the
-                                    ; sign bit (overflow)
-
-    smuad       r2, r6, lr          ; o8 = (i9+i10)*8 + (i8+i11)*8
-    smusd       r8, r6, lr          ; o10 = (i8+i11)*8 - (i9+i10)*8
-
-    smlad       r6, r7, r12, r11    ; o9 = (c1 * 2217 + d1 * 5352 +  14500)
-    smlsdx      r7, r7, r12, r10    ; o11 = (d1 * 2217 - c1 * 5352 +   7500)
-
-    ldrd        r4, r5, [r0]        ; [i13 | i12] [i15 | i14]
-
-    pkhbt       r2, r2, r6, lsl #4  ; [o9 | o8], keep in register for PART 2
-    pkhbt       r6, r8, r7, lsl #4  ; [o11 | o10]
-
-    str         r6, [r1, #20]
-
-    ; coeffs 12-15
-    ror         r5, r5, #16         ; [i14 | i15]
-
-    qadd16      r6, r4, r5          ; [i13+i14 | i12+i15]=[b1|a1] without shift
-    qsub16      r7, r4, r5          ; [i13-i14 | i12-i15]=[c1|d1] without shift
-
-    qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd
-                                    ; with 2217*4 and 5352*4 without losing the
-                                    ; sign bit (overflow)
-
-    smuad       r4, r6, lr          ; o12 = (i13+i14)*8 + (i12+i15)*8
-    smusd       r5, r6, lr          ; o14 = (i12+i15)*8 - (i13+i14)*8
-
-    smlad       r6, r7, r12, r11    ; o13 = (c1 * 2217 + d1 * 5352 +  14500)
-    smlsdx      r7, r7, r12, r10    ; o15 = (d1 * 2217 - c1 * 5352 +   7500)
-
-    pkhbt       r0, r4, r6, lsl #4  ; [o13 | o12], keep in register for PART 2
-    pkhbt       r6, r5, r7, lsl #4  ; [o15 | o14]
-
-    str         r6, [r1, #28]
-
-
-    ; PART 2 -------------------------------------------------
-    ldr         r11, c12000
-    ldr         r10, c51000
-    ldr         lr, c0x00070007
-
-    qadd16      r4, r3, r0          ; a1 = [i1+i13 | i0+i12]
-    qadd16      r5, r9, r2          ; b1 = [i5+i9  |  i4+i8]
-    qsub16      r6, r9, r2          ; c1 = [i5-i9  |  i4-i8]
-    qsub16      r7, r3, r0          ; d1 = [i1-i13 | i0-i12]
-
-    qadd16      r4, r4, lr          ; a1 + 7
-
-    add         r0, r11, #0x10000   ; add (d!=0)
-
-    qadd16      r2, r4, r5          ; a1 + b1 + 7
-    qsub16      r3, r4, r5          ; a1 - b1 + 7
-
-    ldr         r12, c0x08a914e8    ; [2217 | 5352]
-
-    lsl         r8, r2, #16         ; prepare bottom halfword for scaling
-    asr         r2, r2, #4          ; scale top halfword
-    lsl         r9, r3, #16         ; prepare bottom halfword for scaling
-    asr         r3, r3, #4          ; scale top halfword
-    pkhtb       r4, r2, r8, asr #20 ; pack and scale bottom halfword
-    pkhtb       r5, r3, r9, asr #20 ; pack and scale bottom halfword
-
-    smulbt      r2, r6, r12         ; [ ------ | c1*2217]
-    str         r4, [r1, #0]        ; [     o1 |      o0]
-    smultt      r3, r6, r12         ; [c1*2217 | ------ ]
-    str         r5, [r1, #16]       ; [     o9 |      o8]
-
-    smlabb      r8, r7, r12, r2     ; [ ------ | d1*5352]
-    smlatb      r9, r7, r12, r3     ; [d1*5352 | ------ ]
-
-    smulbb      r2, r6, r12         ; [ ------ | c1*5352]
-    smultb      r3, r6, r12         ; [c1*5352 | ------ ]
-
-    lsls        r6, r7, #16         ; d1 != 0 ?
-    addeq       r8, r8, r11         ; c1_b*2217+d1_b*5352+12000 + (d==0)
-    addne       r8, r8, r0          ; c1_b*2217+d1_b*5352+12000 + (d!=0)
-    asrs        r6, r7, #16
-    addeq       r9, r9, r11         ; c1_t*2217+d1_t*5352+12000 + (d==0)
-    addne       r9, r9, r0          ; c1_t*2217+d1_t*5352+12000 + (d!=0)
-
-    smlabt      r4, r7, r12, r10    ; [ ------ | d1*2217] + 51000
-    smlatt      r5, r7, r12, r10    ; [d1*2217 | ------ ] + 51000
-
-    pkhtb       r9, r9, r8, asr #16
-
-    sub         r4, r4, r2
-    sub         r5, r5, r3
-
-    ldr         r3, [r1, #4]        ; [i3 | i2]
-
-    pkhtb       r5, r5, r4, asr #16 ; [o13|o12]
-
-    str         r9, [r1, #8]        ; [o5 | o4]
-
-    ldr         r9, [r1, #12]       ; [i7 | i6]
-    ldr         r8, [r1, #28]       ; [i15|i14]
-    ldr         r2, [r1, #20]       ; [i11|i10]
-    str         r5, [r1, #24]       ; [o13|o12]
-
-    qadd16      r4, r3, r8          ; a1 = [i3+i15 | i2+i14]
-    qadd16      r5, r9, r2          ; b1 = [i7+i11 | i6+i10]
-
-    qadd16      r4, r4, lr          ; a1 + 7
-
-    qsub16      r6, r9, r2          ; c1 = [i7-i11 | i6-i10]
-    qadd16      r2, r4, r5          ; a1 + b1 + 7
-    qsub16      r7, r3, r8          ; d1 = [i3-i15 | i2-i14]
-    qsub16      r3, r4, r5          ; a1 - b1 + 7
-
-    lsl         r8, r2, #16         ; prepare bottom halfword for scaling
-    asr         r2, r2, #4          ; scale top halfword
-    lsl         r9, r3, #16         ; prepare bottom halfword for scaling
-    asr         r3, r3, #4          ; scale top halfword
-    pkhtb       r4, r2, r8, asr #20 ; pack and scale bottom halfword
-    pkhtb       r5, r3, r9, asr #20 ; pack and scale bottom halfword
-
-    smulbt      r2, r6, r12         ; [ ------ | c1*2217]
-    str         r4, [r1, #4]        ; [     o3 |      o2]
-    smultt      r3, r6, r12         ; [c1*2217 | ------ ]
-    str         r5, [r1, #20]       ; [    o11 |     o10]
-
-    smlabb      r8, r7, r12, r2     ; [ ------ | d1*5352]
-    smlatb      r9, r7, r12, r3     ; [d1*5352 | ------ ]
-
-    smulbb      r2, r6, r12         ; [ ------ | c1*5352]
-    smultb      r3, r6, r12         ; [c1*5352 | ------ ]
-
-    lsls        r6, r7, #16         ; d1 != 0 ?
-    addeq       r8, r8, r11         ; c1_b*2217+d1_b*5352+12000 + (d==0)
-    addne       r8, r8, r0          ; c1_b*2217+d1_b*5352+12000 + (d!=0)
-
-    asrs        r6, r7, #16
-    addeq       r9, r9, r11         ; c1_t*2217+d1_t*5352+12000 + (d==0)
-    addne       r9, r9, r0          ; c1_t*2217+d1_t*5352+12000 + (d!=0)
-
-    smlabt      r4, r7, r12, r10    ; [ ------ | d1*2217] + 51000
-    smlatt      r5, r7, r12, r10    ; [d1*2217 | ------ ] + 51000
-
-    pkhtb       r9, r9, r8, asr #16
-
-    sub         r4, r4, r2
-    sub         r5, r5, r3
-
-    str         r9, [r1, #12]       ; [o7 | o6]
-    pkhtb       r5, r5, r4, asr #16 ; [o15|o14]
-
-    str         r5, [r1, #28]       ; [o15|o14]
-
-    ldmfd       sp!, {r4 - r12, pc}
-
-    ENDP
-
-; Used constants
-c7500
-    DCD     7500
-c14500
-    DCD     14500
-c0x22a453a0
-    DCD     0x22a453a0
-c0x00080008
-    DCD     0x00080008
-c12000
-    DCD     12000
-c51000
-    DCD     51000
-c0x00070007
-    DCD     0x00070007
-c0x08a914e8
-    DCD     0x08a914e8
-
-    END
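
The constant pool above (5352 and 2217 scaled by 4, the 14500/7500
first-pass rounders, the 12000/51000 second-pass rounders, and the
(d1 != 0) correction) matches the C form of the VP8 4x4 forward DCT that
the assembly mirrors; roughly (pitch is in bytes, as in the asm):

    static void short_fdct4x4_sketch(short *input, short *output, int pitch)
    {
        int i, a1, b1, c1, d1;
        short *ip = input;
        short *op = output;

        for (i = 0; i < 4; i++) {           /* PART 1: rows */
            a1 = (ip[0] + ip[3]) * 8;
            b1 = (ip[1] + ip[2]) * 8;
            c1 = (ip[1] - ip[2]) * 8;
            d1 = (ip[0] - ip[3]) * 8;

            op[0] = (short)(a1 + b1);
            op[2] = (short)(a1 - b1);
            op[1] = (short)((c1 * 2217 + d1 * 5352 + 14500) >> 12);
            op[3] = (short)((d1 * 2217 - c1 * 5352 + 7500) >> 12);

            ip += pitch / 2;                /* pitch bytes = pitch/2 shorts */
            op += 4;
        }

        ip = output;
        op = output;

        for (i = 0; i < 4; i++) {           /* PART 2: columns */
            a1 = ip[0] + ip[12];
            b1 = ip[4] + ip[8];
            c1 = ip[4] - ip[8];
            d1 = ip[0] - ip[12];

            op[0]  = (short)((a1 + b1 + 7) >> 4);
            op[8]  = (short)((a1 - b1 + 7) >> 4);
            op[4]  = (short)(((c1 * 2217 + d1 * 5352 + 12000) >> 16)
                             + (d1 != 0));
            op[12] = (short)((d1 * 2217 - c1 * 5352 + 51000) >> 16);

            ip++;
            op++;
        }
    }
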
--- a/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm
+++ /dev/null
@@ -1,265 +1,0 @@
-;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_subtract_mby_armv6|
-    EXPORT  |vp8_subtract_mbuv_armv6|
-    EXPORT  |vp8_subtract_b_armv6|
-
-    INCLUDE asm_enc_offsets.asm
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    BLOCK *be
-; r1    BLOCKD *bd
-; r2    int pitch
-|vp8_subtract_b_armv6| PROC
-
-    stmfd   sp!, {r4-r9}
-
-    ldr     r4, [r0, #vp8_block_base_src]
-    ldr     r5, [r0, #vp8_block_src]
-    ldr     r6, [r0, #vp8_block_src_diff]
-
-    ldr     r3, [r4]
-    ldr     r7, [r0, #vp8_block_src_stride]
-    add     r3, r3, r5          ; src = *base_src + src
-    ldr     r8, [r1, #vp8_blockd_predictor]
-
-    mov     r9, #4              ; loop count
-
-loop_block
-
-    ldr     r0, [r3], r7        ; src
-    ldr     r1, [r8], r2        ; pred
-
-    uxtb16  r4, r0              ; [s2 | s0]
-    uxtb16  r5, r1              ; [p2 | p0]
-    uxtb16  r0, r0, ror #8      ; [s3 | s1]
-    uxtb16  r1, r1, ror #8      ; [p3 | p1]
-
-    usub16  r4, r4, r5          ; [d2 | d0]
-    usub16  r5, r0, r1          ; [d3 | d1]
-
-    subs    r9, r9, #1          ; decrement loop counter
-
-    pkhbt   r0, r4, r5, lsl #16 ; [d1 | d0]
-    pkhtb   r1, r5, r4, asr #16 ; [d3 | d2]
-
-    str     r0, [r6, #0]        ; diff
-    str     r1, [r6, #4]        ; diff
-
-    add     r6, r6, r2, lsl #1  ; update diff pointer
-    bne     loop_block
-
-    ldmfd   sp!, {r4-r9}
-    mov     pc, lr
-
-    ENDP
-
-
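
vp8_subtract_b above is, in C, a 4x4 residual: diff = src - pred. The
mbuv/mby routines that follow do the same over the 8x8 chroma and 16x16
luma planes, with the uxtb16/usub16 pairs handling four pixels at a time.
A sketch (diff advances by pitch shorts and pred by pitch bytes, matching
the pointer updates above):

    static void subtract_b_sketch(short *diff, int pitch,
                                  const unsigned char *src, int src_stride,
                                  const unsigned char *pred)
    {
        int r, c;

        for (r = 0; r < 4; r++) {
            for (c = 0; c < 4; c++)
                diff[c] = (short)(src[c] - pred[c]);
            diff += pitch;
            src += src_stride;
            pred += pitch;
        }
    }
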
-; r0    short *diff
-; r1    unsigned char *usrc
-; r2    unsigned char *vsrc
-; r3    unsigned char *pred
-; stack int stride
-|vp8_subtract_mbuv_armv6| PROC
-
-    stmfd   sp!, {r4-r12, lr}
-
-    add     r0, r0, #512        ; set *diff pointer to Cb
-    add     r3, r3, #256        ; set *pred pointer to Cb
-
-    mov     r4, #8              ; loop count
-    ldr     r5, [sp, #40]       ; stride
-
-    ; Subtract U block
-loop_u
-    ldr     r6, [r1]            ; src       (A)
-    ldr     r7, [r3], #4        ; pred      (A)
-
-    uxtb16  r8, r6              ; [s2 | s0] (A)
-    uxtb16  r9, r7              ; [p2 | p0] (A)
-    uxtb16  r10, r6, ror #8     ; [s3 | s1] (A)
-    uxtb16  r11, r7, ror #8     ; [p3 | p1] (A)
-
-    usub16  r6, r8, r9          ; [d2 | d0] (A)
-    usub16  r7, r10, r11        ; [d3 | d1] (A)
-
-    ldr     r10, [r1, #4]       ; src       (B)
-    ldr     r11, [r3], #4       ; pred      (B)
-
-    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (A)
-    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (A)
-
-    str     r8, [r0], #4        ; diff      (A)
-    uxtb16  r8, r10             ; [s2 | s0] (B)
-    str     r9, [r0], #4        ; diff      (A)
-
-    uxtb16  r9, r11             ; [p2 | p0] (B)
-    uxtb16  r10, r10, ror #8    ; [s3 | s1] (B)
-    uxtb16  r11, r11, ror #8    ; [p3 | p1] (B)
-
-    usub16  r6, r8, r9          ; [d2 | d0] (B)
-    usub16  r7, r10, r11        ; [d3 | d1] (B)
-
-    add     r1, r1, r5          ; update usrc pointer
-
-    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (B)
-    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (B)
-
-    str     r8, [r0], #4        ; diff      (B)
-    subs    r4, r4, #1          ; update loop counter
-    str     r9, [r0], #4        ; diff      (B)
-
-    bne     loop_u
-
-    mov     r4, #8              ; loop count
-
-    ; Subtract V block
-loop_v
-    ldr     r6, [r2]            ; src       (A)
-    ldr     r7, [r3], #4        ; pred      (A)
-
-    uxtb16  r8, r6              ; [s2 | s0] (A)
-    uxtb16  r9, r7              ; [p2 | p0] (A)
-    uxtb16  r10, r6, ror #8     ; [s3 | s1] (A)
-    uxtb16  r11, r7, ror #8     ; [p3 | p1] (A)
-
-    usub16  r6, r8, r9          ; [d2 | d0] (A)
-    usub16  r7, r10, r11        ; [d3 | d1] (A)
-
-    ldr     r10, [r2, #4]       ; src       (B)
-    ldr     r11, [r3], #4       ; pred      (B)
-
-    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (A)
-    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (A)
-
-    str     r8, [r0], #4        ; diff      (A)
-    uxtb16  r8, r10             ; [s2 | s0] (B)
-    str     r9, [r0], #4        ; diff      (A)
-
-    uxtb16  r9, r11             ; [p2 | p0] (B)
-    uxtb16  r10, r10, ror #8    ; [s3 | s1] (B)
-    uxtb16  r11, r11, ror #8    ; [p3 | p1] (B)
-
-    usub16  r6, r8, r9          ; [d2 | d0] (B)
-    usub16  r7, r10, r11        ; [d3 | d1] (B)
-
-    add     r2, r2, r5          ; update vsrc pointer
-
-    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (B)
-    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (B)
-
-    str     r8, [r0], #4        ; diff      (B)
-    subs    r4, r4, #1          ; update loop counter
-    str     r9, [r0], #4        ; diff      (B)
-
-    bne     loop_v
-
-    ldmfd   sp!, {r4-r12, pc}
-
-    ENDP
-
-
-; r0    short *diff
-; r1    unsigned char *src
-; r2    unsigned char *pred
-; r3    int stride
-|vp8_subtract_mby_armv6| PROC
-
-    stmfd   sp!, {r4-r11}
-
-    mov     r4, #16
-loop
-    ldr     r6, [r1]            ; src       (A)
-    ldr     r7, [r2], #4        ; pred      (A)
-
-    uxtb16  r8, r6              ; [s2 | s0] (A)
-    uxtb16  r9, r7              ; [p2 | p0] (A)
-    uxtb16  r10, r6, ror #8     ; [s3 | s1] (A)
-    uxtb16  r11, r7, ror #8     ; [p3 | p1] (A)
-
-    usub16  r6, r8, r9          ; [d2 | d0] (A)
-    usub16  r7, r10, r11        ; [d3 | d1] (A)
-
-    ldr     r10, [r1, #4]       ; src       (B)
-    ldr     r11, [r2], #4       ; pred      (B)
-
-    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (A)
-    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (A)
-
-    str     r8, [r0], #4        ; diff      (A)
-    uxtb16  r8, r10             ; [s2 | s0] (B)
-    str     r9, [r0], #4        ; diff      (A)
-
-    uxtb16  r9, r11             ; [p2 | p0] (B)
-    uxtb16  r10, r10, ror #8    ; [s3 | s1] (B)
-    uxtb16  r11, r11, ror #8    ; [p3 | p1] (B)
-
-    usub16  r6, r8, r9          ; [d2 | d0] (B)
-    usub16  r7, r10, r11        ; [d3 | d1] (B)
-
-    ldr     r10, [r1, #8]       ; src       (C)
-    ldr     r11, [r2], #4       ; pred      (C)
-
-    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (B)
-    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (B)
-
-    str     r8, [r0], #4        ; diff      (B)
-    uxtb16  r8, r10             ; [s2 | s0] (C)
-    str     r9, [r0], #4        ; diff      (B)
-
-    uxtb16  r9, r11             ; [p2 | p0] (C)
-    uxtb16  r10, r10, ror #8    ; [s3 | s1] (C)
-    uxtb16  r11, r11, ror #8    ; [p3 | p1] (C)
-
-    usub16  r6, r8, r9          ; [d2 | d0] (C)
-    usub16  r7, r10, r11        ; [d3 | d1] (C)
-
-    ldr     r10, [r1, #12]      ; src       (D)
-    ldr     r11, [r2], #4       ; pred      (D)
-
-    pkhbt   r8, r6, r7, lsl #16  ; [d1 | d0] (C)
-    pkhtb   r9, r7, r6, asr #16  ; [d3 | d2] (C)
-
-    str     r8, [r0], #4        ; diff      (C)
-    uxtb16  r8, r10             ; [s2 | s0] (D)
-    str     r9, [r0], #4        ; diff      (C)
-
-    uxtb16  r9, r11             ; [p2 | p0] (D)
-    uxtb16  r10, r10, ror #8    ; [s3 | s1] (D)
-    uxtb16  r11, r11, ror #8    ; [p3 | p1] (D)
-
-    usub16  r6, r8, r9          ; [d2 | d0] (D)
-    usub16  r7, r10, r11        ; [d3 | d1] (D)
-
-    add     r1, r1, r3          ; update src pointer
-
-    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (D)
-    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (D)
-
-    str     r8, [r0], #4        ; diff      (D)
-    subs    r4, r4, #1          ; update loop counter
-    str     r9, [r0], #4        ; diff      (D)
-
-    bne     loop
-
-    ldmfd   sp!, {r4-r11}
-    mov     pc, lr
-
-    ENDP
-
-    END
-
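
All three subtract kernels above compute the same scalar operation:
byte-wise src - pred over a block, widened to 16-bit residuals. A plain-C
sketch under that reading (generic strides; parameter names are
illustrative):

    /* diff[r][c] = src[r][c] - pred[r][c] for a w x h block of bytes;
       the ARMv6 code above does this four pixels at a time with usub16. */
    static void subtract_block(short *diff, int diff_stride,
                               const unsigned char *src, int src_stride,
                               const unsigned char *pred, int pred_stride,
                               int w, int h) {
      int r, c;
      for (r = 0; r < h; r++) {
        for (c = 0; c < w; c++)
          diff[c] = (short)(src[c] - pred[c]);
        diff += diff_stride;
        src  += src_stride;
        pred += pred_stride;
      }
    }
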
--- a/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
+++ /dev/null
@@ -1,154 +1,0 @@
-;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_variance16x16_armv6|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char *src_ptr
-; r1    int source_stride
-; r2    unsigned char *ref_ptr
-; r3    int  recon_stride
-; stack unsigned int *sse
-|vp9_variance16x16_armv6| PROC
-
-    stmfd   sp!, {r4-r12, lr}
-
-    pld     [r0, r1, lsl #0]
-    pld     [r2, r3, lsl #0]
-
-    mov     r8, #0              ; initialize sum = 0
-    mov     r11, #0             ; initialize sse = 0
-    mov     r12, #16            ; set loop counter to 16 (=block height)
-
-loop
-    ; 1st 4 pixels
-    ldr     r4, [r0, #0]        ; load 4 src pixels
-    ldr     r5, [r2, #0]        ; load 4 ref pixels
-
-    mov     lr, #0              ; constant zero
-
-    usub8   r6, r4, r5          ; calculate difference
-    pld     [r0, r1, lsl #1]
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r9, r5, r4          ; calculate difference with reversed operands
-    pld     [r2, r3, lsl #1]
-    sel     r6, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-    ; calculate total sum
-    adds    r8, r8, r4          ; add positive differences to sum
-    subs    r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-
-    ; 2nd 4 pixels
-    ldr     r4, [r0, #4]        ; load 4 src pixels
-    ldr     r5, [r2, #4]        ; load 4 ref pixels
-    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
-
-    usub8   r6, r4, r5          ; calculate difference
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r9, r5, r4          ; calculate difference with reversed operands
-    sel     r6, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-
-    ; 3rd 4 pixels
-    ldr     r4, [r0, #8]        ; load 4 src pixels
-    ldr     r5, [r2, #8]        ; load 4 ref pixels
-    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
-
-    usub8   r6, r4, r5          ; calculate difference
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r9, r5, r4          ; calculate difference with reversed operands
-    sel     r6, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-
-    ; 4th 4 pixels
-    ldr     r4, [r0, #12]       ; load 4 src pixels
-    ldr     r5, [r2, #12]       ; load 4 ref pixels
-    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
-
-    usub8   r6, r4, r5          ; calculate difference
-    add     r0, r0, r1          ; set src_ptr to next row
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r9, r5, r4          ; calculate difference with reversed operands
-    add     r2, r2, r3          ; set ref_ptr to next row
-    sel     r6, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
-
-
-    subs    r12, r12, #1
-
-    bne     loop
-
-    ; return stuff
-    ldr     r6, [sp, #40]       ; get address of sse
-    mul     r0, r8, r8          ; sum * sum
-    str     r11, [r6]           ; store sse
-    sub     r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
-
-    ldmfd   sp!, {r4-r12, pc}
-
-    ENDP
-
-    END
-
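
The return sequence above implements the standard block-variance identity
variance = sse - sum^2 / N, with the division folded into a shift. A scalar
C model of the 16x16 kernel (the 8x8 variant below differs only in using
asr #6 instead of asr #8):

    static unsigned int variance16x16_model(const unsigned char *src,
                                            int src_stride,
                                            const unsigned char *ref,
                                            int ref_stride,
                                            unsigned int *sse) {
      int i, j, sum = 0;
      unsigned int s = 0;
      for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j++) {
          int d = src[j] - ref[j];       /* the usub8/sel/usad8 steps */
          sum += d;
          s   += (unsigned int)(d * d);  /* the smlad accumulation    */
        }
        src += src_stride;
        ref += ref_stride;
      }
      *sse = s;
      return s - (unsigned int)((sum * sum) >> 8);  /* N = 256, so >> 8 */
    }
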
--- a/vp8/encoder/arm/armv6/vp8_variance8x8_armv6.asm
+++ /dev/null
@@ -1,101 +1,0 @@
-;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_variance8x8_armv6|
-
-    ARM
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char *src_ptr
-; r1    int source_stride
-; r2    unsigned char *ref_ptr
-; r3    int  recon_stride
-; stack unsigned int *sse
-|vp9_variance8x8_armv6| PROC
-
-    push    {r4-r10, lr}
-
-    pld     [r0, r1, lsl #0]
-    pld     [r2, r3, lsl #0]
-
-    mov     r12, #8             ; set loop counter to 8 (=block height)
-    mov     r4, #0              ; initialize sum = 0
-    mov     r5, #0              ; initialize sse = 0
-
-loop
-    ; 1st 4 pixels
-    ldr     r6, [r0, #0x0]      ; load 4 src pixels
-    ldr     r7, [r2, #0x0]      ; load 4 ref pixels
-
-    mov     lr, #0              ; constant zero
-
-    usub8   r8, r6, r7          ; calculate difference
-    pld     [r0, r1, lsl #1]
-    sel     r10, r8, lr         ; select bytes with positive difference
-    usub8   r9, r7, r6          ; calculate difference with reversed operands
-    pld     [r2, r3, lsl #1]
-    sel     r8, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r6, r10, lr         ; calculate sum of positive differences
-    usad8   r7, r8, lr          ; calculate sum of negative differences
-    orr     r8, r8, r10         ; differences of all 4 pixels
-    ; calculate total sum
-    add     r4, r4, r6          ; add positive differences to sum
-    sub     r4, r4, r7          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r7, r8              ; byte (two pixels) to halfwords
-    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
-    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)
-
-    ; 2nd 4 pixels
-    ldr     r6, [r0, #0x4]      ; load 4 src pixels
-    ldr     r7, [r2, #0x4]      ; load 4 ref pixels
-    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)
-
-    usub8   r8, r6, r7          ; calculate difference
-    add     r0, r0, r1          ; set src_ptr to next row
-    sel     r10, r8, lr         ; select bytes with positive difference
-    usub8   r9, r7, r6          ; calculate difference with reversed operands
-    add     r2, r2, r3          ; set ref_ptr to next row
-    sel     r8, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r6, r10, lr         ; calculate sum of positive differences
-    usad8   r7, r8, lr          ; calculate sum of negative differences
-    orr     r8, r8, r10         ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r4, r4, r6          ; add positive differences to sum
-    sub     r4, r4, r7          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r7, r8              ; byte (two pixels) to halfwords
-    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
-    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)
-    subs    r12, r12, #1        ; next row
-    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)
-
-    bne     loop
-
-    ; return stuff
-    ldr     r8, [sp, #32]       ; get address of sse
-    mul     r1, r4, r4          ; sum * sum
-    str     r5, [r8]            ; store sse
-    sub     r0, r5, r1, asr #6  ; return (sse - ((sum * sum) >> 6))
-
-    pop     {r4-r10, pc}
-
-    ENDP
-
-    END
--- a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
+++ /dev/null
@@ -1,182 +1,0 @@
-;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_variance_halfpixvar16x16_h_armv6|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char *src_ptr
-; r1    int source_stride
-; r2    unsigned char *ref_ptr
-; r3    int  recon_stride
-; stack unsigned int *sse
-|vp9_variance_halfpixvar16x16_h_armv6| PROC
-
-    stmfd   sp!, {r4-r12, lr}
-
-    pld     [r0, r1, lsl #0]
-    pld     [r2, r3, lsl #0]
-
-    mov     r8, #0              ; initialize sum = 0
-    ldr     r10, c80808080
-    mov     r11, #0             ; initialize sse = 0
-    mov     r12, #16            ; set loop counter to 16 (=block height)
-    mov     lr, #0              ; constant zero
-loop
-    ; 1st 4 pixels
-    ldr     r4, [r0, #0]        ; load 4 src pixels
-    ldr     r6, [r0, #1]        ; load 4 src pixels with 1 byte offset
-    ldr     r5, [r2, #0]        ; load 4 ref pixels
-
-    ; bilinear interpolation
-    mvn     r6, r6
-    uhsub8  r4, r4, r6
-    eor     r4, r4, r10
-
-    usub8   r6, r4, r5          ; calculate difference
-    pld     [r0, r1, lsl #1]
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r6, r5, r4          ; calculate difference with reversed operands
-    pld     [r2, r3, lsl #1]
-    sel     r6, r6, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-    ; calculate total sum
-    adds    r8, r8, r4          ; add positive differences to sum
-    subs    r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-
-    ; 2nd 4 pixels
-    ldr     r4, [r0, #4]        ; load 4 src pixels
-    ldr     r6, [r0, #5]        ; load 4 src pixels with 1 byte offset
-    ldr     r5, [r2, #4]        ; load 4 ref pixels
-
-    ; bilinear interpolation
-    mvn     r6, r6
-    uhsub8  r4, r4, r6
-    eor     r4, r4, r10
-
-    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
-
-    usub8   r6, r4, r5          ; calculate difference
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r6, r5, r4          ; calculate difference with reversed operands
-    sel     r6, r6, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-
-    ; 3rd 4 pixels
-    ldr     r4, [r0, #8]        ; load 4 src pixels
-    ldr     r6, [r0, #9]        ; load 4 src pixels with 1 byte offset
-    ldr     r5, [r2, #8]        ; load 4 ref pixels
-
-    ; bilinear interpolation
-    mvn     r6, r6
-    uhsub8  r4, r4, r6
-    eor     r4, r4, r10
-
-    smlad   r11, r7, r7, r11  ; dual signed multiply, add and accumulate (2)
-
-    usub8   r6, r4, r5          ; calculate difference
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r6, r5, r4          ; calculate difference with reversed operands
-    sel     r6, r6, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-
-    ; 4th 4 pixels
-    ldr     r4, [r0, #12]       ; load 4 src pixels
-    ldr     r6, [r0, #13]       ; load 4 src pixels with 1 byte offset
-    ldr     r5, [r2, #12]       ; load 4 ref pixels
-
-    ; bilinear interpolation
-    mvn     r6, r6
-    uhsub8  r4, r4, r6
-    eor     r4, r4, r10
-
-    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
-
-    usub8   r6, r4, r5          ; calculate difference
-    add     r0, r0, r1          ; set src_ptr to next row
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r6, r5, r4          ; calculate difference with reversed operands
-    add     r2, r2, r3          ; set ref_ptr to next row
-    sel     r6, r6, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
-
-    subs    r12, r12, #1
-
-    bne     loop
-
-    ; return stuff
-    ldr     r6, [sp, #40]       ; get address of sse
-    mul     r0, r8, r8          ; sum * sum
-    str     r11, [r6]           ; store sse
-    sub     r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
-
-    ldmfd   sp!, {r4-r12, pc}
-
-    ENDP
-
-c80808080
-    DCD     0x80808080
-
-    END
-
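
The mvn/uhsub8/eor sequence in the half-pel kernels computes a rounded
byte-wise average without widening: per byte, ((a - (255 - b)) >> 1) ^ 0x80
equals (a + b + 1) >> 1, because the eor with 0x80808080 adds back the 128
bias introduced by subtracting the complement. A one-lane scalar model
(arithmetic right shift assumed):

    /* Model of mvn + uhsub8 + eor for a single byte lane. */
    static unsigned char halfpel_avg(unsigned char a, unsigned char b) {
      int t = (a - (255 - b)) >> 1;     /* uhsub8 against the complement */
      return (unsigned char)(t ^ 0x80); /* undo the bias: (a + b + 1) >> 1 */
    }
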
--- a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
+++ /dev/null
@@ -1,222 +1,0 @@
-;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_variance_halfpixvar16x16_hv_armv6|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char *src_ptr
-; r1    int source_stride
-; r2    unsigned char *ref_ptr
-; r3    int  recon_stride
-; stack unsigned int *sse
-|vp9_variance_halfpixvar16x16_hv_armv6| PROC
-
-    stmfd   sp!, {r4-r12, lr}
-
-    pld     [r0, r1, lsl #0]
-    pld     [r2, r3, lsl #0]
-
-    mov     r8, #0              ; initialize sum = 0
-    ldr     r10, c80808080
-    mov     r11, #0             ; initialize sse = 0
-    mov     r12, #16            ; set loop counter to 16 (=block height)
-    mov     lr, #0              ; constant zero
-loop
-    add     r9, r0, r1          ; pointer to pixels on the next row
-    ; 1st 4 pixels
-    ldr     r4, [r0, #0]        ; load source pixels a, row N
-    ldr     r6, [r0, #1]        ; load source pixels b, row N
-    ldr     r5, [r9, #0]        ; load source pixels c, row N+1
-    ldr     r7, [r9, #1]        ; load source pixels d, row N+1
-
-    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
-    mvn     r6, r6
-    uhsub8  r4, r4, r6
-    eor     r4, r4, r10
-    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
-    mvn     r7, r7
-    uhsub8  r5, r5, r7
-    eor     r5, r5, r10
-    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
-    mvn     r5, r5
-    uhsub8  r4, r4, r5
-    ldr     r5, [r2, #0]        ; load 4 ref pixels
-    eor     r4, r4, r10
-
-    usub8   r6, r4, r5          ; calculate difference
-    pld     [r0, r1, lsl #1]
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r6, r5, r4          ; calculate difference with reversed operands
-    pld     [r2, r3, lsl #1]
-    sel     r6, r6, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-    ; calculate total sum
-    adds    r8, r8, r4          ; add positive differences to sum
-    subs    r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-
-    ; 2nd 4 pixels
-    ldr     r4, [r0, #4]        ; load source pixels a, row N
-    ldr     r6, [r0, #5]        ; load source pixels b, row N
-    ldr     r5, [r9, #4]        ; load source pixels c, row N+1
-
-    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
-
-    ldr     r7, [r9, #5]        ; load source pixels d, row N+1
-
-    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
-    mvn     r6, r6
-    uhsub8  r4, r4, r6
-    eor     r4, r4, r10
-    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
-    mvn     r7, r7
-    uhsub8  r5, r5, r7
-    eor     r5, r5, r10
-    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
-    mvn     r5, r5
-    uhsub8  r4, r4, r5
-    ldr     r5, [r2, #4]        ; load 4 ref pixels
-    eor     r4, r4, r10
-
-    usub8   r6, r4, r5          ; calculate difference
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r6, r5, r4          ; calculate difference with reversed operands
-    sel     r6, r6, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-
-    ; 3rd 4 pixels
-    ldr     r4, [r0, #8]        ; load source pixels a, row N
-    ldr     r6, [r0, #9]        ; load source pixels b, row N
-    ldr     r5, [r9, #8]        ; load source pixels c, row N+1
-
-    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
-
-    ldr     r7, [r9, #9]        ; load source pixels d, row N+1
-
-    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
-    mvn     r6, r6
-    uhsub8  r4, r4, r6
-    eor     r4, r4, r10
-    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
-    mvn     r7, r7
-    uhsub8  r5, r5, r7
-    eor     r5, r5, r10
-    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
-    mvn     r5, r5
-    uhsub8  r4, r4, r5
-    ldr     r5, [r2, #8]        ; load 4 ref pixels
-    eor     r4, r4, r10
-
-    usub8   r6, r4, r5          ; calculate difference
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r6, r5, r4          ; calculate difference with reversed operands
-    sel     r6, r6, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-
-    ; 4th 4 pixels
-    ldr     r4, [r0, #12]       ; load source pixels a, row N
-    ldr     r6, [r0, #13]       ; load source pixels b, row N
-    ldr     r5, [r9, #12]       ; load source pixels c, row N+1
-    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
-    ldr     r7, [r9, #13]       ; load source pixels d, row N+1
-
-    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
-    mvn     r6, r6
-    uhsub8  r4, r4, r6
-    eor     r4, r4, r10
-    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
-    mvn     r7, r7
-    uhsub8  r5, r5, r7
-    eor     r5, r5, r10
-    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
-    mvn     r5, r5
-    uhsub8  r4, r4, r5
-    ldr     r5, [r2, #12]       ; load 4 ref pixels
-    eor     r4, r4, r10
-
-    usub8   r6, r4, r5          ; calculate difference
-    add     r0, r0, r1          ; set src_ptr to next row
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r6, r5, r4          ; calculate difference with reversed operands
-    add     r2, r2, r3          ; set ref_ptr to next row
-    sel     r6, r6, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-    subs    r12, r12, #1
-    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
-
-    bne     loop
-
-    ; return stuff
-    ldr     r6, [sp, #40]       ; get address of sse
-    mul     r0, r8, r8          ; sum * sum
-    str     r11, [r6]           ; store sse
-    sub     r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
-
-    ldmfd   sp!, {r4-r12, pc}
-
-    ENDP
-
-c80808080
-    DCD     0x80808080
-
-    END
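
The _hv variant above chains that rounded average three times per pixel,
exactly as its x/y/z comments describe; in scalar form:

    /* z = avg(avg(a, b), avg(c, d)); a,b from row N, c,d from row N+1,
       each avg being (p + q + 1) >> 1. */
    static unsigned char halfpel_hv(unsigned char a, unsigned char b,
                                    unsigned char c, unsigned char d) {
      unsigned char x = (unsigned char)((a + b + 1) >> 1);  /* row N    */
      unsigned char y = (unsigned char)((c + d + 1) >> 1);  /* row N+1  */
      return (unsigned char)((x + y + 1) >> 1);             /* vertical */
    }
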
--- a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
+++ /dev/null
@@ -1,184 +1,0 @@
-;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_variance_halfpixvar16x16_v_armv6|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char *src_ptr
-; r1    int source_stride
-; r2    unsigned char *ref_ptr
-; r3    int  recon_stride
-; stack unsigned int *sse
-|vp9_variance_halfpixvar16x16_v_armv6| PROC
-
-    stmfd   sp!, {r4-r12, lr}
-
-    pld     [r0, r1, lsl #0]
-    pld     [r2, r3, lsl #0]
-
-    mov     r8, #0              ; initialize sum = 0
-    ldr     r10, c80808080
-    mov     r11, #0             ; initialize sse = 0
-    mov     r12, #16            ; set loop counter to 16 (=block height)
-    mov     lr, #0              ; constant zero
-loop
-    add     r9, r0, r1          ; set src pointer to next row
-    ; 1st 4 pixels
-    ldr     r4, [r0, #0]        ; load 4 src pixels
-    ldr     r6, [r9, #0]        ; load 4 src pixels from next row
-    ldr     r5, [r2, #0]        ; load 4 ref pixels
-
-    ; bilinear interpolation
-    mvn     r6, r6
-    uhsub8  r4, r4, r6
-    eor     r4, r4, r10
-
-    usub8   r6, r4, r5          ; calculate difference
-    pld     [r0, r1, lsl #1]
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r6, r5, r4          ; calculate difference with reversed operands
-    pld     [r2, r3, lsl #1]
-    sel     r6, r6, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-    ; calculate total sum
-    adds    r8, r8, r4          ; add positive differences to sum
-    subs    r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-
-    ; 2nd 4 pixels
-    ldr     r4, [r0, #4]        ; load 4 src pixels
-    ldr     r6, [r9, #4]        ; load 4 src pixels from next row
-    ldr     r5, [r2, #4]        ; load 4 ref pixels
-
-    ; bilinear interpolation
-    mvn     r6, r6
-    uhsub8  r4, r4, r6
-    eor     r4, r4, r10
-
-    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
-
-    usub8   r6, r4, r5          ; calculate difference
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r6, r5, r4          ; calculate difference with reversed operands
-    sel     r6, r6, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-
-    ; 3rd 4 pixels
-    ldr     r4, [r0, #8]        ; load 4 src pixels
-    ldr     r6, [r9, #8]        ; load 4 src pixels from next row
-    ldr     r5, [r2, #8]        ; load 4 ref pixels
-
-    ; bilinear interpolation
-    mvn     r6, r6
-    uhsub8  r4, r4, r6
-    eor     r4, r4, r10
-
-    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
-
-    usub8   r6, r4, r5          ; calculate difference
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r6, r5, r4          ; calculate difference with reversed operands
-    sel     r6, r6, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-
-    ; 4th 4 pixels
-    ldr     r4, [r0, #12]       ; load 4 src pixels
-    ldr     r6, [r9, #12]       ; load 4 src pixels from next row
-    ldr     r5, [r2, #12]       ; load 4 ref pixels
-
-    ; bilinear interpolation
-    mvn     r6, r6
-    uhsub8  r4, r4, r6
-    eor     r4, r4, r10
-
-    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
-
-    usub8   r6, r4, r5          ; calculate difference
-    add     r0, r0, r1          ; set src_ptr to next row
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r6, r5, r4          ; calculate difference with reversed operands
-    add     r2, r2, r3          ; set ref_ptr to next row
-    sel     r6, r6, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
-
-
-    subs    r12, r12, #1
-
-    bne     loop
-
-    ; return stuff
-    ldr     r6, [sp, #40]       ; get address of sse
-    mul     r0, r8, r8          ; sum * sum
-    str     r11, [r6]           ; store sse
-    sub     r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
-
-    ldmfd   sp!, {r4-r12, pc}
-
-    ENDP
-
-c80808080
-    DCD     0x80808080
-
-    END
-
--- a/vp8/encoder/arm/armv6/walsh_v6.asm
+++ /dev/null
@@ -1,212 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-    EXPORT |vp8_short_walsh4x4_armv6|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA    |.text|, CODE, READONLY  ; name this block of code
-
-;short vp8_short_walsh4x4_armv6(short *input, short *output, int pitch)
-; r0    short *input,
-; r1    short *output,
-; r2    int pitch
-|vp8_short_walsh4x4_armv6| PROC
-
-    stmdb       sp!, {r4 - r11, lr}
-
-    ldrd        r4, r5, [r0], r2
-    ldr         lr, c00040004
-    ldrd        r6, r7, [r0], r2
-
-    ; 0-3
-    qadd16      r3, r4, r5          ; [d1|a1] [1+3   |   0+2]
-    qsub16      r4, r4, r5          ; [c1|b1] [1-3   |   0-2]
-
-    ldrd        r8, r9, [r0], r2
-    ; 4-7
-    qadd16      r5, r6, r7          ; [d1|a1] [5+7   |   4+6]
-    qsub16      r6, r6, r7          ; [c1|b1] [5-7   |   4-6]
-
-    ldrd        r10, r11, [r0]
-    ; 8-11
-    qadd16      r7, r8, r9          ; [d1|a1] [9+11  |  8+10]
-    qsub16      r8, r8, r9          ; [c1|b1] [9-11  |  8-10]
-
-    ; 12-15
-    qadd16      r9, r10, r11        ; [d1|a1] [13+15 | 12+14]
-    qsub16      r10, r10, r11       ; [c1|b1] [13-15 | 12-14]
-
-
-    lsls        r2, r3, #16
-    smuad       r11, r3, lr         ; A0 = a1<<2 + d1<<2
-    addne       r11, r11, #1        ; A0 += (a1!=0)
-
-    lsls        r2, r7, #16
-    smuad       r12, r7, lr         ; C0 = a1<<2 + d1<<2
-    addne       r12, r12, #1        ; C0 += (a1!=0)
-
-    add         r0, r11, r12        ; a1_0 = A0 + C0
-    sub         r11, r11, r12       ; b1_0 = A0 - C0
-
-    lsls        r2, r5, #16
-    smuad       r12, r5, lr         ; B0 = a1<<2 + d1<<2
-    addne       r12, r12, #1        ; B0 += (a1!=0)
-
-    lsls        r2, r9, #16
-    smuad       r2, r9, lr          ; D0 = a1<<2 + d1<<2
-    addne       r2, r2, #1          ; D0 += (a1!=0)
-
-    add         lr, r12, r2         ; d1_0 = B0 + D0
-    sub         r12, r12, r2        ; c1_0 = B0 - D0
-
-    ; op[0,4,8,12]
-    adds        r2, r0, lr          ; a2 = a1_0 + d1_0
-    addmi       r2, r2, #1          ; += a2 < 0
-    add         r2, r2, #3          ; += 3
-    subs        r0, r0, lr          ; d2 = a1_0 - d1_0
-    mov         r2, r2, asr #3      ; >> 3
-    strh        r2, [r1]            ; op[0]
-
-    addmi       r0, r0, #1          ; += a2 < 0
-    add         r0, r0, #3          ; += 3
-    ldr         lr, c00040004
-    mov         r0, r0, asr #3      ; >> 3
-    strh        r0, [r1, #24]       ; op[12]
-
-    adds        r2, r11, r12        ; b2 = b1_0 + c1_0
-    addmi       r2, r2, #1          ; += a2 < 0
-    add         r2, r2, #3          ; += 3
-    subs        r0, r11, r12        ; c2 = b1_0 - c1_0
-    mov         r2, r2, asr #3      ; >> 3
-    strh        r2, [r1, #8]        ; op[4]
-
-    addmi       r0, r0, #1          ; += a2 < 0
-    add         r0, r0, #3          ; += 3
-    smusd       r3, r3, lr          ; A3 = a1<<2 - d1<<2
-    smusd       r7, r7, lr          ; C3 = a1<<2 - d1<<2
-    mov         r0, r0, asr #3      ; >> 3
-    strh        r0, [r1, #16]       ; op[8]
-
-
-    ; op[3,7,11,15]
-    add         r0, r3, r7          ; a1_3 = A3 + C3
-    sub         r3, r3, r7          ; b1_3 = A3 - C3
-
-    smusd       r5, r5, lr          ; B3 = a1<<2 - d1<<2
-    smusd       r9, r9, lr          ; D3 = a1<<2 - d1<<2
-    add         r7, r5, r9          ; d1_3 = B3 + D3
-    sub         r5, r5, r9          ; c1_3 = B3 - D3
-
-    adds        r2, r0, r7          ; a2 = a1_3 + d1_3
-    addmi       r2, r2, #1          ; += a2 < 0
-    add         r2, r2, #3          ; += 3
-    adds        r9, r3, r5          ; b2 = b1_3 + c1_3
-    mov         r2, r2, asr #3      ; >> 3
-    strh        r2, [r1, #6]        ; op[3]
-
-    addmi       r9, r9, #1          ; += a2 < 0
-    add         r9, r9, #3          ; += 3
-    subs        r2, r3, r5          ; c2 = b1_3 - c1_3
-    mov         r9, r9, asr #3      ; >> 3
-    strh        r9, [r1, #14]       ; op[7]
-
-    addmi       r2, r2, #1          ; += a2 < 0
-    add         r2, r2, #3          ; += 3
-    subs        r9, r0, r7          ; d2 = a1_3 - d1_3
-    mov         r2, r2, asr #3      ; >> 3
-    strh        r2, [r1, #22]       ; op[11]
-
-    addmi       r9, r9, #1          ; += a2 < 0
-    add         r9, r9, #3          ; += 3
-    smuad       r3, r4, lr          ; A1 = b1<<2 + c1<<2
-    smuad       r5, r8, lr          ; C1 = b1<<2 + c1<<2
-    mov         r9, r9, asr #3      ; >> 3
-    strh        r9, [r1, #30]       ; op[15]
-
-    ; op[1,5,9,13]
-    add         r0, r3, r5          ; a1_1 = A1 + C1
-    sub         r3, r3, r5          ; b1_1 = A1 - C1
-
-    smuad       r7, r6, lr          ; B1 = b1<<2 + c1<<2
-    smuad       r9, r10, lr         ; D1 = b1<<2 + c1<<2
-    add         r5, r7, r9          ; d1_1 = B1 + D1
-    sub         r7, r7, r9          ; c1_1 = B1 - D1
-
-    adds        r2, r0, r5          ; a2 = a1_1 + d1_1
-    addmi       r2, r2, #1          ; += a2 < 0
-    add         r2, r2, #3          ; += 3
-    adds        r9, r3, r7          ; b2 = b1_1 + c1_1
-    mov         r2, r2, asr #3      ; >> 3
-    strh        r2, [r1, #2]        ; op[1]
-
-    addmi       r9, r9, #1          ; += a2 < 0
-    add         r9, r9, #3          ; += 3
-    subs        r2, r3, r7          ; c2 = b1_1 - c1_1
-    mov         r9, r9, asr #3      ; >> 3
-    strh        r9, [r1, #10]       ; op[5]
-
-    addmi       r2, r2, #1          ; += a2 < 0
-    add         r2, r2, #3          ; += 3
-    subs        r9, r0, r5          ; d2 = a1_1 - d1_1
-    mov         r2, r2, asr #3      ; >> 3
-    strh        r2, [r1, #18]       ; op[9]
-
-    addmi       r9, r9, #1          ; += a2 < 0
-    add         r9, r9, #3          ; += 3
-    smusd       r4, r4, lr          ; A2 = b1<<2 - c1<<2
-    smusd       r8, r8, lr          ; C2 = b1<<2 - c1<<2
-    mov         r9, r9, asr #3      ; >> 3
-    strh        r9, [r1, #26]       ; op[13]
-
-
-    ; op[2,6,10,14]
-    add         r11, r4, r8         ; a1_2 = A2 + C2
-    sub         r12, r4, r8         ; b1_2 = A2 - C2
-
-    smusd       r6, r6, lr          ; B2 = b1<<2 - c1<<2
-    smusd       r10, r10, lr        ; D2 = b1<<2 - c1<<2
-    add         r4, r6, r10         ; d1_2 = B2 + D2
-    sub         r8, r6, r10         ; c1_2 = B2 - D2
-
-    adds        r2, r11, r4         ; a2 = a1_2 + d1_2
-    addmi       r2, r2, #1          ; += a2 < 0
-    add         r2, r2, #3          ; += 3
-    adds        r9, r12, r8         ; b2 = b1_2 + c1_2
-    mov         r2, r2, asr #3      ; >> 3
-    strh        r2, [r1, #4]        ; op[2]
-
-    addmi       r9, r9, #1          ; += a2 < 0
-    add         r9, r9, #3          ; += 3
-    subs        r2, r12, r8         ; c2 = b1_2 - c1_2
-    mov         r9, r9, asr #3      ; >> 3
-    strh        r9, [r1, #12]       ; op[6]
-
-    addmi       r2, r2, #1          ; += a2 < 0
-    add         r2, r2, #3          ; += 3
-    subs        r9, r11, r4         ; d2 = a1_2 - d1_2
-    mov         r2, r2, asr #3      ; >> 3
-    strh        r2, [r1, #20]       ; op[10]
-
-    addmi       r9, r9, #1          ; += a2 < 0
-    add         r9, r9, #3          ; += 3
-    mov         r9, r9, asr #3      ; >> 3
-    strh        r9, [r1, #28]       ; op[14]
-
-
-    ldmia       sp!, {r4 - r11, pc}
-    ENDP        ; |vp8_short_walsh4x4_armv6|
-
-c00040004
-    DCD         0x00040004
-
-    END
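
A C model of the full 4x4 Walsh-Hadamard transform above, reconstructed
from its instruction comments (the pitch/2 element stride and the
arithmetic right shift are assumptions of this sketch, and names are
illustrative):

    static short rnd3(int v) {            /* the addmi / add #3 / asr #3 */
      return (short)((v + (v < 0) + 3) >> 3);
    }

    static void walsh4x4_model(const short *in, short *out, int pitch) {
      int col[4][4];                      /* first-pass results, [column][row] */
      int i, j;
      const int stride = pitch / 2;       /* pitch is in bytes, data is 16-bit */

      for (i = 0; i < 4; i++) {           /* horizontal butterflies per row */
        const short *ip = in + i * stride;
        int a1 = ip[0] + ip[2], d1 = ip[1] + ip[3];
        int b1 = ip[0] - ip[2], c1 = ip[1] - ip[3];
        col[0][i] = 4 * (a1 + d1) + (a1 != 0);  /* A_i, with the +1 bias */
        col[1][i] = 4 * (b1 + c1);              /* feeds op[1,5,9,13]    */
        col[2][i] = 4 * (b1 - c1);              /* feeds op[2,6,10,14]   */
        col[3][i] = 4 * (a1 - d1);              /* feeds op[3,7,11,15]   */
      }
      for (j = 0; j < 4; j++) {           /* vertical butterflies per column */
        int a1 = col[j][0] + col[j][2], b1 = col[j][0] - col[j][2];
        int d1 = col[j][1] + col[j][3], c1 = col[j][1] - col[j][3];
        out[j]      = rnd3(a1 + d1);
        out[j + 4]  = rnd3(b1 + c1);
        out[j + 8]  = rnd3(b1 - c1);
        out[j + 12] = rnd3(a1 - d1);
      }
    }
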
--- a/vp8/encoder/arm/boolhuff_arm.c
+++ /dev/null
@@ -1,33 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/encoder/boolhuff.h"
-#include "vp8/common/blockd.h"
-
-const unsigned int vp9_prob_cost[256] = {
-  2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129, 1099, 1072, 1046,
-  1023, 1000,  979,  959,  940,  922,  905,  889,  873,  858,  843,  829,  816,  803,  790,  778,
-  767,  755,  744,  733,  723,  713,  703,  693,  684,  675,  666,  657,  649,  641,  633,  625,
-  617,  609,  602,  594,  587,  580,  573,  567,  560,  553,  547,  541,  534,  528,  522,  516,
-  511,  505,  499,  494,  488,  483,  477,  472,  467,  462,  457,  452,  447,  442,  437,  433,
-  428,  424,  419,  415,  410,  406,  401,  397,  393,  389,  385,  381,  377,  373,  369,  365,
-  361,  357,  353,  349,  346,  342,  338,  335,  331,  328,  324,  321,  317,  314,  311,  307,
-  304,  301,  297,  294,  291,  288,  285,  281,  278,  275,  272,  269,  266,  263,  260,  257,
-  255,  252,  249,  246,  243,  240,  238,  235,  232,  229,  227,  224,  221,  219,  216,  214,
-  211,  208,  206,  203,  201,  198,  196,  194,  191,  189,  186,  184,  181,  179,  177,  174,
-  172,  170,  168,  165,  163,  161,  159,  156,  154,  152,  150,  148,  145,  143,  141,  139,
-  137,  135,  133,  131,  129,  127,  125,  123,  121,  119,  117,  115,  113,  111,  109,  107,
-  105,  103,  101,   99,   97,   95,   93,   92,   90,   88,   86,   84,   82,   81,   79,   77,
-  75,   73,   72,   70,   68,   66,   65,   63,   61,   60,   58,   56,   55,   53,   51,   50,
-  48,   46,   45,   43,   41,   40,   38,   37,   35,   33,   32,   30,   29,   27,   25,   24,
-  22,   21,   19,   18,   16,   15,   13,   12,   10,    9,    7,    6,    4,    3,    1,   1
-};
-
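
Each entry p in this table is, to the nearest unit, the cost in 1/256ths
of a bit of coding a zero against probability p/256 (roughly
-256*log2(p/256), clamped to 2047). A usage sketch; the 255 - prob
complement rule for a one bit is an assumption here, not something this
diff shows:

    /* Approximate cost of coding bit b against 8-bit probability prob. */
    static unsigned int cost_bit(const unsigned int tbl[256],
                                 int prob, int b) {
      return tbl[b ? 255 - prob : prob];  /* assumed complement rule */
    }
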
--- a/vp8/encoder/arm/dct_arm.c
+++ /dev/null
@@ -1,21 +1,0 @@
-/*
- *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "./vpx_rtcd.h"
-
-#if HAVE_ARMV6
-
-void vp9_short_fdct8x4_armv6(short *input, short *output, int pitch) {
-  vp9_short_fdct4x4_armv6(input,   output,    pitch);
-  vp9_short_fdct4x4_armv6(input + 4, output + 16, pitch);
-}
-
-#endif /* HAVE_ARMV6 */
--- a/vp8/encoder/arm/dct_arm.h
+++ /dev/null
@@ -1,65 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef DCT_ARM_H
-#define DCT_ARM_H
-
-#if HAVE_ARMV6
-extern prototype_fdct(vp9_short_walsh4x4_armv6);
-extern prototype_fdct(vp9_short_fdct4x4_armv6);
-extern prototype_fdct(vp9_short_fdct8x4_armv6);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_fdct_walsh_short4x4
-#define vp8_fdct_walsh_short4x4 vp9_short_walsh4x4_armv6
-
-#undef  vp8_fdct_short4x4
-#define vp8_fdct_short4x4 vp9_short_fdct4x4_armv6
-
-#undef  vp8_fdct_short8x4
-#define vp8_fdct_short8x4 vp9_short_fdct8x4_armv6
-
-#undef  vp8_fdct_fast4x4
-#define vp8_fdct_fast4x4 vp9_short_fdct4x4_armv6
-
-#undef  vp8_fdct_fast8x4
-#define vp8_fdct_fast8x4 vp9_short_fdct8x4_armv6
-#endif
-
-#endif /* HAVE_ARMV6 */
-
-#if HAVE_ARMV7
-extern prototype_fdct(vp9_short_fdct4x4_neon);
-extern prototype_fdct(vp9_short_fdct8x4_neon);
-extern prototype_fdct(vp8_fast_fdct4x4_neon);
-extern prototype_fdct(vp8_fast_fdct8x4_neon);
-extern prototype_fdct(vp9_short_walsh4x4_neon);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_fdct_short4x4
-#define vp8_fdct_short4x4 vp9_short_fdct4x4_neon
-
-#undef  vp8_fdct_short8x4
-#define vp8_fdct_short8x4 vp9_short_fdct8x4_neon
-
-#undef  vp8_fdct_fast4x4
-#define vp8_fdct_fast4x4 vp9_short_fdct4x4_neon
-
-#undef  vp8_fdct_fast8x4
-#define vp8_fdct_fast8x4 vp9_short_fdct8x4_neon
-
-#undef  vp8_fdct_walsh_short4x4
-#define vp8_fdct_walsh_short4x4 vp9_short_walsh4x4_neon
-#endif
-
-#endif
-
-#endif
--- a/vp8/encoder/arm/encodemb_arm.h
+++ /dev/null
@@ -1,64 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef ENCODEMB_ARM_H
-#define ENCODEMB_ARM_H
-
-#if HAVE_ARMV6
-extern prototype_subb(vp9_subtract_b_armv6);
-extern prototype_submby(vp9_subtract_mby_armv6);
-extern prototype_submbuv(vp9_subtract_mbuv_armv6);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_encodemb_subb
-#define vp8_encodemb_subb vp9_subtract_b_armv6
-
-#undef  vp8_encodemb_submby
-#define vp8_encodemb_submby vp9_subtract_mby_armv6
-
-#undef  vp8_encodemb_submbuv
-#define vp8_encodemb_submbuv vp9_subtract_mbuv_armv6
-#endif
-
-#endif /* HAVE_ARMV6 */
-
-#if HAVE_ARMV7
-// extern prototype_berr(vp9_block_error_c);
-// extern prototype_mberr(vp9_mbblock_error_c);
-// extern prototype_mbuverr(vp9_mbuverror_c);
-
-extern prototype_subb(vp9_subtract_b_neon);
-extern prototype_submby(vp9_subtract_mby_neon);
-extern prototype_submbuv(vp9_subtract_mbuv_neon);
-
-// #undef  vp8_encodemb_berr
-// #define vp8_encodemb_berr vp9_block_error_c
-
-// #undef  vp8_encodemb_mberr
-// #define vp8_encodemb_mberr vp9_mbblock_error_c
-
-// #undef  vp8_encodemb_mbuverr
-// #define vp8_encodemb_mbuverr vp9_mbuverror_c
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_encodemb_subb
-#define vp8_encodemb_subb vp9_subtract_b_neon
-
-#undef  vp8_encodemb_submby
-#define vp8_encodemb_submby vp9_subtract_mby_neon
-
-#undef  vp8_encodemb_submbuv
-#define vp8_encodemb_submbuv vp9_subtract_mbuv_neon
-#endif
-
-#endif
-
-#endif
--- a/vp8/encoder/arm/neon/fastquantizeb_neon.asm
+++ /dev/null
@@ -1,261 +1,0 @@
-;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_fast_quantize_b_neon|
-    EXPORT  |vp8_fast_quantize_b_pair_neon|
-
-    INCLUDE asm_enc_offsets.asm
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=4
-
-;vp8_fast_quantize_b_pair_neon(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2);
-|vp8_fast_quantize_b_pair_neon| PROC
-
-    stmfd           sp!, {r4-r9}
-    vstmdb          sp!, {q4-q7}
-
-    ldr             r4, [r0, #vp8_block_coeff]
-    ldr             r5, [r0, #vp8_block_quant_fast]
-    ldr             r6, [r0, #vp8_block_round]
-
-    vld1.16         {q0, q1}, [r4@128]  ; load z
-
-    ldr             r7, [r2, #vp8_blockd_qcoeff]
-
-    vabs.s16        q4, q0              ; calculate x = abs(z)
-    vabs.s16        q5, q1
-
-    ; right shift 15 to get the sign: all 0s if positive, all 1s if negative
-    vshr.s16        q2, q0, #15         ; sz
-    vshr.s16        q3, q1, #15
-
-    vld1.s16        {q6, q7}, [r6@128]  ; load round_ptr [0-15]
-    vld1.s16        {q8, q9}, [r5@128]  ; load quant_ptr [0-15]
-
-    ldr             r4, [r1, #vp8_block_coeff]
-
-    vadd.s16        q4, q6              ; x + Round
-    vadd.s16        q5, q7
-
-    vld1.16         {q0, q1}, [r4@128]  ; load z2
-
-    vqdmulh.s16     q4, q8              ; y = ((Round+abs(z)) * Quant) >> 16
-    vqdmulh.s16     q5, q9
-
-    vabs.s16        q10, q0             ; calculate x2 = abs(z_2)
-    vabs.s16        q11, q1
-    vshr.s16        q12, q0, #15        ; sz2
-    vshr.s16        q13, q1, #15
-
-    ; restore the original sign of the data
-    veor.s16        q4, q2              ; y^sz
-    veor.s16        q5, q3
-
-    vadd.s16        q10, q6             ; x2 + Round
-    vadd.s16        q11, q7
-
-    ldr             r8, [r2, #vp8_blockd_dequant]
-
-    vqdmulh.s16     q10, q8             ; y2 = ((Round+abs(z)) * Quant) >> 16
-    vqdmulh.s16     q11, q9
-
-    vshr.s16        q4, #1              ; right shift 1 after vqdmulh
-    vshr.s16        q5, #1
-
-    vld1.s16        {q6, q7}, [r8@128]  ; load dequant_ptr[i]
-
-    vsub.s16        q4, q2              ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
-    vsub.s16        q5, q3
-
-    vshr.s16        q10, #1             ; right shift 1 after vqdmulh
-    vshr.s16        q11, #1
-
-    ldr             r9, [r2, #vp8_blockd_dqcoeff]
-
-    veor.s16        q10, q12            ; y2^sz2
-    veor.s16        q11, q13
-
-    vst1.s16        {q4, q5}, [r7]      ; store: qcoeff = x1
-
-
-    vsub.s16        q10, q12            ; x2=(y^sz)-sz = (y^sz)-(-1) (2's complement)
-    vsub.s16        q11, q13
-
-    ldr             r6, [r3, #vp8_blockd_qcoeff]
-
-    vmul.s16        q2, q6, q4          ; x * Dequant
-    vmul.s16        q3, q7, q5
-
-    ldr             r0, _inv_zig_zag_   ; load ptr of inverse zigzag table
-
-    vceq.s16        q8, q8              ; set q8 to all 1
-
-    vst1.s16        {q10, q11}, [r6]    ; store: qcoeff = x2
-
-    vmul.s16        q12, q6, q10        ; x2 * Dequant
-    vmul.s16        q13, q7, q11
-
-    vld1.16         {q6, q7}, [r0@128]  ; load inverse scan order
-
-    vtst.16         q14, q4, q8         ; now find eob
-    vtst.16         q15, q5, q8         ; non-zero element is set to all 1
-
-    vst1.s16        {q2, q3}, [r9]      ; store dqcoeff = x * Dequant
-
-    ldr             r7, [r3, #vp8_blockd_dqcoeff]
-
-    vand            q0, q6, q14         ; get all valid numbers from scan array
-    vand            q1, q7, q15
-
-    vst1.s16        {q12, q13}, [r7]    ; store dqcoeff = x * Dequant
-
-    vtst.16         q2, q10, q8         ; now find eob
-    vtst.16         q3, q11, q8         ; non-zero element is set to all 1
-
-    vmax.u16        q0, q0, q1          ; find maximum value in q0, q1
-
-    vand            q10, q6, q2         ; get all valid numbers from scan array
-    vand            q11, q7, q3
-    vmax.u16        q10, q10, q11       ; find maximum value in q10, q11
-
-    vmax.u16        d0, d0, d1
-    vmax.u16        d20, d20, d21
-    vmovl.u16       q0, d0
-    vmovl.u16       q10, d20
-
-
-    vmax.u32        d0, d0, d1
-    vmax.u32        d20, d20, d21
-    vpmax.u32       d0, d0, d0
-    vpmax.u32       d20, d20, d20
-
-    add             r4, r2, #vp8_blockd_eob
-    add             r5, r3, #vp8_blockd_eob
-
-    vst1.32         {d0[0]}, [r4@32]
-    vst1.32         {d20[0]}, [r5@32]
-
-    vldmia          sp!, {q4-q7}
-    ldmfd           sp!, {r4-r9}
-    bx              lr
-
-    ENDP
-
-;void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
-|vp8_fast_quantize_b_neon| PROC
-
-    stmfd           sp!, {r4-r7}
-
-    ldr             r3, [r0, #vp8_block_coeff]
-    ldr             r4, [r0, #vp8_block_quant_fast]
-    ldr             r5, [r0, #vp8_block_round]
-
-    vld1.16         {q0, q1}, [r3@128]  ; load z
-    vorr.s16        q14, q0, q1         ; check if all zero (step 1)
-    ldr             r6, [r1, #vp8_blockd_qcoeff]
-    ldr             r7, [r1, #vp8_blockd_dqcoeff]
-    vorr.s16        d28, d28, d29       ; check if all zero (step 2)
-
-    vabs.s16        q12, q0             ; calculate x = abs(z)
-    vabs.s16        q13, q1
-
-    ;right shift 15 to get the sign: all 0s if positive, all 1s if negative
-    vshr.s16        q2, q0, #15         ; sz
-    vmov            r2, r3, d28         ; check if all zero (step 3)
-    vshr.s16        q3, q1, #15
-
-    vld1.s16        {q14, q15}, [r5@128]; load round_ptr [0-15]
-    vld1.s16        {q8, q9}, [r4@128]  ; load quant_ptr [0-15]
-
-    vadd.s16        q12, q14            ; x + Round
-    vadd.s16        q13, q15
-
-    ldr             r0, _inv_zig_zag_   ; load ptr of inverse zigzag table
-
-    vqdmulh.s16     q12, q8             ; y = ((Round+abs(z)) * Quant) >> 16
-    vqdmulh.s16     q13, q9
-
-    vld1.16         {q10, q11}, [r0@128]; load inverse scan order
-
-    vceq.s16        q8, q8              ; set q8 to all 1
-
-    ldr             r4, [r1, #vp8_blockd_dequant]
-
-    vshr.s16        q12, #1             ; right shift 1 after vqdmulh
-    vshr.s16        q13, #1
-
-    orr             r2, r2, r3          ; check if all zero (step 4)
-    cmp             r2, #0              ; check if all zero (step 5)
-    beq             zero_output         ; check if all zero (step 6)
-
-    ;modify data to have its original sign
-    veor.s16        q12, q2             ; y^sz
-    veor.s16        q13, q3
-
-    vsub.s16        q12, q2             ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
-    vsub.s16        q13, q3
-
-    vld1.s16        {q2, q3}, [r4@128]  ; load dequant_ptr[i]
-
-    vtst.16         q14, q12, q8        ; now find eob
-    vtst.16         q15, q13, q8        ; non-zero element is set to all 1
-
-    vst1.s16        {q12, q13}, [r6@128]; store: qcoeff = x1
-
-    vand            q10, q10, q14       ; get all valid numbers from scan array
-    vand            q11, q11, q15
-
-
-    vmax.u16        q0, q10, q11        ; find maximum value in q0, q1
-    vmax.u16        d0, d0, d1
-    vmovl.u16       q0, d0
-
-    vmul.s16        q2, q12             ; x * Dequant
-    vmul.s16        q3, q13
-
-    vmax.u32        d0, d0, d1
-    vpmax.u32       d0, d0, d0
-
-    vst1.s16        {q2, q3}, [r7@128]  ; store dqcoeff = x * Dequant
-
-    add             r4, r1, #vp8_blockd_eob
-    vst1.32         {d0[0]}, [r4@32]
-
-    ldmfd           sp!, {r4-r7}
-    bx              lr
-
-zero_output
-    str             r2, [r1, #vp8_blockd_eob]
-    vst1.s16        {q0, q1}, [r6@128]  ; qcoeff = 0
-    vst1.s16        {q0, q1}, [r7@128]  ; dqcoeff = 0
-
-    ldmfd           sp!, {r4-r7}
-    bx              lr
-
-    ENDP
-
-; default inverse zigzag table is defined in vp8/common/entropy.c
-_inv_zig_zag_
-    DCD inv_zig_zag
-
-    ALIGN 16    ; enable use of @128 (128-bit aligned) loads
-inv_zig_zag
-    DCW 0x0001, 0x0002, 0x0006, 0x0007
-    DCW 0x0003, 0x0005, 0x0008, 0x000d
-    DCW 0x0004, 0x0009, 0x000c, 0x000e
-    DCW 0x000a, 0x000b, 0x000f, 0x0010
-
-    END
-
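For reference, the per-coefficient arithmetic both deleted entry points implement reduces to the scalar C below. This is a sketch, not code from the tree; the _sketch suffix marks illustrative names. Note that vqdmulh doubles the product and the following vshr #1 undoes it, so the net effect is a plain high multiply, ((x + round) * quant) >> 16. The table contents match the inv_zig_zag data above, and eob ends up as the scan position of the last nonzero coefficient.

    #include <stdlib.h>

    static const int inv_zig_zag_sketch[16] = {
      1, 2, 6,  7,  3,  5,  8, 13,
      4, 9, 12, 14, 10, 11, 15, 16
    };

    void fast_quantize_b_sketch(const short *z, const short *round,
                                const short *quant, const short *dequant,
                                short *qcoeff, short *dqcoeff, int *eob) {
      int i, last = 0;
      for (i = 0; i < 16; i++) {
        int sz = z[i] < 0 ? -1 : 0;             /* the asm's vshr.s16 #15 */
        int x  = abs(z[i]);
        int y  = ((x + round[i]) * quant[i]) >> 16;
        int x1 = (y ^ sz) - sz;                 /* give y the sign of z */
        qcoeff[i]  = (short)x1;
        dqcoeff[i] = (short)(x1 * dequant[i]);
        if (x1 != 0 && inv_zig_zag_sketch[i] > last)
          last = inv_zig_zag_sketch[i];         /* track end-of-block */
      }
      *eob = last;
    }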
--- a/vp8/encoder/arm/neon/picklpf_arm.c
+++ /dev/null
@@ -1,49 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/common/onyxc_int.h"
-#include "vp8/encoder/onyx_int.h"
-#include "vp8/encoder/quantize.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vpx_scale/yv12extend.h"
-#include "vpx_scale/vpxscale.h"
-#include "vp8/common/alloccommon.h"
-
-extern void vp8_memcpy_neon(unsigned char *dst_ptr, unsigned char *src_ptr, int sz);
-
-
-void
-vpxyv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction) {
-  unsigned char *src_y, *dst_y;
-  int yheight;
-  int ystride;
-  int border;
-  int yoffset;
-  int linestocopy;
-
-  border   = src_ybc->border;
-  yheight  = src_ybc->y_height;
-  ystride  = src_ybc->y_stride;
-
-  linestocopy = (yheight >> (Fraction + 4));
-
-  if (linestocopy < 1)
-    linestocopy = 1;
-
-  linestocopy <<= 4;
-
-  yoffset  = ystride * ((yheight >> 5) * 16 - 8);
-  src_y = src_ybc->y_buffer + yoffset;
-  dst_y = dst_ybc->y_buffer + yoffset;
-
-  // vpx_memcpy (dst_y, src_y, ystride * (linestocopy +16));
-  vp8_memcpy_neon((unsigned char *)dst_y, (unsigned char *)src_y, (int)(ystride * (linestocopy + 16)));
-}
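To make the band arithmetic concrete (illustrative numbers, not from the source): with yheight = 720 and Fraction = 3, linestocopy = (720 >> 7) << 4 = 80 and yoffset = ystride * ((720 >> 5) * 16 - 8) = ystride * 344, so the copy starts 344 rows into the plane, just above mid-frame, and vp8_memcpy_neon moves ystride * (80 + 16) bytes.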
--- a/vp8/encoder/arm/neon/sad16_neon.asm
+++ /dev/null
@@ -1,207 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_sad16x16_neon|
-    EXPORT  |vp8_sad16x8_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char *src_ptr
-; r1    int  src_stride
-; r2    unsigned char *ref_ptr
-; r3    int  ref_stride
-|vp8_sad16x16_neon| PROC
-;;
-    vld1.8          {q0}, [r0], r1
-    vld1.8          {q4}, [r2], r3
-
-    vld1.8          {q1}, [r0], r1
-    vld1.8          {q5}, [r2], r3
-
-    vabdl.u8        q12, d0, d8
-    vabdl.u8        q13, d1, d9
-
-    vld1.8          {q2}, [r0], r1
-    vld1.8          {q6}, [r2], r3
-
-    vabal.u8        q12, d2, d10
-    vabal.u8        q13, d3, d11
-
-    vld1.8          {q3}, [r0], r1
-    vld1.8          {q7}, [r2], r3
-
-    vabal.u8        q12, d4, d12
-    vabal.u8        q13, d5, d13
-
-;;
-    vld1.8          {q0}, [r0], r1
-    vld1.8          {q4}, [r2], r3
-
-    vabal.u8        q12, d6, d14
-    vabal.u8        q13, d7, d15
-
-    vld1.8          {q1}, [r0], r1
-    vld1.8          {q5}, [r2], r3
-
-    vabal.u8        q12, d0, d8
-    vabal.u8        q13, d1, d9
-
-    vld1.8          {q2}, [r0], r1
-    vld1.8          {q6}, [r2], r3
-
-    vabal.u8        q12, d2, d10
-    vabal.u8        q13, d3, d11
-
-    vld1.8          {q3}, [r0], r1
-    vld1.8          {q7}, [r2], r3
-
-    vabal.u8        q12, d4, d12
-    vabal.u8        q13, d5, d13
-
-;;
-    vld1.8          {q0}, [r0], r1
-    vld1.8          {q4}, [r2], r3
-
-    vabal.u8        q12, d6, d14
-    vabal.u8        q13, d7, d15
-
-    vld1.8          {q1}, [r0], r1
-    vld1.8          {q5}, [r2], r3
-
-    vabal.u8        q12, d0, d8
-    vabal.u8        q13, d1, d9
-
-    vld1.8          {q2}, [r0], r1
-    vld1.8          {q6}, [r2], r3
-
-    vabal.u8        q12, d2, d10
-    vabal.u8        q13, d3, d11
-
-    vld1.8          {q3}, [r0], r1
-    vld1.8          {q7}, [r2], r3
-
-    vabal.u8        q12, d4, d12
-    vabal.u8        q13, d5, d13
-
-;;
-    vld1.8          {q0}, [r0], r1
-    vld1.8          {q4}, [r2], r3
-
-    vabal.u8        q12, d6, d14
-    vabal.u8        q13, d7, d15
-
-    vld1.8          {q1}, [r0], r1
-    vld1.8          {q5}, [r2], r3
-
-    vabal.u8        q12, d0, d8
-    vabal.u8        q13, d1, d9
-
-    vld1.8          {q2}, [r0], r1
-    vld1.8          {q6}, [r2], r3
-
-    vabal.u8        q12, d2, d10
-    vabal.u8        q13, d3, d11
-
-    vld1.8          {q3}, [r0]
-    vld1.8          {q7}, [r2]
-
-    vabal.u8        q12, d4, d12
-    vabal.u8        q13, d5, d13
-
-    vabal.u8        q12, d6, d14
-    vabal.u8        q13, d7, d15
-
-    vadd.u16        q0, q12, q13
-
-    vpaddl.u16      q1, q0
-    vpaddl.u32      q0, q1
-
-    vadd.u32        d0, d0, d1
-
-    vmov.32         r0, d0[0]
-
-    bx              lr
-
-    ENDP
-
-;==============================
-;unsigned int vp8_sad16x8_c(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-|vp8_sad16x8_neon| PROC
-    vld1.8          {q0}, [r0], r1
-    vld1.8          {q4}, [r2], r3
-
-    vld1.8          {q1}, [r0], r1
-    vld1.8          {q5}, [r2], r3
-
-    vabdl.u8        q12, d0, d8
-    vabdl.u8        q13, d1, d9
-
-    vld1.8          {q2}, [r0], r1
-    vld1.8          {q6}, [r2], r3
-
-    vabal.u8        q12, d2, d10
-    vabal.u8        q13, d3, d11
-
-    vld1.8          {q3}, [r0], r1
-    vld1.8          {q7}, [r2], r3
-
-    vabal.u8        q12, d4, d12
-    vabal.u8        q13, d5, d13
-
-    vld1.8          {q0}, [r0], r1
-    vld1.8          {q4}, [r2], r3
-
-    vabal.u8        q12, d6, d14
-    vabal.u8        q13, d7, d15
-
-    vld1.8          {q1}, [r0], r1
-    vld1.8          {q5}, [r2], r3
-
-    vabal.u8        q12, d0, d8
-    vabal.u8        q13, d1, d9
-
-    vld1.8          {q2}, [r0], r1
-    vld1.8          {q6}, [r2], r3
-
-    vabal.u8        q12, d2, d10
-    vabal.u8        q13, d3, d11
-
-    vld1.8          {q3}, [r0], r1
-    vld1.8          {q7}, [r2], r3
-
-    vabal.u8        q12, d4, d12
-    vabal.u8        q13, d5, d13
-
-    vabal.u8        q12, d6, d14
-    vabal.u8        q13, d7, d15
-
-    vadd.u16        q0, q12, q13
-
-    vpaddl.u16      q1, q0
-    vpaddl.u32      q0, q1
-
-    vadd.u32        d0, d0, d1
-
-    vmov.32         r0, d0[0]
-
-    bx              lr
-
-    ENDP
-
-    END
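Both exports compute a plain sum of absolute differences over 16-byte rows; only the row count differs. A scalar sketch of what the vabdl/vabal chains accumulate (illustrative name; call with height 16 or 8):

    #include <stdlib.h>

    unsigned int sad16xh_sketch(const unsigned char *src, int src_stride,
                                const unsigned char *ref, int ref_stride,
                                int height) {
      unsigned int sad = 0;
      int r, c;
      for (r = 0; r < height; r++) {
        for (c = 0; c < 16; c++)
          sad += abs(src[c] - ref[c]);   /* what vabdl/vabal accumulate */
        src += src_stride;
        ref += ref_stride;
      }
      return sad;
    }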
--- a/vp8/encoder/arm/neon/sad8_neon.asm
+++ /dev/null
@@ -1,209 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_sad8x8_neon|
-    EXPORT  |vp8_sad8x16_neon|
-    EXPORT  |vp8_sad4x4_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-; unsigned int vp8_sad8x8_c(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-
-|vp8_sad8x8_neon| PROC
-    vld1.8          {d0}, [r0], r1
-    vld1.8          {d8}, [r2], r3
-
-    vld1.8          {d2}, [r0], r1
-    vld1.8          {d10}, [r2], r3
-
-    vabdl.u8        q12, d0, d8
-
-    vld1.8          {d4}, [r0], r1
-    vld1.8          {d12}, [r2], r3
-
-    vabal.u8        q12, d2, d10
-
-    vld1.8          {d6}, [r0], r1
-    vld1.8          {d14}, [r2], r3
-
-    vabal.u8        q12, d4, d12
-
-    vld1.8          {d0}, [r0], r1
-    vld1.8          {d8}, [r2], r3
-
-    vabal.u8        q12, d6, d14
-
-    vld1.8          {d2}, [r0], r1
-    vld1.8          {d10}, [r2], r3
-
-    vabal.u8        q12, d0, d8
-
-    vld1.8          {d4}, [r0], r1
-    vld1.8          {d12}, [r2], r3
-
-    vabal.u8        q12, d2, d10
-
-    vld1.8          {d6}, [r0], r1
-    vld1.8          {d14}, [r2], r3
-
-    vabal.u8        q12, d4, d12
-    vabal.u8        q12, d6, d14
-
-    vpaddl.u16      q1, q12
-    vpaddl.u32      q0, q1
-    vadd.u32        d0, d0, d1
-
-    vmov.32         r0, d0[0]
-
-    bx              lr
-
-    ENDP
-
-;============================
-;unsigned int vp8_sad8x16_c(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-
-|vp8_sad8x16_neon| PROC
-    vld1.8          {d0}, [r0], r1
-    vld1.8          {d8}, [r2], r3
-
-    vld1.8          {d2}, [r0], r1
-    vld1.8          {d10}, [r2], r3
-
-    vabdl.u8        q12, d0, d8
-
-    vld1.8          {d4}, [r0], r1
-    vld1.8          {d12}, [r2], r3
-
-    vabal.u8        q12, d2, d10
-
-    vld1.8          {d6}, [r0], r1
-    vld1.8          {d14}, [r2], r3
-
-    vabal.u8        q12, d4, d12
-
-    vld1.8          {d0}, [r0], r1
-    vld1.8          {d8}, [r2], r3
-
-    vabal.u8        q12, d6, d14
-
-    vld1.8          {d2}, [r0], r1
-    vld1.8          {d10}, [r2], r3
-
-    vabal.u8        q12, d0, d8
-
-    vld1.8          {d4}, [r0], r1
-    vld1.8          {d12}, [r2], r3
-
-    vabal.u8        q12, d2, d10
-
-    vld1.8          {d6}, [r0], r1
-    vld1.8          {d14}, [r2], r3
-
-    vabal.u8        q12, d4, d12
-
-    vld1.8          {d0}, [r0], r1
-    vld1.8          {d8}, [r2], r3
-
-    vabal.u8        q12, d6, d14
-
-    vld1.8          {d2}, [r0], r1
-    vld1.8          {d10}, [r2], r3
-
-    vabal.u8        q12, d0, d8
-
-    vld1.8          {d4}, [r0], r1
-    vld1.8          {d12}, [r2], r3
-
-    vabal.u8        q12, d2, d10
-
-    vld1.8          {d6}, [r0], r1
-    vld1.8          {d14}, [r2], r3
-
-    vabal.u8        q12, d4, d12
-
-    vld1.8          {d0}, [r0], r1
-    vld1.8          {d8}, [r2], r3
-
-    vabal.u8        q12, d6, d14
-
-    vld1.8          {d2}, [r0], r1
-    vld1.8          {d10}, [r2], r3
-
-    vabal.u8        q12, d0, d8
-
-    vld1.8          {d4}, [r0], r1
-    vld1.8          {d12}, [r2], r3
-
-    vabal.u8        q12, d2, d10
-
-    vld1.8          {d6}, [r0], r1
-    vld1.8          {d14}, [r2], r3
-
-    vabal.u8        q12, d4, d12
-    vabal.u8        q12, d6, d14
-
-    vpaddl.u16      q1, q12
-    vpaddl.u32      q0, q1
-    vadd.u32        d0, d0, d1
-
-    vmov.32         r0, d0[0]
-
-    bx              lr
-
-    ENDP
-
-;===========================
-;unsigned int vp8_sad4x4_c(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-
-|vp8_sad4x4_neon| PROC
-    vld1.8          {d0}, [r0], r1
-    vld1.8          {d8}, [r2], r3
-
-    vld1.8          {d2}, [r0], r1
-    vld1.8          {d10}, [r2], r3
-
-    vabdl.u8        q12, d0, d8
-
-    vld1.8          {d4}, [r0], r1
-    vld1.8          {d12}, [r2], r3
-
-    vabal.u8        q12, d2, d10
-
-    vld1.8          {d6}, [r0], r1
-    vld1.8          {d14}, [r2], r3
-
-    vabal.u8        q12, d4, d12
-    vabal.u8        q12, d6, d14
-
-    vpaddl.u16      d1, d24
-    vpaddl.u32      d0, d1
-    vmov.32         r0, d0[0]
-
-    bx              lr
-
-    ENDP
-
-    END
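The 8-wide and 4-wide variants follow the same pattern with narrower rows; the 4x4 kernel loads a full 8-byte d register per row but only folds the low four differences in the final vpaddl. A generic scalar sketch (illustrative name; width/height pairs (8,8), (8,16) and (4,4) cover the three exports):

    #include <stdlib.h>

    unsigned int sad_wxh_sketch(const unsigned char *src, int src_stride,
                                const unsigned char *ref, int ref_stride,
                                int width, int height) {
      unsigned int sad = 0;
      int r, c;
      for (r = 0; r < height; r++) {
        for (c = 0; c < width; c++)
          sad += abs(src[c] - ref[c]);
        src += src_stride;
        ref += ref_stride;
      }
      return sad;
    }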
--- a/vp8/encoder/arm/neon/shortfdct_neon.asm
+++ /dev/null
@@ -1,221 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_short_fdct4x4_neon|
-    EXPORT  |vp8_short_fdct8x4_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=4
-
-
-    ALIGN 16    ; enable use of @128 (128-bit aligned) loads
-coeff
-    DCW      5352,  5352,  5352, 5352
-    DCW      2217,  2217,  2217, 2217
-    DCD     14500, 14500, 14500, 14500
-    DCD      7500,  7500,  7500, 7500
-    DCD     12000, 12000, 12000, 12000
-    DCD     51000, 51000, 51000, 51000
-
-;void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
-|vp8_short_fdct4x4_neon| PROC
-
-    ; Part one
-    vld1.16         {d0}, [r0@64], r2
-    adr             r12, coeff
-    vld1.16         {d1}, [r0@64], r2
-    vld1.16         {q8}, [r12@128]!        ; d16=5352,  d17=2217
-    vld1.16         {d2}, [r0@64], r2
-    vld1.32         {q9, q10}, [r12@128]!   ;  q9=14500, q10=7500
-    vld1.16         {d3}, [r0@64], r2
-
-    ; transpose d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
-    vtrn.32         d0, d2
-    vtrn.32         d1, d3
-    vld1.32         {q11,q12}, [r12@128]    ; q11=12000, q12=51000
-    vtrn.16         d0, d1
-    vtrn.16         d2, d3
-
-    vadd.s16        d4, d0, d3      ; a1 = ip[0] + ip[3]
-    vadd.s16        d5, d1, d2      ; b1 = ip[1] + ip[2]
-    vsub.s16        d6, d1, d2      ; c1 = ip[1] - ip[2]
-    vsub.s16        d7, d0, d3      ; d1 = ip[0] - ip[3]
-
-    vshl.s16        q2, q2, #3      ; (a1, b1) << 3
-    vshl.s16        q3, q3, #3      ; (c1, d1) << 3
-
-    vadd.s16        d0, d4, d5      ; op[0] = a1 + b1
-    vsub.s16        d2, d4, d5      ; op[2] = a1 - b1
-
-    vmlal.s16       q9, d7, d16     ; d1*5352 + 14500
-    vmlal.s16       q10, d7, d17    ; d1*2217 + 7500
-    vmlal.s16       q9, d6, d17     ; c1*2217 + d1*5352 + 14500
-    vmlsl.s16       q10, d6, d16    ; d1*2217 - c1*5352 + 7500
-
-    vshrn.s32       d1, q9, #12     ; op[1] = (c1*2217 + d1*5352 + 14500)>>12
-    vshrn.s32       d3, q10, #12    ; op[3] = (d1*2217 - c1*5352 +  7500)>>12
-
-
-    ; Part two
-
-    ; transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
-    vtrn.32         d0, d2
-    vtrn.32         d1, d3
-    vtrn.16         d0, d1
-    vtrn.16         d2, d3
-
-    vmov.s16        d26, #7
-
-    vadd.s16        d4, d0, d3      ; a1 = ip[0] + ip[12]
-    vadd.s16        d5, d1, d2      ; b1 = ip[4] + ip[8]
-    vsub.s16        d6, d1, d2      ; c1 = ip[4] - ip[8]
-    vadd.s16        d4, d4, d26     ; a1 + 7
-    vsub.s16        d7, d0, d3      ; d1 = ip[0] - ip[12]
-
-    vadd.s16        d0, d4, d5      ; op[0] = a1 + b1 + 7
-    vsub.s16        d2, d4, d5      ; op[8] = a1 - b1 + 7
-
-    vmlal.s16       q11, d7, d16    ; d1*5352 + 12000
-    vmlal.s16       q12, d7, d17    ; d1*2217 + 51000
-
-    vceq.s16        d4, d7, #0
-
-    vshr.s16        d0, d0, #4
-    vshr.s16        d2, d2, #4
-
-    vmlal.s16       q11, d6, d17    ; c1*2217 + d1*5352 + 12000
-    vmlsl.s16       q12, d6, d16    ; d1*2217 - c1*5352 + 51000
-
-    vmvn.s16        d4, d4
-    vshrn.s32       d1, q11, #16    ; op[4] = (c1*2217 + d1*5352 + 12000)>>16
-    vsub.s16        d1, d1, d4      ; op[4] += (d1!=0)
-    vshrn.s32       d3, q12, #16    ; op[12]= (d1*2217 - c1*5352 + 51000)>>16
-
-    vst1.16         {q0, q1}, [r1@128]
-
-    bx              lr
-
-    ENDP
-
-;void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
-|vp8_short_fdct8x4_neon| PROC
-
-    ; Part one
-
-    vld1.16         {q0}, [r0@128], r2
-    adr             r12, coeff
-    vld1.16         {q1}, [r0@128], r2
-    vld1.16         {q8}, [r12@128]!        ; d16=5352,  d17=2217
-    vld1.16         {q2}, [r0@128], r2
-    vld1.32         {q9, q10}, [r12@128]!   ;  q9=14500, q10=7500
-    vld1.16         {q3}, [r0@128], r2
-
-    ; transpose q0=ip[0], q1=ip[1], q2=ip[2], q3=ip[3]
-    vtrn.32         q0, q2          ; [A0|B0]
-    vtrn.32         q1, q3          ; [A1|B1]
-    vtrn.16         q0, q1          ; [A2|B2]
-    vtrn.16         q2, q3          ; [A3|B3]
-
-    vadd.s16        q11, q0, q3     ; a1 = ip[0] + ip[3]
-    vadd.s16        q12, q1, q2     ; b1 = ip[1] + ip[2]
-    vsub.s16        q13, q1, q2     ; c1 = ip[1] - ip[2]
-    vsub.s16        q14, q0, q3     ; d1 = ip[0] - ip[3]
-
-    vshl.s16        q11, q11, #3    ; a1 << 3
-    vshl.s16        q12, q12, #3    ; b1 << 3
-    vshl.s16        q13, q13, #3    ; c1 << 3
-    vshl.s16        q14, q14, #3    ; d1 << 3
-
-    vadd.s16        q0, q11, q12    ; [A0 | B0] = a1 + b1
-    vsub.s16        q2, q11, q12    ; [A2 | B2] = a1 - b1
-
-    vmov.s16        q11, q9         ; 14500
-    vmov.s16        q12, q10        ; 7500
-
-    vmlal.s16       q9, d28, d16    ; A[1] = d1*5352 + 14500
-    vmlal.s16       q10, d28, d17   ; A[3] = d1*2217 + 7500
-    vmlal.s16       q11, d29, d16   ; B[1] = d1*5352 + 14500
-    vmlal.s16       q12, d29, d17   ; B[3] = d1*2217 + 7500
-
-    vmlal.s16       q9, d26, d17    ; A[1] = c1*2217 + d1*5352 + 14500
-    vmlsl.s16       q10, d26, d16   ; A[3] = d1*2217 - c1*5352 + 7500
-    vmlal.s16       q11, d27, d17   ; B[1] = c1*2217 + d1*5352 + 14500
-    vmlsl.s16       q12, d27, d16   ; B[3] = d1*2217 - c1*5352 + 7500
-
-    vshrn.s32       d2, q9, #12     ; A[1] = (c1*2217 + d1*5352 + 14500)>>12
-    vshrn.s32       d6, q10, #12    ; A[3] = (d1*2217 - c1*5352 +  7500)>>12
-    vshrn.s32       d3, q11, #12    ; B[1] = (c1*2217 + d1*5352 + 14500)>>12
-    vshrn.s32       d7, q12, #12    ; B[3] = (d1*2217 - c1*5352 +  7500)>>12
-
-
-    ; Part two
-    vld1.32         {q9,q10}, [r12@128]    ; q9=12000, q10=51000
-
-    ; transpose q0=ip[0], q1=ip[4], q2=ip[8], q3=ip[12]
-    vtrn.32         q0, q2          ; q0=[A0 | B0]
-    vtrn.32         q1, q3          ; q1=[A4 | B4]
-    vtrn.16         q0, q1          ; q2=[A8 | B8]
-    vtrn.16         q2, q3          ; q3=[A12|B12]
-
-    vmov.s16        q15, #7
-
-    vadd.s16        q11, q0, q3     ; a1 = ip[0] + ip[12]
-    vadd.s16        q12, q1, q2     ; b1 = ip[4] + ip[8]
-    vadd.s16        q11, q11, q15   ; a1 + 7
-    vsub.s16        q13, q1, q2     ; c1 = ip[4] - ip[8]
-    vsub.s16        q14, q0, q3     ; d1 = ip[0] - ip[12]
-
-    vadd.s16        q0, q11, q12    ; a1 + b1 + 7
-    vsub.s16        q1, q11, q12    ; a1 - b1 + 7
-
-    vmov.s16        q11, q9         ; 12000
-    vmov.s16        q12, q10        ; 51000
-
-    vshr.s16        d0, d0, #4      ; A[0] = (a1 + b1 + 7)>>4
-    vshr.s16        d4, d1, #4      ; B[0] = (a1 + b1 + 7)>>4
-    vshr.s16        d2, d2, #4      ; A[8] = (a1 + b1 + 7)>>4
-    vshr.s16        d6, d3, #4      ; B[8] = (a1 + b1 + 7)>>4
-
-
-    vmlal.s16       q9, d28, d16    ; A[4]  = d1*5352 + 12000
-    vmlal.s16       q10, d28, d17   ; A[12] = d1*2217 + 51000
-    vmlal.s16       q11, d29, d16   ; B[4]  = d1*5352 + 12000
-    vmlal.s16       q12, d29, d17   ; B[12] = d1*2217 + 51000
-
-    vceq.s16        q14, q14, #0
-
-    vmlal.s16       q9, d26, d17    ; A[4]  = c1*2217 + d1*5352 + 12000
-    vmlsl.s16       q10, d26, d16   ; A[12] = d1*2217 - c1*5352 + 51000
-    vmlal.s16       q11, d27, d17   ; B[4]  = c1*2217 + d1*5352 + 12000
-    vmlsl.s16       q12, d27, d16   ; B[12] = d1*2217 - c1*5352 + 51000
-
-    vmvn.s16        q14, q14
-
-    vshrn.s32       d1, q9, #16     ; A[4] = (c1*2217 + d1*5352 + 12000)>>16
-    vshrn.s32       d3, q10, #16    ; A[12]= (d1*2217 - c1*5352 + 51000)>>16
-    vsub.s16        d1, d1, d28     ; A[4] += (d1!=0)
-
-    vshrn.s32       d5, q11, #16    ; B[4] = (c1*2217 + d1*5352 + 12000)>>16
-    vshrn.s32       d7, q12, #16    ; B[12]= (d1*2217 - c1*5352 + 51000)>>16
-    vsub.s16        d5, d5, d29     ; B[4] += (d1!=0)
-
-    vst1.16         {q0, q1}, [r1@128]! ; block A
-    vst1.16         {q2, q3}, [r1@128]! ; block B
-
-    bx              lr
-
-    ENDP
-
-    END
-
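Writing the comments above out as scalar C gives the following sketch of the 4x4 transform (the 8x4 version runs the same math on two blocks side by side). The name is illustrative; pitch is in bytes, matching the r2 post-increment on 16-bit loads:

    void short_fdct4x4_sketch(const short *input, short *output, int pitch) {
      int i, blk[16], tmp[16];
      const short *ip = input;
      for (i = 0; i < 4; i++) {          /* gather the strided 4x4 input */
        blk[i * 4 + 0] = ip[0];
        blk[i * 4 + 1] = ip[1];
        blk[i * 4 + 2] = ip[2];
        blk[i * 4 + 3] = ip[3];
        ip += pitch / 2;                 /* pitch counts bytes */
      }
      for (i = 0; i < 4; i++) {          /* part one: rows */
        int a1 = (blk[i * 4 + 0] + blk[i * 4 + 3]) << 3;
        int b1 = (blk[i * 4 + 1] + blk[i * 4 + 2]) << 3;
        int c1 = (blk[i * 4 + 1] - blk[i * 4 + 2]) << 3;
        int d1 = (blk[i * 4 + 0] - blk[i * 4 + 3]) << 3;
        tmp[i * 4 + 0] = a1 + b1;
        tmp[i * 4 + 2] = a1 - b1;
        tmp[i * 4 + 1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12;
        tmp[i * 4 + 3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12;
      }
      for (i = 0; i < 4; i++) {          /* part two: columns */
        int a1 = tmp[i + 0] + tmp[i + 12];
        int b1 = tmp[i + 4] + tmp[i + 8];
        int c1 = tmp[i + 4] - tmp[i + 8];
        int d1 = tmp[i + 0] - tmp[i + 12];
        output[i + 0]  = (short)((a1 + b1 + 7) >> 4);
        output[i + 8]  = (short)((a1 - b1 + 7) >> 4);
        output[i + 4]  = (short)(((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0));
        output[i + 12] = (short)((d1 * 2217 - c1 * 5352 + 51000) >> 16);
      }
    }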
--- a/vp8/encoder/arm/neon/subtract_neon.asm
+++ /dev/null
@@ -1,185 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-    EXPORT |vp8_subtract_b_neon|
-    EXPORT |vp8_subtract_mby_neon|
-    EXPORT |vp8_subtract_mbuv_neon|
-
-    INCLUDE asm_enc_offsets.asm
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void vp8_subtract_b_neon(BLOCK *be, BLOCKD *bd, int pitch)
-|vp8_subtract_b_neon| PROC
-
-    stmfd   sp!, {r4-r7}
-
-    ldr     r3, [r0, #vp8_block_base_src]
-    ldr     r4, [r0, #vp8_block_src]
-    ldr     r5, [r0, #vp8_block_src_diff]
-    ldr     r3, [r3]
-    ldr     r6, [r0, #vp8_block_src_stride]
-    add     r3, r3, r4                      ; src = *base_src + src
-    ldr     r7, [r1, #vp8_blockd_predictor]
-
-    vld1.8          {d0}, [r3], r6          ;load src
-    vld1.8          {d1}, [r7], r2          ;load pred
-    vld1.8          {d2}, [r3], r6
-    vld1.8          {d3}, [r7], r2
-    vld1.8          {d4}, [r3], r6
-    vld1.8          {d5}, [r7], r2
-    vld1.8          {d6}, [r3], r6
-    vld1.8          {d7}, [r7], r2
-
-    vsubl.u8        q10, d0, d1
-    vsubl.u8        q11, d2, d3
-    vsubl.u8        q12, d4, d5
-    vsubl.u8        q13, d6, d7
-
-    mov             r2, r2, lsl #1
-
-    vst1.16         {d20}, [r5], r2         ;store diff
-    vst1.16         {d22}, [r5], r2
-    vst1.16         {d24}, [r5], r2
-    vst1.16         {d26}, [r5], r2
-
-    ldmfd   sp!, {r4-r7}
-    bx              lr
-
-    ENDP
-
-
-;==========================================
-;void vp8_subtract_mby_neon(short *diff, unsigned char *src, unsigned char *pred, int stride)
-|vp8_subtract_mby_neon| PROC
-    mov             r12, #4
-
-subtract_mby_loop
-    vld1.8          {q0}, [r1], r3          ;load src
-    vld1.8          {q1}, [r2]!             ;load pred
-    vld1.8          {q2}, [r1], r3
-    vld1.8          {q3}, [r2]!
-    vld1.8          {q4}, [r1], r3
-    vld1.8          {q5}, [r2]!
-    vld1.8          {q6}, [r1], r3
-    vld1.8          {q7}, [r2]!
-
-    vsubl.u8        q8, d0, d2
-    vsubl.u8        q9, d1, d3
-    vsubl.u8        q10, d4, d6
-    vsubl.u8        q11, d5, d7
-    vsubl.u8        q12, d8, d10
-    vsubl.u8        q13, d9, d11
-    vsubl.u8        q14, d12, d14
-    vsubl.u8        q15, d13, d15
-
-    vst1.16         {q8}, [r0]!             ;store diff
-    vst1.16         {q9}, [r0]!
-    vst1.16         {q10}, [r0]!
-    vst1.16         {q11}, [r0]!
-    vst1.16         {q12}, [r0]!
-    vst1.16         {q13}, [r0]!
-    vst1.16         {q14}, [r0]!
-    vst1.16         {q15}, [r0]!
-
-    subs            r12, r12, #1
-    bne             subtract_mby_loop
-
-    bx              lr
-    ENDP
-
-;=================================
-;void vp8_subtract_mbuv_neon(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
-|vp8_subtract_mbuv_neon| PROC
-    ldr             r12, [sp]
-
-;u
-    add             r0, r0, #512        ;   short *udiff = diff + 256;
-    add             r3, r3, #256        ;   unsigned char *upred = pred + 256;
-
-    vld1.8          {d0}, [r1], r12         ;load src
-    vld1.8          {d1}, [r3]!             ;load pred
-    vld1.8          {d2}, [r1], r12
-    vld1.8          {d3}, [r3]!
-    vld1.8          {d4}, [r1], r12
-    vld1.8          {d5}, [r3]!
-    vld1.8          {d6}, [r1], r12
-    vld1.8          {d7}, [r3]!
-    vld1.8          {d8}, [r1], r12
-    vld1.8          {d9}, [r3]!
-    vld1.8          {d10}, [r1], r12
-    vld1.8          {d11}, [r3]!
-    vld1.8          {d12}, [r1], r12
-    vld1.8          {d13}, [r3]!
-    vld1.8          {d14}, [r1], r12
-    vld1.8          {d15}, [r3]!
-
-    vsubl.u8        q8, d0, d1
-    vsubl.u8        q9, d2, d3
-    vsubl.u8        q10, d4, d5
-    vsubl.u8        q11, d6, d7
-    vsubl.u8        q12, d8, d9
-    vsubl.u8        q13, d10, d11
-    vsubl.u8        q14, d12, d13
-    vsubl.u8        q15, d14, d15
-
-    vst1.16         {q8}, [r0]!             ;store diff
-    vst1.16         {q9}, [r0]!
-    vst1.16         {q10}, [r0]!
-    vst1.16         {q11}, [r0]!
-    vst1.16         {q12}, [r0]!
-    vst1.16         {q13}, [r0]!
-    vst1.16         {q14}, [r0]!
-    vst1.16         {q15}, [r0]!
-
-;v
-    vld1.8          {d0}, [r2], r12         ;load src
-    vld1.8          {d1}, [r3]!             ;load pred
-    vld1.8          {d2}, [r2], r12
-    vld1.8          {d3}, [r3]!
-    vld1.8          {d4}, [r2], r12
-    vld1.8          {d5}, [r3]!
-    vld1.8          {d6}, [r2], r12
-    vld1.8          {d7}, [r3]!
-    vld1.8          {d8}, [r2], r12
-    vld1.8          {d9}, [r3]!
-    vld1.8          {d10}, [r2], r12
-    vld1.8          {d11}, [r3]!
-    vld1.8          {d12}, [r2], r12
-    vld1.8          {d13}, [r3]!
-    vld1.8          {d14}, [r2], r12
-    vld1.8          {d15}, [r3]!
-
-    vsubl.u8        q8, d0, d1
-    vsubl.u8        q9, d2, d3
-    vsubl.u8        q10, d4, d5
-    vsubl.u8        q11, d6, d7
-    vsubl.u8        q12, d8, d9
-    vsubl.u8        q13, d10, d11
-    vsubl.u8        q14, d12, d13
-    vsubl.u8        q15, d14, d15
-
-    vst1.16         {q8}, [r0]!             ;store diff
-    vst1.16         {q9}, [r0]!
-    vst1.16         {q10}, [r0]!
-    vst1.16         {q11}, [r0]!
-    vst1.16         {q12}, [r0]!
-    vst1.16         {q13}, [r0]!
-    vst1.16         {q14}, [r0]!
-    vst1.16         {q15}, [r0]!
-
-    bx              lr
-    ENDP
-
-    END
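All three routines compute diff = src - pred, widening with vsubl.u8; the luma case reduces to the sketch below (illustrative name). Note that the predictor rows are contiguous, 16 bytes apart, while the source is strided, matching the addressing modes above:

    void subtract_mby_sketch(short *diff, const unsigned char *src,
                             const unsigned char *pred, int stride) {
      int r, c;
      for (r = 0; r < 16; r++) {
        for (c = 0; c < 16; c++)
          diff[c] = (short)(src[c] - pred[c]);
        diff += 16;
        pred += 16;      /* predictor is packed, no stride */
        src  += stride;
      }
    }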
--- a/vp8/encoder/arm/neon/variance_neon.asm
+++ /dev/null
@@ -1,276 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_variance16x16_neon|
-    EXPORT  |vp9_variance16x8_neon|
-    EXPORT  |vp9_variance8x16_neon|
-    EXPORT  |vp9_variance8x8_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char *src_ptr
-; r1    int source_stride
-; r2    unsigned char *ref_ptr
-; r3    int  recon_stride
-; stack unsigned int *sse
-|vp9_variance16x16_neon| PROC
-    vmov.i8         q8, #0                      ;q8 - sum
-    vmov.i8         q9, #0                      ;q9, q10 - sse
-    vmov.i8         q10, #0
-
-    mov             r12, #8
-
-variance16x16_neon_loop
-    vld1.8          {q0}, [r0], r1              ;Load up source and reference
-    vld1.8          {q2}, [r2], r3
-    vld1.8          {q1}, [r0], r1
-    vld1.8          {q3}, [r2], r3
-
-    vsubl.u8        q11, d0, d4                 ;calculate diff
-    vsubl.u8        q12, d1, d5
-    vsubl.u8        q13, d2, d6
-    vsubl.u8        q14, d3, d7
-
-    ;VPADAL adds adjacent pairs of elements of a vector and accumulates
-    ;the results into the elements of the destination vector. (The explanation
-    ;in the ARM guide is wrong.)
-    vpadal.s16      q8, q11                     ;calculate sum
-    vmlal.s16       q9, d22, d22                ;calculate sse
-    vmlal.s16       q10, d23, d23
-
-    subs            r12, r12, #1
-
-    vpadal.s16      q8, q12
-    vmlal.s16       q9, d24, d24
-    vmlal.s16       q10, d25, d25
-    vpadal.s16      q8, q13
-    vmlal.s16       q9, d26, d26
-    vmlal.s16       q10, d27, d27
-    vpadal.s16      q8, q14
-    vmlal.s16       q9, d28, d28
-    vmlal.s16       q10, d29, d29
-
-    bne             variance16x16_neon_loop
-
-    vadd.u32        q10, q9, q10                ;accumulate sse
-    vpaddl.s32      q0, q8                      ;accumulate sum
-
-    ldr             r12, [sp]                   ;load *sse from stack
-
-    vpaddl.u32      q1, q10
-    vadd.s64        d0, d0, d1
-    vadd.u64        d1, d2, d3
-
-    ;vmov.32        r0, d0[0]                   ;this instruction costs a lot
-    ;vmov.32        r1, d1[0]
-    ;mul            r0, r0, r0
-    ;str            r1, [r12]
-    ;sub            r0, r1, r0, asr #8
-
-    ;sum is in [-255x256, 255x256]. sum*sum is 32-bit. The right shift must
-    ;sign-extend, which is what vshr.s does; s32 operands are needed to get this right.
-    vmull.s32       q5, d0, d0
-    vst1.32         {d1[0]}, [r12]              ;store sse
-    vshr.s32        d10, d10, #8
-    vsub.s32        d0, d1, d10
-
-    vmov.32         r0, d0[0]                   ;return
-    bx              lr
-
-    ENDP
-
-;================================
-;unsigned int vp9_variance16x8_c(
-;    unsigned char *src_ptr,
-;    int  source_stride,
-;    unsigned char *ref_ptr,
-;    int  recon_stride,
-;   unsigned int *sse)
-|vp9_variance16x8_neon| PROC
-    vmov.i8         q8, #0                      ;q8 - sum
-    vmov.i8         q9, #0                      ;q9, q10 - sse
-    vmov.i8         q10, #0
-
-    mov             r12, #4
-
-variance16x8_neon_loop
-    vld1.8          {q0}, [r0], r1              ;Load up source and reference
-    vld1.8          {q2}, [r2], r3
-    vld1.8          {q1}, [r0], r1
-    vld1.8          {q3}, [r2], r3
-
-    vsubl.u8        q11, d0, d4                 ;calculate diff
-    vsubl.u8        q12, d1, d5
-    vsubl.u8        q13, d2, d6
-    vsubl.u8        q14, d3, d7
-
-    vpadal.s16      q8, q11                     ;calculate sum
-    vmlal.s16       q9, d22, d22                ;calculate sse
-    vmlal.s16       q10, d23, d23
-
-    subs            r12, r12, #1
-
-    vpadal.s16      q8, q12
-    vmlal.s16       q9, d24, d24
-    vmlal.s16       q10, d25, d25
-    vpadal.s16      q8, q13
-    vmlal.s16       q9, d26, d26
-    vmlal.s16       q10, d27, d27
-    vpadal.s16      q8, q14
-    vmlal.s16       q9, d28, d28
-    vmlal.s16       q10, d29, d29
-
-    bne             variance16x8_neon_loop
-
-    vadd.u32        q10, q9, q10                ;accumulate sse
-    vpaddl.s32      q0, q8                      ;accumulate sum
-
-    ldr             r12, [sp]                   ;load *sse from stack
-
-    vpaddl.u32      q1, q10
-    vadd.s64        d0, d0, d1
-    vadd.u64        d1, d2, d3
-
-    vmull.s32       q5, d0, d0
-    vst1.32         {d1[0]}, [r12]              ;store sse
-    vshr.s32        d10, d10, #7
-    vsub.s32        d0, d1, d10
-
-    vmov.32         r0, d0[0]                   ;return
-    bx              lr
-
-    ENDP
-
-;=================================
-;unsigned int vp9_variance8x16_c(
-;    unsigned char *src_ptr,
-;    int  source_stride,
-;    unsigned char *ref_ptr,
-;    int  recon_stride,
-;   unsigned int *sse)
-
-|vp9_variance8x16_neon| PROC
-    vmov.i8         q8, #0                      ;q8 - sum
-    vmov.i8         q9, #0                      ;q9, q10 - sse
-    vmov.i8         q10, #0
-
-    mov             r12, #8
-
-variance8x16_neon_loop
-    vld1.8          {d0}, [r0], r1              ;Load up source and reference
-    vld1.8          {d4}, [r2], r3
-    vld1.8          {d2}, [r0], r1
-    vld1.8          {d6}, [r2], r3
-
-    vsubl.u8        q11, d0, d4                 ;calculate diff
-    vsubl.u8        q12, d2, d6
-
-    vpadal.s16      q8, q11                     ;calculate sum
-    vmlal.s16       q9, d22, d22                ;calculate sse
-    vmlal.s16       q10, d23, d23
-
-    subs            r12, r12, #1
-
-    vpadal.s16      q8, q12
-    vmlal.s16       q9, d24, d24
-    vmlal.s16       q10, d25, d25
-
-    bne             variance8x16_neon_loop
-
-    vadd.u32        q10, q9, q10                ;accumulate sse
-    vpaddl.s32      q0, q8                      ;accumulate sum
-
-    ldr             r12, [sp]                   ;load *sse from stack
-
-    vpaddl.u32      q1, q10
-    vadd.s64        d0, d0, d1
-    vadd.u64        d1, d2, d3
-
-    vmull.s32       q5, d0, d0
-    vst1.32         {d1[0]}, [r12]              ;store sse
-    vshr.s32        d10, d10, #7
-    vsub.s32        d0, d1, d10
-
-    vmov.32         r0, d0[0]                   ;return
-    bx              lr
-
-    ENDP
-
-;==================================
-; r0    unsigned char *src_ptr
-; r1    int source_stride
-; r2    unsigned char *ref_ptr
-; r3    int  recon_stride
-; stack unsigned int *sse
-|vp9_variance8x8_neon| PROC
-    vmov.i8         q8, #0                      ;q8 - sum
-    vmov.i8         q9, #0                      ;q9, q10 - sse
-    vmov.i8         q10, #0
-
-    mov             r12, #2
-
-variance8x8_neon_loop
-    vld1.8          {d0}, [r0], r1              ;Load up source and reference
-    vld1.8          {d4}, [r2], r3
-    vld1.8          {d1}, [r0], r1
-    vld1.8          {d5}, [r2], r3
-    vld1.8          {d2}, [r0], r1
-    vld1.8          {d6}, [r2], r3
-    vld1.8          {d3}, [r0], r1
-    vld1.8          {d7}, [r2], r3
-
-    vsubl.u8        q11, d0, d4                 ;calculate diff
-    vsubl.u8        q12, d1, d5
-    vsubl.u8        q13, d2, d6
-    vsubl.u8        q14, d3, d7
-
-    vpadal.s16      q8, q11                     ;calculate sum
-    vmlal.s16       q9, d22, d22                ;calculate sse
-    vmlal.s16       q10, d23, d23
-
-    subs            r12, r12, #1
-
-    vpadal.s16      q8, q12
-    vmlal.s16       q9, d24, d24
-    vmlal.s16       q10, d25, d25
-    vpadal.s16      q8, q13
-    vmlal.s16       q9, d26, d26
-    vmlal.s16       q10, d27, d27
-    vpadal.s16      q8, q14
-    vmlal.s16       q9, d28, d28
-    vmlal.s16       q10, d29, d29
-
-    bne             variance8x8_neon_loop
-
-    vadd.u32        q10, q9, q10                ;accumulate sse
-    vpaddl.s32      q0, q8                      ;accumulate sum
-
-    ldr             r12, [sp]                   ;load *sse from stack
-
-    vpaddl.u32      q1, q10
-    vadd.s64        d0, d0, d1
-    vadd.u64        d1, d2, d3
-
-    vmull.s32       q5, d0, d0
-    vst1.32         {d1[0]}, [r12]              ;store sse
-    vshr.s32        d10, d10, #6
-    vsub.s32        d0, d1, d10
-
-    vmov.32         r0, d0[0]                   ;return
-    bx              lr
-
-    ENDP
-
-    END
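Each variant returns sse - sum*sum/N, with N encoded in the final shift (#8 for 256 pixels, #7 for 128, #6 for 64). A scalar sketch of the 16x16 case (illustrative name; the 64-bit widening plays the role of the vmull.s32 above):

    unsigned int variance16x16_sketch(const unsigned char *src, int src_stride,
                                      const unsigned char *ref, int ref_stride,
                                      unsigned int *sse) {
      int r, c, sum = 0;
      unsigned int sse_acc = 0;
      for (r = 0; r < 16; r++) {
        for (c = 0; c < 16; c++) {
          int d = src[c] - ref[c];
          sum += d;
          sse_acc += (unsigned int)(d * d);
        }
        src += src_stride;
        ref += ref_stride;
      }
      *sse = sse_acc;
      /* widen before squaring, as vmull.s32 does; >> 8 divides by 256 */
      return sse_acc - (unsigned int)(((long long)sum * sum) >> 8);
    }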
--- a/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
+++ /dev/null
@@ -1,68 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT |vp8_memcpy_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;=========================================
-;void vp8_memcpy_neon(unsigned char *dst_ptr, unsigned char *src_ptr, int sz);
-|vp8_memcpy_neon| PROC
-    ;pld                [r1]                        ;preload pred data
-    ;pld                [r1, #128]
-    ;pld                [r1, #256]
-    ;pld                [r1, #384]
-
-    mov             r12, r2, lsr #8                 ;copy 256 bytes of data at a time
-
-memcpy_neon_loop
-    vld1.8          {q0, q1}, [r1]!                 ;load src data
-    subs            r12, r12, #1
-    vld1.8          {q2, q3}, [r1]!
-    vst1.8          {q0, q1}, [r0]!                 ;copy to dst_ptr
-    vld1.8          {q4, q5}, [r1]!
-    vst1.8          {q2, q3}, [r0]!
-    vld1.8          {q6, q7}, [r1]!
-    vst1.8          {q4, q5}, [r0]!
-    vld1.8          {q8, q9}, [r1]!
-    vst1.8          {q6, q7}, [r0]!
-    vld1.8          {q10, q11}, [r1]!
-    vst1.8          {q8, q9}, [r0]!
-    vld1.8          {q12, q13}, [r1]!
-    vst1.8          {q10, q11}, [r0]!
-    vld1.8          {q14, q15}, [r1]!
-    vst1.8          {q12, q13}, [r0]!
-    vst1.8          {q14, q15}, [r0]!
-
-    ;pld                [r1]                        ;preload pred data -- need to adjust for real device
-    ;pld                [r1, #128]
-    ;pld                [r1, #256]
-    ;pld                [r1, #384]
-
-    bne             memcpy_neon_loop
-
-    ands            r3, r2, #0xff                   ;extra copy
-    beq             done_copy_neon_loop
-
-extra_copy_neon_loop
-    vld1.8          {q0}, [r1]!                 ;load src data
-    subs            r3, r3, #16
-    vst1.8          {q0}, [r0]!
-    bne             extra_copy_neon_loop
-
-done_copy_neon_loop
-    bx              lr
-    ENDP
-
-    END
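Structurally, the copy moves 256 bytes per main-loop iteration and then 16 bytes at a time for the remainder, so sz is implicitly assumed to be a multiple of 16. A sketch of the control flow (illustrative name):

    #include <string.h>

    void memcpy_neon_sketch(unsigned char *dst, const unsigned char *src, int sz) {
      int chunks = sz >> 8;     /* 256-byte main-loop iterations */
      int rem = sz & 0xff;      /* leftover, copied 16 bytes at a time */
      while (chunks-- > 0) {
        memcpy(dst, src, 256);
        dst += 256;
        src += 256;
      }
      while (rem > 0) {         /* assumes sz % 16 == 0, as the asm does */
        memcpy(dst, src, 16);
        dst += 16;
        src += 16;
        rem -= 16;
      }
    }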
--- a/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm
+++ /dev/null
@@ -1,116 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_mse16x16_neon|
-    EXPORT  |vp8_get4x4sse_cs_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;============================
-; r0    unsigned char *src_ptr
-; r1    int source_stride
-; r2    unsigned char *ref_ptr
-; r3    int  recon_stride
-; stack unsigned int *sse
-;note: in this function, sum is never used, so that part of the calculation
-;from vp9_variance() can be dropped.
-
-|vp8_mse16x16_neon| PROC
-    vmov.i8         q7, #0                      ;q7, q8, q9, q10 - sse
-    vmov.i8         q8, #0
-    vmov.i8         q9, #0
-    vmov.i8         q10, #0
-
-    mov             r12, #8
-
-mse16x16_neon_loop
-    vld1.8          {q0}, [r0], r1              ;Load up source and reference
-    vld1.8          {q2}, [r2], r3
-    vld1.8          {q1}, [r0], r1
-    vld1.8          {q3}, [r2], r3
-
-    vsubl.u8        q11, d0, d4
-    vsubl.u8        q12, d1, d5
-    vsubl.u8        q13, d2, d6
-    vsubl.u8        q14, d3, d7
-
-    vmlal.s16       q7, d22, d22
-    vmlal.s16       q8, d23, d23
-
-    subs            r12, r12, #1
-
-    vmlal.s16       q9, d24, d24
-    vmlal.s16       q10, d25, d25
-    vmlal.s16       q7, d26, d26
-    vmlal.s16       q8, d27, d27
-    vmlal.s16       q9, d28, d28
-    vmlal.s16       q10, d29, d29
-
-    bne             mse16x16_neon_loop
-
-    vadd.u32        q7, q7, q8
-    vadd.u32        q9, q9, q10
-
-    ldr             r12, [sp]               ;load *sse from stack
-
-    vadd.u32        q10, q7, q9
-    vpaddl.u32      q1, q10
-    vadd.u64        d0, d2, d3
-
-    vst1.32         {d0[0]}, [r12]
-    vmov.32         r0, d0[0]
-
-    bx              lr
-
-    ENDP
-
-
-;=============================
-; r0    unsigned char *src_ptr,
-; r1    int  source_stride,
-; r2    unsigned char *ref_ptr,
-; r3    int  recon_stride
-|vp8_get4x4sse_cs_neon| PROC
-    vld1.8          {d0}, [r0], r1              ;Load up source and reference
-    vld1.8          {d4}, [r2], r3
-    vld1.8          {d1}, [r0], r1
-    vld1.8          {d5}, [r2], r3
-    vld1.8          {d2}, [r0], r1
-    vld1.8          {d6}, [r2], r3
-    vld1.8          {d3}, [r0], r1
-    vld1.8          {d7}, [r2], r3
-
-    vsubl.u8        q11, d0, d4
-    vsubl.u8        q12, d1, d5
-    vsubl.u8        q13, d2, d6
-    vsubl.u8        q14, d3, d7
-
-    vmull.s16       q7, d22, d22
-    vmull.s16       q8, d24, d24
-    vmull.s16       q9, d26, d26
-    vmull.s16       q10, d28, d28
-
-    vadd.u32        q7, q7, q8
-    vadd.u32        q9, q9, q10
-    vadd.u32        q9, q7, q9
-
-    vpaddl.u32      q1, q9
-    vadd.u64        d0, d2, d3
-
-    vmov.32         r0, d0[0]
-    bx              lr
-
-    ENDP
-
-    END
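As the note above says, this is the variance kernel with the sum term dropped: the return value and *sse are the same accumulated squared error. A scalar sketch (illustrative name):

    unsigned int mse16x16_sketch(const unsigned char *src, int src_stride,
                                 const unsigned char *ref, int ref_stride,
                                 unsigned int *sse) {
      int r, c;
      unsigned int acc = 0;
      for (r = 0; r < 16; r++) {
        for (c = 0; c < 16; c++) {
          int d = src[c] - ref[c];
          acc += (unsigned int)(d * d);
        }
        src += src_stride;
        ref += ref_stride;
      }
      *sse = acc;
      return acc;   /* no sum*sum/N correction, unlike the variance kernels */
    }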
--- a/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm
+++ /dev/null
@@ -1,103 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_short_walsh4x4_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp8_short_walsh4x4_neon(short *input, short *output, int pitch)
-; r0   short *input,
-; r1   short *output,
-; r2   int pitch
-|vp8_short_walsh4x4_neon| PROC
-
-    vld1.16         {d0}, [r0@64], r2   ; load input
-    vld1.16         {d1}, [r0@64], r2
-    vld1.16         {d2}, [r0@64], r2
-    vld1.16         {d3}, [r0@64]
-
-    ;First for-loop
-    ;transpose d0, d1, d2, d3. Then, d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
-    vtrn.32         d0, d2
-    vtrn.32         d1, d3
-
-    vmov.s32        q15, #3             ; add 3 to all values
-
-    vtrn.16         d0, d1
-    vtrn.16         d2, d3
-
-    vadd.s16        d4, d0, d2          ; ip[0] + ip[2]
-    vadd.s16        d5, d1, d3          ; ip[1] + ip[3]
-    vsub.s16        d6, d1, d3          ; ip[1] - ip[3]
-    vsub.s16        d7, d0, d2          ; ip[0] - ip[2]
-
-    vshl.s16        d4, d4, #2          ; a1 = (ip[0] + ip[2]) << 2
-    vshl.s16        d5, d5, #2          ; d1 = (ip[1] + ip[3]) << 2
-    vshl.s16        d6, d6, #2          ; c1 = (ip[1] - ip[3]) << 2
-    vceq.s16        d16, d4, #0         ; a1 == 0
-    vshl.s16        d7, d7, #2          ; b1 = (ip[0] - ip[2]) << 2
-
-    vadd.s16        d0, d4, d5          ; a1 + d1
-    vmvn            d16, d16            ; a1 != 0
-    vsub.s16        d3, d4, d5          ; op[3] = a1 - d1
-    vadd.s16        d1, d7, d6          ; op[1] = b1 + c1
-    vsub.s16        d2, d7, d6          ; op[2] = b1 - c1
-    vsub.s16        d0, d0, d16         ; op[0] = a1 + d1 + (a1 != 0)
-
-    ;Second for-loop
-    ;transpose d0, d1, d2, d3. Then, d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
-    vtrn.32         d1, d3
-    vtrn.32         d0, d2
-    vtrn.16         d2, d3
-    vtrn.16         d0, d1
-
-    vaddl.s16       q8, d0, d2          ; a1 = ip[0]+ip[8]
-    vaddl.s16       q9, d1, d3          ; d1 = ip[4]+ip[12]
-    vsubl.s16       q10, d1, d3         ; c1 = ip[4]-ip[12]
-    vsubl.s16       q11, d0, d2         ; b1 = ip[0]-ip[8]
-
-    vadd.s32        q0, q8, q9          ; a2 = a1 + d1
-    vadd.s32        q1, q11, q10        ; b2 = b1 + c1
-    vsub.s32        q2, q11, q10        ; c2 = b1 - c1
-    vsub.s32        q3, q8, q9          ; d2 = a1 - d1
-
-    vclt.s32        q8, q0, #0
-    vclt.s32        q9, q1, #0
-    vclt.s32        q10, q2, #0
-    vclt.s32        q11, q3, #0
-
-    ; subtract -1 (or 0)
-    vsub.s32        q0, q0, q8          ; a2 += a2 < 0
-    vsub.s32        q1, q1, q9          ; b2 += b2 < 0
-    vsub.s32        q2, q2, q10         ; c2 += c2 < 0
-    vsub.s32        q3, q3, q11         ; d2 += d2 < 0
-
-    vadd.s32        q8, q0, q15         ; a2 + 3
-    vadd.s32        q9, q1, q15         ; b2 + 3
-    vadd.s32        q10, q2, q15        ; c2 + 3
-    vadd.s32        q11, q3, q15        ; d2 + 3
-
-    ; vrshrn? it would add a rounding bias of 1 << (3-1) = 4
-    vshrn.s32       d0, q8, #3
-    vshrn.s32       d1, q9, #3
-    vshrn.s32       d2, q10, #3
-    vshrn.s32       d3, q11, #3
-
-    vst1.16         {q0, q1}, [r1@128]
-
-    bx              lr
-
-    ENDP
-
-    END
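Spelled out as scalar C, the two annotated passes come to the sketch below (illustrative name; pitch is in bytes). The (a1 != 0) and (a2 < 0) adjustments correspond to the vceq/vmvn and vclt/vsub sequences above:

    void short_walsh4x4_sketch(const short *input, short *output, int pitch) {
      int i, t[16];
      const short *ip = input;
      for (i = 0; i < 4; i++) {              /* first pass, on rows */
        int a1 = (ip[0] + ip[2]) << 2;
        int d1 = (ip[1] + ip[3]) << 2;
        int c1 = (ip[1] - ip[3]) << 2;
        int b1 = (ip[0] - ip[2]) << 2;
        t[i * 4 + 0] = a1 + d1 + (a1 != 0);
        t[i * 4 + 1] = b1 + c1;
        t[i * 4 + 2] = b1 - c1;
        t[i * 4 + 3] = a1 - d1;
        ip += pitch / 2;                     /* pitch counts bytes */
      }
      for (i = 0; i < 4; i++) {              /* second pass, on columns */
        int a1 = t[i + 0] + t[i + 8];
        int d1 = t[i + 4] + t[i + 12];
        int c1 = t[i + 4] - t[i + 12];
        int b1 = t[i + 0] - t[i + 8];
        int a2 = a1 + d1, b2 = b1 + c1;
        int c2 = b1 - c1, d2 = a1 - d1;
        a2 += a2 < 0;                        /* bias negatives before the */
        b2 += b2 < 0;                        /* (x + 3) >> 3 rounding     */
        c2 += c2 < 0;
        d2 += d2 < 0;
        output[i + 0]  = (short)((a2 + 3) >> 3);
        output[i + 4]  = (short)((b2 + 3) >> 3);
        output[i + 8]  = (short)((c2 + 3) >> 3);
        output[i + 12] = (short)((d2 + 3) >> 3);
      }
    }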
--- a/vp8/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm
+++ /dev/null
@@ -1,425 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_sub_pixel_variance16x16_neon_func|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0    unsigned char  *src_ptr,
-; r1    int  src_pixels_per_line,
-; r2    int  xoffset,
-; r3    int  yoffset,
-; stack(r4) unsigned char *dst_ptr,
-; stack(r5) int dst_pixels_per_line,
-; stack(r6) unsigned int *sse
-;note: most of the code is copied from bilinear_predict16x16_neon and vp9_variance16x16_neon.
-
-|vp9_sub_pixel_variance16x16_neon_func| PROC
-    push            {r4-r6, lr}
-
-    ldr             r12, _BilinearTaps_coeff_
-    ldr             r4, [sp, #16]           ;load *dst_ptr from stack
-    ldr             r5, [sp, #20]           ;load dst_pixels_per_line from stack
-    ldr             r6, [sp, #24]           ;load *sse from stack
-
-    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
-    beq             secondpass_bfilter16x16_only
-
-    add             r2, r12, r2, lsl #3     ;calculate filter location
-
-    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
-
-    vld1.s32        {d31}, [r2]             ;load first_pass filter
-
-    beq             firstpass_bfilter16x16_only
-
-    sub             sp, sp, #272            ;reserve space on stack for temporary storage
-    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data
-    mov             lr, sp
-    vld1.u8         {d5, d6, d7}, [r0], r1
-
-    mov             r2, #3                  ;loop counter
-    vld1.u8         {d8, d9, d10}, [r0], r1
-
-    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1)
-    vld1.u8         {d11, d12, d13}, [r0], r1
-
-    vdup.8          d1, d31[4]
-
-;First Pass: output_height lines x output_width columns (17x16)
-vp8e_filt_blk2d_fp16x16_loop_neon
-    pld             [r0]
-    pld             [r0, r1]
-    pld             [r0, r1, lsl #1]
-
-    vmull.u8        q7, d2, d0              ;(src_ptr[0] * Filter[0])
-    vmull.u8        q8, d3, d0
-    vmull.u8        q9, d5, d0
-    vmull.u8        q10, d6, d0
-    vmull.u8        q11, d8, d0
-    vmull.u8        q12, d9, d0
-    vmull.u8        q13, d11, d0
-    vmull.u8        q14, d12, d0
-
-    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
-    vext.8          d5, d5, d6, #1
-    vext.8          d8, d8, d9, #1
-    vext.8          d11, d11, d12, #1
-
-    vmlal.u8        q7, d2, d1              ;(src_ptr[0] * Filter[1])
-    vmlal.u8        q9, d5, d1
-    vmlal.u8        q11, d8, d1
-    vmlal.u8        q13, d11, d1
-
-    vext.8          d3, d3, d4, #1
-    vext.8          d6, d6, d7, #1
-    vext.8          d9, d9, d10, #1
-    vext.8          d12, d12, d13, #1
-
-    vmlal.u8        q8, d3, d1              ;(src_ptr[0] * Filter[1])
-    vmlal.u8        q10, d6, d1
-    vmlal.u8        q12, d9, d1
-    vmlal.u8        q14, d12, d1
-
-    subs            r2, r2, #1
-
-    vqrshrn.u16    d14, q7, #7              ;shift/round/saturate to u8
-    vqrshrn.u16    d15, q8, #7
-    vqrshrn.u16    d16, q9, #7
-    vqrshrn.u16    d17, q10, #7
-    vqrshrn.u16    d18, q11, #7
-    vqrshrn.u16    d19, q12, #7
-    vqrshrn.u16    d20, q13, #7
-
-    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data
-    vqrshrn.u16    d21, q14, #7
-    vld1.u8         {d5, d6, d7}, [r0], r1
-
-    vst1.u8         {d14, d15, d16, d17}, [lr]!     ;store result
-    vld1.u8         {d8, d9, d10}, [r0], r1
-    vst1.u8         {d18, d19, d20, d21}, [lr]!
-    vld1.u8         {d11, d12, d13}, [r0], r1
-
-    bne             vp8e_filt_blk2d_fp16x16_loop_neon
-
-;First-pass filtering for the remaining 5 lines
-    vld1.u8         {d14, d15, d16}, [r0], r1
-
-    vmull.u8        q9, d2, d0              ;(src_ptr[0] * Filter[0])
-    vmull.u8        q10, d3, d0
-    vmull.u8        q11, d5, d0
-    vmull.u8        q12, d6, d0
-    vmull.u8        q13, d8, d0
-    vmull.u8        q14, d9, d0
-
-    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
-    vext.8          d5, d5, d6, #1
-    vext.8          d8, d8, d9, #1
-
-    vmlal.u8        q9, d2, d1              ;(src_ptr[0] * Filter[1])
-    vmlal.u8        q11, d5, d1
-    vmlal.u8        q13, d8, d1
-
-    vext.8          d3, d3, d4, #1
-    vext.8          d6, d6, d7, #1
-    vext.8          d9, d9, d10, #1
-
-    vmlal.u8        q10, d3, d1             ;(src_ptr[0] * Filter[1])
-    vmlal.u8        q12, d6, d1
-    vmlal.u8        q14, d9, d1
-
-    vmull.u8        q1, d11, d0
-    vmull.u8        q2, d12, d0
-    vmull.u8        q3, d14, d0
-    vmull.u8        q4, d15, d0
-
-    vext.8          d11, d11, d12, #1       ;construct src_ptr[1]
-    vext.8          d14, d14, d15, #1
-
-    vmlal.u8        q1, d11, d1             ;(src_ptr[0] * Filter[1])
-    vmlal.u8        q3, d14, d1
-
-    vext.8          d12, d12, d13, #1
-    vext.8          d15, d15, d16, #1
-
-    vmlal.u8        q2, d12, d1             ;(src_ptr[0] * Filter[1])
-    vmlal.u8        q4, d15, d1
-
-    vqrshrn.u16    d10, q9, #7              ;shift/round/saturate to u8
-    vqrshrn.u16    d11, q10, #7
-    vqrshrn.u16    d12, q11, #7
-    vqrshrn.u16    d13, q12, #7
-    vqrshrn.u16    d14, q13, #7
-    vqrshrn.u16    d15, q14, #7
-    vqrshrn.u16    d16, q1, #7
-    vqrshrn.u16    d17, q2, #7
-    vqrshrn.u16    d18, q3, #7
-    vqrshrn.u16    d19, q4, #7
-
-    vst1.u8         {d10, d11, d12, d13}, [lr]!         ;store result
-    vst1.u8         {d14, d15, d16, d17}, [lr]!
-    vst1.u8         {d18, d19}, [lr]!
-
-;Second pass: 16x16
-;secondpass_filter
-    add             r3, r12, r3, lsl #3
-    sub             lr, lr, #272
-
-    vld1.u32        {d31}, [r3]             ;load second_pass filter
-
-    sub             sp, sp, #256
-    mov             r3, sp
-
-    vld1.u8         {d22, d23}, [lr]!       ;load src data
-
-    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
-    vdup.8          d1, d31[4]
-    mov             r12, #4                 ;loop counter
-
-vp8e_filt_blk2d_sp16x16_loop_neon
-    vld1.u8         {d24, d25}, [lr]!
-    vmull.u8        q1, d22, d0             ;(src_ptr[0] * Filter[0])
-    vld1.u8         {d26, d27}, [lr]!
-    vmull.u8        q2, d23, d0
-    vld1.u8         {d28, d29}, [lr]!
-    vmull.u8        q3, d24, d0
-    vld1.u8         {d30, d31}, [lr]!
-
-    vmull.u8        q4, d25, d0
-    vmull.u8        q5, d26, d0
-    vmull.u8        q6, d27, d0
-    vmull.u8        q7, d28, d0
-    vmull.u8        q8, d29, d0
-
-    vmlal.u8        q1, d24, d1             ;(src_ptr[pixel_step] * Filter[1])
-    vmlal.u8        q2, d25, d1
-    vmlal.u8        q3, d26, d1
-    vmlal.u8        q4, d27, d1
-    vmlal.u8        q5, d28, d1
-    vmlal.u8        q6, d29, d1
-    vmlal.u8        q7, d30, d1
-    vmlal.u8        q8, d31, d1
-
-    subs            r12, r12, #1
-
-    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
-    vqrshrn.u16    d3, q2, #7
-    vqrshrn.u16    d4, q3, #7
-    vqrshrn.u16    d5, q4, #7
-    vqrshrn.u16    d6, q5, #7
-    vqrshrn.u16    d7, q6, #7
-    vqrshrn.u16    d8, q7, #7
-    vqrshrn.u16    d9, q8, #7
-
-    vst1.u8         {d2, d3}, [r3]!         ;store result
-    vst1.u8         {d4, d5}, [r3]!
-    vst1.u8         {d6, d7}, [r3]!
-    vmov            q11, q15
-    vst1.u8         {d8, d9}, [r3]!
-
-    bne             vp8e_filt_blk2d_sp16x16_loop_neon
-
-    b               sub_pixel_variance16x16_neon
-
-;--------------------
-firstpass_bfilter16x16_only
-    mov             r2, #4                      ;loop counter
-    sub             sp, sp, #528            ;reserve space on stack for temporary storage
-    vdup.8          d0, d31[0]                  ;first_pass filter (d0 d1)
-    vdup.8          d1, d31[4]
-    mov             r3, sp
-
-;First Pass: output_height lines x output_width columns (16x16)
-vp8e_filt_blk2d_fpo16x16_loop_neon
-    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data
-    vld1.u8         {d5, d6, d7}, [r0], r1
-    vld1.u8         {d8, d9, d10}, [r0], r1
-    vld1.u8         {d11, d12, d13}, [r0], r1
-
-    pld             [r0]
-    pld             [r0, r1]
-    pld             [r0, r1, lsl #1]
-
-    vmull.u8        q7, d2, d0              ;(src_ptr[0] * Filter[0])
-    vmull.u8        q8, d3, d0
-    vmull.u8        q9, d5, d0
-    vmull.u8        q10, d6, d0
-    vmull.u8        q11, d8, d0
-    vmull.u8        q12, d9, d0
-    vmull.u8        q13, d11, d0
-    vmull.u8        q14, d12, d0
-
-    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
-    vext.8          d5, d5, d6, #1
-    vext.8          d8, d8, d9, #1
-    vext.8          d11, d11, d12, #1
-
-    vmlal.u8        q7, d2, d1              ;(src_ptr[1] * Filter[1])
-    vmlal.u8        q9, d5, d1
-    vmlal.u8        q11, d8, d1
-    vmlal.u8        q13, d11, d1
-
-    vext.8          d3, d3, d4, #1
-    vext.8          d6, d6, d7, #1
-    vext.8          d9, d9, d10, #1
-    vext.8          d12, d12, d13, #1
-
-    vmlal.u8        q8, d3, d1              ;(src_ptr[1] * Filter[1])
-    vmlal.u8        q10, d6, d1
-    vmlal.u8        q12, d9, d1
-    vmlal.u8        q14, d12, d1
-
-    subs            r2, r2, #1
-
-    vqrshrn.u16    d14, q7, #7              ;shift/round/saturate to u8
-    vqrshrn.u16    d15, q8, #7
-    vqrshrn.u16    d16, q9, #7
-    vqrshrn.u16    d17, q10, #7
-    vqrshrn.u16    d18, q11, #7
-    vqrshrn.u16    d19, q12, #7
-    vqrshrn.u16    d20, q13, #7
-    vst1.u8         {d14, d15}, [r3]!       ;store result
-    vqrshrn.u16    d21, q14, #7
-
-    vst1.u8         {d16, d17}, [r3]!
-    vst1.u8         {d18, d19}, [r3]!
-    vst1.u8         {d20, d21}, [r3]!
-
-    bne             vp8e_filt_blk2d_fpo16x16_loop_neon
-
-    b               sub_pixel_variance16x16_neon
-
-;---------------------
-secondpass_bfilter16x16_only
-;Second pass: 16x16
-;secondpass_filter
-    sub             sp, sp, #528            ;reserve space on stack for temporary storage
-    add             r3, r12, r3, lsl #3
-    mov             r12, #4                     ;loop counter
-    vld1.u32        {d31}, [r3]                 ;load second_pass filter
-    vld1.u8         {d22, d23}, [r0], r1        ;load src data
-    mov             r3, sp
-
-    vdup.8          d0, d31[0]                  ;second_pass filter parameters (d0 d1)
-    vdup.8          d1, d31[4]
-
-vp8e_filt_blk2d_spo16x16_loop_neon
-    vld1.u8         {d24, d25}, [r0], r1
-    vmull.u8        q1, d22, d0             ;(src_ptr[0] * Filter[0])
-    vld1.u8         {d26, d27}, [r0], r1
-    vmull.u8        q2, d23, d0
-    vld1.u8         {d28, d29}, [r0], r1
-    vmull.u8        q3, d24, d0
-    vld1.u8         {d30, d31}, [r0], r1
-
-    vmull.u8        q4, d25, d0
-    vmull.u8        q5, d26, d0
-    vmull.u8        q6, d27, d0
-    vmull.u8        q7, d28, d0
-    vmull.u8        q8, d29, d0
-
-    vmlal.u8        q1, d24, d1             ;(src_ptr[pixel_step] * Filter[1])
-    vmlal.u8        q2, d25, d1
-    vmlal.u8        q3, d26, d1
-    vmlal.u8        q4, d27, d1
-    vmlal.u8        q5, d28, d1
-    vmlal.u8        q6, d29, d1
-    vmlal.u8        q7, d30, d1
-    vmlal.u8        q8, d31, d1
-
-    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
-    vqrshrn.u16    d3, q2, #7
-    vqrshrn.u16    d4, q3, #7
-    vqrshrn.u16    d5, q4, #7
-    vqrshrn.u16    d6, q5, #7
-    vqrshrn.u16    d7, q6, #7
-    vqrshrn.u16    d8, q7, #7
-    vqrshrn.u16    d9, q8, #7
-
-    vst1.u8         {d2, d3}, [r3]!         ;store result
-    subs            r12, r12, #1
-    vst1.u8         {d4, d5}, [r3]!
-    vmov            q11, q15
-    vst1.u8         {d6, d7}, [r3]!
-    vst1.u8         {d8, d9}, [r3]!
-
-    bne             vp8e_filt_blk2d_spo16x16_loop_neon
-
-    b               sub_pixel_variance16x16_neon
-
-;----------------------------
-;variance16x16
-sub_pixel_variance16x16_neon
-    vmov.i8         q8, #0                      ;q8 - sum
-    vmov.i8         q9, #0                      ;q9, q10 - sse
-    vmov.i8         q10, #0
-
-    sub             r3, r3, #256
-    mov             r12, #8
-
-sub_pixel_variance16x16_neon_loop
-    vld1.8          {q0}, [r3]!                 ;Load up source and reference
-    vld1.8          {q2}, [r4], r5
-    vld1.8          {q1}, [r3]!
-    vld1.8          {q3}, [r4], r5
-
-    vsubl.u8        q11, d0, d4                 ;diff
-    vsubl.u8        q12, d1, d5
-    vsubl.u8        q13, d2, d6
-    vsubl.u8        q14, d3, d7
-
-    vpadal.s16      q8, q11                     ;sum
-    vmlal.s16       q9, d22, d22                ;sse
-    vmlal.s16       q10, d23, d23
-
-    subs            r12, r12, #1
-
-    vpadal.s16      q8, q12
-    vmlal.s16       q9, d24, d24
-    vmlal.s16       q10, d25, d25
-    vpadal.s16      q8, q13
-    vmlal.s16       q9, d26, d26
-    vmlal.s16       q10, d27, d27
-    vpadal.s16      q8, q14
-    vmlal.s16       q9, d28, d28
-    vmlal.s16       q10, d29, d29
-
-    bne             sub_pixel_variance16x16_neon_loop
-
-    vadd.u32        q10, q9, q10                ;accumulate sse
-    vpaddl.s32      q0, q8                      ;accumulate sum
-
-    vpaddl.u32      q1, q10
-    vadd.s64        d0, d0, d1
-    vadd.u64        d1, d2, d3
-
-    vmull.s32       q5, d0, d0
-    vst1.32         {d1[0]}, [r6]               ;store sse
-    vshr.s32        d10, d10, #8
-    vsub.s32        d0, d1, d10
-
-    add             sp, sp, #528
-    vmov.32         r0, d0[0]                   ;return
-
-    pop             {r4-r6,pc}
-
-    ENDP
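The epilogue above folds the NEON accumulators down to variance = SSE - sum^2 / N; for a
16x16 block N = 256, hence the vshr.s32 ..., #8 (the 8x8 routine later in this patch
shifts by 6 for its 64 pixels). A minimal scalar C sketch of the same reduction, using
illustrative names that are not part of this source:

    #include <stdint.h>

    /* Sketch of the variance reduction performed by the NEON epilogue:
     * store the accumulated SSE, then return SSE - sum^2 / 256. */
    static unsigned int variance_epilogue_16x16(int sum, unsigned int sse,
                                                unsigned int *sse_out) {
      *sse_out = sse;                                         /* vst1.32 store */
      return sse - (unsigned int)(((int64_t)sum * sum) >> 8); /* vshr #8       */
    }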
-
-;-----------------
-
-_BilinearTaps_coeff_
-    DCD     bilinear_taps_coeff
-bilinear_taps_coeff
-    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
-
-    END
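For reference, bilinear_taps_coeff above packs eight {Filter[0], Filter[1]} pairs, one per
subpel offset, with Filter[0] + Filter[1] = 128; each filter pass computes a weighted
average that is rounded by the 7-bit shift seen in the vqrshrn.u16 ..., #7 instructions.
A hedged scalar C sketch of one tap (illustrative only, not code from this tree):

    /* Same coefficients as the DCD table above, as {Filter[0], Filter[1]} pairs. */
    static const int bilinear_taps[8][2] = {
      {128,   0}, {112,  16}, {96,  32}, {80,  48},
      { 64,  64}, { 48,  80}, {32,  96}, {16, 112}
    };

    /* One bilinear tap: (a*F0 + b*F1 + 64) >> 7, i.e. a rounded weighted mean
     * of two neighboring pixels. */
    static unsigned char bilinear_tap(unsigned char a, unsigned char b, int offset) {
      return (unsigned char)((a * bilinear_taps[offset][0] +
                              b * bilinear_taps[offset][1] + 64) >> 7);
    }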
--- a/vp8/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm
+++ /dev/null
@@ -1,572 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_variance_halfpixvar16x16_h_neon|
-    EXPORT  |vp9_variance_halfpixvar16x16_v_neon|
-    EXPORT  |vp9_variance_halfpixvar16x16_hv_neon|
-    EXPORT  |vp9_sub_pixel_variance16x16s_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;================================================
-;unsigned int vp9_variance_halfpixvar16x16_h_neon
-;(
-;    unsigned char  *src_ptr, r0
-;    int  src_pixels_per_line,  r1
-;    unsigned char *dst_ptr,  r2
-;    int dst_pixels_per_line,   r3
-;    unsigned int *sse
-;);
-;================================================
-|vp9_variance_halfpixvar16x16_h_neon| PROC
-    push            {lr}
-
-    mov             r12, #4                  ;loop counter
-    ldr             lr, [sp, #4]           ;load *sse from stack
-    vmov.i8         q8, #0                      ;q8 - sum
-    vmov.i8         q9, #0                      ;q9, q10 - sse
-    vmov.i8         q10, #0
-
-;First Pass: output_height lines x output_width columns (16x16)
-vp8_filt_fpo16x16s_4_0_loop_neon
-    vld1.u8         {d0, d1, d2, d3}, [r0], r1      ;load src data
-    vld1.8          {q11}, [r2], r3
-    vld1.u8         {d4, d5, d6, d7}, [r0], r1
-    vld1.8          {q12}, [r2], r3
-    vld1.u8         {d8, d9, d10, d11}, [r0], r1
-    vld1.8          {q13}, [r2], r3
-    vld1.u8         {d12, d13, d14, d15}, [r0], r1
-
-    ;pld                [r0]
-    ;pld                [r0, r1]
-    ;pld                [r0, r1, lsl #1]
-
-    vext.8          q1, q0, q1, #1          ;construct src_ptr[1]
-    vext.8          q3, q2, q3, #1
-    vext.8          q5, q4, q5, #1
-    vext.8          q7, q6, q7, #1
-
-    vrhadd.u8       q0, q0, q1              ;average with rounding: (src_ptr[0]+src_ptr[1]+1)>>1
-    vld1.8          {q14}, [r2], r3
-    vrhadd.u8       q1, q2, q3
-    vrhadd.u8       q2, q4, q5
-    vrhadd.u8       q3, q6, q7
-
-    vsubl.u8        q4, d0, d22                 ;diff
-    vsubl.u8        q5, d1, d23
-    vsubl.u8        q6, d2, d24
-    vsubl.u8        q7, d3, d25
-    vsubl.u8        q0, d4, d26
-    vsubl.u8        q1, d5, d27
-    vsubl.u8        q2, d6, d28
-    vsubl.u8        q3, d7, d29
-
-    vpadal.s16      q8, q4                     ;sum
-    vmlal.s16       q9, d8, d8                ;sse
-    vmlal.s16       q10, d9, d9
-
-    subs            r12, r12, #1
-
-    vpadal.s16      q8, q5
-    vmlal.s16       q9, d10, d10
-    vmlal.s16       q10, d11, d11
-    vpadal.s16      q8, q6
-    vmlal.s16       q9, d12, d12
-    vmlal.s16       q10, d13, d13
-    vpadal.s16      q8, q7
-    vmlal.s16       q9, d14, d14
-    vmlal.s16       q10, d15, d15
-
-    vpadal.s16      q8, q0                     ;sum
-    vmlal.s16       q9, d0, d0                ;sse
-    vmlal.s16       q10, d1, d1
-    vpadal.s16      q8, q1
-    vmlal.s16       q9, d2, d2
-    vmlal.s16       q10, d3, d3
-    vpadal.s16      q8, q2
-    vmlal.s16       q9, d4, d4
-    vmlal.s16       q10, d5, d5
-    vpadal.s16      q8, q3
-    vmlal.s16       q9, d6, d6
-    vmlal.s16       q10, d7, d7
-
-    bne             vp8_filt_fpo16x16s_4_0_loop_neon
-
-    vadd.u32        q10, q9, q10                ;accumulate sse
-    vpaddl.s32      q0, q8                      ;accumulate sum
-
-    vpaddl.u32      q1, q10
-    vadd.s64        d0, d0, d1
-    vadd.u64        d1, d2, d3
-
-    vmull.s32       q5, d0, d0
-    vst1.32         {d1[0]}, [lr]               ;store sse
-    vshr.s32        d10, d10, #8
-    vsub.s32        d0, d1, d10
-
-    vmov.32         r0, d0[0]                   ;return
-    pop             {pc}
-    ENDP
-
-;================================================
-;unsigned int vp9_variance_halfpixvar16x16_v_neon
-;(
-;    unsigned char  *src_ptr, r0
-;    int  src_pixels_per_line,  r1
-;    unsigned char *dst_ptr,  r2
-;    int dst_pixels_per_line,   r3
-;    unsigned int *sse
-;);
-;================================================
-|vp9_variance_halfpixvar16x16_v_neon| PROC
-    push            {lr}
-
-    mov             r12, #4                     ;loop counter
-
-    vld1.u8         {q0}, [r0], r1              ;load src data
-    ldr             lr, [sp, #4]                ;load *sse from stack
-
-    vmov.i8         q8, #0                      ;q8 - sum
-    vmov.i8         q9, #0                      ;q9, q10 - sse
-    vmov.i8         q10, #0
-
-vp8_filt_spo16x16s_0_4_loop_neon
-    vld1.u8         {q2}, [r0], r1
-    vld1.8          {q1}, [r2], r3
-    vld1.u8         {q4}, [r0], r1
-    vld1.8          {q3}, [r2], r3
-    vld1.u8         {q6}, [r0], r1
-    vld1.8          {q5}, [r2], r3
-    vld1.u8         {q15}, [r0], r1
-
-    vrhadd.u8       q0, q0, q2
-    vld1.8          {q7}, [r2], r3
-    vrhadd.u8       q2, q2, q4
-    vrhadd.u8       q4, q4, q6
-    vrhadd.u8       q6, q6, q15
-
-    vsubl.u8        q11, d0, d2                 ;diff
-    vsubl.u8        q12, d1, d3
-    vsubl.u8        q13, d4, d6
-    vsubl.u8        q14, d5, d7
-    vsubl.u8        q0, d8, d10
-    vsubl.u8        q1, d9, d11
-    vsubl.u8        q2, d12, d14
-    vsubl.u8        q3, d13, d15
-
-    vpadal.s16      q8, q11                     ;sum
-    vmlal.s16       q9, d22, d22                ;sse
-    vmlal.s16       q10, d23, d23
-
-    subs            r12, r12, #1
-
-    vpadal.s16      q8, q12
-    vmlal.s16       q9, d24, d24
-    vmlal.s16       q10, d25, d25
-    vpadal.s16      q8, q13
-    vmlal.s16       q9, d26, d26
-    vmlal.s16       q10, d27, d27
-    vpadal.s16      q8, q14
-    vmlal.s16       q9, d28, d28
-    vmlal.s16       q10, d29, d29
-
-    vpadal.s16      q8, q0                     ;sum
-    vmlal.s16       q9, d0, d0                 ;sse
-    vmlal.s16       q10, d1, d1
-    vpadal.s16      q8, q1
-    vmlal.s16       q9, d2, d2
-    vmlal.s16       q10, d3, d3
-    vpadal.s16      q8, q2
-    vmlal.s16       q9, d4, d4
-    vmlal.s16       q10, d5, d5
-
-    vmov            q0, q15
-
-    vpadal.s16      q8, q3
-    vmlal.s16       q9, d6, d6
-    vmlal.s16       q10, d7, d7
-
-    bne             vp8_filt_spo16x16s_0_4_loop_neon
-
-    vadd.u32        q10, q9, q10                ;accumulate sse
-    vpaddl.s32      q0, q8                      ;accumulate sum
-
-    vpaddl.u32      q1, q10
-    vadd.s64        d0, d0, d1
-    vadd.u64        d1, d2, d3
-
-    vmull.s32       q5, d0, d0
-    vst1.32         {d1[0]}, [lr]               ;store sse
-    vshr.s32        d10, d10, #8
-    vsub.s32        d0, d1, d10
-
-    vmov.32         r0, d0[0]                   ;return
-    pop             {pc}
-    ENDP
-
-;================================================
-;unsigned int vp9_variance_halfpixvar16x16_hv_neon
-;(
-;    unsigned char  *src_ptr, r0
-;    int  src_pixels_per_line,  r1
-;    unsigned char *dst_ptr,  r2
-;    int dst_pixels_per_line,   r3
-;    unsigned int *sse
-;);
-;================================================
-|vp9_variance_halfpixvar16x16_hv_neon| PROC
-    push            {lr}
-
-    vld1.u8         {d0, d1, d2, d3}, [r0], r1      ;load src data
-
-    ldr             lr, [sp, #4]           ;load *sse from stack
-    vmov.i8         q13, #0                      ;q13 - sum
-    vext.8          q1, q0, q1, #1          ;construct src_ptr[1]
-
-    vmov.i8         q14, #0                      ;q14, q15 - sse
-    vmov.i8         q15, #0
-
-    mov             r12, #4                  ;loop counter
-    vrhadd.u8       q0, q0, q1              ;average with rounding: (src_ptr[0]+src_ptr[1]+1)>>1
-
-;First Pass: output_height lines x output_width columns (17x16)
-vp8_filt16x16s_4_4_loop_neon
-    vld1.u8         {d4, d5, d6, d7}, [r0], r1
-    vld1.u8         {d8, d9, d10, d11}, [r0], r1
-    vld1.u8         {d12, d13, d14, d15}, [r0], r1
-    vld1.u8         {d16, d17, d18, d19}, [r0], r1
-
-    ;pld                [r0]
-    ;pld                [r0, r1]
-    ;pld                [r0, r1, lsl #1]
-
-    vext.8          q3, q2, q3, #1          ;construct src_ptr[1]
-    vext.8          q5, q4, q5, #1
-    vext.8          q7, q6, q7, #1
-    vext.8          q9, q8, q9, #1
-
-    vrhadd.u8       q1, q2, q3              ;average with rounding: (src_ptr[0]+src_ptr[1]+1)>>1
-    vrhadd.u8       q2, q4, q5
-    vrhadd.u8       q3, q6, q7
-    vrhadd.u8       q4, q8, q9
-
-    vld1.8          {q5}, [r2], r3
-    vrhadd.u8       q0, q0, q1
-    vld1.8          {q6}, [r2], r3
-    vrhadd.u8       q1, q1, q2
-    vld1.8          {q7}, [r2], r3
-    vrhadd.u8       q2, q2, q3
-    vld1.8          {q8}, [r2], r3
-    vrhadd.u8       q3, q3, q4
-
-    vsubl.u8        q9, d0, d10                 ;diff
-    vsubl.u8        q10, d1, d11
-    vsubl.u8        q11, d2, d12
-    vsubl.u8        q12, d3, d13
-
-    vsubl.u8        q0, d4, d14                 ;diff
-    vsubl.u8        q1, d5, d15
-    vsubl.u8        q5, d6, d16
-    vsubl.u8        q6, d7, d17
-
-    vpadal.s16      q13, q9                     ;sum
-    vmlal.s16       q14, d18, d18                ;sse
-    vmlal.s16       q15, d19, d19
-
-    vpadal.s16      q13, q10                     ;sum
-    vmlal.s16       q14, d20, d20                ;sse
-    vmlal.s16       q15, d21, d21
-
-    vpadal.s16      q13, q11                     ;sum
-    vmlal.s16       q14, d22, d22                ;sse
-    vmlal.s16       q15, d23, d23
-
-    vpadal.s16      q13, q12                     ;sum
-    vmlal.s16       q14, d24, d24                ;sse
-    vmlal.s16       q15, d25, d25
-
-    subs            r12, r12, #1
-
-    vpadal.s16      q13, q0                     ;sum
-    vmlal.s16       q14, d0, d0                ;sse
-    vmlal.s16       q15, d1, d1
-
-    vpadal.s16      q13, q1                     ;sum
-    vmlal.s16       q14, d2, d2                ;sse
-    vmlal.s16       q15, d3, d3
-
-    vpadal.s16      q13, q5                     ;sum
-    vmlal.s16       q14, d10, d10                ;sse
-    vmlal.s16       q15, d11, d11
-
-    vmov            q0, q4
-
-    vpadal.s16      q13, q6                     ;sum
-    vmlal.s16       q14, d12, d12                ;sse
-    vmlal.s16       q15, d13, d13
-
-    bne             vp8_filt16x16s_4_4_loop_neon
-
-    vadd.u32        q15, q14, q15                ;accumulate sse
-    vpaddl.s32      q0, q13                      ;accumulate sum
-
-    vpaddl.u32      q1, q15
-    vadd.s64        d0, d0, d1
-    vadd.u64        d1, d2, d3
-
-    vmull.s32       q5, d0, d0
-    vst1.32         {d1[0]}, [lr]               ;store sse
-    vshr.s32        d10, d10, #8
-    vsub.s32        d0, d1, d10
-
-    vmov.32         r0, d0[0]                   ;return
-    pop             {pc}
-    ENDP
-
-;==============================
-; r0    unsigned char  *src_ptr,
-; r1    int  src_pixels_per_line,
-; r2    int  xoffset,
-; r3    int  yoffset,
-; stack unsigned char *dst_ptr,
-; stack int dst_pixels_per_line,
-; stack unsigned int *sse
-;note: used by vp8_find_best_half_pixel_step() (called when 8<Speed<15) and by the first
-;call of vp8_find_best_sub_pixel_step() (called when speed<=8). In those cases xoffset and
-;yoffset can only be 4 or 0, so the filter is either bypassed or its coefficients are
-;{64, 64}; this simplified routine only works under that restriction.
-;note: both xoffset and yoffset being zero does happen; that case can be handled in c code later.
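Because the coefficients are restricted to {64, 64}, the two-tap filter collapses to a
rounded average, which is why the code below uses vrhadd.u8 in place of the
multiply-accumulate sequence. A one-line C sketch of the equivalence (illustrative only):

    /* (a*64 + b*64 + 64) >> 7  ==  (a + b + 1) >> 1, which is what vrhadd.u8
     * computes per byte lane. */
    static unsigned char half_pel_avg(unsigned char a, unsigned char b) {
      return (unsigned char)((a + b + 1) >> 1);
    }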
-
-|vp9_sub_pixel_variance16x16s_neon| PROC
-    push            {r4, lr}
-
-    ldr             r4, [sp, #8]            ;load *dst_ptr from stack
-    ldr             r12, [sp, #12]          ;load dst_pixels_per_line from stack
-    ldr             lr, [sp, #16]           ;load *sse from stack
-
-    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
-    beq             secondpass_bfilter16x16s_only
-
-    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
-    beq             firstpass_bfilter16x16s_only
-
-    vld1.u8         {d0, d1, d2, d3}, [r0], r1      ;load src data
-    sub             sp, sp, #256            ;reserve space on stack for temporary storage
-    vext.8          q1, q0, q1, #1          ;construct src_ptr[1]
-    mov             r3, sp
-    mov             r2, #4                  ;loop counter
-    vrhadd.u8       q0, q0, q1              ;average with rounding: (src_ptr[0]+src_ptr[1]+1)>>1
-
-;First Pass: output_height lines x output_width columns (17x16)
-vp8e_filt_blk2d_fp16x16s_loop_neon
-    vld1.u8         {d4, d5, d6, d7}, [r0], r1
-    vld1.u8         {d8, d9, d10, d11}, [r0], r1
-    vld1.u8         {d12, d13, d14, d15}, [r0], r1
-    vld1.u8         {d16, d17, d18, d19}, [r0], r1
-
-    ;pld                [r0]
-    ;pld                [r0, r1]
-    ;pld                [r0, r1, lsl #1]
-
-    vext.8          q3, q2, q3, #1          ;construct src_ptr[1]
-    vext.8          q5, q4, q5, #1
-    vext.8          q7, q6, q7, #1
-    vext.8          q9, q8, q9, #1
-
-    vrhadd.u8       q1, q2, q3              ;average with rounding: (src_ptr[0]+src_ptr[1]+1)>>1
-    vrhadd.u8       q2, q4, q5
-    vrhadd.u8       q3, q6, q7
-    vrhadd.u8       q4, q8, q9
-
-    vrhadd.u8       q0, q0, q1
-    vrhadd.u8       q1, q1, q2
-    vrhadd.u8       q2, q2, q3
-    vrhadd.u8       q3, q3, q4
-
-    subs            r2, r2, #1
-    vst1.u8         {d0, d1 ,d2, d3}, [r3]!         ;store result
-    vmov            q0, q4
-    vst1.u8         {d4, d5, d6, d7}, [r3]!
-
-    bne             vp8e_filt_blk2d_fp16x16s_loop_neon
-
-    b               sub_pixel_variance16x16s_neon
-
-;--------------------
-firstpass_bfilter16x16s_only
-    mov             r2, #2                  ;loop counter
-    sub             sp, sp, #256            ;reserve space on stack for temporary storage
-    mov             r3, sp
-
-;First Pass: output_height lines x output_width columns (16x16)
-vp8e_filt_blk2d_fpo16x16s_loop_neon
-    vld1.u8         {d0, d1, d2, d3}, [r0], r1      ;load src data
-    vld1.u8         {d4, d5, d6, d7}, [r0], r1
-    vld1.u8         {d8, d9, d10, d11}, [r0], r1
-    vld1.u8         {d12, d13, d14, d15}, [r0], r1
-
-    ;pld                [r0]
-    ;pld                [r0, r1]
-    ;pld                [r0, r1, lsl #1]
-
-    vext.8          q1, q0, q1, #1          ;construct src_ptr[1]
-    vld1.u8         {d16, d17, d18, d19}, [r0], r1
-    vext.8          q3, q2, q3, #1
-    vld1.u8         {d20, d21, d22, d23}, [r0], r1
-    vext.8          q5, q4, q5, #1
-    vld1.u8         {d24, d25, d26, d27}, [r0], r1
-    vext.8          q7, q6, q7, #1
-    vld1.u8         {d28, d29, d30, d31}, [r0], r1
-    vext.8          q9, q8, q9, #1
-    vext.8          q11, q10, q11, #1
-    vext.8          q13, q12, q13, #1
-    vext.8          q15, q14, q15, #1
-
-    vrhadd.u8       q0, q0, q1              ;average with rounding: (src_ptr[0]+src_ptr[1]+1)>>1
-    vrhadd.u8       q1, q2, q3
-    vrhadd.u8       q2, q4, q5
-    vrhadd.u8       q3, q6, q7
-    vrhadd.u8       q4, q8, q9
-    vrhadd.u8       q5, q10, q11
-    vrhadd.u8       q6, q12, q13
-    vrhadd.u8       q7, q14, q15
-
-    subs            r2, r2, #1
-
-    vst1.u8         {d0, d1, d2, d3}, [r3]!         ;store result
-    vst1.u8         {d4, d5, d6, d7}, [r3]!
-    vst1.u8         {d8, d9, d10, d11}, [r3]!
-    vst1.u8         {d12, d13, d14, d15}, [r3]!
-
-    bne             vp8e_filt_blk2d_fpo16x16s_loop_neon
-
-    b               sub_pixel_variance16x16s_neon
-
-;---------------------
-secondpass_bfilter16x16s_only
-    sub             sp, sp, #256            ;reserve space on stack for temporary storage
-
-    mov             r2, #2                  ;loop counter
-    vld1.u8         {d0, d1}, [r0], r1      ;load src data
-    mov             r3, sp
-
-vp8e_filt_blk2d_spo16x16s_loop_neon
-    vld1.u8         {d2, d3}, [r0], r1
-    vld1.u8         {d4, d5}, [r0], r1
-    vld1.u8         {d6, d7}, [r0], r1
-    vld1.u8         {d8, d9}, [r0], r1
-
-    vrhadd.u8       q0, q0, q1
-    vld1.u8         {d10, d11}, [r0], r1
-    vrhadd.u8       q1, q1, q2
-    vld1.u8         {d12, d13}, [r0], r1
-    vrhadd.u8       q2, q2, q3
-    vld1.u8         {d14, d15}, [r0], r1
-    vrhadd.u8       q3, q3, q4
-    vld1.u8         {d16, d17}, [r0], r1
-    vrhadd.u8       q4, q4, q5
-    vrhadd.u8       q5, q5, q6
-    vrhadd.u8       q6, q6, q7
-    vrhadd.u8       q7, q7, q8
-
-    subs            r2, r2, #1
-
-    vst1.u8         {d0, d1, d2, d3}, [r3]!         ;store result
-    vmov            q0, q8
-    vst1.u8         {d4, d5, d6, d7}, [r3]!
-    vst1.u8         {d8, d9, d10, d11}, [r3]!           ;store result
-    vst1.u8         {d12, d13, d14, d15}, [r3]!
-
-    bne             vp8e_filt_blk2d_spo16x16s_loop_neon
-
-    b               sub_pixel_variance16x16s_neon
-
-;----------------------------
-;variance16x16
-sub_pixel_variance16x16s_neon
-    vmov.i8         q8, #0                      ;q8 - sum
-    vmov.i8         q9, #0                      ;q9, q10 - sse
-    vmov.i8         q10, #0
-
-    sub             r3, r3, #256
-    mov             r2, #4
-
-sub_pixel_variance16x16s_neon_loop
-    vld1.8          {q0}, [r3]!                 ;Load up source and reference
-    vld1.8          {q1}, [r4], r12
-    vld1.8          {q2}, [r3]!
-    vld1.8          {q3}, [r4], r12
-    vld1.8          {q4}, [r3]!
-    vld1.8          {q5}, [r4], r12
-    vld1.8          {q6}, [r3]!
-    vld1.8          {q7}, [r4], r12
-
-    vsubl.u8        q11, d0, d2                 ;diff
-    vsubl.u8        q12, d1, d3
-    vsubl.u8        q13, d4, d6
-    vsubl.u8        q14, d5, d7
-    vsubl.u8        q0, d8, d10
-    vsubl.u8        q1, d9, d11
-    vsubl.u8        q2, d12, d14
-    vsubl.u8        q3, d13, d15
-
-    vpadal.s16      q8, q11                     ;sum
-    vmlal.s16       q9, d22, d22                ;sse
-    vmlal.s16       q10, d23, d23
-
-    subs            r2, r2, #1
-
-    vpadal.s16      q8, q12
-    vmlal.s16       q9, d24, d24
-    vmlal.s16       q10, d25, d25
-    vpadal.s16      q8, q13
-    vmlal.s16       q9, d26, d26
-    vmlal.s16       q10, d27, d27
-    vpadal.s16      q8, q14
-    vmlal.s16       q9, d28, d28
-    vmlal.s16       q10, d29, d29
-
-    vpadal.s16      q8, q0                     ;sum
-    vmlal.s16       q9, d0, d0                ;sse
-    vmlal.s16       q10, d1, d1
-    vpadal.s16      q8, q1
-    vmlal.s16       q9, d2, d2
-    vmlal.s16       q10, d3, d3
-    vpadal.s16      q8, q2
-    vmlal.s16       q9, d4, d4
-    vmlal.s16       q10, d5, d5
-    vpadal.s16      q8, q3
-    vmlal.s16       q9, d6, d6
-    vmlal.s16       q10, d7, d7
-
-    bne             sub_pixel_variance16x16s_neon_loop
-
-    vadd.u32        q10, q9, q10                ;accumulate sse
-    vpaddl.s32      q0, q8                      ;accumulate sum
-
-    vpaddl.u32      q1, q10
-    vadd.s64        d0, d0, d1
-    vadd.u64        d1, d2, d3
-
-    vmull.s32       q5, d0, d0
-    vst1.32         {d1[0]}, [lr]               ;store sse
-    vshr.s32        d10, d10, #8
-    vsub.s32        d0, d1, d10
-
-    add             sp, sp, #256
-    vmov.32         r0, d0[0]                   ;return
-
-    pop             {r4, pc}
-    ENDP
-
-    END
--- a/vp8/encoder/arm/neon/vp8_subpixelvariance8x8_neon.asm
+++ /dev/null
@@ -1,224 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_sub_pixel_variance8x8_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0    unsigned char  *src_ptr,
-; r1    int  src_pixels_per_line,
-; r2    int  xoffset,
-; r3    int  yoffset,
-; stack(r4) unsigned char *dst_ptr,
-; stack(r5) int dst_pixels_per_line,
-; stack(r6) unsigned int *sse
-;note: most of the code is copied from bilinear_predict8x8_neon and vp9_variance8x8_neon.
-
-|vp9_sub_pixel_variance8x8_neon| PROC
-    push            {r4-r5, lr}
-
-    ldr             r12, _BilinearTaps_coeff_
-    ldr             r4, [sp, #12]           ;load *dst_ptr from stack
-    ldr             r5, [sp, #16]           ;load dst_pixels_per_line from stack
-    ldr             lr, [sp, #20]           ;load *sse from stack
-
-    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
-    beq             skip_firstpass_filter
-
-;First pass: output_height lines x output_width columns (9x8)
-    add             r2, r12, r2, lsl #3     ;calculate filter location
-
-    vld1.u8         {q1}, [r0], r1          ;load src data
-    vld1.u32        {d31}, [r2]             ;load first_pass filter
-    vld1.u8         {q2}, [r0], r1
-    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1)
-    vld1.u8         {q3}, [r0], r1
-    vdup.8          d1, d31[4]
-    vld1.u8         {q4}, [r0], r1
-
-    vmull.u8        q6, d2, d0              ;(src_ptr[0] * Filter[0])
-    vmull.u8        q7, d4, d0
-    vmull.u8        q8, d6, d0
-    vmull.u8        q9, d8, d0
-
-    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
-    vext.8          d5, d4, d5, #1
-    vext.8          d7, d6, d7, #1
-    vext.8          d9, d8, d9, #1
-
-    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * Filter[1])
-    vmlal.u8        q7, d5, d1
-    vmlal.u8        q8, d7, d1
-    vmlal.u8        q9, d9, d1
-
-    vld1.u8         {q1}, [r0], r1          ;load src data
-    vqrshrn.u16    d22, q6, #7              ;shift/round/saturate to u8
-    vld1.u8         {q2}, [r0], r1
-    vqrshrn.u16    d23, q7, #7
-    vld1.u8         {q3}, [r0], r1
-    vqrshrn.u16    d24, q8, #7
-    vld1.u8         {q4}, [r0], r1
-    vqrshrn.u16    d25, q9, #7
-
-    ;first_pass filtering on the remaining 5 lines of data
-    vld1.u8         {q5}, [r0], r1
-
-    vmull.u8        q6, d2, d0              ;(src_ptr[0] * Filter[0])
-    vmull.u8        q7, d4, d0
-    vmull.u8        q8, d6, d0
-    vmull.u8        q9, d8, d0
-    vmull.u8        q10, d10, d0
-
-    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
-    vext.8          d5, d4, d5, #1
-    vext.8          d7, d6, d7, #1
-    vext.8          d9, d8, d9, #1
-    vext.8          d11, d10, d11, #1
-
-    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * Filter[1])
-    vmlal.u8        q7, d5, d1
-    vmlal.u8        q8, d7, d1
-    vmlal.u8        q9, d9, d1
-    vmlal.u8        q10, d11, d1
-
-    vqrshrn.u16    d26, q6, #7              ;shift/round/saturate to u8
-    vqrshrn.u16    d27, q7, #7
-    vqrshrn.u16    d28, q8, #7
-    vqrshrn.u16    d29, q9, #7
-    vqrshrn.u16    d30, q10, #7
-
-;Second pass: 8x8
-secondpass_filter
-    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
-    ;skip second pass: branch straight to the variance computation
-    beq             sub_pixel_variance8x8_neon
-
-    add             r3, r12, r3, lsl #3
-
-    vld1.u32        {d31}, [r3]             ;load second_pass filter
-
-    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
-    vdup.8          d1, d31[4]
-
-    vmull.u8        q1, d22, d0             ;(src_ptr[0] * Filter[0])
-    vmull.u8        q2, d23, d0
-    vmull.u8        q3, d24, d0
-    vmull.u8        q4, d25, d0
-    vmull.u8        q5, d26, d0
-    vmull.u8        q6, d27, d0
-    vmull.u8        q7, d28, d0
-    vmull.u8        q8, d29, d0
-
-    vmlal.u8        q1, d23, d1             ;(src_ptr[pixel_step] * Filter[1])
-    vmlal.u8        q2, d24, d1
-    vmlal.u8        q3, d25, d1
-    vmlal.u8        q4, d26, d1
-    vmlal.u8        q5, d27, d1
-    vmlal.u8        q6, d28, d1
-    vmlal.u8        q7, d29, d1
-    vmlal.u8        q8, d30, d1
-
-    vqrshrn.u16    d22, q1, #7              ;shift/round/saturate to u8
-    vqrshrn.u16    d23, q2, #7
-    vqrshrn.u16    d24, q3, #7
-    vqrshrn.u16    d25, q4, #7
-    vqrshrn.u16    d26, q5, #7
-    vqrshrn.u16    d27, q6, #7
-    vqrshrn.u16    d28, q7, #7
-    vqrshrn.u16    d29, q8, #7
-
-    b               sub_pixel_variance8x8_neon
-
-;--------------------
-skip_firstpass_filter
-    vld1.u8         {d22}, [r0], r1         ;load src data
-    vld1.u8         {d23}, [r0], r1
-    vld1.u8         {d24}, [r0], r1
-    vld1.u8         {d25}, [r0], r1
-    vld1.u8         {d26}, [r0], r1
-    vld1.u8         {d27}, [r0], r1
-    vld1.u8         {d28}, [r0], r1
-    vld1.u8         {d29}, [r0], r1
-    vld1.u8         {d30}, [r0], r1
-
-    b               secondpass_filter
-
-;----------------------
-;vp9_variance8x8_neon
-sub_pixel_variance8x8_neon
-    vmov.i8         q8, #0                      ;q8 - sum
-    vmov.i8         q9, #0                      ;q9, q10 - sse
-    vmov.i8         q10, #0
-
-    mov             r12, #2
-
-sub_pixel_variance8x8_neon_loop
-    vld1.8          {d0}, [r4], r5              ;load dst data
-    subs            r12, r12, #1
-    vld1.8          {d1}, [r4], r5
-    vld1.8          {d2}, [r4], r5
-    vsubl.u8        q4, d22, d0                 ;calculate diff
-    vld1.8          {d3}, [r4], r5
-
-    vsubl.u8        q5, d23, d1
-    vsubl.u8        q6, d24, d2
-
-    vpadal.s16      q8, q4                      ;sum
-    vmlal.s16       q9, d8, d8                  ;sse
-    vmlal.s16       q10, d9, d9
-
-    vsubl.u8        q7, d25, d3
-
-    vpadal.s16      q8, q5
-    vmlal.s16       q9, d10, d10
-    vmlal.s16       q10, d11, d11
-
-    vmov            q11, q13
-
-    vpadal.s16      q8, q6
-    vmlal.s16       q9, d12, d12
-    vmlal.s16       q10, d13, d13
-
-    vmov            q12, q14
-
-    vpadal.s16      q8, q7
-    vmlal.s16       q9, d14, d14
-    vmlal.s16       q10, d15, d15
-
-    bne             sub_pixel_variance8x8_neon_loop
-
-    vadd.u32        q10, q9, q10                ;accumulate sse
-    vpaddl.s32      q0, q8                      ;accumulate sum
-
-    vpaddl.u32      q1, q10
-    vadd.s64        d0, d0, d1
-    vadd.u64        d1, d2, d3
-
-    vmull.s32       q5, d0, d0
-    vst1.32         {d1[0]}, [lr]               ;store sse
-    vshr.s32        d10, d10, #6
-    vsub.s32        d0, d1, d10
-
-    vmov.32         r0, d0[0]                   ;return
-    pop             {r4-r5, pc}
-
-    ENDP
-
-;-----------------
-
-_BilinearTaps_coeff_
-    DCD     bilinear_taps_coeff
-bilinear_taps_coeff
-    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
-
-    END
--- a/vp8/encoder/arm/quantize_arm.c
+++ /dev/null
@@ -1,59 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <math.h>
-#include "vpx_mem/vpx_mem.h"
-
-#include "vp8/encoder/quantize.h"
-#include "vp8/common/entropy.h"
-
-
-#if HAVE_ARMV7
-
-/* The vp8_quantize_mbX functions here differ from the corresponding ones in
- * quantize.c only in that they use the quantize_b_pair function pointer
- * instead of the regular quantize_b function pointer. */
-void vp8_quantize_mby_neon(MACROBLOCK *x) {
-  int i;
-  int has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
-                       && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
-
-  for (i = 0; i < 16; i += 2)
-    x->quantize_b_pair(&x->block[i], &x->block[i + 1],
-                       &x->e_mbd.block[i], &x->e_mbd.block[i + 1]);
-
-  if (has_2nd_order)
-    x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
-}
-
-void vp8_quantize_mb_neon(MACROBLOCK *x) {
-  int i;
-  int has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
-                       && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
-
-  for (i = 0; i < 24; i += 2)
-    x->quantize_b_pair(&x->block[i], &x->block[i + 1],
-                       &x->e_mbd.block[i], &x->e_mbd.block[i + 1]);
-
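-  /* i == 24 after the loop, so this quantizes the second-order (Y2) block */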
-  if (has_2nd_order)
-    x->quantize_b(&x->block[i], &x->e_mbd.block[i]);
-}
-
-
-void vp8_quantize_mbuv_neon(MACROBLOCK *x) {
-  int i;
-
-  for (i = 16; i < 24; i += 2)
-    x->quantize_b_pair(&x->block[i], &x->block[i + 1],
-                       &x->e_mbd.block[i], &x->e_mbd.block[i + 1]);
-}
-
-#endif /* HAVE_ARMV7 */
--- a/vp8/encoder/arm/quantize_arm.h
+++ /dev/null
@@ -1,52 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef QUANTIZE_ARM_H
-#define QUANTIZE_ARM_H
-
-#if HAVE_ARMV6
-
-extern prototype_quantize_block(vp8_fast_quantize_b_armv6);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_quantize_fastquantb
-#define vp8_quantize_fastquantb vp8_fast_quantize_b_armv6
-#endif
-
-#endif /* HAVE_ARMV6 */
-
-
-#if HAVE_ARMV7
-
-extern prototype_quantize_block(vp8_fast_quantize_b_neon);
-extern prototype_quantize_block_pair(vp8_fast_quantize_b_pair_neon);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_quantize_fastquantb
-#define vp8_quantize_fastquantb vp8_fast_quantize_b_neon
-
-#undef  vp8_quantize_fastquantb_pair
-#define vp8_quantize_fastquantb_pair vp8_fast_quantize_b_pair_neon
-
-#undef vp8_quantize_mb
-#define vp8_quantize_mb vp8_quantize_mb_neon
-
-#undef vp8_quantize_mbuv
-#define vp8_quantize_mbuv vp8_quantize_mbuv_neon
-
-#undef vp8_quantize_mby
-#define vp8_quantize_mby vp8_quantize_mby_neon
-#endif
-
-#endif /* HAVE_ARMV7 */
-
-#endif
-
--- a/vp8/encoder/arm/variance_arm.c
+++ /dev/null
@@ -1,112 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "vp8/encoder/variance.h"
-#include "vp8/common/filter.h"
-#include "vp8/common/arm/bilinearfilter_arm.h"
-
-#define HALFNDX 8
-
-#if HAVE_ARMV6
-
-unsigned int vp9_sub_pixel_variance8x8_armv6
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-) {
-  unsigned short first_pass[10 * 8];
-  unsigned char  second_pass[8 * 8];
-  const short *HFilter, *VFilter;
-
-  HFilter = vp8_bilinear_filters[xoffset];
-  VFilter = vp8_bilinear_filters[yoffset];
-
-  vp9_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
-                                          src_pixels_per_line,
-                                          9, 8, HFilter);
-  vp9_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
-                                           8, 8, 8, VFilter);
-
-  return vp9_variance8x8_armv6(second_pass, 8, dst_ptr,
-                               dst_pixels_per_line, sse);
-}
-
-unsigned int vp9_sub_pixel_variance16x16_armv6
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-) {
-  unsigned short first_pass[36 * 16];
-  unsigned char  second_pass[20 * 16];
-  const short *HFilter, *VFilter;
-  unsigned int var;
-
-  if (xoffset == HALFNDX && yoffset == 0) {
-    var = vp9_variance_halfpixvar16x16_h_armv6(src_ptr, src_pixels_per_line,
-                                               dst_ptr, dst_pixels_per_line, sse);
-  } else if (xoffset == 0 && yoffset == HALFNDX) {
-    var = vp9_variance_halfpixvar16x16_v_armv6(src_ptr, src_pixels_per_line,
-                                               dst_ptr, dst_pixels_per_line, sse);
-  } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
-    var = vp9_variance_halfpixvar16x16_hv_armv6(src_ptr, src_pixels_per_line,
-                                                dst_ptr, dst_pixels_per_line, sse);
-  } else {
-    HFilter = vp8_bilinear_filters[xoffset];
-    VFilter = vp8_bilinear_filters[yoffset];
-
-    vp9_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
-                                            src_pixels_per_line,
-                                            17, 16, HFilter);
-    vp9_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
-                                             16, 16, 16, VFilter);
-
-    var = vp9_variance16x16_armv6(second_pass, 16, dst_ptr,
-                                  dst_pixels_per_line, sse);
-  }
-  return var;
-}
-
-#endif /* HAVE_ARMV6 */
-
-
-#if HAVE_ARMV7
-
-unsigned int vp9_sub_pixel_variance16x16_neon
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-) {
-  if (xoffset == HALFNDX && yoffset == 0)
-    return vp9_variance_halfpixvar16x16_h_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
-  else if (xoffset == 0 && yoffset == HALFNDX)
-    return vp9_variance_halfpixvar16x16_v_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
-  else if (xoffset == HALFNDX && yoffset == HALFNDX)
-    return vp9_variance_halfpixvar16x16_hv_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
-  else
-    return vp9_sub_pixel_variance16x16_neon_func(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
-}
-
-#endif
--- a/vp8/encoder/arm/variance_arm.h
+++ /dev/null
@@ -1,132 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VARIANCE_ARM_H
-#define VARIANCE_ARM_H
-
-#if HAVE_ARMV6
-
-extern prototype_sad(vp9_sad16x16_armv6);
-extern prototype_variance(vp9_variance16x16_armv6);
-extern prototype_variance(vp9_variance8x8_armv6);
-extern prototype_subpixvariance(vp9_sub_pixel_variance16x16_armv6);
-extern prototype_subpixvariance(vp9_sub_pixel_variance8x8_armv6);
-extern prototype_variance(vp9_variance_halfpixvar16x16_h_armv6);
-extern prototype_variance(vp9_variance_halfpixvar16x16_v_armv6);
-extern prototype_variance(vp9_variance_halfpixvar16x16_hv_armv6);
-extern prototype_variance(vp9_mse16x16_armv6);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-
-#undef  vp9_variance_sad16x16
-#define vp9_variance_sad16x16 vp9_sad16x16_armv6
-
-#undef  vp9_variance_subpixvar16x16
-#define vp9_variance_subpixvar16x16 vp9_sub_pixel_variance16x16_armv6
-
-#undef  vp9_variance_subpixvar8x8
-#define vp9_variance_subpixvar8x8 vp9_sub_pixel_variance8x8_armv6
-
-#undef  vp9_variance_var16x16
-#define vp9_variance_var16x16 vp9_variance16x16_armv6
-
-#undef  vp9_variance_mse16x16
-#define vp9_variance_mse16x16 vp9_mse16x16_armv6
-
-#undef  vp9_variance_var8x8
-#define vp9_variance_var8x8 vp9_variance8x8_armv6
-
-#undef  vp9_variance_halfpixvar16x16_h
-#define vp9_variance_halfpixvar16x16_h vp9_variance_halfpixvar16x16_h_armv6
-
-#undef  vp9_variance_halfpixvar16x16_v
-#define vp9_variance_halfpixvar16x16_v vp9_variance_halfpixvar16x16_v_armv6
-
-#undef  vp9_variance_halfpixvar16x16_hv
-#define vp9_variance_halfpixvar16x16_hv vp9_variance_halfpixvar16x16_hv_armv6
-
-#endif /* !CONFIG_RUNTIME_CPU_DETECT */
-
-#endif /* HAVE_ARMV6 */
-
-
-#if HAVE_ARMV7
-extern prototype_sad(vp9_sad4x4_neon);
-extern prototype_sad(vp9_sad8x8_neon);
-extern prototype_sad(vp9_sad8x16_neon);
-extern prototype_sad(vp9_sad16x8_neon);
-extern prototype_sad(vp9_sad16x16_neon);
-
-extern prototype_variance(vp9_variance8x8_neon);
-extern prototype_variance(vp9_variance8x16_neon);
-extern prototype_variance(vp9_variance16x8_neon);
-extern prototype_variance(vp9_variance16x16_neon);
-
-extern prototype_subpixvariance(vp9_sub_pixel_variance8x8_neon);
-extern prototype_subpixvariance(vp9_sub_pixel_variance16x16_neon);
-extern prototype_subpixvariance(vp9_sub_pixel_variance16x16_neon_func);
-extern prototype_variance(vp9_variance_halfpixvar16x16_h_neon);
-extern prototype_variance(vp9_variance_halfpixvar16x16_v_neon);
-extern prototype_variance(vp9_variance_halfpixvar16x16_hv_neon);
-
-extern prototype_variance(vp9_mse16x16_neon);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp9_variance_sad4x4
-#define vp9_variance_sad4x4 vp9_sad4x4_neon
-
-#undef  vp9_variance_sad8x8
-#define vp9_variance_sad8x8 vp9_sad8x8_neon
-
-#undef  vp9_variance_sad8x16
-#define vp9_variance_sad8x16 vp9_sad8x16_neon
-
-#undef  vp9_variance_sad16x8
-#define vp9_variance_sad16x8 vp9_sad16x8_neon
-
-#undef  vp9_variance_sad16x16
-#define vp9_variance_sad16x16 vp9_sad16x16_neon
-
-#undef  vp9_variance_var8x8
-#define vp9_variance_var8x8 vp9_variance8x8_neon
-
-#undef  vp9_variance_var8x16
-#define vp9_variance_var8x16 vp9_variance8x16_neon
-
-#undef  vp9_variance_var16x8
-#define vp9_variance_var16x8 vp9_variance16x8_neon
-
-#undef  vp9_variance_var16x16
-#define vp9_variance_var16x16 vp9_variance16x16_neon
-
-#undef  vp9_variance_subpixvar8x8
-#define vp9_variance_subpixvar8x8 vp9_sub_pixel_variance8x8_neon
-
-#undef  vp9_variance_subpixvar16x16
-#define vp9_variance_subpixvar16x16 vp9_sub_pixel_variance16x16_neon
-
-#undef  vp9_variance_halfpixvar16x16_h
-#define vp9_variance_halfpixvar16x16_h vp9_variance_halfpixvar16x16_h_neon
-
-#undef  vp9_variance_halfpixvar16x16_v
-#define vp9_variance_halfpixvar16x16_v vp9_variance_halfpixvar16x16_v_neon
-
-#undef  vp9_variance_halfpixvar16x16_hv
-#define vp9_variance_halfpixvar16x16_hv vp9_variance_halfpixvar16x16_hv_neon
-
-#undef  vp9_variance_mse16x16
-#define vp9_variance_mse16x16 vp9_mse16x16_neon
-
-#endif
-
-#endif
-
-#endif
--- a/vp8/encoder/asm_enc_offsets.c
+++ /dev/null
@@ -1,90 +1,0 @@
-/*
- *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/asm_offsets.h"
-#include "vpx_config.h"
-#include "block.h"
-#include "vp8/common/blockd.h"
-#include "onyx_int.h"
-#include "treewriter.h"
-#include "tokenize.h"
-
-BEGIN
-
-/* regular quantize */
-DEFINE(vp9_block_coeff,                         offsetof(BLOCK, coeff));
-DEFINE(vp9_block_zbin,                          offsetof(BLOCK, zbin));
-DEFINE(vp9_block_round,                         offsetof(BLOCK, round));
-DEFINE(vp9_block_quant,                         offsetof(BLOCK, quant));
-DEFINE(vp9_block_quant_fast,                    offsetof(BLOCK, quant_fast));
-DEFINE(vp9_block_zbin_extra,                    offsetof(BLOCK, zbin_extra));
-DEFINE(vp9_block_zrun_zbin_boost,               offsetof(BLOCK, zrun_zbin_boost));
-DEFINE(vp9_block_quant_shift,                   offsetof(BLOCK, quant_shift));
-
-DEFINE(vp9_blockd_qcoeff,                       offsetof(BLOCKD, qcoeff));
-DEFINE(vp9_blockd_dequant,                      offsetof(BLOCKD, dequant));
-DEFINE(vp9_blockd_dqcoeff,                      offsetof(BLOCKD, dqcoeff));
-DEFINE(vp9_blockd_eob,                          offsetof(BLOCKD, eob));
-
-/* subtract */
-DEFINE(vp9_block_base_src,                      offsetof(BLOCK, base_src));
-DEFINE(vp9_block_src,                           offsetof(BLOCK, src));
-DEFINE(vp9_block_src_diff,                      offsetof(BLOCK, src_diff));
-DEFINE(vp9_block_src_stride,                    offsetof(BLOCK, src_stride));
-
-DEFINE(vp9_blockd_predictor,                    offsetof(BLOCKD, predictor));
-
-/* pack tokens */
-DEFINE(vp9_writer_lowvalue,                     offsetof(vp9_writer, lowvalue));
-DEFINE(vp9_writer_range,                        offsetof(vp9_writer, range));
-DEFINE(vp9_writer_value,                        offsetof(vp9_writer, value));
-DEFINE(vp9_writer_count,                        offsetof(vp9_writer, count));
-DEFINE(vp9_writer_pos,                          offsetof(vp9_writer, pos));
-DEFINE(vp9_writer_buffer,                       offsetof(vp9_writer, buffer));
-
-DEFINE(tokenextra_token,                        offsetof(TOKENEXTRA, Token));
-DEFINE(tokenextra_extra,                        offsetof(TOKENEXTRA, Extra));
-DEFINE(tokenextra_context_tree,                 offsetof(TOKENEXTRA, context_tree));
-DEFINE(tokenextra_skip_eob_node,                offsetof(TOKENEXTRA, skip_eob_node));
-DEFINE(TOKENEXTRA_SZ,                           sizeof(TOKENEXTRA));
-
-DEFINE(vp9_extra_bit_struct_sz,                 sizeof(vp9_extra_bit_struct));
-
-DEFINE(vp9_token_value,                         offsetof(vp9_token, value));
-DEFINE(vp9_token_len,                           offsetof(vp9_token, Len));
-
-DEFINE(vp9_extra_bit_struct_tree,               offsetof(vp9_extra_bit_struct, tree));
-DEFINE(vp9_extra_bit_struct_prob,               offsetof(vp9_extra_bit_struct, prob));
-DEFINE(vp9_extra_bit_struct_len,                offsetof(vp9_extra_bit_struct, Len));
-DEFINE(vp9_extra_bit_struct_base_val,           offsetof(vp9_extra_bit_struct, base_val));
-
-DEFINE(vp9_comp_tplist,                         offsetof(VP9_COMP, tplist));
-DEFINE(vp9_comp_common,                         offsetof(VP9_COMP, common));
-
-DEFINE(tokenlist_start,                         offsetof(TOKENLIST, start));
-DEFINE(tokenlist_stop,                          offsetof(TOKENLIST, stop));
-DEFINE(TOKENLIST_SZ,                            sizeof(TOKENLIST));
-
-DEFINE(vp9_common_mb_rows,                      offsetof(VP9_COMMON, mb_rows));
-
-END
-
-/* Add asserts for any offset or size that is not supported by the assembly
- * code.  These are used in vp8cx_pack_tokens; they are hard coded, so if the
- * sizes change they will have to be adjusted.
- */
-
-#if HAVE_ARMV5TE
-ct_assert(TOKENEXTRA_SZ, sizeof(TOKENEXTRA) == 8)
-ct_assert(vp9_extra_bit_struct_sz, sizeof(vp9_extra_bit_struct) == 16)
-#endif
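ct_assert provides a compile-time check that the hard-coded sizes still match the structs.
The actual macro lives in vpx_ports/asm_offsets.h; a typical idiom it could be built on
(shown purely as a sketch, not the project's definition) is:

    /* Compile-time assert sketch: when cond is false, both case labels
     * evaluate to 0 and the duplicate label fails to compile. */
    #define CT_ASSERT_SKETCH(name, cond) \
      static void assert_##name(void) { switch (0) { case 0: case !!(cond):; } }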
--- a/vp8/encoder/bitstream.c
+++ /dev/null
@@ -1,2394 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/common/header.h"
-#include "encodemv.h"
-#include "vp8/common/entropymode.h"
-#include "vp8/common/findnearmv.h"
-#include "mcomp.h"
-#include "vp8/common/systemdependent.h"
-#include <assert.h>
-#include <stdio.h>
-#include <limits.h>
-#include "vp8/common/pragmas.h"
-#include "vpx/vpx_encoder.h"
-#include "vpx_mem/vpx_mem.h"
-#include "bitstream.h"
-#include "segmentation.h"
-
-#include "vp8/common/seg_common.h"
-#include "vp8/common/pred_common.h"
-#include "vp8/common/entropy.h"
-#include "vp8/encoder/encodemv.h"
-#include "vp8/common/entropymv.h"
-
-#if CONFIG_NEWBESTREFMV
-#include "vp8/common/mvref_common.h"
-#endif
-
-#if defined(SECTIONBITS_OUTPUT)
-unsigned __int64 Sectionbits[500];
-#endif
-
-#ifdef ENTROPY_STATS
-int intra_mode_stats [VP9_BINTRAMODES] [VP9_BINTRAMODES] [VP9_BINTRAMODES];
-unsigned int tree_update_hist [BLOCK_TYPES]
-                              [COEF_BANDS]
-                              [PREV_COEF_CONTEXTS]
-                              [ENTROPY_NODES][2];
-unsigned int hybrid_tree_update_hist [BLOCK_TYPES]
-                                     [COEF_BANDS]
-                                     [PREV_COEF_CONTEXTS]
-                                     [ENTROPY_NODES][2];
-unsigned int tree_update_hist_8x8 [BLOCK_TYPES_8X8]
-                                  [COEF_BANDS]
-                                  [PREV_COEF_CONTEXTS]
-                                  [ENTROPY_NODES] [2];
-unsigned int hybrid_tree_update_hist_8x8 [BLOCK_TYPES_8X8]
-                                         [COEF_BANDS]
-                                         [PREV_COEF_CONTEXTS]
-                                         [ENTROPY_NODES] [2];
-unsigned int tree_update_hist_16x16 [BLOCK_TYPES_16X16]
-                                    [COEF_BANDS]
-                                    [PREV_COEF_CONTEXTS]
-                                    [ENTROPY_NODES] [2];
-unsigned int hybrid_tree_update_hist_16x16 [BLOCK_TYPES_16X16]
-                                           [COEF_BANDS]
-                                           [PREV_COEF_CONTEXTS]
-                                           [ENTROPY_NODES] [2];
-
-extern unsigned int active_section;
-#endif
-
-#ifdef MODE_STATS
-int count_mb_seg[4] = { 0, 0, 0, 0 };
-#endif
-
-#define vp9_cost_upd  ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)) >> 8)
-#define vp9_cost_upd256  ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)))
-
-#define SEARCH_NEWP
-static int update_bits[255];
-
-static void compute_update_table() {
-  int i;
-  for (i = 0; i < 255; i++)
-    update_bits[i] = vp9_count_term_subexp(i, SUBEXP_PARAM, 255);
-}
-
-static int split_index(int i, int n, int modulus) {
-  int max1 = (n - 1 - modulus / 2) / modulus + 1;
-  if (i % modulus == modulus / 2) i = i / modulus;
-  else i = max1 + i - (i + modulus - modulus / 2) / modulus;
-  return i;
-}
-
-static int remap_prob(int v, int m) {
-  const int n = 256;
-  const int modulus = MODULUS_PARAM;
-  int i;
-  if ((m << 1) <= n)
-    i = vp9_recenter_nonneg(v, m) - 1;
-  else
-    i = vp9_recenter_nonneg(n - 1 - v, n - 1 - m) - 1;
-
-  i = split_index(i, n - 1, modulus);
-  return i;
-}
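remap_prob recenters the new probability around the old one so that small changes map to
small indices, and split_index then interleaves the index space so that every modulus-th
value gets the cheapest codes. A sketch of the recentering step follows; this helper is an
assumed typical shape of vp9_recenter_nonneg, not code copied from this patch:

    /* Assumed sketch: values near m map to small codes, e.g. v == m -> 0,
     * v == m - 1 -> 1, v == m + 1 -> 2, growing outward from there. */
    static int recenter_nonneg_sketch(int v, int m) {
      if (v > (m << 1))
        return v;                    /* far above m: transmit as-is */
      else if (v >= m)
        return (v - m) << 1;         /* at or above m: even codes   */
      else
        return ((m - v) << 1) - 1;   /* below m: odd codes          */
    }

Under that reading, remap_prob(130, 128) for example yields a small index, so the
subexponential code written by vp9_encode_term_subexp spends only a few bits on a small
probability update.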
-
-static void write_prob_diff_update(vp9_writer *const bc,
-                                   vp9_prob newp, vp9_prob oldp) {
-  int delp = remap_prob(newp, oldp);
-  vp9_encode_term_subexp(bc, delp, SUBEXP_PARAM, 255);
-}
-
-static int prob_diff_update_cost(vp9_prob newp, vp9_prob oldp) {
-  int delp = remap_prob(newp, oldp);
-  return update_bits[delp] * 256;
-}
-
-static void update_mode(
-  vp9_writer *const bc,
-  int n,
-  vp9_token tok               [/* n */],
-  vp9_tree tree,
-  vp9_prob Pnew               [/* n-1 */],
-  vp9_prob Pcur               [/* n-1 */],
-  unsigned int bct            [/* n-1 */] [2],
-  const unsigned int num_events[/* n */]
-) {
-  unsigned int new_b = 0, old_b = 0;
-  int i = 0;
-
-  vp9_tree_probs_from_distribution(
-    n--, tok, tree,
-    Pnew, bct, num_events,
-    256, 1
-  );
-
-  do {
-    new_b += cost_branch(bct[i], Pnew[i]);
-    old_b += cost_branch(bct[i], Pcur[i]);
-  } while (++i < n);
-
-  if (new_b + (n << 8) < old_b) {
-    int i = 0;
-
-    vp9_write_bit(bc, 1);
-
-    do {
-      const vp9_prob p = Pnew[i];
-
-      vp9_write_literal(bc, Pcur[i] = p ? p : 1, 8);
-    } while (++i < n);
-  } else
-    vp9_write_bit(bc, 0);
-}
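Note how update_mode writes the new probability set only when the estimated coding gain
(old_b - new_b) exceeds the fixed per-probability overhead term n << 8; otherwise a single
0 bit keeps the previous probabilities.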
-
-static void update_mbintra_mode_probs(VP9_COMP* const cpi,
-                                      vp9_writer* const bc) {
-  VP9_COMMON *const cm = &cpi->common;
-
-  {
-    vp9_prob Pnew   [VP9_YMODES - 1];
-    unsigned int bct [VP9_YMODES - 1] [2];
-
-    update_mode(
-      bc, VP9_YMODES, vp9_ymode_encodings, vp9_ymode_tree,
-      Pnew, cm->fc.ymode_prob, bct, (unsigned int *)cpi->ymode_count
-    );
-  }
-}
-
-static int get_prob(int num, int den) {
-  int p;
-  if (den <= 0)
-    return 128;
-  p = (num * 255 + (den >> 1)) / den;
-  if (p > 255)
-    return 255;
-  else if (p < 1)
-    return 1;
-  return p;
-}
-
-static int get_binary_prob(int n0, int n1) {
-  return get_prob(n0, n0 + n1);
-}
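get_prob maps a count ratio onto the codec's 1..255 probability scale with rounding; for
example, get_binary_prob(3, 1) = get_prob(3, 4) = (3 * 255 + 2) / 4 = 191, roughly 3/4
expressed on that scale, while a zero denominator falls back to the neutral 128.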
-
-void vp9_update_skip_probs(VP9_COMP *cpi) {
-  VP9_COMMON *const pc = &cpi->common;
-  int prob_skip_false[3] = {0, 0, 0};
-  int k;
-
-  for (k = 0; k < MBSKIP_CONTEXTS; ++k) {
-    pc->mbskip_pred_probs[k] = get_binary_prob(cpi->skip_false_count[k],
-                                               cpi->skip_true_count[k]);
-  }
-}
-
-static void update_switchable_interp_probs(VP9_COMP *cpi,
-                                           vp9_writer* const bc) {
-  VP9_COMMON *const pc = &cpi->common;
-  unsigned int branch_ct[32][2];
-  int i, j;
-  for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) {
-    vp9_tree_probs_from_distribution(
-        VP9_SWITCHABLE_FILTERS,
-        vp9_switchable_interp_encodings, vp9_switchable_interp_tree,
-        pc->fc.switchable_interp_prob[j], branch_ct,
-        cpi->switchable_interp_count[j], 256, 1);
-    for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) {
-      if (pc->fc.switchable_interp_prob[j][i] < 1)
-        pc->fc.switchable_interp_prob[j][i] = 1;
-      vp9_write_literal(bc, pc->fc.switchable_interp_prob[j][i], 8);
-    }
-  }
-}
-
-// This function updates the reference frame prediction stats
-static void update_refpred_stats(VP9_COMP *cpi) {
-  VP9_COMMON *const cm = &cpi->common;
-  int i;
-  int tot_count;
-  vp9_prob new_pred_probs[PREDICTION_PROBS];
-  int old_cost, new_cost;
-
-  // Set the prediction probability structures to defaults
-  if (cm->frame_type == KEY_FRAME) {
-    // Set the prediction probabilities to defaults
-    cm->ref_pred_probs[0] = 120;
-    cm->ref_pred_probs[1] = 80;
-    cm->ref_pred_probs[2] = 40;
-
-    vpx_memset(cpi->ref_pred_probs_update, 0,
-               sizeof(cpi->ref_pred_probs_update));
-  } else {
-    // From the prediction counts set the probabilities for each context
-    for (i = 0; i < PREDICTION_PROBS; i++) {
-      new_pred_probs[i] = get_binary_prob(cpi->ref_pred_count[i][0],
-                                          cpi->ref_pred_count[i][1]);
-
-      // Decide whether or not to update the reference frame probs.
-      // Returned costs are in 1/256 bit units.
-      old_cost =
-        (cpi->ref_pred_count[i][0] * vp9_cost_zero(cm->ref_pred_probs[i])) +
-        (cpi->ref_pred_count[i][1] * vp9_cost_one(cm->ref_pred_probs[i]));
-
-      new_cost =
-        (cpi->ref_pred_count[i][0] * vp9_cost_zero(new_pred_probs[i])) +
-        (cpi->ref_pred_count[i][1] * vp9_cost_one(new_pred_probs[i]));
-
-      // Cost saving must be >= 8 bits (2048 in these units)
-      if ((old_cost - new_cost) >= 2048) {
-        cpi->ref_pred_probs_update[i] = 1;
-        cm->ref_pred_probs[i] = new_pred_probs[i];
-      } else
-        cpi->ref_pred_probs_update[i] = 0;
-
-    }
-  }
-}
-
-static void update_mvcount(VP9_COMP *cpi, MACROBLOCK *x,
-                           int_mv *best_ref_mv, int_mv *second_best_ref_mv) {
-  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-  MV mv;
-
-  if (mbmi->mode == SPLITMV) {
-    int i;
-
-    for (i = 0; i < x->partition_info->count; i++) {
-      if (x->partition_info->bmi[i].mode == NEW4X4) {
-        if (x->e_mbd.allow_high_precision_mv) {
-          mv.row = (x->partition_info->bmi[i].mv.as_mv.row
-                    - best_ref_mv->as_mv.row);
-          mv.col = (x->partition_info->bmi[i].mv.as_mv.col
-                    - best_ref_mv->as_mv.col);
-          vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount, 1);
-          if (x->e_mbd.mode_info_context->mbmi.second_ref_frame) {
-            mv.row = (x->partition_info->bmi[i].second_mv.as_mv.row
-                      - second_best_ref_mv->as_mv.row);
-            mv.col = (x->partition_info->bmi[i].second_mv.as_mv.col
-                      - second_best_ref_mv->as_mv.col);
-            vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv,
-                              &cpi->NMVcount, 1);
-          }
-        } else {
-          mv.row = (x->partition_info->bmi[i].mv.as_mv.row
-                    - best_ref_mv->as_mv.row);
-          mv.col = (x->partition_info->bmi[i].mv.as_mv.col
-                    - best_ref_mv->as_mv.col);
-          vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount, 0);
-          if (x->e_mbd.mode_info_context->mbmi.second_ref_frame) {
-            mv.row = (x->partition_info->bmi[i].second_mv.as_mv.row
-                      - second_best_ref_mv->as_mv.row);
-            mv.col = (x->partition_info->bmi[i].second_mv.as_mv.col
-                      - second_best_ref_mv->as_mv.col);
-            vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv,
-                              &cpi->NMVcount, 0);
-          }
-        }
-      }
-    }
-  } else if (mbmi->mode == NEWMV) {
-    if (x->e_mbd.allow_high_precision_mv) {
-      mv.row = (mbmi->mv[0].as_mv.row - best_ref_mv->as_mv.row);
-      mv.col = (mbmi->mv[0].as_mv.col - best_ref_mv->as_mv.col);
-      vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount, 1);
-      if (mbmi->second_ref_frame) {
-        mv.row = (mbmi->mv[1].as_mv.row - second_best_ref_mv->as_mv.row);
-        mv.col = (mbmi->mv[1].as_mv.col - second_best_ref_mv->as_mv.col);
-        vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv, &cpi->NMVcount, 1);
-      }
-    } else {
-      mv.row = (mbmi->mv[0].as_mv.row - best_ref_mv->as_mv.row);
-      mv.col = (mbmi->mv[0].as_mv.col - best_ref_mv->as_mv.col);
-      vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount, 0);
-      if (mbmi->second_ref_frame) {
-        mv.row = (mbmi->mv[1].as_mv.row - second_best_ref_mv->as_mv.row);
-        mv.col = (mbmi->mv[1].as_mv.col - second_best_ref_mv->as_mv.col);
-        vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv, &cpi->NMVcount, 0);
-      }
-    }
-  }
-}
-
-static void write_ymode(vp9_writer *bc, int m, const vp9_prob *p) {
-  write_token(bc, vp9_ymode_tree, p, vp9_ymode_encodings + m);
-}
-
-static void kfwrite_ymode(vp9_writer *bc, int m, const vp9_prob *p) {
-  write_token(bc, vp9_kf_ymode_tree, p, vp9_kf_ymode_encodings + m);
-}
-
-#if CONFIG_SUPERBLOCKS
-static void sb_kfwrite_ymode(vp9_writer *bc, int m, const vp9_prob *p) {
-  write_token(bc, vp9_uv_mode_tree, p, vp9_sb_kf_ymode_encodings + m);
-}
-#endif
-
-static void write_i8x8_mode(vp9_writer *bc, int m, const vp9_prob *p) {
-  write_token(bc, vp9_i8x8_mode_tree, p, vp9_i8x8_mode_encodings + m);
-}
-
-static void write_uv_mode(vp9_writer *bc, int m, const vp9_prob *p) {
-  write_token(bc, vp9_uv_mode_tree, p, vp9_uv_mode_encodings + m);
-}
-
-
-static void write_bmode(vp9_writer *bc, int m, const vp9_prob *p) {
-  write_token(bc, vp9_bmode_tree, p, vp9_bmode_encodings + m);
-}
-
-static void write_split(vp9_writer *bc, int x, const vp9_prob *p) {
-  write_token(bc, vp9_mbsplit_tree, p, vp9_mbsplit_encodings + x);
-}
-
-static int prob_update_savings(const unsigned int *ct,
-                               const vp9_prob oldp, const vp9_prob newp,
-                               const vp9_prob upd) {
-  const int old_b = cost_branch256(ct, oldp);
-  const int new_b = cost_branch256(ct, newp);
-  const int update_b = 2048 + vp9_cost_upd256;
-  return (old_b - new_b - update_b);
-}
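-
-// Both savings functions return (old cost - new cost - update cost) in
-// 1/256-bit units, so a positive result means the explicit update pays for
-// itself; the constant 2048 above is the 8 * 256 cost of the plain 8-bit
-// literal used when no difference coding is available.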
-
-static int prob_diff_update_savings(const unsigned int *ct,
-                                    const vp9_prob oldp, const vp9_prob newp,
-                                    const vp9_prob upd) {
-  const int old_b = cost_branch256(ct, oldp);
-  const int new_b = cost_branch256(ct, newp);
-  const int update_b = (newp == oldp ? 0 :
-                        prob_diff_update_cost(newp, oldp) + vp9_cost_upd256);
-  return (old_b - new_b - update_b);
-}
-
-static int prob_diff_update_savings_search(const unsigned int *ct,
-                                           const vp9_prob oldp, vp9_prob *bestp,
-                                           const vp9_prob upd) {
-  const int old_b = cost_branch256(ct, oldp);
-  int new_b, update_b, savings, bestsavings, step;
-  vp9_prob newp, bestnewp;
-
-  bestsavings = 0;
-  bestnewp = oldp;
-
-  step = (*bestp > oldp ? -1 : 1);
-  for (newp = *bestp; newp != oldp; newp += step) {
-    new_b = cost_branch256(ct, newp);
-    update_b = prob_diff_update_cost(newp, oldp) + vp9_cost_upd256;
-    savings = old_b - new_b - update_b;
-    if (savings > bestsavings) {
-      bestsavings = savings;
-      bestnewp = newp;
-    }
-  }
-  *bestp = bestnewp;
-  return bestsavings;
-}
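-
-// Minimal self-contained sketch (assumed helper, not part of libvpx) of the
-// same search pattern as above: walk from a candidate value toward the old
-// value in unit steps, score each point, and keep the best. Any scoring
-// callback in the same units can be plugged in.
-static int sketch_linear_refine(int oldv, int cand,
-                                int (*score)(int newv, int oldv)) {
-  int best_score = 0, best = oldv;
-  int step = (cand > oldv) ? -1 : 1;
-  int v;
-  for (v = cand; v != oldv; v += step) {
-    const int s = score(v, oldv);
-    if (s > best_score) {
-      best_score = s;
-      best = v;
-    }
-  }
-  return best;
-}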
-
-static void pack_mb_tokens(vp9_writer* const bc,
-                           TOKENEXTRA **tp,
-                           const TOKENEXTRA *const stop) {
-  unsigned int split;
-  unsigned int shift;
-  int count = bc->count;
-  unsigned int range = bc->range;
-  unsigned int lowvalue = bc->lowvalue;
-  TOKENEXTRA *p = *tp;
-
-  while (p < stop) {
-    const int t = p->Token;
-    vp9_token *const a = vp9_coef_encodings + t;
-    const vp9_extra_bit_struct *const b = vp9_extra_bits + t;
-    int i = 0;
-    const unsigned char *pp = p->context_tree;
-    int v = a->value;
-    int n = a->Len;
-
-    if (t == EOSB_TOKEN) {
-      ++p;
-      break;
-    }
-
-    /* skip one or two nodes */
-    if (p->skip_eob_node) {
-      n -= p->skip_eob_node;
-      i = 2 * p->skip_eob_node;
-    }
-
-    do {
-      const int bb = (v >> --n) & 1;
-      split = 1 + (((range - 1) * pp[i >> 1]) >> 8);
-      i = vp9_coef_tree[i + bb];
-
-      if (bb) {
-        lowvalue += split;
-        range = range - split;
-      } else {
-        range = split;
-      }
-
-      shift = vp9_norm[range];
-      range <<= shift;
-      count += shift;
-
-      if (count >= 0) {
-        int offset = shift - count;
-
-        if ((lowvalue << (offset - 1)) & 0x80000000) {
-          int x = bc->pos - 1;
-
-          while (x >= 0 && bc->buffer[x] == 0xff) {
-            bc->buffer[x] = (unsigned char)0;
-            x--;
-          }
-
-          bc->buffer[x] += 1;
-        }
-
-        bc->buffer[bc->pos++] = (lowvalue >> (24 - offset));
-        lowvalue <<= offset;
-        shift = count;
-        lowvalue &= 0xffffff;
-        count -= 8;
-      }
-
-      lowvalue <<= shift;
-    } while (n);
-
-
-    if (b->base_val) {
-      const int e = p->Extra, L = b->Len;
-
-      if (L) {
-        const unsigned char *pp = b->prob;
-        int v = e >> 1;
-        int n = L;              /* number of bits in v, assumed nonzero */
-        int i = 0;
-
-        do {
-          const int bb = (v >> --n) & 1;
-          split = 1 + (((range - 1) * pp[i >> 1]) >> 8);
-          i = b->tree[i + bb];
-
-          if (bb) {
-            lowvalue += split;
-            range = range - split;
-          } else {
-            range = split;
-          }
-
-          shift = vp9_norm[range];
-          range <<= shift;
-          count += shift;
-
-          if (count >= 0) {
-            int offset = shift - count;
-
-            if ((lowvalue << (offset - 1)) & 0x80000000) {
-              int x = bc->pos - 1;
-
-              while (x >= 0 && bc->buffer[x] == 0xff) {
-                bc->buffer[x] = (unsigned char)0;
-                x--;
-              }
-
-              bc->buffer[x] += 1;
-            }
-
-            bc->buffer[bc->pos++] = (lowvalue >> (24 - offset));
-            lowvalue <<= offset;
-            shift = count;
-            lowvalue &= 0xffffff;
-            count -= 8;
-          }
-
-          lowvalue <<= shift;
-        } while (n);
-      }
-
-      {
-        // Write the low (sign) bit of the extra value at a flat 1/2 prob.
-        split = (range + 1) >> 1;
-
-        if (e & 1) {
-          lowvalue += split;
-          range = range - split;
-        } else {
-          range = split;
-        }
-
-        range <<= 1;
-
-        if ((lowvalue & 0x80000000)) {
-          int x = bc->pos - 1;
-
-          while (x >= 0 && bc->buffer[x] == 0xff) {
-            bc->buffer[x] = (unsigned char)0;
-            x--;
-          }
-
-          bc->buffer[x] += 1;
-
-        }
-
-        lowvalue  <<= 1;
-
-        if (!++count) {
-          count = -8;
-          bc->buffer[bc->pos++] = (lowvalue >> 24);
-          lowvalue &= 0xffffff;
-        }
-      }
-
-    }
-    ++p;
-  }
-
-  bc->count = count;
-  bc->lowvalue = lowvalue;
-  bc->range = range;
-  *tp = p;
-}
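-
-// Minimal sketch (assumed helper, not part of libvpx) of one boolean-coder
-// step, mirroring the loop inlined above: prob scales the split point of
-// the current interval, and the coded bit selects the sub-interval.
-static void sketch_bool_encode_step(unsigned int *range,
-                                    unsigned int *lowvalue,
-                                    int bit, unsigned char prob) {
-  const unsigned int split = 1 + (((*range - 1) * prob) >> 8);
-  if (bit) {
-    *lowvalue += split;  /* keep the upper part of the interval */
-    *range -= split;
-  } else {
-    *range = split;      /* keep the lower part */
-  }
-  /* Renormalization -- shifting range back up via vp9_norm[] and emitting
-   * bytes, including the 0xff carry walk -- is as in pack_mb_tokens(). */
-}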
-
-static void write_partition_size(unsigned char *cx_data, int size) {
-  // Write the 24-bit partition size in little-endian byte order.
-  cx_data[0] = size & 0xff;
-  cx_data[1] = (size >> 8) & 0xff;
-  cx_data[2] = (size >> 16) & 0xff;
-}
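-
-// Example: write_partition_size(p, 0x012345) stores the 24-bit size
-// little-endian: p[0] = 0x45, p[1] = 0x23, p[2] = 0x01.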
-
-static void write_mv_ref(vp9_writer *bc, MB_PREDICTION_MODE m,
-                         const vp9_prob *p) {
-#if CONFIG_DEBUG
-  assert(NEARESTMV <= m  &&  m <= SPLITMV);
-#endif
-  write_token(bc, vp9_mv_ref_tree, p,
-              vp9_mv_ref_encoding_array - NEARESTMV + m);
-}
-
-#if CONFIG_SUPERBLOCKS
-static void write_sb_mv_ref(vp9_writer *bc, MB_PREDICTION_MODE m,
-                            const vp9_prob *p) {
-#if CONFIG_DEBUG
-  assert(NEARESTMV <= m  &&  m < SPLITMV);
-#endif
-  write_token(bc, vp9_sb_mv_ref_tree, p,
-              vp9_sb_mv_ref_encoding_array - NEARESTMV + m);
-}
-#endif
-
-static void write_sub_mv_ref(vp9_writer *bc, B_PREDICTION_MODE m,
-                             const vp9_prob *p) {
-#if CONFIG_DEBUG
-  assert(LEFT4X4 <= m  &&  m <= NEW4X4);
-#endif
-  write_token(bc, vp9_sub_mv_ref_tree, p,
-              vp9_sub_mv_ref_encoding_array - LEFT4X4 + m);
-}
-
-static void write_nmv(vp9_writer *bc, const MV *mv, const int_mv *ref,
-                      const nmv_context *nmvc, int usehp) {
-  MV e;
-  e.row = mv->row - ref->as_mv.row;
-  e.col = mv->col - ref->as_mv.col;
-
-  vp9_encode_nmv(bc, &e, &ref->as_mv, nmvc);
-  vp9_encode_nmv_fp(bc, &e, &ref->as_mv, nmvc, usehp);
-}
-
-#if CONFIG_NEW_MVREF
-static int vp9_cost_mv_ref_id(vp9_prob * ref_id_probs, int mv_ref_id) {
-  int cost;
-
-  // Encode the index for the MV reference.
-  switch (mv_ref_id) {
-    case 0:
-      cost = vp9_cost_zero(ref_id_probs[0]);
-      break;
-    case 1:
-      cost = vp9_cost_one(ref_id_probs[0]);
-      cost += vp9_cost_zero(ref_id_probs[1]);
-      break;
-    case 2:
-      cost = vp9_cost_one(ref_id_probs[0]);
-      cost += vp9_cost_one(ref_id_probs[1]);
-      cost += vp9_cost_zero(ref_id_probs[2]);
-      break;
-    case 3:
-      cost = vp9_cost_one(ref_id_probs[0]);
-      cost += vp9_cost_one(ref_id_probs[1]);
-      cost += vp9_cost_one(ref_id_probs[2]);
-      break;
-
-      // TRAP.. This should not happen
-    default:
-      assert(0);
-      break;
-  }
-
-  return cost;
-}
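-
-// Example: mv_ref_id 2 walks the tree as one(p0), one(p1), zero(p2), so its
-// cost is vp9_cost_one(p[0]) + vp9_cost_one(p[1]) + vp9_cost_zero(p[2]),
-// matching the bits emitted by vp9_write_mv_ref_id() below.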
-
-static void vp9_write_mv_ref_id(vp9_writer *w,
-                                vp9_prob * ref_id_probs,
-                                int mv_ref_id) {
-  // Encode the index for the MV reference.
-  switch (mv_ref_id) {
-    case 0:
-      vp9_write(w, 0, ref_id_probs[0]);
-      break;
-    case 1:
-      vp9_write(w, 1, ref_id_probs[0]);
-      vp9_write(w, 0, ref_id_probs[1]);
-      break;
-    case 2:
-      vp9_write(w, 1, ref_id_probs[0]);
-      vp9_write(w, 1, ref_id_probs[1]);
-      vp9_write(w, 0, ref_id_probs[2]);
-      break;
-    case 3:
-      vp9_write(w, 1, ref_id_probs[0]);
-      vp9_write(w, 1, ref_id_probs[1]);
-      vp9_write(w, 1, ref_id_probs[2]);
-      break;
-
-      // TRAP.. This should not happen
-    default:
-      assert(0);
-      break;
-  }
-}
-
-// Estimate the cost of coding the vector against each reference candidate
-static unsigned int pick_best_mv_ref(MACROBLOCK *x,
-                                     MV_REFERENCE_FRAME ref_frame,
-                                     int_mv target_mv,
-                                     int_mv * mv_ref_list,
-                                     int_mv * best_ref) {
-  int i;
-  int best_index = 0;
-  int cost, cost2;
-  int zero_seen = (mv_ref_list[0].as_int) ? FALSE : TRUE;
-  MACROBLOCKD *xd = &x->e_mbd;
-  int max_mv = MV_MAX;
-
-  cost = vp9_cost_mv_ref_id(xd->mb_mv_ref_id_probs[ref_frame], 0) +
-         vp9_mv_bit_cost(&target_mv,
-                         &mv_ref_list[0],
-                         XMVCOST, 96,
-                         xd->allow_high_precision_mv);
-
-
-  // Use 4 for now: for (i = 1; i < MAX_MV_REFS; ++i) {
-  for (i = 1; i < 4; ++i) {
-    // If we see a 0,0 reference vector for a second time we have reached
-    // the end of the list of valid candidate vectors.
-    if (!mv_ref_list[i].as_int) {
-      if (zero_seen)
-        break;
-      else
-        zero_seen = TRUE;
-    }
-
-    // Check for cases where the reference choice would give rise to an
-    // uncodable/out of range residual for row or col.
-    if ((abs(target_mv.as_mv.row - mv_ref_list[i].as_mv.row) > max_mv) ||
-        (abs(target_mv.as_mv.col - mv_ref_list[i].as_mv.col) > max_mv)) {
-      continue;
-    }
-
-    cost2 = vp9_cost_mv_ref_id(xd->mb_mv_ref_id_probs[ref_frame], i) +
-            vp9_mv_bit_cost(&target_mv,
-                            &mv_ref_list[i],
-                            XMVCOST, 96,
-                            xd->allow_high_precision_mv);
-
-    if (cost2 < cost) {
-      cost = cost2;
-      best_index = i;
-    }
-  }
-
-  best_ref->as_int = mv_ref_list[best_index].as_int;
-
-  return best_index;
-}
-#endif
-
-// This function writes the current macroblock's segment id to the bitstream.
-// It should only be called if a segment map update is indicated.
-static void write_mb_segid(vp9_writer *bc,
-                           const MB_MODE_INFO *mi, const MACROBLOCKD *xd) {
-  // Encode the MB segment id.
-  int seg_id = mi->segment_id;
-#if CONFIG_SUPERBLOCKS
-  if (mi->encoded_as_sb) {
-    if (xd->mb_to_right_edge > 0)
-      seg_id = seg_id && xd->mode_info_context[1].mbmi.segment_id;
-    if (xd->mb_to_bottom_edge > 0) {
-      seg_id = seg_id &&
-               xd->mode_info_context[xd->mode_info_stride].mbmi.segment_id;
-      if (xd->mb_to_right_edge > 0)
-        seg_id = seg_id &&
-                xd->mode_info_context[xd->mode_info_stride + 1].mbmi.segment_id;
-    }
-  }
-#endif
-  if (xd->segmentation_enabled && xd->update_mb_segmentation_map) {
-    switch (seg_id) {
-      case 0:
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[0]);
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[1]);
-        break;
-      case 1:
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[0]);
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[1]);
-        break;
-      case 2:
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[0]);
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[2]);
-        break;
-      case 3:
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[0]);
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[2]);
-        break;
-
-        // TRAP.. This should not happen
-      default:
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[0]);
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[1]);
-        break;
-    }
-  }
-}
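-
-// The pairs above form a small two-level tree: ids 0 and 1 share probs[1]
-// under a first bit of 0, ids 2 and 3 share probs[2] under a first bit of 1.
-// E.g. seg_id 2 is coded as (1, probs[0]) then (0, probs[2]).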
-
-// This function encodes the reference frame
-static void encode_ref_frame(vp9_writer *const bc,
-                             VP9_COMMON *const cm,
-                             MACROBLOCKD *xd,
-                             int segment_id,
-                             MV_REFERENCE_FRAME rf) {
-  int seg_ref_active;
-  int seg_ref_count = 0;
-  seg_ref_active = vp9_segfeature_active(xd,
-                                         segment_id,
-                                         SEG_LVL_REF_FRAME);
-
-  if (seg_ref_active) {
-    seg_ref_count = vp9_check_segref(xd, segment_id, INTRA_FRAME) +
-                    vp9_check_segref(xd, segment_id, LAST_FRAME) +
-                    vp9_check_segref(xd, segment_id, GOLDEN_FRAME) +
-                    vp9_check_segref(xd, segment_id, ALTREF_FRAME);
-  }
-
-  // If segment level coding of this signal is disabled...
-  // or the segment allows multiple reference frame options
-  if (!seg_ref_active || (seg_ref_count > 1)) {
-    // Values used in prediction model coding
-    unsigned char prediction_flag;
-    vp9_prob pred_prob;
-    MV_REFERENCE_FRAME pred_rf;
-
-    // Get the context probability for the prediction flag
-    pred_prob = vp9_get_pred_prob(cm, xd, PRED_REF);
-
-    // Get the predicted value.
-    pred_rf = vp9_get_pred_ref(cm, xd);
-
-    // Did the chosen reference frame match its predicted value?
-    prediction_flag =
-      (xd->mode_info_context->mbmi.ref_frame == pred_rf);
-
-    vp9_set_pred_flag(xd, PRED_REF, prediction_flag);
-    vp9_write(bc, prediction_flag, pred_prob);
-
-    // If not predicted correctly then code value explicitly
-    if (!prediction_flag) {
-      vp9_prob mod_refprobs[PREDICTION_PROBS];
-
-      vpx_memcpy(mod_refprobs,
-                 cm->mod_refprobs[pred_rf], sizeof(mod_refprobs));
-
-      // If segment coding is enabled, blank out options that can't occur by
-      // setting the branch probability to 0.
-      if (seg_ref_active) {
-        mod_refprobs[INTRA_FRAME] *=
-          vp9_check_segref(xd, segment_id, INTRA_FRAME);
-        mod_refprobs[LAST_FRAME] *=
-          vp9_check_segref(xd, segment_id, LAST_FRAME);
-        mod_refprobs[GOLDEN_FRAME] *=
-          (vp9_check_segref(xd, segment_id, GOLDEN_FRAME) *
-           vp9_check_segref(xd, segment_id, ALTREF_FRAME));
-      }
-
-      if (mod_refprobs[0]) {
-        vp9_write(bc, (rf != INTRA_FRAME), mod_refprobs[0]);
-      }
-
-      // Inter coded
-      if (rf != INTRA_FRAME) {
-        if (mod_refprobs[1]) {
-          vp9_write(bc, (rf != LAST_FRAME), mod_refprobs[1]);
-        }
-
-        if (rf != LAST_FRAME) {
-          if (mod_refprobs[2]) {
-            vp9_write(bc, (rf != GOLDEN_FRAME), mod_refprobs[2]);
-          }
-        }
-      }
-    }
-  }
-
-  // If using the prediction model we have nothing further to do because
-  // the reference frame is fully coded by the segment
-}
-
-// Update the probabilities used to encode reference frame data
-static void update_ref_probs(VP9_COMP *const cpi) {
-  VP9_COMMON *const cm = &cpi->common;
-
-  const int *const rfct = cpi->count_mb_ref_frame_usage;
-  const int rf_intra = rfct[INTRA_FRAME];
-  const int rf_inter = rfct[LAST_FRAME] +
-                       rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
-
-  cm->prob_intra_coded = get_binary_prob(rf_intra, rf_inter);
-  cm->prob_last_coded = get_prob(rfct[LAST_FRAME], rf_inter);
-  cm->prob_gf_coded = get_binary_prob(rfct[GOLDEN_FRAME], rfct[ALTREF_FRAME]);
-
-  // Compute a modified set of probabilities to use when prediction of the
-  // reference frame fails
-  vp9_compute_mod_refprobs(cm);
-}
-
-static void pack_inter_mode_mvs(VP9_COMP *const cpi, vp9_writer *const bc) {
-  int i;
-  VP9_COMMON *const pc = &cpi->common;
-  const nmv_context *nmvc = &pc->fc.nmvc;
-  MACROBLOCK *x = &cpi->mb;
-  MACROBLOCKD *xd = &cpi->mb.e_mbd;
-  MODE_INFO *m;
-  MODE_INFO *prev_m;
-  TOKENEXTRA *tok = cpi->tok;
-  TOKENEXTRA *tok_end = tok + cpi->tok_count;
-
-  const int mis = pc->mode_info_stride;
-  int mb_row, mb_col;
-  int row, col;
-
-  // Values used in prediction model coding
-  vp9_prob pred_prob;
-  unsigned char prediction_flag;
-
-  int row_delta[4] = { 0, +1,  0, -1};
-  int col_delta[4] = { +1, -1, +1, +1};
-
-  cpi->mb.partition_info = cpi->mb.pi;
-
-  mb_row = 0;
-  for (row = 0; row < pc->mb_rows; row += 2) {
-    m = pc->mi + row * mis;
-    prev_m = pc->prev_mi + row * mis;
-
-    mb_col = 0;
-    for (col = 0; col < pc->mb_cols; col += 2) {
-      int i;
-
-      // Process the 4 MBs in the order:
-      // top-left, top-right, bottom-left, bottom-right
-#if CONFIG_SUPERBLOCKS
-      vp9_write(bc, m->mbmi.encoded_as_sb, pc->sb_coded);
-#endif
-      for (i = 0; i < 4; i++) {
-        MB_MODE_INFO *mi;
-        MV_REFERENCE_FRAME rf;
-        MB_PREDICTION_MODE mode;
-        int segment_id;
-
-        int dy = row_delta[i];
-        int dx = col_delta[i];
-        int offset_extended = dy * mis + dx;
-
-        if ((mb_row >= pc->mb_rows) || (mb_col >= pc->mb_cols)) {
-          // MB lies outside frame, move on
-          mb_row += dy;
-          mb_col += dx;
-          m += offset_extended;
-          prev_m += offset_extended;
-          cpi->mb.partition_info += offset_extended;
-          continue;
-        }
-
-        mi = &m->mbmi;
-        rf = mi->ref_frame;
-        mode = mi->mode;
-        segment_id = mi->segment_id;
-
-        // Distance of MB to the various image edges.
-        // These are specified to 1/8th pel as they are always compared to
-        // MV values that are in 1/8th pel units
-        xd->mb_to_left_edge = -((mb_col * 16) << 3);
-        xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
-        xd->mb_to_top_edge = -((mb_row * 16) << 3);
-        xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
-
-        // Make sure the MacroBlockD mode info pointer is set correctly
-        xd->mode_info_context = m;
-        xd->prev_mode_info_context = prev_m;
-
-#ifdef ENTROPY_STATS
-        active_section = 9;
-#endif
-        if (cpi->mb.e_mbd.update_mb_segmentation_map) {
-          // Is temporal coding of the segment map enabled
-          if (pc->temporal_update) {
-            prediction_flag = vp9_get_pred_flag(xd, PRED_SEG_ID);
-            pred_prob = vp9_get_pred_prob(pc, xd, PRED_SEG_ID);
-
-            // Code the segment id prediction flag for this mb
-            vp9_write(bc, prediction_flag, pred_prob);
-
-            // If the mb segment id wasn't predicted code explicitly
-            if (!prediction_flag)
-              write_mb_segid(bc, mi, &cpi->mb.e_mbd);
-          } else {
-            // Normal unpredicted coding
-            write_mb_segid(bc, mi, &cpi->mb.e_mbd);
-          }
-        }
-
-        if (pc->mb_no_coeff_skip &&
-            (!vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
-             (vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) != 0))) {
-          int skip_coeff = mi->mb_skip_coeff;
-#if CONFIG_SUPERBLOCKS
-          if (mi->encoded_as_sb) {
-            skip_coeff &= m[1].mbmi.mb_skip_coeff;
-            skip_coeff &= m[mis].mbmi.mb_skip_coeff;
-            skip_coeff &= m[mis + 1].mbmi.mb_skip_coeff;
-          }
-#endif
-          vp9_write(bc, skip_coeff,
-                    vp9_get_pred_prob(pc, xd, PRED_MBSKIP));
-        }
-
-        // Encode the reference frame.
-        encode_ref_frame(bc, pc, xd, segment_id, rf);
-
-        if (rf == INTRA_FRAME) {
-#ifdef ENTROPY_STATS
-          active_section = 6;
-#endif
-
-          // TODO(rbultje) write using SB tree structure
-
-          if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
-            write_ymode(bc, mode, pc->fc.ymode_prob);
-          }
-
-          if (mode == B_PRED) {
-            int j = 0;
-#if CONFIG_COMP_INTRA_PRED
-            int uses_second =
-              m->bmi[0].as_mode.second !=
-              (B_PREDICTION_MODE)(B_DC_PRED - 1);
-            vp9_write(bc, uses_second, 128);
-#endif
-            do {
-#if CONFIG_COMP_INTRA_PRED
-              B_PREDICTION_MODE mode2 = m->bmi[j].as_mode.second;
-#endif
-              write_bmode(bc, m->bmi[j].as_mode.first,
-                          pc->fc.bmode_prob);
-              /*
-              if (!cpi->dummy_packing) {
-                int p;
-                for (p = 0; p < VP9_BINTRAMODES - 1; ++p)
-                  printf(" %d", pc->fc.bmode_prob[p]);
-                printf("\nbmode[%d][%d]: %d\n", pc->current_video_frame, j, m->bmi[j].as_mode.first);
-              }
-              */
-#if CONFIG_COMP_INTRA_PRED
-              if (uses_second) {
-                write_bmode(bc, mode2, pc->fc.bmode_prob);
-              }
-#endif
-            } while (++j < 16);
-          }
-          if (mode == I8X8_PRED) {
-            write_i8x8_mode(bc, m->bmi[0].as_mode.first,
-                            pc->fc.i8x8_mode_prob);
-            write_i8x8_mode(bc, m->bmi[2].as_mode.first,
-                            pc->fc.i8x8_mode_prob);
-            write_i8x8_mode(bc, m->bmi[8].as_mode.first,
-                            pc->fc.i8x8_mode_prob);
-            write_i8x8_mode(bc, m->bmi[10].as_mode.first,
-                            pc->fc.i8x8_mode_prob);
-          } else {
-            write_uv_mode(bc, mi->uv_mode,
-                          pc->fc.uv_mode_prob[mode]);
-          }
-        } else {
-          int_mv best_mv, best_second_mv;
-          int ct[4];
-
-          vp9_prob mv_ref_p [VP9_MVREFS - 1];
-
-          {
-            int_mv n1, n2;
-
-            // Only used for context just now and soon to be deprecated.
-            vp9_find_near_mvs(xd, m, prev_m, &n1, &n2, &best_mv, ct,
-                              rf, cpi->common.ref_frame_sign_bias);
-#if CONFIG_NEWBESTREFMV
-            best_mv.as_int = mi->ref_mvs[rf][0].as_int;
-#endif
-
-            vp9_mv_ref_probs(&cpi->common, mv_ref_p, ct);
-
-#ifdef ENTROPY_STATS
-            accum_mv_refs(mode, ct);
-#endif
-          }
-
-#ifdef ENTROPY_STATS
-          active_section = 3;
-#endif
-
-          // Is the segment coding of mode enabled
-          if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
-#if CONFIG_SUPERBLOCKS
-            if (mi->encoded_as_sb) {
-              write_sb_mv_ref(bc, mode, mv_ref_p);
-            } else
-#endif
-            {
-              write_mv_ref(bc, mode, mv_ref_p);
-            }
-            vp9_accum_mv_refs(&cpi->common, mode, ct);
-          }
-
-#if CONFIG_PRED_FILTER
-          // Is the prediction filter enabled
-          if (mode >= NEARESTMV && mode < SPLITMV) {
-            if (cpi->common.pred_filter_mode == 2)
-              vp9_write(bc, mi->pred_filter_enabled,
-                        pc->prob_pred_filter_off);
-            else
-              assert(mi->pred_filter_enabled ==
-                     cpi->common.pred_filter_mode);
-          }
-#endif
-          if (mode >= NEARESTMV && mode <= SPLITMV) {
-            if (cpi->common.mcomp_filter_type == SWITCHABLE) {
-              write_token(bc, vp9_switchable_interp_tree,
-                          vp9_get_pred_probs(&cpi->common, xd,
-                                             PRED_SWITCHABLE_INTERP),
-                          vp9_switchable_interp_encodings +
-                              vp9_switchable_interp_map[mi->interp_filter]);
-            } else {
-              assert (mi->interp_filter ==
-                      cpi->common.mcomp_filter_type);
-            }
-          }
-          if (mi->second_ref_frame &&
-              (mode == NEWMV || mode == SPLITMV)) {
-            int_mv n1, n2;
-
-            // Only used for context just now and soon to be deprecated.
-            vp9_find_near_mvs(xd, m, prev_m,
-                              &n1, &n2, &best_second_mv, ct,
-                              mi->second_ref_frame,
-                              cpi->common.ref_frame_sign_bias);
-
-#if CONFIG_NEWBESTREFMV
-            best_second_mv.as_int =
-              mi->ref_mvs[mi->second_ref_frame][0].as_int;
-#endif
-          }
-
-          // does the feature use compound prediction or not
-          // (if not specified at the frame/segment level)
-          if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
-            vp9_write(bc, mi->second_ref_frame != INTRA_FRAME,
-                      vp9_get_pred_prob(pc, xd, PRED_COMP));
-          }
-
-          {
-            switch (mode) { /* new, split require MVs */
-              case NEWMV:
-#ifdef ENTROPY_STATS
-                active_section = 5;
-#endif
-
-#if CONFIG_NEW_MVREF
-                {
-                  unsigned int best_index;
-
-                  // Choose the best mv reference
-                  best_index = pick_best_mv_ref(x, rf, mi->mv[0],
-                                                mi->ref_mvs[rf], &best_mv);
-
-                  // Encode the index of the choice.
-                  vp9_write_mv_ref_id(bc,
-                                      xd->mb_mv_ref_id_probs[rf], best_index);
-
-                  cpi->best_ref_index_counts[rf][best_index]++;
-
-                }
-#endif
-
-                write_nmv(bc, &mi->mv[0].as_mv, &best_mv,
-                          (const nmv_context*) nmvc,
-                          xd->allow_high_precision_mv);
-
-                if (mi->second_ref_frame) {
-#if CONFIG_NEW_MVREF
-                  unsigned int best_index;
-                  MV_REFERENCE_FRAME sec_ref_frame = mi->second_ref_frame;
-
-                  best_index =
-                    pick_best_mv_ref(x, sec_ref_frame, mi->mv[1],
-                                     mi->ref_mvs[sec_ref_frame],
-                                     &best_second_mv);
-
-                  // Encode the index of the choice.
-                  vp9_write_mv_ref_id(bc,
-                                      xd->mb_mv_ref_id_probs[sec_ref_frame],
-                                      best_index);
-
-                  cpi->best_ref_index_counts[sec_ref_frame][best_index]++;
-#endif
-                  write_nmv(bc, &mi->mv[1].as_mv, &best_second_mv,
-                            (const nmv_context*) nmvc,
-                            xd->allow_high_precision_mv);
-                }
-                break;
-              case SPLITMV: {
-                int j = 0;
-
-#ifdef MODE_STATS
-                ++count_mb_seg [mi->partitioning];
-#endif
-
-                write_split(bc, mi->partitioning, cpi->common.fc.mbsplit_prob);
-                cpi->mbsplit_count[mi->partitioning]++;
-
-                do {
-                  B_PREDICTION_MODE blockmode;
-                  int_mv blockmv;
-                  const int *const  L =
-                    vp9_mbsplits [mi->partitioning];
-                  int k = -1;  /* first block in subset j */
-                  int mv_contz;
-                  int_mv leftmv, abovemv;
-
-                  blockmode = cpi->mb.partition_info->bmi[j].mode;
-                  blockmv = cpi->mb.partition_info->bmi[j].mv;
-#if CONFIG_DEBUG
-                  while (j != L[++k])
-                    if (k >= 16)
-                      assert(0);
-#else
-                  while (j != L[++k]);
-#endif
-                  leftmv.as_int = left_block_mv(m, k);
-                  abovemv.as_int = above_block_mv(m, k, mis);
-                  mv_contz = vp9_mv_cont(&leftmv, &abovemv);
-
-                  write_sub_mv_ref(bc, blockmode,
-                                   cpi->common.fc.sub_mv_ref_prob [mv_contz]);
-                  cpi->sub_mv_ref_count[mv_contz][blockmode - LEFT4X4]++;
-                  if (blockmode == NEW4X4) {
-#ifdef ENTROPY_STATS
-                    active_section = 11;
-#endif
-                    write_nmv(bc, &blockmv.as_mv, &best_mv,
-                              (const nmv_context*) nmvc,
-                              xd->allow_high_precision_mv);
-
-                    if (mi->second_ref_frame) {
-                      write_nmv(bc,
-                                &cpi->mb.partition_info->bmi[j].second_mv.as_mv,
-                                &best_second_mv,
-                                (const nmv_context*) nmvc,
-                                xd->allow_high_precision_mv);
-                    }
-                  }
-                } while (++j < cpi->mb.partition_info->count);
-              }
-              break;
-              default:
-                break;
-            }
-          }
-
-          // Update the mvcounts used to tune mv probs but only if this is
-          // the real pack run.
-          if (!cpi->dummy_packing) {
-            update_mvcount(cpi, x, &best_mv, &best_second_mv);
-          }
-        }
-
-        if (
-#if CONFIG_SUPERBLOCKS
-            !mi->encoded_as_sb &&
-#endif
-            ((rf == INTRA_FRAME && mode <= I8X8_PRED) ||
-             (rf != INTRA_FRAME && !(mode == SPLITMV &&
-                                     mi->partitioning == PARTITIONING_4X4))) &&
-            pc->txfm_mode == TX_MODE_SELECT &&
-            !((pc->mb_no_coeff_skip && mi->mb_skip_coeff) ||
-              (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
-               vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
-          TX_SIZE sz = mi->txfm_size;
-          // FIXME(rbultje) code ternary symbol once all experiments are merged
-          vp9_write(bc, sz != TX_4X4, pc->prob_tx[0]);
-          if (sz != TX_4X4 && mode != I8X8_PRED && mode != SPLITMV)
-            vp9_write(bc, sz != TX_8X8, pc->prob_tx[1]);
-        }
-
-#ifdef ENTROPY_STATS
-        active_section = 1;
-#endif
-        assert(tok < tok_end);
-        pack_mb_tokens(bc, &tok, tok_end);
-
-#if CONFIG_SUPERBLOCKS
-        if (m->mbmi.encoded_as_sb) {
-          assert(!i);
-          mb_col += 2;
-          m += 2;
-          cpi->mb.partition_info += 2;
-          prev_m += 2;
-          break;
-        }
-#endif
-
-        // Next MB
-        mb_row += dy;
-        mb_col += dx;
-        m += offset_extended;
-        prev_m += offset_extended;
-        cpi->mb.partition_info += offset_extended;
-#if CONFIG_DEBUG
-        assert((prev_m - cpi->common.prev_mip) == (m - cpi->common.mip));
-        assert((prev_m - cpi->common.prev_mi) == (m - cpi->common.mi));
-#endif
-      }
-    }
-
-    // Next SB
-    mb_row += 2;
-    m += mis + (1 - (pc->mb_cols & 0x1));
-    prev_m += mis + (1 - (pc->mb_cols & 0x1));
-    cpi->mb.partition_info += mis + (1 - (pc->mb_cols & 0x1));
-  }
-}
-
-
-static void write_mb_modes_kf(const VP9_COMMON  *c,
-                              const MACROBLOCKD *xd,
-                              const MODE_INFO   *m,
-                              int                mode_info_stride,
-                              vp9_writer *const  bc) {
-  const int mis = mode_info_stride;
-  int ym;
-  int segment_id;
-
-  ym = m->mbmi.mode;
-  segment_id = m->mbmi.segment_id;
-
-  if (xd->update_mb_segmentation_map) {
-    write_mb_segid(bc, &m->mbmi, xd);
-  }
-
-  if (c->mb_no_coeff_skip &&
-      (!vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
-       (vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) != 0))) {
-    int skip_coeff = m->mbmi.mb_skip_coeff;
-#if CONFIG_SUPERBLOCKS
-    if (m->mbmi.encoded_as_sb) {
-      skip_coeff &= m[1].mbmi.mb_skip_coeff;
-      skip_coeff &= m[mis].mbmi.mb_skip_coeff;
-      skip_coeff &= m[mis + 1].mbmi.mb_skip_coeff;
-    }
-#endif
-    vp9_write(bc, skip_coeff,
-              vp9_get_pred_prob(c, xd, PRED_MBSKIP));
-  }
-
-#if CONFIG_SUPERBLOCKS
-  if (m->mbmi.encoded_as_sb) {
-    sb_kfwrite_ymode(bc, ym,
-                     c->sb_kf_ymode_prob[c->kf_ymode_probs_index]);
-  } else
-#endif
-  {
-    kfwrite_ymode(bc, ym,
-                  c->kf_ymode_prob[c->kf_ymode_probs_index]);
-  }
-
-  if (ym == B_PRED) {
-    int i = 0;
-#if CONFIG_COMP_INTRA_PRED
-    int uses_second =
-      m->bmi[0].as_mode.second !=
-      (B_PREDICTION_MODE)(B_DC_PRED - 1);
-    vp9_write(bc, uses_second, 128);
-#endif
-    do {
-      const B_PREDICTION_MODE A = above_block_mode(m, i, mis);
-      const B_PREDICTION_MODE L = left_block_mode(m, i);
-      const int bm = m->bmi[i].as_mode.first;
-#if CONFIG_COMP_INTRA_PRED
-      const int bm2 = m->bmi[i].as_mode.second;
-#endif
-
-#ifdef ENTROPY_STATS
-      ++intra_mode_stats [A] [L] [bm];
-#endif
-
-      write_bmode(bc, bm, c->kf_bmode_prob [A] [L]);
-      // printf("    mode: %d\n", bm);
-#if CONFIG_COMP_INTRA_PRED
-      if (uses_second) {
-        write_bmode(bc, bm2, c->kf_bmode_prob [A] [L]);
-      }
-#endif
-    } while (++i < 16);
-  }
-  if (ym == I8X8_PRED) {
-    write_i8x8_mode(bc, m->bmi[0].as_mode.first,
-                    c->fc.i8x8_mode_prob);
-    // printf("    mode: %d\n", m->bmi[0].as_mode.first); fflush(stdout);
-    write_i8x8_mode(bc, m->bmi[2].as_mode.first,
-                    c->fc.i8x8_mode_prob);
-    // printf("    mode: %d\n", m->bmi[2].as_mode.first); fflush(stdout);
-    write_i8x8_mode(bc, m->bmi[8].as_mode.first,
-                    c->fc.i8x8_mode_prob);
-    // printf("    mode: %d\n", m->bmi[8].as_mode.first); fflush(stdout);
-    write_i8x8_mode(bc, m->bmi[10].as_mode.first,
-                    c->fc.i8x8_mode_prob);
-    // printf("    mode: %d\n", m->bmi[10].as_mode.first); fflush(stdout);
-  } else
-    write_uv_mode(bc, m->mbmi.uv_mode, c->kf_uv_mode_prob[ym]);
-
-  if (
-#if CONFIG_SUPERBLOCKS
-      !m->mbmi.encoded_as_sb &&
-#endif
-      ym <= I8X8_PRED && c->txfm_mode == TX_MODE_SELECT &&
-      !((c->mb_no_coeff_skip && m->mbmi.mb_skip_coeff) ||
-        (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
-         vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
-    TX_SIZE sz = m->mbmi.txfm_size;
-    // FIXME(rbultje) code ternary symbol once all experiments are merged
-    vp9_write(bc, sz != TX_4X4, c->prob_tx[0]);
-    if (sz != TX_4X4 && ym <= TM_PRED)
-      vp9_write(bc, sz != TX_8X8, c->prob_tx[1]);
-  }
-}
-
-static void write_kfmodes(VP9_COMP* const cpi, vp9_writer* const bc) {
-  VP9_COMMON *const c = &cpi->common;
-  const int mis = c->mode_info_stride;
-  MACROBLOCKD *xd = &cpi->mb.e_mbd;
-  MODE_INFO *m;
-  int i;
-  int row, col;
-  int mb_row, mb_col;
-  int row_delta[4] = { 0, +1,  0, -1};
-  int col_delta[4] = { +1, -1, +1, +1};
-  TOKENEXTRA *tok = cpi->tok;
-  TOKENEXTRA *tok_end = tok + cpi->tok_count;
-
-  mb_row = 0;
-  for (row = 0; row < c->mb_rows; row += 2) {
-    m = c->mi + row * mis;
-
-    mb_col = 0;
-    for (col = 0; col < c->mb_cols; col += 2) {
-#if CONFIG_SUPERBLOCKS
-      vp9_write(bc, m->mbmi.encoded_as_sb, c->sb_coded);
-#endif
-      // Process the 4 MBs in the order:
-      // top-left, top-right, bottom-left, bottom-right
-      for (i = 0; i < 4; i++) {
-        int dy = row_delta[i];
-        int dx = col_delta[i];
-        int offset_extended = dy * mis + dx;
-
-        if ((mb_row >= c->mb_rows) || (mb_col >= c->mb_cols)) {
-          // MB lies outside frame, move on
-          mb_row += dy;
-          mb_col += dx;
-          m += offset_extended;
-          continue;
-        }
-
-        // Make sure the MacroBlockD mode info pointer is set correctly
-        xd->mode_info_context = m;
-
-        write_mb_modes_kf(c, xd, m, mis, bc);
-#ifdef ENTROPY_STATS
-        active_section = 8;
-#endif
-        assert(tok < tok_end);
-        pack_mb_tokens(bc, &tok, tok_end);
-
-#if CONFIG_SUPERBLOCKS
-        if (m->mbmi.encoded_as_sb) {
-          assert(!i);
-          mb_col += 2;
-          m += 2;
-          break;
-        }
-#endif
-        // Next MB
-        mb_row += dy;
-        mb_col += dx;
-        m += offset_extended;
-      }
-    }
-    mb_row += 2;
-  }
-}
-
-
-/* This function is used for debugging probability trees. */
-static void print_prob_tree(vp9_prob
-                            coef_probs[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES]) {
-  /* print coef probability tree */
-  int i, j, k, l;
-  FILE *f = fopen("enc_tree_probs.txt", "a");
-  fprintf(f, "{\n");
-  for (i = 0; i < BLOCK_TYPES; i++) {
-    fprintf(f, "  {\n");
-    for (j = 0; j < COEF_BANDS; j++) {
-      fprintf(f, "    {\n");
-      for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
-        fprintf(f, "      {");
-        for (l = 0; l < ENTROPY_NODES; l++) {
-          fprintf(f, "%3u, ",
-                  (unsigned int)(coef_probs [i][j][k][l]));
-        }
-        fprintf(f, " }\n");
-      }
-      fprintf(f, "    }\n");
-    }
-    fprintf(f, "  }\n");
-  }
-  fprintf(f, "}\n");
-  fclose(f);
-}
-
-static void build_coeff_contexts(VP9_COMP *cpi) {
-  int i = 0, j, k;
-#ifdef ENTROPY_STATS
-  int t = 0;
-#endif
-  for (i = 0; i < BLOCK_TYPES; ++i) {
-    for (j = 0; j < COEF_BANDS; ++j) {
-      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-          continue;
-        vp9_tree_probs_from_distribution(
-          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-          cpi->frame_coef_probs [i][j][k],
-          cpi->frame_branch_ct [i][j][k],
-          cpi->coef_counts [i][j][k],
-          256, 1
-        );
-#ifdef ENTROPY_STATS
-        if (!cpi->dummy_packing)
-          for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-            context_counters[i][j][k][t] += cpi->coef_counts[i][j][k][t];
-#endif
-      }
-    }
-  }
-  for (i = 0; i < BLOCK_TYPES; ++i) {
-    for (j = 0; j < COEF_BANDS; ++j) {
-      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-          continue;
-        vp9_tree_probs_from_distribution(
-          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-          cpi->frame_hybrid_coef_probs [i][j][k],
-          cpi->frame_hybrid_branch_ct [i][j][k],
-          cpi->hybrid_coef_counts [i][j][k],
-          256, 1
-        );
-#ifdef ENTROPY_STATS
-        if (!cpi->dummy_packing)
-          for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-            hybrid_context_counters[i][j][k][t] += cpi->hybrid_coef_counts[i][j][k][t];
-#endif
-      }
-    }
-  }
-
-  if (cpi->common.txfm_mode != ONLY_4X4) {
-    for (i = 0; i < BLOCK_TYPES_8X8; ++i) {
-      for (j = 0; j < COEF_BANDS; ++j) {
-        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-          /* at every context */
-          /* calc probs and branch cts for this frame only */
-          // vp9_prob new_p           [ENTROPY_NODES];
-          // unsigned int branch_ct   [ENTROPY_NODES] [2];
-          if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-            continue;
-          vp9_tree_probs_from_distribution(
-            MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-            cpi->frame_coef_probs_8x8 [i][j][k],
-            cpi->frame_branch_ct_8x8 [i][j][k],
-            cpi->coef_counts_8x8 [i][j][k],
-            256, 1
-          );
-#ifdef ENTROPY_STATS
-          if (!cpi->dummy_packing)
-            for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-              context_counters_8x8[i][j][k][t] += cpi->coef_counts_8x8[i][j][k][t];
-#endif
-        }
-      }
-    }
-    for (i = 0; i < BLOCK_TYPES_8X8; ++i) {
-      for (j = 0; j < COEF_BANDS; ++j) {
-        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-          /* at every context */
-          /* calc probs and branch cts for this frame only */
-          // vp9_prob new_p           [ENTROPY_NODES];
-          // unsigned int branch_ct   [ENTROPY_NODES] [2];
-          if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-            continue;
-          vp9_tree_probs_from_distribution(
-            MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-            cpi->frame_hybrid_coef_probs_8x8 [i][j][k],
-            cpi->frame_hybrid_branch_ct_8x8 [i][j][k],
-            cpi->hybrid_coef_counts_8x8 [i][j][k],
-            256, 1
-          );
-#ifdef ENTROPY_STATS
-          if (!cpi->dummy_packing)
-            for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-              hybrid_context_counters_8x8[i][j][k][t] += cpi->hybrid_coef_counts_8x8[i][j][k][t];
-#endif
-        }
-      }
-    }
-  }
-
-  if (cpi->common.txfm_mode > ALLOW_8X8) {
-    for (i = 0; i < BLOCK_TYPES_16X16; ++i) {
-      for (j = 0; j < COEF_BANDS; ++j) {
-        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-          if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-            continue;
-          vp9_tree_probs_from_distribution(
-            MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-            cpi->frame_coef_probs_16x16[i][j][k],
-            cpi->frame_branch_ct_16x16[i][j][k],
-            cpi->coef_counts_16x16[i][j][k], 256, 1);
-#ifdef ENTROPY_STATS
-          if (!cpi->dummy_packing)
-            for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-              context_counters_16x16[i][j][k][t] += cpi->coef_counts_16x16[i][j][k][t];
-#endif
-        }
-      }
-    }
-    for (i = 0; i < BLOCK_TYPES_16X16; ++i) {
-      for (j = 0; j < COEF_BANDS; ++j) {
-        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-          if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-            continue;
-          vp9_tree_probs_from_distribution(
-            MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-            cpi->frame_hybrid_coef_probs_16x16[i][j][k],
-            cpi->frame_hybrid_branch_ct_16x16[i][j][k],
-            cpi->hybrid_coef_counts_16x16[i][j][k], 256, 1);
-#ifdef ENTROPY_STATS
-          if (!cpi->dummy_packing)
-            for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-              hybrid_context_counters_16x16[i][j][k][t] +=
-                cpi->hybrid_coef_counts_16x16[i][j][k][t];
-#endif
-        }
-      }
-    }
-  }
-}
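-
-// The guard "k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0))", repeated
-// for every block size above, appears to skip (type, band, context)
-// combinations the token scan can never produce; the identical guard in
-// update_coef_probs_common() below must stay in sync so that only
-// probabilities the decoder also models are ever signalled.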
-
-static void update_coef_probs_common(
-    vp9_writer* const bc,
-    vp9_prob new_frame_coef_probs[BLOCK_TYPES][COEF_BANDS]
-                                 [PREV_COEF_CONTEXTS][ENTROPY_NODES],
-    vp9_prob old_frame_coef_probs[BLOCK_TYPES][COEF_BANDS]
-                                 [PREV_COEF_CONTEXTS][ENTROPY_NODES],
-    unsigned int frame_branch_ct[BLOCK_TYPES][COEF_BANDS]
-                                [PREV_COEF_CONTEXTS][ENTROPY_NODES][2]) {
-  int i, j, k, t;
-  int update[2] = {0, 0};
-  int savings;
-  // vp9_prob bestupd = find_coef_update_prob(cpi);
-
-  /* dry run to see if there is any update at all needed */
-  savings = 0;
-  for (i = 0; i < BLOCK_TYPES; ++i) {
-    for (j = !i; j < COEF_BANDS; ++j) {
-      int prev_coef_savings[ENTROPY_NODES] = {0};
-      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-        for (t = 0; t < ENTROPY_NODES; ++t) {
-          vp9_prob newp = new_frame_coef_probs[i][j][k][t];
-          const vp9_prob oldp = old_frame_coef_probs[i][j][k][t];
-          const vp9_prob upd = COEF_UPDATE_PROB;
-          int s = prev_coef_savings[t];
-          int u = 0;
-          if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-            continue;
-#if defined(SEARCH_NEWP)
-          s = prob_diff_update_savings_search(
-                frame_branch_ct[i][j][k][t],
-                oldp, &newp, upd);
-          if (s > 0 && newp != oldp)
-            u = 1;
-          if (u)
-            savings += s - (int)(vp9_cost_zero(upd));
-          else
-            savings -= (int)(vp9_cost_zero(upd));
-#else
-          s = prob_update_savings(
-                frame_branch_ct[i][j][k][t],
-                oldp, newp, upd);
-          if (s > 0)
-            u = 1;
-          if (u)
-            savings += s;
-#endif
-
-          update[u]++;
-        }
-      }
-    }
-  }
-
-  // printf("Update %d %d, savings %d\n", update[0], update[1], savings);
-  /* Is coef updated at all */
-  if (update[1] == 0 || savings < 0) {
-    vp9_write_bit(bc, 0);
-  } else {
-    vp9_write_bit(bc, 1);
-    for (i = 0; i < BLOCK_TYPES; ++i) {
-      for (j = !i; j < COEF_BANDS; ++j) {
-        int prev_coef_savings[ENTROPY_NODES] = {0};
-        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-          // calc probs and branch cts for this frame only
-          for (t = 0; t < ENTROPY_NODES; ++t) {
-            vp9_prob newp = new_frame_coef_probs[i][j][k][t];
-            vp9_prob *oldp = old_frame_coef_probs[i][j][k] + t;
-            const vp9_prob upd = COEF_UPDATE_PROB;
-            int s = prev_coef_savings[t];
-            int u = 0;
-            if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-              continue;
-
-#if defined(SEARCH_NEWP)
-            s = prob_diff_update_savings_search(
-                  frame_branch_ct[i][j][k][t],
-                  *oldp, &newp, upd);
-            if (s > 0 && newp != *oldp)
-              u = 1;
-#else
-            s = prob_update_savings(
-                  frame_branch_ct[i][j][k][t],
-                  *oldp, newp, upd);
-            if (s > 0)
-              u = 1;
-#endif
-            vp9_write(bc, u, upd);
-#ifdef ENTROPY_STATS
-            if (!cpi->dummy_packing)
-              ++ tree_update_hist [i][j][k][t] [u];
-#endif
-            if (u) {
-              /* send/use new probability */
-              write_prob_diff_update(bc, newp, *oldp);
-              *oldp = newp;
-            }
-          }
-        }
-      }
-    }
-  }
-}
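-
-// Worked example for the dry run above (hypothetical numbers, 1/256-bit
-// units): a node whose searched saving s = 600 contributes
-// 600 - vp9_cost_zero(upd), while a node left alone still pays the cost of
-// its "no update" flag and contributes -vp9_cost_zero(upd). The frame-level
-// update bit is written as 1 only if at least one node updates and the net
-// total is non-negative.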
-
-static void update_coef_probs(VP9_COMP* const cpi, vp9_writer* const bc) {
-  vp9_clear_system_state();
-
-  // Build the coefficient contexts based on counts collected in encode loop
-  build_coeff_contexts(cpi);
-
-  update_coef_probs_common(bc,
-                           cpi->frame_coef_probs,
-                           cpi->common.fc.coef_probs,
-                           cpi->frame_branch_ct);
-
-  update_coef_probs_common(bc,
-                           cpi->frame_hybrid_coef_probs,
-                           cpi->common.fc.hybrid_coef_probs,
-                           cpi->frame_hybrid_branch_ct);
-
-  /* do not do this if not even allowed */
-  if (cpi->common.txfm_mode != ONLY_4X4) {
-    update_coef_probs_common(bc,
-                             cpi->frame_coef_probs_8x8,
-                             cpi->common.fc.coef_probs_8x8,
-                             cpi->frame_branch_ct_8x8);
-
-    update_coef_probs_common(bc,
-                             cpi->frame_hybrid_coef_probs_8x8,
-                             cpi->common.fc.hybrid_coef_probs_8x8,
-                             cpi->frame_hybrid_branch_ct_8x8);
-  }
-
-  if (cpi->common.txfm_mode > ALLOW_8X8) {
-    update_coef_probs_common(bc,
-                             cpi->frame_coef_probs_16x16,
-                             cpi->common.fc.coef_probs_16x16,
-                             cpi->frame_branch_ct_16x16);
-    update_coef_probs_common(bc,
-                             cpi->frame_hybrid_coef_probs_16x16,
-                             cpi->common.fc.hybrid_coef_probs_16x16,
-                             cpi->frame_hybrid_branch_ct_16x16);
-  }
-}
-
-#ifdef PACKET_TESTING
-FILE *vpxlogc = 0;
-#endif
-
-static void put_delta_q(vp9_writer *bc, int delta_q) {
-  if (delta_q != 0) {
-    vp9_write_bit(bc, 1);
-    vp9_write_literal(bc, abs(delta_q), 4);
-
-    if (delta_q < 0)
-      vp9_write_bit(bc, 1);
-    else
-      vp9_write_bit(bc, 0);
-  } else
-    vp9_write_bit(bc, 0);
-}
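-
-// Example: put_delta_q(bc, -3) writes the update bit 1, the magnitude 3 as
-// a 4-bit literal, then sign bit 1; put_delta_q(bc, 0) writes a single 0.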
-
-static void decide_kf_ymode_entropy(VP9_COMP *cpi) {
-  int mode_cost[MB_MODE_COUNT];
-  int cost;
-  int bestcost = INT_MAX;
-  int bestindex = 0;
-  int i, j;
-
-  for (i = 0; i < 8; i++) {
-    vp9_cost_tokens(mode_cost, cpi->common.kf_ymode_prob[i], vp9_kf_ymode_tree);
-    cost = 0;
-    for (j = 0; j < VP9_YMODES; j++) {
-      cost += mode_cost[j] * cpi->ymode_count[j];
-    }
-#if CONFIG_SUPERBLOCKS
-    vp9_cost_tokens(mode_cost, cpi->common.sb_kf_ymode_prob[i],
-                    vp9_sb_ymode_tree);
-    for (j = 0; j < VP9_I32X32_MODES; j++) {
-      cost += mode_cost[j] * cpi->sb_ymode_count[j];
-    }
-#endif
-    if (cost < bestcost) {
-      bestindex = i;
-      bestcost = cost;
-    }
-  }
-  cpi->common.kf_ymode_probs_index = bestindex;
-}
-
-static void segment_reference_frames(VP9_COMP *cpi) {
-  VP9_COMMON *oci = &cpi->common;
-  MODE_INFO *mi = oci->mi;
-  int ref[MAX_MB_SEGMENTS] = {0};
-  int i, j;
-  int mb_index = 0;
-  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-
-  for (i = 0; i < oci->mb_rows; i++) {
-    for (j = 0; j < oci->mb_cols; j++, mb_index++) {
-      ref[mi[mb_index].mbmi.segment_id] |= (1 << mi[mb_index].mbmi.ref_frame);
-    }
-    mb_index++;
-  }
-  for (i = 0; i < MAX_MB_SEGMENTS; i++) {
-    vp9_enable_segfeature(xd, i, SEG_LVL_REF_FRAME);
-    vp9_set_segdata(xd, i, SEG_LVL_REF_FRAME, ref[i]);
-  }
-}
-
-void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
-                        unsigned long *size) {
-  int i, j;
-  VP9_HEADER oh;
-  VP9_COMMON *const pc = &cpi->common;
-  vp9_writer header_bc, residual_bc;
-  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-  int extra_bytes_packed = 0;
-
-  unsigned char *cx_data = dest;
-
-  oh.show_frame = (int) pc->show_frame;
-  oh.type = (int)pc->frame_type;
-  oh.version = pc->version;
-  oh.first_partition_length_in_bytes = 0;
-
-  cx_data += 3;
-
-#if defined(SECTIONBITS_OUTPUT)
-  Sectionbits[active_section = 1] += sizeof(VP9_HEADER) * 8 * 256;
-#endif
-
-  compute_update_table();
-
-  /* vp9_kf_default_bmode_probs() is called in vp9_setup_key_frame() once
-   * for each K frame before encode frame. pc->kf_bmode_prob doesn't get
-   * changed anywhere else. No need to call it again here. --yw
-   * vp9_kf_default_bmode_probs( pc->kf_bmode_prob);
-   */
-
-  /* every keyframe sends a start code, width, height, scale factor, clamp
-   * and color type.
-   */
-  if (oh.type == KEY_FRAME) {
-    int v;
-
-    // Start / sync code
-    cx_data[0] = 0x9D;
-    cx_data[1] = 0x01;
-    cx_data[2] = 0x2a;
-
-    v = (pc->horiz_scale << 14) | pc->Width;
-    cx_data[3] = v;
-    cx_data[4] = v >> 8;
-
-    v = (pc->vert_scale << 14) | pc->Height;
-    cx_data[5] = v;
-    cx_data[6] = v >> 8;
-
-    extra_bytes_packed = 7;
-    cx_data += extra_bytes_packed;
-
-    vp9_start_encode(&header_bc, cx_data);
-
-    // signal clr type
-    vp9_write_bit(&header_bc, pc->clr_type);
-    vp9_write_bit(&header_bc, pc->clamp_type);
-
-  } else {
-    vp9_start_encode(&header_bc, cx_data);
-  }
-
-  // Signal whether or not Segmentation is enabled
-  vp9_write_bit(&header_bc, (xd->segmentation_enabled) ? 1 : 0);
-
-  // Indicate which features are enabled
-  if (xd->segmentation_enabled) {
-    // Indicate whether or not the segmentation map is being updated.
-    vp9_write_bit(&header_bc, (xd->update_mb_segmentation_map) ? 1 : 0);
-
-    // If it is, then indicate the method that will be used.
-    if (xd->update_mb_segmentation_map) {
-      // Select the coding strategy (temporal or spatial)
-      vp9_choose_segmap_coding_method(cpi);
-      // Send the tree probabilities used to decode unpredicted
-      // macro-block segments
-      for (i = 0; i < MB_FEATURE_TREE_PROBS; i++) {
-        int data = xd->mb_segment_tree_probs[i];
-
-        if (data != 255) {
-          vp9_write_bit(&header_bc, 1);
-          vp9_write_literal(&header_bc, data, 8);
-        } else {
-          vp9_write_bit(&header_bc, 0);
-        }
-      }
-
-      // Write out the chosen coding method.
-      vp9_write_bit(&header_bc, (pc->temporal_update) ? 1 : 0);
-      if (pc->temporal_update) {
-        for (i = 0; i < PREDICTION_PROBS; i++) {
-          int data = pc->segment_pred_probs[i];
-
-          if (data != 255) {
-            vp9_write_bit(&header_bc, 1);
-            vp9_write_literal(&header_bc, data, 8);
-          } else {
-            vp9_write_bit(&header_bc, 0);
-          }
-        }
-      }
-    }
-
-    vp9_write_bit(&header_bc, (xd->update_mb_segmentation_data) ? 1 : 0);
-
-    // segment_reference_frames(cpi);
-
-    if (xd->update_mb_segmentation_data) {
-      signed char Data;
-
-      vp9_write_bit(&header_bc, (xd->mb_segment_abs_delta) ? 1 : 0);
-
-      // For each segment id...
-      for (i = 0; i < MAX_MB_SEGMENTS; i++) {
-        // For each segmentation codable feature...
-        for (j = 0; j < SEG_LVL_MAX; j++) {
-          Data = vp9_get_segdata(xd, i, j);
-
-          // If the feature is enabled...
-          if (vp9_segfeature_active(xd, i, j)) {
-            vp9_write_bit(&header_bc, 1);
-
-            // Is the segment data signed?
-            if (vp9_is_segfeature_signed(j)) {
-              // Encode the relevant feature data
-              if (Data < 0) {
-                Data = - Data;
-                vp9_write_literal(&header_bc, Data,
-                                  vp9_seg_feature_data_bits(j));
-                vp9_write_bit(&header_bc, 1);
-              } else {
-                vp9_write_literal(&header_bc, Data,
-                                  vp9_seg_feature_data_bits(j));
-                vp9_write_bit(&header_bc, 0);
-              }
-            }
-            // Unsigned data element so no sign bit needed
-            else
-              vp9_write_literal(&header_bc, Data,
-                                vp9_seg_feature_data_bits(j));
-          } else
-            vp9_write_bit(&header_bc, 0);
-        }
-      }
-    }
-  }
-
-  // Encode any updates to the probabilities used for the reference frame
-  // prediction model status flags
-  update_refpred_stats(cpi);
-  if (pc->frame_type != KEY_FRAME) {
-    for (i = 0; i < PREDICTION_PROBS; i++) {
-      if (cpi->ref_pred_probs_update[i]) {
-        vp9_write_bit(&header_bc, 1);
-        vp9_write_literal(&header_bc, pc->ref_pred_probs[i], 8);
-      } else {
-        vp9_write_bit(&header_bc, 0);
-      }
-    }
-  }
-
-#if CONFIG_SUPERBLOCKS
-  {
-    /* sb mode probability */
-    const int sb_max = (((pc->mb_rows + 1) >> 1) * ((pc->mb_cols + 1) >> 1));
-
-    pc->sb_coded = get_prob(sb_max - cpi->sb_count, sb_max);
-    vp9_write_literal(&header_bc, pc->sb_coded, 8);
-  }
-#endif
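get_prob() above lives in the common tree-coder code; a sketch of its behavior (the exact rounding is an assumption): it maps a count ratio to an 8-bit probability, clamped so the bool coder never sees 0 or 256.

    /* Sketch of get_prob(): num/den as an 8-bit probability in [1, 255].
     * Rounding is an assumption; den == 0 falls back to even odds. */
    static unsigned char get_prob_sketch(int num, int den) {
      int p;
      if (den == 0)
        return 128;
      p = (num * 256 + (den >> 1)) / den;
      return (unsigned char)(p < 1 ? 1 : (p > 255 ? 255 : p));
    }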
-
-  {
-    if (pc->txfm_mode == TX_MODE_SELECT) {
-      pc->prob_tx[0] = get_prob(cpi->txfm_count[0] + cpi->txfm_count_8x8p[0],
-                                cpi->txfm_count[0] + cpi->txfm_count[1] + cpi->txfm_count[2] +
-                                cpi->txfm_count_8x8p[0] + cpi->txfm_count_8x8p[1]);
-      pc->prob_tx[1] = get_prob(cpi->txfm_count[1], cpi->txfm_count[1] + cpi->txfm_count[2]);
-    } else {
-      pc->prob_tx[0] = 128;
-      pc->prob_tx[1] = 128;
-    }
-    vp9_write_literal(&header_bc, pc->txfm_mode, 2);
-    if (pc->txfm_mode == TX_MODE_SELECT) {
-      vp9_write_literal(&header_bc, pc->prob_tx[0], 8);
-      vp9_write_literal(&header_bc, pc->prob_tx[1], 8);
-    }
-  }
-
-  // Encode the loop filter level and type
-  vp9_write_bit(&header_bc, pc->filter_type);
-  vp9_write_literal(&header_bc, pc->filter_level, 6);
-  vp9_write_literal(&header_bc, pc->sharpness_level, 3);
-
-  // Write out loop filter deltas applied at the MB level based on mode or ref frame (if they are enabled).
-  vp9_write_bit(&header_bc, (xd->mode_ref_lf_delta_enabled) ? 1 : 0);
-
-  if (xd->mode_ref_lf_delta_enabled) {
-    // Do the deltas need to be updated
-    int send_update = xd->mode_ref_lf_delta_update;
-
-    vp9_write_bit(&header_bc, send_update);
-    if (send_update) {
-      int Data;
-
-      // Send update for the reference frame deltas
-      for (i = 0; i < MAX_REF_LF_DELTAS; i++) {
-        Data = xd->ref_lf_deltas[i];
-
-        // Frame level data
-        if (xd->ref_lf_deltas[i] != xd->last_ref_lf_deltas[i]) {
-          xd->last_ref_lf_deltas[i] = xd->ref_lf_deltas[i];
-          vp9_write_bit(&header_bc, 1);
-
-          if (Data > 0) {
-            vp9_write_literal(&header_bc, (Data & 0x3F), 6);
-            vp9_write_bit(&header_bc, 0);    // sign
-          } else {
-            Data = -Data;
-            vp9_write_literal(&header_bc, (Data & 0x3F), 6);
-            vp9_write_bit(&header_bc, 1);    // sign
-          }
-        } else {
-          vp9_write_bit(&header_bc, 0);
-        }
-      }
-
-      // Send update for the mode based deltas
-      for (i = 0; i < MAX_MODE_LF_DELTAS; i++) {
-        Data = xd->mode_lf_deltas[i];
-
-        if (xd->mode_lf_deltas[i] != xd->last_mode_lf_deltas[i]) {
-          xd->last_mode_lf_deltas[i] = xd->mode_lf_deltas[i];
-          vp9_write_bit(&header_bc, 1);
-
-          if (Data > 0) {
-            vp9_write_literal(&header_bc, (Data & 0x3F), 6);
-            vp9_write_bit(&header_bc, 0);    // sign
-          } else {
-            Data = -Data;
-            vp9_write_literal(&header_bc, (Data & 0x3F), 6);
-            vp9_write_bit(&header_bc, 1);    // sign
-          }
-        } else {
-          vp9_write_bit(&header_bc, 0);
-        }
-      }
-    }
-  }
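Each delta is thus sent as a 6-bit magnitude followed by a sign bit. A sketch of the counterpart read, with illustrative decoder-side names:

    /* Sketch: read back one loop-filter delta (names are illustrative;
     * the real decoder has its own reader). */
    static int read_lf_delta(BOOL_DECODER *bd) {
      int data = vp9_read_literal(bd, 6);  /* magnitude, 0..63     */
      if (vp9_read_bit(bd))                /* sign: 1 for negative */
        data = -data;
      return data;
    }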
-
-  // Signal whether the multi-token partition is enabled; currently always
-  // coded as 0 (a single partition).
-  // vp9_write_literal(&header_bc, pc->multi_token_partition, 2);
-  vp9_write_literal(&header_bc, 0, 2);
-
-  // Frame Q baseline quantizer index
-  vp9_write_literal(&header_bc, pc->base_qindex, QINDEX_BITS);
-
-  // Transmit DC, second order and UV quantizer delta information
-  put_delta_q(&header_bc, pc->y1dc_delta_q);
-  put_delta_q(&header_bc, pc->y2dc_delta_q);
-  put_delta_q(&header_bc, pc->y2ac_delta_q);
-  put_delta_q(&header_bc, pc->uvdc_delta_q);
-  put_delta_q(&header_bc, pc->uvac_delta_q);
-
-  // On a key frame, all reference buffers are updated from the new key frame
-  if (pc->frame_type != KEY_FRAME) {
-    // Should the GF or ARF be updated using the transmitted frame or buffer
-    vp9_write_bit(&header_bc, pc->refresh_golden_frame);
-    vp9_write_bit(&header_bc, pc->refresh_alt_ref_frame);
-
-    // For inter frames the current default behavior is that when
-    // cm->refresh_golden_frame is set we copy the old GF over to
-    // the ARF buffer. This is purely an encoder decision at present.
-    if (pc->refresh_golden_frame)
-      pc->copy_buffer_to_arf  = 2;
-
-    // If not being updated from the current frame, should the GF or ARF be
-    // updated from another buffer?
-    if (!pc->refresh_golden_frame)
-      vp9_write_literal(&header_bc, pc->copy_buffer_to_gf, 2);
-
-    if (!pc->refresh_alt_ref_frame)
-      vp9_write_literal(&header_bc, pc->copy_buffer_to_arf, 2);
-
-    // Indicate reference frame sign bias for Golden and ARF frames (always 0 for last frame buffer)
-    vp9_write_bit(&header_bc, pc->ref_frame_sign_bias[GOLDEN_FRAME]);
-    vp9_write_bit(&header_bc, pc->ref_frame_sign_bias[ALTREF_FRAME]);
-
-    // Signal whether to allow high MV precision
-    vp9_write_bit(&header_bc, (xd->allow_high_precision_mv) ? 1 : 0);
-    if (pc->mcomp_filter_type == SWITCHABLE) {
-      /* Check to see if only one of the filters is actually used */
-      int count[VP9_SWITCHABLE_FILTERS];
-      int i, j, c = 0;
-      for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
-        count[i] = 0;
-        for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) {
-          count[i] += cpi->switchable_interp_count[j][i];
-        }
-        c += (count[i] > 0);
-      }
-      if (c == 1) {
-        /* Only one filter is used. So set the filter at frame level */
-        for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
-          if (count[i]) {
-            pc->mcomp_filter_type = vp9_switchable_interp[i];
-            break;
-          }
-        }
-      }
-    }
-    // Signal the type of subpel filter to use
-    vp9_write_bit(&header_bc, (pc->mcomp_filter_type == SWITCHABLE));
-    if (pc->mcomp_filter_type != SWITCHABLE)
-      vp9_write_literal(&header_bc, (pc->mcomp_filter_type), 2);
-  }
-
-  vp9_write_bit(&header_bc, pc->refresh_entropy_probs);
-
-  if (pc->frame_type != KEY_FRAME)
-    vp9_write_bit(&header_bc, pc->refresh_last_frame);
-
-#ifdef ENTROPY_STATS
-  if (pc->frame_type == INTER_FRAME)
-    active_section = 0;
-  else
-    active_section = 7;
-#endif
-
-  vp9_clear_system_state();  // __asm emms;
-
-  vp9_copy(cpi->common.fc.pre_coef_probs, cpi->common.fc.coef_probs);
-  vp9_copy(cpi->common.fc.pre_hybrid_coef_probs, cpi->common.fc.hybrid_coef_probs);
-  vp9_copy(cpi->common.fc.pre_coef_probs_8x8, cpi->common.fc.coef_probs_8x8);
-  vp9_copy(cpi->common.fc.pre_hybrid_coef_probs_8x8, cpi->common.fc.hybrid_coef_probs_8x8);
-  vp9_copy(cpi->common.fc.pre_coef_probs_16x16, cpi->common.fc.coef_probs_16x16);
-  vp9_copy(cpi->common.fc.pre_hybrid_coef_probs_16x16, cpi->common.fc.hybrid_coef_probs_16x16);
-  vp9_copy(cpi->common.fc.pre_ymode_prob, cpi->common.fc.ymode_prob);
-  vp9_copy(cpi->common.fc.pre_uv_mode_prob, cpi->common.fc.uv_mode_prob);
-  vp9_copy(cpi->common.fc.pre_bmode_prob, cpi->common.fc.bmode_prob);
-  vp9_copy(cpi->common.fc.pre_sub_mv_ref_prob, cpi->common.fc.sub_mv_ref_prob);
-  vp9_copy(cpi->common.fc.pre_mbsplit_prob, cpi->common.fc.mbsplit_prob);
-  vp9_copy(cpi->common.fc.pre_i8x8_mode_prob, cpi->common.fc.i8x8_mode_prob);
-  cpi->common.fc.pre_nmvc = cpi->common.fc.nmvc;
-  vp9_zero(cpi->sub_mv_ref_count);
-  vp9_zero(cpi->mbsplit_count);
-  vp9_zero(cpi->common.fc.mv_ref_ct)
-  vp9_zero(cpi->common.fc.mv_ref_ct_a)
-
-  update_coef_probs(cpi, &header_bc);
-
-#ifdef ENTROPY_STATS
-  active_section = 2;
-#endif
-
-  // Write out the mb_no_coeff_skip flag
-  vp9_write_bit(&header_bc, pc->mb_no_coeff_skip);
-  if (pc->mb_no_coeff_skip) {
-    int k;
-
-    vp9_update_skip_probs(cpi);
-    for (k = 0; k < MBSKIP_CONTEXTS; ++k)
-      vp9_write_literal(&header_bc, pc->mbskip_pred_probs[k], 8);
-  }
-
-  if (pc->frame_type == KEY_FRAME) {
-    if (!pc->kf_ymode_probs_update) {
-      vp9_write_literal(&header_bc, pc->kf_ymode_probs_index, 3);
-    }
-  } else {
-    // Update the probabilities used to encode reference frame data
-    update_ref_probs(cpi);
-
-#ifdef ENTROPY_STATS
-    active_section = 1;
-#endif
-
-#if CONFIG_PRED_FILTER
-    // Write the prediction filter mode used for this frame
-    vp9_write_literal(&header_bc, pc->pred_filter_mode, 2);
-
-    // Write prediction filter on/off probability if signaling at MB level
-    if (pc->pred_filter_mode == 2)
-      vp9_write_literal(&header_bc, pc->prob_pred_filter_off, 8);
-
-#endif
-    if (pc->mcomp_filter_type == SWITCHABLE)
-      update_switchable_interp_probs(cpi, &header_bc);
-
-    vp9_write_literal(&header_bc, pc->prob_intra_coded, 8);
-    vp9_write_literal(&header_bc, pc->prob_last_coded, 8);
-    vp9_write_literal(&header_bc, pc->prob_gf_coded, 8);
-
-    {
-      const int comp_pred_mode = cpi->common.comp_pred_mode;
-      const int use_compound_pred = (comp_pred_mode != SINGLE_PREDICTION_ONLY);
-      const int use_hybrid_pred = (comp_pred_mode == HYBRID_PREDICTION);
-
-      vp9_write(&header_bc, use_compound_pred, 128);
-      if (use_compound_pred) {
-        vp9_write(&header_bc, use_hybrid_pred, 128);
-        if (use_hybrid_pred) {
-          for (i = 0; i < COMP_PRED_CONTEXTS; i++) {
-            pc->prob_comppred[i] = get_binary_prob(cpi->single_pred_count[i],
-                                                   cpi->comp_pred_count[i]);
-            vp9_write_literal(&header_bc, pc->prob_comppred[i], 8);
-          }
-        }
-      }
-    }
-
-    update_mbintra_mode_probs(cpi, &header_bc);
-
-#if CONFIG_NEW_MVREF
-    // Temporary default probabilities for encoding the MV ref id signal
-    vpx_memset(xd->mb_mv_ref_id_probs, 192, sizeof(xd->mb_mv_ref_id_probs));
-#endif
-
-    vp9_write_nmvprobs(cpi, xd->allow_high_precision_mv, &header_bc);
-  }
-
-  vp9_stop_encode(&header_bc);
-
-  oh.first_partition_length_in_bytes = header_bc.pos;
-
-  /* update frame tag */
-  {
-    int v = (oh.first_partition_length_in_bytes << 5) |
-            (oh.show_frame << 4) |
-            (oh.version << 1) |
-            oh.type;
-
-    dest[0] = v;
-    dest[1] = v >> 8;
-    dest[2] = v >> 16;
-  }
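The matching parse of the 3-byte frame tag, as a sketch using the same bit layout assembled above:

    /* Sketch: unpack the little-endian frame tag built above. */
    static void parse_frame_tag(const unsigned char *d,
                                int *type, int *version,
                                int *show_frame, int *first_part_size) {
      unsigned int v = d[0] | (d[1] << 8) | (d[2] << 16);
      *type            = v & 1;          /* 1 bit   */
      *version         = (v >> 1) & 7;   /* 3 bits  */
      *show_frame      = (v >> 4) & 1;   /* 1 bit   */
      *first_part_size = v >> 5;         /* 19 bits */
    }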
-
-  *size = VP9_HEADER_SIZE + extra_bytes_packed + header_bc.pos;
-  vp9_start_encode(&residual_bc, cx_data + header_bc.pos);
-
-  if (pc->frame_type == KEY_FRAME) {
-    decide_kf_ymode_entropy(cpi);
-    write_kfmodes(cpi, &residual_bc);
-  } else {
-    pack_inter_mode_mvs(cpi, &residual_bc);
-    vp9_update_mode_context(&cpi->common);
-  }
-
-
-  vp9_stop_encode(&residual_bc);
-
-  *size += residual_bc.pos;
-
-}
-
-#ifdef ENTROPY_STATS
-void print_tree_update_probs() {
-  int i, j, k, l;
-  FILE *f = fopen("coefupdprob.h", "w");
-  int Sum;
-  fprintf(f, "\n/* Update probabilities for token entropy tree. */\n\n");
-
-  fprintf(f, "const vp9_prob\n"
-          "vp9_coef_update_probs[BLOCK_TYPES]\n"
-          "                     [COEF_BANDS]\n"
-          "                     [PREV_COEF_CONTEXTS]\n"
-          "                     [ENTROPY_NODES] = {\n");
-  for (i = 0; i < BLOCK_TYPES; i++) {
-    fprintf(f, "  { \n");
-    for (j = 0; j < COEF_BANDS; j++) {
-      fprintf(f, "    {\n");
-      for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
-        fprintf(f, "      {");
-        for (l = 0; l < ENTROPY_NODES; l++) {
-          fprintf(f, "%3ld, ",
-              get_binary_prob(tree_update_hist[i][j][k][l][0],
-                              tree_update_hist[i][j][k][l][1]));
-        }
-        fprintf(f, "},\n");
-      }
-      fprintf(f, "    },\n");
-    }
-    fprintf(f, "  },\n");
-  }
-  fprintf(f, "};\n");
-
-  fprintf(f, "const vp9_prob\n"
-          "vp9_coef_update_probs_8x8[BLOCK_TYPES_8X8]\n"
-          "                         [COEF_BANDS]\n"
-          "                         [PREV_COEF_CONTEXTS]\n"
-          "                         [ENTROPY_NODES] = {\n");
-  for (i = 0; i < BLOCK_TYPES_8X8; i++) {
-    fprintf(f, "  { \n");
-    for (j = 0; j < COEF_BANDS; j++) {
-      fprintf(f, "    {\n");
-      for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
-        fprintf(f, "      {");
-        for (l = 0; l < MAX_ENTROPY_TOKENS - 1; l++) {
-          fprintf(f, "%3ld, ",
-              get_binary_prob(tree_update_hist_8x8[i][j][k][l][0],
-                              tree_update_hist_8x8[i][j][k][l][1]));
-        }
-        fprintf(f, "},\n");
-      }
-      fprintf(f, "    },\n");
-    }
-    fprintf(f, "  },\n");
-  }
-  fprintf(f, "};\n");
-
-  fprintf(f, "const vp9_prob\n"
-          "vp9_coef_update_probs_16x16[BLOCK_TYPES_16X16]\n"
-          "                           [COEF_BANDS]\n"
-          "                           [PREV_COEF_CONTEXTS]\n"
-          "                           [ENTROPY_NODES] = {\n");
-  for (i = 0; i < BLOCK_TYPES_16X16; i++) {
-    fprintf(f, "  { \n");
-    for (j = 0; j < COEF_BANDS; j++) {
-      fprintf(f, "    {\n");
-      for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
-        fprintf(f, "      {");
-        for (l = 0; l < MAX_ENTROPY_TOKENS - 1; l++) {
-          fprintf(f, "%3ld, ",
-              get_binary_prob(tree_update_hist_16x16[i][j][k][l][0],
-                              tree_update_hist_16x16[i][j][k][l][1]));
-        }
-        fprintf(f, "},\n");
-      }
-      fprintf(f, "    },\n");
-    }
-    fprintf(f, "  },\n");
-  }
-  fprintf(f, "};\n");
-
-  fclose(f);
-  f = fopen("treeupdate.bin", "wb");
-  fwrite(tree_update_hist, sizeof(tree_update_hist), 1, f);
-  fwrite(tree_update_hist_8x8, sizeof(tree_update_hist_8x8), 1, f);
-  fwrite(tree_update_hist_16x16, sizeof(tree_update_hist_16x16), 1, f);
-  fclose(f);
-}
-#endif
--- a/vp8/encoder/bitstream.h
+++ /dev/null
@@ -1,17 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_BITSTREAM_H
-#define __INC_BITSTREAM_H
-
-void vp9_update_skip_probs(VP9_COMP *cpi);
-
-#endif
--- a/vp8/encoder/block.h
+++ /dev/null
@@ -1,184 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_BLOCK_H
-#define __INC_BLOCK_H
-
-#include "vp8/common/onyx.h"
-#include "vp8/common/entropymv.h"
-#include "vp8/common/entropy.h"
-#include "vpx_ports/mem.h"
-#include "vp8/common/onyxc_int.h"
-
-// motion search site
-typedef struct {
-  MV mv;
-  int offset;
-} search_site;
-
-typedef struct block {
-  // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries
-  short *src_diff;
-  short *coeff;
-
-  // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries
-  short *quant;
-  short *quant_fast;      // fast quant deprecated for now
-  unsigned char *quant_shift;
-  short *zbin;
-  short *zbin_8x8;
-  short *zbin_16x16;
-  short *zrun_zbin_boost;
-  short *zrun_zbin_boost_8x8;
-  short *zrun_zbin_boost_16x16;
-  short *round;
-
-  // Zbin Over Quant value
-  short zbin_extra;
-
-  unsigned char **base_src;
-  unsigned char **base_second_src;
-  int src;
-  int src_stride;
-
-  int eob_max_offset;
-  int eob_max_offset_8x8;
-  int eob_max_offset_16x16;
-} BLOCK;
-
-typedef struct {
-  int count;
-  struct {
-    B_PREDICTION_MODE mode;
-    int_mv mv;
-    int_mv second_mv;
-  } bmi[16];
-} PARTITION_INFO;
-
-// Structure to hold snapshot of coding context during the mode picking process
-// TODO Do we need all of these?
-typedef struct {
-  MODE_INFO mic;
-  PARTITION_INFO partition_info;
-  int_mv best_ref_mv;
-  int_mv second_best_ref_mv;
-#if CONFIG_NEWBESTREFMV || CONFIG_NEW_MVREF
-  int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REFS];
-#endif
-  int rate;
-  int distortion;
-  int64_t intra_error;
-  int best_mode_index;
-  int rddiv;
-  int rdmult;
-  int hybrid_pred_diff;
-  int comp_pred_diff;
-  int single_pred_diff;
-  int64_t txfm_rd_diff[NB_TXFM_MODES];
-} PICK_MODE_CONTEXT;
-
-typedef struct macroblock {
-  DECLARE_ALIGNED(16, short, src_diff[400]);  // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y
-  DECLARE_ALIGNED(16, short, coeff[400]);     // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y
-  DECLARE_ALIGNED(16, unsigned char, thismb[256]);    // 16x16 Y
-
-  unsigned char *thismb_ptr;
-  // 16 Y blocks, 4 U blocks, 4 V blocks,
-  // 1 DC 2nd order block each with 16 entries
-  BLOCK block[25];
-
-  YV12_BUFFER_CONFIG src;
-
-  MACROBLOCKD e_mbd;
-  PARTITION_INFO *partition_info; /* work pointer */
-  PARTITION_INFO *pi;   /* Corresponds to upper left visible macroblock */
-  PARTITION_INFO *pip;  /* Base of allocated array */
-
-  search_site *ss;
-  int ss_count;
-  int searches_per_step;
-
-  int errorperbit;
-  int sadperbit16;
-  int sadperbit4;
-  int rddiv;
-  int rdmult;
-  unsigned int *mb_activity_ptr;
-  int *mb_norm_activity_ptr;
-  signed int act_zbin_adj;
-
-  int nmvjointcost[MV_JOINTS];
-  int nmvcosts[2][MV_VALS];
-  int *nmvcost[2];
-  int nmvcosts_hp[2][MV_VALS];
-  int *nmvcost_hp[2];
-
-  int nmvjointsadcost[MV_JOINTS];
-  int nmvsadcosts[2][MV_VALS];
-  int *nmvsadcost[2];
-  int nmvsadcosts_hp[2][MV_VALS];
-  int *nmvsadcost_hp[2];
-
-  int mbmode_cost[2][MB_MODE_COUNT];
-  int intra_uv_mode_cost[2][MB_MODE_COUNT];
-  int bmode_costs[VP9_BINTRAMODES][VP9_BINTRAMODES][VP9_BINTRAMODES];
-  int i8x8_mode_costs[MB_MODE_COUNT];
-  int inter_bmode_costs[B_MODE_COUNT];
-  int switchable_interp_costs[VP9_SWITCHABLE_FILTERS + 1]
-                             [VP9_SWITCHABLE_FILTERS];
-
-  // These define limits to motion vector components to prevent them
-  // from extending outside the UMV borders
-  int mv_col_min;
-  int mv_col_max;
-  int mv_row_min;
-  int mv_row_max;
-
-  int skip;
-
-  int encode_breakout;
-
-  // char * gf_active_ptr;
-  signed char *gf_active_ptr;
-
-  unsigned char *active_ptr;
-
-  unsigned int token_costs[TX_SIZE_MAX][BLOCK_TYPES][COEF_BANDS]
-    [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
-  unsigned int hybrid_token_costs[TX_SIZE_MAX][BLOCK_TYPES][COEF_BANDS]
-    [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
-
-  int optimize;
-
-  // Structure to hold context for each of the 4 MBs within a SB:
-  // when encoded as 4 independent MBs:
-  PICK_MODE_CONTEXT mb_context[4];
-#if CONFIG_SUPERBLOCKS
-  // when 4 MBs share coding parameters:
-  PICK_MODE_CONTEXT sb_context[4];
-#endif
-
-  void (*vp9_short_fdct4x4)(short *input, short *output, int pitch);
-  void (*vp9_short_fdct8x4)(short *input, short *output, int pitch);
-  void (*short_walsh4x4)(short *input, short *output, int pitch);
-  void (*quantize_b_4x4)(BLOCK *b, BLOCKD *d);
-  void (*quantize_b_4x4_pair)(BLOCK *b1, BLOCK *b2, BLOCKD *d0, BLOCKD *d1);
-  void (*vp9_short_fdct8x8)(short *input, short *output, int pitch);
-  void (*vp9_short_fdct16x16)(short *input, short *output, int pitch);
-  void (*short_fhaar2x2)(short *input, short *output, int pitch);
-  void (*quantize_b_16x16)(BLOCK *b, BLOCKD *d);
-  void (*quantize_b_8x8)(BLOCK *b, BLOCKD *d);
-  void (*quantize_b_2x2)(BLOCK *b, BLOCKD *d);
-
-} MACROBLOCK;
-
-
-#endif
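The transform and quantize members of MACROBLOCK are function pointers so that platform-specific versions can be plugged in at initialization; a sketch of the dispatch (the assignment site is an assumption, the C fallbacks on the right are defined elsewhere in this patch):

    /* Sketch: wire up and call the C fallbacks. */
    x->vp9_short_fdct4x4 = vp9_short_fdct4x4_c;
    x->vp9_short_fdct8x8 = vp9_short_fdct8x8_c;
    x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);  /* pitch in bytes */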
--- a/vp8/encoder/boolhuff.c
+++ /dev/null
@@ -1,153 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "boolhuff.h"
-
-#if defined(SECTIONBITS_OUTPUT)
-unsigned __int64 Sectionbits[500];
-
-#endif
-
-#ifdef ENTROPY_STATS
-unsigned int active_section = 0;
-#endif
-
-const unsigned int vp9_prob_cost[256] = {
-  2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129, 1099, 1072, 1046,
-  1023, 1000,  979,  959,  940,  922,  905,  889,  873,  858,  843,  829,  816,  803,  790,  778,
-  767,  755,  744,  733,  723,  713,  703,  693,  684,  675,  666,  657,  649,  641,  633,  625,
-  617,  609,  602,  594,  587,  580,  573,  567,  560,  553,  547,  541,  534,  528,  522,  516,
-  511,  505,  499,  494,  488,  483,  477,  472,  467,  462,  457,  452,  447,  442,  437,  433,
-  428,  424,  419,  415,  410,  406,  401,  397,  393,  389,  385,  381,  377,  373,  369,  365,
-  361,  357,  353,  349,  346,  342,  338,  335,  331,  328,  324,  321,  317,  314,  311,  307,
-  304,  301,  297,  294,  291,  288,  285,  281,  278,  275,  272,  269,  266,  263,  260,  257,
-  255,  252,  249,  246,  243,  240,  238,  235,  232,  229,  227,  224,  221,  219,  216,  214,
-  211,  208,  206,  203,  201,  198,  196,  194,  191,  189,  186,  184,  181,  179,  177,  174,
-  172,  170,  168,  165,  163,  161,  159,  156,  154,  152,  150,  148,  145,  143,  141,  139,
-  137,  135,  133,  131,  129,  127,  125,  123,  121,  119,  117,  115,  113,  111,  109,  107,
-  105,  103,  101,   99,   97,   95,   93,   92,   90,   88,   86,   84,   82,   81,   79,   77,
-  75,   73,   72,   70,   68,   66,   65,   63,   61,   60,   58,   56,   55,   53,   51,   50,
-  48,   46,   45,   43,   41,   40,   38,   37,   35,   33,   32,   30,   29,   27,   25,   24,
-  22,   21,   19,   18,   16,   15,   13,   12,   10,    9,    7,    6,    4,    3,    1,   1
-};
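The table stores bit costs in 1/256-bit units: entry p is roughly -256*log2(p/256). A sketch of how such a table can be generated (the exact rounding of the table above is an assumption; the values below land within a unit of it):

    #include <math.h>
    /* Sketch: regenerate a vp9_prob_cost-style table.  Entry 0 mirrors
     * entry 1 since probability 0 never reaches the coder. */
    static void gen_prob_cost(unsigned int cost[256]) {
      int p;
      for (p = 1; p < 256; p++)
        cost[p] = (unsigned int)(-256.0 * log2(p / 256.0));
      cost[0] = cost[1];
    }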
-
-void vp9_start_encode(BOOL_CODER *br, unsigned char *source) {
-
-  br->lowvalue = 0;
-  br->range    = 255;
-  br->value    = 0;
-  br->count    = -24;
-  br->buffer   = source;
-  br->pos      = 0;
-}
-
-void vp9_stop_encode(BOOL_CODER *br) {
-  int i;
-
-  for (i = 0; i < 32; i++)
-    encode_bool(br, 0, 128);
-}
-
-
-void vp9_encode_value(BOOL_CODER *br, int data, int bits) {
-  int bit;
-
-  for (bit = bits - 1; bit >= 0; bit--)
-    encode_bool(br, (1 & (data >> bit)), 0x80);
-}
-
-int vp9_recenter_nonneg(int v, int m) {
-  if (v > (m << 1)) return v;
-  else if (v >= m) return ((v - m) << 1);
-  else return ((m - v) << 1) - 1;
-}
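Recentering interleaves values around the prediction m so that values close to m get small codes: v = m maps to 0, m-1 and m+1 map to 1 and 2, and so on, while v > 2m passes through unchanged. The inverse, shown as a sketch for reference (the decoder keeps its own copy):

    /* Sketch: inverse of vp9_recenter_nonneg(). */
    static int inv_recenter_nonneg(int v, int m) {
      if (v > (m << 1)) return v;                   /* pass-through  */
      else if (v & 1)   return m - ((v + 1) >> 1);  /* odd: below m  */
      else              return m + (v >> 1);        /* even: above m */
    }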
-
-static int get_unsigned_bits(unsigned num_values) {
-  int cat = 0;
-  if ((num_values--) <= 1) return 0;
-  while (num_values > 0) {
-    cat++;
-    num_values >>= 1;
-  }
-  return cat;
-}
-
-void vp9_encode_uniform(BOOL_CODER *br, int v, int n) {
-  int l = get_unsigned_bits(n);
-  int m;
-  if (l == 0) return;
-  m = (1 << l) - n;
-  if (v < m)
-    vp9_encode_value(br, v, l - 1);
-  else {
-    vp9_encode_value(br, m + ((v - m) >> 1), l - 1);
-    vp9_encode_value(br, (v - m) & 1, 1);
-  }
-}
-
-int vp9_count_uniform(int v, int n) {
-  int l = get_unsigned_bits(n);
-  int m;
-  if (l == 0) return 0;
-  m = (1 << l) - n;
-  if (v < m)
-    return l - 1;
-  else
-    return l;
-}
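For an alphabet of n symbols where n is not a power of two, this quasi-uniform code gives the first m = 2^l - n values l-1 bits and the rest l bits. A quick standalone check of the lengths (test code, an assumption, not part of the library):

    #include <assert.h>
    /* n = 192: l = 8, m = 64, so values below 64 take 7 bits, the rest 8. */
    static void check_uniform_costs(void) {
      assert(vp9_count_uniform(10, 192) == 7);
      assert(vp9_count_uniform(100, 192) == 8);
    }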
-
-void vp9_encode_term_subexp(BOOL_CODER *br, int word, int k, int num_syms) {
-  int i = 0;
-  int mk = 0;
-  while (1) {
-    int b = (i ? k + i - 1 : k);
-    int a = (1 << b);
-    if (num_syms <= mk + 3 * a) {
-      vp9_encode_uniform(br, word - mk, num_syms - mk);
-      break;
-    } else {
-      int t = (word >= mk + a);
-      vp9_encode_value(br, t, 1);
-      if (t) {
-        i = i + 1;
-        mk += a;
-      } else {
-        vp9_encode_value(br, word - mk, b);
-        break;
-      }
-    }
-  }
-}
-
-int vp9_count_term_subexp(int word, int k, int num_syms) {
-  int count = 0;
-  int i = 0;
-  int mk = 0;
-  while (1) {
-    int b = (i ? k + i - 1 : k);
-    int a = (1 << b);
-    if (num_syms <= mk + 3 * a) {
-      count += vp9_count_uniform(word - mk, num_syms - mk);
-      break;
-    } else {
-      int t = (word >= mk + a);
-      count++;
-      if (t) {
-        i = i + 1;
-        mk += a;
-      } else {
-        count += b;
-        break;
-      }
-    }
-  }
-  return count;
-}
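The terminated subexponential code spends one flag bit per bucket, with bucket sizes doubling after the first two, and falls back to the quasi-uniform code for the tail. A small standalone check of the resulting lengths (test code, an assumption):

    #include <assert.h>
    /* k = 4, 256 symbols: 0..15 cost 1+4 bits, 16..31 cost 2+4 bits,
     * 32..63 cost 3+5 bits. */
    static void check_subexp_costs(void) {
      assert(vp9_count_term_subexp(0,  4, 256) == 5);
      assert(vp9_count_term_subexp(17, 4, 256) == 6);
      assert(vp9_count_term_subexp(40, 4, 256) == 8);
    }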
--- a/vp8/encoder/boolhuff.h
+++ /dev/null
@@ -1,111 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/****************************************************************************
-*
-*   Module Title :     boolhuff.h
-*
-*   Description  :     Bool Coder header file.
-*
-****************************************************************************/
-#ifndef __INC_BOOLHUFF_H
-#define __INC_BOOLHUFF_H
-
-#include "vpx_ports/mem.h"
-
-typedef struct {
-  unsigned int lowvalue;
-  unsigned int range;
-  unsigned int value;
-  int count;
-  unsigned int pos;
-  unsigned char *buffer;
-
-  // Variables used to track bit costs without outputting to the bitstream
-  unsigned int  measure_cost;
-  unsigned long bit_counter;
-} BOOL_CODER;
-
-extern void vp9_start_encode(BOOL_CODER *bc, unsigned char *buffer);
-
-extern void vp9_encode_value(BOOL_CODER *br, int data, int bits);
-extern void vp9_stop_encode(BOOL_CODER *bc);
-extern const unsigned int vp9_prob_cost[256];
-
-extern void vp9_encode_uniform(BOOL_CODER *bc, int v, int n);
-extern void vp9_encode_term_subexp(BOOL_CODER *bc, int v, int k, int n);
-extern int vp9_count_uniform(int v, int n);
-extern int vp9_count_term_subexp(int v, int k, int n);
-extern int vp9_recenter_nonneg(int v, int m);
-
-DECLARE_ALIGNED(16, extern const unsigned char, vp9_norm[256]);
-
-
-static void encode_bool(BOOL_CODER *br, int bit, int probability) {
-  unsigned int split;
-  int count = br->count;
-  unsigned int range = br->range;
-  unsigned int lowvalue = br->lowvalue;
-  register unsigned int shift;
-
-#ifdef ENTROPY_STATS
-#if defined(SECTIONBITS_OUTPUT)
-
-  if (bit)
-    Sectionbits[active_section] += vp9_prob_cost[255 - probability];
-  else
-    Sectionbits[active_section] += vp9_prob_cost[probability];
-
-#endif
-#endif
-
-  split = 1 + (((range - 1) * probability) >> 8);
-
-  range = split;
-
-  if (bit) {
-    lowvalue += split;
-    range = br->range - split;
-  }
-
-  shift = vp9_norm[range];
-
-  range <<= shift;
-  count += shift;
-
-  if (count >= 0) {
-    int offset = shift - count;
-
-    if ((lowvalue << (offset - 1)) & 0x80000000) {
-      int x = br->pos - 1;
-
-      while (x >= 0 && br->buffer[x] == 0xff) {
-        br->buffer[x] = (unsigned char)0;
-        x--;
-      }
-
-      br->buffer[x] += 1;
-    }
-
-    br->buffer[br->pos++] = (lowvalue >> (24 - offset));
-    lowvalue <<= offset;
-    shift = count;
-    lowvalue &= 0xffffff;
-    count -= 8;
-  }
-
-  lowvalue <<= shift;
-  br->count = count;
-  br->lowvalue = lowvalue;
-  br->range = range;
-}
-
-#endif
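Typical use of this coder pairs vp9_start_encode() with a series of encode_bool()/vp9_encode_value() calls and a final flush; a minimal sketch (buffer size and values are illustrative):

    /* Minimal bool-coder usage sketch. */
    void bool_coder_demo(void) {
      unsigned char buf[64];
      BOOL_CODER bc;
      vp9_start_encode(&bc, buf);
      encode_bool(&bc, 1, 128);     /* one bit at even odds       */
      encode_bool(&bc, 0, 200);     /* a likely zero: under 1 bit */
      vp9_encode_value(&bc, 5, 3);  /* 3 raw bits, MSB first      */
      vp9_stop_encode(&bc);         /* flush; bc.pos = bytes used */
    }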
--- a/vp8/encoder/dct.c
+++ /dev/null
@@ -1,1109 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <assert.h>
-#include <math.h>
-#include "vpx_ports/config.h"
-#include "vp8/common/idct.h"
-#include "vp8/common/systemdependent.h"
-
-#include "vp8/common/blockd.h"
-
-// TODO: these transforms can be converted into integer forms to reduce
-//       the complexity
-static const float dct_4[16] = {
-  0.500000000000000,  0.500000000000000,  0.500000000000000,  0.500000000000000,
-  0.653281482438188,  0.270598050073099, -0.270598050073099, -0.653281482438188,
-  0.500000000000000, -0.500000000000000, -0.500000000000000,  0.500000000000000,
-  0.270598050073099, -0.653281482438188,  0.653281482438188, -0.270598050073099
-};
-
-static const float adst_4[16] = {
-  0.228013428883779,  0.428525073124360,  0.577350269189626,  0.656538502008139,
-  0.577350269189626,  0.577350269189626,  0.000000000000000, -0.577350269189626,
-  0.656538502008139, -0.228013428883779, -0.577350269189626,  0.428525073124359,
-  0.428525073124360, -0.656538502008139,  0.577350269189626, -0.228013428883779
-};
-
-static const float dct_8[64] = {
-  0.353553390593274,   0.353553390593274,   0.353553390593274,   0.353553390593274,
-  0.353553390593274,   0.353553390593274,   0.353553390593274,   0.353553390593274,
-  0.490392640201615,   0.415734806151273,   0.277785116509801,   0.097545161008064,
- -0.097545161008064,  -0.277785116509801,  -0.415734806151273,  -0.490392640201615,
-  0.461939766255643,   0.191341716182545,  -0.191341716182545,  -0.461939766255643,
- -0.461939766255643,  -0.191341716182545,   0.191341716182545,   0.461939766255643,
-  0.415734806151273,  -0.097545161008064,  -0.490392640201615,  -0.277785116509801,
-  0.277785116509801,   0.490392640201615,   0.097545161008064,  -0.415734806151273,
-  0.353553390593274,  -0.353553390593274,  -0.353553390593274,   0.353553390593274,
-  0.353553390593274,  -0.353553390593274,  -0.353553390593274,   0.353553390593274,
-  0.277785116509801,  -0.490392640201615,   0.097545161008064,   0.415734806151273,
- -0.415734806151273,  -0.097545161008064,   0.490392640201615,  -0.277785116509801,
-  0.191341716182545,  -0.461939766255643,   0.461939766255643,  -0.191341716182545,
- -0.191341716182545,   0.461939766255643,  -0.461939766255643,   0.191341716182545,
-  0.097545161008064,  -0.277785116509801,   0.415734806151273,  -0.490392640201615,
-  0.490392640201615,  -0.415734806151273,   0.277785116509801,  -0.097545161008064
-};
-
-static const float adst_8[64] = {
-  0.089131608307533,   0.175227946595735,   0.255357107325376,   0.326790388032145,
-  0.387095214016349,   0.434217976756762,   0.466553967085785,   0.483002021635509,
-  0.255357107325376,   0.434217976756762,   0.483002021635509,   0.387095214016349,
-  0.175227946595735,  -0.089131608307533,  -0.326790388032145,  -0.466553967085785,
-  0.387095214016349,   0.466553967085785,   0.175227946595735,  -0.255357107325376,
- -0.483002021635509,  -0.326790388032145,   0.089131608307533,   0.434217976756762,
-  0.466553967085785,   0.255357107325376,  -0.326790388032145,  -0.434217976756762,
-  0.089131608307533,   0.483002021635509,   0.175227946595735,  -0.387095214016348,
-  0.483002021635509,  -0.089131608307533,  -0.466553967085785,   0.175227946595735,
-  0.434217976756762,  -0.255357107325376,  -0.387095214016348,   0.326790388032145,
-  0.434217976756762,  -0.387095214016348,  -0.089131608307533,   0.466553967085786,
- -0.326790388032145,  -0.175227946595735,   0.483002021635509,  -0.255357107325375,
-  0.326790388032145,  -0.483002021635509,   0.387095214016349,  -0.089131608307534,
- -0.255357107325377,   0.466553967085785,  -0.434217976756762,   0.175227946595736,
-  0.175227946595735,  -0.326790388032145,   0.434217976756762,  -0.483002021635509,
-  0.466553967085785,  -0.387095214016348,   0.255357107325376,  -0.089131608307532
-};
-
-/* Converted the transforms to integers. */
-static const int16_t dct_i4[16] = {
-  16384,  16384,  16384,  16384,
-  21407,   8867,  -8867, -21407,
-  16384, -16384, -16384,  16384,
-   8867, -21407,  21407,  -8867
-};
-
-static const int16_t adst_i4[16] = {
-   7472,  14042,  18919,  21513,
-  18919,  18919,      0, -18919,
-  21513,  -7472, -18919,  14042,
-  14042, -21513,  18919,  -7472
-};
-
-static const int16_t dct_i8[64] = {
-   11585,  11585,  11585,  11585,
-   11585,  11585,  11585,  11585,
-   16069,  13623,   9102,   3196,
-   -3196,  -9102, -13623, -16069,
-   15137,   6270,  -6270, -15137,
-  -15137,  -6270,   6270,  15137,
-   13623,  -3196, -16069,  -9102,
-    9102,  16069,   3196, -13623,
-   11585, -11585, -11585,  11585,
-   11585, -11585, -11585,  11585,
-    9102, -16069,   3196,  13623,
-  -13623,  -3196,  16069,  -9102,
-    6270, -15137,  15137,  -6270,
-   -6270,  15137, -15137,   6270,
-    3196,  -9102,  13623, -16069,
-   16069, -13623,   9102,  -3196
-};
-
-static const int16_t adst_i8[64] = {
-    2921,   5742,   8368,  10708,
-   12684,  14228,  15288,  15827,
-    8368,  14228,  15827,  12684,
-    5742,  -2921, -10708, -15288,
-   12684,  15288,   5742,  -8368,
-  -15827, -10708,   2921,  14228,
-   15288,   8368, -10708, -14228,
-    2921,  15827,   5742, -12684,
-   15827,  -2921, -15288,   5742,
-   14228,  -8368, -12684,  10708,
-   14228, -12684,  -2921,  15288,
-  -10708,  -5742,  15827,  -8368,
-   10708, -15827,  12684,  -2921,
-   -8368,  15288, -14228,   5742,
-    5742, -10708,  14228, -15827,
-   15288, -12684,   8368,  -2921
-};
-
-static const float dct_16[256] = {
-  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,
-  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,
-  0.351851,  0.338330,  0.311806,  0.273300,  0.224292,  0.166664,  0.102631,  0.034654,
- -0.034654, -0.102631, -0.166664, -0.224292, -0.273300, -0.311806, -0.338330, -0.351851,
-  0.346760,  0.293969,  0.196424,  0.068975, -0.068975, -0.196424, -0.293969, -0.346760,
- -0.346760, -0.293969, -0.196424, -0.068975,  0.068975,  0.196424,  0.293969,  0.346760,
-  0.338330,  0.224292,  0.034654, -0.166664, -0.311806, -0.351851, -0.273300, -0.102631,
-  0.102631,  0.273300,  0.351851,  0.311806,  0.166664, -0.034654, -0.224292, -0.338330,
-  0.326641,  0.135299, -0.135299, -0.326641, -0.326641, -0.135299,  0.135299,  0.326641,
-  0.326641,  0.135299, -0.135299, -0.326641, -0.326641, -0.135299,  0.135299,  0.326641,
-  0.311806,  0.034654, -0.273300, -0.338330, -0.102631,  0.224292,  0.351851,  0.166664,
- -0.166664, -0.351851, -0.224292,  0.102631,  0.338330,  0.273300, -0.034654, -0.311806,
-  0.293969, -0.068975, -0.346760, -0.196424,  0.196424,  0.346760,  0.068975, -0.293969,
- -0.293969,  0.068975,  0.346760,  0.196424, -0.196424, -0.346760, -0.068975,  0.293969,
-  0.273300, -0.166664, -0.338330,  0.034654,  0.351851,  0.102631, -0.311806, -0.224292,
-  0.224292,  0.311806, -0.102631, -0.351851, -0.034654,  0.338330,  0.166664, -0.273300,
-  0.250000, -0.250000, -0.250000,  0.250000,  0.250000, -0.250000, -0.250000,  0.250000,
-  0.250000, -0.250000, -0.250000,  0.250000,  0.250000, -0.250000, -0.250000,  0.250000,
-  0.224292, -0.311806, -0.102631,  0.351851, -0.034654, -0.338330,  0.166664,  0.273300,
- -0.273300, -0.166664,  0.338330,  0.034654, -0.351851,  0.102631,  0.311806, -0.224292,
-  0.196424, -0.346760,  0.068975,  0.293969, -0.293969, -0.068975,  0.346760, -0.196424,
- -0.196424,  0.346760, -0.068975, -0.293969,  0.293969,  0.068975, -0.346760,  0.196424,
-  0.166664, -0.351851,  0.224292,  0.102631, -0.338330,  0.273300,  0.034654, -0.311806,
-  0.311806, -0.034654, -0.273300,  0.338330, -0.102631, -0.224292,  0.351851, -0.166664,
-  0.135299, -0.326641,  0.326641, -0.135299, -0.135299,  0.326641, -0.326641,  0.135299,
-  0.135299, -0.326641,  0.326641, -0.135299, -0.135299,  0.326641, -0.326641,  0.135299,
-  0.102631, -0.273300,  0.351851, -0.311806,  0.166664,  0.034654, -0.224292,  0.338330,
- -0.338330,  0.224292, -0.034654, -0.166664,  0.311806, -0.351851,  0.273300, -0.102631,
-  0.068975, -0.196424,  0.293969, -0.346760,  0.346760, -0.293969,  0.196424, -0.068975,
- -0.068975,  0.196424, -0.293969,  0.346760, -0.346760,  0.293969, -0.196424,  0.068975,
-  0.034654, -0.102631,  0.166664, -0.224292,  0.273300, -0.311806,  0.338330, -0.351851,
-  0.351851, -0.338330,  0.311806, -0.273300,  0.224292, -0.166664,  0.102631, -0.034654
-};
-
-static const float adst_16[256] = {
-  0.033094,  0.065889,  0.098087,  0.129396,  0.159534,  0.188227,  0.215215,  0.240255,
-  0.263118,  0.283599,  0.301511,  0.316693,  0.329007,  0.338341,  0.344612,  0.347761,
-  0.098087,  0.188227,  0.263118,  0.316693,  0.344612,  0.344612,  0.316693,  0.263118,
-  0.188227,  0.098087,  0.000000, -0.098087, -0.188227, -0.263118, -0.316693, -0.344612,
-  0.159534,  0.283599,  0.344612,  0.329007,  0.240255,  0.098087, -0.065889, -0.215215,
- -0.316693, -0.347761, -0.301511, -0.188227, -0.033094,  0.129396,  0.263118,  0.338341,
-  0.215215,  0.338341,  0.316693,  0.159534, -0.065889, -0.263118, -0.347761, -0.283599,
- -0.098087,  0.129396,  0.301511,  0.344612,  0.240255,  0.033094, -0.188227, -0.329007,
-  0.263118,  0.344612,  0.188227, -0.098087, -0.316693, -0.316693, -0.098087,  0.188227,
-  0.344612,  0.263118,  0.000000, -0.263118, -0.344612, -0.188227,  0.098087,  0.316693,
-  0.301511,  0.301511,  0.000000, -0.301511, -0.301511, -0.000000,  0.301511,  0.301511,
-  0.000000, -0.301511, -0.301511, -0.000000,  0.301511,  0.301511,  0.000000, -0.301511,
-  0.329007,  0.215215, -0.188227, -0.338341, -0.033094,  0.316693,  0.240255, -0.159534,
- -0.344612, -0.065889,  0.301511,  0.263118, -0.129396, -0.347761, -0.098087,  0.283599,
-  0.344612,  0.098087, -0.316693, -0.188227,  0.263118,  0.263118, -0.188227, -0.316693,
-  0.098087,  0.344612,  0.000000, -0.344612, -0.098087,  0.316693,  0.188227, -0.263118,
-  0.347761, -0.033094, -0.344612,  0.065889,  0.338341, -0.098087, -0.329007,  0.129396,
-  0.316693, -0.159534, -0.301511,  0.188227,  0.283599, -0.215215, -0.263118,  0.240255,
-  0.338341, -0.159534, -0.263118,  0.283599,  0.129396, -0.344612,  0.033094,  0.329007,
- -0.188227, -0.240255,  0.301511,  0.098087, -0.347761,  0.065889,  0.316693, -0.215215,
-  0.316693, -0.263118, -0.098087,  0.344612, -0.188227, -0.188227,  0.344612, -0.098087,
- -0.263118,  0.316693,  0.000000, -0.316693,  0.263118,  0.098087, -0.344612,  0.188227,
-  0.283599, -0.329007,  0.098087,  0.215215, -0.347761,  0.188227,  0.129396, -0.338341,
-  0.263118,  0.033094, -0.301511,  0.316693, -0.065889, -0.240255,  0.344612, -0.159534,
-  0.240255, -0.347761,  0.263118, -0.033094, -0.215215,  0.344612, -0.283599,  0.065889,
-  0.188227, -0.338341,  0.301511, -0.098087, -0.159534,  0.329007, -0.316693,  0.129396,
-  0.188227, -0.316693,  0.344612, -0.263118,  0.098087,  0.098087, -0.263118,  0.344612,
- -0.316693,  0.188227,  0.000000, -0.188227,  0.316693, -0.344612,  0.263118, -0.098087,
-  0.129396, -0.240255,  0.316693, -0.347761,  0.329007, -0.263118,  0.159534, -0.033094,
- -0.098087,  0.215215, -0.301511,  0.344612, -0.338341,  0.283599, -0.188227,  0.065889,
-  0.065889, -0.129396,  0.188227, -0.240255,  0.283599, -0.316693,  0.338341, -0.347761,
-  0.344612, -0.329007,  0.301511, -0.263118,  0.215215, -0.159534,  0.098087, -0.033094
-};
-
-/* Converted the transforms to integers. */
-static const int16_t dct_i16[256] = {
-    8192,   8192,   8192,   8192,   8192,   8192,   8192,   8192,
-    8192,   8192,   8192,   8192,   8192,   8192,   8192,   8192,
-   11529,  11086,  10217,   8955,   7350,   5461,   3363,   1136,
-   -1136,  -3363,  -5461,  -7350,  -8955, -10217, -11086, -11529,
-   11363,   9633,   6436,   2260,  -2260,  -6436,  -9633, -11363,
-  -11363,  -9633,  -6436,  -2260,   2260,   6436,   9633,  11363,
-   11086,   7350,   1136,  -5461, -10217, -11529,  -8955,  -3363,
-    3363,   8955,  11529,  10217,   5461,  -1136,  -7350, -11086,
-   10703,   4433,  -4433, -10703, -10703,  -4433,   4433,  10703,
-   10703,   4433,  -4433, -10703, -10703,  -4433,   4433,  10703,
-   10217,   1136,  -8955, -11086,  -3363,   7350,  11529,   5461,
-   -5461, -11529,  -7350,   3363,  11086,   8955,  -1136, -10217,
-    9633,  -2260, -11363,  -6436,   6436,  11363,   2260,  -9633,
-   -9633,   2260,  11363,   6436,  -6436, -11363,  -2260,   9633,
-    8955,  -5461, -11086,   1136,  11529,   3363, -10217,  -7350,
-    7350,  10217,  -3363, -11529,  -1136,  11086,   5461,  -8955,
-    8192,  -8192,  -8192,   8192,   8192,  -8192,  -8192,   8192,
-    8192,  -8192,  -8192,   8192,   8192,  -8192,  -8192,   8192,
-    7350, -10217,  -3363,  11529,  -1136, -11086,   5461,   8955,
-   -8955,  -5461,  11086,   1136, -11529,   3363,  10217,  -7350,
-    6436, -11363,   2260,   9633,  -9633,  -2260,  11363,  -6436,
-   -6436,  11363,  -2260,  -9633,   9633,   2260, -11363,   6436,
-    5461, -11529,   7350,   3363, -11086,   8955,   1136, -10217,
-   10217,  -1136,  -8955,  11086,  -3363,  -7350,  11529,  -5461,
-    4433, -10703,  10703,  -4433,  -4433,  10703, -10703,   4433,
-    4433, -10703,  10703,  -4433,  -4433,  10703, -10703,   4433,
-    3363,  -8955,  11529, -10217,   5461,   1136,  -7350,  11086,
-  -11086,   7350,  -1136,  -5461,  10217, -11529,   8955,  -3363,
-    2260,  -6436,   9633, -11363,  11363,  -9633,   6436,  -2260,
-   -2260,   6436,  -9633,  11363, -11363,   9633,  -6436,   2260,
-    1136,  -3363,   5461,  -7350,   8955, -10217,  11086, -11529,
-   11529, -11086,  10217,  -8955,   7350,  -5461,   3363,  -1136
-};
-
-static const int16_t adst_i16[256] = {
-    1084,   2159,   3214,   4240,   5228,   6168,   7052,   7873,
-    8622,   9293,   9880,  10377,  10781,  11087,  11292,  11395,
-    3214,   6168,   8622,  10377,  11292,  11292,  10377,   8622,
-    6168,   3214,      0,  -3214,  -6168,  -8622, -10377, -11292,
-    5228,   9293,  11292,  10781,   7873,   3214,  -2159,  -7052,
-  -10377, -11395,  -9880,  -6168,  -1084,   4240,   8622,  11087,
-    7052,  11087,  10377,   5228,  -2159,  -8622, -11395,  -9293,
-   -3214,   4240,   9880,  11292,   7873,   1084,  -6168, -10781,
-    8622,  11292,   6168,  -3214, -10377, -10377,  -3214,   6168,
-   11292,   8622,      0,  -8622, -11292,  -6168,   3214,  10377,
-    9880,   9880,      0,  -9880,  -9880,      0,   9880,   9880,
-       0,  -9880,  -9880,      0,   9880,   9880,      0,  -9880,
-   10781,   7052,  -6168, -11087,  -1084,  10377,   7873,  -5228,
-  -11292,  -2159,   9880,   8622,  -4240, -11395,  -3214,   9293,
-   11292,   3214, -10377,  -6168,   8622,   8622,  -6168, -10377,
-    3214,  11292,      0, -11292,  -3214,  10377,   6168,  -8622,
-   11395,  -1084, -11292,   2159,  11087,  -3214, -10781,   4240,
-   10377,  -5228,  -9880,   6168,   9293,  -7052,  -8622,   7873,
-   11087,  -5228,  -8622,   9293,   4240, -11292,   1084,  10781,
-   -6168,  -7873,   9880,   3214, -11395,   2159,  10377,  -7052,
-   10377,  -8622,  -3214,  11292,  -6168,  -6168,  11292,  -3214,
-   -8622,  10377,      0, -10377,   8622,   3214, -11292,   6168,
-    9293, -10781,   3214,   7052, -11395,   6168,   4240, -11087,
-    8622,   1084,  -9880,  10377,  -2159,  -7873,  11292,  -5228,
-    7873, -11395,   8622,  -1084,  -7052,  11292,  -9293,   2159,
-    6168, -11087,   9880,  -3214,  -5228,  10781, -10377,   4240,
-    6168, -10377,  11292,  -8622,   3214,   3214,  -8622,  11292,
-  -10377,   6168,      0,  -6168,  10377, -11292,   8622,  -3214,
-    4240,  -7873,  10377, -11395,  10781,  -8622,   5228,  -1084,
-   -3214,   7052,  -9880,  11292, -11087,   9293,  -6168,   2159,
-    2159,  -4240,   6168,  -7873,   9293, -10377,  11087, -11395,
-   11292, -10781,   9880,  -8622,   7052,  -5228,   3214,  -1084
-};
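Comparing the tables, each integer basis is the float basis scaled by 2^15 and rounded (my reading of the values; the conversion code itself is not part of this file). A sketch:

    #include <math.h>
    #include <stdint.h>
    /* Sketch: quantize an n x n float basis to Q15 fixed point. */
    static void quantize_basis(const float *in, int16_t *out, int n) {
      int i;
      for (i = 0; i < n * n; i++)
        out[i] = (int16_t)floor(in[i] * 32768.0 + 0.5);
    }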
-
-static const int xC1S7 = 16069;
-static const int xC2S6 = 15137;
-static const int xC3S5 = 13623;
-static const int xC4S4 = 11585;
-static const int xC5S3 =  9102;
-static const int xC6S2 =  6270;
-static const int xC7S1 =  3196;
-
-#define SHIFT_BITS 14
-#define DOROUND(X) X += (1<<(SHIFT_BITS-1));
-
-#define FINAL_SHIFT 3
-#define FINAL_ROUNDING (1<<(FINAL_SHIFT -1))
-#define IN_SHIFT (FINAL_SHIFT+1)
-
-
-void vp9_short_fdct8x8_c(short *InputData, short *OutputData, int pitch) {
-  int loop;
-  int short_pitch = pitch >> 1;
-  int is07, is12, is34, is56;
-  int is0734, is1256;
-  int id07, id12, id34, id56;
-  int irot_input_x, irot_input_y;
-  int icommon_product1;      // Re-used product  (c4s4 * (s12 - s56))
-  int icommon_product2;      // Re-used product  (c4s4 * (d12 + d56))
-  int temp1, temp2;          // intermediate variable for computation
-
-  int  InterData[64];
-  int  *ip = InterData;
-  short *op = OutputData;
-
-  for (loop = 0; loop < 8; loop++) {
-    // Pre calculate some common sums and differences.
-    is07 = (InputData[0] + InputData[7]) << IN_SHIFT;
-    is12 = (InputData[1] + InputData[2]) << IN_SHIFT;
-    is34 = (InputData[3] + InputData[4]) << IN_SHIFT;
-    is56 = (InputData[5] + InputData[6]) << IN_SHIFT;
-    id07 = (InputData[0] - InputData[7]) << IN_SHIFT;
-    id12 = (InputData[1] - InputData[2]) << IN_SHIFT;
-    id34 = (InputData[3] - InputData[4]) << IN_SHIFT;
-    id56 = (InputData[5] - InputData[6]) << IN_SHIFT;
-
-    is0734 = is07 + is34;
-    is1256 = is12 + is56;
-
-    // Pre-Calculate some common product terms.
-    icommon_product1 = xC4S4 * (is12 - is56);
-    DOROUND(icommon_product1)
-    icommon_product1 >>= SHIFT_BITS;
-
-    icommon_product2 = xC4S4 * (id12 + id56);
-    DOROUND(icommon_product2)
-    icommon_product2 >>= SHIFT_BITS;
-
-
-    ip[0] = (xC4S4 * (is0734 + is1256));
-    DOROUND(ip[0]);
-    ip[0] >>= SHIFT_BITS;
-
-    ip[4] = (xC4S4 * (is0734 - is1256));
-    DOROUND(ip[4]);
-    ip[4] >>= SHIFT_BITS;
-
-    // Define inputs to rotation for outputs 2 and 6
-    irot_input_x = id12 - id56;
-    irot_input_y = is07 - is34;
-
-    // Apply rotation for outputs 2 and 6.
-    temp1 = xC6S2 * irot_input_x;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC2S6 * irot_input_y;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    ip[2] = temp1 + temp2;
-
-    temp1 = xC6S2 * irot_input_y;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC2S6 * irot_input_x;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    ip[6] = temp1 - temp2;
-
-    // Define inputs to rotation for outputs 1 and 7
-    irot_input_x = icommon_product1 + id07;
-    irot_input_y = -(id34 + icommon_product2);
-
-    // Apply rotation for outputs 1 and 7.
-    temp1 = xC1S7 * irot_input_x;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC7S1 * irot_input_y;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    ip[1] = temp1 - temp2;
-
-    temp1 = xC7S1 * irot_input_x;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC1S7 * irot_input_y;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    ip[7] = temp1 + temp2;
-
-    // Define inputs to rotation for outputs 3 and 5
-    irot_input_x = id07 - icommon_product1;
-    irot_input_y = id34 - icommon_product2;
-
-    // Apply rotation for outputs 3 and 5.
-    temp1 = xC3S5 * irot_input_x;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC5S3 * irot_input_y;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    ip[3] = temp1 - temp2;
-
-
-    temp1 = xC5S3 * irot_input_x;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC3S5 * irot_input_y;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    ip[5] = temp1 + temp2;
-
-    // Increment data pointer for next row
-    InputData += short_pitch;
-    ip += 8;
-  }
-
-  // Performed DCT on rows, now transform the columns
-  ip = InterData;
-  for (loop = 0; loop < 8; loop++) {
-    // Pre calculate some common sums and differences.
-    is07 = ip[0 * 8] + ip[7 * 8];
-    is12 = ip[1 * 8] + ip[2 * 8];
-    is34 = ip[3 * 8] + ip[4 * 8];
-    is56 = ip[5 * 8] + ip[6 * 8];
-
-    id07 = ip[0 * 8] - ip[7 * 8];
-    id12 = ip[1 * 8] - ip[2 * 8];
-    id34 = ip[3 * 8] - ip[4 * 8];
-    id56 = ip[5 * 8] - ip[6 * 8];
-
-    is0734 = is07 + is34;
-    is1256 = is12 + is56;
-
-    // Pre-Calculate some common product terms
-    icommon_product1 = xC4S4 * (is12 - is56);
-    icommon_product2 = xC4S4 * (id12 + id56);
-    DOROUND(icommon_product1)
-    DOROUND(icommon_product2)
-    icommon_product1 >>= SHIFT_BITS;
-    icommon_product2 >>= SHIFT_BITS;
-
-
-    temp1 = xC4S4 * (is0734 + is1256);
-    temp2 = xC4S4 * (is0734 - is1256);
-    DOROUND(temp1);
-    DOROUND(temp2);
-    temp1 >>= SHIFT_BITS;
-
-    temp2 >>= SHIFT_BITS;
-    op[0 * 8] = (temp1 + FINAL_ROUNDING) >> FINAL_SHIFT;
-    op[4 * 8] = (temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
-
-    // Define inputs to rotation for outputs 2 and 6
-    irot_input_x = id12 - id56;
-    irot_input_y = is07 - is34;
-
-    // Apply rotation for outputs 2 and 6.
-    temp1 = xC6S2 * irot_input_x;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC2S6 * irot_input_y;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    op[2 * 8] = (temp1 + temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
-
-    temp1 = xC6S2 * irot_input_y;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC2S6 * irot_input_x;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    op[6 * 8] = (temp1 - temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
-
-    // Define inputs to rotation for outputs 1 and 7
-    irot_input_x = icommon_product1 + id07;
-    irot_input_y = -(id34 + icommon_product2);
-
-    // Apply rotation for outputs 1 and 7.
-    temp1 = xC1S7 * irot_input_x;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC7S1 * irot_input_y;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    op[1 * 8] = (temp1 - temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
-
-    temp1 = xC7S1 * irot_input_x;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC1S7 * irot_input_y;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    op[7 * 8] = (temp1 + temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
-
-    // Define inputs to rotation for outputs 3 and 5
-    irot_input_x = id07 - icommon_product1;
-    irot_input_y = id34 - icommon_product2;
-
-    // Apply rotation for outputs 3 and 5.
-    temp1 = xC3S5 * irot_input_x;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC5S3 * irot_input_y;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    op[3 * 8] = (temp1 - temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
-
-
-    temp1 = xC5S3 * irot_input_x;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC3S5 * irot_input_y;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    op[5 * 8] = (temp1 + temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
-
-    // Increment data pointer for next column.
-    ip++;
-    op++;
-  }
-}
-
-void vp9_short_fhaar2x2_c(short *input, short *output, int pitch) {
-  /* [1 1; 1 -1] orthogonal transform */
-  /* uses positions 0, 1, 4 and 8 of the 4x4 output block */
-  int i;
-  short *ip1 = input;
-  short *op1 = output;
-  for (i = 0; i < 16; i++) {
-    op1[i] = 0;
-  }
-
-  op1[0] = (ip1[0] + ip1[1] + ip1[4] + ip1[8] + 1) >> 1;
-  op1[1] = (ip1[0] - ip1[1] + ip1[4] - ip1[8]) >> 1;
-  op1[4] = (ip1[0] + ip1[1] - ip1[4] - ip1[8]) >> 1;
-  op1[8] = (ip1[0] - ip1[1] - ip1[4] + ip1[8]) >> 1;
-}
-
-/* For test */
-#define TEST_INT 1
-#if TEST_INT
-#define vp9_fht_int_c vp9_fht_c
-#else
-#define vp9_fht_float_c vp9_fht_c
-#endif
-
-void vp9_fht_float_c(const int16_t *input, int pitch, int16_t *output,
-               TX_TYPE tx_type, int tx_dim) {
-  vp9_clear_system_state();  // Make it simd safe : __asm emms;
-  {
-    int i, j, k;
-    float bufa[256], bufb[256];  // buffers for the floating-point test
-                                 // implementation; it could be simplified
-                                 // in conjunction with the integer version
-    const int16_t *ip = input;
-    int16_t *op = output;
-
-    float *pfa = &bufa[0];
-    float *pfb = &bufb[0];
-
-    // pointers to vertical and horizontal transforms
-    const float *ptv, *pth;
-
-    assert(tx_type != DCT_DCT);
-    // load and convert residual array into floating-point
-    for (j = 0; j < tx_dim; j++) {
-      for (i = 0; i < tx_dim; i++) {
-        pfa[i] = (float)ip[i];
-      }
-      pfa += tx_dim;
-      ip  += pitch / 2;
-    }
-
-    // vertical transformation
-    pfa = &bufa[0];
-    pfb = &bufb[0];
-
-    switch (tx_type) {
-      case ADST_ADST :
-      case ADST_DCT  :
-        ptv = (tx_dim == 4) ? &adst_4[0] :
-                              ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);
-        break;
-
-      default :
-        ptv = (tx_dim == 4) ? &dct_4[0] :
-                              ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
-        break;
-    }
-
-    for (j = 0; j < tx_dim; j++) {
-      for (i = 0; i < tx_dim; i++) {
-        pfb[i] = 0;
-        for (k = 0; k < tx_dim; k++) {
-          pfb[i] += ptv[k] * pfa[(k * tx_dim)];
-        }
-        pfa += 1;
-      }
-      pfb += tx_dim;
-      ptv += tx_dim;
-      pfa = &bufa[0];
-    }
-
-    // horizontal transformation
-    pfa = &bufa[0];
-    pfb = &bufb[0];
-
-    switch (tx_type) {
-      case ADST_ADST :
-      case  DCT_ADST :
-        pth = (tx_dim == 4) ? &adst_4[0] :
-                              ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);
-        break;
-
-      default :
-        pth = (tx_dim == 4) ? &dct_4[0] :
-                              ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
-        break;
-    }
-
-    for (j = 0; j < tx_dim; j++) {
-      for (i = 0; i < tx_dim; i++) {
-        pfa[i] = 0;
-        for (k = 0; k < tx_dim; k++) {
-          pfa[i] += pfb[k] * pth[k];
-        }
-        pth += tx_dim;
-      }
-
-      pfa += tx_dim;
-      pfb += tx_dim;
-      // pth -= tx_dim * tx_dim;
-
-      switch (tx_type) {
-        case ADST_ADST :
-        case  DCT_ADST :
-          pth = (tx_dim == 4) ? &adst_4[0] :
-                                ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);
-          break;
-
-        default :
-          pth = (tx_dim == 4) ? &dct_4[0] :
-                                ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
-          break;
-      }
-    }
-
-    // convert to short integer format and load BLOCKD buffer
-    op = output;
-    pfa = &bufa[0];
-
-    for (j = 0; j < tx_dim; j++) {
-      for (i = 0; i < tx_dim; i++) {
-        op[i] = (pfa[i] > 0 ) ? (int16_t)( 8 * pfa[i] + 0.49) :
-                                     -(int16_t)(- 8 * pfa[i] + 0.49);
-      }
-      op  += tx_dim;
-      pfa += tx_dim;
-    }
-  }
-  vp9_clear_system_state();  // Make it simd safe : __asm emms;
-}
-
-/* Converted the transforms to integer form. */
-#define VERTICAL_SHIFT 11
-#define VERTICAL_ROUNDING ((1 << (VERTICAL_SHIFT - 1)) - 1)
-#define HORIZONTAL_SHIFT 16
-#define HORIZONTAL_ROUNDING ((1 << (HORIZONTAL_SHIFT - 1)) - 1)
-void vp9_fht_int_c(const int16_t *input, int pitch, int16_t *output,
-                   TX_TYPE tx_type, int tx_dim) {
-  int i, j, k;
-  int16_t imbuf[256];
-
-  const int16_t *ip = input;
-  int16_t *op = output;
-  int16_t *im = &imbuf[0];
-
-  /* pointers to vertical and horizontal transforms. */
-  const int16_t *ptv = NULL, *pth = NULL;
-
-  switch (tx_type) {
-    case ADST_ADST :
-      ptv = pth = (tx_dim == 4) ? &adst_i4[0]
-                                  : ((tx_dim == 8) ? &adst_i8[0]
-                                                     : &adst_i16[0]);
-      break;
-    case ADST_DCT  :
-      ptv = (tx_dim == 4) ? &adst_i4[0]
-                            : ((tx_dim == 8) ? &adst_i8[0] : &adst_i16[0]);
-      pth = (tx_dim == 4) ? &dct_i4[0]
-                            : ((tx_dim == 8) ? &dct_i8[0] : &dct_i16[0]);
-      break;
-    case  DCT_ADST :
-      ptv = (tx_dim == 4) ? &dct_i4[0]
-                            : ((tx_dim == 8) ? &dct_i8[0] : &dct_i16[0]);
-      pth = (tx_dim == 4) ? &adst_i4[0]
-                            : ((tx_dim == 8) ? &adst_i8[0] : &adst_i16[0]);
-      break;
-    case  DCT_DCT :
-      ptv = pth = (tx_dim == 4) ? &dct_i4[0]
-                                  : ((tx_dim == 8) ? &dct_i8[0] : &dct_i16[0]);
-      break;
-    default:
-      assert(0);
-      break;
-  }
-
-  /* vertical transformation */
-  for (j = 0; j < tx_dim; j++) {
-    for (i = 0; i < tx_dim; i++) {
-      int temp = 0;
-
-      for (k = 0; k < tx_dim; k++) {
-        temp += ptv[k] * ip[(k * (pitch >> 1))];
-      }
-
-      im[i] = (int16_t)((temp + VERTICAL_ROUNDING) >> VERTICAL_SHIFT);
-      ip++;
-    }
-    im += tx_dim;  // advance one row in the intermediate buffer
-    ptv += tx_dim;
-    ip = input;
-  }
-
-  /* horizontal transformation */
-  im = &imbuf[0];
-
-  for (j = 0; j < tx_dim; j++) {
-    const int16_t *pthc = pth;
-
-    for (i = 0; i < tx_dim; i++) {
-      int temp = 0;
-
-      for (k = 0; k < tx_dim; k++) {
-        temp += im[k] * pthc[k];
-      }
-
-      op[i] = (int16_t)((temp + HORIZONTAL_ROUNDING) >> HORIZONTAL_SHIFT);
-      pthc += tx_dim;
-    }
-
-    im += tx_dim;  // advance one row in the intermediate buffer
-    op += tx_dim;
-  }
-}
-
-void vp9_short_fdct4x4_c(short *input, short *output, int pitch) {
-  int i;
-  int a1, b1, c1, d1;
-  short *ip = input;
-  short *op = output;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ((ip[0] + ip[3]) << 5);
-    b1 = ((ip[1] + ip[2]) << 5);
-    c1 = ((ip[1] - ip[2]) << 5);
-    d1 = ((ip[0] - ip[3]) << 5);
-
-    op[0] = a1 + b1;
-    op[2] = a1 - b1;
-
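-    // 2217/4096 and 5352/4096 approximate sqrt(2) * sin(PI / 8) and
-    // sqrt(2) * cos(PI / 8) in Q12; the added constants fold in rounding bias.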
-    op[1] = (c1 * 2217 + d1 * 5352 +  14500) >> 12;
-    op[3] = (d1 * 2217 - c1 * 5352 +   7500) >> 12;
-
-    ip += pitch / 2;
-    op += 4;
-  }
-  ip = output;
-  op = output;
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0] + ip[12];
-    b1 = ip[4] + ip[8];
-    c1 = ip[4] - ip[8];
-    d1 = ip[0] - ip[12];
-
-    op[0]  = (a1 + b1 + 7) >> 4;
-    op[8]  = (a1 - b1 + 7) >> 4;
-
-    op[4]  = ((c1 * 2217 + d1 * 5352 +  12000) >> 16) + (d1 != 0);
-    op[12] = (d1 * 2217 - c1 * 5352 +  51000) >> 16;
-
-    ip++;
-    op++;
-  }
-}
-
-void vp9_short_fdct8x4_c(short *input, short *output, int pitch) {
-  vp9_short_fdct4x4_c(input, output, pitch);
-  vp9_short_fdct4x4_c(input + 4, output + 16, pitch);
-}
-
-void vp9_short_walsh4x4_c(short *input, short *output, int pitch) {
-  int i;
-  int a1, b1, c1, d1;
-  short *ip = input;
-  short *op = output;
-  int pitch_short = pitch >> 1;
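-  // pitch is given in bytes; pitch_short is the same stride in int16_t units.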
-
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0 * pitch_short] + ip[3 * pitch_short];
-    b1 = ip[1 * pitch_short] + ip[2 * pitch_short];
-    c1 = ip[1 * pitch_short] - ip[2 * pitch_short];
-    d1 = ip[0 * pitch_short] - ip[3 * pitch_short];
-
-    op[0] = (a1 + b1 + 1) >> 1;
-    op[4] = (c1 + d1) >> 1;
-    op[8] = (a1 - b1) >> 1;
-    op[12] = (d1 - c1) >> 1;
-
-    ip++;
-    op++;
-  }
-  ip = output;
-  op = output;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0] + ip[3];
-    b1 = ip[1] + ip[2];
-    c1 = ip[1] - ip[2];
-    d1 = ip[0] - ip[3];
-
-    op[0] = (a1 + b1 + 1) >> 1;
-    op[1] = (c1 + d1) >> 1;
-    op[2] = (a1 - b1) >> 1;
-    op[3] = (d1 - c1) >> 1;
-
-    ip += 4;
-    op += 4;
-  }
-}
-
-#if CONFIG_LOSSLESS
-void vp9_short_walsh4x4_lossless_c(short *input, short *output, int pitch) {
-  int i;
-  int a1, b1, c1, d1;
-  short *ip = input;
-  short *op = output;
-  int pitch_short = pitch >> 1;
-
-  for (i = 0; i < 4; i++) {
-    a1 = (ip[0 * pitch_short] + ip[3 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR;
-    b1 = (ip[1 * pitch_short] + ip[2 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR;
-    c1 = (ip[1 * pitch_short] - ip[2 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR;
-    d1 = (ip[0 * pitch_short] - ip[3 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR;
-
-    op[0] = (a1 + b1 + 1) >> 1;
-    op[4] = (c1 + d1) >> 1;
-    op[8] = (a1 - b1) >> 1;
-    op[12] = (d1 - c1) >> 1;
-
-    ip++;
-    op++;
-  }
-  ip = output;
-  op = output;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0] + ip[3];
-    b1 = ip[1] + ip[2];
-    c1 = ip[1] - ip[2];
-    d1 = ip[0] - ip[3];
-
-    op[0] = ((a1 + b1 + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
-    op[1] = ((c1 + d1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
-    op[2] = ((a1 - b1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
-    op[3] = ((d1 - c1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
-
-    ip += 4;
-    op += 4;
-  }
-}
-
-void vp9_short_walsh4x4_x8_c(short *input, short *output, int pitch) {
-  int i;
-  int a1, b1, c1, d1;
-  short *ip = input;
-  short *op = output;
-  int pitch_short = pitch >> 1;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0 * pitch_short] + ip[3 * pitch_short];
-    b1 = ip[1 * pitch_short] + ip[2 * pitch_short];
-    c1 = ip[1 * pitch_short] - ip[2 * pitch_short];
-    d1 = ip[0 * pitch_short] - ip[3 * pitch_short];
-
-    op[0] = (a1 + b1 + 1) >> 1;
-    op[4] = (c1 + d1) >> 1;
-    op[8] = (a1 - b1) >> 1;
-    op[12] = (d1 - c1) >> 1;
-
-    ip++;
-    op++;
-  }
-  ip = output;
-  op = output;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0] + ip[3];
-    b1 = ip[1] + ip[2];
-    c1 = ip[1] - ip[2];
-    d1 = ip[0] - ip[3];
-
-    op[0] = ((a1 + b1 + 1) >> 1) << WHT_UPSCALE_FACTOR;
-    op[1] = ((c1 + d1) >> 1) << WHT_UPSCALE_FACTOR;
-    op[2] = ((a1 - b1) >> 1) << WHT_UPSCALE_FACTOR;
-    op[3] = ((d1 - c1) >> 1) << WHT_UPSCALE_FACTOR;
-
-    ip += 4;
-    op += 4;
-  }
-}
-
-void vp9_short_walsh8x4_x8_c(short *input, short *output, int pitch) {
-  vp9_short_walsh4x4_x8_c(input,   output,    pitch);
-  vp9_short_walsh4x4_x8_c(input + 4, output + 16, pitch);
-}
-#endif
-
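-// C1..C15 are cos(k * PI / 32) for k = 1..15: the butterfly constants of
-// the floating-point 16-point DCT below (C8 = cos(PI / 4) = 1 / sqrt(2)).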
-static const double C1 = 0.995184726672197;
-static const double C2 = 0.98078528040323;
-static const double C3 = 0.956940335732209;
-static const double C4 = 0.923879532511287;
-static const double C5 = 0.881921264348355;
-static const double C6 = 0.831469612302545;
-static const double C7 = 0.773010453362737;
-static const double C8 = 0.707106781186548;
-static const double C9 = 0.634393284163646;
-static const double C10 = 0.555570233019602;
-static const double C11 = 0.471396736825998;
-static const double C12 = 0.38268343236509;
-static const double C13 = 0.290284677254462;
-static const double C14 = 0.195090322016128;
-static const double C15 = 0.098017140329561;
-
-static void dct16x16_1d(double input[16], double output[16]) {
-  vp9_clear_system_state(); // Make it simd safe : __asm emms;
-  {
-    double step[16];
-    double intermediate[16];
-    double temp1, temp2;
-
-    // step 1
-    step[ 0] = input[0] + input[15];
-    step[ 1] = input[1] + input[14];
-    step[ 2] = input[2] + input[13];
-    step[ 3] = input[3] + input[12];
-    step[ 4] = input[4] + input[11];
-    step[ 5] = input[5] + input[10];
-    step[ 6] = input[6] + input[ 9];
-    step[ 7] = input[7] + input[ 8];
-    step[ 8] = input[7] - input[ 8];
-    step[ 9] = input[6] - input[ 9];
-    step[10] = input[5] - input[10];
-    step[11] = input[4] - input[11];
-    step[12] = input[3] - input[12];
-    step[13] = input[2] - input[13];
-    step[14] = input[1] - input[14];
-    step[15] = input[0] - input[15];
-
-    // step 2
-    output[0] = step[0] + step[7];
-    output[1] = step[1] + step[6];
-    output[2] = step[2] + step[5];
-    output[3] = step[3] + step[4];
-    output[4] = step[3] - step[4];
-    output[5] = step[2] - step[5];
-    output[6] = step[1] - step[6];
-    output[7] = step[0] - step[7];
-
-    temp1 = step[ 8]*C7;
-    temp2 = step[15]*C9;
-    output[ 8] = temp1 + temp2;
-
-    temp1 = step[ 9]*C11;
-    temp2 = step[14]*C5;
-    output[ 9] = temp1 - temp2;
-
-    temp1 = step[10]*C3;
-    temp2 = step[13]*C13;
-    output[10] = temp1 + temp2;
-
-    temp1 = step[11]*C15;
-    temp2 = step[12]*C1;
-    output[11] = temp1 - temp2;
-
-    temp1 = step[11]*C1;
-    temp2 = step[12]*C15;
-    output[12] = temp2 + temp1;
-
-    temp1 = step[10]*C13;
-    temp2 = step[13]*C3;
-    output[13] = temp2 - temp1;
-
-    temp1 = step[ 9]*C5;
-    temp2 = step[14]*C11;
-    output[14] = temp2 + temp1;
-
-    temp1 = step[ 8]*C9;
-    temp2 = step[15]*C7;
-    output[15] = temp2 - temp1;
-
-    // step 3
-    step[ 0] = output[0] + output[3];
-    step[ 1] = output[1] + output[2];
-    step[ 2] = output[1] - output[2];
-    step[ 3] = output[0] - output[3];
-
-    temp1 = output[4]*C14;
-    temp2 = output[7]*C2;
-    step[ 4] = temp1 + temp2;
-
-    temp1 = output[5]*C10;
-    temp2 = output[6]*C6;
-    step[ 5] = temp1 + temp2;
-
-    temp1 = output[5]*C6;
-    temp2 = output[6]*C10;
-    step[ 6] = temp2 - temp1;
-
-    temp1 = output[4]*C2;
-    temp2 = output[7]*C14;
-    step[ 7] = temp2 - temp1;
-
-    step[ 8] = output[ 8] + output[11];
-    step[ 9] = output[ 9] + output[10];
-    step[10] = output[ 9] - output[10];
-    step[11] = output[ 8] - output[11];
-
-    step[12] = output[12] + output[15];
-    step[13] = output[13] + output[14];
-    step[14] = output[13] - output[14];
-    step[15] = output[12] - output[15];
-
-    // step 4
-    output[ 0] = (step[ 0] + step[ 1]);
-    output[ 8] = (step[ 0] - step[ 1]);
-
-    temp1 = step[2]*C12;
-    temp2 = step[3]*C4;
-    temp1 = temp1 + temp2;
-    output[ 4] = 2*(temp1*C8);
-
-    temp1 = step[2]*C4;
-    temp2 = step[3]*C12;
-    temp1 = temp2 - temp1;
-    output[12] = 2*(temp1*C8);
-
-    output[ 2] = 2*((step[4] + step[ 5])*C8);
-    output[14] = 2*((step[7] - step[ 6])*C8);
-
-    temp1 = step[4] - step[5];
-    temp2 = step[6] + step[7];
-    output[ 6] = (temp1 + temp2);
-    output[10] = (temp1 - temp2);
-
-    intermediate[8] = step[8] + step[14];
-    intermediate[9] = step[9] + step[15];
-
-    temp1 = intermediate[8]*C12;
-    temp2 = intermediate[9]*C4;
-    temp1 = temp1 - temp2;
-    output[3] = 2*(temp1*C8);
-
-    temp1 = intermediate[8]*C4;
-    temp2 = intermediate[9]*C12;
-    temp1 = temp2 + temp1;
-    output[13] = 2*(temp1*C8);
-
-    output[ 9] = 2*((step[10] + step[11])*C8);
-
-    intermediate[11] = step[10] - step[11];
-    intermediate[12] = step[12] + step[13];
-    intermediate[13] = step[12] - step[13];
-    intermediate[14] = step[ 8] - step[14];
-    intermediate[15] = step[ 9] - step[15];
-
-    output[15] = (intermediate[11] + intermediate[12]);
-    output[ 1] = -(intermediate[11] - intermediate[12]);
-
-    output[ 7] = 2*(intermediate[13]*C8);
-
-    temp1 = intermediate[14]*C12;
-    temp2 = intermediate[15]*C4;
-    temp1 = temp1 - temp2;
-    output[11] = -2*(temp1*C8);
-
-    temp1 = intermediate[14]*C4;
-    temp2 = intermediate[15]*C12;
-    temp1 = temp2 + temp1;
-    output[ 5] = 2*(temp1*C8);
-  }
-  vp9_clear_system_state(); // Make it simd safe : __asm emms;
-}
-
-void vp9_short_fdct16x16_c(short *input, short *out, int pitch) {
-  vp9_clear_system_state(); // Make it simd safe : __asm emms;
-  {
-    int shortpitch = pitch >> 1;
-    int i, j;
-    double output[256];
-    // First transform columns
-    for (i = 0; i < 16; i++) {
-        double temp_in[16], temp_out[16];
-        for (j = 0; j < 16; j++)
-            temp_in[j] = input[j*shortpitch + i];
-        dct16x16_1d(temp_in, temp_out);
-        for (j = 0; j < 16; j++)
-            output[j*16 + i] = temp_out[j];
-    }
-    // Then transform rows
-    for (i = 0; i < 16; ++i) {
-        double temp_in[16], temp_out[16];
-        for (j = 0; j < 16; ++j)
-            temp_in[j] = output[j + i*16];
-        dct16x16_1d(temp_in, temp_out);
-        for (j = 0; j < 16; ++j)
-            output[j + i*16] = temp_out[j];
-    }
-    // Scale by some magic number
-    for (i = 0; i < 256; i++)
-        out[i] = (short)round(output[i]/2);
-  }
-  vp9_clear_system_state(); // Make it simd safe : __asm emms;
-}
--- a/vp8/encoder/encodeframe.c
+++ /dev/null
@@ -1,2342 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "encodemb.h"
-#include "encodemv.h"
-#include "vp8/common/common.h"
-#include "onyx_int.h"
-#include "vp8/common/extend.h"
-#include "vp8/common/entropymode.h"
-#include "vp8/common/quant_common.h"
-#include "segmentation.h"
-#include "vp8/common/setupintrarecon.h"
-#include "vp8/common/reconintra4x4.h"
-#include "encodeintra.h"
-#include "vp8/common/reconinter.h"
-#include "vp8/common/invtrans.h"
-#include "rdopt.h"
-#include "vp8/common/findnearmv.h"
-#include "vp8/common/reconintra.h"
-#include "vp8/common/seg_common.h"
-#include "vpx_rtcd.h"
-#include <stdio.h>
-#include <math.h>
-#include <limits.h>
-#include "vp8/common/subpixel.h"
-#include "vpx_ports/vpx_timer.h"
-#include "vp8/common/pred_common.h"
-
-#define DBG_PRNT_SEGMAP 0
-#if CONFIG_NEWBESTREFMV
-#include "vp8/common/mvref_common.h"
-#endif
-
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define RTCD(x)     &cpi->common.rtcd.x
-#define IF_RTCD(x)  (x)
-#else
-#define RTCD(x)     NULL
-#define IF_RTCD(x)  NULL
-#endif
-
-#ifdef ENC_DEBUG
-int enc_debug = 0;
-int mb_row_debug, mb_col_debug;
-#endif
-
-extern void vp9_initialize_me_consts(VP9_COMP *cpi, int QIndex);
-
-extern void vp9_auto_select_speed(VP9_COMP *cpi);
-
-int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
-                                  int recon_yoffset, int recon_uvoffset,
-                                  int *returnrate, int *returndistortion);
-
-extern void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
-                                           int recon_yoffset,
-                                           int recon_uvoffset, int *r, int *d);
-
-void vp9_build_block_offsets(MACROBLOCK *x);
-
-void vp9_setup_block_ptrs(MACROBLOCK *x);
-
-void vp9_encode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
-                                 int recon_yoffset, int recon_uvoffset,
-                                 int output_enabled);
-
-void vp9_encode_inter_superblock(VP9_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
-                                 int recon_yoffset, int recon_uvoffset,
-                                 int mb_col, int mb_row);
-
-void vp9_encode_intra_macro_block(VP9_COMP *cpi, MACROBLOCK *x,
-                                  TOKENEXTRA **t, int output_enabled);
-
-void vp9_encode_intra_super_block(VP9_COMP *cpi, MACROBLOCK *x,
-                                  TOKENEXTRA **t, int mb_col);
-
-static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x);
-
-#ifdef MODE_STATS
-unsigned int inter_y_modes[MB_MODE_COUNT];
-unsigned int inter_uv_modes[VP9_UV_MODES];
-unsigned int inter_b_modes[B_MODE_COUNT];
-unsigned int y_modes[VP9_YMODES];
-unsigned int i8x8_modes[VP9_I8X8_MODES];
-unsigned int uv_modes[VP9_UV_MODES];
-unsigned int uv_modes_y[VP9_YMODES][VP9_UV_MODES];
-unsigned int b_modes[B_MODE_COUNT];
-#endif
-
-
-/* activity_avg must be positive, or flat regions could get a zero weight
- *  (infinite lambda), which confounds analysis.
- * This also avoids the need for divide by zero checks in
- *  vp9_activity_masking().
- */
-#define VP9_ACTIVITY_AVG_MIN (64)
-
-/* This is used as a reference when computing the source variance for the
- *  purposes of activity masking.
- * Eventually this should be replaced by custom no-reference routines,
- *  which will be faster.
- */
-static const unsigned char VP9_VAR_OFFS[16] = {
-  128, 128, 128, 128, 128, 128, 128, 128,
-  128, 128, 128, 128, 128, 128, 128, 128
-};
-
-
-// Original activity measure from Tim T's code.
-static unsigned int tt_activity_measure(VP9_COMP *cpi, MACROBLOCK *x) {
-  unsigned int act;
-  unsigned int sse;
-  /* TODO: This could also be done over smaller areas (8x8), but that would
-   *  require extensive changes elsewhere, as lambda is assumed to be fixed
-   *  over an entire MB in most of the code.
-   * Another option is to compute four 8x8 variances, and pick a single
-   *  lambda using a non-linear combination (e.g., the smallest, or second
-   *  smallest, etc.).
-   */
-  act = vp9_variance16x16(x->src.y_buffer, x->src.y_stride, VP9_VAR_OFFS, 0,
-                          &sse);
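-  // Variance against the flat all-128 reference is just the MB's own
-  // variance, since subtracting a constant leaves variance unchanged.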
-  act = act << 4;
-
-  /* If the region is flat, lower the activity some more. */
-  if (act < 8 << 12)
-    act = act < 5 << 12 ? act : 5 << 12;
-
-  return act;
-}
-
-// Stub for alternative experimental activity measures.
-static unsigned int alt_activity_measure(VP9_COMP *cpi,
-                                         MACROBLOCK *x, int use_dc_pred) {
-  return vp9_encode_intra(cpi, x, use_dc_pred);
-}
-
-
-// Measure the activity of the current macroblock
-// What we measure here is TBD, so it is abstracted into this function
-#define ALT_ACT_MEASURE 1
-static unsigned int mb_activity_measure(VP9_COMP *cpi, MACROBLOCK *x,
-                                        int mb_row, int mb_col) {
-  unsigned int mb_activity;
-
-  if (ALT_ACT_MEASURE) {
-    int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
-
-    // Or use an alternative.
-    mb_activity = alt_activity_measure(cpi, x, use_dc_pred);
-  } else {
-    // Original activity measure from Tim T's code.
-    mb_activity = tt_activity_measure(cpi, x);
-  }
-
-  if (mb_activity < VP9_ACTIVITY_AVG_MIN)
-    mb_activity = VP9_ACTIVITY_AVG_MIN;
-
-  return mb_activity;
-}
-
-// Calculate an "average" mb activity value for the frame
-#define ACT_MEDIAN 0
-static void calc_av_activity(VP9_COMP *cpi, int64_t activity_sum) {
-#if ACT_MEDIAN
-  // Find median: Simple n^2 algorithm for experimentation
-  {
-    unsigned int median;
-    unsigned int i, j;
-    unsigned int *sortlist;
-    unsigned int tmp;
-
-    // Create a list to sort into
-    CHECK_MEM_ERROR(sortlist,
-                    vpx_calloc(sizeof(unsigned int), cpi->common.MBs));
-
-    // Copy the activity map into the sort list
-    vpx_memcpy(sortlist, cpi->mb_activity_map,
-               sizeof(unsigned int) * cpi->common.MBs);
-
-    // Ripple each value down to its correct position
-    for (i = 1; i < cpi->common.MBs; i ++) {
-      for (j = i; j > 0; j --) {
-        if (sortlist[j] < sortlist[j - 1]) {
-          // Swap values
-          tmp = sortlist[j - 1];
-          sortlist[j - 1] = sortlist[j];
-          sortlist[j] = tmp;
-        } else
-          break;
-      }
-    }
-
-    // Even number of MBs, so estimate the median as the mean of the two
-    // values either side of the midpoint.
-    median = (1 + sortlist[cpi->common.MBs >> 1] +
-              sortlist[(cpi->common.MBs >> 1) + 1]) >> 1;
-
-    cpi->activity_avg = median;
-
-    vpx_free(sortlist);
-  }
-#else
-  // Simple mean for now
-  cpi->activity_avg = (unsigned int)(activity_sum / cpi->common.MBs);
-#endif
-
-  if (cpi->activity_avg < VP9_ACTIVITY_AVG_MIN)
-    cpi->activity_avg = VP9_ACTIVITY_AVG_MIN;
-
-  // Experimental code: return fixed value normalized for several clips
-  if (ALT_ACT_MEASURE)
-    cpi->activity_avg = 100000;
-}
-
-#define USE_ACT_INDEX   0
-#define OUTPUT_NORM_ACT_STATS   0
-
-#if USE_ACT_INDEX
-// Calculate an activity index for each mb
-static void calc_activity_index(VP9_COMP *cpi, MACROBLOCK *x) {
-  VP9_COMMON *const cm = &cpi->common;
-  int mb_row, mb_col;
-
-  int64_t act;
-  int64_t a;
-  int64_t b;
-
-#if OUTPUT_NORM_ACT_STATS
-  FILE *f = fopen("norm_act.stt", "a");
-  fprintf(f, "\n%12d\n", cpi->activity_avg);
-#endif
-
-  // Reset pointers to start of activity map
-  x->mb_activity_ptr = cpi->mb_activity_map;
-
-  // Calculate normalized mb activity number.
-  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
-    // for each macroblock col in image
-    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
-      // Read activity from the map
-      act = *(x->mb_activity_ptr);
-
-      // Calculate a normalized activity number
-      a = act + 4 * cpi->activity_avg;
-      b = 4 * act + cpi->activity_avg;
-
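-      // b / a lies in (1/4, 4); the rounded ratio is offset so that an MB
-      // of exactly average activity gets an index of 0.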
-      if (b >= a)
-        *(x->activity_ptr) = (int)((b + (a >> 1)) / a) - 1;
-      else
-        *(x->activity_ptr) = 1 - (int)((a + (b >> 1)) / b);
-
-#if OUTPUT_NORM_ACT_STATS
-      fprintf(f, " %6d", *(x->mb_activity_ptr));
-#endif
-      // Increment activity map pointers
-      x->mb_activity_ptr++;
-    }
-
-#if OUTPUT_NORM_ACT_STATS
-    fprintf(f, "\n");
-#endif
-
-  }
-
-#if OUTPUT_NORM_ACT_STATS
-  fclose(f);
-#endif
-
-}
-#endif
-
-// Loop through all MBs, noting the activity of each, computing the average
-// activity, and calculating a normalized activity for each MB.
-static void build_activity_map(VP9_COMP *cpi) {
-  MACROBLOCK *const x = &cpi->mb;
-  MACROBLOCKD *xd = &x->e_mbd;
-  VP9_COMMON *const cm = &cpi->common;
-
-#if ALT_ACT_MEASURE
-  YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
-  int recon_yoffset;
-  int recon_y_stride = new_yv12->y_stride;
-#endif
-
-  int mb_row, mb_col;
-  unsigned int mb_activity;
-  int64_t activity_sum = 0;
-
-  // for each macroblock row in image
-  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
-#if ALT_ACT_MEASURE
-    // reset above block coeffs
-    xd->up_available = (mb_row != 0);
-    recon_yoffset = (mb_row * recon_y_stride * 16);
-#endif
-    // for each macroblock col in image
-    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
-#if ALT_ACT_MEASURE
-      xd->dst.y_buffer = new_yv12->y_buffer + recon_yoffset;
-      xd->left_available = (mb_col != 0);
-      recon_yoffset += 16;
-#endif
-      // Copy current mb to a buffer
-      vp9_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
-
-      // measure activity
-      mb_activity = mb_activity_measure(cpi, x, mb_row, mb_col);
-
-      // Keep frame sum
-      activity_sum += mb_activity;
-
-      // Store MB level activity details.
-      *x->mb_activity_ptr = mb_activity;
-
-      // Increment activity map pointer
-      x->mb_activity_ptr++;
-
-      // adjust to the next column of source macroblocks
-      x->src.y_buffer += 16;
-    }
-
-
-    // adjust to the next row of mbs
-    x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
-
-#if ALT_ACT_MEASURE
-    // extend the recon for intra prediction
-    vp9_extend_mb_row(new_yv12, xd->dst.y_buffer + 16,
-                      xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
-#endif
-
-  }
-
-  // Calculate an "average" MB activity
-  calc_av_activity(cpi, activity_sum);
-
-#if USE_ACT_INDEX
-  // Calculate an activity index number for each mb
-  calc_activity_index(cpi, x);
-#endif
-
-}
-
-// Macroblock activity masking
-void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x) {
-#if USE_ACT_INDEX
-  x->rdmult += *(x->mb_activity_ptr) * (x->rdmult >> 2);
-  x->errorperbit = x->rdmult * 100 / (110 * x->rddiv);
-  x->errorperbit += (x->errorperbit == 0);
-#else
-  int64_t a;
-  int64_t b;
-  int64_t act = *(x->mb_activity_ptr);
-
-  // Apply the masking to the RD multiplier.
-  a = act + (2 * cpi->activity_avg);
-  b = (2 * act) + cpi->activity_avg;
-
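-  // The ratio b / a = (2 * act + avg) / (act + 2 * avg) lies in (1/2, 2),
-  // so busy MBs at most double the multiplier and flat MBs at most halve it.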
-  x->rdmult = (unsigned int)(((int64_t)x->rdmult * b + (a >> 1)) / a);
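-  // Re-derive errorperbit from the adjusted rdmult, clamping it to at least 1.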
-  x->errorperbit = x->rdmult * 100 / (110 * x->rddiv);
-  x->errorperbit += (x->errorperbit == 0);
-#endif
-
-  // Activity based Zbin adjustment
-  adjust_act_zbin(cpi, x);
-}
-
-static void update_state(VP9_COMP *cpi, MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
-  int i;
-  MACROBLOCKD *xd = &x->e_mbd;
-  MODE_INFO *mi = &ctx->mic;
-  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
-  int mb_mode = mi->mbmi.mode;
-  int mb_mode_index = ctx->best_mode_index;
-
-#if CONFIG_DEBUG
-  assert(mb_mode < MB_MODE_COUNT);
-  assert(mb_mode_index < MAX_MODES);
-  assert(mi->mbmi.ref_frame < MAX_REF_FRAMES);
-#endif
-
-  // Restore the coding context of the MB to the one that was in place
-  // when the mode was picked for it
-  vpx_memcpy(xd->mode_info_context, mi, sizeof(MODE_INFO));
-#if CONFIG_SUPERBLOCKS
-  if (mi->mbmi.encoded_as_sb) {
-    const int mis = cpi->common.mode_info_stride;
-    if (xd->mb_to_right_edge > 0)
-      vpx_memcpy(xd->mode_info_context + 1, mi, sizeof(MODE_INFO));
-    if (xd->mb_to_bottom_edge > 0) {
-      vpx_memcpy(xd->mode_info_context + mis, mi, sizeof(MODE_INFO));
-      if (xd->mb_to_right_edge > 0)
-        vpx_memcpy(xd->mode_info_context + mis + 1, mi, sizeof(MODE_INFO));
-    }
-  }
-#endif
-
-  if (mb_mode == B_PRED) {
-    for (i = 0; i < 16; i++) {
-      xd->block[i].bmi.as_mode = xd->mode_info_context->bmi[i].as_mode;
-      assert(xd->block[i].bmi.as_mode.first < MB_MODE_COUNT);
-    }
-  } else if (mb_mode == I8X8_PRED) {
-    for (i = 0; i < 16; i++) {
-      xd->block[i].bmi = xd->mode_info_context->bmi[i];
-    }
-  } else if (mb_mode == SPLITMV) {
-    vpx_memcpy(x->partition_info, &ctx->partition_info,
-               sizeof(PARTITION_INFO));
-
-    mbmi->mv[0].as_int = x->partition_info->bmi[15].mv.as_int;
-    mbmi->mv[1].as_int = x->partition_info->bmi[15].second_mv.as_int;
-  }
-
-  {
-    int segment_id = mbmi->segment_id;
-    if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
-        vp9_get_segdata(xd, segment_id, SEG_LVL_EOB)) {
-      for (i = 0; i < NB_TXFM_MODES; i++) {
-        cpi->rd_tx_select_diff[i] += ctx->txfm_rd_diff[i];
-      }
-    }
-  }
-
-  if (cpi->common.frame_type == KEY_FRAME) {
-    // Restore the coding modes to that held in the coding context
-    // if (mb_mode == B_PRED)
-    //    for (i = 0; i < 16; i++)
-    //    {
-    //        xd->block[i].bmi.as_mode =
-    //                          xd->mode_info_context->bmi[i].as_mode;
-    //        assert(xd->mode_info_context->bmi[i].as_mode < MB_MODE_COUNT);
-    //    }
-#if CONFIG_INTERNAL_STATS
-    static const int kf_mode_index[] = {
-      THR_DC /*DC_PRED*/,
-      THR_V_PRED /*V_PRED*/,
-      THR_H_PRED /*H_PRED*/,
-      THR_D45_PRED /*D45_PRED*/,
-      THR_D135_PRED /*D135_PRED*/,
-      THR_D117_PRED /*D117_PRED*/,
-      THR_D153_PRED /*D153_PRED*/,
-      THR_D27_PRED /*D27_PRED*/,
-      THR_D63_PRED /*D63_PRED*/,
-      THR_TM /*TM_PRED*/,
-      THR_I8X8_PRED /*I8X8_PRED*/,
-      THR_B_PRED /*B_PRED*/,
-    };
-    cpi->mode_chosen_counts[kf_mode_index[mb_mode]]++;
-#endif
-  } else {
-    /*
-            // Reduce the activation RD thresholds for the best choice mode
-            if ((cpi->rd_baseline_thresh[mb_mode_index] > 0) &&
-                (cpi->rd_baseline_thresh[mb_mode_index] < (INT_MAX >> 2)))
-            {
-                int best_adjustment = (cpi->rd_thresh_mult[mb_mode_index] >> 2);
-
-                cpi->rd_thresh_mult[mb_mode_index] =
-                        (cpi->rd_thresh_mult[mb_mode_index]
-                         >= (MIN_THRESHMULT + best_adjustment)) ?
-                                cpi->rd_thresh_mult[mb_mode_index] - best_adjustment :
-                                MIN_THRESHMULT;
-                cpi->rd_threshes[mb_mode_index] =
-                        (cpi->rd_baseline_thresh[mb_mode_index] >> 7)
-                        * cpi->rd_thresh_mult[mb_mode_index];
-
-            }
-    */
-    // Note how often each mode chosen as best
-    cpi->mode_chosen_counts[mb_mode_index]++;
-
-    cpi->prediction_error += ctx->distortion;
-    cpi->intra_error += ctx->intra_error;
-
-    cpi->rd_comp_pred_diff[0] += ctx->single_pred_diff;
-    cpi->rd_comp_pred_diff[1] += ctx->comp_pred_diff;
-    cpi->rd_comp_pred_diff[2] += ctx->hybrid_pred_diff;
-  }
-}
-
-static void pick_mb_modes(VP9_COMP *cpi,
-                          VP9_COMMON *cm,
-                          int mb_row,
-                          int mb_col,
-                          MACROBLOCK  *x,
-                          MACROBLOCKD *xd,
-                          TOKENEXTRA **tp,
-                          int *totalrate,
-                          int *totaldist) {
-  int i;
-  int map_index;
-  int recon_yoffset, recon_uvoffset;
-  int ref_fb_idx = cm->lst_fb_idx;
-  int dst_fb_idx = cm->new_fb_idx;
-  int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
-  int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
-  ENTROPY_CONTEXT_PLANES left_context[2];
-  ENTROPY_CONTEXT_PLANES above_context[2];
-  ENTROPY_CONTEXT_PLANES *initial_above_context_ptr =
-      cm->above_context + mb_col;
-
-  // Offsets to move pointers from MB to MB within a SB in raster order
-  int row_delta[4] = { 0, +1,  0, -1};
-  int col_delta[4] = { +1, -1, +1, +1};
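-  // Applied after each MB, these deltas visit the 2x2 SB in raster order,
-  // (0,0) (0,1) (1,0) (1,1), then step to the top-left MB of the next SB.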
-
-  /* Function should not modify L & A contexts; save and restore on exit */
-  vpx_memcpy(left_context,
-             cm->left_context,
-             sizeof(left_context));
-  vpx_memcpy(above_context,
-             initial_above_context_ptr,
-             sizeof(above_context));
-
-  /* Encode MBs in raster order within the SB */
-  for (i = 0; i < 4; i++) {
-    int dy = row_delta[i];
-    int dx = col_delta[i];
-    int offset_unextended = dy * cm->mb_cols + dx;
-    int offset_extended   = dy * xd->mode_info_stride + dx;
-    MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
-
-    // TODO Many of the index items here can be computed more efficiently!
-
-    if ((mb_row >= cm->mb_rows) || (mb_col >= cm->mb_cols)) {
-      // MB lies outside frame, move on
-      mb_row += dy;
-      mb_col += dx;
-
-      // Update pointers
-      x->src.y_buffer += 16 * (dx + dy * x->src.y_stride);
-      x->src.u_buffer += 8  * (dx + dy * x->src.uv_stride);
-      x->src.v_buffer += 8  * (dx + dy * x->src.uv_stride);
-
-      x->gf_active_ptr += offset_unextended;
-      x->partition_info += offset_extended;
-      xd->mode_info_context += offset_extended;
-      xd->prev_mode_info_context += offset_extended;
-#if CONFIG_DEBUG
-      assert((xd->prev_mode_info_context - cpi->common.prev_mip) ==
-             (xd->mode_info_context - cpi->common.mip));
-#endif
-      continue;
-    }
-
-    // Index of the MB in the SB 0..3
-    xd->mb_index = i;
-
-    map_index = (mb_row * cpi->common.mb_cols) + mb_col;
-    x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
-
-    // set above context pointer
-    xd->above_context = cm->above_context + mb_col;
-
-    // Restore the appropriate left context depending on which
-    // row in the SB the MB is situated
-    xd->left_context = cm->left_context + (i >> 1);
-
-    // Set up distance of MB to edge of frame in 1/8th pel units
-    xd->mb_to_top_edge    = -((mb_row * 16) << 3);
-    xd->mb_to_left_edge   = -((mb_col * 16) << 3);
-    xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
-    xd->mb_to_right_edge  = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
-
-    // Set up limit values for MV components to prevent them from
-    // extending beyond the UMV borders assuming 16x16 block size
-    x->mv_row_min = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
-    x->mv_col_min = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
-    x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
-                     (VP8BORDERINPIXELS - 16 - INTERP_EXTEND));
-    x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
-                     (VP8BORDERINPIXELS - 16 - INTERP_EXTEND));
-
-    xd->up_available   = (mb_row != 0);
-    xd->left_available = (mb_col != 0);
-
-    recon_yoffset  = (mb_row * recon_y_stride * 16) + (mb_col * 16);
-    recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col *  8);
-
-    xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
-    xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
-    xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
-
-    // Copy current MB to a work buffer
-    vp9_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
-
-    x->rddiv = cpi->RDDIV;
-    x->rdmult = cpi->RDMULT;
-
-    if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
-      vp9_activity_masking(cpi, x);
-
-    // Is segmentation enabled
-    if (xd->segmentation_enabled) {
-      // Code to set segment id in xd->mbmi.segment_id
-      if (xd->update_mb_segmentation_map)
-        mbmi->segment_id = cpi->segmentation_map[map_index];
-      else
-        mbmi->segment_id = cm->last_frame_seg_map[map_index];
-      if (mbmi->segment_id > 3)
-        mbmi->segment_id = 0;
-
-      vp9_mb_init_quantizer(cpi, x);
-    } else
-      // Set to Segment 0 by default
-      mbmi->segment_id = 0;
-
-    x->active_ptr = cpi->active_map + map_index;
-
-#if CONFIG_SUPERBLOCKS
-    xd->mode_info_context->mbmi.encoded_as_sb = 0;
-#endif
-
-    cpi->update_context = 0;    // TODO Do we need this now??
-
-    vp9_intra_prediction_down_copy(xd);
-
-    // Find best coding mode & reconstruct the MB so it is available
-    // as a predictor for MBs that follow in the SB
-    if (cm->frame_type == KEY_FRAME) {
-      int r, d;
-      vp9_rd_pick_intra_mode(cpi, x, &r, &d);
-      *totalrate += r;
-      *totaldist += d;
-
-      // Dummy encode, do not do the tokenization
-      vp9_encode_intra_macro_block(cpi, x, tp, 0);
-      // Note the encoder may have changed the segment_id
-
-      // Save the coding context
-      vpx_memcpy(&x->mb_context[i].mic, xd->mode_info_context,
-                 sizeof(MODE_INFO));
-    } else {
-      int seg_id, r, d;
-
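-      // seg0_progress tracks how far through segment 0 the encoder is, as a
-      // 16.16 fixed-point fraction.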
-      if (xd->segmentation_enabled && cpi->seg0_cnt > 0 &&
-          !vp9_segfeature_active(xd, 0, SEG_LVL_REF_FRAME) &&
-          vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME) &&
-          vp9_check_segref(xd, 1, INTRA_FRAME)  +
-          vp9_check_segref(xd, 1, LAST_FRAME)   +
-          vp9_check_segref(xd, 1, GOLDEN_FRAME) +
-          vp9_check_segref(xd, 1, ALTREF_FRAME) == 1) {
-        cpi->seg0_progress = (cpi->seg0_idx << 16) / cpi->seg0_cnt;
-      } else {
-        cpi->seg0_progress = (((mb_col & ~1) * 2 +
-                               (mb_row & ~1) * cm->mb_cols + i) << 16) /
-                             cm->MBs;
-      }
-
-      vp9_pick_mode_inter_macroblock(cpi, x, recon_yoffset,
-                                     recon_uvoffset, &r, &d);
-      *totalrate += r;
-      *totaldist += d;
-
-      // Dummy encode, do not do the tokenization
-      vp9_encode_inter_macroblock(cpi, x, tp,
-                                  recon_yoffset, recon_uvoffset, 0);
-
-      seg_id = mbmi->segment_id;
-      if (cpi->mb.e_mbd.segmentation_enabled && seg_id == 0) {
-        cpi->seg0_idx++;
-      }
-      if (!xd->segmentation_enabled ||
-          !vp9_segfeature_active(xd, seg_id, SEG_LVL_REF_FRAME) ||
-          vp9_check_segref(xd, seg_id, INTRA_FRAME)  +
-          vp9_check_segref(xd, seg_id, LAST_FRAME)   +
-          vp9_check_segref(xd, seg_id, GOLDEN_FRAME) +
-          vp9_check_segref(xd, seg_id, ALTREF_FRAME) > 1) {
-        // Get the prediction context and status
-        int pred_flag = vp9_get_pred_flag(xd, PRED_REF);
-        int pred_context = vp9_get_pred_context(cm, xd, PRED_REF);
-
-        // Count prediction success
-        cpi->ref_pred_count[pred_context][pred_flag]++;
-      }
-    }
-
-    // Next MB
-    mb_row += dy;
-    mb_col += dx;
-
-    x->src.y_buffer += 16 * (dx + dy * x->src.y_stride);
-    x->src.u_buffer += 8  * (dx + dy * x->src.uv_stride);
-    x->src.v_buffer += 8  * (dx + dy * x->src.uv_stride);
-
-    x->gf_active_ptr += offset_unextended;
-    x->partition_info += offset_extended;
-    xd->mode_info_context += offset_extended;
-    xd->prev_mode_info_context += offset_extended;
-
-#if CONFIG_DEBUG
-    assert((xd->prev_mode_info_context - cpi->common.prev_mip) ==
-           (xd->mode_info_context - cpi->common.mip));
-#endif
-  }
-
-  /* Restore L & A coding context to those in place on entry */
-  vpx_memcpy(cm->left_context,
-             left_context,
-             sizeof(left_context));
-  vpx_memcpy(initial_above_context_ptr,
-             above_context,
-             sizeof(above_context));
-}
-
-#if CONFIG_SUPERBLOCKS
-static void pick_sb_modes(VP9_COMP *cpi,
-                          VP9_COMMON *cm,
-                          int mb_row,
-                          int mb_col,
-                          MACROBLOCK  *x,
-                          MACROBLOCKD *xd,
-                          TOKENEXTRA **tp,
-                          int *totalrate,
-                          int *totaldist) {
-  int map_index;
-  int recon_yoffset, recon_uvoffset;
-  int ref_fb_idx = cm->lst_fb_idx;
-  int dst_fb_idx = cm->new_fb_idx;
-  int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
-  int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
-  ENTROPY_CONTEXT_PLANES left_context[2];
-  ENTROPY_CONTEXT_PLANES above_context[2];
-  ENTROPY_CONTEXT_PLANES *initial_above_context_ptr =
-      cm->above_context + mb_col;
-
-  /* Function should not modify L & A contexts; save and restore on exit */
-  vpx_memcpy(left_context,
-             cm->left_context,
-             sizeof(left_context));
-  vpx_memcpy(above_context,
-             initial_above_context_ptr,
-             sizeof(above_context));
-
-  map_index = (mb_row * cpi->common.mb_cols) + mb_col;
-  x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
-
-  /* set above context pointer */
-  xd->above_context = cm->above_context + mb_col;
-
-  /* Restore the appropriate left context depending on which
-   * row in the SB the MB is situated */
-  xd->left_context = cm->left_context;
-
-  // Set up distance of MB to edge of frame in 1/8th pel units
-  xd->mb_to_top_edge    = -((mb_row * 16) << 3);
-  xd->mb_to_left_edge   = -((mb_col * 16) << 3);
-  xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
-  xd->mb_to_right_edge  = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
-
-  /* Set up limit values for MV components to prevent them from
-   * extending beyond the UMV borders assuming 16x16 block size */
-  x->mv_row_min = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
-  x->mv_col_min = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
-  x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
-                   (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
-  x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
-                   (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
-
-  xd->up_available   = (mb_row != 0);
-  xd->left_available = (mb_col != 0);
-
-  recon_yoffset  = (mb_row * recon_y_stride * 16) + (mb_col * 16);
-  recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col *  8);
-
-  xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
-  xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
-  xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
-#if 0 // FIXME
-  /* Copy current MB to a work buffer */
-  vp9_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
-#endif
-  x->rddiv = cpi->RDDIV;
-  x->rdmult = cpi->RDMULT;
-  if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
-    vp9_activity_masking(cpi, x);
-  /* Is segmentation enabled */
-  if (xd->segmentation_enabled) {
-    /* Code to set segment id in xd->mbmi.segment_id */
-    if (xd->update_mb_segmentation_map)
-      xd->mode_info_context->mbmi.segment_id =
-            cpi->segmentation_map[map_index] &&
-            cpi->segmentation_map[map_index + 1] &&
-            cpi->segmentation_map[map_index + cm->mb_cols] &&
-            cpi->segmentation_map[map_index + cm->mb_cols + 1];
-    else
-      xd->mode_info_context->mbmi.segment_id =
-            cm->last_frame_seg_map[map_index] &&
-            cm->last_frame_seg_map[map_index + 1] &&
-            cm->last_frame_seg_map[map_index + cm->mb_cols] &&
-            cm->last_frame_seg_map[map_index + cm->mb_cols + 1];
-    if (xd->mode_info_context->mbmi.segment_id > 3)
-      xd->mode_info_context->mbmi.segment_id = 0;
-
-    vp9_mb_init_quantizer(cpi, x);
-  } else {
-    /* Set to Segment 0 by default */
-    xd->mode_info_context->mbmi.segment_id = 0;
-  }
-
-  x->active_ptr = cpi->active_map + map_index;
-
-  cpi->update_context = 0;    // TODO Do we need this now??
-
-  /* Find best coding mode & reconstruct the MB so it is available
-   * as a predictor for MBs that follow in the SB */
-  if (cm->frame_type == KEY_FRAME) {
-    vp9_rd_pick_intra_mode_sb(cpi, x, totalrate, totaldist);
-
-    /* Save the coding context */
-    vpx_memcpy(&x->sb_context[0].mic, xd->mode_info_context,
-               sizeof(MODE_INFO));
-  } else {
-    if (xd->segmentation_enabled && cpi->seg0_cnt > 0 &&
-        !vp9_segfeature_active(xd, 0, SEG_LVL_REF_FRAME) &&
-        vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME) &&
-        vp9_check_segref(xd, 1, INTRA_FRAME)  +
-        vp9_check_segref(xd, 1, LAST_FRAME)   +
-        vp9_check_segref(xd, 1, GOLDEN_FRAME) +
-        vp9_check_segref(xd, 1, ALTREF_FRAME) == 1) {
-      cpi->seg0_progress = (cpi->seg0_idx << 16) / cpi->seg0_cnt;
-    } else {
-      cpi->seg0_progress =
-        (((mb_col & ~1) * 2 + (mb_row & ~1) * cm->mb_cols) << 16) / cm->MBs;
-    }
-
-    vp9_rd_pick_inter_mode_sb(cpi, x, recon_yoffset, recon_uvoffset,
-                              totalrate, totaldist);
-  }
-
-  /* Restore L & A coding context to those in place on entry */
-  vpx_memcpy(cm->left_context,
-             left_context,
-             sizeof(left_context));
-  vpx_memcpy(initial_above_context_ptr,
-             above_context,
-             sizeof(above_context));
-}
-#endif
-
-static void encode_sb(VP9_COMP *cpi,
-                      VP9_COMMON *cm,
-                      int mbrow,
-                      int mbcol,
-                      MACROBLOCK  *x,
-                      MACROBLOCKD *xd,
-                      TOKENEXTRA **tp) {
-  int i;
-  int map_index;
-  int mb_row, mb_col;
-  int recon_yoffset, recon_uvoffset;
-  int ref_fb_idx = cm->lst_fb_idx;
-  int dst_fb_idx = cm->new_fb_idx;
-  int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
-  int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
-  int row_delta[4] = { 0, +1,  0, -1};
-  int col_delta[4] = { +1, -1, +1, +1};
-
-  mb_row = mbrow;
-  mb_col = mbcol;
-
-  /* Encode MBs in raster order within the SB */
-  for (i = 0; i < 4; i++) {
-    int dy = row_delta[i];
-    int dx = col_delta[i];
-    int offset_extended   = dy * xd->mode_info_stride + dx;
-    int offset_unextended = dy * cm->mb_cols + dx;
-    MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
-
-    if ((mb_row >= cm->mb_rows) || (mb_col >= cm->mb_cols)) {
-      // MB lies outside frame, move on
-      mb_row += dy;
-      mb_col += dx;
-
-      x->src.y_buffer += 16 * (dx + dy * x->src.y_stride);
-      x->src.u_buffer += 8  * (dx + dy * x->src.uv_stride);
-      x->src.v_buffer += 8  * (dx + dy * x->src.uv_stride);
-
-      x->gf_active_ptr      += offset_unextended;
-      x->partition_info     += offset_extended;
-      xd->mode_info_context += offset_extended;
-      xd->prev_mode_info_context += offset_extended;
-
-#if CONFIG_DEBUG
-      assert((xd->prev_mode_info_context - cpi->common.prev_mip) ==
-             (xd->mode_info_context - cpi->common.mip));
-#endif
-      continue;
-    }
-
-    xd->mb_index = i;
-
-#ifdef ENC_DEBUG
-    enc_debug = (cpi->common.current_video_frame == 0 &&
-                 mb_row == 0 && mb_col == 0);
-    mb_col_debug = mb_col;
-    mb_row_debug = mb_row;
-#endif
-
-    // Restore MB state to that when it was picked
-#if CONFIG_SUPERBLOCKS
-    if (xd->mode_info_context->mbmi.encoded_as_sb) {
-      update_state(cpi, x, &x->sb_context[i]);
-      cpi->sb_count++;
-    } else
-#endif
-      update_state(cpi, x, &x->mb_context[i]);
-
-    map_index = (mb_row * cpi->common.mb_cols) + mb_col;
-    x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
-
-    // reset above block coeffs
-    xd->above_context = cm->above_context + mb_col;
-    xd->left_context  = cm->left_context + (i >> 1);
-
-    // Set up distance of MB to edge of the frame in 1/8th pel units
-    xd->mb_to_top_edge    = -((mb_row * 16) << 3);
-    xd->mb_to_left_edge   = -((mb_col * 16) << 3);
-    xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
-    xd->mb_to_right_edge  = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
-
-#if CONFIG_SUPERBLOCKS
-    if (xd->mode_info_context->mbmi.encoded_as_sb) {
-      // Set up limit values for MV components to prevent them from
-      // extending beyond the UMV borders assuming 32x32 block size
-      x->mv_row_min = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
-      x->mv_col_min = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
-      x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
-                       (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
-      x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
-                       (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
-    } else {
-#endif
-      // Set up limit values for MV components to prevent them from
-      // extending beyond the UMV borders assuming 16x16 block size
-      x->mv_row_min = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
-      x->mv_col_min = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
-      x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
-                       (VP8BORDERINPIXELS - 16 - INTERP_EXTEND));
-      x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
-                       (VP8BORDERINPIXELS - 16 - INTERP_EXTEND));
-#if CONFIG_SUPERBLOCKS
-    }
-#endif
-
-    xd->up_available = (mb_row != 0);
-    xd->left_available = (mb_col != 0);
-
-    recon_yoffset = (mb_row * recon_y_stride * 16) + (mb_col * 16);
-    recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col * 8);
-
-    xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
-    xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
-    xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
-
-    // Copy current MB to a work buffer
-    vp9_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
-
-    if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
-      vp9_activity_masking(cpi, x);
-
-    // Is segmentation enabled
-    if (xd->segmentation_enabled) {
-      vp9_mb_init_quantizer(cpi, x);
-    }
-
-    x->active_ptr = cpi->active_map + map_index;
-
-    cpi->update_context = 0;
-
-#if CONFIG_SUPERBLOCKS
-    if (!xd->mode_info_context->mbmi.encoded_as_sb)
-#endif
-      vp9_intra_prediction_down_copy(xd);
-
-    if (cm->frame_type == KEY_FRAME) {
-#if CONFIG_SUPERBLOCKS
-      if (xd->mode_info_context->mbmi.encoded_as_sb)
-        vp9_encode_intra_super_block(cpi, x, tp, mb_col);
-      else
-#endif
-        vp9_encode_intra_macro_block(cpi, x, tp, 1);
-        // Note the encoder may have changed the segment_id
-
-#ifdef MODE_STATS
-      y_modes[mbmi->mode]++;
-#endif
-    } else {
-      unsigned char *segment_id;
-      int seg_ref_active;
-
-      if (xd->mode_info_context->mbmi.ref_frame) {
-        unsigned char pred_context;
-
-        pred_context = vp9_get_pred_context(cm, xd, PRED_COMP);
-
-        if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME)
-          cpi->single_pred_count[pred_context]++;
-        else
-          cpi->comp_pred_count[pred_context]++;
-      }
-
-#if CONFIG_SUPERBLOCKS
-      if (xd->mode_info_context->mbmi.encoded_as_sb)
-        vp9_encode_inter_superblock(cpi, x, tp, recon_yoffset, recon_uvoffset,
-                                    mb_col, mb_row);
-      else
-#endif
-        vp9_encode_inter_macroblock(cpi, x, tp,
-                                    recon_yoffset, recon_uvoffset, 1);
-        // Note the encoder may have changed the segment_id
-
-#ifdef MODE_STATS
-      inter_y_modes[mbmi->mode]++;
-
-      if (mbmi->mode == SPLITMV) {
-        int b;
-
-        for (b = 0; b < x->partition_info->count; b++) {
-          inter_b_modes[x->partition_info->bmi[b].mode]++;
-        }
-      }
-
-#endif
-
-      // If we have just a single reference frame coded for a segment then
-      // exclude from the reference frame counts used to work out
-      // probabilities. NOTE: At the moment we don't support custom trees
-      // for the reference frame coding for each segment but this is a
-      // possible future action.
-      segment_id = &mbmi->segment_id;
-      seg_ref_active = vp9_segfeature_active(xd, *segment_id,
-                                             SEG_LVL_REF_FRAME);
-      if (!seg_ref_active ||
-          ((vp9_check_segref(xd, *segment_id, INTRA_FRAME) +
-            vp9_check_segref(xd, *segment_id, LAST_FRAME) +
-            vp9_check_segref(xd, *segment_id, GOLDEN_FRAME) +
-            vp9_check_segref(xd, *segment_id, ALTREF_FRAME)) > 1)) {
-        cpi->count_mb_ref_frame_usage[mbmi->ref_frame]++;
-      }
-
-      // Count of last ref frame 0,0 usage
-      if ((mbmi->mode == ZEROMV) && (mbmi->ref_frame == LAST_FRAME))
-        cpi->inter_zz_count++;
-    }
-
-#if CONFIG_SUPERBLOCKS
-    if (xd->mode_info_context->mbmi.encoded_as_sb) {
-      x->src.y_buffer += 32;
-      x->src.u_buffer += 16;
-      x->src.v_buffer += 16;
-
-      x->gf_active_ptr      += 2;
-      x->partition_info     += 2;
-      xd->mode_info_context += 2;
-      xd->prev_mode_info_context += 2;
-
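-      // Mark the end of this SB's tokens in the output buffer.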
-      (*tp)->Token = EOSB_TOKEN;
-      (*tp)++;
-      if (mb_row < cm->mb_rows) cpi->tplist[mb_row].stop = *tp;
-      break;
-    }
-#endif
-
-    // Next MB
-    mb_row += dy;
-    mb_col += dx;
-
-    x->src.y_buffer += 16 * (dx + dy * x->src.y_stride);
-    x->src.u_buffer += 8  * (dx + dy * x->src.uv_stride);
-    x->src.v_buffer += 8  * (dx + dy * x->src.uv_stride);
-
-    x->gf_active_ptr      += offset_unextended;
-    x->partition_info     += offset_extended;
-    xd->mode_info_context += offset_extended;
-    xd->prev_mode_info_context += offset_extended;
-
-#if CONFIG_DEBUG
-    assert((xd->prev_mode_info_context - cpi->common.prev_mip) ==
-           (xd->mode_info_context - cpi->common.mip));
-#endif
-    (*tp)->Token = EOSB_TOKEN;
-    (*tp)++;
-    if (mb_row < cm->mb_rows) cpi->tplist[mb_row].stop = *tp;
-  }
-
-  // debug output
-#if DBG_PRNT_SEGMAP
-  {
-    FILE *statsfile;
-    statsfile = fopen("segmap2.stt", "a");
-    fprintf(statsfile, "\n");
-    fclose(statsfile);
-  }
-#endif
-}
-
-static void encode_sb_row(VP9_COMP *cpi,
-                          VP9_COMMON *cm,
-                          int mb_row,
-                          MACROBLOCK  *x,
-                          MACROBLOCKD *xd,
-                          TOKENEXTRA **tp,
-                          int *totalrate) {
-  int mb_col;
-  int mb_cols = cm->mb_cols;
-
-  // Initialize the left context for the new SB row
-  vpx_memset(cm->left_context, 0, sizeof(cm->left_context));
-
-  // Code each SB in the row
-  for (mb_col = 0; mb_col < mb_cols; mb_col += 2) {
-    int mb_rate = 0, mb_dist = 0;
-#if CONFIG_SUPERBLOCKS
-    int sb_rate = INT_MAX, sb_dist;
-#endif
-
-#if CONFIG_DEBUG
-    MODE_INFO *mic = xd->mode_info_context;
-    PARTITION_INFO *pi = x->partition_info;
-    signed char  *gfa = x->gf_active_ptr;
-    unsigned char *yb = x->src.y_buffer;
-    unsigned char *ub = x->src.u_buffer;
-    unsigned char *vb = x->src.v_buffer;
-#endif
-
-#if CONFIG_SUPERBLOCKS
-    // Pick modes assuming the SB is coded as 4 independent MBs
-    xd->mode_info_context->mbmi.encoded_as_sb = 0;
-#endif
-    pick_mb_modes(cpi, cm, mb_row, mb_col, x, xd, tp, &mb_rate, &mb_dist);
-#if CONFIG_SUPERBLOCKS
-    mb_rate += vp9_cost_bit(cm->sb_coded, 0);
-#endif
-
-    x->src.y_buffer -= 32;
-    x->src.u_buffer -= 16;
-    x->src.v_buffer -= 16;
-
-    x->gf_active_ptr -= 2;
-    x->partition_info -= 2;
-    xd->mode_info_context -= 2;
-    xd->prev_mode_info_context -= 2;
-
-#if CONFIG_DEBUG
-    assert(x->gf_active_ptr == gfa);
-    assert(x->partition_info == pi);
-    assert(xd->mode_info_context == mic);
-    assert(x->src.y_buffer == yb);
-    assert(x->src.u_buffer == ub);
-    assert(x->src.v_buffer == vb);
-#endif
-
-#if CONFIG_SUPERBLOCKS
-    if (!(((    mb_cols & 1) && mb_col ==     mb_cols - 1) ||
-          ((cm->mb_rows & 1) && mb_row == cm->mb_rows - 1))) {
-      /* Pick a mode assuming that it applies to all 4 of the MBs in the SB */
-      xd->mode_info_context->mbmi.encoded_as_sb = 1;
-      pick_sb_modes(cpi, cm, mb_row, mb_col, x, xd, tp, &sb_rate, &sb_dist);
-      sb_rate += vp9_cost_bit(cm->sb_coded, 1);
-    }
-
-    /* Decide whether to encode as a SB or 4xMBs */
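-    /* RDCOST forms a Lagrangian cost from rate and distortion using the
-     * rdmult/rddiv weights; the SB form wins only when its cost is lower. */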
-    if (sb_rate < INT_MAX &&
-        RDCOST(x->rdmult, x->rddiv, sb_rate, sb_dist) <
-          RDCOST(x->rdmult, x->rddiv, mb_rate, mb_dist)) {
-      xd->mode_info_context->mbmi.encoded_as_sb = 1;
-      xd->mode_info_context[1].mbmi.encoded_as_sb = 1;
-      xd->mode_info_context[cm->mode_info_stride].mbmi.encoded_as_sb = 1;
-      xd->mode_info_context[1 + cm->mode_info_stride].mbmi.encoded_as_sb = 1;
-      *totalrate += sb_rate;
-    } else
-#endif
-    {
-#if CONFIG_SUPERBLOCKS
-      xd->mode_info_context->mbmi.encoded_as_sb = 0;
-      if (cm->mb_cols - 1 > mb_col)
-        xd->mode_info_context[1].mbmi.encoded_as_sb = 0;
-      if (cm->mb_rows - 1 > mb_row) {
-        xd->mode_info_context[cm->mode_info_stride].mbmi.encoded_as_sb = 0;
-        if (cm->mb_cols - 1 > mb_col)
-          xd->mode_info_context[1 + cm->mode_info_stride].mbmi.encoded_as_sb = 0;
-      }
-#endif
-      *totalrate += mb_rate;
-    }
-
-    /* Encode SB using best computed mode(s) */
-    encode_sb(cpi, cm, mb_row, mb_col, x, xd, tp);
-
-#if CONFIG_DEBUG
-    assert(x->gf_active_ptr == gfa + 2);
-    assert(x->partition_info == pi + 2);
-    assert(xd->mode_info_context == mic + 2);
-    assert(x->src.y_buffer == yb + 32);
-    assert(x->src.u_buffer == ub + 16);
-    assert(x->src.v_buffer == vb + 16);
-#endif
-  }
-
-  // this is to account for the border
-  x->gf_active_ptr += mb_cols - (mb_cols & 0x1);
-  x->partition_info += xd->mode_info_stride + 1 - (mb_cols & 0x1);
-  xd->mode_info_context += xd->mode_info_stride + 1 - (mb_cols & 0x1);
-  xd->prev_mode_info_context += xd->mode_info_stride + 1 - (mb_cols & 0x1);
-
-#if CONFIG_DEBUG
-  assert((xd->prev_mode_info_context - cpi->common.prev_mip) ==
-         (xd->mode_info_context - cpi->common.mip));
-#endif
-}
-
-static void init_encode_frame_mb_context(VP9_COMP *cpi) {
-  MACROBLOCK *const x = &cpi->mb;
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  // GF active flags data structure
-  x->gf_active_ptr = (signed char *)cpi->gf_active_flags;
-
-  // Activity map pointer
-  x->mb_activity_ptr = cpi->mb_activity_map;
-
-  x->act_zbin_adj = 0;
-  cpi->seg0_idx = 0;
-  vpx_memset(cpi->ref_pred_count, 0, sizeof(cpi->ref_pred_count));
-
-  x->partition_info = x->pi;
-
-  xd->mode_info_context = cm->mi;
-  xd->mode_info_stride = cm->mode_info_stride;
-  xd->prev_mode_info_context = cm->prev_mi;
-
-  xd->frame_type = cm->frame_type;
-
-  xd->frames_since_golden = cm->frames_since_golden;
-  xd->frames_till_alt_ref_frame = cm->frames_till_alt_ref_frame;
-
-  // reset intra mode contexts
-  if (cm->frame_type == KEY_FRAME)
-    vp9_init_mbmode_probs(cm);
-
-  // Copy data over into macro block data structures.
-  x->src = *cpi->Source;
-  xd->pre = cm->yv12_fb[cm->lst_fb_idx];
-  xd->dst = cm->yv12_fb[cm->new_fb_idx];
-
-  // set up frame for intra coded blocks
-  vp9_setup_intra_recon(&cm->yv12_fb[cm->new_fb_idx]);
-
-  vp9_build_block_offsets(x);
-
-  vp9_setup_block_dptrs(&x->e_mbd);
-
-  vp9_setup_block_ptrs(x);
-
-  xd->mode_info_context->mbmi.mode = DC_PRED;
-  xd->mode_info_context->mbmi.uv_mode = DC_PRED;
-
-  vp9_zero(cpi->count_mb_ref_frame_usage)
-  vp9_zero(cpi->bmode_count)
-  vp9_zero(cpi->ymode_count)
-  vp9_zero(cpi->i8x8_mode_count)
-  vp9_zero(cpi->y_uv_mode_count)
-  vp9_zero(cpi->sub_mv_ref_count)
-  vp9_zero(cpi->mbsplit_count)
-  vp9_zero(cpi->common.fc.mv_ref_ct)
-  vp9_zero(cpi->common.fc.mv_ref_ct_a)
-#if CONFIG_SUPERBLOCKS
-  vp9_zero(cpi->sb_ymode_count)
-  cpi->sb_count = 0;
-#endif
-
-  vpx_memset(cm->above_context, 0,
-             sizeof(ENTROPY_CONTEXT_PLANES) * cm->mb_cols);
-
-  xd->fullpixel_mask = 0xffffffff;
-  if (cm->full_pixel)
-    xd->fullpixel_mask = 0xfffffff8;
-}
-
-static void encode_frame_internal(VP9_COMP *cpi) {
-  int mb_row;
-  MACROBLOCK *const x = &cpi->mb;
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  TOKENEXTRA *tp = cpi->tok;
-  int totalrate;
-
-  //printf("encode_frame_internal\n");
-
-  // Compute a modified set of reference frame probabilities to use when
-  // prediction fails. These are based on the current general estimates for
-  // this frame which may be updated with each iteration of the recode loop.
-  vp9_compute_mod_refprobs(cm);
-
-#if CONFIG_NEW_MVREF
-  // temp stats reset
-  vp9_zero(cpi->best_ref_index_counts);
-#endif
-
-// debug output
-#if DBG_PRNT_SEGMAP
-  {
-    FILE *statsfile;
-    statsfile = fopen("segmap2.stt", "a");
-    fprintf(statsfile, "\n");
-    fclose(statsfile);
-  }
-#endif
-
-  totalrate = 0;
-
-  // Functions setup for all frame types so we can use MC in AltRef
-  vp9_setup_interp_filters(xd, cm->mcomp_filter_type, cm);
-
-  // Reset frame count of inter 0,0 motion vector usage.
-  cpi->inter_zz_count = 0;
-
-  cpi->prediction_error = 0;
-  cpi->intra_error = 0;
-  cpi->skip_true_count[0] = cpi->skip_true_count[1] =
-      cpi->skip_true_count[2] = 0;
-  cpi->skip_false_count[0] = cpi->skip_false_count[1] =
-      cpi->skip_false_count[2] = 0;
-
-#if CONFIG_PRED_FILTER
-  if (cm->current_video_frame == 0) {
-    // Initially assume that we'll signal the prediction filter
-    // state at the frame level and that it is off.
-    cpi->common.pred_filter_mode = 0;
-    cpi->common.prob_pred_filter_off = 128;
-  }
-  cpi->pred_filter_on_count = 0;
-  cpi->pred_filter_off_count = 0;
-#endif
-  vp9_zero(cpi->switchable_interp_count);
-
-  xd->mode_info_context = cm->mi;
-  xd->prev_mode_info_context = cm->prev_mi;
-
-  vp9_zero(cpi->NMVcount);
-  vp9_zero(cpi->coef_counts);
-  vp9_zero(cpi->hybrid_coef_counts);
-  vp9_zero(cpi->coef_counts_8x8);
-  vp9_zero(cpi->hybrid_coef_counts_8x8);
-  vp9_zero(cpi->coef_counts_16x16);
-  vp9_zero(cpi->hybrid_coef_counts_16x16);
-
-  vp9_frame_init_quantizer(cpi);
-
-  vp9_initialize_rd_consts(cpi, cm->base_qindex + cm->y1dc_delta_q);
-  vp9_initialize_me_consts(cpi, cm->base_qindex);
-
-  if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
-    // Initialize encode frame context.
-    init_encode_frame_mb_context(cpi);
-
-    // Build a frame level activity map
-    build_activity_map(cpi);
-  }
-
-  // Re-initialize the encode frame context.
-  init_encode_frame_mb_context(cpi);
-
-  vpx_memset(cpi->rd_comp_pred_diff, 0, sizeof(cpi->rd_comp_pred_diff));
-  vpx_memset(cpi->single_pred_count, 0, sizeof(cpi->single_pred_count));
-  vpx_memset(cpi->comp_pred_count, 0, sizeof(cpi->comp_pred_count));
-  vpx_memset(cpi->txfm_count, 0, sizeof(cpi->txfm_count));
-  vpx_memset(cpi->txfm_count_8x8p, 0, sizeof(cpi->txfm_count_8x8p));
-  vpx_memset(cpi->rd_tx_select_diff, 0, sizeof(cpi->rd_tx_select_diff));
-  {
-    struct vpx_usec_timer  emr_timer;
-    vpx_usec_timer_start(&emr_timer);
-
-    {
-      // For each row of SBs in the frame
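-      // Each pass covers two MB rows (one SB row). encode_sb_row walks the
-      // src pointers across the row, so afterwards they are rewound by the
-      // width walked (offset = mb_cols rounded up to even, 16 luma pels
-      // per MB) and stepped down by 32 luma / 16 chroma rows.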
-      for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 2) {
-        int offset = (cm->mb_cols + 1) & ~0x1;
-
-        encode_sb_row(cpi, cm, mb_row, x, xd, &tp, &totalrate);
-
-        // advance to the next row of SBs
-        x->src.y_buffer += 32 * x->src.y_stride - 16 * offset;
-        x->src.u_buffer += 16 * x->src.uv_stride - 8 * offset;
-        x->src.v_buffer += 16 * x->src.uv_stride - 8 * offset;
-      }
-
-      cpi->tok_count = tp - cpi->tok;
-    }
-
-    vpx_usec_timer_mark(&emr_timer);
-    cpi->time_encode_mb_row += vpx_usec_timer_elapsed(&emr_timer);
-
-  }
-
-  // 256 rate units to the bit;
-  // projected_frame_size is in units of bytes
-  cpi->projected_frame_size = totalrate >> 8;
-
-
-#if 0
-  // Keep record of the total distortion this time around for future use
-  cpi->last_frame_distortion = cpi->frame_distortion;
-#endif
-
-}
-
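-// Returns 1 if at least two reference frames are effectively in use for
-// this frame (taking any segment-level reference frame restriction into
-// account), i.e. whether compound prediction is worth considering.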
-static int check_dual_ref_flags(VP9_COMP *cpi) {
-  MACROBLOCKD *xd = &cpi->mb.e_mbd;
-  int ref_flags = cpi->ref_frame_flags;
-
-  if (vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME)) {
-    if ((ref_flags & (VP9_LAST_FLAG | VP9_GOLD_FLAG)) == (VP9_LAST_FLAG | VP9_GOLD_FLAG) &&
-        vp9_check_segref(xd, 1, LAST_FRAME))
-      return 1;
-    if ((ref_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) == (VP9_GOLD_FLAG | VP9_ALT_FLAG) &&
-        vp9_check_segref(xd, 1, GOLDEN_FRAME))
-      return 1;
-    if ((ref_flags & (VP9_ALT_FLAG  | VP9_LAST_FLAG)) == (VP9_ALT_FLAG  | VP9_LAST_FLAG) &&
-        vp9_check_segref(xd, 1, ALTREF_FRAME))
-      return 1;
-    return 0;
-  } else {
-    return (!!(ref_flags & VP9_GOLD_FLAG) +
-            !!(ref_flags & VP9_LAST_FLAG) +
-            !!(ref_flags & VP9_ALT_FLAG)) >= 2;
-  }
-}
-
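-// Clamps the signalled per-MB transform size down to txfm_max. This is
-// only legal for macroblocks whose coefficients are entirely skipped (see
-// the assert below), since for those the transform size does not affect
-// the reconstruction.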
-static void reset_skip_txfm_size(VP9_COMP *cpi, TX_SIZE txfm_max) {
-  VP9_COMMON *cm = &cpi->common;
-  int mb_row, mb_col, mis = cm->mode_info_stride, segment_id;
-  MODE_INFO *mi, *mi_ptr = cm->mi;
-#if CONFIG_SUPERBLOCKS
-  MODE_INFO *sb_mi_ptr = cm->mi, *sb_mi;
-  MB_MODE_INFO *sb_mbmi;
-#endif
-  MB_MODE_INFO *mbmi;
-  MACROBLOCK *x = &cpi->mb;
-  MACROBLOCKD *xd = &x->e_mbd;
-
-  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++, mi_ptr += mis) {
-    mi = mi_ptr;
-#if CONFIG_SUPERBLOCKS
-    sb_mi = sb_mi_ptr;
-#endif
-    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++, mi++) {
-      mbmi = &mi->mbmi;
-#if CONFIG_SUPERBLOCKS
-      sb_mbmi = &sb_mi->mbmi;
-#endif
-      if (
-#if CONFIG_SUPERBLOCKS
-          !sb_mbmi->encoded_as_sb &&
-#endif
-          mbmi->txfm_size > txfm_max) {
-        segment_id = mbmi->segment_id;
-        xd->mode_info_context = mi;
-        assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
-                vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) ||
-               (cm->mb_no_coeff_skip && mbmi->mb_skip_coeff));
-        mbmi->txfm_size = txfm_max;
-      }
-#if CONFIG_SUPERBLOCKS
-      if (mb_col & 1)
-        sb_mi += 2;
-#endif
-    }
-#if CONFIG_SUPERBLOCKS
-    if (mb_row & 1)
-      sb_mi_ptr += 2 * mis;
-#endif
-  }
-}
-
-void vp9_encode_frame(VP9_COMP *cpi) {
-  if (cpi->sf.RD) {
-    int i, frame_type, pred_type;
-    TXFM_MODE txfm_type;
-
-    /*
-     * This code does a single RD pass over the whole frame assuming
-     * either compound, single or hybrid prediction as per whatever has
-     * worked best for that type of frame in the past.
-     * It also predicts whether another coding mode would have worked
-     * better than this coding mode. If that is the case, it remembers
-     * that for subsequent frames.
-     * It performs the same analysis for transform size selection as well.
-     */
-    if (cpi->common.frame_type == KEY_FRAME)
-      frame_type = 0;
-    else if (cpi->is_src_frame_alt_ref && cpi->common.refresh_golden_frame)
-      frame_type = 3;
-    else if (cpi->common.refresh_golden_frame || cpi->common.refresh_alt_ref_frame)
-      frame_type = 1;
-    else
-      frame_type = 2;
-
-    /* prediction (compound, single or hybrid) mode selection */
-    if (frame_type == 3)
-      pred_type = SINGLE_PREDICTION_ONLY;
-    else if (cpi->rd_prediction_type_threshes[frame_type][1] >
-                 cpi->rd_prediction_type_threshes[frame_type][0] &&
-             cpi->rd_prediction_type_threshes[frame_type][1] >
-                 cpi->rd_prediction_type_threshes[frame_type][2] &&
-             check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100)
-      pred_type = COMP_PREDICTION_ONLY;
-    else if (cpi->rd_prediction_type_threshes[frame_type][0] >
-                 cpi->rd_prediction_type_threshes[frame_type][2])
-      pred_type = SINGLE_PREDICTION_ONLY;
-    else
-      pred_type = HYBRID_PREDICTION;
-
-    /* transform size (4x4, 8x8, 16x16 or select-per-mb) selection */
-#if CONFIG_LOSSLESS
-    if (cpi->oxcf.lossless) {
-      txfm_type = ONLY_4X4;
-    } else
-#endif
-    /* FIXME (rbultje)
-     * this is a hack (no really), basically to work around the complete
-     * nonsense coefficient cost prediction for keyframes. The probabilities
-     * are reset to defaults, and thus we basically have no idea how expensive
- * a 4x4 vs. 8x8 will really be. The result is that any estimate of which
- * of the two is better is utterly bogus.
-     * I'd like to eventually remove this hack, but in order to do that, we
-     * need to move the frame reset code from the frame encode init to the
-     * bitstream write code, or alternatively keep a backup of the previous
-     * keyframe's probabilities as an estimate of what the current keyframe's
-     * coefficient cost distributions may look like. */
-    if (frame_type == 0) {
-      txfm_type = ALLOW_16X16;
-    } else
-#if 0
-    /* FIXME (rbultje)
-     * this code is disabled for a similar reason as the code above; the
-     * problem is that each time we "revert" to 4x4 only (or even 8x8 only),
-     * the coefficient probabilities for 16x16 (and 8x8) start lagging behind,
-     * thus leading to them lagging further behind and not being chosen for
-     * subsequent frames either. This is essentially a local minimum problem
-     * that we can probably fix by estimating real costs more closely within
-     * a frame, perhaps by re-calculating costs on-the-fly as frame encoding
-     * progresses. */
-    if (cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
-            cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] &&
-        cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
-            cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] &&
-        cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
-            cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) {
-      txfm_type = TX_MODE_SELECT;
-    } else if (cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] >
-                  cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]
-            && cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] >
-                  cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16]
-               ) {
-      txfm_type = ONLY_4X4;
-    } else if (cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] >=
-                  cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) {
-      txfm_type = ALLOW_16X16;
-    } else
-      txfm_type = ALLOW_8X8;
-#else
-    txfm_type = cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] >=
-                 cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ?
-    ALLOW_16X16 : TX_MODE_SELECT;
-#endif
-    cpi->common.txfm_mode = txfm_type;
-    if (txfm_type != TX_MODE_SELECT) {
-      cpi->common.prob_tx[0] = 128;
-      cpi->common.prob_tx[1] = 128;
-    }
-    cpi->common.comp_pred_mode = pred_type;
-    encode_frame_internal(cpi);
-
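-    // Both threshold arrays below are kept as exponential moving averages:
-    // thresh = (thresh + per-MB diff) / 2, so recent frames carry
-    // geometrically more weight than older ones.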
-    for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
-      const int diff = cpi->rd_comp_pred_diff[i] / cpi->common.MBs;
-      cpi->rd_prediction_type_threshes[frame_type][i] += diff;
-      cpi->rd_prediction_type_threshes[frame_type][i] >>= 1;
-    }
-
-    for (i = 0; i < NB_TXFM_MODES; ++i) {
-      int64_t pd = cpi->rd_tx_select_diff[i];
-      int diff;
-      if (i == TX_MODE_SELECT)
-        pd -= RDCOST(cpi->mb.rdmult, cpi->mb.rddiv, 2048 * (TX_SIZE_MAX - 1), 0);
-      diff = pd / cpi->common.MBs;
-      cpi->rd_tx_select_threshes[frame_type][i] += diff;
-      cpi->rd_tx_select_threshes[frame_type][i] /= 2;
-    }
-
-    if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
-      int single_count_zero = 0;
-      int comp_count_zero = 0;
-
-      for (i = 0; i < COMP_PRED_CONTEXTS; i++) {
-        single_count_zero += cpi->single_pred_count[i];
-        comp_count_zero += cpi->comp_pred_count[i];
-      }
-
-      if (comp_count_zero == 0) {
-        cpi->common.comp_pred_mode = SINGLE_PREDICTION_ONLY;
-      } else if (single_count_zero == 0) {
-        cpi->common.comp_pred_mode = COMP_PREDICTION_ONLY;
-      }
-    }
-
-    if (cpi->common.txfm_mode == TX_MODE_SELECT) {
-      const int count4x4 = cpi->txfm_count[TX_4X4] + cpi->txfm_count_8x8p[TX_4X4];
-      const int count8x8 = cpi->txfm_count[TX_8X8];
-      const int count8x8_8x8p = cpi->txfm_count_8x8p[TX_8X8];
-      const int count16x16 = cpi->txfm_count[TX_16X16];
-
-      if (count4x4 == 0 && count16x16 == 0) {
-        cpi->common.txfm_mode = ALLOW_8X8;
-        reset_skip_txfm_size(cpi, TX_8X8);
-      } else if (count8x8 == 0 && count16x16 == 0 && count8x8_8x8p == 0) {
-        cpi->common.txfm_mode = ONLY_4X4;
-        reset_skip_txfm_size(cpi, TX_4X4);
-      } else if (count8x8 == 0 && count4x4 == 0) {
-        cpi->common.txfm_mode = ALLOW_16X16;
-      }
-    }
-  } else {
-    encode_frame_internal(cpi);
-  }
-
-}
-
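-// Points each of the 25 blocks at its slice of the shared buffers: blocks
-// 0-15 cover the 16x16 luma diff (offset 0), 16-19 the 8x8 U diff (offset
-// 256), 20-23 the 8x8 V diff (offset 320), and block 24 holds the
-// second-order (Y2) block at offset 384. Coefficients are 16 per block.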
-void vp9_setup_block_ptrs(MACROBLOCK *x) {
-  int r, c;
-  int i;
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++) {
-      x->block[r * 4 + c].src_diff = x->src_diff + r * 4 * 16 + c * 4;
-    }
-  }
-
-  for (r = 0; r < 2; r++) {
-    for (c = 0; c < 2; c++) {
-      x->block[16 + r * 2 + c].src_diff = x->src_diff + 256 + r * 4 * 8 + c * 4;
-    }
-  }
-
-
-  for (r = 0; r < 2; r++) {
-    for (c = 0; c < 2; c++) {
-      x->block[20 + r * 2 + c].src_diff = x->src_diff + 320 + r * 4 * 8 + c * 4;
-    }
-  }
-
-  x->block[24].src_diff = x->src_diff + 384;
-
-
-  for (i = 0; i < 25; i++) {
-    x->block[i].coeff = x->coeff + i * 16;
-  }
-}
-
-void vp9_build_block_offsets(MACROBLOCK *x) {
-  int block = 0;
-  int br, bc;
-
-  vp9_build_block_doffsets(&x->e_mbd);
-
-  // y blocks
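-  // Luma blocks read from x->thismb, a contiguous 16x16 copy of the source
-  // macroblock, so their stride is a constant 16 rather than the frame's
-  // y_stride (the commented-out lines below show the old scheme).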
-  x->thismb_ptr = &x->thismb[0];
-  for (br = 0; br < 4; br++) {
-    for (bc = 0; bc < 4; bc++) {
-      BLOCK *this_block = &x->block[block];
-      // this_block->base_src = &x->src.y_buffer;
-      // this_block->src_stride = x->src.y_stride;
-      // this_block->src = 4 * br * this_block->src_stride + 4 * bc;
-      this_block->base_src = &x->thismb_ptr;
-      this_block->src_stride = 16;
-      this_block->src = 4 * br * 16 + 4 * bc;
-      ++block;
-    }
-  }
-
-  // u blocks
-  for (br = 0; br < 2; br++) {
-    for (bc = 0; bc < 2; bc++) {
-      BLOCK *this_block = &x->block[block];
-      this_block->base_src = &x->src.u_buffer;
-      this_block->src_stride = x->src.uv_stride;
-      this_block->src = 4 * br * this_block->src_stride + 4 * bc;
-      ++block;
-    }
-  }
-
-  // v blocks
-  for (br = 0; br < 2; br++) {
-    for (bc = 0; bc < 2; bc++) {
-      BLOCK *this_block = &x->block[block];
-      this_block->base_src = &x->src.v_buffer;
-      this_block->src_stride = x->src.uv_stride;
-      this_block->src = 4 * br * this_block->src_stride + 4 * bc;
-      ++block;
-    }
-  }
-}
-
-static void sum_intra_stats(VP9_COMP *cpi, MACROBLOCK *x) {
-  const MACROBLOCKD *xd = &x->e_mbd;
-  const MB_PREDICTION_MODE m = xd->mode_info_context->mbmi.mode;
-  const MB_PREDICTION_MODE uvm = xd->mode_info_context->mbmi.uv_mode;
-
-#ifdef MODE_STATS
-  const int is_key = cpi->common.frame_type == KEY_FRAME;
-
-  ++ (is_key ? uv_modes : inter_uv_modes)[uvm];
-  ++ uv_modes_y[m][uvm];
-
-  if (m == B_PRED) {
-    unsigned int *const bct = is_key ? b_modes : inter_b_modes;
-
-    int b = 0;
-
-    do {
-      ++ bct[xd->block[b].bmi.as_mode.first];
-    } while (++b < 16);
-  }
-
-  if (m == I8X8_PRED) {
-    i8x8_modes[xd->block[0].bmi.as_mode.first]++;
-    i8x8_modes[xd->block[2].bmi.as_mode.first]++;
-    i8x8_modes[xd->block[8].bmi.as_mode.first]++;
-    i8x8_modes[xd->block[10].bmi.as_mode.first]++;
-  }
-#endif
-
-#if CONFIG_SUPERBLOCKS
-  if (xd->mode_info_context->mbmi.encoded_as_sb) {
-    ++cpi->sb_ymode_count[m];
-  } else
-#endif
-    ++cpi->ymode_count[m];
-  if (m != I8X8_PRED)
-    ++cpi->y_uv_mode_count[m][uvm];
-  else {
-    cpi->i8x8_mode_count[xd->block[0].bmi.as_mode.first]++;
-    cpi->i8x8_mode_count[xd->block[2].bmi.as_mode.first]++;
-    cpi->i8x8_mode_count[xd->block[8].bmi.as_mode.first]++;
-    cpi->i8x8_mode_count[xd->block[10].bmi.as_mode.first]++;
-  }
-  if (m == B_PRED) {
-    int b = 0;
-    do {
-      ++ cpi->bmode_count[xd->block[b].bmi.as_mode.first];
-    } while (++b < 16);
-  }
-}
-
-// Experimental stub function to create a per MB zbin adjustment based on
-// some previously calculated measure of MB activity.
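-// The adjustment scales with the ratio of this MB's activity to the frame
-// average: zero near the average, positive for well above-average activity
-// and negative for well below-average activity.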
-static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x) {
-#if USE_ACT_INDEX
-  x->act_zbin_adj = *(x->mb_activity_ptr);
-#else
-  int64_t a;
-  int64_t b;
-  int64_t act = *(x->mb_activity_ptr);
-
-  // Apply the masking to the RD multiplier.
-  a = act + 4 * cpi->activity_avg;
-  b = 4 * act + cpi->activity_avg;
-
-  if (act > cpi->activity_avg)
-    x->act_zbin_adj = (int)(((int64_t)b + (a >> 1)) / a) - 1;
-  else
-    x->act_zbin_adj = 1 - (int)(((int64_t)a + (b >> 1)) / b);
-#endif
-}
-
-#if CONFIG_SUPERBLOCKS
-static void update_sb_skip_coeff_state(VP9_COMP *cpi,
-                                       MACROBLOCK *x,
-                                       ENTROPY_CONTEXT_PLANES ta[4],
-                                       ENTROPY_CONTEXT_PLANES tl[4],
-                                       TOKENEXTRA *t[4],
-                                       TOKENEXTRA **tp,
-                                       int skip[4])
-{
-  TOKENEXTRA tokens[4][16 * 24];
-  int n_tokens[4], n;
-
-  // if there were no skips, we don't need to do anything
-  if (!skip[0] && !skip[1] && !skip[2] && !skip[3])
-    return;
-
-  // if we don't do coeff skipping for this frame, we don't
-  // need to do anything here
-  if (!cpi->common.mb_no_coeff_skip)
-    return;
-
-  // if all 4 MBs skipped coeff coding, nothing to be done
-  if (skip[0] && skip[1] && skip[2] && skip[3])
-    return;
-
-  // so the situation now is that we want to skip coeffs
-  // for some MBs, but not all, and we didn't code EOB
-  // coefficients for them. However, the skip flag for this
-  // SB will be 0 overall, so we need to insert EOBs in the
-  // middle of the token tree. Do so here.
-  n_tokens[0] = t[1] - t[0];
-  n_tokens[1] = t[2] - t[1];
-  n_tokens[2] = t[3] - t[2];
-  n_tokens[3] = *tp  - t[3];
-  if (n_tokens[0])
-    memcpy(tokens[0], t[0], n_tokens[0] * sizeof(*t[0]));
-  if (n_tokens[1])
-    memcpy(tokens[1], t[1], n_tokens[1] * sizeof(*t[0]));
-  if (n_tokens[2])
-    memcpy(tokens[2], t[2], n_tokens[2] * sizeof(*t[0]));
-  if (n_tokens[3])
-    memcpy(tokens[3], t[3], n_tokens[3] * sizeof(*t[0]));
-
-  // reset pointer, stuff EOBs where necessary
-  *tp = t[0];
-  for (n = 0; n < 4; n++) {
-    if (skip[n]) {
-      x->e_mbd.above_context = &ta[n];
-      x->e_mbd.left_context  = &tl[n];
-      vp9_stuff_mb(cpi, &x->e_mbd, tp, 0);
-    } else {
-      if (n_tokens[n]) {
-        memcpy(*tp, tokens[n], sizeof(*t[0]) * n_tokens[n]);
-      }
-      (*tp) += n_tokens[n];
-    }
-  }
-}
-
-void vp9_encode_intra_super_block(VP9_COMP *cpi,
-                                  MACROBLOCK *x,
-                                  TOKENEXTRA **t,
-                                  int mb_col) {
-  const int output_enabled = 1;
-  int n;
-  MACROBLOCKD *xd = &x->e_mbd;
-  VP9_COMMON *cm = &cpi->common;
-  const uint8_t *src = x->src.y_buffer;
-  uint8_t *dst = xd->dst.y_buffer;
-  const uint8_t *usrc = x->src.u_buffer;
-  uint8_t *udst = xd->dst.u_buffer;
-  const uint8_t *vsrc = x->src.v_buffer;
-  uint8_t *vdst = xd->dst.v_buffer;
-  int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
-  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
-  const VP9_ENCODER_RTCD *rtcd = IF_RTCD(&cpi->rtcd);
-  TOKENEXTRA *tp[4];
-  int skip[4];
-  MODE_INFO *mi = x->e_mbd.mode_info_context;
-  ENTROPY_CONTEXT_PLANES ta[4], tl[4];
-
-  if ((cpi->oxcf.tuning == VP8_TUNE_SSIM) && output_enabled) {
-    adjust_act_zbin(cpi, x);
-    vp9_update_zbin_extra(cpi, x);
-  }
-
-  vp9_build_intra_predictors_sby_s(&x->e_mbd);
-  vp9_build_intra_predictors_sbuv_s(&x->e_mbd);
-
-  assert(x->e_mbd.mode_info_context->mbmi.txfm_size == TX_8X8);
-  for (n = 0; n < 4; n++) {
-    int x_idx = n & 1, y_idx = n >> 1;
-
-    xd->above_context = cm->above_context + mb_col + (n & 1);
-    xd->left_context = cm->left_context + (n >> 1);
-
-    vp9_subtract_mby_s_c(x->src_diff,
-                         src + x_idx * 16 + y_idx * 16 * src_y_stride,
-                         src_y_stride,
-                         dst + x_idx * 16 + y_idx * 16 * dst_y_stride,
-                         dst_y_stride);
-    vp9_subtract_mbuv_s_c(x->src_diff,
-                          usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                          vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                          src_uv_stride,
-                          udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                          vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                          dst_uv_stride);
-    vp9_transform_mb_8x8(x);
-    vp9_quantize_mb_8x8(x);
-    if (x->optimize) {
-      vp9_optimize_mby_8x8(x, rtcd);
-      vp9_optimize_mbuv_8x8(x, rtcd);
-    }
-    vp9_inverse_transform_mb_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
-    vp9_recon_mby_s_c(&x->e_mbd, dst + x_idx * 16 + y_idx * 16 * dst_y_stride);
-    vp9_recon_mbuv_s_c(&x->e_mbd,
-                       udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                       vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride);
-
-    if (output_enabled) {
-      memcpy(&ta[n], xd->above_context, sizeof(ta[n]));
-      memcpy(&tl[n], xd->left_context, sizeof(tl[n]));
-      tp[n] = *t;
-      xd->mode_info_context = mi + x_idx + y_idx * cm->mode_info_stride;
-      vp9_tokenize_mb(cpi, &x->e_mbd, t, 0);
-      skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff;
-    }
-  }
-
-  if (output_enabled) {
-    // Update intra stats and the SB skip/coeff state
-    xd->mode_info_context = mi;
-    sum_intra_stats(cpi, x);
-    update_sb_skip_coeff_state(cpi, x, ta, tl, tp, t, skip);
-  }
-}
-#endif /* CONFIG_SUPERBLOCKS */
-
-void vp9_encode_intra_macro_block(VP9_COMP *cpi,
-                                  MACROBLOCK *x,
-                                  TOKENEXTRA **t,
-                                  int output_enabled) {
-  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-  if ((cpi->oxcf.tuning == VP8_TUNE_SSIM) && output_enabled) {
-    adjust_act_zbin(cpi, x);
-    vp9_update_zbin_extra(cpi, x);
-  }
-  if (mbmi->mode == I8X8_PRED) {
-    vp9_encode_intra8x8mby(IF_RTCD(&cpi->rtcd), x);
-    vp9_encode_intra8x8mbuv(IF_RTCD(&cpi->rtcd), x);
-  } else if (mbmi->mode == B_PRED) {
-    vp9_encode_intra4x4mby(IF_RTCD(&cpi->rtcd), x);
-  } else {
-    vp9_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
-  }
-
-  if (mbmi->mode != I8X8_PRED) {
-    vp9_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
-  }
-
-  if (output_enabled) {
-    int segment_id = mbmi->segment_id;
-
-    // Gather intra stats and tokenize
-    sum_intra_stats(cpi, x);
-    vp9_tokenize_mb(cpi, &x->e_mbd, t, 0);
-
-    if (cpi->common.txfm_mode == TX_MODE_SELECT &&
-        !((cpi->common.mb_no_coeff_skip && mbmi->mb_skip_coeff) ||
-          (vp9_segfeature_active(&x->e_mbd, segment_id, SEG_LVL_EOB) &&
-           vp9_get_segdata(&x->e_mbd, segment_id, SEG_LVL_EOB) == 0))) {
-      if (mbmi->mode != B_PRED && mbmi->mode != I8X8_PRED) {
-        cpi->txfm_count[mbmi->txfm_size]++;
-      } else if (mbmi->mode == I8X8_PRED) {
-        cpi->txfm_count_8x8p[mbmi->txfm_size]++;
-      }
-    } else if (cpi->common.txfm_mode >= ALLOW_16X16 && mbmi->mode <= TM_PRED) {
-      mbmi->txfm_size = TX_16X16;
-    } else
-    if (cpi->common.txfm_mode >= ALLOW_8X8 && mbmi->mode != B_PRED) {
-      mbmi->txfm_size = TX_8X8;
-    } else {
-      mbmi->txfm_size = TX_4X4;
-    }
-  }
-#if CONFIG_NEWBESTREFMV
-  else
-    vp9_tokenize_mb(cpi, &x->e_mbd, t, 1);
-#endif
-}
-
-extern void vp9_fix_contexts(MACROBLOCKD *xd);
-
-void vp9_encode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
-                                 TOKENEXTRA **t, int recon_yoffset,
-                                 int recon_uvoffset, int output_enabled) {
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
-  unsigned char *segment_id = &mbmi->segment_id;
-  int seg_ref_active;
-  unsigned char ref_pred_flag;
-
-  x->skip = 0;
-#if CONFIG_SUPERBLOCKS
-  assert(!xd->mode_info_context->mbmi.encoded_as_sb);
-#endif
-
-  vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
-  if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
-    // Adjust the zbin based on this MB rate.
-    adjust_act_zbin(cpi, x);
-  }
-
-  {
-    // Experimental code. Special case for gf and arf zeromv modes.
-    // Increase zbin size to suppress noise
-    cpi->zbin_mode_boost = 0;
-    if (cpi->zbin_mode_boost_enabled) {
-      if (mbmi->ref_frame != INTRA_FRAME) {
-        if (mbmi->mode == ZEROMV) {
-          if (mbmi->ref_frame != LAST_FRAME)
-            cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
-          else
-            cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
-        } else if (mbmi->mode == SPLITMV)
-          cpi->zbin_mode_boost = 0;
-        else
-          cpi->zbin_mode_boost = MV_ZBIN_BOOST;
-      }
-    }
-
-    vp9_update_zbin_extra(cpi, x);
-  }
-
-  seg_ref_active = vp9_segfeature_active(xd, *segment_id, SEG_LVL_REF_FRAME);
-
-  // SET VARIOUS PREDICTION FLAGS
-
-  // Did the chosen reference frame match its predicted value?
-  ref_pred_flag = ((mbmi->ref_frame == vp9_get_pred_ref(cm, xd)));
-  vp9_set_pred_flag(xd, PRED_REF, ref_pred_flag);
-
-  if (mbmi->ref_frame == INTRA_FRAME) {
-    if (mbmi->mode == B_PRED) {
-      vp9_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
-      vp9_encode_intra4x4mby(IF_RTCD(&cpi->rtcd), x);
-    } else if (mbmi->mode == I8X8_PRED) {
-      vp9_encode_intra8x8mby(IF_RTCD(&cpi->rtcd), x);
-      vp9_encode_intra8x8mbuv(IF_RTCD(&cpi->rtcd), x);
-    } else {
-      vp9_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
-      vp9_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
-    }
-
-    if (output_enabled)
-      sum_intra_stats(cpi, x);
-  } else {
-    int ref_fb_idx;
-
-    if (mbmi->ref_frame == LAST_FRAME)
-      ref_fb_idx = cpi->common.lst_fb_idx;
-    else if (mbmi->ref_frame == GOLDEN_FRAME)
-      ref_fb_idx = cpi->common.gld_fb_idx;
-    else
-      ref_fb_idx = cpi->common.alt_fb_idx;
-
-    xd->pre.y_buffer = cpi->common.yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
-    xd->pre.u_buffer = cpi->common.yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
-    xd->pre.v_buffer = cpi->common.yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
-
-    if (mbmi->second_ref_frame) {
-      int second_ref_fb_idx;
-
-      if (mbmi->second_ref_frame == LAST_FRAME)
-        second_ref_fb_idx = cpi->common.lst_fb_idx;
-      else if (mbmi->second_ref_frame == GOLDEN_FRAME)
-        second_ref_fb_idx = cpi->common.gld_fb_idx;
-      else
-        second_ref_fb_idx = cpi->common.alt_fb_idx;
-
-      xd->second_pre.y_buffer = cpi->common.yv12_fb[second_ref_fb_idx].y_buffer +
-                                recon_yoffset;
-      xd->second_pre.u_buffer = cpi->common.yv12_fb[second_ref_fb_idx].u_buffer +
-                                recon_uvoffset;
-      xd->second_pre.v_buffer = cpi->common.yv12_fb[second_ref_fb_idx].v_buffer +
-                                recon_uvoffset;
-    }
-
-    if (!x->skip) {
-      vp9_encode_inter16x16(IF_RTCD(&cpi->rtcd), x);
-
-      // Clear mb_skip_coeff if mb_no_coeff_skip is not set
-      if (!cpi->common.mb_no_coeff_skip)
-        mbmi->mb_skip_coeff = 0;
-
-    } else {
-      vp9_build_1st_inter16x16_predictors_mb(xd, xd->dst.y_buffer,
-                                             xd->dst.u_buffer, xd->dst.v_buffer,
-                                             xd->dst.y_stride,
-                                             xd->dst.uv_stride);
-    }
-  }
-
-  if (!x->skip) {
-#ifdef ENC_DEBUG
-    if (enc_debug) {
-      int i;
-      printf("Segment=%d [%d, %d]: %d %d:\n", mbmi->segment_id, mb_col_debug,
-             mb_row_debug, xd->mb_to_left_edge, xd->mb_to_top_edge);
-      for (i = 0; i < 400; i++) {
-        printf("%3d ", xd->qcoeff[i]);
-        if (i % 16 == 15) printf("\n");
-      }
-      printf("\n");
-      printf("eobs = ");
-      for (i = 0; i < 25; i++)
-        printf("%d:%d ", i, xd->block[i].eob);
-      printf("\n");
-      fflush(stdout);
-    }
-#endif
-
-    vp9_tokenize_mb(cpi, xd, t, !output_enabled);
-
-#ifdef ENC_DEBUG
-    if (enc_debug) {
-      printf("Tokenized\n");
-      fflush(stdout);
-    }
-#endif
-  } else {
-    int mb_skip_context =
-      cpi->common.mb_no_coeff_skip ?
-      (x->e_mbd.mode_info_context - 1)->mbmi.mb_skip_coeff +
-      (x->e_mbd.mode_info_context - cpi->common.mode_info_stride)->mbmi.mb_skip_coeff :
-      0;
-    if (cpi->common.mb_no_coeff_skip) {
-      mbmi->mb_skip_coeff = 1;
-      if (output_enabled)
-        cpi->skip_true_count[mb_skip_context]++;
-      vp9_fix_contexts(xd);
-    } else {
-      vp9_stuff_mb(cpi, xd, t, !output_enabled);
-      mbmi->mb_skip_coeff = 0;
-      if (output_enabled)
-        cpi->skip_false_count[mb_skip_context]++;
-    }
-  }
-
-  if (output_enabled) {
-    int segment_id = mbmi->segment_id;
-    if (cpi->common.txfm_mode == TX_MODE_SELECT &&
-        !((cpi->common.mb_no_coeff_skip && mbmi->mb_skip_coeff) ||
-          (vp9_segfeature_active(&x->e_mbd, segment_id, SEG_LVL_EOB) &&
-           vp9_get_segdata(&x->e_mbd, segment_id, SEG_LVL_EOB) == 0))) {
-      if (mbmi->mode != B_PRED && mbmi->mode != I8X8_PRED &&
-          mbmi->mode != SPLITMV) {
-        cpi->txfm_count[mbmi->txfm_size]++;
-      } else if (mbmi->mode == I8X8_PRED ||
-                 (mbmi->mode == SPLITMV &&
-                  mbmi->partitioning != PARTITIONING_4X4)) {
-        cpi->txfm_count_8x8p[mbmi->txfm_size]++;
-      }
-    } else if (mbmi->mode != B_PRED && mbmi->mode != I8X8_PRED &&
-        mbmi->mode != SPLITMV && cpi->common.txfm_mode >= ALLOW_16X16) {
-      mbmi->txfm_size = TX_16X16;
-    } else if (mbmi->mode != B_PRED &&
-               !(mbmi->mode == SPLITMV &&
-                 mbmi->partitioning == PARTITIONING_4X4) &&
-               cpi->common.txfm_mode >= ALLOW_8X8) {
-      mbmi->txfm_size = TX_8X8;
-    } else {
-      mbmi->txfm_size = TX_4X4;
-    }
-  }
-}
-
-#if CONFIG_SUPERBLOCKS
-void vp9_encode_inter_superblock(VP9_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
-                                 int recon_yoffset, int recon_uvoffset,
-                                 int mb_col, int mb_row) {
-  const int output_enabled = 1;
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  const uint8_t *src = x->src.y_buffer;
-  uint8_t *dst = xd->dst.y_buffer;
-  const uint8_t *usrc = x->src.u_buffer;
-  uint8_t *udst = xd->dst.u_buffer;
-  const uint8_t *vsrc = x->src.v_buffer;
-  uint8_t *vdst = xd->dst.v_buffer;
-  int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
-  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
-  const VP9_ENCODER_RTCD *rtcd = IF_RTCD(&cpi->rtcd);
-  unsigned int segment_id = xd->mode_info_context->mbmi.segment_id;
-  int seg_ref_active;
-  unsigned char ref_pred_flag;
-  int n;
-  TOKENEXTRA *tp[4];
-  int skip[4];
-  MODE_INFO *mi = x->e_mbd.mode_info_context;
-  ENTROPY_CONTEXT_PLANES ta[4], tl[4];
-
-  x->skip = 0;
-
-  if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
-    // Adjust the zbin based on this MB rate.
-    adjust_act_zbin(cpi, x);
-  }
-
-  {
-    // Experimental code. Special case for gf and arf zeromv modes.
-    // Increase zbin size to suppress noise
-    cpi->zbin_mode_boost = 0;
-    if (cpi->zbin_mode_boost_enabled) {
-      if (xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME) {
-        if (xd->mode_info_context->mbmi.mode == ZEROMV) {
-          if (xd->mode_info_context->mbmi.ref_frame != LAST_FRAME)
-            cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
-          else
-            cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
-        } else if (xd->mode_info_context->mbmi.mode == SPLITMV)
-          cpi->zbin_mode_boost = 0;
-        else
-          cpi->zbin_mode_boost = MV_ZBIN_BOOST;
-      }
-    }
-
-    vp9_update_zbin_extra(cpi, x);
-  }
-
-  seg_ref_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME);
-
-  // SET VARIOUS PREDICTION FLAGS
-
-  // Did the chosen reference frame match its predicted value?
-  ref_pred_flag = ((xd->mode_info_context->mbmi.ref_frame ==
-                    vp9_get_pred_ref(cm, xd)));
-  vp9_set_pred_flag(xd, PRED_REF, ref_pred_flag);
-
-  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
-    vp9_build_intra_predictors_sby_s(&x->e_mbd);
-    vp9_build_intra_predictors_sbuv_s(&x->e_mbd);
-  } else {
-    int ref_fb_idx;
-
-    if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
-      ref_fb_idx = cpi->common.lst_fb_idx;
-    else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
-      ref_fb_idx = cpi->common.gld_fb_idx;
-    else
-      ref_fb_idx = cpi->common.alt_fb_idx;
-
-    xd->pre.y_buffer = cpi->common.yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
-    xd->pre.u_buffer = cpi->common.yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
-    xd->pre.v_buffer = cpi->common.yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
-
-    if (xd->mode_info_context->mbmi.second_ref_frame) {
-      int second_ref_fb_idx;
-
-      if (xd->mode_info_context->mbmi.second_ref_frame == LAST_FRAME)
-        second_ref_fb_idx = cpi->common.lst_fb_idx;
-      else if (xd->mode_info_context->mbmi.second_ref_frame == GOLDEN_FRAME)
-        second_ref_fb_idx = cpi->common.gld_fb_idx;
-      else
-        second_ref_fb_idx = cpi->common.alt_fb_idx;
-
-      xd->second_pre.y_buffer = cpi->common.yv12_fb[second_ref_fb_idx].y_buffer +
-                                    recon_yoffset;
-      xd->second_pre.u_buffer = cpi->common.yv12_fb[second_ref_fb_idx].u_buffer +
-                                    recon_uvoffset;
-      xd->second_pre.v_buffer = cpi->common.yv12_fb[second_ref_fb_idx].v_buffer +
-                                    recon_uvoffset;
-    }
-
-    vp9_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,
-                                       xd->dst.u_buffer, xd->dst.v_buffer,
-                                       xd->dst.y_stride, xd->dst.uv_stride);
-  }
-
-  assert(x->e_mbd.mode_info_context->mbmi.txfm_size == TX_8X8);
-  for (n = 0; n < 4; n++) {
-    int x_idx = n & 1, y_idx = n >> 1;
-
-    vp9_subtract_mby_s_c(x->src_diff,
-                         src + x_idx * 16 + y_idx * 16 * src_y_stride,
-                         src_y_stride,
-                         dst + x_idx * 16 + y_idx * 16 * dst_y_stride,
-                         dst_y_stride);
-    vp9_subtract_mbuv_s_c(x->src_diff,
-                          usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                          vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                          src_uv_stride,
-                          udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                          vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                          dst_uv_stride);
-    vp9_transform_mb_8x8(x);
-    vp9_quantize_mb_8x8(x);
-    if (x->optimize) {
-      vp9_optimize_mby_8x8(x, rtcd);
-      vp9_optimize_mbuv_8x8(x, rtcd);
-    }
-    vp9_inverse_transform_mb_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
-    vp9_recon_mby_s_c(&x->e_mbd,
-                      dst + x_idx * 16 + y_idx * 16 * dst_y_stride);
-    vp9_recon_mbuv_s_c(&x->e_mbd,
-                       udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                       vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride);
-
-    if (!x->skip) {
-      if (output_enabled) {
-        xd->left_context = cm->left_context + (n >> 1);
-        xd->above_context = cm->above_context + mb_col + (n & 1);
-        memcpy(&ta[n], xd->above_context, sizeof(ta[n]));
-        memcpy(&tl[n], xd->left_context, sizeof(tl[n]));
-        tp[n] = *t;
-        xd->mode_info_context = mi + x_idx + y_idx * cm->mode_info_stride;
-        vp9_tokenize_mb(cpi, &x->e_mbd, t, 0);
-        skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff;
-      }
-    } else {
-      int mb_skip_context =
-        cpi->common.mb_no_coeff_skip ?
-          (x->e_mbd.mode_info_context - 1)->mbmi.mb_skip_coeff +
-            (x->e_mbd.mode_info_context - cpi->common.mode_info_stride)->mbmi.mb_skip_coeff :
-          0;
-      if (cpi->common.mb_no_coeff_skip) {
-        skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff = 1;
-        xd->left_context = cm->left_context + (n >> 1);
-        xd->above_context = cm->above_context + mb_col + (n & 1);
-        memcpy(&ta[n], xd->above_context, sizeof(ta[n]));
-        memcpy(&tl[n], xd->left_context, sizeof(tl[n]));
-        tp[n] = *t;
-        cpi->skip_true_count[mb_skip_context]++;
-        vp9_fix_contexts(xd);
-      } else {
-        vp9_stuff_mb(cpi, xd, t, 0);
-        xd->mode_info_context->mbmi.mb_skip_coeff = 0;
-        cpi->skip_false_count[mb_skip_context]++;
-      }
-    }
-  }
-
-  xd->mode_info_context = mi;
-  update_sb_skip_coeff_state(cpi, x, ta, tl, tp, t, skip);
-}
-#endif
--- a/vp8/encoder/encodeintra.c
+++ /dev/null
@@ -1,289 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_ports/config.h"
-#include "vpx_rtcd.h"
-#include "vp8/common/idct.h"
-#include "quantize.h"
-#include "vp8/common/reconintra.h"
-#include "vp8/common/reconintra4x4.h"
-#include "encodemb.h"
-#include "vp8/common/invtrans.h"
-#include "encodeintra.h"
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define IF_RTCD(x) (x)
-#else
-#define IF_RTCD(x) NULL
-#endif
-
-int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred) {
-  int i;
-  int intra_pred_var = 0;
-  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-  (void) cpi;
-
-  if (use_16x16_pred) {
-    mbmi->mode = DC_PRED;
-#if CONFIG_COMP_INTRA_PRED
-    mbmi->second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
-    mbmi->uv_mode = DC_PRED;
-    mbmi->ref_frame = INTRA_FRAME;
-
-    vp9_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
-  } else {
-    for (i = 0; i < 16; i++) {
-      x->e_mbd.block[i].bmi.as_mode.first = B_DC_PRED;
-      vp9_encode_intra4x4block(IF_RTCD(&cpi->rtcd), x, i);
-    }
-  }
-
-  intra_pred_var = vp9_get_mb_ss(x->src_diff);
-
-  return intra_pred_var;
-}
-
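-// Encodes one 4x4 intra luma block: predict, subtract, forward transform
-// (using the hybrid ADST/DCT variants when the prediction mode maps to a
-// non-DCT tx_type), quantize, inverse transform and reconstruct.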
-void vp9_encode_intra4x4block(const VP9_ENCODER_RTCD *rtcd,
-                              MACROBLOCK *x, int ib) {
-  BLOCKD *b = &x->e_mbd.block[ib];
-  BLOCK *be = &x->block[ib];
-  TX_TYPE tx_type;
-
-#if CONFIG_COMP_INTRA_PRED
-  if (b->bmi.as_mode.second == (B_PREDICTION_MODE)(B_DC_PRED - 1)) {
-#endif
-    vp9_intra4x4_predict(b, b->bmi.as_mode.first, b->predictor);
-#if CONFIG_COMP_INTRA_PRED
-  } else {
-    vp9_comp_intra4x4_predict(b, b->bmi.as_mode.first, b->bmi.as_mode.second,
-                              b->predictor);
-  }
-#endif
-
-  vp9_subtract_b(be, b, 16);
-
-  tx_type = get_tx_type(&x->e_mbd, b);
-  if (tx_type != DCT_DCT) {
-    vp9_fht(be->src_diff, 32, be->coeff, tx_type, 4);
-    vp9_ht_quantize_b_4x4(be, b, tx_type);
-    vp9_ihtllm_c(b->dqcoeff, b->diff, 32, tx_type, 4);
-  } else {
-    x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
-    x->quantize_b_4x4(be, b);
-    vp9_inverse_transform_b_4x4(IF_RTCD(&rtcd->common->idct), b, 32);
-  }
-
-  vp9_recon_b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-}
-
-void vp9_encode_intra4x4mby(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *mb) {
-  int i;
-
-  for (i = 0; i < 16; i++)
-    vp9_encode_intra4x4block(rtcd, mb, i);
-  return;
-}
-
-void vp9_encode_intra16x16mby(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  BLOCK *b = &x->block[0];
-  TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
-  TX_TYPE tx_type;
-
-#if CONFIG_COMP_INTRA_PRED
-  if (xd->mode_info_context->mbmi.second_mode == (MB_PREDICTION_MODE)(DC_PRED - 1))
-#endif
-    vp9_build_intra_predictors_mby(xd);
-#if CONFIG_COMP_INTRA_PRED
-  else
-    vp9_build_comp_intra_predictors_mby(xd);
-#endif
-
-  vp9_subtract_mby(x->src_diff, *(b->base_src), xd->predictor, b->src_stride);
-
-  if (tx_size == TX_16X16) {
-    BLOCKD  *bd = &xd->block[0];
-    tx_type = get_tx_type(xd, bd);
-    if (tx_type != DCT_DCT) {
-      vp9_fht(b->src_diff, 32, b->coeff, tx_type, 16);
-      vp9_quantize_mby_16x16(x);
-      if (x->optimize)
-        vp9_optimize_mby_16x16(x, rtcd);
-      vp9_ihtllm_c(bd->dqcoeff, bd->diff, 32, tx_type, 16);
-    } else {
-      vp9_transform_mby_16x16(x);
-      vp9_quantize_mby_16x16(x);
-      if (x->optimize)
-        vp9_optimize_mby_16x16(x, rtcd);
-      vp9_inverse_transform_mby_16x16(IF_RTCD(&rtcd->common->idct), xd);
-    }
-  } else if (tx_size == TX_8X8) {
-    vp9_transform_mby_8x8(x);
-    vp9_quantize_mby_8x8(x);
-    if (x->optimize)
-      vp9_optimize_mby_8x8(x, rtcd);
-    vp9_inverse_transform_mby_8x8(IF_RTCD(&rtcd->common->idct), xd);
-  } else {
-    vp9_transform_mby_4x4(x);
-    vp9_quantize_mby_4x4(x);
-    if (x->optimize)
-      vp9_optimize_mby_4x4(x, rtcd);
-    vp9_inverse_transform_mby_4x4(IF_RTCD(&rtcd->common->idct), xd);
-  }
-
-  vp9_recon_mby(xd);
-}
-
-void vp9_encode_intra16x16mbuv(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
-
-#if CONFIG_COMP_INTRA_PRED
-  if (xd->mode_info_context->mbmi.second_uv_mode == (MB_PREDICTION_MODE)(DC_PRED - 1)) {
-#endif
-    vp9_build_intra_predictors_mbuv(xd);
-#if CONFIG_COMP_INTRA_PRED
-  } else {
-    vp9_build_comp_intra_predictors_mbuv(xd);
-  }
-#endif
-
-  vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
-                    xd->predictor, x->src.uv_stride);
-
-  if (tx_size == TX_4X4) {
-    vp9_transform_mbuv_4x4(x);
-    vp9_quantize_mbuv_4x4(x);
-    if (x->optimize)
-      vp9_optimize_mbuv_4x4(x, rtcd);
-    vp9_inverse_transform_mbuv_4x4(IF_RTCD(&rtcd->common->idct), xd);
-  } else /* 16x16 or 8x8 */ {
-    vp9_transform_mbuv_8x8(x);
-    vp9_quantize_mbuv_8x8(x);
-    if (x->optimize)
-      vp9_optimize_mbuv_8x8(x, rtcd);
-    vp9_inverse_transform_mbuv_8x8(IF_RTCD(&rtcd->common->idct), xd);
-  }
-
-  vp9_recon_intra_mbuv(xd);
-}
-
-void vp9_encode_intra8x8(const VP9_ENCODER_RTCD *rtcd,
-                         MACROBLOCK *x, int ib) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  BLOCKD *b = &xd->block[ib];
-  BLOCK *be = &x->block[ib];
-  const int iblock[4] = {0, 1, 4, 5};
-  int i;
-  TX_TYPE tx_type;
-
-#if CONFIG_COMP_INTRA_PRED
-  if (b->bmi.as_mode.second == (MB_PREDICTION_MODE)(DC_PRED - 1)) {
-#endif
-    vp9_intra8x8_predict(b, b->bmi.as_mode.first, b->predictor);
-#if CONFIG_COMP_INTRA_PRED
-  } else {
-    vp9_comp_intra8x8_predict(b, b->bmi.as_mode.first, b->bmi.as_mode.second,
-                              b->predictor);
-  }
-#endif
-
-  if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
-    int idx = (ib & 0x02) ? (ib + 2) : ib;
-
-    // generate residual blocks
-    vp9_subtract_4b_c(be, b, 16);
-
-    tx_type = get_tx_type(xd, xd->block + idx);
-    if (tx_type != DCT_DCT) {
-      vp9_fht(be->src_diff, 32, (x->block + idx)->coeff,
-                tx_type, 8);
-      x->quantize_b_8x8(x->block + idx, xd->block + idx);
-      vp9_ihtllm_c(xd->block[idx].dqcoeff, xd->block[ib].diff, 32,
-                   tx_type, 8);
-    } else {
-      x->vp9_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32);
-      x->quantize_b_8x8(x->block + idx, xd->block + idx);
-      vp9_idct_idct8(xd->block[idx].dqcoeff, xd->block[ib].diff, 32);
-    }
-  } else {
-    for (i = 0; i < 4; i++) {
-      b = &xd->block[ib + iblock[i]];
-      be = &x->block[ib + iblock[i]];
-      vp9_subtract_b(be, b, 16);
-      x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
-      x->quantize_b_4x4(be, b);
-      vp9_inverse_transform_b_4x4(IF_RTCD(&rtcd->common->idct), b, 32);
-    }
-  }
-
-  // reconstruct submacroblock
-  for (i = 0; i < 4; i++) {
-    b = &xd->block[ib + iblock[i]];
-    vp9_recon_b_c(b->predictor, b->diff, *(b->base_dst) + b->dst,
-                  b->dst_stride);
-  }
-}
-
-void vp9_encode_intra8x8mby(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
-  int i, ib;
-
-  for (i = 0; i < 4; i++) {
-    ib = vp9_i8x8_block[i];
-    vp9_encode_intra8x8(rtcd, x, ib);
-  }
-}
-
-void vp9_encode_intra_uv4x4(const VP9_ENCODER_RTCD *rtcd,
-                            MACROBLOCK *x, int ib,
-                            int mode, int second) {
-  BLOCKD *b = &x->e_mbd.block[ib];
-  BLOCK *be = &x->block[ib];
-
-#if CONFIG_COMP_INTRA_PRED
-  if (second == -1) {
-#endif
-    vp9_intra_uv4x4_predict(b, mode, b->predictor);
-#if CONFIG_COMP_INTRA_PRED
-  } else {
-    vp9_comp_intra_uv4x4_predict(b, mode, second, b->predictor);
-  }
-#endif
-
-  vp9_subtract_b(be, b, 8);
-
-  x->vp9_short_fdct4x4(be->src_diff, be->coeff, 16);
-  x->quantize_b_4x4(be, b);
-  vp9_inverse_transform_b_4x4(IF_RTCD(&rtcd->common->idct), b, 16);
-
-  vp9_recon_uv_b_c(b->predictor, b->diff, *(b->base_dst) + b->dst,
-                   b->dst_stride);
-}
-
-void vp9_encode_intra8x8mbuv(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
-  int i, ib, mode, second;
-  BLOCKD *b;
-
-  for (i = 0; i < 4; i++) {
-    ib = vp9_i8x8_block[i];
-    b = &x->e_mbd.block[ib];
-    mode = b->bmi.as_mode.first;
-#if CONFIG_COMP_INTRA_PRED
-    second = b->bmi.as_mode.second;
-#else
-    second = -1;
-#endif
-    /* u */
-    vp9_encode_intra_uv4x4(rtcd, x, i + 16, mode, second);
-    /* v */
-    vp9_encode_intra_uv4x4(rtcd, x, i + 20, mode, second);
-  }
-}
--- a/vp8/encoder/encodeintra.h
+++ /dev/null
@@ -1,27 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef __ENCODEINTRA_H_
-#define __ENCODEINTRA_H_
-
-#include "onyx_int.h"
-
-int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred);
-void vp9_encode_intra16x16mby(const VP9_ENCODER_RTCD *, MACROBLOCK *x);
-void vp9_encode_intra16x16mbuv(const VP9_ENCODER_RTCD *, MACROBLOCK *x);
-void vp9_encode_intra4x4mby(const VP9_ENCODER_RTCD *, MACROBLOCK *mb);
-void vp9_encode_intra4x4block(const VP9_ENCODER_RTCD *rtcd,
-                              MACROBLOCK *x, int ib);
-void vp9_encode_intra8x8mby(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x);
-void vp9_encode_intra8x8mbuv(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x);
-void vp9_encode_intra8x8(const VP9_ENCODER_RTCD *rtcd,
-                         MACROBLOCK *x, int ib);
-
-#endif  // __ENCODEINTRA_H_
--- a/vp8/encoder/encodemb.c
+++ /dev/null
@@ -1,950 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_ports/config.h"
-#include "encodemb.h"
-#include "vp8/common/reconinter.h"
-#include "quantize.h"
-#include "tokenize.h"
-#include "vp8/common/invtrans.h"
-#include "vp8/common/reconintra.h"
-#include "vpx_mem/vpx_mem.h"
-#include "rdopt.h"
-#include "vp8/common/systemdependent.h"
-#include "vpx_rtcd.h"
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define IF_RTCD(x) (x)
-#else
-#define IF_RTCD(x) NULL
-#endif
-
-void vp9_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch) {
-  unsigned char *src_ptr = (*(be->base_src) + be->src);
-  short *diff_ptr = be->src_diff;
-  unsigned char *pred_ptr = bd->predictor;
-  int src_stride = be->src_stride;
-
-  int r, c;
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++) {
-      diff_ptr[c] = src_ptr[c] - pred_ptr[c];
-    }
-
-    diff_ptr += pitch;
-    pred_ptr += pitch;
-    src_ptr  += src_stride;
-  }
-}
-
-void vp9_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch) {
-  unsigned char *src_ptr = (*(be->base_src) + be->src);
-  short *diff_ptr = be->src_diff;
-  unsigned char *pred_ptr = bd->predictor;
-  int src_stride = be->src_stride;
-  int r, c;
-
-  for (r = 0; r < 8; r++) {
-    for (c = 0; c < 8; c++) {
-      diff_ptr[c] = src_ptr[c] - pred_ptr[c];
-    }
-    diff_ptr += pitch;
-    pred_ptr += pitch;
-    src_ptr  += src_stride;
-  }
-}
-
-void vp9_subtract_mbuv_s_c(short *diff, const unsigned char *usrc,
-                           const unsigned char *vsrc, int src_stride,
-                           const unsigned char *upred,
-                           const unsigned char *vpred, int dst_stride) {
-  short *udiff = diff + 256;
-  short *vdiff = diff + 320;
-  int r, c;
-
-  for (r = 0; r < 8; r++) {
-    for (c = 0; c < 8; c++) {
-      udiff[c] = usrc[c] - upred[c];
-    }
-
-    udiff += 8;
-    upred += dst_stride;
-    usrc  += src_stride;
-  }
-
-  for (r = 0; r < 8; r++) {
-    for (c = 0; c < 8; c++) {
-      vdiff[c] = vsrc[c] - vpred[c];
-    }
-
-    vdiff += 8;
-    vpred += dst_stride;
-    vsrc  += src_stride;
-  }
-}
-
-void vp9_subtract_mbuv_c(short *diff, unsigned char *usrc,
-                         unsigned char *vsrc, unsigned char *pred, int stride) {
-  unsigned char *upred = pred + 256;
-  unsigned char *vpred = pred + 320;
-
-  vp9_subtract_mbuv_s_c(diff, usrc, vsrc, stride, upred, vpred, 8);
-}
-
-void vp9_subtract_mby_s_c(short *diff, const unsigned char *src, int src_stride,
-                          const unsigned char *pred, int dst_stride) {
-  int r, c;
-
-  for (r = 0; r < 16; r++) {
-    for (c = 0; c < 16; c++) {
-      diff[c] = src[c] - pred[c];
-    }
-
-    diff += 16;
-    pred += dst_stride;
-    src  += src_stride;
-  }
-}
-
-void vp9_subtract_mby_c(short *diff, unsigned char *src,
-                        unsigned char *pred, int stride) {
-  vp9_subtract_mby_s_c(diff, src, stride, pred, 16);
-}
-
-static void subtract_mb(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
-  BLOCK *b = &x->block[0];
-
-  vp9_subtract_mby(x->src_diff, *(b->base_src), x->e_mbd.predictor,
-                   b->src_stride);
-  vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
-                    x->e_mbd.predictor, x->src.uv_stride);
-}
-
-static void build_dcblock_4x4(MACROBLOCK *x) {
-  short *src_diff_ptr = &x->src_diff[384];
-  int i;
-
-  for (i = 0; i < 16; i++) {
-    src_diff_ptr[i] = x->coeff[i * 16];
-  }
-}
-
-void vp9_transform_mby_4x4(MACROBLOCK *x) {
-  int i;
-
-  for (i = 0; i < 16; i += 2) {
-    x->vp9_short_fdct8x4(&x->block[i].src_diff[0],
-                         &x->block[i].coeff[0], 32);
-  }
-
-  if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV) {
-    // build dc block from 16 y dc values
-    build_dcblock_4x4(x);
-
-    // do 2nd order transform on the dc block
-    x->short_walsh4x4(&x->block[24].src_diff[0],
-                      &x->block[24].coeff[0], 8);
-  }
-}
-
-void vp9_transform_mbuv_4x4(MACROBLOCK *x) {
-  int i;
-
-  for (i = 16; i < 24; i += 2) {
-    x->vp9_short_fdct8x4(&x->block[i].src_diff[0],
-                         &x->block[i].coeff[0], 16);
-  }
-}
-
-static void transform_mb_4x4(MACROBLOCK *x) {
-  vp9_transform_mby_4x4(x);
-  vp9_transform_mbuv_4x4(x);
-}
-
-static void build_dcblock_8x8(MACROBLOCK *x) {
-  int16_t *src_diff_ptr = x->block[24].src_diff;
-  int i;
-
-  for (i = 0; i < 16; i++) {
-    src_diff_ptr[i] = 0;
-  }
-  src_diff_ptr[0] = x->coeff[0 * 16];
-  src_diff_ptr[1] = x->coeff[4 * 16];
-  src_diff_ptr[4] = x->coeff[8 * 16];
-  src_diff_ptr[8] = x->coeff[12 * 16];
-}
-
-void vp9_transform_mby_8x8(MACROBLOCK *x) {
-  int i;
-
-  for (i = 0; i < 9; i += 8) {
-    x->vp9_short_fdct8x8(&x->block[i].src_diff[0],
-                         &x->block[i].coeff[0], 32);
-  }
-  for (i = 2; i < 11; i += 8) {
-    x->vp9_short_fdct8x8(&x->block[i].src_diff[0],
-                         &x->block[i + 2].coeff[0], 32);
-  }
-
-  if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV) {
-    // build dc block from 2x2 y dc values
-    build_dcblock_8x8(x);
-
-    // do 2nd order transform on the dc block
-    x->short_fhaar2x2(&x->block[24].src_diff[0],
-                      &x->block[24].coeff[0], 8);
-  }
-}
-
-void vp9_transform_mbuv_8x8(MACROBLOCK *x) {
-  int i;
-
-  for (i = 16; i < 24; i += 4) {
-    x->vp9_short_fdct8x8(&x->block[i].src_diff[0],
-                         &x->block[i].coeff[0], 16);
-  }
-}
-
-void vp9_transform_mb_8x8(MACROBLOCK *x) {
-  vp9_transform_mby_8x8(x);
-  vp9_transform_mbuv_8x8(x);
-}
-
-void vp9_transform_mby_16x16(MACROBLOCK *x) {
-  vp9_clear_system_state();
-  x->vp9_short_fdct16x16(&x->block[0].src_diff[0],
-                         &x->block[0].coeff[0], 32);
-}
-
-void vp9_transform_mb_16x16(MACROBLOCK *x) {
-  vp9_transform_mby_16x16(x);
-  vp9_transform_mbuv_8x8(x);
-}
-
-#define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
-#define RDTRUNC_8x8(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
-typedef struct vp9_token_state vp9_token_state;
-
-struct vp9_token_state {
-  int           rate;
-  int           error;
-  int           next;
-  signed char   token;
-  short         qc;
-};
-
-// TODO: run experiments to find optimal multiplier values
-#define Y1_RD_MULT 4
-#define UV_RD_MULT 2
-#define Y2_RD_MULT 4
-
-static const int plane_rd_mult[4] = {
-  Y1_RD_MULT,
-  Y2_RD_MULT,
-  UV_RD_MULT,
-  Y1_RD_MULT
-};
-
-#define UPDATE_RD_COST()\
-{\
-  rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);\
-  rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);\
-  if (rd_cost0 == rd_cost1) {\
-    rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);\
-    rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);\
-  }\
-}
-
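-// Trellis quantization: for each nonzero coefficient, two candidate states
-// are tracked (the original quantized value, and that value stepped one
-// level towards zero when the step cannot overshoot), rate/distortion is
-// accumulated along both paths, and the cheaper path is walked back to
-// emit the final coefficients and EOB.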
-static void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type,
-                       ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
-                       const VP9_ENCODER_RTCD *rtcd, int tx_size) {
-  BLOCK *b;
-  BLOCKD *d;
-  vp9_token_state tokens[65][2];
-  uint64_t best_mask[2];
-  const short *dequant_ptr;
-  const short *coeff_ptr;
-  short *qcoeff_ptr;
-  short *dqcoeff_ptr;
-  int eob;
-  int i0;
-  int rc;
-  int x;
-  int sz = 0;
-  int next;
-  int rdmult;
-  int rddiv;
-  int final_eob;
-  int64_t rd_cost0, rd_cost1;
-  int rate0, rate1;
-  int error0, error1;
-  int t0, t1;
-  int best;
-  int band;
-  int pt;
-  int err_mult = plane_rd_mult[type];
-  int default_eob;
-  int const *scan, *bands;
-
-  b = &mb->block[i];
-  d = &mb->e_mbd.block[i];
-  switch (tx_size) {
-    default:
-    case TX_4X4:
-      scan = vp9_default_zig_zag1d;
-      bands = vp9_coef_bands;
-      default_eob = 16;
-      // TODO: this isn't called (for intra4x4 modes), but will be left in
-      // since it could be used later
-      {
-        TX_TYPE tx_type = get_tx_type(&mb->e_mbd, d);
-        if (tx_type != DCT_DCT) {
-          switch (tx_type) {
-            case ADST_DCT:
-              scan = vp9_row_scan;
-              break;
-
-            case DCT_ADST:
-              scan = vp9_col_scan;
-              break;
-
-            default:
-              scan = vp9_default_zig_zag1d;
-              break;
-          }
-        } else {
-          scan = vp9_default_zig_zag1d;
-        }
-      }
-      break;
-    case TX_8X8:
-      scan = vp9_default_zig_zag1d_8x8;
-      bands = vp9_coef_bands_8x8;
-      default_eob = 64;
-      break;
-  }
-
-  dequant_ptr = d->dequant;
-  coeff_ptr = b->coeff;
-  qcoeff_ptr = d->qcoeff;
-  dqcoeff_ptr = d->dqcoeff;
-  i0 = (type == PLANE_TYPE_Y_NO_DC);
-  eob = d->eob;
-
-  /* Now set up a Viterbi trellis to evaluate alternative roundings. */
-  rdmult = mb->rdmult * err_mult;
-  if (mb->e_mbd.mode_info_context->mbmi.ref_frame == INTRA_FRAME)
-    rdmult = (rdmult * 9) >> 4;
-  rddiv = mb->rddiv;
-  best_mask[0] = best_mask[1] = 0;
-  /* Initialize the sentinel node of the trellis. */
-  tokens[eob][0].rate = 0;
-  tokens[eob][0].error = 0;
-  tokens[eob][0].next = default_eob;
-  tokens[eob][0].token = DCT_EOB_TOKEN;
-  tokens[eob][0].qc = 0;
-  *(tokens[eob] + 1) = *(tokens[eob] + 0);
-  next = eob;
-  for (i = eob; i-- > i0;) {
-    int base_bits;
-    int d2;
-    int dx;
-
-    rc = scan[i];
-    x = qcoeff_ptr[rc];
-    /* Only add a trellis state for non-zero coefficients. */
-    if (x) {
-      int shortcut = 0;
-      error0 = tokens[next][0].error;
-      error1 = tokens[next][1].error;
-      /* Evaluate the first possibility for this state. */
-      rate0 = tokens[next][0].rate;
-      rate1 = tokens[next][1].rate;
-      t0 = (vp9_dct_value_tokens_ptr + x)->Token;
-      /* Consider both possible successor states. */
-      if (next < default_eob) {
-        band = bands[i + 1];
-        pt = vp9_prev_token_class[t0];
-        rate0 +=
-          mb->token_costs[tx_size][type][band][pt][tokens[next][0].token];
-        rate1 +=
-          mb->token_costs[tx_size][type][band][pt][tokens[next][1].token];
-      }
-      UPDATE_RD_COST();
-      /* And pick the best. */
-      best = rd_cost1 < rd_cost0;
-      base_bits = *(vp9_dct_value_cost_ptr + x);
-      dx = dqcoeff_ptr[rc] - coeff_ptr[rc];
-      d2 = dx * dx;
-      tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
-      tokens[i][0].error = d2 + (best ? error1 : error0);
-      tokens[i][0].next = next;
-      tokens[i][0].token = t0;
-      tokens[i][0].qc = x;
-      best_mask[0] |= best << i;
-      /* Evaluate the second possibility for this state. */
-      rate0 = tokens[next][0].rate;
-      rate1 = tokens[next][1].rate;
-
-      if ((abs(x)*dequant_ptr[rc != 0] > abs(coeff_ptr[rc])) &&
-          (abs(x)*dequant_ptr[rc != 0] < abs(coeff_ptr[rc]) + dequant_ptr[rc != 0]))
-        shortcut = 1;
-      else
-        shortcut = 0;
-
-      if (shortcut) {
-        sz = -(x < 0);
-        x -= 2 * sz + 1;
-      }
-
-      /* Consider both possible successor states. */
-      if (!x) {
-        /* If we reduced this coefficient to zero, check to see if
-         *  we need to move the EOB back here.
-         */
-        t0 = tokens[next][0].token == DCT_EOB_TOKEN ?
-             DCT_EOB_TOKEN : ZERO_TOKEN;
-        t1 = tokens[next][1].token == DCT_EOB_TOKEN ?
-             DCT_EOB_TOKEN : ZERO_TOKEN;
-      } else {
-        t0 = t1 = (vp9_dct_value_tokens_ptr + x)->Token;
-      }
-      if (next < default_eob) {
-        band = bands[i + 1];
-        if (t0 != DCT_EOB_TOKEN) {
-          pt = vp9_prev_token_class[t0];
-          rate0 += mb->token_costs[tx_size][type][band][pt][
-              tokens[next][0].token];
-        }
-        if (t1 != DCT_EOB_TOKEN) {
-          pt = vp9_prev_token_class[t1];
-          rate1 += mb->token_costs[tx_size][type][band][pt][
-              tokens[next][1].token];
-        }
-      }
-
-      UPDATE_RD_COST();
-      /* And pick the best. */
-      best = rd_cost1 < rd_cost0;
-      base_bits = *(vp9_dct_value_cost_ptr + x);
-
-      if (shortcut) {
-        dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
-        d2 = dx * dx;
-      }
-      tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
-      tokens[i][1].error = d2 + (best ? error1 : error0);
-      tokens[i][1].next = next;
-      tokens[i][1].token = best ? t1 : t0;
-      tokens[i][1].qc = x;
-      best_mask[1] |= best << i;
-      /* Finally, make this the new head of the trellis. */
-      next = i;
-    }
-    /* There's no choice to make for a zero coefficient, so we don't
-     *  add a new trellis node, but we do need to update the costs.
-     */
-    else {
-      band = bands[i + 1];
-      t0 = tokens[next][0].token;
-      t1 = tokens[next][1].token;
-      /* Update the cost of each path if we're past the EOB token. */
-      if (t0 != DCT_EOB_TOKEN) {
-        tokens[next][0].rate += mb->token_costs[tx_size][type][band][0][t0];
-        tokens[next][0].token = ZERO_TOKEN;
-      }
-      if (t1 != DCT_EOB_TOKEN) {
-        tokens[next][1].rate += mb->token_costs[tx_size][type][band][0][t1];
-        tokens[next][1].token = ZERO_TOKEN;
-      }
-      /* Don't update next, because we didn't add a new node. */
-    }
-  }
-
-  /* Now pick the best path through the whole trellis. */
-  band = bands[i + 1];
-  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
-  rate0 = tokens[next][0].rate;
-  rate1 = tokens[next][1].rate;
-  error0 = tokens[next][0].error;
-  error1 = tokens[next][1].error;
-  t0 = tokens[next][0].token;
-  t1 = tokens[next][1].token;
-  rate0 += mb->token_costs[tx_size][type][band][pt][t0];
-  rate1 += mb->token_costs[tx_size][type][band][pt][t1];
-  UPDATE_RD_COST();
-  best = rd_cost1 < rd_cost0;
-  final_eob = i0 - 1;
-  for (i = next; i < eob; i = next) {
-    x = tokens[i][best].qc;
-    if (x)
-      final_eob = i;
-    rc = scan[i];
-    qcoeff_ptr[rc] = x;
-    dqcoeff_ptr[rc] = (x * dequant_ptr[rc != 0]);
-
-    next = tokens[i][best].next;
-    best = (best_mask[best] >> i) & 1;
-  }
-  final_eob++;
-
-  d->eob = final_eob;
-  *a = *l = (d->eob != !type);
-}
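For reference, optimize_b() decides between the two candidate roundings with UPDATE_RD_COST(), defined near the top of this file. A sketch of those macros as assumed from the VP8-era source (treat the exact forms as an assumption; the RDTRUNC tie-break keeps the comparison deterministic when the scaled costs collide):

  #define RDCOST(RM, DM, R, D)  (((128 + (R) * (RM)) >> 8) + (DM) * (D))
  #define RDTRUNC(RM, DM, R, D) ((128 + (R) * (RM)) & 0xFF)

  #define UPDATE_RD_COST()                                \
    {                                                     \
      rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);    \
      rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);    \
      if (rd_cost0 == rd_cost1) {                         \
        rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0); \
        rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1); \
      }                                                   \
    }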
-
-/**************************************************************************
-Our inverse Hadamard transform is effectively a weighted sum of all 16
-inputs, each with weight +1 or -1, followed by a last-stage scaling of
-(sum + 1) >> 2. The DC-only IDCT is (dc + 16) >> 5. So if every stage sum
-lies between -65 and 63, the output after the inverse WHT and IDCT is all
-zero. A sum of absolute values smaller than 65 guarantees that all 16
-(+1/-1)-weighted sums in the WHT fall between -65 and +65.
-**************************************************************************/
-#define SUM_2ND_COEFF_THRESH 65
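A quick numeric check of the bound (illustrative only, assuming arithmetic right shifts):

  /* sum = -65: dc = (-65 + 1) >> 2 = -16;  out = (-16 + 16) >> 5 = 0
   * sum =  62: dc = ( 62 + 1) >> 2 =  15;  out = ( 15 + 16) >> 5 = 0
   * so stage sums inside the band reconstruct to all-zero output. */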
-
-static void check_reset_2nd_coeffs(MACROBLOCKD *xd,
-                                   ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) {
-  int sum = 0;
-  int i;
-  BLOCKD *bd = &xd->block[24];
-  if (bd->dequant[0] >= SUM_2ND_COEFF_THRESH
-      && bd->dequant[1] >= SUM_2ND_COEFF_THRESH)
-    return;
-
-  for (i = 0; i < bd->eob; i++) {
-    int coef = bd->dqcoeff[vp9_default_zig_zag1d[i]];
-    sum += (coef >= 0) ? coef : -coef;
-    if (sum >= SUM_2ND_COEFF_THRESH)
-      return;
-  }
-
-  if (sum < SUM_2ND_COEFF_THRESH) {
-    for (i = 0; i < bd->eob; i++) {
-      int rc = vp9_default_zig_zag1d[i];
-      bd->qcoeff[rc] = 0;
-      bd->dqcoeff[rc] = 0;
-    }
-    bd->eob = 0;
-    *a = *l = (bd->eob != 0);
-  }
-}
-
-#define SUM_2ND_COEFF_THRESH_8X8 32
-static void check_reset_8x8_2nd_coeffs(MACROBLOCKD *xd,
-                                       ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) {
-  int sum = 0;
-  BLOCKD *bd = &xd->block[24];
-  int coef;
-
-  coef = bd->dqcoeff[0];
-  sum += (coef >= 0) ? coef : -coef;
-  coef = bd->dqcoeff[1];
-  sum += (coef >= 0) ? coef : -coef;
-  coef = bd->dqcoeff[4];
-  sum += (coef >= 0) ? coef : -coef;
-  coef = bd->dqcoeff[8];
-  sum += (coef >= 0) ? coef : -coef;
-
-  if (sum < SUM_2ND_COEFF_THRESH_8X8) {
-    bd->qcoeff[0] = 0;
-    bd->dqcoeff[0] = 0;
-    bd->qcoeff[1] = 0;
-    bd->dqcoeff[1] = 0;
-    bd->qcoeff[4] = 0;
-    bd->dqcoeff[4] = 0;
-    bd->qcoeff[8] = 0;
-    bd->dqcoeff[8] = 0;
-    bd->eob = 0;
-    *a = *l = (bd->eob != 0);
-  }
-}
-
-void vp9_optimize_mby_4x4(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
-  int b;
-  PLANE_TYPE type;
-  int has_2nd_order;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta;
-  ENTROPY_CONTEXT *tl;
-  MB_PREDICTION_MODE mode = x->e_mbd.mode_info_context->mbmi.mode;
-
-  if (!x->e_mbd.above_context || !x->e_mbd.left_context)
-    return;
-
-  vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
-
-  has_2nd_order = (mode != B_PRED && mode != I8X8_PRED && mode != SPLITMV);
-  type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC;
-
-  for (b = 0; b < 16; b++) {
-    optimize_b(x, b, type,
-               ta + vp9_block2above[b], tl + vp9_block2left[b], rtcd, TX_4X4);
-  }
-
-  if (has_2nd_order) {
-    b = 24;
-    optimize_b(x, b, PLANE_TYPE_Y2,
-               ta + vp9_block2above[b], tl + vp9_block2left[b], rtcd, TX_4X4);
-    check_reset_2nd_coeffs(&x->e_mbd,
-                           ta + vp9_block2above[b], tl + vp9_block2left[b]);
-  }
-}
-
-void vp9_optimize_mbuv_4x4(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
-  int b;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta;
-  ENTROPY_CONTEXT *tl;
-
-  if (!x->e_mbd.above_context || !x->e_mbd.left_context)
-    return;
-
-  vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
-
-  for (b = 16; b < 24; b++) {
-    optimize_b(x, b, PLANE_TYPE_UV,
-               ta + vp9_block2above[b], tl + vp9_block2left[b], rtcd, TX_4X4);
-  }
-}
-
-static void optimize_mb_4x4(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
-  vp9_optimize_mby_4x4(x, rtcd);
-  vp9_optimize_mbuv_4x4(x, rtcd);
-}
-
-void vp9_optimize_mby_8x8(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
-  int b;
-  PLANE_TYPE type;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta;
-  ENTROPY_CONTEXT *tl;
-  int has_2nd_order = x->e_mbd.mode_info_context->mbmi.mode != SPLITMV;
-
-  if (!x->e_mbd.above_context || !x->e_mbd.left_context)
-    return;
-
-  vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
-  type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC;
-  for (b = 0; b < 16; b += 4) {
-    optimize_b(x, b, type,
-               ta + vp9_block2above_8x8[b], tl + vp9_block2left_8x8[b],
-               rtcd, TX_8X8);
-    ta[vp9_block2above_8x8[b] + 1] = ta[vp9_block2above_8x8[b]];
-    tl[vp9_block2left_8x8[b] + 1]  = tl[vp9_block2left_8x8[b]];
-  }
-
-  // 8x8 blocks always have a 2nd order Haar block.
-  if (has_2nd_order) {
-    check_reset_8x8_2nd_coeffs(&x->e_mbd,
-                               ta + vp9_block2above_8x8[24],
-                               tl + vp9_block2left_8x8[24]);
-  }
-}
-
-void vp9_optimize_mbuv_8x8(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
-  int b;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta;
-  ENTROPY_CONTEXT *tl;
-
-  if (!x->e_mbd.above_context || !x->e_mbd.left_context)
-    return;
-
-  vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
-
-  for (b = 16; b < 24; b += 4) {
-    optimize_b(x, b, PLANE_TYPE_UV,
-               ta + vp9_block2above_8x8[b], tl + vp9_block2left_8x8[b],
-               rtcd, TX_8X8);
-    ta[vp9_block2above_8x8[b] + 1] = ta[vp9_block2above_8x8[b]];
-    tl[vp9_block2left_8x8[b] + 1]  = tl[vp9_block2left_8x8[b]];
-  }
-}
-
-static void optimize_mb_8x8(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
-  vp9_optimize_mby_8x8(x, rtcd);
-  vp9_optimize_mbuv_8x8(x, rtcd);
-}
-
-static void optimize_b_16x16(MACROBLOCK *mb, int i, PLANE_TYPE type,
-                             ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
-                             const VP9_ENCODER_RTCD *rtcd) {
-  BLOCK *b = &mb->block[i];
-  BLOCKD *d = &mb->e_mbd.block[i];
-  vp9_token_state tokens[257][2];
-  unsigned best_index[257][2];
-  const short *dequant_ptr = d->dequant, *coeff_ptr = b->coeff;
-  short *qcoeff_ptr = d->qcoeff;
-  short *dqcoeff_ptr = d->dqcoeff;
-  int eob = d->eob, final_eob, sz = 0;
-  int rc, x, next;
-  int64_t rdmult, rddiv, rd_cost0, rd_cost1;
-  int rate0, rate1, error0, error1, t0, t1;
-  int best, band, pt;
-  int err_mult = plane_rd_mult[type];
-
-  /* Now set up a Viterbi trellis to evaluate alternative roundings. */
-  rdmult = mb->rdmult * err_mult;
-  if (mb->e_mbd.mode_info_context->mbmi.ref_frame == INTRA_FRAME)
-    rdmult = (rdmult * 9) >> 4;
-  rddiv = mb->rddiv;
-  memset(best_index, 0, sizeof(best_index));
-  /* Initialize the sentinel node of the trellis. */
-  tokens[eob][0].rate = 0;
-  tokens[eob][0].error = 0;
-  tokens[eob][0].next = 256;
-  tokens[eob][0].token = DCT_EOB_TOKEN;
-  tokens[eob][0].qc = 0;
-  tokens[eob][1] = tokens[eob][0];
-  next = eob;
-  for (i = eob; i-- > 0;) {
-    int base_bits, d2, dx;
-
-    rc = vp9_default_zig_zag1d_16x16[i];
-    x = qcoeff_ptr[rc];
-    /* Only add a trellis state for non-zero coefficients. */
-    if (x) {
-      int shortcut = 0;
-      error0 = tokens[next][0].error;
-      error1 = tokens[next][1].error;
-      /* Evaluate the first possibility for this state. */
-      rate0 = tokens[next][0].rate;
-      rate1 = tokens[next][1].rate;
-      t0 = (vp9_dct_value_tokens_ptr + x)->Token;
-      /* Consider both possible successor states. */
-      if (next < 256) {
-        band = vp9_coef_bands_16x16[i + 1];
-        pt = vp9_prev_token_class[t0];
-        rate0 +=
-          mb->token_costs[TX_16X16][type][band][pt][tokens[next][0].token];
-        rate1 +=
-          mb->token_costs[TX_16X16][type][band][pt][tokens[next][1].token];
-      }
-      UPDATE_RD_COST();
-      /* And pick the best. */
-      best = rd_cost1 < rd_cost0;
-      base_bits = *(vp9_dct_value_cost_ptr + x);
-      dx = dqcoeff_ptr[rc] - coeff_ptr[rc];
-      d2 = dx * dx;
-      tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
-      tokens[i][0].error = d2 + (best ? error1 : error0);
-      tokens[i][0].next = next;
-      tokens[i][0].token = t0;
-      tokens[i][0].qc = x;
-      best_index[i][0] = best;
-      /* Evaluate the second possibility for this state. */
-      rate0 = tokens[next][0].rate;
-      rate1 = tokens[next][1].rate;
-
-      if ((abs(x) * dequant_ptr[rc != 0] > abs(coeff_ptr[rc])) &&
-          (abs(x) * dequant_ptr[rc != 0] <
-           abs(coeff_ptr[rc]) + dequant_ptr[rc != 0]))
-        shortcut = 1;
-      else
-        shortcut = 0;
-
-      if (shortcut) {
-        sz = -(x < 0);
-        x -= 2 * sz + 1;
-      }
-
-      /* Consider both possible successor states. */
-      if (!x) {
-        /* If we reduced this coefficient to zero, check to see if
-         *  we need to move the EOB back here.
-         */
-        t0 = tokens[next][0].token == DCT_EOB_TOKEN ?
-             DCT_EOB_TOKEN : ZERO_TOKEN;
-        t1 = tokens[next][1].token == DCT_EOB_TOKEN ?
-             DCT_EOB_TOKEN : ZERO_TOKEN;
-      } else {
-        t0 = t1 = (vp9_dct_value_tokens_ptr + x)->Token;
-      }
-      if (next < 256) {
-        band = vp9_coef_bands_16x16[i + 1];
-        if (t0 != DCT_EOB_TOKEN) {
-          pt = vp9_prev_token_class[t0];
-          rate0 += mb->token_costs[TX_16X16][type][band][pt]
-              [tokens[next][0].token];
-        }
-        if (t1 != DCT_EOB_TOKEN) {
-          pt = vp9_prev_token_class[t1];
-          rate1 += mb->token_costs[TX_16X16][type][band][pt]
-              [tokens[next][1].token];
-        }
-      }
-      UPDATE_RD_COST();
-      /* And pick the best. */
-      best = rd_cost1 < rd_cost0;
-      base_bits = *(vp9_dct_value_cost_ptr + x);
-
-      if (shortcut) {
-        dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
-        d2 = dx * dx;
-      }
-      tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
-      tokens[i][1].error = d2 + (best ? error1 : error0);
-      tokens[i][1].next = next;
-      tokens[i][1].token = best ? t1 : t0;
-      tokens[i][1].qc = x;
-      best_index[i][1] = best;
-      /* Finally, make this the new head of the trellis. */
-      next = i;
-    }
-    /* There's no choice to make for a zero coefficient, so we don't
-     *  add a new trellis node, but we do need to update the costs.
-     */
-    else {
-      band = vp9_coef_bands_16x16[i + 1];
-      t0 = tokens[next][0].token;
-      t1 = tokens[next][1].token;
-      /* Update the cost of each path if we're past the EOB token. */
-      if (t0 != DCT_EOB_TOKEN) {
-        tokens[next][0].rate += mb->token_costs[TX_16X16][type][band][0][t0];
-        tokens[next][0].token = ZERO_TOKEN;
-      }
-      if (t1 != DCT_EOB_TOKEN) {
-        tokens[next][1].rate += mb->token_costs[TX_16X16][type][band][0][t1];
-        tokens[next][1].token = ZERO_TOKEN;
-      }
-      /* Don't update next, because we didn't add a new node. */
-    }
-  }
-
-  /* Now pick the best path through the whole trellis. */
-  band = vp9_coef_bands_16x16[i + 1];
-  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
-  rate0 = tokens[next][0].rate;
-  rate1 = tokens[next][1].rate;
-  error0 = tokens[next][0].error;
-  error1 = tokens[next][1].error;
-  t0 = tokens[next][0].token;
-  t1 = tokens[next][1].token;
-  rate0 += mb->token_costs[TX_16X16][type][band][pt][t0];
-  rate1 += mb->token_costs[TX_16X16][type][band][pt][t1];
-  UPDATE_RD_COST();
-  best = rd_cost1 < rd_cost0;
-  final_eob = -1;
-
-  for (i = next; i < eob; i = next) {
-    x = tokens[i][best].qc;
-    if (x)
-      final_eob = i;
-    rc = vp9_default_zig_zag1d_16x16[i];
-    qcoeff_ptr[rc] = x;
-    dqcoeff_ptr[rc] = (x * dequant_ptr[rc != 0]);
-
-    next = tokens[i][best].next;
-    best = best_index[i][best];
-  }
-  final_eob++;
-
-  d->eob = final_eob;
-  *a = *l = (d->eob != !type);
-}
-
-void vp9_optimize_mby_16x16(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta, *tl;
-
-  if (!x->e_mbd.above_context || !x->e_mbd.left_context)
-    return;
-
-  vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
-  optimize_b_16x16(x, 0, PLANE_TYPE_Y_WITH_DC, ta, tl, rtcd);
-}
-
-static void optimize_mb_16x16(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
-  vp9_optimize_mby_16x16(x, rtcd);
-  vp9_optimize_mbuv_8x8(x, rtcd);
-}
-
-void vp9_encode_inter16x16(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
-
-  vp9_build_inter_predictors_mb(xd);
-  subtract_mb(rtcd, x);
-
-  if (tx_size == TX_16X16) {
-    vp9_transform_mb_16x16(x);
-    vp9_quantize_mb_16x16(x);
-    if (x->optimize)
-      optimize_mb_16x16(x, rtcd);
-    vp9_inverse_transform_mb_16x16(IF_RTCD(&rtcd->common->idct), xd);
-  } else if (tx_size == TX_8X8) {
-    if (xd->mode_info_context->mbmi.mode == SPLITMV) {
-      assert(xd->mode_info_context->mbmi.partitioning != PARTITIONING_4X4);
-      vp9_transform_mby_8x8(x);
-      vp9_transform_mbuv_4x4(x);
-      vp9_quantize_mby_8x8(x);
-      vp9_quantize_mbuv_4x4(x);
-      if (x->optimize) {
-        vp9_optimize_mby_8x8(x, rtcd);
-        vp9_optimize_mbuv_4x4(x, rtcd);
-      }
-      vp9_inverse_transform_mby_8x8(IF_RTCD(&rtcd->common->idct), xd);
-      vp9_inverse_transform_mbuv_4x4(IF_RTCD(&rtcd->common->idct), xd);
-    } else {
-      vp9_transform_mb_8x8(x);
-      vp9_quantize_mb_8x8(x);
-      if (x->optimize)
-        optimize_mb_8x8(x, rtcd);
-      vp9_inverse_transform_mb_8x8(IF_RTCD(&rtcd->common->idct), xd);
-    }
-  } else {
-    transform_mb_4x4(x);
-    vp9_quantize_mb_4x4(x);
-    if (x->optimize)
-      optimize_mb_4x4(x, rtcd);
-    vp9_inverse_transform_mb_4x4(IF_RTCD(&rtcd->common->idct), xd);
-  }
-
-  vp9_recon_mb(xd);
-}
-
-/* this function is used by first pass only */
-void vp9_encode_inter16x16y(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  BLOCK *b = &x->block[0];
-
-#if CONFIG_PRED_FILTER
-  // Disable the prediction filter for firstpass
-  xd->mode_info_context->mbmi.pred_filter_enabled = 0;
-#endif
-
-  vp9_build_1st_inter16x16_predictors_mby(xd, xd->predictor, 16, 0);
-
-  vp9_subtract_mby(x->src_diff, *(b->base_src), xd->predictor, b->src_stride);
-
-  vp9_transform_mby_4x4(x);
-  vp9_quantize_mby_4x4(x);
-  vp9_inverse_transform_mby_4x4(IF_RTCD(&rtcd->common->idct), xd);
-
-  vp9_recon_mby(xd);
-}
--- a/vp8/encoder/encodemb.h
+++ /dev/null
@@ -1,70 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_ENCODEMB_H
-#define __INC_ENCODEMB_H
-
-#include "vpx_ports/config.h"
-#include "block.h"
-
-typedef struct {
-  MB_PREDICTION_MODE mode;
-  MV_REFERENCE_FRAME ref_frame;
-  MV_REFERENCE_FRAME second_ref_frame;
-#if CONFIG_PRED_FILTER
-  int pred_filter_flag;
-#endif
-} MODE_DEFINITION;
-
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define ENCODEMB_INVOKE(ctx,fn) (ctx)->fn
-#else
-#define ENCODEMB_INVOKE(ctx,fn) vp9_encodemb_##fn
-#endif
-
-
-
-#include "onyx_int.h"
-struct VP9_ENCODER_RTCD;
-void vp9_encode_inter16x16(const struct VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x);
-
-void vp9_transform_mbuv_4x4(MACROBLOCK *x);
-void vp9_transform_mby_4x4(MACROBLOCK *x);
-
-void vp9_optimize_mby_4x4(MACROBLOCK *x, const struct VP9_ENCODER_RTCD *rtcd);
-void vp9_optimize_mbuv_4x4(MACROBLOCK *x, const struct VP9_ENCODER_RTCD *rtcd);
-void vp9_encode_inter16x16y(const struct VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x);
-
-void vp9_transform_mb_8x8(MACROBLOCK *mb);
-void vp9_transform_mby_8x8(MACROBLOCK *x);
-void vp9_transform_mbuv_8x8(MACROBLOCK *x);
-void vp9_build_dcblock_8x8(MACROBLOCK *b);
-void vp9_optimize_mby_8x8(MACROBLOCK *x, const struct VP9_ENCODER_RTCD *rtcd);
-void vp9_optimize_mbuv_8x8(MACROBLOCK *x, const struct VP9_ENCODER_RTCD *rtcd);
-
-void vp9_transform_mb_16x16(MACROBLOCK *mb);
-void vp9_transform_mby_16x16(MACROBLOCK *x);
-void vp9_optimize_mby_16x16(MACROBLOCK *x, const struct VP9_ENCODER_RTCD *rtcd);
-
-void vp9_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch);
-
-#if CONFIG_SUPERBLOCKS
-void vp9_subtract_mbuv_s_c(short *diff, const unsigned char *usrc,
-                           const unsigned char *vsrc, int src_stride,
-                           const unsigned char *upred,
-                           const unsigned char *vpred, int dst_stride);
-void vp9_subtract_mby_s_c(short *diff, const unsigned char *src,
-                          int src_stride, const unsigned char *pred,
-                          int dst_stride);
-#endif
-
-#endif
--- a/vp8/encoder/encodemv.c
+++ /dev/null
@@ -1,547 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/common/common.h"
-#include "encodemv.h"
-#include "vp8/common/entropymode.h"
-#include "vp8/common/systemdependent.h"
-
-#include <math.h>
-
-#ifdef ENTROPY_STATS
-extern unsigned int active_section;
-#endif
-
-#ifdef NMV_STATS
-nmv_context_counts tnmvcounts;
-#endif
-
-static void encode_nmv_component(vp9_writer* const bc,
-                                 int v,
-                                 int r,
-                                 const nmv_component* const mvcomp) {
-  int s, z, c, o, d;
-  assert(v != 0);             /* should not be zero */
-  s = v < 0;
-  vp9_write(bc, s, mvcomp->sign);
-  z = (s ? -v : v) - 1;       /* magnitude - 1 */
-
-  c = vp9_get_mv_class(z, &o);
-
-  write_token(bc, vp9_mv_class_tree, mvcomp->classes,
-              vp9_mv_class_encodings + c);
-
-  d = (o >> 3);               /* int mv data */
-
-  if (c == MV_CLASS_0) {
-    write_token(bc, vp9_mv_class0_tree, mvcomp->class0,
-                vp9_mv_class0_encodings + d);
-  } else {
-    int i, b;
-    b = c + CLASS0_BITS - 1;  /* number of bits */
-    for (i = 0; i < b; ++i)
-      vp9_write(bc, ((d >> i) & 1), mvcomp->bits[i]);
-  }
-}
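A worked decomposition of one component (class-0 case; it is assumed here that vp9_get_mv_class() returns o == z for magnitudes below CLASS0_SIZE * 8):

  /* v = -10 in 1/8-pel units:
   *   s = 1                    sign (negative)
   *   z = |v| - 1 = 9          magnitude minus one
   *   c = MV_CLASS_0, o = 9    class and in-class offset
   *   d = o >> 3       = 1     integer-pel part, coded via the class0 tree
   *   f = (o >> 1) & 3 = 0     fractional-pel part (coded in the _fp pass)
   *   e = o & 1        = 1     high-precision bit  (coded in the _fp pass)
   */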
-
-static void encode_nmv_component_fp(vp9_writer *bc,
-                                    int v,
-                                    int r,
-                                    const nmv_component* const mvcomp,
-                                    int usehp) {
-  int s, z, c, o, d, f, e;
-  assert(v != 0);             /* should not be zero */
-  s = v < 0;
-  z = (s ? -v : v) - 1;       /* magnitude - 1 */
-
-  c = vp9_get_mv_class(z, &o);
-
-  d = (o >> 3);               /* int mv data */
-  f = (o >> 1) & 3;           /* fractional pel mv data */
-  e = (o & 1);                /* high precision mv data */
-
-  /* Code the fractional pel bits */
-  if (c == MV_CLASS_0) {
-    write_token(bc, vp9_mv_fp_tree, mvcomp->class0_fp[d],
-                vp9_mv_fp_encodings + f);
-  } else {
-    write_token(bc, vp9_mv_fp_tree, mvcomp->fp,
-                vp9_mv_fp_encodings + f);
-  }
-  /* Code the high precision bit */
-  if (usehp) {
-    if (c == MV_CLASS_0) {
-      vp9_write(bc, e, mvcomp->class0_hp);
-    } else {
-      vp9_write(bc, e, mvcomp->hp);
-    }
-  }
-}
-
-static void build_nmv_component_cost_table(int *mvcost,
-                                           const nmv_component* const mvcomp,
-                                           int usehp) {
-  int i, v;
-  int sign_cost[2], class_cost[MV_CLASSES], class0_cost[CLASS0_SIZE];
-  int bits_cost[MV_OFFSET_BITS][2];
-  int class0_fp_cost[CLASS0_SIZE][4], fp_cost[4];
-  int class0_hp_cost[2], hp_cost[2];
-
-  sign_cost[0] = vp9_cost_zero(mvcomp->sign);
-  sign_cost[1] = vp9_cost_one(mvcomp->sign);
-  vp9_cost_tokens(class_cost, mvcomp->classes, vp9_mv_class_tree);
-  vp9_cost_tokens(class0_cost, mvcomp->class0, vp9_mv_class0_tree);
-  for (i = 0; i < MV_OFFSET_BITS; ++i) {
-    bits_cost[i][0] = vp9_cost_zero(mvcomp->bits[i]);
-    bits_cost[i][1] = vp9_cost_one(mvcomp->bits[i]);
-  }
-
-  for (i = 0; i < CLASS0_SIZE; ++i)
-    vp9_cost_tokens(class0_fp_cost[i], mvcomp->class0_fp[i], vp9_mv_fp_tree);
-  vp9_cost_tokens(fp_cost, mvcomp->fp, vp9_mv_fp_tree);
-
-  if (usehp) {
-    class0_hp_cost[0] = vp9_cost_zero(mvcomp->class0_hp);
-    class0_hp_cost[1] = vp9_cost_one(mvcomp->class0_hp);
-    hp_cost[0] = vp9_cost_zero(mvcomp->hp);
-    hp_cost[1] = vp9_cost_one(mvcomp->hp);
-  }
-  mvcost[0] = 0;
-  for (v = 1; v <= MV_MAX; ++v) {
-    int z, c, o, d, e, f, cost = 0;
-    z = v - 1;
-    c = vp9_get_mv_class(z, &o);
-    cost += class_cost[c];
-    d = (o >> 3);               /* int mv data */
-    f = (o >> 1) & 3;           /* fractional pel mv data */
-    e = (o & 1);                /* high precision mv data */
-    if (c == MV_CLASS_0) {
-      cost += class0_cost[d];
-    } else {
-      int i, b;
-      b = c + CLASS0_BITS - 1;  /* number of bits */
-      for (i = 0; i < b; ++i)
-        cost += bits_cost[i][((d >> i) & 1)];
-    }
-    if (c == MV_CLASS_0) {
-      cost += class0_fp_cost[d][f];
-    } else {
-      cost += fp_cost[f];
-    }
-    if (usehp) {
-      if (c == MV_CLASS_0) {
-        cost += class0_hp_cost[e];
-      } else {
-        cost += hp_cost[e];
-      }
-    }
-    mvcost[v] = cost + sign_cost[0];
-    mvcost[-v] = cost + sign_cost[1];
-  }
-}
-
-static int update_nmv_savings(const unsigned int ct[2],
-                              const vp9_prob cur_p,
-                              const vp9_prob new_p,
-                              const vp9_prob upd_p) {
-
-#ifdef LOW_PRECISION_MV_UPDATE
-  vp9_prob mod_p = new_p | 1;
-#else
-  vp9_prob mod_p = new_p;
-#endif
-  const int cur_b = cost_branch256(ct, cur_p);
-  const int mod_b = cost_branch256(ct, mod_p);
-  const int cost = 7 * 256 +
-#ifndef LOW_PRECISION_MV_UPDATE
-      256 +
-#endif
-      (vp9_cost_one(upd_p) - vp9_cost_zero(upd_p));
-  if (cur_b - mod_b - cost > 0) {
-    return cur_b - mod_b - cost;
-  } else {
-    return -vp9_cost_zero(upd_p);
-  }
-}
-
-static int update_nmv(
-  vp9_writer *const bc,
-  const unsigned int ct[2],
-  vp9_prob *const cur_p,
-  const vp9_prob new_p,
-  const vp9_prob upd_p) {
-
-#ifdef LOW_PRECISION_MV_UPDATE
-  vp9_prob mod_p = new_p | 1;
-#else
-  vp9_prob mod_p = new_p;
-#endif
-
-  const int cur_b = cost_branch256(ct, *cur_p);
-  const int mod_b = cost_branch256(ct, mod_p);
-  const int cost = 7 * 256 +
-#ifndef LOW_PRECISION_MV_UPDATE
-      256 +
-#endif
-      (vp9_cost_one(upd_p) - vp9_cost_zero(upd_p));
-
-  if (cur_b - mod_b > cost) {
-    *cur_p = mod_p;
-    vp9_write(bc, 1, upd_p);
-#ifdef LOW_PRECISION_MV_UPDATE
-    vp9_write_literal(bc, mod_p >> 1, 7);
-#else
-    vp9_write_literal(bc, mod_p, 8);
-#endif
-    return 1;
-  } else {
-    vp9_write(bc, 0, upd_p);
-    return 0;
-  }
-}
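The savings arithmetic above is in the 1/256-bit units used by cost_branch256() and the vp9_cost_*() helpers, so the fixed charge decomposes as:

  /*   7 * 256                             the 7-bit literal written by
   *                                       vp9_write_literal(bc, mod_p >> 1, 7)
   * +     256                             the 8th bit when
   *                                       LOW_PRECISION_MV_UPDATE is unset
   * + cost_one(upd_p) - cost_zero(upd_p)  flagging "updated" vs. "kept"  */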
-
-#ifdef NMV_STATS
-void init_nmvstats() {
-  vp9_zero(tnmvcounts);
-}
-
-void print_nmvstats() {
-  nmv_context prob;
-  unsigned int branch_ct_joint[MV_JOINTS - 1][2];
-  unsigned int branch_ct_sign[2][2];
-  unsigned int branch_ct_classes[2][MV_CLASSES - 1][2];
-  unsigned int branch_ct_class0[2][CLASS0_SIZE - 1][2];
-  unsigned int branch_ct_bits[2][MV_OFFSET_BITS][2];
-  unsigned int branch_ct_class0_fp[2][CLASS0_SIZE][4 - 1][2];
-  unsigned int branch_ct_fp[2][4 - 1][2];
-  unsigned int branch_ct_class0_hp[2][2];
-  unsigned int branch_ct_hp[2][2];
-  int i, j, k;
-  vp9_counts_to_nmv_context(&tnmvcounts, &prob, 1,
-                            branch_ct_joint, branch_ct_sign, branch_ct_classes,
-                            branch_ct_class0, branch_ct_bits,
-                            branch_ct_class0_fp, branch_ct_fp,
-                            branch_ct_class0_hp, branch_ct_hp);
-
-  printf("\nCounts =\n  { ");
-  for (j = 0; j < MV_JOINTS; ++j)
-    printf("%d, ", tnmvcounts.joints[j]);
-  printf("},\n");
-  for (i = 0; i < 2; ++i) {
-    printf("  {\n");
-    printf("    %d/%d,\n", tnmvcounts.comps[i].sign[0],
-                           tnmvcounts.comps[i].sign[1]);
-    printf("    { ");
-    for (j = 0; j < MV_CLASSES; ++j)
-      printf("%d, ", tnmvcounts.comps[i].classes[j]);
-    printf("},\n");
-    printf("    { ");
-    for (j = 0; j < CLASS0_SIZE; ++j)
-      printf("%d, ", tnmvcounts.comps[i].class0[j]);
-    printf("},\n");
-    printf("    { ");
-    for (j = 0; j < MV_OFFSET_BITS; ++j)
-      printf("%d/%d, ", tnmvcounts.comps[i].bits[j][0],
-                        tnmvcounts.comps[i].bits[j][1]);
-    printf("},\n");
-
-    printf("    {");
-    for (j = 0; j < CLASS0_SIZE; ++j) {
-      printf("{");
-      for (k = 0; k < 4; ++k)
-        printf("%d, ", tnmvcounts.comps[i].class0_fp[j][k]);
-      printf("}, ");
-    }
-    printf("},\n");
-
-    printf("    { ");
-    for (j = 0; j < 4; ++j)
-      printf("%d, ", tnmvcounts.comps[i].fp[j]);
-    printf("},\n");
-
-    printf("    %d/%d,\n",
-           tnmvcounts.comps[i].class0_hp[0],
-           tnmvcounts.comps[i].class0_hp[1]);
-    printf("    %d/%d,\n",
-           tnmvcounts.comps[i].hp[0],
-           tnmvcounts.comps[i].hp[1]);
-    printf("  },\n");
-  }
-
-  printf("\nProbs =\n  { ");
-  for (j = 0; j < MV_JOINTS - 1; ++j)
-    printf("%d, ", prob.joints[j]);
-  printf("},\n");
-  for (i = 0; i < 2; ++i) {
-    printf("  {\n");
-    printf("    %d,\n", prob.comps[i].sign);
-    printf("    { ");
-    for (j = 0; j < MV_CLASSES - 1; ++j)
-      printf("%d, ", prob.comps[i].classes[j]);
-    printf("},\n");
-    printf("    { ");
-    for (j = 0; j < CLASS0_SIZE - 1; ++j)
-      printf("%d, ", prob.comps[i].class0[j]);
-    printf("},\n");
-    printf("    { ");
-    for (j = 0; j < MV_OFFSET_BITS; ++j)
-      printf("%d, ", prob.comps[i].bits[j]);
-    printf("},\n");
-    printf("    { ");
-    for (j = 0; j < CLASS0_SIZE; ++j) {
-      printf("{");
-      for (k = 0; k < 3; ++k)
-        printf("%d, ", prob.comps[i].class0_fp[j][k]);
-      printf("}, ");
-    }
-    printf("},\n");
-    printf("    { ");
-    for (j = 0; j < 3; ++j)
-      printf("%d, ", prob.comps[i].fp[j]);
-    printf("},\n");
-
-    printf("    %d,\n", prob.comps[i].class0_hp);
-    printf("    %d,\n", prob.comps[i].hp);
-    printf("  },\n");
-  }
-}
-
-static void add_nmvcount(nmv_context_counts* const dst,
-                         const nmv_context_counts* const src) {
-  int i, j, k;
-  for (j = 0; j < MV_JOINTS; ++j) {
-    dst->joints[j] += src->joints[j];
-  }
-  for (i = 0; i < 2; ++i) {
-    for (j = 0; j < MV_VALS; ++j) {
-      dst->comps[i].mvcount[j] += src->comps[i].mvcount[j];
-    }
-    dst->comps[i].sign[0] += src->comps[i].sign[0];
-    dst->comps[i].sign[1] += src->comps[i].sign[1];
-    for (j = 0; j < MV_CLASSES; ++j) {
-      dst->comps[i].classes[j] += src->comps[i].classes[j];
-    }
-    for (j = 0; j < CLASS0_SIZE; ++j) {
-      dst->comps[i].class0[j] += src->comps[i].class0[j];
-    }
-    for (j = 0; j < MV_OFFSET_BITS; ++j) {
-      dst->comps[i].bits[j][0] += src->comps[i].bits[j][0];
-      dst->comps[i].bits[j][1] += src->comps[i].bits[j][1];
-    }
-  }
-  for (i = 0; i < 2; ++i) {
-    for (j = 0; j < CLASS0_SIZE; ++j) {
-      for (k = 0; k < 4; ++k) {
-        dst->comps[i].class0_fp[j][k] += src->comps[i].class0_fp[j][k];
-      }
-    }
-    for (j = 0; j < 4; ++j) {
-      dst->comps[i].fp[j] += src->comps[i].fp[j];
-    }
-    dst->comps[i].class0_hp[0] += src->comps[i].class0_hp[0];
-    dst->comps[i].class0_hp[1] += src->comps[i].class0_hp[1];
-    dst->comps[i].hp[0] += src->comps[i].hp[0];
-    dst->comps[i].hp[1] += src->comps[i].hp[1];
-  }
-}
-#endif
-
-void vp9_write_nmvprobs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) {
-  int i, j;
-  nmv_context prob;
-  unsigned int branch_ct_joint[MV_JOINTS - 1][2];
-  unsigned int branch_ct_sign[2][2];
-  unsigned int branch_ct_classes[2][MV_CLASSES - 1][2];
-  unsigned int branch_ct_class0[2][CLASS0_SIZE - 1][2];
-  unsigned int branch_ct_bits[2][MV_OFFSET_BITS][2];
-  unsigned int branch_ct_class0_fp[2][CLASS0_SIZE][4 - 1][2];
-  unsigned int branch_ct_fp[2][4 - 1][2];
-  unsigned int branch_ct_class0_hp[2][2];
-  unsigned int branch_ct_hp[2][2];
-  int savings = 0;
-
-#ifdef NMV_STATS
-  if (!cpi->dummy_packing)
-    add_nmvcount(&tnmvcounts, &cpi->NMVcount);
-#endif
-  vp9_counts_to_nmv_context(&cpi->NMVcount, &prob, usehp,
-                            branch_ct_joint, branch_ct_sign, branch_ct_classes,
-                            branch_ct_class0, branch_ct_bits,
-                            branch_ct_class0_fp, branch_ct_fp,
-                            branch_ct_class0_hp, branch_ct_hp);
-  /* write updates if they help */
-#ifdef MV_GROUP_UPDATE
-  for (j = 0; j < MV_JOINTS - 1; ++j) {
-    savings += update_nmv_savings(branch_ct_joint[j],
-                                  cpi->common.fc.nmvc.joints[j],
-                                  prob.joints[j],
-                                  VP9_NMV_UPDATE_PROB);
-  }
-  for (i = 0; i < 2; ++i) {
-    savings += update_nmv_savings(branch_ct_sign[i],
-                                  cpi->common.fc.nmvc.comps[i].sign,
-                                  prob.comps[i].sign,
-                                  VP9_NMV_UPDATE_PROB);
-    for (j = 0; j < MV_CLASSES - 1; ++j) {
-      savings += update_nmv_savings(branch_ct_classes[i][j],
-                                    cpi->common.fc.nmvc.comps[i].classes[j],
-                                    prob.comps[i].classes[j],
-                                    VP9_NMV_UPDATE_PROB);
-    }
-    for (j = 0; j < CLASS0_SIZE - 1; ++j) {
-      savings += update_nmv_savings(branch_ct_class0[i][j],
-                                    cpi->common.fc.nmvc.comps[i].class0[j],
-                                    prob.comps[i].class0[j],
-                                    VP9_NMV_UPDATE_PROB);
-    }
-    for (j = 0; j < MV_OFFSET_BITS; ++j) {
-      savings += update_nmv_savings(branch_ct_bits[i][j],
-                                    cpi->common.fc.nmvc.comps[i].bits[j],
-                                    prob.comps[i].bits[j],
-                                    VP9_NMV_UPDATE_PROB);
-    }
-  }
-  for (i = 0; i < 2; ++i) {
-    for (j = 0; j < CLASS0_SIZE; ++j) {
-      int k;
-      for (k = 0; k < 3; ++k) {
-        savings += update_nmv_savings(branch_ct_class0_fp[i][j][k],
-                                      cpi->common.fc.nmvc.comps[i].class0_fp[j][k],
-                                      prob.comps[i].class0_fp[j][k],
-                                      VP9_NMV_UPDATE_PROB);
-      }
-    }
-    for (j = 0; j < 3; ++j) {
-      savings += update_nmv_savings(branch_ct_fp[i][j],
-                                    cpi->common.fc.nmvc.comps[i].fp[j],
-                                    prob.comps[i].fp[j],
-                                    VP9_NMV_UPDATE_PROB);
-    }
-  }
-  if (usehp) {
-    for (i = 0; i < 2; ++i) {
-      savings += update_nmv_savings(branch_ct_class0_hp[i],
-                                    cpi->common.fc.nmvc.comps[i].class0_hp,
-                                    prob.comps[i].class0_hp,
-                                    VP9_NMV_UPDATE_PROB);
-      savings += update_nmv_savings(branch_ct_hp[i],
-                                    cpi->common.fc.nmvc.comps[i].hp,
-                                    prob.comps[i].hp,
-                                    VP9_NMV_UPDATE_PROB);
-    }
-  }
-  if (savings <= 0) {
-    vp9_write_bit(bc, 0);
-    return;
-  }
-  vp9_write_bit(bc, 1);
-#endif
-
-  for (j = 0; j < MV_JOINTS - 1; ++j) {
-    update_nmv(bc, branch_ct_joint[j],
-               &cpi->common.fc.nmvc.joints[j],
-               prob.joints[j],
-               VP9_NMV_UPDATE_PROB);
-  }
-  for (i = 0; i < 2; ++i) {
-    update_nmv(bc, branch_ct_sign[i],
-               &cpi->common.fc.nmvc.comps[i].sign,
-               prob.comps[i].sign,
-               VP9_NMV_UPDATE_PROB);
-    for (j = 0; j < MV_CLASSES - 1; ++j) {
-      update_nmv(bc, branch_ct_classes[i][j],
-                 &cpi->common.fc.nmvc.comps[i].classes[j],
-                 prob.comps[i].classes[j],
-                 VP9_NMV_UPDATE_PROB);
-    }
-    for (j = 0; j < CLASS0_SIZE - 1; ++j) {
-      update_nmv(bc, branch_ct_class0[i][j],
-                 &cpi->common.fc.nmvc.comps[i].class0[j],
-                 prob.comps[i].class0[j],
-                 VP9_NMV_UPDATE_PROB);
-    }
-    for (j = 0; j < MV_OFFSET_BITS; ++j) {
-      update_nmv(bc, branch_ct_bits[i][j],
-                 &cpi->common.fc.nmvc.comps[i].bits[j],
-                 prob.comps[i].bits[j],
-                 VP9_NMV_UPDATE_PROB);
-    }
-  }
-  for (i = 0; i < 2; ++i) {
-    for (j = 0; j < CLASS0_SIZE; ++j) {
-      int k;
-      for (k = 0; k < 3; ++k) {
-        update_nmv(bc, branch_ct_class0_fp[i][j][k],
-                   &cpi->common.fc.nmvc.comps[i].class0_fp[j][k],
-                   prob.comps[i].class0_fp[j][k],
-                   VP9_NMV_UPDATE_PROB);
-      }
-    }
-    for (j = 0; j < 3; ++j) {
-      update_nmv(bc, branch_ct_fp[i][j],
-                 &cpi->common.fc.nmvc.comps[i].fp[j],
-                 prob.comps[i].fp[j],
-                 VP9_NMV_UPDATE_PROB);
-    }
-  }
-  if (usehp) {
-    for (i = 0; i < 2; ++i) {
-      update_nmv(bc, branch_ct_class0_hp[i],
-                 &cpi->common.fc.nmvc.comps[i].class0_hp,
-                 prob.comps[i].class0_hp,
-                 VP9_NMV_UPDATE_PROB);
-      update_nmv(bc, branch_ct_hp[i],
-                 &cpi->common.fc.nmvc.comps[i].hp,
-                 prob.comps[i].hp,
-                 VP9_NMV_UPDATE_PROB);
-    }
-  }
-}
-
-void vp9_encode_nmv(vp9_writer* const bc, const MV* const mv,
-                    const MV* const ref, const nmv_context* const mvctx) {
-  MV_JOINT_TYPE j = vp9_get_mv_joint(*mv);
-  write_token(bc, vp9_mv_joint_tree, mvctx->joints,
-              vp9_mv_joint_encodings + j);
-  if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
-    encode_nmv_component(bc, mv->row, ref->col, &mvctx->comps[0]);
-  }
-  if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {
-    encode_nmv_component(bc, mv->col, ref->col, &mvctx->comps[1]);
-  }
-}
-
-void vp9_encode_nmv_fp(vp9_writer* const bc, const MV* const mv,
-                       const MV* const ref, const nmv_context* const mvctx,
-                       int usehp) {
-  MV_JOINT_TYPE j = vp9_get_mv_joint(*mv);
-  usehp = usehp && vp9_use_nmv_hp(ref);
-  if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
-    encode_nmv_component_fp(bc, mv->row, ref->row, &mvctx->comps[0], usehp);
-  }
-  if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {
-    encode_nmv_component_fp(bc, mv->col, ref->col, &mvctx->comps[1], usehp);
-  }
-}
-
-void vp9_build_nmv_cost_table(int *mvjoint,
-                              int *mvcost[2],
-                              const nmv_context* const mvctx,
-                              int usehp,
-                              int mvc_flag_v,
-                              int mvc_flag_h) {
-  vp9_clear_system_state();
-  vp9_cost_tokens(mvjoint, mvctx->joints, vp9_mv_joint_tree);
-  if (mvc_flag_v)
-    build_nmv_component_cost_table(mvcost[0], &mvctx->comps[0], usehp);
-  if (mvc_flag_h)
-    build_nmv_component_cost_table(mvcost[1], &mvctx->comps[1], usehp);
-}
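A hedged usage sketch (the surrounding cpi/xd field names are assumptions, not quoted from this tree): rebuilding the encoder's MV cost tables after the frame's nmv context changes. The mvcost pointers aim at the middle of their buffers because build_nmv_component_cost_table() writes both mvcost[v] and mvcost[-v]:

  int joint_cost[MV_JOINTS];
  int row_costs[MV_VALS], col_costs[MV_VALS];
  int *mvcost[2] = { row_costs + MV_MAX, col_costs + MV_MAX };

  vp9_build_nmv_cost_table(joint_cost, mvcost, &cpi->common.fc.nmvc,
                           xd->allow_high_precision_mv, 1, 1);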
--- a/vp8/encoder/encodemv.h
+++ /dev/null
@@ -1,30 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_ENCODEMV_H
-#define __INC_ENCODEMV_H
-
-#include "onyx_int.h"
-
-void vp9_write_nmvprobs(VP9_COMP* const, int usehp, vp9_writer* const);
-void vp9_encode_nmv(vp9_writer* const w, const MV* const mv,
-                    const MV* const ref, const nmv_context* const mvctx);
-void vp9_encode_nmv_fp(vp9_writer* const w, const MV* const mv,
-                       const MV* const ref, const nmv_context *mvctx,
-                       int usehp);
-void vp9_build_nmv_cost_table(int *mvjoint,
-                              int *mvcost[2],
-                              const nmv_context *mvctx,
-                              int usehp,
-                              int mvc_flag_v,
-                              int mvc_flag_h);
-
-#endif
--- a/vp8/encoder/firstpass.c
+++ /dev/null
@@ -1,2533 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "math.h"
-#include "limits.h"
-#include "block.h"
-#include "onyx_int.h"
-#include "variance.h"
-#include "encodeintra.h"
-#include "vp8/common/setupintrarecon.h"
-#include "mcomp.h"
-#include "firstpass.h"
-#include "vpx_scale/vpxscale.h"
-#include "encodemb.h"
-#include "vp8/common/extend.h"
-#include "vp8/common/systemdependent.h"
-#include "vpx_scale/yv12extend.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp8/common/swapyv12buffer.h"
-#include <stdio.h>
-#include "rdopt.h"
-#include "ratectrl.h"
-#include "vp8/common/quant_common.h"
-#include "vp8/common/entropymv.h"
-#include "encodemv.h"
-
-#define OUTPUT_FPF 0
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define IF_RTCD(x) (x)
-#else
-#define IF_RTCD(x) NULL
-#endif
-
-extern void vp9_build_block_offsets(MACROBLOCK *x);
-
-extern void vp9_setup_block_ptrs(MACROBLOCK *x);
-
-extern void vp9_frame_init_quantizer(VP9_COMP *cpi);
-
-extern void vp9_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb,
-                                   int_mv *mv);
-
-extern void vp9_alloc_compressor_data(VP9_COMP *cpi);
-
-#define IIFACTOR   12.5
-#define IIKFACTOR1 12.5
-#define IIKFACTOR2 15.0
-#define RMAX       128.0
-#define GF_RMAX    96.0
-#define ERR_DIVISOR   150.0
-
-#define KF_MB_INTRA_MIN 300
-#define GF_MB_INTRA_MIN 200
-
-#define DOUBLE_DIVIDE_CHECK(X) ((X) < 0 ? (X) - .000001 : (X) + .000001)
-
-#define POW1 ((double)cpi->oxcf.two_pass_vbrbias / 100.0)
-#define POW2 ((double)cpi->oxcf.two_pass_vbrbias / 100.0)
-
-static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame);
-
-static int select_cq_level(int qindex) {
-  int ret_val = QINDEX_RANGE - 1;
-  int i;
-
-  double target_q = (vp9_convert_qindex_to_q(qindex) * 0.5847) + 1.0;
-
-  for (i = 0; i < QINDEX_RANGE; i++) {
-    if (target_q <= vp9_convert_qindex_to_q(i)) {
-      ret_val = i;
-      break;
-    }
-  }
-
-  return ret_val;
-}
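For example (numbers illustrative): if vp9_convert_qindex_to_q(qindex) is 40.0, the loop returns the first index whose q reaches target_q = 40.0 * 0.5847 + 1.0, roughly 24.4, i.e. a constrained-quality level a little above half the active quantizer.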
-
-
-// Reset the first-pass stats-in pointer to the given position.
-static void reset_fpf_position(VP9_COMP *cpi, FIRSTPASS_STATS *Position) {
-  cpi->twopass.stats_in = Position;
-}
-
-static int lookup_next_frame_stats(VP9_COMP *cpi, FIRSTPASS_STATS *next_frame) {
-  if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end)
-    return EOF;
-
-  *next_frame = *cpi->twopass.stats_in;
-  return 1;
-}
-
-// Read frame stats at an offset from the current position
-static int read_frame_stats(VP9_COMP *cpi,
-                            FIRSTPASS_STATS *frame_stats,
-                            int offset) {
-  FIRSTPASS_STATS *fps_ptr = cpi->twopass.stats_in;
-
-  // Check legality of offset
-  if (offset >= 0) {
-    if (&fps_ptr[offset] >= cpi->twopass.stats_in_end)
-      return EOF;
-  } else if (offset < 0) {
-    if (&fps_ptr[offset] < cpi->twopass.stats_in_start)
-      return EOF;
-  }
-
-  *frame_stats = fps_ptr[offset];
-  return 1;
-}
-
-static int input_stats(VP9_COMP *cpi, FIRSTPASS_STATS *fps) {
-  if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end)
-    return EOF;
-
-  *fps = *cpi->twopass.stats_in;
-  cpi->twopass.stats_in =
-    (void *)((char *)cpi->twopass.stats_in + sizeof(FIRSTPASS_STATS));
-  return 1;
-}
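The cast-based bump above is byte-wise pointer arithmetic spelled out; on a FIRSTPASS_STATS pointer it is equivalent to cpi->twopass.stats_in + 1.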
-
-static void output_stats(const VP9_COMP            *cpi,
-                         struct vpx_codec_pkt_list *pktlist,
-                         FIRSTPASS_STATS            *stats) {
-  struct vpx_codec_cx_pkt pkt;
-  pkt.kind = VPX_CODEC_STATS_PKT;
-  pkt.data.twopass_stats.buf = stats;
-  pkt.data.twopass_stats.sz = sizeof(FIRSTPASS_STATS);
-  vpx_codec_pkt_list_add(pktlist, &pkt);
-
-// TEMP debug code
-#if OUTPUT_FPF
-
-  {
-    FILE *fpfile;
-    fpfile = fopen("firstpass.stt", "a");
-
-    fprintf(fpfile, "%12.0f %12.0f %12.0f %12.0f %12.0f %12.4f %12.4f"
-            "%12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f"
-            "%12.0f %12.0f %12.4f %12.0f %12.0f %12.4f\n",
-            stats->frame,
-            stats->intra_error,
-            stats->coded_error,
-            stats->sr_coded_error,
-            stats->ssim_weighted_pred_err,
-            stats->pcnt_inter,
-            stats->pcnt_motion,
-            stats->pcnt_second_ref,
-            stats->pcnt_neutral,
-            stats->MVr,
-            stats->mvr_abs,
-            stats->MVc,
-            stats->mvc_abs,
-            stats->MVrv,
-            stats->MVcv,
-            stats->mv_in_out_count,
-            stats->new_mv_count,
-            stats->count,
-            stats->duration);
-    fclose(fpfile);
-  }
-#endif
-}
-
-static void zero_stats(FIRSTPASS_STATS *section) {
-  section->frame      = 0.0;
-  section->intra_error = 0.0;
-  section->coded_error = 0.0;
-  section->sr_coded_error = 0.0;
-  section->ssim_weighted_pred_err = 0.0;
-  section->pcnt_inter  = 0.0;
-  section->pcnt_motion  = 0.0;
-  section->pcnt_second_ref = 0.0;
-  section->pcnt_neutral = 0.0;
-  section->MVr        = 0.0;
-  section->mvr_abs     = 0.0;
-  section->MVc        = 0.0;
-  section->mvc_abs     = 0.0;
-  section->MVrv       = 0.0;
-  section->MVcv       = 0.0;
-  section->mv_in_out_count  = 0.0;
-  section->new_mv_count = 0.0;
-  section->count      = 0.0;
-  section->duration   = 1.0;
-}
-
-static void accumulate_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame) {
-  section->frame += frame->frame;
-  section->intra_error += frame->intra_error;
-  section->coded_error += frame->coded_error;
-  section->sr_coded_error += frame->sr_coded_error;
-  section->ssim_weighted_pred_err += frame->ssim_weighted_pred_err;
-  section->pcnt_inter  += frame->pcnt_inter;
-  section->pcnt_motion += frame->pcnt_motion;
-  section->pcnt_second_ref += frame->pcnt_second_ref;
-  section->pcnt_neutral += frame->pcnt_neutral;
-  section->MVr        += frame->MVr;
-  section->mvr_abs     += frame->mvr_abs;
-  section->MVc        += frame->MVc;
-  section->mvc_abs     += frame->mvc_abs;
-  section->MVrv       += frame->MVrv;
-  section->MVcv       += frame->MVcv;
-  section->mv_in_out_count  += frame->mv_in_out_count;
-  section->new_mv_count += frame->new_mv_count;
-  section->count      += frame->count;
-  section->duration   += frame->duration;
-}
-
-static void subtract_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame) {
-  section->frame -= frame->frame;
-  section->intra_error -= frame->intra_error;
-  section->coded_error -= frame->coded_error;
-  section->sr_coded_error -= frame->sr_coded_error;
-  section->ssim_weighted_pred_err -= frame->ssim_weighted_pred_err;
-  section->pcnt_inter  -= frame->pcnt_inter;
-  section->pcnt_motion -= frame->pcnt_motion;
-  section->pcnt_second_ref -= frame->pcnt_second_ref;
-  section->pcnt_neutral -= frame->pcnt_neutral;
-  section->MVr        -= frame->MVr;
-  section->mvr_abs     -= frame->mvr_abs;
-  section->MVc        -= frame->MVc;
-  section->mvc_abs     -= frame->mvc_abs;
-  section->MVrv       -= frame->MVrv;
-  section->MVcv       -= frame->MVcv;
-  section->mv_in_out_count  -= frame->mv_in_out_count;
-  section->new_mv_count -= frame->new_mv_count;
-  section->count      -= frame->count;
-  section->duration   -= frame->duration;
-}
-
-static void avg_stats(FIRSTPASS_STATS *section) {
-  if (section->count < 1.0)
-    return;
-
-  section->intra_error /= section->count;
-  section->coded_error /= section->count;
-  section->sr_coded_error /= section->count;
-  section->ssim_weighted_pred_err /= section->count;
-  section->pcnt_inter  /= section->count;
-  section->pcnt_second_ref /= section->count;
-  section->pcnt_neutral /= section->count;
-  section->pcnt_motion /= section->count;
-  section->MVr        /= section->count;
-  section->mvr_abs     /= section->count;
-  section->MVc        /= section->count;
-  section->mvc_abs     /= section->count;
-  section->MVrv       /= section->count;
-  section->MVcv       /= section->count;
-  section->mv_in_out_count   /= section->count;
-  section->duration   /= section->count;
-}
-
-// Calculate a modified error used in distributing bits between easier
-// and harder frames.
-static double calculate_modified_err(VP9_COMP *cpi,
-                                     FIRSTPASS_STATS *this_frame) {
-  double av_err = (cpi->twopass.total_stats->ssim_weighted_pred_err /
-                   cpi->twopass.total_stats->count);
-  double this_err = this_frame->ssim_weighted_pred_err;
-  double modified_err;
-
-  if (this_err > av_err)
-    modified_err = av_err * pow((this_err / DOUBLE_DIVIDE_CHECK(av_err)), POW1);
-  else
-    modified_err = av_err * pow((this_err / DOUBLE_DIVIDE_CHECK(av_err)), POW2);
-
-  return modified_err;
-}
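Two boundary settings make the bias behaviour concrete:

  /* two_pass_vbrbias = 100: POW1 = POW2 = 1.0, so
   *   modified_err = av_err * (this_err / av_err) = this_err  (no reshaping)
   * two_pass_vbrbias =  50: POW1 = POW2 = 0.5, so
   *   modified_err = av_err * sqrt(this_err / av_err),
   * pulling per-frame errors toward the average and flattening the
   * resulting bit allocation. */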
-
-static const double weight_table[256] = {
-  0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
-  0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
-  0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
-  0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
-  0.020000, 0.031250, 0.062500, 0.093750, 0.125000, 0.156250, 0.187500, 0.218750,
-  0.250000, 0.281250, 0.312500, 0.343750, 0.375000, 0.406250, 0.437500, 0.468750,
-  0.500000, 0.531250, 0.562500, 0.593750, 0.625000, 0.656250, 0.687500, 0.718750,
-  0.750000, 0.781250, 0.812500, 0.843750, 0.875000, 0.906250, 0.937500, 0.968750,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000
-};
-
-static double simple_weight(YV12_BUFFER_CONFIG *source) {
-  int i, j;
-
-  unsigned char *src = source->y_buffer;
-  double sum_weights = 0.0;
-
-  // Loop through the raw Y plane, examining levels and creating a
-  // weight for the image.
-  i = source->y_height;
-  do {
-    j = source->y_width;
-    do {
-      sum_weights += weight_table[*src];
-      src++;
-    } while (--j);
-    src -= source->y_width;
-    src += source->y_stride;
-  } while (--i);
-
-  sum_weights /= (source->y_height * source->y_width);
-
-  return sum_weights;
-}
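Reading the table back: luma levels 32 and below contribute 0.02, levels 33-63 ramp linearly, and 64 and above contribute 1.0. A frame whose pixels all sit at luma 48, for instance, averages weight_table[48] = 0.5, so simple_weight() returns 0.5.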
-
-
-// This function returns the current per frame maximum bitrate target
-static int frame_max_bits(VP9_COMP *cpi) {
-  // Max allocation for a single frame, based on the max section guidelines
-  // passed in and how many bits are left.
-  int max_bits;
-
-  // For VBR, base this on the bits and frames left plus the
-  // two_pass_vbrmax_section rate passed in by the user.
-  max_bits = (int)(((double)cpi->twopass.bits_left /
-                    (cpi->twopass.total_stats->count -
-                     (double)cpi->common.current_video_frame)) *
-                   ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0));
-
-  // Trap case where we are out of bits
-  if (max_bits < 0)
-    max_bits = 0;
-
-  return max_bits;
-}
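Worked example (numbers illustrative): with 2,000,000 bits left, 100 frames still to code, and two_pass_vbrmax_section = 400, the cap is (2000000 / 100) * (400 / 100) = 80,000 bits for any single frame.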
-
-void vp9_init_first_pass(VP9_COMP *cpi) {
-  zero_stats(cpi->twopass.total_stats);
-}
-
-void vp9_end_first_pass(VP9_COMP *cpi) {
-  output_stats(cpi, cpi->output_pkt_list, cpi->twopass.total_stats);
-}
-
-static void zz_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
-                             YV12_BUFFER_CONFIG *recon_buffer,
-                             int *best_motion_err, int recon_yoffset) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  BLOCK *b = &x->block[0];
-  BLOCKD *d = &x->e_mbd.block[0];
-
-  unsigned char *src_ptr = (*(b->base_src) + b->src);
-  int src_stride = b->src_stride;
-  unsigned char *ref_ptr;
-  int ref_stride = d->pre_stride;
-
-  // Set up pointers for this macro block recon buffer
-  xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
-
-  ref_ptr = (unsigned char *)(*(d->base_pre) + d->pre);
-
-  vp9_mse16x16(src_ptr, src_stride, ref_ptr, ref_stride,
-               (unsigned int *)(best_motion_err));
-}
-
-static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
-                                     int_mv *ref_mv, MV *best_mv,
-                                     YV12_BUFFER_CONFIG *recon_buffer,
-                                     int *best_motion_err, int recon_yoffset) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  BLOCK *b = &x->block[0];
-  BLOCKD *d = &x->e_mbd.block[0];
-  int num00;
-
-  int_mv tmp_mv;
-  int_mv ref_mv_full;
-
-  int tmp_err;
-  int step_param = 3;
-  int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
-  int n;
-  vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
-  int new_mv_mode_penalty = 256;
-
-  // override the default variance function to use MSE
-  v_fn_ptr.vf = vp9_mse16x16;
-
-  // Set up pointers for this macro block recon buffer
-  xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
-
-  // Initial step/diamond search centred on best mv
-  tmp_mv.as_int = 0;
-  ref_mv_full.as_mv.col = ref_mv->as_mv.col >> 3;
-  ref_mv_full.as_mv.row = ref_mv->as_mv.row >> 3;
-  tmp_err = cpi->diamond_search_sad(x, b, d, &ref_mv_full, &tmp_mv, step_param,
-                                    x->sadperbit16, &num00, &v_fn_ptr,
-                                    XMVCOST, ref_mv);
-  if (tmp_err < INT_MAX - new_mv_mode_penalty)
-    tmp_err += new_mv_mode_penalty;
-
-  if (tmp_err < *best_motion_err) {
-    *best_motion_err = tmp_err;
-    best_mv->row = tmp_mv.as_mv.row;
-    best_mv->col = tmp_mv.as_mv.col;
-  }
-
-  // Further step/diamond searches as necessary
-  n = num00;
-  num00 = 0;
-
-  while (n < further_steps) {
-    n++;
-
-    if (num00)
-      num00--;
-    else {
-      tmp_err = cpi->diamond_search_sad(x, b, d, &ref_mv_full, &tmp_mv,
-                                        step_param + n, x->sadperbit16,
-                                        &num00, &v_fn_ptr,
-                                        XMVCOST, ref_mv);
-      if (tmp_err < INT_MAX - new_mv_mode_penalty)
-        tmp_err += new_mv_mode_penalty;
-
-      if (tmp_err < *best_motion_err) {
-        *best_motion_err = tmp_err;
-        best_mv->row = tmp_mv.as_mv.row;
-        best_mv->col = tmp_mv.as_mv.col;
-      }
-    }
-  }
-}
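The refinement schedule, summarized (the num00 behaviour is assumed from the diamond-search contract rather than quoted):

  /* The first diamond search runs at step_param = 3, centred on the
   * reduced-precision ref_mv. Each further pass reruns the search one
   * step tighter; num00 reports how many of those tighter steps the
   * previous search already covered, so the loop decrements it to skip
   * redundant passes instead of re-searching the same neighbourhood. */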
-
-void vp9_first_pass(VP9_COMP *cpi) {
-  int mb_row, mb_col;
-  MACROBLOCK *const x = &cpi->mb;
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  int recon_yoffset, recon_uvoffset;
-  YV12_BUFFER_CONFIG *lst_yv12 = &cm->yv12_fb[cm->lst_fb_idx];
-  YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
-  YV12_BUFFER_CONFIG *gld_yv12 = &cm->yv12_fb[cm->gld_fb_idx];
-  int recon_y_stride = lst_yv12->y_stride;
-  int recon_uv_stride = lst_yv12->uv_stride;
-  int64_t intra_error = 0;
-  int64_t coded_error = 0;
-  int64_t sr_coded_error = 0;
-
-  int sum_mvr = 0, sum_mvc = 0;
-  int sum_mvr_abs = 0, sum_mvc_abs = 0;
-  int sum_mvrs = 0, sum_mvcs = 0;
-  int mvcount = 0;
-  int intercount = 0;
-  int second_ref_count = 0;
-  int intrapenalty = 256;
-  int neutral_count = 0;
-  int new_mv_count = 0;
-  int sum_in_vectors = 0;
-  uint32_t lastmv_as_int = 0;
-
-  int_mv zero_ref_mv;
-
-  zero_ref_mv.as_int = 0;
-
-  vp9_clear_system_state();  // __asm emms;
-
-  x->src = *cpi->Source;
-  xd->pre = *lst_yv12;
-  xd->dst = *new_yv12;
-
-  x->partition_info = x->pi;
-
-  xd->mode_info_context = cm->mi;
-
-  vp9_build_block_offsets(x);
-
-  vp9_setup_block_dptrs(&x->e_mbd);
-
-  vp9_setup_block_ptrs(x);
-
-  // set up the new frame for intra coded blocks
-  vp9_setup_intra_recon(new_yv12);
-  vp9_frame_init_quantizer(cpi);
-
-  // Initialise the MV cost table to the defaults
-  // if( cm->current_video_frame == 0)
-  // if ( 0 )
-  {
-    int flag[2] = {1, 1};
-    vp9_init_mv_probs(cm);
-    vp9_initialize_rd_consts(cpi, cm->base_qindex + cm->y1dc_delta_q);
-  }
-
-  // for each macroblock row in image
-  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
-    int_mv best_ref_mv;
-
-    best_ref_mv.as_int = 0;
-
-    // reset above block coeffs
-    xd->up_available = (mb_row != 0);
-    recon_yoffset = (mb_row * recon_y_stride * 16);
-    recon_uvoffset = (mb_row * recon_uv_stride * 8);
-
-    // Set up limit values for motion vectors to prevent them extending outside the UMV borders
-    x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
-    x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16);
-
-
-    // for each macroblock col in image
-    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
-      int this_error;
-      int gf_motion_error = INT_MAX;
-      int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
-
-      xd->dst.y_buffer = new_yv12->y_buffer + recon_yoffset;
-      xd->dst.u_buffer = new_yv12->u_buffer + recon_uvoffset;
-      xd->dst.v_buffer = new_yv12->v_buffer + recon_uvoffset;
-      xd->left_available = (mb_col != 0);
-
-      // Copy current mb to a buffer
-      vp9_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
-
-      // do intra 16x16 prediction
-      this_error = vp9_encode_intra(cpi, x, use_dc_pred);
-
-      // "intrapenalty" below deals with situations where the intra and inter error scores are very low (eg a plain black frame)
-      // We do not have special cases in first pass for 0,0 and nearest etc so all inter modes carry an overhead cost estimate fot the mv.
-      // When the error score is very low this causes us to pick all or lots of INTRA modes and throw lots of key frames.
-      // This penalty adds a cost matching that of a 0,0 mv to the intra case.
-      this_error += intrapenalty;
-
-      // Cumulative intra error total
-      intra_error += (int64_t)this_error;
-
-      // Set up limit values for motion vectors to prevent them extending outside the UMV borders
-      x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
-      x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16);
-
-      // Other than for the first frame do a motion search
-      if (cm->current_video_frame > 0) {
-        int tmp_err;
-        int motion_error = INT_MAX;
-        int_mv mv, tmp_mv;
-
-        // Simple 0,0 motion with no mv overhead
-        zz_motion_search(cpi, x, lst_yv12, &motion_error, recon_yoffset);
-        mv.as_int = tmp_mv.as_int = 0;
-
-        // Test last reference frame using the previous best mv as the
-        // starting point (best reference) for the search
-        first_pass_motion_search(cpi, x, &best_ref_mv,
-                                 &mv.as_mv, lst_yv12,
-                                 &motion_error, recon_yoffset);
-
-        // If the current best reference mv is not centred on 0,0 then do a 0,0 based search as well
-        if (best_ref_mv.as_int) {
-          tmp_err = INT_MAX;
-          first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv.as_mv,
-                                   lst_yv12, &tmp_err, recon_yoffset);
-
-          if (tmp_err < motion_error) {
-            motion_error = tmp_err;
-            mv.as_int = tmp_mv.as_int;
-          }
-        }
-
-        // Experimental search in an older reference frame
-        if (cm->current_video_frame > 1) {
-          // Simple 0,0 motion with no mv overhead
-          zz_motion_search(cpi, x, gld_yv12,
-                           &gf_motion_error, recon_yoffset);
-
-          first_pass_motion_search(cpi, x, &zero_ref_mv,
-                                   &tmp_mv.as_mv, gld_yv12,
-                                   &gf_motion_error, recon_yoffset);
-
-          if ((gf_motion_error < motion_error) &&
-              (gf_motion_error < this_error)) {
-            second_ref_count++;
-          }
-
-          // Reset to last frame as reference buffer
-          xd->pre.y_buffer = lst_yv12->y_buffer + recon_yoffset;
-          xd->pre.u_buffer = lst_yv12->u_buffer + recon_uvoffset;
-          xd->pre.v_buffer = lst_yv12->v_buffer + recon_uvoffset;
-
-          // In accumulating a score for the older reference frame
-          // take the best of the motion predicted score and
-          // the intra coded error (just as will be done for the
-          // accumulation of "coded_error" for the last frame).
-          if (gf_motion_error < this_error)
-            sr_coded_error += gf_motion_error;
-          else
-            sr_coded_error += this_error;
-        } else
-          sr_coded_error += motion_error;
-
-        /* Intra assumed best */
-        best_ref_mv.as_int = 0;
-
-        if (motion_error <= this_error) {
-          // Keep a count of cases where the inter and intra were
-          // very close and very low. This helps with scene cut
-          // detection for example in cropped clips with black bars
-          // at the sides or top and bottom.
-          if ((((this_error - intrapenalty) * 9) <=
-               (motion_error * 10)) &&
-              (this_error < (2 * intrapenalty))) {
-            neutral_count++;
-          }
-
-          mv.as_mv.row <<= 3;
-          mv.as_mv.col <<= 3;
-          this_error = motion_error;
-          vp9_set_mbmode_and_mvs(x, NEWMV, &mv);
-          xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-          vp9_encode_inter16x16y(IF_RTCD(&cpi->rtcd), x);
-          sum_mvr += mv.as_mv.row;
-          sum_mvr_abs += abs(mv.as_mv.row);
-          sum_mvc += mv.as_mv.col;
-          sum_mvc_abs += abs(mv.as_mv.col);
-          sum_mvrs += mv.as_mv.row * mv.as_mv.row;
-          sum_mvcs += mv.as_mv.col * mv.as_mv.col;
-          intercount++;
-
-          best_ref_mv.as_int = mv.as_int;
-
-          // Was the vector non-zero?
-          if (mv.as_int) {
-            mvcount++;
-
-            // Was it different from the last non-zero vector?
-            if (mv.as_int != lastmv_as_int)
-              new_mv_count++;
-            lastmv_as_int = mv.as_int;
-
-            // Does the Row vector point inwards or outwards
-            if (mb_row < cm->mb_rows / 2) {
-              if (mv.as_mv.row > 0)
-                sum_in_vectors--;
-              else if (mv.as_mv.row < 0)
-                sum_in_vectors++;
-            } else if (mb_row > cm->mb_rows / 2) {
-              if (mv.as_mv.row > 0)
-                sum_in_vectors++;
-              else if (mv.as_mv.row < 0)
-                sum_in_vectors--;
-            }
-
-            // Does the Column vector point inwards or outwards
-            if (mb_col < cm->mb_cols / 2) {
-              if (mv.as_mv.col > 0)
-                sum_in_vectors--;
-              else if (mv.as_mv.col < 0)
-                sum_in_vectors++;
-            } else if (mb_col > cm->mb_cols / 2) {
-              if (mv.as_mv.col > 0)
-                sum_in_vectors++;
-              else if (mv.as_mv.col < 0)
-                sum_in_vectors--;
-            }
-          }
-        }
-      } else
-        sr_coded_error += (int64_t)this_error;
-
-      coded_error += (int64_t)this_error;
-
-      // adjust to the next column of macroblocks
-      x->src.y_buffer += 16;
-      x->src.u_buffer += 8;
-      x->src.v_buffer += 8;
-
-      recon_yoffset += 16;
-      recon_uvoffset += 8;
-    }
-
-    // adjust to the next row of mbs
-    x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
-    x->src.u_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
-    x->src.v_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
-
-    // extend the recon for intra prediction
-    vp9_extend_mb_row(new_yv12, xd->dst.y_buffer + 16,
-                      xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
-    vp9_clear_system_state();  // __asm emms;
-  }
-
-  vp9_clear_system_state();  // __asm emms;
-  {
-    double weight = 0.0;
-
-    FIRSTPASS_STATS fps;
-
-    fps.frame      = cm->current_video_frame;
-    fps.intra_error = intra_error >> 8;
-    fps.coded_error = coded_error >> 8;
-    fps.sr_coded_error = sr_coded_error >> 8;
-    weight = simple_weight(cpi->Source);
-
-
-    if (weight < 0.1)
-      weight = 0.1;
-
-    fps.ssim_weighted_pred_err = fps.coded_error * weight;
-
-    fps.pcnt_inter  = 0.0;
-    fps.pcnt_motion = 0.0;
-    fps.MVr        = 0.0;
-    fps.mvr_abs     = 0.0;
-    fps.MVc        = 0.0;
-    fps.mvc_abs     = 0.0;
-    fps.MVrv       = 0.0;
-    fps.MVcv       = 0.0;
-    fps.mv_in_out_count  = 0.0;
-    fps.new_mv_count = 0.0;
-    fps.count      = 1.0;
-
-    fps.pcnt_inter   = 1.0 * (double)intercount / cm->MBs;
-    fps.pcnt_second_ref = 1.0 * (double)second_ref_count / cm->MBs;
-    fps.pcnt_neutral = 1.0 * (double)neutral_count / cm->MBs;
-
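-    // The MV statistics below are averaged over macroblocks coded with a
-    // non-zero vector. MVrv / MVcv are (approximate) variances of the row
-    // and column components, and mv_in_out_count normalizes sum_in_vectors
-    // to the range [-1, 1] (each mb contributes +/-1 per component).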
-    if (mvcount > 0) {
-      fps.MVr = (double)sum_mvr / (double)mvcount;
-      fps.mvr_abs = (double)sum_mvr_abs / (double)mvcount;
-      fps.MVc = (double)sum_mvc / (double)mvcount;
-      fps.mvc_abs = (double)sum_mvc_abs / (double)mvcount;
-      fps.MVrv = ((double)sum_mvrs - (fps.MVr * fps.MVr / (double)mvcount)) / (double)mvcount;
-      fps.MVcv = ((double)sum_mvcs - (fps.MVc * fps.MVc / (double)mvcount)) / (double)mvcount;
-      fps.mv_in_out_count = (double)sum_in_vectors / (double)(mvcount * 2);
-      fps.new_mv_count = new_mv_count;
-
-      fps.pcnt_motion = 1.0 * (double)mvcount / cpi->common.MBs;
-    }
-
-    // TODO: handle the case when duration is set to 0, or something less
-    // than the full time between subsequent cpi->source_time_stamp values.
-    fps.duration = cpi->source->ts_end
-                   - cpi->source->ts_start;
-
-    // Don't output stats from a stack variable!
-    memcpy(cpi->twopass.this_frame_stats,
-           &fps,
-           sizeof(FIRSTPASS_STATS));
-    output_stats(cpi, cpi->output_pkt_list, cpi->twopass.this_frame_stats);
-    accumulate_stats(cpi->twopass.total_stats, &fps);
-  }
-
-  // Copy the previous Last Frame back into gf and arf buffers if
-  // the prediction is good enough... but also don't allow it to lag too far.
-  if ((cpi->twopass.sr_update_lag > 3) ||
-      ((cm->current_video_frame > 0) &&
-       (cpi->twopass.this_frame_stats->pcnt_inter > 0.20) &&
-       ((cpi->twopass.this_frame_stats->intra_error /
-         cpi->twopass.this_frame_stats->coded_error) > 2.0))) {
-    vp8_yv12_copy_frame_ptr(lst_yv12, gld_yv12);
-    cpi->twopass.sr_update_lag = 1;
-  } else
-    cpi->twopass.sr_update_lag++;
-
-  // swap frame pointers so last frame refers to the frame we just compressed
-  vp9_swap_yv12_buffer(lst_yv12, new_yv12);
-  vp8_yv12_extend_frame_borders(lst_yv12);
-
-  // Special case for the first frame. Copy into the GF buffer as a second reference.
-  if (cm->current_video_frame == 0) {
-    vp8_yv12_copy_frame_ptr(lst_yv12, gld_yv12);
-  }
-
-
-  // use this to see what the first pass reconstruction looks like
-  if (0) {
-    char filename[512];
-    FILE *recon_file;
-    sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame);
-
-    if (cm->current_video_frame == 0)
-      recon_file = fopen(filename, "wb");
-    else
-      recon_file = fopen(filename, "ab");
-
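-    // The empty if () body deliberately discards fwrite's return value;
-    // this reconstruction dump is debug-only (guarded by "if (0)" above).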
-    if (fwrite(lst_yv12->buffer_alloc, lst_yv12->frame_size, 1, recon_file));
-    fclose(recon_file);
-  }
-
-  cm->current_video_frame++;
-
-}
-
-// Estimate a cost per mb attributable to overheads such as the coding of
-// modes and motion vectors.
-// Currently simplistic in its assumptions for testing.
-
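-// Cost in bits of an event with probability prob: -log2(prob).
-// For example, bitcost(0.5) == 1.0 and bitcost(0.25) == 2.0.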
-static double bitcost(double prob) {
-  return -(log(prob) / log(2.0));
-}
-
-static long long estimate_modemvcost(VP9_COMP *cpi,
-                                     FIRSTPASS_STATS *fpstats) {
-  int mv_cost;
-  int mode_cost;
-
-  double av_pct_inter = fpstats->pcnt_inter / fpstats->count;
-  double av_pct_motion = fpstats->pcnt_motion / fpstats->count;
-  double av_intra = (1.0 - av_pct_inter);
-
-  double zz_cost;
-  double motion_cost;
-  double intra_cost;
-
-  zz_cost = bitcost(av_pct_inter - av_pct_motion);
-  motion_cost = bitcost(av_pct_motion);
-  intra_cost = bitcost(av_intra);
-
-  // Estimate of extra bits per mv overhead for mbs
-  // << 9 is the normalization to the (bits * 512) used in vp9_bits_per_mb
-  mv_cost = ((int)(fpstats->new_mv_count / fpstats->count) * 8) << 9;
-
-  // Crude estimate of overhead cost from modes
-  // << 9 is the normalization to (bits * 512) used in vp9_bits_per_mb
-  mode_cost =
-    (int)((((av_pct_inter - av_pct_motion) * zz_cost) +
-           (av_pct_motion * motion_cost) +
-           (av_intra * intra_cost)) * cpi->common.MBs) << 9;
-
-  // return mv_cost + mode_cost;
-  // TODO PGW Fix overhead costs for extended Q range
-  return 0;
-}
-
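-// The correction factor is (err_per_mb / err_divisor) ^ power_term, where
-// the power term rises with the real quantizer (q * 0.01 + pt_low, capped
-// at pt_high) and the result is clipped to the range [0.05, 2.0].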
-static double calc_correction_factor(double err_per_mb,
-                                     double err_divisor,
-                                     double pt_low,
-                                     double pt_high,
-                                     int Q) {
-  double power_term;
-  double error_term = err_per_mb / err_divisor;
-  double correction_factor;
-
-  // Adjustment based on actual quantizer to power term.
-  power_term = (vp9_convert_qindex_to_q(Q) * 0.01) + pt_low;
-  power_term = (power_term > pt_high) ? pt_high : power_term;
-
-  // Adjustments to error term
-  // TBD
-
-  // Calculate correction factor
-  correction_factor = pow(error_term, power_term);
-
-  // Clip range
-  correction_factor =
-    (correction_factor < 0.05)
-    ? 0.05 : (correction_factor > 2.0) ? 2.0 : correction_factor;
-
-  return correction_factor;
-}
-
-// Given a current maxQ value, sets a range for future values.
-// PGW TODO..
-// This code removes direct dependency on QIndex to determine the range
-// (now uses the actual quantizer) but has not been tuned.
-static void adjust_maxq_qrange(VP9_COMP *cpi) {
-  int i;
-  double q;
-
-  // Set the max corresponding to cpi->avg_q * 2.0
-  q = cpi->avg_q * 2.0;
-  cpi->twopass.maxq_max_limit = cpi->worst_quality;
-  for (i = cpi->best_quality; i <= cpi->worst_quality; i++) {
-    cpi->twopass.maxq_max_limit = i;
-    if (vp9_convert_qindex_to_q(i) >= q)
-      break;
-  }
-
-  // Set the min corresponding to cpi->avg_q * 0.5
-  q = cpi->avg_q * 0.5;
-  cpi->twopass.maxq_min_limit = cpi->best_quality;
-  for (i = cpi->worst_quality; i >= cpi->best_quality; i--) {
-    cpi->twopass.maxq_min_limit = i;
-    if (vp9_convert_qindex_to_q(i) <= q)
-      break;
-  }
-}
-
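-// Estimate the highest (worst) Q needed to code the remaining section at
-// section_target_bandwitdh: step Q up from maxq_min_limit until the
-// predicted bits per mb (normalized as bits * 512, as in vp9_bits_per_mb)
-// falls to the per-mb target for the section.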
-static int estimate_max_q(VP9_COMP *cpi,
-                          FIRSTPASS_STATS *fpstats,
-                          int section_target_bandwitdh,
-                          int overhead_bits) {
-  int Q;
-  int num_mbs = cpi->common.MBs;
-  int target_norm_bits_per_mb;
-
-  double section_err = (fpstats->coded_error / fpstats->count);
-  double sr_err_diff;
-  double sr_correction;
-  double err_per_mb = section_err / num_mbs;
-  double err_correction_factor;
-  double speed_correction = 1.0;
-  int overhead_bits_per_mb;
-
-  if (section_target_bandwitdh <= 0)
-    return cpi->twopass.maxq_max_limit;          // Highest value allowed
-
-  target_norm_bits_per_mb =
-    (section_target_bandwitdh < (1 << 20))
-    ? (512 * section_target_bandwitdh) / num_mbs
-    : 512 * (section_target_bandwitdh / num_mbs);
-
-  // Look at the drop in prediction quality between the last frame
-  // and the GF buffer (which contained an older frame).
-  sr_err_diff =
-    (fpstats->sr_coded_error - fpstats->coded_error) /
-    (fpstats->count * cpi->common.MBs);
-  sr_correction = (sr_err_diff / 32.0);
-  sr_correction = pow(sr_correction, 0.25);
-  if (sr_correction < 0.75)
-    sr_correction = 0.75;
-  else if (sr_correction > 1.25)
-    sr_correction = 1.25;
-
-  // Calculate a corrective factor based on a rolling ratio of bits spent
-  // vs target bits
-  if ((cpi->rolling_target_bits > 0) &&
-      (cpi->active_worst_quality < cpi->worst_quality)) {
-    double rolling_ratio;
-
-    rolling_ratio = (double)cpi->rolling_actual_bits /
-                    (double)cpi->rolling_target_bits;
-
-    if (rolling_ratio < 0.95)
-      cpi->twopass.est_max_qcorrection_factor -= 0.005;
-    else if (rolling_ratio > 1.05)
-      cpi->twopass.est_max_qcorrection_factor += 0.005;
-
-    cpi->twopass.est_max_qcorrection_factor =
-      (cpi->twopass.est_max_qcorrection_factor < 0.1)
-      ? 0.1
-      : (cpi->twopass.est_max_qcorrection_factor > 10.0)
-      ? 10.0 : cpi->twopass.est_max_qcorrection_factor;
-  }
-
-  // Corrections for higher compression speed settings
-  // (reduced compression expected)
-  if (cpi->compressor_speed == 1) {
-    if (cpi->oxcf.cpu_used <= 5)
-      speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
-    else
-      speed_correction = 1.25;
-  }
-
-  // Estimate of overhead bits per mb
-  // Correction to overhead bits for min allowed Q.
-  // PGW TODO.. This code is broken for the extended Q range
-  //            for now overhead set to 0.
-  overhead_bits_per_mb = overhead_bits / num_mbs;
-  overhead_bits_per_mb *= pow(0.98, (double)cpi->twopass.maxq_min_limit);
-
-  // Try and pick a max Q that will be high enough to encode the
-  // content at the given rate.
-  for (Q = cpi->twopass.maxq_min_limit; Q < cpi->twopass.maxq_max_limit; Q++) {
-    int bits_per_mb_at_this_q;
-
-    err_correction_factor =
-      calc_correction_factor(err_per_mb, ERR_DIVISOR, 0.4, 0.90, Q) *
-      sr_correction * speed_correction *
-      cpi->twopass.est_max_qcorrection_factor;
-
-    if (err_correction_factor < 0.05)
-      err_correction_factor = 0.05;
-    else if (err_correction_factor > 5.0)
-      err_correction_factor = 5.0;
-
-    bits_per_mb_at_this_q =
-      vp9_bits_per_mb(INTER_FRAME, Q) + overhead_bits_per_mb;
-
-    bits_per_mb_at_this_q = (int)(.5 + err_correction_factor *
-                                  (double)bits_per_mb_at_this_q);
-
-    // Mode and motion overhead
-    // As Q rises in real encode loop rd code will force overhead down
-    // We make a crude adjustment for this here as *.98 per Q step.
-    // PGW TODO.. This code is broken for the extended Q range
-    //            for now overhead set to 0.
-    // overhead_bits_per_mb = (int)((double)overhead_bits_per_mb * 0.98);
-
-    if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
-      break;
-  }
-
-  // Restriction on active max q for constrained quality mode.
-  if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
-      (Q < cpi->cq_target_quality)) {
-    Q = cpi->cq_target_quality;
-  }
-
-  // Adjust maxq_min_limit and maxq_max_limit based on the
-  // average q observed in the clip for non kf/gf/arf frames.
-  // Give the average a chance to settle though.
-  // PGW TODO.. This code is broken for the extended Q range
-  if ((cpi->ni_frames >
-       ((unsigned int)cpi->twopass.total_stats->count >> 8)) &&
-      (cpi->ni_frames > 150)) {
-    adjust_maxq_qrange(cpi);
-  }
-
-  return Q;
-}
-
-// For cq mode estimate a cq level that matches the observed
-// complexity and data rate.
-static int estimate_cq(VP9_COMP *cpi,
-                       FIRSTPASS_STATS *fpstats,
-                       int section_target_bandwitdh,
-                       int overhead_bits) {
-  int Q;
-  int num_mbs = cpi->common.MBs;
-  int target_norm_bits_per_mb;
-
-  double section_err = (fpstats->coded_error / fpstats->count);
-  double err_per_mb = section_err / num_mbs;
-  double err_correction_factor;
-  double sr_err_diff;
-  double sr_correction;
-  double speed_correction = 1.0;
-  double clip_iiratio;
-  double clip_iifactor;
-  int overhead_bits_per_mb;
-
-
-  target_norm_bits_per_mb = (section_target_bandwitdh < (1 << 20))
-                            ? (512 * section_target_bandwitdh) / num_mbs
-                            : 512 * (section_target_bandwitdh / num_mbs);
-
-  // Estimate of overhead bits per mb
-  overhead_bits_per_mb = overhead_bits / num_mbs;
-
-  // Corrections for higher compression speed settings
-  // (reduced compression expected)
-  if (cpi->compressor_speed == 1) {
-    if (cpi->oxcf.cpu_used <= 5)
-      speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
-    else
-      speed_correction = 1.25;
-  }
-
-  // Look at the drop in prediction quality between the last frame
-  // and the GF buffer (which contained an older frame).
-  sr_err_diff =
-    (fpstats->sr_coded_error - fpstats->coded_error) /
-    (fpstats->count * cpi->common.MBs);
-  sr_correction = (sr_err_diff / 32.0);
-  sr_correction = pow(sr_correction, 0.25);
-  if (sr_correction < 0.75)
-    sr_correction = 0.75;
-  else if (sr_correction > 1.25)
-    sr_correction = 1.25;
-
-  // II ratio correction factor for clip as a whole
-  clip_iiratio = cpi->twopass.total_stats->intra_error /
-                 DOUBLE_DIVIDE_CHECK(cpi->twopass.total_stats->coded_error);
-  clip_iifactor = 1.0 - ((clip_iiratio - 10.0) * 0.025);
-  if (clip_iifactor < 0.80)
-    clip_iifactor = 0.80;
-
-  // Try and pick a Q that can encode the content at the given rate.
-  for (Q = 0; Q < MAXQ; Q++) {
-    int bits_per_mb_at_this_q;
-
-    // Error per MB based correction factor
-    err_correction_factor =
-      calc_correction_factor(err_per_mb, 100.0, 0.4, 0.90, Q) *
-      sr_correction * speed_correction * clip_iifactor;
-
-    if (err_correction_factor < 0.05)
-      err_correction_factor = 0.05;
-    else if (err_correction_factor > 5.0)
-      err_correction_factor = 5.0;
-
-    bits_per_mb_at_this_q =
-      vp9_bits_per_mb(INTER_FRAME, Q) + overhead_bits_per_mb;
-
-    bits_per_mb_at_this_q = (int)(.5 + err_correction_factor *
-                                  (double)bits_per_mb_at_this_q);
-
-    // Mode and motion overhead
-    // As Q rises in real encode loop rd code will force overhead down
-    // We make a crude adjustment for this here as *.98 per Q step.
-    // PGW TODO.. This code is broken for the extended Q range
-    //            for now overhead set to 0.
-    overhead_bits_per_mb = (int)((double)overhead_bits_per_mb * 0.98);
-
-    if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
-      break;
-  }
-
-  // Clip value to range "best allowed to (worst allowed - 1)"
-  Q = select_cq_level(Q);
-  if (Q >= cpi->worst_quality)
-    Q = cpi->worst_quality - 1;
-  if (Q < cpi->best_quality)
-    Q = cpi->best_quality;
-
-  return Q;
-}
-
-
-extern void vp9_new_frame_rate(VP9_COMP *cpi, double framerate);
-
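-// One-time setup for the second pass: total the first pass stats, derive
-// the overall bit budget from the measured clip duration and target rate,
-// and compute the clip's average intra/inter error ratio and the modified
-// (bit-allocation) error totals.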
-void vp9_init_second_pass(VP9_COMP *cpi) {
-  FIRSTPASS_STATS this_frame;
-  FIRSTPASS_STATS *start_pos;
-
-  double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.frame_rate;
-  double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth
-                                      * cpi->oxcf.two_pass_vbrmin_section / 100);
-
-  if (two_pass_min_rate < lower_bounds_min_rate)
-    two_pass_min_rate = lower_bounds_min_rate;
-
-  zero_stats(cpi->twopass.total_stats);
-  zero_stats(cpi->twopass.total_left_stats);
-
-  if (!cpi->twopass.stats_in_end)
-    return;
-
-  *cpi->twopass.total_stats = *cpi->twopass.stats_in_end;
-  *cpi->twopass.total_left_stats = *cpi->twopass.total_stats;
-
-  // Each frame can have a different duration, as the frame rate in the
-  // source isn't guaranteed to be constant. The frame rate prior to the
-  // first frame encoded in the second pass is a guess. However, the sum
-  // duration is not: it is calculated from the actual durations of all
-  // frames in the first pass.
-  vp9_new_frame_rate(cpi,
-                     10000000.0 * cpi->twopass.total_stats->count /
-                     cpi->twopass.total_stats->duration);
-
-  cpi->output_frame_rate = cpi->oxcf.frame_rate;
-  cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats->duration *
-                                     cpi->oxcf.target_bandwidth / 10000000.0);
-  cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats->duration *
-                                      two_pass_min_rate / 10000000.0);
-
-  // Calculate a minimum intra value to be used in determining the IIratio
-  // scores used in the second pass. We have this minimum to make sure
-  // that clips that are static but "low complexity" in the intra domain
-  // are still boosted appropriately for KF/GF/ARF
-  cpi->twopass.kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs;
-  cpi->twopass.gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs;
-
-  // This variable monitors how far behind the second ref update is lagging
-  cpi->twopass.sr_update_lag = 1;
-
-  // Scan the first pass file and calculate an average Intra / Inter error score ratio for the sequence
-  {
-    double sum_iiratio = 0.0;
-    double IIRatio;
-
-    start_pos = cpi->twopass.stats_in;  // Note starting "file" position
-
-    while (input_stats(cpi, &this_frame) != EOF) {
-      IIRatio = this_frame.intra_error / DOUBLE_DIVIDE_CHECK(this_frame.coded_error);
-      IIRatio = (IIRatio < 1.0) ? 1.0 : (IIRatio > 20.0) ? 20.0 : IIRatio;
-      sum_iiratio += IIRatio;
-    }
-
-    cpi->twopass.avg_iiratio = sum_iiratio / DOUBLE_DIVIDE_CHECK((double)cpi->twopass.total_stats->count);
-
-    // Reset file position
-    reset_fpf_position(cpi, start_pos);
-  }
-
-  // Scan the first pass file and calculate a modified total error based upon the bias/power function
-  // used to allocate bits
-  {
-    start_pos = cpi->twopass.stats_in;  // Note starting "file" position
-
-    cpi->twopass.modified_error_total = 0.0;
-    cpi->twopass.modified_error_used = 0.0;
-
-    while (input_stats(cpi, &this_frame) != EOF) {
-      cpi->twopass.modified_error_total += calculate_modified_err(cpi, &this_frame);
-    }
-    cpi->twopass.modified_error_left = cpi->twopass.modified_error_total;
-
-    reset_fpf_position(cpi, start_pos);  // Reset file position
-  }
-}
-
-void vp9_end_second_pass(VP9_COMP *cpi) {
-}
-
-// This function gives an estimate of how badly we believe
-// the prediction quality is decaying from frame to frame.
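-// The rate is the fraction of inter coded mbs (pcnt_inter), further capped
-// by a second-reference term sqrt(1.0 - mb_sr_err_diff / 512.0) that is
-// itself clamped to the range [0.85, 1.0].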
-static double get_prediction_decay_rate(VP9_COMP *cpi,
-                                        FIRSTPASS_STATS *next_frame) {
-  double prediction_decay_rate;
-  double second_ref_decay;
-  double mb_sr_err_diff;
-
-  // Initial basis is the % mbs inter coded
-  prediction_decay_rate = next_frame->pcnt_inter;
-
-  // Look at the observed drop in prediction quality between the last frame
-  // and the GF buffer (which contains an older frame).
-  mb_sr_err_diff =
-    (next_frame->sr_coded_error - next_frame->coded_error) /
-    (cpi->common.MBs);
-  second_ref_decay = 1.0 - (mb_sr_err_diff / 512.0);
-  second_ref_decay = pow(second_ref_decay, 0.5);
-  if (second_ref_decay < 0.85)
-    second_ref_decay = 0.85;
-  else if (second_ref_decay > 1.0)
-    second_ref_decay = 1.0;
-
-  if (second_ref_decay < prediction_decay_rate)
-    prediction_decay_rate = second_ref_decay;
-
-  return prediction_decay_rate;
-}
-
-// Function to test for a condition where a complex transition is followed
-// by a static section. For example in slide shows where there is a fade
-// between slides. This is to help with more optimal kf and gf positioning.
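-// A transition to still is signalled when the decay rate jumps from below
-// 0.9 to 0.999+ and the zero motion fraction (pcnt_inter - pcnt_motion)
-// then stays at or above 0.999 for still_interval consecutive frames.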
-static int detect_transition_to_still(
-  VP9_COMP *cpi,
-  int frame_interval,
-  int still_interval,
-  double loop_decay_rate,
-  double last_decay_rate) {
-  BOOL trans_to_still = FALSE;
-
-  // Break clause to detect very still sections after motion
-  // For example a static image after a fade or other transition
-  // instead of a clean scene cut.
-  if ((frame_interval > MIN_GF_INTERVAL) &&
-      (loop_decay_rate >= 0.999) &&
-      (last_decay_rate < 0.9)) {
-    int j;
-    FIRSTPASS_STATS *position = cpi->twopass.stats_in;
-    FIRSTPASS_STATS tmp_next_frame;
-    double zz_inter;
-
-    // Look ahead a few frames to see if static condition
-    // persists...
-    for (j = 0; j < still_interval; j++) {
-      if (EOF == input_stats(cpi, &tmp_next_frame))
-        break;
-
-      zz_inter =
-        (tmp_next_frame.pcnt_inter - tmp_next_frame.pcnt_motion);
-      if (zz_inter < 0.999)
-        break;
-    }
-    // Reset file position
-    reset_fpf_position(cpi, position);
-
-    // Only if it does do we signal a transition to still
-    if (j == still_interval)
-      trans_to_still = TRUE;
-  }
-
-  return trans_to_still;
-}
-
-// This function detects a flash through the high relative pcnt_second_ref
-// score in the frame following a flash frame. The offset passed in should
-// reflect this.
-static BOOL detect_flash(VP9_COMP *cpi, int offset) {
-  FIRSTPASS_STATS next_frame;
-
-  BOOL flash_detected = FALSE;
-
-  // Read the frame data.
-  // The return is FALSE (no flash detected) if not a valid frame
-  if (read_frame_stats(cpi, &next_frame, offset) != EOF) {
-    // What we are looking for here is a situation where there is a
-    // brief break in prediction (such as a flash) but subsequent frames
-    // are reasonably well predicted by an earlier (pre-flash) frame.
-    // The recovery after a flash is indicated by a high pcnt_second_ref
-    // compared to pcnt_inter.
-    if ((next_frame.pcnt_second_ref > next_frame.pcnt_inter) &&
-        (next_frame.pcnt_second_ref >= 0.5)) {
-      flash_detected = TRUE;
-    }
-  }
-
-  return flash_detected;
-}
-
-// Update the motion related elements to the GF arf boost calculation
-static void accumulate_frame_motion_stats(
-  VP9_COMP *cpi,
-  FIRSTPASS_STATS *this_frame,
-  double *this_frame_mv_in_out,
-  double *mv_in_out_accumulator,
-  double *abs_mv_in_out_accumulator,
-  double *mv_ratio_accumulator) {
-  // double this_frame_mv_in_out;
-  double this_frame_mvr_ratio;
-  double this_frame_mvc_ratio;
-  double motion_pct;
-
-  // Accumulate motion stats.
-  motion_pct = this_frame->pcnt_motion;
-
-  // Accumulate Motion In/Out of frame stats
-  *this_frame_mv_in_out = this_frame->mv_in_out_count * motion_pct;
-  *mv_in_out_accumulator += this_frame->mv_in_out_count * motion_pct;
-  *abs_mv_in_out_accumulator +=
-    fabs(this_frame->mv_in_out_count * motion_pct);
-
-  // Accumulate a measure of how uniform (or conversely how random)
-  // the motion field is. (A ratio of absmv / mv)
-  if (motion_pct > 0.05) {
-    this_frame_mvr_ratio = fabs(this_frame->mvr_abs) /
-                           DOUBLE_DIVIDE_CHECK(fabs(this_frame->MVr));
-
-    this_frame_mvc_ratio = fabs(this_frame->mvc_abs) /
-                           DOUBLE_DIVIDE_CHECK(fabs(this_frame->MVc));
-
-    *mv_ratio_accumulator +=
-      (this_frame_mvr_ratio < this_frame->mvr_abs)
-      ? (this_frame_mvr_ratio * motion_pct)
-      : this_frame->mvr_abs * motion_pct;
-
-    *mv_ratio_accumulator +=
-      (this_frame_mvc_ratio < this_frame->mvc_abs)
-      ? (this_frame_mvc_ratio * motion_pct)
-      : this_frame->mvc_abs * motion_pct;
-
-  }
-}
-
-// Calculate a baseline boost number for the current frame.
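-// The underlying boost is IIFACTOR times the intra / coded error ratio,
-// with intra_error floored at gf_intra_err_min. A net flow of motion into
-// the frame (zoom out) scales the boost up by as much as 3x; in the extreme
-// zoom-in case it is halved. The result is capped at GF_RMAX.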
-static double calc_frame_boost(
-  VP9_COMP *cpi,
-  FIRSTPASS_STATS *this_frame,
-  double this_frame_mv_in_out) {
-  double frame_boost;
-
-  // Underlying boost factor is based on the inter / intra error ratio
-  if (this_frame->intra_error > cpi->twopass.gf_intra_err_min)
-    frame_boost = (IIFACTOR * this_frame->intra_error /
-                   DOUBLE_DIVIDE_CHECK(this_frame->coded_error));
-  else
-    frame_boost = (IIFACTOR * cpi->twopass.gf_intra_err_min /
-                   DOUBLE_DIVIDE_CHECK(this_frame->coded_error));
-
-  // Increase boost for frames where new data is coming into the frame
-  // (e.g. zoom out). Slightly reduce boost if there is a net balance
-  // of motion out of the frame (zoom in).
-  // The range for this_frame_mv_in_out is -1.0 to +1.0.
-  if (this_frame_mv_in_out > 0.0)
-    frame_boost += frame_boost * (this_frame_mv_in_out * 2.0);
-  // In extreme case boost is halved
-  else
-    frame_boost += frame_boost * (this_frame_mv_in_out / 2.0);
-
-  // Clip to maximum
-  if (frame_boost > GF_RMAX)
-    frame_boost = GF_RMAX;
-
-  return frame_boost;
-}
-
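-// Estimate the boost for an arf at the given offset by accumulating decayed
-// per-frame boosts forwards over f_frames and backwards over b_frames,
-// with flash frames exempted from decay and a floor of 20 points per frame
-// spanned.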
-static int calc_arf_boost(
-  VP9_COMP *cpi,
-  int offset,
-  int f_frames,
-  int b_frames,
-  int *f_boost,
-  int *b_boost) {
-  FIRSTPASS_STATS this_frame;
-
-  int i;
-  double boost_score = 0.0;
-  double mv_ratio_accumulator = 0.0;
-  double decay_accumulator = 1.0;
-  double this_frame_mv_in_out = 0.0;
-  double mv_in_out_accumulator = 0.0;
-  double abs_mv_in_out_accumulator = 0.0;
-  int arf_boost;
-  BOOL flash_detected = FALSE;
-
-  // Search forward from the proposed arf/next gf position
-  for (i = 0; i < f_frames; i++) {
-    if (read_frame_stats(cpi, &this_frame, (i + offset)) == EOF)
-      break;
-
-    // Update the motion related elements to the boost calculation
-    accumulate_frame_motion_stats(cpi, &this_frame,
-                                  &this_frame_mv_in_out, &mv_in_out_accumulator,
-                                  &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
-
-    // We want to discount the flash frame itself and the recovery
-    // frame that follows as both will have poor scores.
-    flash_detected = detect_flash(cpi, (i + offset)) ||
-                     detect_flash(cpi, (i + offset + 1));
-
-    // Cumulative effect of prediction quality decay
-    if (!flash_detected) {
-      decay_accumulator =
-        decay_accumulator *
-        get_prediction_decay_rate(cpi, &this_frame);
-      decay_accumulator =
-        decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
-    }
-
-    boost_score += (decay_accumulator *
-                    calc_frame_boost(cpi, &this_frame, this_frame_mv_in_out));
-  }
-
-  *f_boost = boost_score;
-
-  // Reset for backward looking loop
-  boost_score = 0.0;
-  mv_ratio_accumulator = 0.0;
-  decay_accumulator = 1.0;
-  this_frame_mv_in_out = 0.0;
-  mv_in_out_accumulator = 0.0;
-  abs_mv_in_out_accumulator = 0.0;
-
-  // Search backward towards last gf position
-  for (i = -1; i >= -b_frames; i--) {
-    if (read_frame_stats(cpi, &this_frame, (i + offset)) == EOF)
-      break;
-
-    // Update the motion related elements to the boost calculation
-    accumulate_frame_motion_stats(cpi, &this_frame,
-                                  &this_frame_mv_in_out, &mv_in_out_accumulator,
-                                  &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
-
-    // We want to discount the flash frame itself and the recovery
-    // frame that follows as both will have poor scores.
-    flash_detected = detect_flash(cpi, (i + offset)) ||
-                     detect_flash(cpi, (i + offset + 1));
-
-    // Cumulative effect of prediction quality decay
-    if (!flash_detected) {
-      decay_accumulator =
-        decay_accumulator *
-        get_prediction_decay_rate(cpi, &this_frame);
-      decay_accumulator =
-        decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
-    }
-
-    boost_score += (decay_accumulator *
-                    calc_frame_boost(cpi, &this_frame, this_frame_mv_in_out));
-
-  }
-  *b_boost = boost_score;
-
-  arf_boost = (*f_boost + *b_boost);
-  if (arf_boost < ((b_frames + f_frames) * 20))
-    arf_boost = ((b_frames + f_frames) * 20);
-
-  return arf_boost;
-}
-
-static void configure_arnr_filter(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
-  int half_gf_int;
-  int frames_after_arf;
-  int frames_bwd = cpi->oxcf.arnr_max_frames - 1;
-  int frames_fwd = cpi->oxcf.arnr_max_frames - 1;
-
-  // Define the arnr filter width for this group of frames:
-  // We only filter frames that lie within a distance of half
-  // the GF interval from the ARF frame. We also have to trap
-  // cases where the filter extends beyond the end of the clip.
-  // Note: this_frame->frame has been updated in the loop
-  // so it now points at the ARF frame.
-  half_gf_int = cpi->baseline_gf_interval >> 1;
-  frames_after_arf = cpi->twopass.total_stats->count -
-                     this_frame->frame - 1;
-
-  switch (cpi->oxcf.arnr_type) {
-    case 1: // Backward filter
-      frames_fwd = 0;
-      if (frames_bwd > half_gf_int)
-        frames_bwd = half_gf_int;
-      break;
-
-    case 2: // Forward filter
-      if (frames_fwd > half_gf_int)
-        frames_fwd = half_gf_int;
-      if (frames_fwd > frames_after_arf)
-        frames_fwd = frames_after_arf;
-      frames_bwd = 0;
-      break;
-
-    case 3: // Centered filter
-    default:
-      frames_fwd >>= 1;
-      if (frames_fwd > frames_after_arf)
-        frames_fwd = frames_after_arf;
-      if (frames_fwd > half_gf_int)
-        frames_fwd = half_gf_int;
-
-      frames_bwd = frames_fwd;
-
-      // For even length filter there is one more frame backward
-      // than forward: e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff.
-      if (frames_bwd < half_gf_int)
-        frames_bwd += (cpi->oxcf.arnr_max_frames + 1) & 0x1;
-      break;
-  }
-
-  cpi->active_arnr_frames = frames_bwd + 1 + frames_fwd;
-}
-
-// Analyse and define a gf/arf group.
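-// Scan forward through the stats to choose the length of the next gf/arf
-// group, decide whether an alt ref should be coded for it, and divide the
-// kf group bit budget between the boosted frame(s) and the rest of the
-// group.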
-static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
-  FIRSTPASS_STATS next_frame;
-  FIRSTPASS_STATS *start_pos;
-  int i;
-  double boost_score = 0.0;
-  double old_boost_score = 0.0;
-  double gf_group_err = 0.0;
-  double gf_first_frame_err = 0.0;
-  double mod_frame_err = 0.0;
-
-  double mv_ratio_accumulator = 0.0;
-  double decay_accumulator = 1.0;
-  double zero_motion_accumulator = 1.0;
-
-  double loop_decay_rate = 1.00;          // Starting decay rate
-  double last_loop_decay_rate = 1.00;
-
-  double this_frame_mv_in_out = 0.0;
-  double mv_in_out_accumulator = 0.0;
-  double abs_mv_in_out_accumulator = 0.0;
-
-  int max_bits = frame_max_bits(cpi);     // Max for a single frame
-
-  unsigned int allow_alt_ref =
-    cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames;
-
-  int f_boost = 0;
-  int b_boost = 0;
-  BOOL flash_detected;
-
-  cpi->twopass.gf_group_bits = 0;
-
-  vp9_clear_system_state();  // __asm emms;
-
-  start_pos = cpi->twopass.stats_in;
-
-  vpx_memset(&next_frame, 0, sizeof(next_frame)); // assure clean
-
-  // Load stats for the current frame.
-  mod_frame_err = calculate_modified_err(cpi, this_frame);
-
-  // Note the error of the frame at the start of the group (this will be
-  // the GF frame error if we code a normal gf).
-  gf_first_frame_err = mod_frame_err;
-
-  // Special treatment if the current frame is a key frame (which is also
-  // a gf). If it is, then its error score (and hence bit allocation) needs
-  // to be subtracted out from the calculation for the GF group.
-  if (cpi->common.frame_type == KEY_FRAME)
-    gf_group_err -= gf_first_frame_err;
-
-  // Scan forward to try and work out how many frames the next gf group
-  // should contain and what level of boost is appropriate for the GF
-  // or ARF that will be coded with the group
-  i = 0;
-
-  while (((i < cpi->twopass.static_scene_max_gf_interval) ||
-          ((cpi->twopass.frames_to_key - i) < MIN_GF_INTERVAL)) &&
-         (i < cpi->twopass.frames_to_key)) {
-    i++;    // Increment the loop counter
-
-    // Accumulate error score of frames in this gf group
-    mod_frame_err = calculate_modified_err(cpi, this_frame);
-    gf_group_err += mod_frame_err;
-
-    if (EOF == input_stats(cpi, &next_frame))
-      break;
-
-    // Test for the case where there is a brief flash but the prediction
-    // quality back to an earlier frame is then restored.
-    flash_detected = detect_flash(cpi, 0);
-
-    // Update the motion related elements to the boost calculation
-    accumulate_frame_motion_stats(cpi, &next_frame,
-                                  &this_frame_mv_in_out, &mv_in_out_accumulator,
-                                  &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
-
-    // Cumulative effect of prediction quality decay
-    if (!flash_detected) {
-      last_loop_decay_rate = loop_decay_rate;
-      loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
-      decay_accumulator = decay_accumulator * loop_decay_rate;
-
-      // Monitor for static sections.
-      if ((next_frame.pcnt_inter - next_frame.pcnt_motion) <
-          zero_motion_accumulator) {
-        zero_motion_accumulator =
-          (next_frame.pcnt_inter - next_frame.pcnt_motion);
-      }
-
-      // Break clause to detect very still sections after motion
-      // (for example a static image after a fade or other transition).
-      if (detect_transition_to_still(cpi, i, 5, loop_decay_rate,
-                                     last_loop_decay_rate)) {
-        allow_alt_ref = FALSE;
-        break;
-      }
-    }
-
-    // Calculate a boost number for this frame
-    boost_score +=
-      (decay_accumulator *
-       calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out));
-
-    // Break out conditions.
-    if (
-      // Break at cpi->max_gf_interval unless almost totally static
-      (i >= cpi->max_gf_interval && (zero_motion_accumulator < 0.995)) ||
-      (
-        // Don't break out with a very short interval
-        (i > MIN_GF_INTERVAL) &&
-        // Don't break out very close to a key frame
-        ((cpi->twopass.frames_to_key - i) >= MIN_GF_INTERVAL) &&
-        ((boost_score > 125.0) || (next_frame.pcnt_inter < 0.75)) &&
-        (!flash_detected) &&
-        ((mv_ratio_accumulator > 100.0) ||
-         (abs_mv_in_out_accumulator > 3.0) ||
-         (mv_in_out_accumulator < -2.0) ||
-         ((boost_score - old_boost_score) < 12.5))
-      )) {
-      boost_score = old_boost_score;
-      break;
-    }
-
-    vpx_memcpy(this_frame, &next_frame, sizeof(*this_frame));
-
-    old_boost_score = boost_score;
-  }
-
-  // Don't allow a gf too near the next kf
-  if ((cpi->twopass.frames_to_key - i) < MIN_GF_INTERVAL) {
-    while (i < cpi->twopass.frames_to_key) {
-      i++;
-
-      if (EOF == input_stats(cpi, this_frame))
-        break;
-
-      if (i < cpi->twopass.frames_to_key) {
-        mod_frame_err = calculate_modified_err(cpi, this_frame);
-        gf_group_err += mod_frame_err;
-      }
-    }
-  }
-
-  // Set the interval till the next gf or arf.
-  cpi->baseline_gf_interval = i;
-
-  // Should we use the alternate reference frame?
-  if (allow_alt_ref &&
-      (i < cpi->oxcf.lag_in_frames) &&
-      (i >= MIN_GF_INTERVAL) &&
-      // don't use an ARF very near the next kf
-      (i <= (cpi->twopass.frames_to_key - MIN_GF_INTERVAL)) &&
-      ((next_frame.pcnt_inter > 0.75) ||
-       (next_frame.pcnt_second_ref > 0.5)) &&
-      ((mv_in_out_accumulator / (double)i > -0.2) ||
-       (mv_in_out_accumulator > -2.0)) &&
-      (boost_score > 100)) {
-    // Alternative boost calculation for alt ref
-    cpi->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost);
-    cpi->source_alt_ref_pending = TRUE;
-
-    configure_arnr_filter(cpi, this_frame);
-  } else {
-    cpi->gfu_boost = (int)boost_score;
-    cpi->source_alt_ref_pending = FALSE;
-  }
-
-  // Now decide how many bits should be allocated to the GF group as a
-  // proportion of those remaining in the kf group.
-  // The final key frame group in the clip is treated as a special case
-  // where cpi->twopass.kf_group_bits is tied to cpi->twopass.bits_left.
-  // This is also important for short clips where there may only be one
-  // key frame.
-  if (cpi->twopass.frames_to_key >= (int)(cpi->twopass.total_stats->count -
-                                          cpi->common.current_video_frame)) {
-    cpi->twopass.kf_group_bits =
-      (cpi->twopass.bits_left > 0) ? cpi->twopass.bits_left : 0;
-  }
-
-  // Calculate the bits to be allocated to the group as a whole
-  if ((cpi->twopass.kf_group_bits > 0) &&
-      (cpi->twopass.kf_group_error_left > 0)) {
-    cpi->twopass.gf_group_bits =
-      (int)((double)cpi->twopass.kf_group_bits *
-            (gf_group_err / (double)cpi->twopass.kf_group_error_left));
-  } else
-    cpi->twopass.gf_group_bits = 0;
-
-  cpi->twopass.gf_group_bits =
-    (cpi->twopass.gf_group_bits < 0)
-    ? 0
-    : (cpi->twopass.gf_group_bits > cpi->twopass.kf_group_bits)
-    ? cpi->twopass.kf_group_bits : cpi->twopass.gf_group_bits;
-
-  // Clip cpi->twopass.gf_group_bits based on user supplied data rate
-  // variability limit (cpi->oxcf.two_pass_vbrmax_section)
-  if (cpi->twopass.gf_group_bits > max_bits * cpi->baseline_gf_interval)
-    cpi->twopass.gf_group_bits = max_bits * cpi->baseline_gf_interval;
-
-  // Reset the file position
-  reset_fpf_position(cpi, start_pos);
-
-  // Update the record of error used so far (only done once per gf group)
-  cpi->twopass.modified_error_used += gf_group_err;
-
-  // Assign bits to the arf or gf.
-  for (i = 0; i <= (cpi->source_alt_ref_pending && cpi->common.frame_type != KEY_FRAME); i++) {
-    int boost;
-    int allocation_chunks;
-    int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
-    int gf_bits;
-
-    boost = (cpi->gfu_boost * vp9_gfboost_qadjust(Q)) / 100;
-
-    // Set max and minimum boost and hence minimum allocation
-    if (boost > ((cpi->baseline_gf_interval + 1) * 200))
-      boost = ((cpi->baseline_gf_interval + 1) * 200);
-    else if (boost < 125)
-      boost = 125;
-
-    if (cpi->source_alt_ref_pending && i == 0)
-      allocation_chunks =
-        ((cpi->baseline_gf_interval + 1) * 100) + boost;
-    else
-      allocation_chunks =
-        (cpi->baseline_gf_interval * 100) + (boost - 100);
-
-    // Prevent overflow
-    if (boost > 1028) {
-      int divisor = boost >> 10;
-      boost /= divisor;
-      allocation_chunks /= divisor;
-    }
-
-    // Calculate the number of bits to be spent on the gf or arf based on
-    // the boost number
-    gf_bits = (int)((double)boost *
-                    (cpi->twopass.gf_group_bits /
-                     (double)allocation_chunks));
-
-    // If the frame that is to be boosted is simpler than the average for
-    // the gf/arf group then use an alternative calculation
-    // based on the error score of the frame itself
-    if (mod_frame_err < gf_group_err / (double)cpi->baseline_gf_interval) {
-      double  alt_gf_grp_bits;
-      int     alt_gf_bits;
-
-      alt_gf_grp_bits =
-        (double)cpi->twopass.kf_group_bits  *
-        (mod_frame_err * (double)cpi->baseline_gf_interval) /
-        DOUBLE_DIVIDE_CHECK((double)cpi->twopass.kf_group_error_left);
-
-      alt_gf_bits = (int)((double)boost * (alt_gf_grp_bits /
-                                           (double)allocation_chunks));
-
-      if (gf_bits > alt_gf_bits) {
-        gf_bits = alt_gf_bits;
-      }
-    }
-    // Else if it is harder than other frames in the group make sure it at
-    // least receives an allocation in keeping with its relative error
-    // score, otherwise it may be worse off than an "un-boosted" frame
-    else {
-      int alt_gf_bits =
-        (int)((double)cpi->twopass.kf_group_bits *
-              mod_frame_err /
-              DOUBLE_DIVIDE_CHECK((double)cpi->twopass.kf_group_error_left));
-
-      if (alt_gf_bits > gf_bits) {
-        gf_bits = alt_gf_bits;
-      }
-    }
-
-    // Don't allow a negative value for gf_bits
-    if (gf_bits < 0)
-      gf_bits = 0;
-
-    gf_bits += cpi->min_frame_bandwidth;  // Add in the minimum for a frame
-
-    if (i == 0) {
-      cpi->twopass.gf_bits = gf_bits;
-    }
-    if (i == 1 || (!cpi->source_alt_ref_pending && (cpi->common.frame_type != KEY_FRAME))) {
-      cpi->per_frame_bandwidth = gf_bits;  // Per frame bit target for this frame
-    }
-  }
-
-  {
-    // Adjust KF group bits and error remaining
-    cpi->twopass.kf_group_error_left -= gf_group_err;
-    cpi->twopass.kf_group_bits -= cpi->twopass.gf_group_bits;
-
-    if (cpi->twopass.kf_group_bits < 0)
-      cpi->twopass.kf_group_bits = 0;
-
-    // Note the error score left in the remaining frames of the group.
-    // For normal GFs we want to remove the error score for the first frame
-    // of the group (except in Key frame case where this has already
-    // happened)
-    if (!cpi->source_alt_ref_pending && cpi->common.frame_type != KEY_FRAME)
-      cpi->twopass.gf_group_error_left = gf_group_err - gf_first_frame_err;
-    else
-      cpi->twopass.gf_group_error_left = gf_group_err;
-
-    cpi->twopass.gf_group_bits -= cpi->twopass.gf_bits - cpi->min_frame_bandwidth;
-
-    if (cpi->twopass.gf_group_bits < 0)
-      cpi->twopass.gf_group_bits = 0;
-
-    // This condition could fail if there are two kfs very close together
-    // despite (MIN_GF_INTERVAL) and would cause a divide by 0 in the
-    // calculation of cpi->twopass.alt_extra_bits.
-    if (cpi->baseline_gf_interval >= 3) {
-      int boost = (cpi->source_alt_ref_pending)
-                  ? b_boost : cpi->gfu_boost;
-
-      if (boost >= 150) {
-        int pct_extra;
-
-        pct_extra = (boost - 100) / 50;
-        pct_extra = (pct_extra > 20) ? 20 : pct_extra;
-
-        cpi->twopass.alt_extra_bits =
-          (cpi->twopass.gf_group_bits * pct_extra) / 100;
-        cpi->twopass.gf_group_bits -= cpi->twopass.alt_extra_bits;
-        cpi->twopass.alt_extra_bits /=
-          ((cpi->baseline_gf_interval - 1) >> 1);
-      } else
-        cpi->twopass.alt_extra_bits = 0;
-    } else
-      cpi->twopass.alt_extra_bits = 0;
-  }
-
-  if (cpi->common.frame_type != KEY_FRAME) {
-    FIRSTPASS_STATS sectionstats;
-
-    zero_stats(&sectionstats);
-    reset_fpf_position(cpi, start_pos);
-
-    for (i = 0; i < cpi->baseline_gf_interval; i++) {
-      input_stats(cpi, &next_frame);
-      accumulate_stats(&sectionstats, &next_frame);
-    }
-
-    avg_stats(&sectionstats);
-
-    cpi->twopass.section_intra_rating =
-      sectionstats.intra_error /
-      DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
-
-    reset_fpf_position(cpi, start_pos);
-  }
-}
-
-// Allocate bits to a normal frame that is neither a gf, an arf, nor a key frame.
-static void assign_std_frame_bits(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
-  int    target_frame_size;
-
-  double modified_err;
-  double err_fraction;   // Portion of the remaining GF group error used by this frame
-
-  int max_bits = frame_max_bits(cpi);    // Max for a single frame
-
-  // Calculate modified prediction error used in bit allocation
-  modified_err = calculate_modified_err(cpi, this_frame);
-
-  if (cpi->twopass.gf_group_error_left > 0)
-    err_fraction = modified_err / cpi->twopass.gf_group_error_left;
-  else
-    err_fraction = 0.0;
-
-  target_frame_size = (int)((double)cpi->twopass.gf_group_bits * err_fraction);  // How many of the available bits should we give this frame?
-
-  // Clip the target size to the range 0 to max_bits (or cpi->twopass.gf_group_bits) at the top end.
-  if (target_frame_size < 0)
-    target_frame_size = 0;
-  else {
-    if (target_frame_size > max_bits)
-      target_frame_size = max_bits;
-
-    if (target_frame_size > cpi->twopass.gf_group_bits)
-      target_frame_size = cpi->twopass.gf_group_bits;
-  }
-
-  cpi->twopass.gf_group_error_left -= modified_err;  // Adjust error remaining
-  cpi->twopass.gf_group_bits -= target_frame_size;   // Adjust bits remaining
-
-  if (cpi->twopass.gf_group_bits < 0)
-    cpi->twopass.gf_group_bits = 0;
-
-  target_frame_size += cpi->min_frame_bandwidth;  // Add in the minimum number of bits set aside for every frame.
-
-
-  cpi->per_frame_bandwidth = target_frame_size;  // Per frame bit target for this frame
-}
-
-// Make a damped adjustment to the active max q.
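-// The damping moves only 1/8 of the way from the old value to the new one
-// in real q terms: target_q = (7 * old_q + new_q) / 8, mapped back to the
-// nearest qindex in the direction of the change.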
-static int adjust_active_maxq(int old_maxqi, int new_maxqi) {
-  int i;
-  int ret_val = new_maxqi;
-  double old_q;
-  double new_q;
-  double target_q;
-
-  old_q = vp9_convert_qindex_to_q(old_maxqi);
-  new_q = vp9_convert_qindex_to_q(new_maxqi);
-
-  target_q = ((old_q * 7.0) + new_q) / 8.0;
-
-  if (target_q > old_q) {
-    for (i = old_maxqi; i <= new_maxqi; i++) {
-      if (vp9_convert_qindex_to_q(i) >= target_q) {
-        ret_val = i;
-        break;
-      }
-    }
-  } else {
-    for (i = old_maxqi; i >= new_maxqi; i--) {
-      if (vp9_convert_qindex_to_q(i) <= target_q) {
-        ret_val = i;
-        break;
-      }
-    }
-  }
-
-  return ret_val;
-}
-
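-// Per frame second pass processing: read the next stats record, define new
-// kf and gf/arf groups as they fall due, set this frame's bit target and
-// nominal bandwidth, and update the active worst (max) quality estimate.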
-void vp9_second_pass(VP9_COMP *cpi) {
-  int tmp_q;
-  int frames_left = (int)(cpi->twopass.total_stats->count - cpi->common.current_video_frame);
-
-  FIRSTPASS_STATS this_frame;
-  FIRSTPASS_STATS this_frame_copy;
-
-  double this_frame_error;
-  double this_frame_intra_error;
-  double this_frame_coded_error;
-
-  FIRSTPASS_STATS *start_pos;
-
-  int overhead_bits;
-
-  if (!cpi->twopass.stats_in) {
-    return;
-  }
-
-  vp9_clear_system_state();
-
-  vpx_memset(&this_frame, 0, sizeof(FIRSTPASS_STATS));
-
-  if (EOF == input_stats(cpi, &this_frame))
-    return;
-
-  this_frame_error = this_frame.ssim_weighted_pred_err;
-  this_frame_intra_error = this_frame.intra_error;
-  this_frame_coded_error = this_frame.coded_error;
-
-  start_pos = cpi->twopass.stats_in;
-
-  // Keyframe and section processing.
-  if (cpi->twopass.frames_to_key == 0) {
-    // Define next KF group and assign bits to it
-    vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
-    find_next_key_frame(cpi, &this_frame_copy);
-  }
-
-  // Is this a GF / ARF? (Note that a KF is always also a GF.)
-  if (cpi->frames_till_gf_update_due == 0) {
-    // Define next gf group and assign bits to it
-    vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
-    define_gf_group(cpi, &this_frame_copy);
-
-    // If we are going to code an altref frame at the end of the group
-    // and the current frame is not a key frame:
-    // if the previous group used an arf, this frame has already benefited
-    // from that arf boost and should not be given extra bits; if the
-    // previous group was NOT coded using an arf we may want to apply some
-    // boost to this GF as well.
-    if (cpi->source_alt_ref_pending && (cpi->common.frame_type != KEY_FRAME)) {
-      // Assign a standard frame's worth of bits from those allocated to the GF group
-      int bak = cpi->per_frame_bandwidth;
-      vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
-      assign_std_frame_bits(cpi, &this_frame_copy);
-      cpi->per_frame_bandwidth = bak;
-    }
-  }
-
-  // Otherwise this is an ordinary frame
-  else {
-    // Assign bits from those allocated to the GF group
-    vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
-    assign_std_frame_bits(cpi, &this_frame_copy);
-  }
-
-  // Keep a globally available copy of this and the next frame's iiratio.
-  cpi->twopass.this_iiratio = this_frame_intra_error /
-                              DOUBLE_DIVIDE_CHECK(this_frame_coded_error);
-  {
-    FIRSTPASS_STATS next_frame;
-    if (lookup_next_frame_stats(cpi, &next_frame) != EOF) {
-      cpi->twopass.next_iiratio = next_frame.intra_error /
-                                  DOUBLE_DIVIDE_CHECK(next_frame.coded_error);
-    }
-  }
-
-  // Set nominal per second bandwidth for this frame
-  cpi->target_bandwidth = cpi->per_frame_bandwidth * cpi->output_frame_rate;
-  if (cpi->target_bandwidth < 0)
-    cpi->target_bandwidth = 0;
-
-
-  // Account for mv, mode and other overheads.
-  overhead_bits = estimate_modemvcost(
-                    cpi, cpi->twopass.total_left_stats);
-
-  // Special case code for first frame.
-  if (cpi->common.current_video_frame == 0) {
-    cpi->twopass.est_max_qcorrection_factor = 1.0;
-
-    // Set a cq_level in constrained quality mode.
-    if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
-      int est_cq;
-
-      est_cq =
-        estimate_cq(cpi,
-                    cpi->twopass.total_left_stats,
-                    (int)(cpi->twopass.bits_left / frames_left),
-                    overhead_bits);
-
-      cpi->cq_target_quality = cpi->oxcf.cq_level;
-      if (est_cq > cpi->cq_target_quality)
-        cpi->cq_target_quality = est_cq;
-    }
-
-    // guess at maxq needed in 2nd pass
-    cpi->twopass.maxq_max_limit = cpi->worst_quality;
-    cpi->twopass.maxq_min_limit = cpi->best_quality;
-
-    tmp_q = estimate_max_q(
-              cpi,
-              cpi->twopass.total_left_stats,
-              (int)(cpi->twopass.bits_left / frames_left),
-              overhead_bits);
-
-    cpi->active_worst_quality         = tmp_q;
-    cpi->ni_av_qi                     = tmp_q;
-    cpi->avg_q                        = vp9_convert_qindex_to_q(tmp_q);
-
-    // Limit the maxq value returned subsequently.
-    // This increases the risk of overspend or underspend if the initial
-    // estimate for the clip is bad, but helps prevent excessive
-    // variation in Q, especially near the end of a clip
-    // where for example a small overspend may cause Q to crash
-    adjust_maxq_qrange(cpi);
-  }
-
-  // The last few frames of a clip almost always have too few or too many
-  // bits and, for the sake of overly exact rate control, we don't want to
-  // make radical adjustments to the allowed quantizer range just to use up
-  // a few surplus bits or get beneath the target rate.
-  else if ((cpi->common.current_video_frame <
-            (((unsigned int)cpi->twopass.total_stats->count * 255) >> 8)) &&
-           ((cpi->common.current_video_frame + cpi->baseline_gf_interval) <
-            (unsigned int)cpi->twopass.total_stats->count)) {
-    if (frames_left < 1)
-      frames_left = 1;
-
-    tmp_q = estimate_max_q(
-              cpi,
-              cpi->twopass.total_left_stats,
-              (int)(cpi->twopass.bits_left / frames_left),
-              overhead_bits);
-
-    // Make a damped adjustment to active max Q
-    cpi->active_worst_quality =
-      adjust_active_maxq(cpi->active_worst_quality, tmp_q);
-  }
-
-  cpi->twopass.frames_to_key--;
-
-  // Update the total stats remaining structure.
-  subtract_stats(cpi->twopass.total_left_stats, &this_frame);
-}
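
An aside for readers tracing the rate-control logic: the "iiratio" kept above is simply the first-pass intra error divided by the coded (inter) error, with the denominator guarded against zero. A minimal standalone sketch, where divide_check() is a hypothetical stand-in for DOUBLE_DIVIDE_CHECK:

    #include <stdio.h>

    /* Hypothetical stand-in for DOUBLE_DIVIDE_CHECK: nudge the
     * denominator away from zero before dividing. */
    static double divide_check(double x) {
      return x < 0.0 ? x - 0.000001 : x + 0.000001;
    }

    int main(void) {
      double intra_error = 5000.0;  /* first-pass intra coding cost */
      double coded_error = 1250.0;  /* first-pass inter coding cost */
      double iiratio = intra_error / divide_check(coded_error);

      /* iiratio ~= 4.0: intra coding is four times as expensive here,
       * so inter prediction is doing useful work on this frame. */
      printf("iiratio = %.2f\n", iiratio);
      return 0;
    }
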
-
-
-static BOOL test_candidate_kf(VP9_COMP *cpi,  FIRSTPASS_STATS *last_frame, FIRSTPASS_STATS *this_frame, FIRSTPASS_STATS *next_frame) {
-  BOOL is_viable_kf = FALSE;
-
-  // Does the frame satisfy the primary criteria of a key frame
-  //      If so, then examine how well it predicts subsequent frames
-  if ((this_frame->pcnt_second_ref < 0.10) &&
-      (next_frame->pcnt_second_ref < 0.10) &&
-      ((this_frame->pcnt_inter < 0.05) ||
-       (
-         ((this_frame->pcnt_inter - this_frame->pcnt_neutral) < .35) &&
-         ((this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < 2.5) &&
-         ((fabs(last_frame->coded_error - this_frame->coded_error) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > .40) ||
-          (fabs(last_frame->intra_error - this_frame->intra_error) / DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > .40) ||
-          ((next_frame->intra_error / DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) > 3.5)
-         )
-       )
-      )
-     ) {
-    int i;
-    FIRSTPASS_STATS *start_pos;
-
-    FIRSTPASS_STATS local_next_frame;
-
-    double boost_score = 0.0;
-    double old_boost_score = 0.0;
-    double decay_accumulator = 1.0;
-    double next_iiratio;
-
-    vpx_memcpy(&local_next_frame, next_frame, sizeof(*next_frame));
-
-    // Note the starting file position so we can reset to it
-    start_pos = cpi->twopass.stats_in;
-
-    // Examine how well the key frame predicts subsequent frames
-    for (i = 0; i < 16; i++) {
-      next_iiratio = (IIKFACTOR1 * local_next_frame.intra_error / DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error));
-
-      if (next_iiratio > RMAX)
-        next_iiratio = RMAX;
-
-      // Cumulative effect of decay in prediction quality
-      if (local_next_frame.pcnt_inter > 0.85)
-        decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter;
-      else
-        decay_accumulator = decay_accumulator * ((0.85 + local_next_frame.pcnt_inter) / 2.0);
-
-      // decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter;
-
-      // Keep a running total
-      boost_score += (decay_accumulator * next_iiratio);
-
-      // Test various breakout clauses
-      if ((local_next_frame.pcnt_inter < 0.05) ||
-          (next_iiratio < 1.5) ||
-          (((local_next_frame.pcnt_inter -
-             local_next_frame.pcnt_neutral) < 0.20) &&
-           (next_iiratio < 3.0)) ||
-          ((boost_score - old_boost_score) < 3.0) ||
-          (local_next_frame.intra_error < 200)
-         ) {
-        break;
-      }
-
-      old_boost_score = boost_score;
-
-      // Get the next frame details
-      if (EOF == input_stats(cpi, &local_next_frame))
-        break;
-    }
-
-    // If there is tolerable prediction for at least the next 3 frames,
-    // break out; else discard this potential key frame and move on.
-    if (boost_score > 30.0 && (i > 3))
-      is_viable_kf = TRUE;
-    else {
-      // Reset the file position
-      reset_fpf_position(cpi, start_pos);
-
-      is_viable_kf = FALSE;
-    }
-  }
-
-  return is_viable_kf;
-}
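
The candidate test above is easier to follow in isolation: each prospective key frame earns a boost from up to 16 subsequent frames, with each frame's intra/inter ratio weighted by an accumulating prediction-decay factor. A toy version with invented per-frame stats (the decay rule, the 3.0 breakout, and the boost > 30 / i > 3 viability test mirror the code above; the cap value and the inputs are made up):

    #include <stdio.h>

    int main(void) {
      /* Invented per-frame first-pass stats. */
      double pcnt_inter[16] = { 0.95, 0.93, 0.90, 0.88, 0.86, 0.84, 0.80, 0.75,
                                0.70, 0.65, 0.60, 0.55, 0.50, 0.45, 0.40, 0.35 };
      double iiratio[16]    = { 9.0, 8.5, 8.0, 7.5, 7.0, 6.5, 6.0, 5.5,
                                5.0, 4.5, 4.0, 3.5, 3.0, 2.5, 2.0, 1.5 };
      double decay = 1.0, boost = 0.0, old_boost = 0.0;
      int i;

      for (i = 0; i < 16; i++) {
        double r = iiratio[i] > 14.0 ? 14.0 : iiratio[i];  /* RMAX-style cap */

        /* Good prediction (high pcnt_inter) decays slowly. */
        decay *= pcnt_inter[i] > 0.85 ? pcnt_inter[i]
                                      : (0.85 + pcnt_inter[i]) / 2.0;
        boost += decay * r;

        if (boost - old_boost < 3.0)  /* marginal gain: break out */
          break;
        old_boost = boost;
      }
      printf("frames = %d, boost = %.1f, viable kf = %s\n",
             i, boost, (boost > 30.0 && i > 3) ? "yes" : "no");
      return 0;
    }
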
-static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
-  int i, j;
-  FIRSTPASS_STATS last_frame;
-  FIRSTPASS_STATS first_frame;
-  FIRSTPASS_STATS next_frame;
-  FIRSTPASS_STATS *start_position;
-
-  double decay_accumulator = 1.0;
-  double zero_motion_accumulator = 1.0;
-  double boost_score = 0;
-  double old_boost_score = 0.0;
-  double loop_decay_rate;
-
-  double kf_mod_err = 0.0;
-  double kf_group_err = 0.0;
-  double kf_group_intra_err = 0.0;
-  double kf_group_coded_err = 0.0;
-  double recent_loop_decay[8] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
-
-  vpx_memset(&next_frame, 0, sizeof(next_frame)); // assure clean
-
-  vp9_clear_system_state();  // __asm emms;
-  start_position = cpi->twopass.stats_in;
-
-  cpi->common.frame_type = KEY_FRAME;
-
-  // Is this a key frame forced by interval?
-  cpi->this_key_frame_forced = cpi->next_key_frame_forced;
-
-  // Clear the alt ref active flag as this can never be active on a key frame
-  cpi->source_alt_ref_active = FALSE;
-
-  // Kf is always a gf so clear frames till next gf counter
-  cpi->frames_till_gf_update_due = 0;
-
-  cpi->twopass.frames_to_key = 1;
-
-  // Take a copy of the initial frame details
-  vpx_memcpy(&first_frame, this_frame, sizeof(*this_frame));
-
-  cpi->twopass.kf_group_bits = 0;        // Total bits available to kf group
-  cpi->twopass.kf_group_error_left = 0;  // Group modified error score.
-
-  kf_mod_err = calculate_modified_err(cpi, this_frame);
-
-  // find the next keyframe
-  i = 0;
-  while (cpi->twopass.stats_in < cpi->twopass.stats_in_end) {
-    // Accumulate kf group error
-    kf_group_err += calculate_modified_err(cpi, this_frame);
-
-    // These figures keep intra and coded error counts for all frames including key frames in the group.
-    // The effect of the key frame itself can be subtracted out using the first_frame data collected above
-    kf_group_intra_err += this_frame->intra_error;
-    kf_group_coded_err += this_frame->coded_error;
-
-    // Load the next frame's stats.
-    vpx_memcpy(&last_frame, this_frame, sizeof(*this_frame));
-    input_stats(cpi, this_frame);
-
-    // Provided that we are not at the end of the file...
-    if (cpi->oxcf.auto_key
-        && lookup_next_frame_stats(cpi, &next_frame) != EOF) {
-      // Normal scene cut check
-      if (test_candidate_kf(cpi, &last_frame, this_frame, &next_frame)) {
-        break;
-      }
-
-      // How fast is prediction quality decaying
-      loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
-
-      // We want to know something about the recent past... rather than,
-      // as used elsewhere, where we are concerned with decay in prediction
-      // quality since the last GF or KF.
-      recent_loop_decay[i % 8] = loop_decay_rate;
-      decay_accumulator = 1.0;
-      for (j = 0; j < 8; j++) {
-        decay_accumulator = decay_accumulator * recent_loop_decay[j];
-      }
-
-      // Special check for a transition from high motion to a static scene.
-      if (detect_transition_to_still(cpi, i,
-                                     (cpi->key_frame_frequency - i),
-                                     loop_decay_rate,
-                                     decay_accumulator)) {
-        break;
-      }
-
-
-      // Step on to the next frame
-      cpi->twopass.frames_to_key++;
-
-      // If we don't have a real key frame within the next two
-      // key frame frequency intervals then break out of the loop.
-      if (cpi->twopass.frames_to_key >= 2 * (int)cpi->key_frame_frequency)
-        break;
-    } else
-      cpi->twopass.frames_to_key++;
-
-    i++;
-  }
-
-  // If there is a max kf interval set by the user we must obey it.
-  // We already break out of the loop above at 2x max.
-  // This code centers the extra kf if the actual natural
-  // interval is between 1x and 2x.
-  if (cpi->oxcf.auto_key
-      && cpi->twopass.frames_to_key > (int)cpi->key_frame_frequency) {
-    FIRSTPASS_STATS *current_pos = cpi->twopass.stats_in;
-    FIRSTPASS_STATS tmp_frame;
-
-    cpi->twopass.frames_to_key /= 2;
-
-    // Copy first frame details
-    vpx_memcpy(&tmp_frame, &first_frame, sizeof(first_frame));
-
-    // Reset to the start of the group
-    reset_fpf_position(cpi, start_position);
-
-    kf_group_err = 0;
-    kf_group_intra_err = 0;
-    kf_group_coded_err = 0;
-
-    // Rescan to get the correct error data for the forced kf group
-    for (i = 0; i < cpi->twopass.frames_to_key; i++) {
-      // Accumulate kf group errors
-      kf_group_err += calculate_modified_err(cpi, &tmp_frame);
-      kf_group_intra_err += tmp_frame.intra_error;
-      kf_group_coded_err += tmp_frame.coded_error;
-
-      // Load the next frame's stats.
-      input_stats(cpi, &tmp_frame);
-    }
-
-    // Reset to the start of the group
-    reset_fpf_position(cpi, current_pos);
-
-    cpi->next_key_frame_forced = TRUE;
-  } else
-    cpi->next_key_frame_forced = FALSE;
-
-  // Special case for the last frame of the file
-  if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end) {
-    // Accumulate kf group error
-    kf_group_err += calculate_modified_err(cpi, this_frame);
-
-    // These figures keep intra and coded error counts for all frames including key frames in the group.
-    // The effect of the key frame itself can be subtracted out using the first_frame data collected above
-    kf_group_intra_err += this_frame->intra_error;
-    kf_group_coded_err += this_frame->coded_error;
-  }
-
-  // Calculate the number of bits that should be assigned to the kf group.
-  if ((cpi->twopass.bits_left > 0) && (cpi->twopass.modified_error_left > 0.0)) {
-    // Max for a single normal frame (not key frame)
-    int max_bits = frame_max_bits(cpi);
-
-    // Maximum bits for the kf group
-    int64_t max_grp_bits;
-
-    // Default allocation based on bits left and relative
-    // complexity of the section
-    cpi->twopass.kf_group_bits = (int64_t)(cpi->twopass.bits_left *
-                                           (kf_group_err /
-                                            cpi->twopass.modified_error_left));
-
-    // Clip based on maximum per frame rate defined by the user.
-    max_grp_bits = (int64_t)max_bits * (int64_t)cpi->twopass.frames_to_key;
-    if (cpi->twopass.kf_group_bits > max_grp_bits)
-      cpi->twopass.kf_group_bits = max_grp_bits;
-  } else
-    cpi->twopass.kf_group_bits = 0;
-
-  // Reset the first pass file position
-  reset_fpf_position(cpi, start_position);
-
-  // Determine how big to make this key frame based on how well the
-  // subsequent frames use inter blocks.
-  decay_accumulator = 1.0;
-  boost_score = 0.0;
-  loop_decay_rate = 1.00;       // Starting decay rate
-
-  for (i = 0; i < cpi->twopass.frames_to_key; i++) {
-    double r;
-
-    if (EOF == input_stats(cpi, &next_frame))
-      break;
-
-    if (next_frame.intra_error > cpi->twopass.kf_intra_err_min)
-      r = (IIKFACTOR2 * next_frame.intra_error /
-           DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
-    else
-      r = (IIKFACTOR2 * cpi->twopass.kf_intra_err_min /
-           DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
-
-    if (r > RMAX)
-      r = RMAX;
-
-    // Monitor for static sections.
-    if ((next_frame.pcnt_inter - next_frame.pcnt_motion) <
-        zero_motion_accumulator) {
-      zero_motion_accumulator =
-        (next_frame.pcnt_inter - next_frame.pcnt_motion);
-    }
-
-    // How fast is prediction quality decaying
-    if (!detect_flash(cpi, 0)) {
-      loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
-      decay_accumulator = decay_accumulator * loop_decay_rate;
-      decay_accumulator = decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
-    }
-
-    boost_score += (decay_accumulator * r);
-
-    if ((i > MIN_GF_INTERVAL) &&
-        ((boost_score - old_boost_score) < 6.25)) {
-      break;
-    }
-
-    old_boost_score = boost_score;
-  }
-
-  {
-    FIRSTPASS_STATS sectionstats;
-
-    zero_stats(&sectionstats);
-    reset_fpf_position(cpi, start_position);
-
-    for (i = 0; i < cpi->twopass.frames_to_key; i++) {
-      input_stats(cpi, &next_frame);
-      accumulate_stats(&sectionstats, &next_frame);
-    }
-
-    avg_stats(&sectionstats);
-
-    cpi->twopass.section_intra_rating =
-      sectionstats.intra_error
-      / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
-  }
-
-  // Reset the first pass file position
-  reset_fpf_position(cpi, start_position);
-
-  // Work out how many bits to allocate for the key frame itself
-  if (1) {
-    int kf_boost = boost_score;
-    int allocation_chunks;
-    int alt_kf_bits;
-
-    if (kf_boost < 300) {
-      kf_boost += (cpi->twopass.frames_to_key * 3);
-      if (kf_boost > 300)
-        kf_boost = 300;
-    }
-
-    if (kf_boost < 250)                                                      // Min KF boost
-      kf_boost = 250;
-
-    // Make a note of baseline boost and the zero motion
-    // accumulator value for use elsewhere.
-    cpi->kf_boost = kf_boost;
-    cpi->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0);
-
-    // We do three calculations for kf size.
-    // The first is based on the error score for the whole kf group.
-    // The second (optionally) on the key frame's own error if this is
-    // smaller than the average for the group.
-    // The final one ensures that the frame receives at least the
-    // allocation it would have received based on its own error score vs
-    // the error score remaining.
-    // Special case if the sequence appears almost totally static:
-    // in this case we want to spend almost all of the bits on the
-    // key frame.
-    // cpi->twopass.frames_to_key-1 because the key frame itself is taken
-    // care of by kf_boost.
-    if (zero_motion_accumulator >= 0.99) {
-      allocation_chunks =
-        ((cpi->twopass.frames_to_key - 1) * 10) + kf_boost;
-    } else {
-      allocation_chunks =
-        ((cpi->twopass.frames_to_key - 1) * 100) + kf_boost;
-    }
-
-    // Prevent overflow
-    if (kf_boost > 1028) {
-      int divisor = kf_boost >> 10;
-      kf_boost /= divisor;
-      allocation_chunks /= divisor;
-    }
-
-    cpi->twopass.kf_group_bits = (cpi->twopass.kf_group_bits < 0) ? 0 : cpi->twopass.kf_group_bits;
-
-    // Calculate the number of bits to be spent on the key frame
-    cpi->twopass.kf_bits  = (int)((double)kf_boost * ((double)cpi->twopass.kf_group_bits / (double)allocation_chunks));
-
-    // If the key frame is actually easier than the average for the
-    // kf group (which does sometimes happen, e.g. a blank intro frame),
-    // then use an alternate calculation based on the kf error score,
-    // which should give a smaller key frame.
-    if (kf_mod_err < kf_group_err / cpi->twopass.frames_to_key) {
-      double  alt_kf_grp_bits =
-        ((double)cpi->twopass.bits_left *
-         (kf_mod_err * (double)cpi->twopass.frames_to_key) /
-         DOUBLE_DIVIDE_CHECK(cpi->twopass.modified_error_left));
-
-      alt_kf_bits = (int)((double)kf_boost *
-                          (alt_kf_grp_bits / (double)allocation_chunks));
-
-      if (cpi->twopass.kf_bits > alt_kf_bits) {
-        cpi->twopass.kf_bits = alt_kf_bits;
-      }
-    }
-    // Else if it is much harder than other frames in the group make sure
-    // it at least receives an allocation in keeping with its relative
-    // error score
-    else {
-      alt_kf_bits =
-        (int)((double)cpi->twopass.bits_left *
-              (kf_mod_err /
-               DOUBLE_DIVIDE_CHECK(cpi->twopass.modified_error_left)));
-
-      if (alt_kf_bits > cpi->twopass.kf_bits) {
-        cpi->twopass.kf_bits = alt_kf_bits;
-      }
-    }
-
-    cpi->twopass.kf_group_bits -= cpi->twopass.kf_bits;
-    cpi->twopass.kf_bits += cpi->min_frame_bandwidth;                                          // Add in the minimum frame allowance
-
-    cpi->per_frame_bandwidth = cpi->twopass.kf_bits;                                           // Per frame bit target for this frame
-    cpi->target_bandwidth = cpi->twopass.kf_bits * cpi->output_frame_rate;                      // Convert to a per second bitrate
-  }
-
-  // Note the total error score of the kf group minus the key frame itself
-  cpi->twopass.kf_group_error_left = (int)(kf_group_err - kf_mod_err);
-
-  // Adjust the count of total modified error left.
-  // The count of bits left is adjusted elsewhere based on real coded frame sizes
-  cpi->twopass.modified_error_left -= kf_group_err;
-}
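
The bit-allocation arithmetic above rewards a worked example. Assume (invented numbers) a 50-frame key frame group, kf_boost = 300, and kf_group_bits = 2,000,000. In the non-static branch each ordinary frame counts as 100 chunks and the key frame as kf_boost chunks, so the key frame receives 300/5200 of the group budget:

    #include <stdio.h>
    #include <stdint.h>

    int main(void) {
      int frames_to_key = 50;
      int kf_boost = 300;                 /* each normal frame counts as 100 */
      int64_t kf_group_bits = 2000000;

      int allocation_chunks = (frames_to_key - 1) * 100 + kf_boost;  /* 5200 */
      int kf_bits = (int)((double)kf_boost *
                          ((double)kf_group_bits / allocation_chunks));

      /* 300 / 5200 of 2,000,000 bits ~= 115,384 bits for the key frame. */
      printf("allocation_chunks = %d, kf_bits = %d\n",
             allocation_chunks, kf_bits);
      return 0;
    }
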
--- a/vp8/encoder/firstpass.h
+++ /dev/null
@@ -1,23 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#if !defined __INC_FIRSTPASS_H
-#define      __INC_FIRSTPASS_H
-
-extern void vp9_init_first_pass(VP9_COMP *cpi);
-extern void vp9_first_pass(VP9_COMP *cpi);
-extern void vp9_end_first_pass(VP9_COMP *cpi);
-
-extern void vp9_init_second_pass(VP9_COMP *cpi);
-extern void vp9_second_pass(VP9_COMP *cpi);
-extern void vp9_end_second_pass(VP9_COMP *cpi);
-
-#endif
--- a/vp8/encoder/generic/csystemdependent.c
+++ /dev/null
@@ -1,48 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "vp8/encoder/variance.h"
-#include "vp8/encoder/onyx_int.h"
-
-
-void vp9_arch_x86_encoder_init(VP9_COMP *cpi);
-void vp9_arch_arm_encoder_init(VP9_COMP *cpi);
-
-void (*vp9_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc,
-                                        YV12_BUFFER_CONFIG *dst_ybc,
-                                        int fraction);
-extern void vp9_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc,
-                                        YV12_BUFFER_CONFIG *dst_ybc,
-                                        int fraction);
-
-void vp9_cmachine_specific_config(VP9_COMP *cpi) {
-#if CONFIG_RUNTIME_CPU_DETECT
-  cpi->rtcd.common                    = &cpi->common.rtcd;
-
-  cpi->rtcd.search.full_search             = vp9_full_search_sad;
-  cpi->rtcd.search.refining_search         = vp9_refining_search_sad;
-  cpi->rtcd.search.diamond_search          = vp9_diamond_search_sad;
-  cpi->rtcd.temporal.apply                 = vp9_temporal_filter_apply_c;
-#endif
-
-  vp9_yv12_copy_partial_frame_ptr = vp9_yv12_copy_partial_frame;
-
-#if ARCH_X86 || ARCH_X86_64
-  vp9_arch_x86_encoder_init(cpi);
-#endif
-
-#if ARCH_ARM
-  vp9_arch_arm_encoder_init(cpi);
-#endif
-
-
-}
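
vp9_cmachine_specific_config() is the usual runtime-dispatch pattern: install portable C defaults, then let the architecture init routines overwrite the function pointers when CPU detection permits. A stripped-down sketch of the same idea (the type and function names below are illustrative, not the library's):

    #include <stdio.h>

    typedef int (*sad_fn)(const unsigned char *a, const unsigned char *b, int n);

    static int sad_c(const unsigned char *a, const unsigned char *b, int n) {
      int i, sum = 0;
      for (i = 0; i < n; i++)
        sum += a[i] > b[i] ? a[i] - b[i] : b[i] - a[i];
      return sum;
    }

    /* In the real code, an arch init (e.g. vp9_arch_x86_encoder_init)
     * would swap in a SIMD version when CPU detection allows it. */
    static sad_fn sad = sad_c;

    int main(void) {
      unsigned char a[4] = { 1, 2, 3, 4 }, b[4] = { 4, 3, 2, 1 };
      printf("sad = %d\n", sad(a, b, 4));  /* 3 + 1 + 1 + 3 = 8 */
      return 0;
    }
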
--- a/vp8/encoder/lookahead.c
+++ /dev/null
@@ -1,191 +1,0 @@
-/*
- *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-#include <assert.h>
-#include <stdlib.h>
-#include "vpx_config.h"
-#include "lookahead.h"
-#include "vp8/common/extend.h"
-
-#define MAX_LAG_BUFFERS 25
-
-struct lookahead_ctx {
-  unsigned int max_sz;         /* Absolute size of the queue */
-  unsigned int sz;             /* Number of buffers currently in the queue */
-  unsigned int read_idx;       /* Read index */
-  unsigned int write_idx;      /* Write index */
-  struct lookahead_entry *buf; /* Buffer list */
-};
-
-
-/* Return the buffer at the given absolute index and increment the index */
-static struct lookahead_entry *
-pop(struct lookahead_ctx *ctx,
-    unsigned int         *idx) {
-  unsigned int            index = *idx;
-  struct lookahead_entry *buf = ctx->buf + index;
-
-  assert(index < ctx->max_sz);
-  if (++index >= ctx->max_sz)
-    index -= ctx->max_sz;
-  *idx = index;
-  return buf;
-}
-
-
-void
-vp9_lookahead_destroy(struct lookahead_ctx *ctx) {
-  if (ctx) {
-    if (ctx->buf) {
-      int i;
-
-      for (i = 0; i < ctx->max_sz; i++)
-        vp8_yv12_de_alloc_frame_buffer(&ctx->buf[i].img);
-      free(ctx->buf);
-    }
-    free(ctx);
-  }
-}
-
-
-struct lookahead_ctx *
-vp9_lookahead_init(unsigned int width,
-                   unsigned int height,
-                   unsigned int depth) {
-  struct lookahead_ctx *ctx = NULL;
-  int i;
-
-  /* Clamp the lookahead queue depth */
-  if (depth < 1)
-    depth = 1;
-  else if (depth > MAX_LAG_BUFFERS)
-    depth = MAX_LAG_BUFFERS;
-
-  /* Align the buffer dimensions */
-  width = (width + 15) &~15;
-  height = (height + 15) &~15;
-
-  /* Allocate the lookahead structures */
-  ctx = calloc(1, sizeof(*ctx));
-  if (ctx) {
-    ctx->max_sz = depth;
-    ctx->buf = calloc(depth, sizeof(*ctx->buf));
-    if (!ctx->buf)
-      goto bail;
-    for (i = 0; i < depth; i++)
-      if (vp8_yv12_alloc_frame_buffer(&ctx->buf[i].img,
-                                      width, height, VP8BORDERINPIXELS))
-        goto bail;
-  }
-  return ctx;
-bail:
-  vp9_lookahead_destroy(ctx);
-  return NULL;
-}
-
-
-int
-vp9_lookahead_push(struct lookahead_ctx *ctx,
-                   YV12_BUFFER_CONFIG   *src,
-                   int64_t               ts_start,
-                   int64_t               ts_end,
-                   unsigned int          flags,
-                   unsigned char        *active_map) {
-  struct lookahead_entry *buf;
-  int row, col, active_end;
-  int mb_rows = (src->y_height + 15) >> 4;
-  int mb_cols = (src->y_width + 15) >> 4;
-
-  if (ctx->sz + 1 > ctx->max_sz)
-    return 1;
-  ctx->sz++;
-  buf = pop(ctx, &ctx->write_idx);
-
-  // Only do this partial copy if the following conditions are all met:
-  // 1. The lookahead queue has a size of 1.
-  // 2. An active map is provided.
-  // 3. This is not a key frame, golden frame, or altref frame.
-  if (ctx->max_sz == 1 && active_map && !flags) {
-    for (row = 0; row < mb_rows; ++row) {
-      col = 0;
-
-      while (1) {
-        // Find the first active macroblock in this row.
-        for (; col < mb_cols; ++col) {
-          if (active_map[col])
-            break;
-        }
-
-        // No more active macroblocks in this row.
-        if (col == mb_cols)
-          break;
-
-        // Find the end of active region in this row.
-        active_end = col;
-
-        for (; active_end < mb_cols; ++active_end) {
-          if (!active_map[active_end])
-            break;
-        }
-
-        // Only copy this active region.
-        vp9_copy_and_extend_frame_with_rect(src, &buf->img,
-                                            row << 4,
-                                            col << 4, 16,
-                                            (active_end - col) << 4);
-
-        // Start again from the end of this active region.
-        col = active_end;
-      }
-
-      active_map += mb_cols;
-    }
-  } else {
-    vp9_copy_and_extend_frame(src, &buf->img);
-  }
-  buf->ts_start = ts_start;
-  buf->ts_end = ts_end;
-  buf->flags = flags;
-  return 0;
-}
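
The active-map branch above is a run-length scan over each macroblock row: find a maximal run of active macroblocks, copy just that rectangle, and continue from its end. The same scan in isolation, printing the rectangles it would copy (the map contents are invented):

    #include <stdio.h>

    int main(void) {
      /* One invented macroblock row: 1 = active, 0 = skipped. */
      unsigned char active_map[10] = { 0, 1, 1, 1, 0, 0, 1, 1, 0, 1 };
      int mb_cols = 10, col = 0;

      while (1) {
        int active_end;

        /* Find the first active macroblock from 'col' onward. */
        for (; col < mb_cols; ++col)
          if (active_map[col])
            break;
        if (col == mb_cols)
          break;  /* no more active macroblocks in this row */

        /* Find the end of this active run. */
        for (active_end = col; active_end < mb_cols; ++active_end)
          if (!active_map[active_end])
            break;

        /* The real code copies pixels (col << 4) .. (active_end << 4). */
        printf("copy mb columns [%d, %d)\n", col, active_end);
        col = active_end;
      }
      return 0;
    }
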
-
-
-struct lookahead_entry *
-vp9_lookahead_pop(struct lookahead_ctx *ctx,
-                  int                   drain) {
-  struct lookahead_entry *buf = NULL;
-
-  if (ctx->sz && (drain || ctx->sz == ctx->max_sz)) {
-    buf = pop(ctx, &ctx->read_idx);
-    ctx->sz--;
-  }
-  return buf;
-}
-
-
-struct lookahead_entry *
-vp9_lookahead_peek(struct lookahead_ctx *ctx,
-                   int                   index) {
-  struct lookahead_entry *buf = NULL;
-
-  assert(index < ctx->max_sz);
-  if (index < ctx->sz) {
-    index += ctx->read_idx;
-    if (index >= ctx->max_sz)
-      index -= ctx->max_sz;
-    buf = ctx->buf + index;
-  }
-  return buf;
-}
-
-
-unsigned int
-vp9_lookahead_depth(struct lookahead_ctx *ctx) {
-  return ctx->sz;
-}
--- a/vp8/encoder/lookahead.h
+++ /dev/null
@@ -1,105 +1,0 @@
-/*
- *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-#ifndef LOOKAHEAD_H
-#define LOOKAHEAD_H
-#include "vpx_scale/yv12config.h"
-#include "vpx/vpx_integer.h"
-
-struct lookahead_entry {
-  YV12_BUFFER_CONFIG  img;
-  int64_t             ts_start;
-  int64_t             ts_end;
-  unsigned int        flags;
-};
-
-
-struct lookahead_ctx;
-
-/**\brief Initializes the lookahead stage
- *
- * The lookahead stage is a queue of frame buffers on which some analysis
- * may be done when buffers are enqueued.
- *
- *
- */
-struct lookahead_ctx *vp9_lookahead_init(unsigned int width,
-                                         unsigned int height,
-                                         unsigned int depth
-                                        );
-
-
-/**\brief Destroys the lookahead stage
- *
- */
-void vp9_lookahead_destroy(struct lookahead_ctx *ctx);
-
-
-/**\brief Enqueue a source buffer
- *
- * This function will copy the source image into a new framebuffer with
- * the expected stride/border.
- *
- * If active_map is non-NULL and there is only one frame in the queue, then copy
- * only active macroblocks.
- *
- * \param[in] ctx         Pointer to the lookahead context
- * \param[in] src         Pointer to the image to enqueue
- * \param[in] ts_start    Timestamp for the start of this frame
- * \param[in] ts_end      Timestamp for the end of this frame
- * \param[in] flags       Flags set on this frame
- * \param[in] active_map  Map that specifies which macroblock is active
- */
-int
-vp9_lookahead_push(struct lookahead_ctx *ctx,
-                   YV12_BUFFER_CONFIG   *src,
-                   int64_t               ts_start,
-                   int64_t               ts_end,
-                   unsigned int          flags,
-                   unsigned char        *active_map);
-
-
-/**\brief Get the next source buffer to encode
- *
- *
- * \param[in] ctx       Pointer to the lookahead context
- * \param[in] drain     Flag indicating the buffer should be drained
- *                      (return a buffer regardless of the current queue depth)
- *
- * \retval NULL, if drain set and queue is empty
- * \retval NULL, if drain not set and queue not of the configured depth
- *
- */
-struct lookahead_entry *
-vp9_lookahead_pop(struct lookahead_ctx *ctx,
-                  int                   drain);
-
-
-/**\brief Get a future source buffer to encode
- *
- * \param[in] ctx       Pointer to the lookahead context
- * \param[in] index     Index of the frame to be returned, 0 == next frame
- *
- * \retval NULL, if no buffer exists at the specified index
- *
- */
-struct lookahead_entry *
-vp9_lookahead_peek(struct lookahead_ctx *ctx,
-                   int                   index);
-
-
-/**\brief Get the number of frames currently in the lookahead queue
- *
- * \param[in] ctx       Pointer to the lookahead context
- */
-unsigned int
-vp9_lookahead_depth(struct lookahead_ctx *ctx);
-
-
-#endif
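
Putting the documented contract together: vp9_lookahead_push() fails once max_sz frames are queued, and vp9_lookahead_pop() returns NULL until the queue is full unless drain is set, which is how the encoder keeps a fixed lag of future frames. A toy model of that gating (plain counters stand in for the real context and buffers):

    #include <stdio.h>

    int main(void) {
      unsigned int sz = 0, max_sz = 3;  /* configured lookahead depth */
      int frame;

      for (frame = 0; frame < 5; frame++) {
        sz++;                            /* vp9_lookahead_push() */
        if (sz == max_sz) {              /* vp9_lookahead_pop(ctx, 0) */
          printf("encode frame %d (lag %u)\n",
                 frame - (int)max_sz + 1, max_sz - 1);
          sz--;
        }
      }
      while (sz--)                       /* vp9_lookahead_pop(ctx, 1) */
        printf("drain one remaining frame\n");
      return 0;
    }
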
--- a/vp8/encoder/mbgraph.c
+++ /dev/null
@@ -1,480 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <limits.h>
-#include <vp8/encoder/encodeintra.h>
-#include <vp8/encoder/rdopt.h>
-#include <vp8/common/setupintrarecon.h>
-#include <vp8/common/blockd.h>
-#include <vp8/common/reconinter.h>
-#include <vp8/common/systemdependent.h>
-#include <vpx_mem/vpx_mem.h>
-#include <vp8/encoder/segmentation.h>
-
-static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
-                                              int_mv *ref_mv,
-                                              int_mv *dst_mv) {
-  MACROBLOCK   *const x  = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  BLOCK *b  = &x->block[0];
-  BLOCKD *d = &xd->block[0];
-  vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
-  unsigned int best_err;
-  int step_param, further_steps;
-
-  int tmp_col_min = x->mv_col_min;
-  int tmp_col_max = x->mv_col_max;
-  int tmp_row_min = x->mv_row_min;
-  int tmp_row_max = x->mv_row_max;
-  int_mv ref_full;
-
-  // Further step/diamond searches as necessary
-  if (cpi->Speed < 8) {
-    step_param = cpi->sf.first_step + ((cpi->Speed > 5) ? 1 : 0);
-    further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
-  } else {
-    step_param = cpi->sf.first_step + 2;
-    further_steps = 0;
-  }
-
-  vp9_clamp_mv_min_max(x, ref_mv);
-
-  ref_full.as_mv.col = ref_mv->as_mv.col >> 3;
-  ref_full.as_mv.row = ref_mv->as_mv.row >> 3;
-
-  /*cpi->sf.search_method == HEX*/
-  best_err = vp9_hex_search(
-      x, b, d,
-      &ref_full, dst_mv,
-      step_param,
-      x->errorperbit,
-      &v_fn_ptr,
-      NULLMVCOST,
-      NULLMVCOST,
-      ref_mv);
-
-  // Try sub-pixel MC
-  // if (bestsme > error_thresh && bestsme < INT_MAX)
-  {
-    int distortion;
-    unsigned int sse;
-    best_err = cpi->find_fractional_mv_step(
-        x, b, d,
-        dst_mv, ref_mv,
-        x->errorperbit, &v_fn_ptr,
-        NULLMVCOST,
-        & distortion, &sse);
-  }
-
-#if CONFIG_PRED_FILTER
-  // Disable the prediction filter
-  xd->mode_info_context->mbmi.pred_filter_enabled = 0;
-#endif
-
-  vp9_set_mbmode_and_mvs(x, NEWMV, dst_mv);
-  vp9_build_1st_inter16x16_predictors_mby(xd, xd->predictor, 16, 0);
-  best_err = vp9_sad16x16(xd->dst.y_buffer, xd->dst.y_stride,
-                          xd->predictor, 16, INT_MAX);
-
-  /* restore UMV window */
-  x->mv_col_min = tmp_col_min;
-  x->mv_col_max = tmp_col_max;
-  x->mv_row_min = tmp_row_min;
-  x->mv_row_max = tmp_row_max;
-
-  return best_err;
-}
-
-static int do_16x16_motion_search
-(
-  VP9_COMP *cpi,
-  int_mv *ref_mv,
-  int_mv *dst_mv,
-  YV12_BUFFER_CONFIG *buf,
-  int buf_mb_y_offset,
-  YV12_BUFFER_CONFIG *ref,
-  int mb_y_offset
-) {
-  MACROBLOCK   *const x  = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  unsigned int err, tmp_err;
-  int_mv tmp_mv;
-  int n;
-
-  for (n = 0; n < 16; n++) {
-    BLOCKD *d = &xd->block[n];
-    BLOCK *b  = &x->block[n];
-
-    b->base_src   = &buf->y_buffer;
-    b->src_stride = buf->y_stride;
-    b->src        = buf->y_stride * (n & 12) + (n & 3) * 4 + buf_mb_y_offset;
-
-    d->base_pre   = &ref->y_buffer;
-    d->pre_stride = ref->y_stride;
-    d->pre        = ref->y_stride * (n & 12) + (n & 3) * 4 + mb_y_offset;
-  }
-
-  // Try zero MV first
-  // FIXME should really use something like near/nearest MV and/or MV prediction
-  xd->pre.y_buffer = ref->y_buffer + mb_y_offset;
-  xd->pre.y_stride = ref->y_stride;
-  err = vp9_sad16x16(ref->y_buffer + mb_y_offset, ref->y_stride,
-                     xd->dst.y_buffer, xd->dst.y_stride, INT_MAX);
-  dst_mv->as_int = 0;
-
-  // Test last reference frame using the previous best mv as the
-  // starting point (best reference) for the search
-  tmp_err = do_16x16_motion_iteration(cpi, ref_mv, &tmp_mv);
-  if (tmp_err < err) {
-    err            = tmp_err;
-    dst_mv->as_int = tmp_mv.as_int;
-  }
-
-  // If the current best reference mv is not centred on 0,0 then do a 0,0 based search as well
-  if (ref_mv->as_int) {
-    int tmp_err;
-    int_mv zero_ref_mv, tmp_mv;
-
-    zero_ref_mv.as_int = 0;
-    tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv, &tmp_mv);
-    if (tmp_err < err) {
-      dst_mv->as_int = tmp_mv.as_int;
-      err = tmp_err;
-    }
-  }
-
-  return err;
-}
-
-static int do_16x16_zerozero_search
-(
-  VP9_COMP *cpi,
-  int_mv *dst_mv,
-  YV12_BUFFER_CONFIG *buf,
-  int buf_mb_y_offset,
-  YV12_BUFFER_CONFIG *ref,
-  int mb_y_offset
-) {
-  MACROBLOCK   *const x  = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  unsigned int err;
-  int n;
-
-  for (n = 0; n < 16; n++) {
-    BLOCKD *d = &xd->block[n];
-    BLOCK *b  = &x->block[n];
-
-    b->base_src   = &buf->y_buffer;
-    b->src_stride = buf->y_stride;
-    b->src        = buf->y_stride * (n & 12) + (n & 3) * 4 + buf_mb_y_offset;
-
-    d->base_pre   = &ref->y_buffer;
-    d->pre_stride = ref->y_stride;
-    d->pre        = ref->y_stride * (n & 12) + (n & 3) * 4 + mb_y_offset;
-  }
-
-  // Try zero MV first
-  // FIXME should really use something like near/nearest MV and/or MV prediction
-  xd->pre.y_buffer = ref->y_buffer + mb_y_offset;
-  xd->pre.y_stride = ref->y_stride;
-  // VARIANCE_INVOKE(&cpi->rtcd.variance, satd16x16)
-  err = vp9_sad16x16(ref->y_buffer + mb_y_offset, ref->y_stride,
-                     xd->dst.y_buffer, xd->dst.y_stride, INT_MAX);
-
-  dst_mv->as_int = 0;
-
-  return err;
-}
-static int find_best_16x16_intra
-(
-  VP9_COMP *cpi,
-  YV12_BUFFER_CONFIG *buf,
-  int mb_y_offset,
-  MB_PREDICTION_MODE *pbest_mode
-) {
-  MACROBLOCK   *const x  = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_PREDICTION_MODE best_mode = -1, mode;
-  int best_err = INT_MAX;
-
-  // calculate SAD for each intra prediction mode;
-  // we're intentionally not doing 4x4, we just want a rough estimate
-  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
-    unsigned int err;
-
-    xd->mode_info_context->mbmi.mode = mode;
-    vp9_build_intra_predictors_mby(xd);
-    err = vp9_sad16x16(xd->predictor, 16, buf->y_buffer + mb_y_offset,
-                       buf->y_stride, best_err);
-    // find best
-    if (err < best_err) {
-      best_err  = err;
-      best_mode = mode;
-    }
-  }
-
-  if (pbest_mode)
-    *pbest_mode = best_mode;
-
-  return best_err;
-}
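
One detail above: the running best_err is passed into vp9_sad16x16() as its final argument, which lets a SAD implementation give up early once the partial sum already exceeds the best score so far. A standalone sketch of that early-exit idea (the row-granularity cutoff is illustrative; real implementations pick their own check points):

    #include <limits.h>
    #include <stdio.h>

    /* SAD with an early-exit threshold: stop as soon as the partial
     * sum can no longer beat 'best_so_far'. */
    static unsigned int sad_rows(const unsigned char *a, const unsigned char *b,
                                 int rows, int cols, unsigned int best_so_far) {
      unsigned int sad = 0;
      int r, c;
      for (r = 0; r < rows; r++) {
        for (c = 0; c < cols; c++) {
          int d = a[r * cols + c] - b[r * cols + c];
          sad += d < 0 ? -d : d;
        }
        if (sad >= best_so_far)
          return UINT_MAX;  /* cannot improve on the current best */
      }
      return sad;
    }

    int main(void) {
      unsigned char a[4] = { 10, 10, 10, 10 }, b[4] = { 0, 0, 10, 10 };
      printf("%u\n", sad_rows(a, b, 2, 2, 15));  /* row 0 sum 20 >= 15: exit */
      return 0;
    }
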
-
-static void update_mbgraph_mb_stats
-(
-  VP9_COMP *cpi,
-  MBGRAPH_MB_STATS *stats,
-  YV12_BUFFER_CONFIG *buf,
-  int mb_y_offset,
-  YV12_BUFFER_CONFIG *golden_ref,
-  int_mv *prev_golden_ref_mv,
-  int gld_y_offset,
-  YV12_BUFFER_CONFIG *alt_ref,
-  int_mv *prev_alt_ref_mv,
-  int arf_y_offset
-) {
-  MACROBLOCK   *const x  = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int intra_error;
-
-  // FIXME in practice we're completely ignoring chroma here
-  xd->dst.y_buffer = buf->y_buffer + mb_y_offset;
-
-  // do intra 16x16 prediction
-  intra_error = find_best_16x16_intra(cpi, buf, mb_y_offset, &stats->ref[INTRA_FRAME].m.mode);
-  if (intra_error <= 0)
-    intra_error = 1;
-  stats->ref[INTRA_FRAME].err = intra_error;
-
-  // Golden frame MV search, if it exists and is different than last frame
-  if (golden_ref) {
-    int g_motion_error = do_16x16_motion_search(cpi, prev_golden_ref_mv,
-                                                &stats->ref[GOLDEN_FRAME].m.mv,
-                                                buf, mb_y_offset,
-                                                golden_ref, gld_y_offset);
-    stats->ref[GOLDEN_FRAME].err = g_motion_error;
-  } else {
-    stats->ref[GOLDEN_FRAME].err = INT_MAX;
-    stats->ref[GOLDEN_FRAME].m.mv.as_int = 0;
-  }
-
-  // Alt-ref frame MV search, if it exists and is different than last/golden frame
-  if (alt_ref) {
-    // int a_motion_error = do_16x16_motion_search(cpi, prev_alt_ref_mv,
-    //                                            &stats->ref[ALTREF_FRAME].m.mv,
-    //                                            buf, mb_y_offset,
-    //                                            alt_ref, arf_y_offset);
-
-    int a_motion_error =
-      do_16x16_zerozero_search(cpi,
-                               &stats->ref[ALTREF_FRAME].m.mv,
-                               buf, mb_y_offset,
-                               alt_ref, arf_y_offset);
-
-    stats->ref[ALTREF_FRAME].err = a_motion_error;
-  } else {
-    stats->ref[ALTREF_FRAME].err = INT_MAX;
-    stats->ref[ALTREF_FRAME].m.mv.as_int = 0;
-  }
-}
-
-static void update_mbgraph_frame_stats
-(
-  VP9_COMP *cpi,
-  MBGRAPH_FRAME_STATS *stats,
-  YV12_BUFFER_CONFIG *buf,
-  YV12_BUFFER_CONFIG *golden_ref,
-  YV12_BUFFER_CONFIG *alt_ref
-) {
-  MACROBLOCK   *const x  = &cpi->mb;
-  VP9_COMMON   *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int mb_col, mb_row, offset = 0;
-  int mb_y_offset = 0, arf_y_offset = 0, gld_y_offset = 0;
-  int_mv arf_top_mv, gld_top_mv;
-  MODE_INFO mi_local;
-
-  // Set up limit values for motion vectors to prevent them extending outside the UMV borders
-  arf_top_mv.as_int = 0;
-  gld_top_mv.as_int = 0;
-  x->mv_row_min     = -(VP8BORDERINPIXELS - 16 - INTERP_EXTEND);
-  x->mv_row_max     = (cm->mb_rows - 1) * 16 + VP8BORDERINPIXELS - 16 - INTERP_EXTEND;
-  xd->up_available  = 0;
-  xd->dst.y_stride  = buf->y_stride;
-  xd->pre.y_stride  = buf->y_stride;
-  xd->dst.uv_stride = buf->uv_stride;
-  xd->mode_info_context = &mi_local;
-
-  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
-    int_mv arf_left_mv, gld_left_mv;
-    int mb_y_in_offset  = mb_y_offset;
-    int arf_y_in_offset = arf_y_offset;
-    int gld_y_in_offset = gld_y_offset;
-
-    // Set up limit values for motion vectors to prevent them extending outside the UMV borders
-    arf_left_mv.as_int = arf_top_mv.as_int;
-    gld_left_mv.as_int = gld_top_mv.as_int;
-    x->mv_col_min      = -(VP8BORDERINPIXELS - 16 - INTERP_EXTEND);
-    x->mv_col_max      = (cm->mb_cols - 1) * 16 + VP8BORDERINPIXELS - 16 - INTERP_EXTEND;
-    xd->left_available = 0;
-
-    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
-      MBGRAPH_MB_STATS *mb_stats = &stats->mb_stats[offset + mb_col];
-
-      update_mbgraph_mb_stats(cpi, mb_stats, buf, mb_y_in_offset,
-                              golden_ref, &gld_left_mv, gld_y_in_offset,
-                              alt_ref,    &arf_left_mv, arf_y_in_offset);
-      arf_left_mv.as_int = mb_stats->ref[ALTREF_FRAME].m.mv.as_int;
-      gld_left_mv.as_int = mb_stats->ref[GOLDEN_FRAME].m.mv.as_int;
-      if (mb_col == 0) {
-        arf_top_mv.as_int = arf_left_mv.as_int;
-        gld_top_mv.as_int = gld_left_mv.as_int;
-      }
-      xd->left_available = 1;
-      mb_y_in_offset    += 16;
-      gld_y_in_offset   += 16;
-      arf_y_in_offset   += 16;
-      x->mv_col_min     -= 16;
-      x->mv_col_max     -= 16;
-    }
-    xd->up_available = 1;
-    mb_y_offset     += buf->y_stride * 16;
-    gld_y_offset    += golden_ref->y_stride * 16;
-    if (alt_ref)
-      arf_y_offset    += alt_ref->y_stride * 16;
-    x->mv_row_min   -= 16;
-    x->mv_row_max   -= 16;
-    offset          += cm->mb_cols;
-  }
-}
-
-// void separate_arf_mbs_byzz
-static void separate_arf_mbs(VP9_COMP *cpi) {
-  VP9_COMMON *const cm = &cpi->common;
-  int mb_col, mb_row, offset, i;
-  int ncnt[4];
-  int n_frames = cpi->mbgraph_n_frames;
-
-  int *arf_not_zz;
-
-  CHECK_MEM_ERROR(arf_not_zz,
-                  vpx_calloc(cm->mb_rows * cm->mb_cols * sizeof(*arf_not_zz), 1));
-
-  vpx_memset(arf_not_zz, 0, cm->mb_rows * cm->mb_cols * sizeof(*arf_not_zz));
-
-  // We are not interested in results beyond the alt ref itself.
-  if (n_frames > cpi->frames_till_gf_update_due)
-    n_frames = cpi->frames_till_gf_update_due;
-
-  // defer cost to reference frames
-  for (i = n_frames - 1; i >= 0; i--) {
-    MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
-
-    for (offset = 0, mb_row = 0; mb_row < cm->mb_rows;
-         offset += cm->mb_cols, mb_row++) {
-      for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
-        MBGRAPH_MB_STATS *mb_stats =
-          &frame_stats->mb_stats[offset + mb_col];
-
-        int altref_err = mb_stats->ref[ALTREF_FRAME].err;
-        int intra_err  = mb_stats->ref[INTRA_FRAME ].err;
-        int golden_err = mb_stats->ref[GOLDEN_FRAME].err;
-
-        // Test for altref vs intra and gf and that its mv was 0,0.
-        if ((altref_err > 1000) ||
-            (altref_err > intra_err) ||
-            (altref_err > golden_err)) {
-          arf_not_zz[offset + mb_col]++;
-        }
-      }
-    }
-  }
-
-  vpx_memset(ncnt, 0, sizeof(ncnt));
-  for (offset = 0, mb_row = 0; mb_row < cm->mb_rows;
-       offset += cm->mb_cols, mb_row++) {
-    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
-      // If any of the blocks in the sequence failed then the MB
-      // goes in segment 0
-      if (arf_not_zz[offset + mb_col]) {
-        ncnt[0]++;
-        cpi->segmentation_map[offset + mb_col] = 0;
-      } else {
-        ncnt[1]++;
-        cpi->segmentation_map[offset + mb_col] = 1;
-      }
-    }
-  }
-
-  // Only bother with segmentation if over 10% of the MBs are in the
-  // static segment.
-  // if ( ncnt[1] && (ncnt[0] / ncnt[1] < 10) )
-  if (1) {
-    // Note % of blocks that are marked as static
-    if (cm->MBs)
-      cpi->static_mb_pct = (ncnt[1] * 100) / cm->MBs;
-
-    // This error case should not be reachable as this function should
-    // never be called with the common data structure uninitialized.
-    else
-      cpi->static_mb_pct = 0;
-
-    cpi->seg0_cnt = ncnt[0];
-    vp9_enable_segmentation((VP9_PTR) cpi);
-  } else {
-    cpi->static_mb_pct = 0;
-    vp9_disable_segmentation((VP9_PTR) cpi);
-  }
-
-  // Free locally allocated storage.
-  vpx_free(arf_not_zz);
-}
-
-void vp9_update_mbgraph_stats
-(
-  VP9_COMP *cpi
-) {
-  VP9_COMMON *const cm = &cpi->common;
-  int i, n_frames = vp9_lookahead_depth(cpi->lookahead);
-  YV12_BUFFER_CONFIG *golden_ref = &cm->yv12_fb[cm->gld_fb_idx];
-
-  // we need to look ahead beyond where the ARF transitions into
-  // being a GF - so exit if we don't look ahead beyond that
-  if (n_frames <= cpi->frames_till_gf_update_due)
-    return;
-  if (n_frames > cpi->common.frames_till_alt_ref_frame)
-    n_frames = cpi->common.frames_till_alt_ref_frame;
-  if (n_frames > MAX_LAG_BUFFERS)
-    n_frames = MAX_LAG_BUFFERS;
-
-  cpi->mbgraph_n_frames = n_frames;
-  for (i = 0; i < n_frames; i++) {
-    MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
-    vpx_memset(frame_stats->mb_stats, 0,
-               cm->mb_rows * cm->mb_cols * sizeof(*cpi->mbgraph_stats[i].mb_stats));
-  }
-
-  // do motion search to find contribution of each reference to data
-  // later on in this GF group
-  // FIXME really, the GF/last MC search should be done forward, and
-  // the ARF MC search backwards, to get optimal results for MV caching
-  for (i = 0; i < n_frames; i++) {
-    MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
-    struct lookahead_entry *q_cur =
-      vp9_lookahead_peek(cpi->lookahead, i);
-
-    assert(q_cur != NULL);
-
-    update_mbgraph_frame_stats(cpi, frame_stats, &q_cur->img,
-                               golden_ref, cpi->Source);
-  }
-
-  vp9_clear_system_state();  // __asm emms;
-
-  separate_arf_mbs(cpi);
-}
--- a/vp8/encoder/mbgraph.h
+++ /dev/null
@@ -1,16 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef __INC_MBGRAPH_H__
-#define __INC_MBGRAPH_H__ 1
-
-extern void vp9_update_mbgraph_stats(VP9_COMP *cpi);
-
-#endif /* __INC_MBGRAPH_H__ */
--- a/vp8/encoder/mcomp.c
+++ /dev/null
@@ -1,2203 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/encoder/onyx_int.h"
-#include "mcomp.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vpx_ports/config.h"
-#include <stdio.h>
-#include <limits.h>
-#include <math.h>
-#include "vp8/common/findnearmv.h"
-
-#ifdef ENTROPY_STATS
-static int mv_ref_ct [31] [4] [2];
-static int mv_mode_cts [4] [2];
-#endif
-
-void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv) {
-  int col_min = (ref_mv->as_mv.col >> 3) - MAX_FULL_PEL_VAL +
-      ((ref_mv->as_mv.col & 7) ? 1 : 0);
-  int row_min = (ref_mv->as_mv.row >> 3) - MAX_FULL_PEL_VAL +
-      ((ref_mv->as_mv.row & 7) ? 1 : 0);
-  int col_max = (ref_mv->as_mv.col >> 3) + MAX_FULL_PEL_VAL;
-  int row_max = (ref_mv->as_mv.row >> 3) + MAX_FULL_PEL_VAL;
-
-  /* Get intersection of UMV window and valid MV window to reduce # of checks in diamond search. */
-  if (x->mv_col_min < col_min)
-    x->mv_col_min = col_min;
-  if (x->mv_col_max > col_max)
-    x->mv_col_max = col_max;
-  if (x->mv_row_min < row_min)
-    x->mv_row_min = row_min;
-  if (x->mv_row_max > row_max)
-    x->mv_row_max = row_max;
-}
-
-int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, DEC_MVCOSTS,
-                    int Weight, int ishp) {
-  MV v;
-  v.row = (mv->as_mv.row - ref->as_mv.row);
-  v.col = (mv->as_mv.col - ref->as_mv.col);
-  return ((mvjcost[vp9_get_mv_joint(v)] +
-           mvcost[0][v.row] + mvcost[1][v.col]) *
-          Weight) >> 7;
-}
-
-static int mv_err_cost(int_mv *mv, int_mv *ref, DEC_MVCOSTS,
-                       int error_per_bit, int ishp) {
-  if (mvcost) {
-    MV v;
-    v.row = (mv->as_mv.row - ref->as_mv.row);
-    v.col = (mv->as_mv.col - ref->as_mv.col);
-    return ((mvjcost[vp9_get_mv_joint(v)] +
-             mvcost[0][v.row] + mvcost[1][v.col]) *
-            error_per_bit + 128) >> 8;
-  }
-  return 0;
-}
-
-static int mvsad_err_cost(int_mv *mv, int_mv *ref, DEC_MVSADCOSTS,
-                          int error_per_bit) {
-
-  if (mvsadcost) {
-    MV v;
-    v.row = (mv->as_mv.row - ref->as_mv.row);
-    v.col = (mv->as_mv.col - ref->as_mv.col);
-    return ((mvjsadcost[vp9_get_mv_joint(v)] +
-             mvsadcost[0][v.row] + mvsadcost[1][v.col]) *
-            error_per_bit + 128) >> 8;
-  }
-  return 0;
-}
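
All three cost helpers share the same shape: sum the joint and per-component table costs, scale by a weight or error_per_bit, and shift back down in fixed point. A worked example with invented table values, mirroring the (... * error_per_bit + 128) >> 8 rounding used above:

    #include <stdio.h>

    int main(void) {
      /* Invented cost-table lookups (rate estimates for one mv). */
      int joint_cost = 40;      /* mvjcost[joint]   */
      int row_cost = 120;       /* mvcost[0][v.row] */
      int col_cost = 96;        /* mvcost[1][v.col] */
      int error_per_bit = 50;   /* rate-distortion lambda */

      /* Adding 128 before the >> 8 rounds to nearest rather than
       * truncating: (256 * 50 + 128) >> 8 = 50. */
      int cost = ((joint_cost + row_cost + col_cost) * error_per_bit + 128) >> 8;

      printf("mv error cost = %d\n", cost);
      return 0;
    }
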
-
-void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride) {
-  int Len;
-  int search_site_count = 0;
-
-
-  // Generate offsets for 4 search sites per step.
-  Len = MAX_FIRST_STEP;
-  x->ss[search_site_count].mv.col = 0;
-  x->ss[search_site_count].mv.row = 0;
-  x->ss[search_site_count].offset = 0;
-  search_site_count++;
-
-  while (Len > 0) {
-
-    // Compute offsets for search sites.
-    x->ss[search_site_count].mv.col = 0;
-    x->ss[search_site_count].mv.row = -Len;
-    x->ss[search_site_count].offset = -Len * stride;
-    search_site_count++;
-
-    // Compute offsets for search sites.
-    x->ss[search_site_count].mv.col = 0;
-    x->ss[search_site_count].mv.row = Len;
-    x->ss[search_site_count].offset = Len * stride;
-    search_site_count++;
-
-    // Compute offsets for search sites.
-    x->ss[search_site_count].mv.col = -Len;
-    x->ss[search_site_count].mv.row = 0;
-    x->ss[search_site_count].offset = -Len;
-    search_site_count++;
-
-    // Compute offsets for search sites.
-    x->ss[search_site_count].mv.col = Len;
-    x->ss[search_site_count].mv.row = 0;
-    x->ss[search_site_count].offset = Len;
-    search_site_count++;
-
-    // Contract.
-    Len /= 2;
-  }
-
-  x->ss_count = search_site_count;
-  x->searches_per_step = 4;
-}
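
The search-site table built above is easiest to see printed out: after the null center site, each scale contributes a cross of four offsets (up, down, left, right), and the scale halves until it reaches zero. A sketch that generates the same pattern for a small, invented starting step and stride:

    #include <stdio.h>

    int main(void) {
      int stride = 32;  /* invented row stride of the search buffer */
      int len = 8;      /* invented starting step (MAX_FIRST_STEP)  */

      printf("mv=(%3d,%3d) offset=%5d\n", 0, 0, 0);  /* center site */
      while (len > 0) {
        printf("mv=(%3d,%3d) offset=%5d\n", -len, 0, -len * stride); /* up    */
        printf("mv=(%3d,%3d) offset=%5d\n",  len, 0,  len * stride); /* down  */
        printf("mv=(%3d,%3d) offset=%5d\n", 0, -len, -len);          /* left  */
        printf("mv=(%3d,%3d) offset=%5d\n", 0,  len,  len);          /* right */
        len /= 2;  /* contract to the next scale */
      }
      return 0;
    }
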
-
-void vp9_init3smotion_compensation(MACROBLOCK *x, int stride) {
-  int Len;
-  int search_site_count = 0;
-
-  // Generate offsets for 8 search sites per step.
-  Len = MAX_FIRST_STEP;
-  x->ss[search_site_count].mv.col = 0;
-  x->ss[search_site_count].mv.row = 0;
-  x->ss[search_site_count].offset = 0;
-  search_site_count++;
-
-  while (Len > 0) {
-
-    // Compute offsets for search sites.
-    x->ss[search_site_count].mv.col = 0;
-    x->ss[search_site_count].mv.row = -Len;
-    x->ss[search_site_count].offset = -Len * stride;
-    search_site_count++;
-
-    // Compute offsets for search sites.
-    x->ss[search_site_count].mv.col = 0;
-    x->ss[search_site_count].mv.row = Len;
-    x->ss[search_site_count].offset = Len * stride;
-    search_site_count++;
-
-    // Compute offsets for search sites.
-    x->ss[search_site_count].mv.col = -Len;
-    x->ss[search_site_count].mv.row = 0;
-    x->ss[search_site_count].offset = -Len;
-    search_site_count++;
-
-    // Compute offsets for search sites.
-    x->ss[search_site_count].mv.col = Len;
-    x->ss[search_site_count].mv.row = 0;
-    x->ss[search_site_count].offset = Len;
-    search_site_count++;
-
-    // Compute offsets for search sites.
-    x->ss[search_site_count].mv.col = -Len;
-    x->ss[search_site_count].mv.row = -Len;
-    x->ss[search_site_count].offset = -Len * stride - Len;
-    search_site_count++;
-
-    // Compute offsets for search sites.
-    x->ss[search_site_count].mv.col = Len;
-    x->ss[search_site_count].mv.row = -Len;
-    x->ss[search_site_count].offset = -Len * stride + Len;
-    search_site_count++;
-
-    // Compute offsets for search sites.
-    x->ss[search_site_count].mv.col = -Len;
-    x->ss[search_site_count].mv.row = Len;
-    x->ss[search_site_count].offset = Len * stride - Len;
-    search_site_count++;
-
-    // Compute offsets for search sites.
-    x->ss[search_site_count].mv.col = Len;
-    x->ss[search_site_count].mv.row = Len;
-    x->ss[search_site_count].offset = Len * stride + Len;
-    search_site_count++;
-
-    // Contract.
-    Len /= 2;
-  }
-
-  x->ss_count = search_site_count;
-  x->searches_per_step = 8;
-}
-
-/*
- * To avoid the penalty of cache-line-crossing reads, preload the reference
- * area into a small aligned buffer so that reads from this buffer never
- * cross a cache line. This reduces the CPU cycles spent reading ref data
- * in the sub-pixel filter functions.
- * TODO: Currently, since the sub-pixel search range here is -3 ~ 3, we copy
- * a 22-row x 32-col area, which is enough for a 16x16 macroblock. Later,
- * for SPLITMV, we could reduce the area.
- */
-
-/* estimated cost of a motion vector (r,c) */
-#define MVC(r, c)                                       \
-    (mvcost ?                                           \
-     ((mvjcost[((r) != rr) * 2 + ((c) != rc)] +         \
-       mvcost[0][((r) - rr)] + mvcost[1][((c) - rc)]) * \
-      error_per_bit + 128) >> 8 : 0)
-
-#define SP(x) (((x) & 7) << 1)  // convert motion vector component to offset
-                                // for svf calc
-
-#define IFMVCV(r, c, s, e)                                \
-    if (c >= minc && c <= maxc && r >= minr && r <= maxr) \
-      s                                                   \
-    else                                                  \
-      e;
-
-/* pointer to the predictor base for a motion vector */
-#define PRE(r, c) (y + (((r) >> 3) * y_stride + ((c) >> 3) -(offset)))
-
-/* returns subpixel variance error function */
-#define DIST(r, c) \
-    vfp->svf(PRE(r, c), y_stride, SP(c), SP(r), z, b->src_stride, &sse)
-
-/* checks if (r, c) has better score than previous best */
-#define CHECK_BETTER(v, r, c) \
-    IFMVCV(r, c, {                                                       \
-      thismse = (DIST(r, c));                                            \
-      if ((v = MVC(r, c) + thismse) < besterr) {                         \
-        besterr = v;                                                     \
-        br = r;                                                          \
-        bc = c;                                                          \
-        *distortion = thismse;                                           \
-        *sse1 = sse;                                                     \
-      }                                                                  \
-    },                                                                   \
-    v = INT_MAX;)
-
-#define MIN(x,y) (((x)<(y))?(x):(y))
-#define MAX(x,y) (((x)>(y))?(x):(y))
-
-int vp9_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
-                                             int_mv *bestmv, int_mv *ref_mv,
-                                             int error_per_bit,
-                                             const vp9_variance_fn_ptr_t *vfp,
-                                             DEC_MVCOSTS,
-                                             int *distortion,
-                                             unsigned int *sse1) {
-  unsigned char *z = (*(b->base_src) + b->src);
-  MACROBLOCKD *xd = &x->e_mbd;
-
-  int rr, rc, br, bc, hstep;
-  int tr, tc;
-  unsigned int besterr = INT_MAX;
-  unsigned int left, right, up, down, diag;
-  unsigned int sse;
-  unsigned int whichdir;
-  unsigned int halfiters = 4;
-  unsigned int quarteriters = 4;
-  unsigned int eighthiters = 4;
-  int thismse;
-  int maxc, minc, maxr, minr;
-  int y_stride;
-  int offset;
-  int usehp = xd->allow_high_precision_mv;
-
-#if !CONFIG_SUPERBLOCKS && (ARCH_X86 || ARCH_X86_64)
-  unsigned char *y0 = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
-  unsigned char *y;
-  int buf_r1, buf_r2, buf_c1, buf_c2;
-
-  // Clamping to avoid out-of-range data access
-  buf_r1 = ((bestmv->as_mv.row - INTERP_EXTEND) < x->mv_row_min) ?
-      (bestmv->as_mv.row - x->mv_row_min) : INTERP_EXTEND - 1;
-  buf_r2 = ((bestmv->as_mv.row + INTERP_EXTEND) > x->mv_row_max) ?
-      (x->mv_row_max - bestmv->as_mv.row) : INTERP_EXTEND - 1;
-  buf_c1 = ((bestmv->as_mv.col - INTERP_EXTEND) < x->mv_col_min) ?
-      (bestmv->as_mv.col - x->mv_col_min) : INTERP_EXTEND - 1;
-  buf_c2 = ((bestmv->as_mv.col + INTERP_EXTEND) > x->mv_col_max) ?
-      (x->mv_col_max - bestmv->as_mv.col) : INTERP_EXTEND - 1;
-  y_stride = 32;
-
-  /* Copy to intermediate buffer before searching. */
-  vfp->copymem(y0 - buf_c1 - d->pre_stride * buf_r1, d->pre_stride, xd->y_buf, y_stride, 16 + buf_r1 + buf_r2);
-  y = xd->y_buf + y_stride * buf_r1 + buf_c1;
-#else
-  unsigned char *y = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
-  y_stride = d->pre_stride;
-#endif
-
-  rr = ref_mv->as_mv.row;
-  rc = ref_mv->as_mv.col;
-  br = bestmv->as_mv.row << 3;
-  bc = bestmv->as_mv.col << 3;
-  hstep = 4;
-  minc = MAX(x->mv_col_min << 3, (ref_mv->as_mv.col) - ((1 << MV_MAX_BITS) - 1));
-  maxc = MIN(x->mv_col_max << 3, (ref_mv->as_mv.col) + ((1 << MV_MAX_BITS) - 1));
-  minr = MAX(x->mv_row_min << 3, (ref_mv->as_mv.row) - ((1 << MV_MAX_BITS) - 1));
-  maxr = MIN(x->mv_row_max << 3, (ref_mv->as_mv.row) + ((1 << MV_MAX_BITS) - 1));
-
-  tr = br;
-  tc = bc;
-
-
-  offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
-
-  // central mv
-  bestmv->as_mv.row <<= 3;
-  bestmv->as_mv.col <<= 3;
-
-  // calculate central point error
-  besterr = vfp->vf(y, y_stride, z, b->src_stride, sse1);
-  *distortion = besterr;
-  besterr += mv_err_cost(bestmv, ref_mv, MVCOSTS,
-                         error_per_bit, xd->allow_high_precision_mv);
-
-  // TODO: Each subsequent iteration checks at least one point in common
-  // with the last iteration (it could be two if the diagonal was selected).
-  while (--halfiters) {
-    // 1/2 pel
-    CHECK_BETTER(left, tr, tc - hstep);
-    CHECK_BETTER(right, tr, tc + hstep);
-    CHECK_BETTER(up, tr - hstep, tc);
-    CHECK_BETTER(down, tr + hstep, tc);
-
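-    // whichdir encodes the best quadrant: bit 0 is set when right beats left,
-    // bit 1 when down beats up, so the switch below tries the one diagonal
-    // that lies between the two winning directions.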
-    whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-
-    switch (whichdir) {
-      case 0:
-        CHECK_BETTER(diag, tr - hstep, tc - hstep);
-        break;
-      case 1:
-        CHECK_BETTER(diag, tr - hstep, tc + hstep);
-        break;
-      case 2:
-        CHECK_BETTER(diag, tr + hstep, tc - hstep);
-        break;
-      case 3:
-        CHECK_BETTER(diag, tr + hstep, tc + hstep);
-        break;
-    }
-
-    // no reason to check the same one again.
-    if (tr == br && tc == bc)
-      break;
-
-    tr = br;
-    tc = bc;
-  }
-
-  // TODO: Each subsequent iteration checks at least one point in common
-  // with the last iteration (two if the diagonal was selected).
-  // 1/4 pel
-  hstep >>= 1;
-  while (--quarteriters) {
-    CHECK_BETTER(left, tr, tc - hstep);
-    CHECK_BETTER(right, tr, tc + hstep);
-    CHECK_BETTER(up, tr - hstep, tc);
-    CHECK_BETTER(down, tr + hstep, tc);
-
-    whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-
-    switch (whichdir) {
-      case 0:
-        CHECK_BETTER(diag, tr - hstep, tc - hstep);
-        break;
-      case 1:
-        CHECK_BETTER(diag, tr - hstep, tc + hstep);
-        break;
-      case 2:
-        CHECK_BETTER(diag, tr + hstep, tc - hstep);
-        break;
-      case 3:
-        CHECK_BETTER(diag, tr + hstep, tc + hstep);
-        break;
-    }
-
-    // no reason to check the same one again.
-    if (tr == br && tc == bc)
-      break;
-
-    tr = br;
-    tc = bc;
-  }
-
-  if (xd->allow_high_precision_mv) {
-    usehp = vp9_use_nmv_hp(&ref_mv->as_mv);
-  } else {
-    usehp = 0;
-  }
-
-  if (usehp) {
-    hstep >>= 1;
-    while (--eighthiters) {
-      CHECK_BETTER(left, tr, tc - hstep);
-      CHECK_BETTER(right, tr, tc + hstep);
-      CHECK_BETTER(up, tr - hstep, tc);
-      CHECK_BETTER(down, tr + hstep, tc);
-
-      whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-
-      switch (whichdir) {
-        case 0:
-          CHECK_BETTER(diag, tr - hstep, tc - hstep);
-          break;
-        case 1:
-          CHECK_BETTER(diag, tr - hstep, tc + hstep);
-          break;
-        case 2:
-          CHECK_BETTER(diag, tr + hstep, tc - hstep);
-          break;
-        case 3:
-          CHECK_BETTER(diag, tr + hstep, tc + hstep);
-          break;
-      }
-
-      // no reason to check the same one again.
-      if (tr == br && tc == bc)
-        break;
-
-      tr = br;
-      tc = bc;
-    }
-  }
-  bestmv->as_mv.row = br;
-  bestmv->as_mv.col = bc;
-
-  if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) ||
-      (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3)))
-    return INT_MAX;
-
-  return besterr;
-}
-#undef MVC
-#undef PRE
-#undef DIST
-#undef IFMVCV
-#undef CHECK_BETTER
-#undef MIN
-#undef MAX
-
-int vp9_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
-                                 int_mv *bestmv, int_mv *ref_mv,
-                                 int error_per_bit,
-                                 const vp9_variance_fn_ptr_t *vfp,
-                                 DEC_MVCOSTS, int *distortion,
-                                 unsigned int *sse1) {
-  int bestmse = INT_MAX;
-  int_mv startmv;
-  int_mv this_mv;
-  int_mv orig_mv;
-  int yrow_movedback = 0, ycol_movedback = 0;
-  unsigned char *z = (*(b->base_src) + b->src);
-  int left, right, up, down, diag;
-  unsigned int sse;
-  int whichdir;
-  int thismse;
-  int y_stride;
-  MACROBLOCKD *xd = &x->e_mbd;
-  int usehp = xd->allow_high_precision_mv;
-
-#if !CONFIG_SUPERBLOCKS && (ARCH_X86 || ARCH_X86_64)
-  unsigned char *y0 = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
-  unsigned char *y;
-
-  y_stride = 32;
-  /* Copy 18 rows x 32 cols area to intermediate buffer before searching. */
-  vfp->copymem(y0 - 1 - d->pre_stride, d->pre_stride, xd->y_buf, y_stride, 18);
-  y = xd->y_buf + y_stride + 1;
-#else
-  unsigned char *y = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
-  y_stride = d->pre_stride;
-#endif
-
-  // central mv
-  bestmv->as_mv.row <<= 3;
-  bestmv->as_mv.col <<= 3;
-  startmv = *bestmv;
-  orig_mv = *bestmv;
-
-  // calculate central point error
-  bestmse = vfp->vf(y, y_stride, z, b->src_stride, sse1);
-  *distortion = bestmse;
-  bestmse += mv_err_cost(bestmv, ref_mv, MVCOSTS, error_per_bit,
-                         xd->allow_high_precision_mv);
-
-  // go left then right and check error
-  this_mv.as_mv.row = startmv.as_mv.row;
-  this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4);
-  thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, b->src_stride, &sse);
-  left = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
-                               xd->allow_high_precision_mv);
-
-  if (left < bestmse) {
-    *bestmv = this_mv;
-    bestmse = left;
-    *distortion = thismse;
-    *sse1 = sse;
-  }
-
-  this_mv.as_mv.col += 8;
-  thismse = vfp->svf_halfpix_h(y, y_stride, z, b->src_stride, &sse);
-  right = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
-                                xd->allow_high_precision_mv);
-
-  if (right < bestmse) {
-    *bestmv = this_mv;
-    bestmse = right;
-    *distortion = thismse;
-    *sse1 = sse;
-  }
-
-  // go up then down and check error
-  this_mv.as_mv.col = startmv.as_mv.col;
-  this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4);
-  thismse =  vfp->svf_halfpix_v(y - y_stride, y_stride, z, b->src_stride, &sse);
-  up = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
-                             xd->allow_high_precision_mv);
-
-  if (up < bestmse) {
-    *bestmv = this_mv;
-    bestmse = up;
-    *distortion = thismse;
-    *sse1 = sse;
-  }
-
-  this_mv.as_mv.row += 8;
-  thismse = vfp->svf_halfpix_v(y, y_stride, z, b->src_stride, &sse);
-  down = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
-                               xd->allow_high_precision_mv);
-
-  if (down < bestmse) {
-    *bestmv = this_mv;
-    bestmse = down;
-    *distortion = thismse;
-    *sse1 = sse;
-  }
-
-
-  // now check 1 more diagonal
-  whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-  this_mv = startmv;
-
-  switch (whichdir) {
-    case 0:
-      this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
-      this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
-      thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride, z, b->src_stride, &sse);
-      break;
-    case 1:
-      this_mv.as_mv.col += 4;
-      this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
-      thismse = vfp->svf_halfpix_hv(y - y_stride, y_stride, z, b->src_stride, &sse);
-      break;
-    case 2:
-      this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
-      this_mv.as_mv.row += 4;
-      thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, b->src_stride, &sse);
-      break;
-    case 3:
-    default:
-      this_mv.as_mv.col += 4;
-      this_mv.as_mv.row += 4;
-      thismse = vfp->svf_halfpix_hv(y, y_stride, z, b->src_stride, &sse);
-      break;
-  }
-
-  diag = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
-                               xd->allow_high_precision_mv);
-
-  if (diag < bestmse) {
-    *bestmv = this_mv;
-    bestmse = diag;
-    *distortion = thismse;
-    *sse1 = sse;
-  }
-
-  // time to check quarter pels.
-  if (bestmv->as_mv.row < startmv.as_mv.row) {
-    y -= y_stride;
-    yrow_movedback = 1;
-  }
-
-  if (bestmv->as_mv.col < startmv.as_mv.col) {
-    y--;
-    ycol_movedback = 1;
-  }
-
-  startmv = *bestmv;
-
-  // go left then right and check error
-  this_mv.as_mv.row = startmv.as_mv.row;
-
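-  // When a component sits on a full-pel boundary (low 3 bits zero), stepping
-  // 2/8 below it is done by moving the base pointer one pel back and using
-  // subpel offset 6: (v - 8) | 6 == v - 2 when v is a multiple of 8.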
-  if (startmv.as_mv.col & 7) {
-    this_mv.as_mv.col = startmv.as_mv.col - 2;
-    thismse = vfp->svf(y, y_stride,
-                       SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
-                       z, b->src_stride, &sse);
-  } else {
-    this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
-    thismse = vfp->svf(y - 1, y_stride, SP(6), SP(this_mv.as_mv.row), z,
-                       b->src_stride, &sse);
-  }
-
-  left = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
-                               xd->allow_high_precision_mv);
-
-  if (left < bestmse) {
-    *bestmv = this_mv;
-    bestmse = left;
-    *distortion = thismse;
-    *sse1 = sse;
-  }
-
-  this_mv.as_mv.col += 4;
-  thismse = vfp->svf(y, y_stride,
-                     SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
-                     z, b->src_stride, &sse);
-  right = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
-                                xd->allow_high_precision_mv);
-
-  if (right < bestmse) {
-    *bestmv = this_mv;
-    bestmse = right;
-    *distortion = thismse;
-    *sse1 = sse;
-  }
-
-  // go up then down and check error
-  this_mv.as_mv.col = startmv.as_mv.col;
-
-  if (startmv.as_mv.row & 7) {
-    this_mv.as_mv.row = startmv.as_mv.row - 2;
-    thismse = vfp->svf(y, y_stride,
-                       SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
-                       z, b->src_stride, &sse);
-  } else {
-    this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6;
-    thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(6),
-                       z, b->src_stride, &sse);
-  }
-
-  up = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
-                             xd->allow_high_precision_mv);
-
-  if (up < bestmse) {
-    *bestmv = this_mv;
-    bestmse = up;
-    *distortion = thismse;
-    *sse1 = sse;
-  }
-
-  this_mv.as_mv.row += 4;
-  thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
-                     z, b->src_stride, &sse);
-  down = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
-                               xd->allow_high_precision_mv);
-
-  if (down < bestmse) {
-    *bestmv = this_mv;
-    bestmse = down;
-    *distortion = thismse;
-    *sse1 = sse;
-  }
-
-
-  // now check 1 more diagonal
-  whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-
-  this_mv = startmv;
-
-  switch (whichdir) {
-    case 0:
-
-      if (startmv.as_mv.row & 7) {
-        this_mv.as_mv.row -= 2;
-
-        if (startmv.as_mv.col & 7) {
-          this_mv.as_mv.col -= 2;
-          thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
-        } else {
-          this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
-          thismse = vfp->svf(y - 1, y_stride, SP(6), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
-        }
-      } else {
-        this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6;
-
-        if (startmv.as_mv.col & 7) {
-          this_mv.as_mv.col -= 2;
-          thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(6), z, b->src_stride, &sse);
-        } else {
-          this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
-          thismse = vfp->svf(y - y_stride - 1, y_stride, SP(6), SP(6), z, b->src_stride, &sse);
-        }
-      }
-
-      break;
-    case 1:
-      this_mv.as_mv.col += 2;
-
-      if (startmv.as_mv.row & 7) {
-        this_mv.as_mv.row -= 2;
-        thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
-      } else {
-        this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6;
-        thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(6), z, b->src_stride, &sse);
-      }
-
-      break;
-    case 2:
-      this_mv.as_mv.row += 2;
-
-      if (startmv.as_mv.col & 7) {
-        this_mv.as_mv.col -= 2;
-        thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
-                           z, b->src_stride, &sse);
-      } else {
-        this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
-        thismse = vfp->svf(y - 1, y_stride, SP(6), SP(this_mv.as_mv.row), z,
-                           b->src_stride, &sse);
-      }
-
-      break;
-    case 3:
-      this_mv.as_mv.col += 2;
-      this_mv.as_mv.row += 2;
-      thismse = vfp->svf(y, y_stride,
-                         SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
-                         z, b->src_stride, &sse);
-      break;
-  }
-
-  diag = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
-                               xd->allow_high_precision_mv);
-
-  if (diag < bestmse) {
-    *bestmv = this_mv;
-    bestmse = diag;
-    *distortion = thismse;
-    *sse1 = sse;
-  }
-
-  if (xd->allow_high_precision_mv) {
-    usehp = vp9_use_nmv_hp(&ref_mv->as_mv);
-  } else {
-    usehp = 0;
-  }
-  if (!usehp)
-    return bestmse;
-
-  /* Now do 1/8th pixel */
-  if (bestmv->as_mv.row < orig_mv.as_mv.row && !yrow_movedback) {
-    y -= y_stride;
-    yrow_movedback = 1;
-  }
-
-  if (bestmv->as_mv.col < orig_mv.as_mv.col && !ycol_movedback) {
-    y--;
-    ycol_movedback = 1;
-  }
-
-  startmv = *bestmv;
-
-  // go left then right and check error
-  this_mv.as_mv.row = startmv.as_mv.row;
-
-  if (startmv.as_mv.col & 7) {
-    this_mv.as_mv.col = startmv.as_mv.col - 1;
-    thismse = vfp->svf(y, y_stride,
-                       SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
-                       z, b->src_stride, &sse);
-  } else {
-    this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7;
-    thismse = vfp->svf(y - 1, y_stride, SP(7), SP(this_mv.as_mv.row),
-                       z, b->src_stride, &sse);
-  }
-
-  left = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
-                               xd->allow_high_precision_mv);
-
-  if (left < bestmse) {
-    *bestmv = this_mv;
-    bestmse = left;
-    *distortion = thismse;
-    *sse1 = sse;
-  }
-
-  this_mv.as_mv.col += 2;
-  thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
-                     z, b->src_stride, &sse);
-  right = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
-                                xd->allow_high_precision_mv);
-
-  if (right < bestmse) {
-    *bestmv = this_mv;
-    bestmse = right;
-    *distortion = thismse;
-    *sse1 = sse;
-  }
-
-  // go up then down and check error
-  this_mv.as_mv.col = startmv.as_mv.col;
-
-  if (startmv.as_mv.row & 7) {
-    this_mv.as_mv.row = startmv.as_mv.row - 1;
-    thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
-  } else {
-    this_mv.as_mv.row = (startmv.as_mv.row - 8) | 7;
-    thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(7), z, b->src_stride, &sse);
-  }
-
-  up = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
-                             xd->allow_high_precision_mv);
-
-  if (up < bestmse) {
-    *bestmv = this_mv;
-    bestmse = up;
-    *distortion = thismse;
-    *sse1 = sse;
-  }
-
-  this_mv.as_mv.row += 2;
-  thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
-  down = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
-                               xd->allow_high_precision_mv);
-
-  if (down < bestmse) {
-    *bestmv = this_mv;
-    bestmse = down;
-    *distortion = thismse;
-    *sse1 = sse;
-  }
-
-  // now check 1 more diagonal
-  whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-
-  this_mv = startmv;
-
-  switch (whichdir) {
-    case 0:
-
-      if (startmv.as_mv.row & 7) {
-        this_mv.as_mv.row -= 1;
-
-        if (startmv.as_mv.col & 7) {
-          this_mv.as_mv.col -= 1;
-          thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
-        } else {
-          this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7;
-          thismse = vfp->svf(y - 1, y_stride, SP(7), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
-        }
-      } else {
-        this_mv.as_mv.row = (startmv.as_mv.row - 8) | 7;
-
-        if (startmv.as_mv.col & 7) {
-          this_mv.as_mv.col -= 1;
-          thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(7), z, b->src_stride, &sse);
-        } else {
-          this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7;
-          thismse = vfp->svf(y - y_stride - 1, y_stride, SP(7), SP(7), z, b->src_stride, &sse);
-        }
-      }
-
-      break;
-    case 1:
-      this_mv.as_mv.col += 1;
-
-      if (startmv.as_mv.row & 7) {
-        this_mv.as_mv.row -= 1;
-        thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
-      } else {
-        this_mv.as_mv.row = (startmv.as_mv.row - 8) | 7;
-        thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(7), z, b->src_stride, &sse);
-      }
-
-      break;
-    case 2:
-      this_mv.as_mv.row += 1;
-
-      if (startmv.as_mv.col & 7) {
-        this_mv.as_mv.col -= 1;
-        thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
-      } else {
-        this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7;
-        thismse = vfp->svf(y - 1, y_stride, SP(7), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
-      }
-
-      break;
-    case 3:
-      this_mv.as_mv.col += 1;
-      this_mv.as_mv.row += 1;
-      thismse = vfp->svf(y, y_stride,  SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
-      break;
-  }
-
-  diag = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
-                               xd->allow_high_precision_mv);
-
-  if (diag < bestmse) {
-    *bestmv = this_mv;
-    bestmse = diag;
-    *distortion = thismse;
-    *sse1 = sse;
-  }
-
-  return bestmse;
-}
-
-#undef SP
-
-int vp9_find_best_half_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
-                                  int_mv *bestmv, int_mv *ref_mv,
-                                  int error_per_bit,
-                                  const vp9_variance_fn_ptr_t *vfp,
-                                  DEC_MVCOSTS,
-                                  int *distortion,
-                                  unsigned int *sse1) {
-  int bestmse = INT_MAX;
-  int_mv startmv;
-  int_mv this_mv;
-  unsigned char *z = (*(b->base_src) + b->src);
-  int left, right, up, down, diag;
-  unsigned int sse;
-  int whichdir;
-  int thismse;
-  int y_stride;
-  MACROBLOCKD *xd = &x->e_mbd;
-
-#if !CONFIG_SUPERBLOCKS && (ARCH_X86 || ARCH_X86_64)
-  unsigned char *y0 = *(d->base_pre) + d->pre +
-      (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
-  unsigned char *y;
-
-  y_stride = 32;
-  /* Copy 18 rows x 32 cols area to intermediate buffer before searching. */
-  vfp->copymem(y0 - 1 - d->pre_stride, d->pre_stride, xd->y_buf, y_stride, 18);
-  y = xd->y_buf + y_stride + 1;
-#else
-  unsigned char *y = *(d->base_pre) + d->pre +
-      (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
-  y_stride = d->pre_stride;
-#endif
-
-  // central mv
-  bestmv->as_mv.row <<= 3;
-  bestmv->as_mv.col <<= 3;
-  startmv = *bestmv;
-
-  // calculate central point error
-  bestmse = vfp->vf(y, y_stride, z, b->src_stride, sse1);
-  *distortion = bestmse;
-  bestmse += mv_err_cost(bestmv, ref_mv, MVCOSTS, error_per_bit,
-                         xd->allow_high_precision_mv);
-
-  // go left then right and check error
-  this_mv.as_mv.row = startmv.as_mv.row;
-  this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4);
-  thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, b->src_stride, &sse);
-  left = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
-                               xd->allow_high_precision_mv);
-
-  if (left < bestmse) {
-    *bestmv = this_mv;
-    bestmse = left;
-    *distortion = thismse;
-    *sse1 = sse;
-  }
-
-  this_mv.as_mv.col += 8;
-  thismse = vfp->svf_halfpix_h(y, y_stride, z, b->src_stride, &sse);
-  right = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
-                                xd->allow_high_precision_mv);
-
-  if (right < bestmse) {
-    *bestmv = this_mv;
-    bestmse = right;
-    *distortion = thismse;
-    *sse1 = sse;
-  }
-
-  // go up then down and check error
-  this_mv.as_mv.col = startmv.as_mv.col;
-  this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4);
-  thismse = vfp->svf_halfpix_v(y - y_stride, y_stride, z, b->src_stride, &sse);
-  up = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
-                             xd->allow_high_precision_mv);
-
-  if (up < bestmse) {
-    *bestmv = this_mv;
-    bestmse = up;
-    *distortion = thismse;
-    *sse1 = sse;
-  }
-
-  this_mv.as_mv.row += 8;
-  thismse = vfp->svf_halfpix_v(y, y_stride, z, b->src_stride, &sse);
-  down = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
-                               xd->allow_high_precision_mv);
-
-  if (down < bestmse) {
-    *bestmv = this_mv;
-    bestmse = down;
-    *distortion = thismse;
-    *sse1 = sse;
-  }
-
-  // now check 1 more diagonal -
-  whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-  this_mv = startmv;
-
-  switch (whichdir) {
-    case 0:
-      this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
-      this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
-      thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride, z, b->src_stride, &sse);
-      break;
-    case 1:
-      this_mv.as_mv.col += 4;
-      this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
-      thismse = vfp->svf_halfpix_hv(y - y_stride, y_stride, z, b->src_stride, &sse);
-      break;
-    case 2:
-      this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
-      this_mv.as_mv.row += 4;
-      thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, b->src_stride, &sse);
-      break;
-    case 3:
-    default:
-      this_mv.as_mv.col += 4;
-      this_mv.as_mv.row += 4;
-      thismse = vfp->svf_halfpix_hv(y, y_stride, z, b->src_stride, &sse);
-      break;
-  }
-
-  diag = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
-                               xd->allow_high_precision_mv);
-
-  if (diag < bestmse) {
-    *bestmv = this_mv;
-    bestmse = diag;
-    *distortion = thismse;
-    *sse1 = sse;
-  }
-
-  return bestmse;
-}
-
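-/* Helper macros shared by the integer-pel searches below: CHECK_BOUNDS tests
-   whether every candidate within `range` of (br, bc) stays inside the MV
-   limits, CHECK_POINT skips a single out-of-range candidate, and CHECK_BETTER
-   keeps the best SAD/site with the MV cost folded in. */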
-#define CHECK_BOUNDS(range) \
-  {\
-    all_in = 1;\
-    all_in &= ((br-range) >= x->mv_row_min);\
-    all_in &= ((br+range) <= x->mv_row_max);\
-    all_in &= ((bc-range) >= x->mv_col_min);\
-    all_in &= ((bc+range) <= x->mv_col_max);\
-  }
-
-#define CHECK_POINT \
-  {\
-    if (this_mv.as_mv.col < x->mv_col_min) continue;\
-    if (this_mv.as_mv.col > x->mv_col_max) continue;\
-    if (this_mv.as_mv.row < x->mv_row_min) continue;\
-    if (this_mv.as_mv.row > x->mv_row_max) continue;\
-  }
-
-#define CHECK_BETTER \
-  {\
-    if (thissad < bestsad)\
-    {\
-      thissad += mvsad_err_cost(&this_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);\
-      if (thissad < bestsad)\
-      {\
-        bestsad = thissad;\
-        best_site = i;\
-      }\
-    }\
-  }
-
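-// After the search moves to hexagon vertex k, only 3 of the 6 points around
-// the new center are unvisited; next_chkpts[k] lists those 3 offsets so
-// already-evaluated points are not re-tested.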
-static const MV next_chkpts[6][3] = {
-  {{ -2, 0}, { -1, -2}, {1, -2}},
-  {{ -1, -2}, {1, -2}, {2, 0}},
-  {{1, -2}, {2, 0}, {1, 2}},
-  {{2, 0}, {1, 2}, { -1, 2}},
-  {{1, 2}, { -1, 2}, { -2, 0}},
-  {{ -1, 2}, { -2, 0}, { -1, -2}}
-};
-
-int vp9_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
-                   int_mv *ref_mv, int_mv *best_mv,
-                   int search_param, int sad_per_bit,
-                   const vp9_variance_fn_ptr_t *vfp,
-                   DEC_MVSADCOSTS, DEC_MVCOSTS,
-                   int_mv *center_mv) {
-  MV hex[6] = { { -1, -2}, {1, -2}, {2, 0}, {1, 2}, { -1, 2}, { -2, 0} };
-  MV neighbors[4] = {{0, -1}, { -1, 0}, {1, 0}, {0, 1}};
-  int i, j;
-
-  unsigned char *what = (*(b->base_src) + b->src);
-  int what_stride = b->src_stride;
-  int in_what_stride = d->pre_stride;
-  int br, bc;
-  int_mv this_mv;
-  unsigned int bestsad = 0x7fffffff;
-  unsigned int thissad;
-  unsigned char *base_offset;
-  unsigned char *this_offset;
-  int k = -1;
-  int all_in;
-  int best_site = -1;
-
-  int_mv fcenter_mv;
-  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
-  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
-
-  // adjust ref_mv to make sure it is within MV range
-  clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
-  br = ref_mv->as_mv.row;
-  bc = ref_mv->as_mv.col;
-
-  // Work out the start point for the search
-  base_offset = (unsigned char *)(*(d->base_pre) + d->pre);
-  this_offset = base_offset + (br * (d->pre_stride)) + bc;
-  this_mv.as_mv.row = br;
-  this_mv.as_mv.col = bc;
-  bestsad = vfp->sdf(what, what_stride, this_offset,
-                     in_what_stride, 0x7fffffff)
-            + mvsad_err_cost(&this_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);
-
-  // hex search
-  // j=0
-  CHECK_BOUNDS(2)
-
-  if (all_in) {
-    for (i = 0; i < 6; i++) {
-      this_mv.as_mv.row = br + hex[i].row;
-      this_mv.as_mv.col = bc + hex[i].col;
-      this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col;
-      thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
-      CHECK_BETTER
-    }
-  } else {
-    for (i = 0; i < 6; i++) {
-      this_mv.as_mv.row = br + hex[i].row;
-      this_mv.as_mv.col = bc + hex[i].col;
-      CHECK_POINT
-      this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col;
-      thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
-      CHECK_BETTER
-    }
-  }
-
-  if (best_site == -1)
-    goto cal_neighbors;
-  else {
-    br += hex[best_site].row;
-    bc += hex[best_site].col;
-    k = best_site;
-  }
-
-  for (j = 1; j < 127; j++) {
-    best_site = -1;
-    CHECK_BOUNDS(2)
-
-    if (all_in) {
-      for (i = 0; i < 3; i++) {
-        this_mv.as_mv.row = br + next_chkpts[k][i].row;
-        this_mv.as_mv.col = bc + next_chkpts[k][i].col;
-        this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
-        thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
-        CHECK_BETTER
-      }
-    } else {
-      for (i = 0; i < 3; i++) {
-        this_mv.as_mv.row = br + next_chkpts[k][i].row;
-        this_mv.as_mv.col = bc + next_chkpts[k][i].col;
-        CHECK_POINT
-        this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
-        thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
-        CHECK_BETTER
-      }
-    }
-
-    if (best_site == -1)
-      break;
-    else {
-      br += next_chkpts[k][best_site].row;
-      bc += next_chkpts[k][best_site].col;
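-      // Re-orient the pattern without a modulo:
-      // k = (k + best_site + 5) % 6.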
-      k += 5 + best_site;
-      if (k >= 12) k -= 12;
-      else if (k >= 6) k -= 6;
-    }
-  }
-
-  // check 4 1-away neighbors
-cal_neighbors:
-  for (j = 0; j < 32; j++) {
-    best_site = -1;
-    CHECK_BOUNDS(1)
-
-    if (all_in) {
-      for (i = 0; i < 4; i++) {
-        this_mv.as_mv.row = br + neighbors[i].row;
-        this_mv.as_mv.col = bc + neighbors[i].col;
-        this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
-        thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
-        CHECK_BETTER
-      }
-    } else {
-      for (i = 0; i < 4; i++) {
-        this_mv.as_mv.row = br + neighbors[i].row;
-        this_mv.as_mv.col = bc + neighbors[i].col;
-        CHECK_POINT
-        this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
-        thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
-        CHECK_BETTER
-      }
-    }
-
-    if (best_site == -1)
-      break;
-    else {
-      br += neighbors[best_site].row;
-      bc += neighbors[best_site].col;
-    }
-  }
-
-  best_mv->as_mv.row = br;
-  best_mv->as_mv.col = bc;
-
-  return bestsad;
-}
-#undef CHECK_BOUNDS
-#undef CHECK_POINT
-#undef CHECK_BETTER
-
-int vp9_diamond_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
-                           int_mv *ref_mv, int_mv *best_mv,
-                           int search_param, int sad_per_bit, int *num00,
-                           vp9_variance_fn_ptr_t *fn_ptr, DEC_MVCOSTS,
-                           int_mv *center_mv) {
-  int i, j, step;
-
-  unsigned char *what = (*(b->base_src) + b->src);
-  int what_stride = b->src_stride;
-  unsigned char *in_what;
-  int in_what_stride = d->pre_stride;
-  unsigned char *best_address;
-
-  int tot_steps;
-  int_mv this_mv;
-
-  int bestsad = INT_MAX;
-  int best_site = 0;
-  int last_site = 0;
-
-  int ref_row, ref_col;
-  int this_row_offset, this_col_offset;
-  search_site *ss;
-
-  unsigned char *check_here;
-  int thissad;
-  MACROBLOCKD *xd = &x->e_mbd;
-  int_mv fcenter_mv;
-
-  int *mvjsadcost = x->nmvjointsadcost;
-  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
-
-  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
-  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
-
-  clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
-  ref_row = ref_mv->as_mv.row;
-  ref_col = ref_mv->as_mv.col;
-  *num00 = 0;
-  best_mv->as_mv.row = ref_row;
-  best_mv->as_mv.col = ref_col;
-
-  // Work out the start point for the search
-  in_what = (unsigned char *)(*(d->base_pre) + d->pre + (ref_row * (d->pre_stride)) + ref_col);
-  best_address = in_what;
-
-  // Check the starting position
-  bestsad = fn_ptr->sdf(what, what_stride, in_what,
-                        in_what_stride, 0x7fffffff)
-            + mvsad_err_cost(best_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);
-
-  // search_param determines the length of the initial step and hence the
-  // number of iterations:
-  // 0 = initial step (MAX_FIRST_STEP) pel; 1 = (MAX_FIRST_STEP/2) pel;
-  // 2 = (MAX_FIRST_STEP/4) pel, etc.
-  ss = &x->ss[search_param * x->searches_per_step];
-  tot_steps = (x->ss_count / x->searches_per_step) - search_param;
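-  // x->ss holds the precomputed diamond candidate offsets for every step
-  // size, largest step first; offsetting by search_param skips the largest
-  // steps, trading search range for speed.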
-
-  i = 1;
-
-  for (step = 0; step < tot_steps; step++) {
-    for (j = 0; j < x->searches_per_step; j++) {
-      // Trap illegal vectors
-      this_row_offset = best_mv->as_mv.row + ss[i].mv.row;
-      this_col_offset = best_mv->as_mv.col + ss[i].mv.col;
-
-      if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
-          (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) {
-        check_here = ss[i].offset + best_address;
-        thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
-
-        if (thissad < bestsad) {
-          this_mv.as_mv.row = this_row_offset;
-          this_mv.as_mv.col = this_col_offset;
-          thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
-                                    MVSADCOSTS, sad_per_bit);
-
-          if (thissad < bestsad) {
-            bestsad = thissad;
-            best_site = i;
-          }
-        }
-      }
-
-      i++;
-    }
-
-    if (best_site != last_site) {
-      best_mv->as_mv.row += ss[best_site].mv.row;
-      best_mv->as_mv.col += ss[best_site].mv.col;
-      best_address += ss[best_site].offset;
-      last_site = best_site;
-    } else if (best_address == in_what)
-      (*num00)++;
-  }
-
-  this_mv.as_mv.row = best_mv->as_mv.row << 3;
-  this_mv.as_mv.col = best_mv->as_mv.col << 3;
-
-  if (bestsad == INT_MAX)
-    return INT_MAX;
-
-  return
-      fn_ptr->vf(what, what_stride, best_address, in_what_stride,
-                 (unsigned int *)(&thissad)) +
-      mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
-                  xd->allow_high_precision_mv);
-}
-
-int vp9_diamond_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
-                             int_mv *ref_mv, int_mv *best_mv, int search_param,
-                             int sad_per_bit, int *num00,
-                             vp9_variance_fn_ptr_t *fn_ptr,
-                             DEC_MVCOSTS, int_mv *center_mv) {
-  int i, j, step;
-
-  unsigned char *what = (*(b->base_src) + b->src);
-  int what_stride = b->src_stride;
-  unsigned char *in_what;
-  int in_what_stride = d->pre_stride;
-  unsigned char *best_address;
-
-  int tot_steps;
-  int_mv this_mv;
-
-  int bestsad = INT_MAX;
-  int best_site = 0;
-  int last_site = 0;
-
-  int ref_row;
-  int ref_col;
-  int this_row_offset;
-  int this_col_offset;
-  search_site *ss;
-
-  unsigned char *check_here;
-  unsigned int thissad;
-  MACROBLOCKD *xd = &x->e_mbd;
-  int_mv fcenter_mv;
-
-  int *mvjsadcost = x->nmvjointsadcost;
-  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
-
-  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
-  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
-
-  clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
-  ref_row = ref_mv->as_mv.row;
-  ref_col = ref_mv->as_mv.col;
-  *num00 = 0;
-  best_mv->as_mv.row = ref_row;
-  best_mv->as_mv.col = ref_col;
-
-  // Work out the start point for the search
-  in_what = (unsigned char *)(*(d->base_pre) + d->pre + (ref_row * (d->pre_stride)) + ref_col);
-  best_address = in_what;
-
-  // Check the starting position
-  bestsad = fn_ptr->sdf(what, what_stride,
-                        in_what, in_what_stride, 0x7fffffff)
-            + mvsad_err_cost(best_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);
-
-  // search_param determines the length of the initial step and hence the
-  // number of iterations:
-  // 0 = initial step (MAX_FIRST_STEP) pel; 1 = (MAX_FIRST_STEP/2) pel;
-  // 2 = (MAX_FIRST_STEP/4) pel, etc.
-  ss = &x->ss[search_param * x->searches_per_step];
-  tot_steps = (x->ss_count / x->searches_per_step) - search_param;
-
-  i = 1;
-
-  for (step = 0; step < tot_steps; step++) {
-    int all_in = 1, t;
-
-    // To know whether all the neighbor points are within the bounds, 4 bounds
-    // checks are enough instead of checking 4 bounds for each point.
-    all_in &= ((best_mv->as_mv.row + ss[i].mv.row) > x->mv_row_min);
-    all_in &= ((best_mv->as_mv.row + ss[i + 1].mv.row) < x->mv_row_max);
-    all_in &= ((best_mv->as_mv.col + ss[i + 2].mv.col) > x->mv_col_min);
-    all_in &= ((best_mv->as_mv.col + ss[i + 3].mv.col) < x->mv_col_max);
-
-    if (all_in) {
-      unsigned int sad_array[4];
-
-      for (j = 0; j < x->searches_per_step; j += 4) {
-        unsigned char *block_offset[4];
-
-        for (t = 0; t < 4; t++)
-          block_offset[t] = ss[i + t].offset + best_address;
-
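-        // sdx4df evaluates the SAD of all 4 candidate blocks in one call
-        // (typically a SIMD routine that shares the source-block loads).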
-        fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride,
-                       sad_array);
-
-        for (t = 0; t < 4; t++, i++) {
-          if (sad_array[t] < bestsad) {
-            this_mv.as_mv.row = best_mv->as_mv.row + ss[i].mv.row;
-            this_mv.as_mv.col = best_mv->as_mv.col + ss[i].mv.col;
-            sad_array[t] += mvsad_err_cost(&this_mv, &fcenter_mv,
-                                           MVSADCOSTS, sad_per_bit);
-
-            if (sad_array[t] < bestsad) {
-              bestsad = sad_array[t];
-              best_site = i;
-            }
-          }
-        }
-      }
-    } else {
-      for (j = 0; j < x->searches_per_step; j++) {
-        // Trap illegal vectors
-        this_row_offset = best_mv->as_mv.row + ss[i].mv.row;
-        this_col_offset = best_mv->as_mv.col + ss[i].mv.col;
-
-        if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
-            (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) {
-          check_here = ss[i].offset + best_address;
-          thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
-
-          if (thissad < bestsad) {
-            this_mv.as_mv.row = this_row_offset;
-            this_mv.as_mv.col = this_col_offset;
-            thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
-                                      MVSADCOSTS, sad_per_bit);
-
-            if (thissad < bestsad) {
-              bestsad = thissad;
-              best_site = i;
-            }
-          }
-        }
-        i++;
-      }
-    }
-
-    if (best_site != last_site) {
-      best_mv->as_mv.row += ss[best_site].mv.row;
-      best_mv->as_mv.col += ss[best_site].mv.col;
-      best_address += ss[best_site].offset;
-      last_site = best_site;
-    } else if (best_address == in_what)
-      (*num00)++;
-  }
-
-  this_mv.as_mv.row = best_mv->as_mv.row << 3;
-  this_mv.as_mv.col = best_mv->as_mv.col << 3;
-
-  if (bestsad == INT_MAX)
-    return INT_MAX;
-
-  return
-      fn_ptr->vf(what, what_stride, best_address, in_what_stride,
-                 (unsigned int *)(&thissad)) +
-      mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
-                  xd->allow_high_precision_mv);
-}
-
-/* do_refine: If last step (1-away) of n-step search doesn't pick the center
-              point as the best match, we will do a final 1-away diamond
-              refining search  */
-int vp9_full_pixel_diamond(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *b,
-                           BLOCKD *d, int_mv *mvp_full, int step_param,
-                           int sadpb, int further_steps,
-                           int do_refine, vp9_variance_fn_ptr_t *fn_ptr,
-                           int_mv *ref_mv, int_mv *dst_mv) {
-  int_mv temp_mv;
-  int thissme, n, num00;
-  int bestsme = cpi->diamond_search_sad(x, b, d, mvp_full, &temp_mv,
-                                        step_param, sadpb, &num00,
-                                        fn_ptr, XMVCOST, ref_mv);
-  dst_mv->as_int = temp_mv.as_int;
-
-  n = num00;
-  num00 = 0;
-
-  /* If there won't be any more n-step searches, check whether the refining
-     search is needed. */
-  if (n > further_steps)
-    do_refine = 0;
-
-  while (n < further_steps) {
-    n++;
-
-    if (num00)
-      num00--;
-    else {
-      thissme = cpi->diamond_search_sad(x, b, d, mvp_full, &temp_mv,
-                                        step_param + n, sadpb, &num00,
-                                        fn_ptr, XMVCOST, ref_mv);
-
-      /* check to see if refining search is needed. */
-      if (num00 > (further_steps - n))
-        do_refine = 0;
-
-      if (thissme < bestsme) {
-        bestsme = thissme;
-        dst_mv->as_int = temp_mv.as_int;
-      }
-    }
-  }
-
-  /* final 1-away diamond refining search */
-  if (do_refine == 1) {
-    int search_range = 8;
-    int_mv best_mv;
-    best_mv.as_int = dst_mv->as_int;
-    thissme = cpi->refining_search_sad(x, b, d, &best_mv, sadpb, search_range,
-                                       fn_ptr, XMVCOST, ref_mv);
-
-    if (thissme < bestsme) {
-      bestsme = thissme;
-      dst_mv->as_int = best_mv.as_int;
-    }
-  }
-  return bestsme;
-}
-
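-/* Exhaustive search: evaluates every integer-pel candidate in a
-   (2 * distance) x (2 * distance) window around ref_mv, after the window is
-   clamped to the UMV borders. */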
-int vp9_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
-                        int sad_per_bit, int distance,
-                        vp9_variance_fn_ptr_t *fn_ptr, DEC_MVCOSTS,
-                        int_mv *center_mv) {
-  unsigned char *what = (*(b->base_src) + b->src);
-  int what_stride = b->src_stride;
-  unsigned char *in_what;
-  int in_what_stride = d->pre_stride;
-  int mv_stride = d->pre_stride;
-  unsigned char *bestaddress;
-  int_mv *best_mv = &d->bmi.as_mv.first;
-  int_mv this_mv;
-  int bestsad = INT_MAX;
-  int r, c;
-
-  unsigned char *check_here;
-  int thissad;
-  MACROBLOCKD *xd = &x->e_mbd;
-
-  int ref_row = ref_mv->as_mv.row;
-  int ref_col = ref_mv->as_mv.col;
-
-  int row_min = ref_row - distance;
-  int row_max = ref_row + distance;
-  int col_min = ref_col - distance;
-  int col_max = ref_col + distance;
-  int_mv fcenter_mv;
-
-  int *mvjsadcost = x->nmvjointsadcost;
-  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
-
-  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
-  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
-
-  // Work out the mid point for the search
-  in_what = *(d->base_pre) + d->pre;
-  bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
-
-  best_mv->as_mv.row = ref_row;
-  best_mv->as_mv.col = ref_col;
-
-  // Baseline value at the centre
-  bestsad = fn_ptr->sdf(what, what_stride, bestaddress,
-                        in_what_stride, 0x7fffffff)
-            + mvsad_err_cost(best_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);
-
-  // Apply further limits to prevent us from using vectors that stretch beyond the UMV border
-  if (col_min < x->mv_col_min)
-    col_min = x->mv_col_min;
-
-  if (col_max > x->mv_col_max)
-    col_max = x->mv_col_max;
-
-  if (row_min < x->mv_row_min)
-    row_min = x->mv_row_min;
-
-  if (row_max > x->mv_row_max)
-    row_max = x->mv_row_max;
-
-  for (r = row_min; r < row_max; r++) {
-    this_mv.as_mv.row = r;
-    check_here = r * mv_stride + in_what + col_min;
-
-    for (c = col_min; c < col_max; c++) {
-      thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
-
-      this_mv.as_mv.col = c;
-      thissad  += mvsad_err_cost(&this_mv, &fcenter_mv,
-                                 MVSADCOSTS, sad_per_bit);
-
-      if (thissad < bestsad) {
-        bestsad = thissad;
-        best_mv->as_mv.row = r;
-        best_mv->as_mv.col = c;
-        bestaddress = check_here;
-      }
-
-      check_here++;
-    }
-  }
-
-  this_mv.as_mv.row = best_mv->as_mv.row << 3;
-  this_mv.as_mv.col = best_mv->as_mv.col << 3;
-
-  if (bestsad < INT_MAX)
-    return
-        fn_ptr->vf(what, what_stride, bestaddress, in_what_stride,
-                   (unsigned int *)(&thissad)) +
-        mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
-                    xd->allow_high_precision_mv);
-  else
-    return INT_MAX;
-}
-
-int vp9_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
-                          int sad_per_bit, int distance,
-                          vp9_variance_fn_ptr_t *fn_ptr, DEC_MVCOSTS,
-                          int_mv *center_mv) {
-  unsigned char *what = (*(b->base_src) + b->src);
-  int what_stride = b->src_stride;
-  unsigned char *in_what;
-  int in_what_stride = d->pre_stride;
-  int mv_stride = d->pre_stride;
-  unsigned char *bestaddress;
-  int_mv *best_mv = &d->bmi.as_mv.first;
-  int_mv this_mv;
-  int bestsad = INT_MAX;
-  int r, c;
-
-  unsigned char *check_here;
-  unsigned int thissad;
-  MACROBLOCKD *xd = &x->e_mbd;
-
-  int ref_row = ref_mv->as_mv.row;
-  int ref_col = ref_mv->as_mv.col;
-
-  int row_min = ref_row - distance;
-  int row_max = ref_row + distance;
-  int col_min = ref_col - distance;
-  int col_max = ref_col + distance;
-
-  unsigned int sad_array[3];
-  int_mv fcenter_mv;
-
-  int *mvjsadcost = x->nmvjointsadcost;
-  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
-
-  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
-  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
-
-  // Work out the mid point for the search
-  in_what = *(d->base_pre) + d->pre;
-  bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
-
-  best_mv->as_mv.row = ref_row;
-  best_mv->as_mv.col = ref_col;
-
-  // Baseline value at the centre
-  bestsad = fn_ptr->sdf(what, what_stride,
-                        bestaddress, in_what_stride, 0x7fffffff)
-            + mvsad_err_cost(best_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);
-
-  // Apply further limits to prevent us from using vectors that stretch beyond the UMV border
-  if (col_min < x->mv_col_min)
-    col_min = x->mv_col_min;
-
-  if (col_max > x->mv_col_max)
-    col_max = x->mv_col_max;
-
-  if (row_min < x->mv_row_min)
-    row_min = x->mv_row_min;
-
-  if (row_max > x->mv_row_max)
-    row_max = x->mv_row_max;
-
-  for (r = row_min; r < row_max; r++) {
-    this_mv.as_mv.row = r;
-    check_here = r * mv_stride + in_what + col_min;
-    c = col_min;
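-    // Batch 3 columns per sdx3f call, then fall back to single-SAD calls for
-    // the leftover columns at the right edge of the row.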
-
-    while ((c + 2) < col_max) {
-      int i;
-
-      fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array);
-
-      for (i = 0; i < 3; i++) {
-        thissad = sad_array[i];
-
-        if (thissad < bestsad) {
-          this_mv.as_mv.col = c;
-          thissad  += mvsad_err_cost(&this_mv, &fcenter_mv,
-                                     MVSADCOSTS, sad_per_bit);
-
-          if (thissad < bestsad) {
-            bestsad = thissad;
-            best_mv->as_mv.row = r;
-            best_mv->as_mv.col = c;
-            bestaddress = check_here;
-          }
-        }
-
-        check_here++;
-        c++;
-      }
-    }
-
-    while (c < col_max) {
-      thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
-
-      if (thissad < bestsad) {
-        this_mv.as_mv.col = c;
-        thissad  += mvsad_err_cost(&this_mv, &fcenter_mv,
-                                   MVSADCOSTS, sad_per_bit);
-
-        if (thissad < bestsad) {
-          bestsad = thissad;
-          best_mv->as_mv.row = r;
-          best_mv->as_mv.col = c;
-          bestaddress = check_here;
-        }
-      }
-
-      check_here++;
-      c++;
-    }
-  }
-
-  this_mv.as_mv.row = best_mv->as_mv.row << 3;
-  this_mv.as_mv.col = best_mv->as_mv.col << 3;
-
-  if (bestsad < INT_MAX)
-    return
-        fn_ptr->vf(what, what_stride, bestaddress, in_what_stride,
-                   (unsigned int *)(&thissad)) +
-        mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
-                    xd->allow_high_precision_mv);
-  else
-    return INT_MAX;
-}
-
-int vp9_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
-                          int sad_per_bit, int distance,
-                          vp9_variance_fn_ptr_t *fn_ptr,
-                          DEC_MVCOSTS,
-                          int_mv *center_mv) {
-  unsigned char *what = (*(b->base_src) + b->src);
-  int what_stride = b->src_stride;
-  unsigned char *in_what;
-  int in_what_stride = d->pre_stride;
-  int mv_stride = d->pre_stride;
-  unsigned char *bestaddress;
-  int_mv *best_mv = &d->bmi.as_mv.first;
-  int_mv this_mv;
-  int bestsad = INT_MAX;
-  int r, c;
-
-  unsigned char *check_here;
-  unsigned int thissad;
-  MACROBLOCKD *xd = &x->e_mbd;
-
-  int ref_row = ref_mv->as_mv.row;
-  int ref_col = ref_mv->as_mv.col;
-
-  int row_min = ref_row - distance;
-  int row_max = ref_row + distance;
-  int col_min = ref_col - distance;
-  int col_max = ref_col + distance;
-
-  DECLARE_ALIGNED_ARRAY(16, unsigned short, sad_array8, 8);
-  unsigned int sad_array[3];
-  int_mv fcenter_mv;
-
-  int *mvjsadcost = x->nmvjointsadcost;
-  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
-
-  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
-  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
-
-  // Work out the mid point for the search
-  in_what = *(d->base_pre) + d->pre;
-  bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
-
-  best_mv->as_mv.row = ref_row;
-  best_mv->as_mv.col = ref_col;
-
-  // Baseline value at the centre
-  bestsad = fn_ptr->sdf(what, what_stride,
-                        bestaddress, in_what_stride, 0x7fffffff)
-            + mvsad_err_cost(best_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);
-
-  // Apply further limits to prevent us from using vectors that stretch beyond the UMV border
-  if (col_min < x->mv_col_min)
-    col_min = x->mv_col_min;
-
-  if (col_max > x->mv_col_max)
-    col_max = x->mv_col_max;
-
-  if (row_min < x->mv_row_min)
-    row_min = x->mv_row_min;
-
-  if (row_max > x->mv_row_max)
-    row_max = x->mv_row_max;
-
-  for (r = row_min; r < row_max; r++) {
-    this_mv.as_mv.row = r;
-    check_here = r * mv_stride + in_what + col_min;
-    c = col_min;
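-    // Widest batches first: 8 SADs per sdx8f call, then 3 per sdx3f call,
-    // then single-SAD calls for whatever remains in the row.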
-
-    while ((c + 7) < col_max) {
-      int i;
-
-      fn_ptr->sdx8f(what, what_stride, check_here, in_what_stride, sad_array8);
-
-      for (i = 0; i < 8; i++) {
-        thissad = (unsigned int)sad_array8[i];
-
-        if (thissad < bestsad) {
-          this_mv.as_mv.col = c;
-          thissad  += mvsad_err_cost(&this_mv, &fcenter_mv,
-                                     MVSADCOSTS, sad_per_bit);
-
-          if (thissad < bestsad) {
-            bestsad = thissad;
-            best_mv->as_mv.row = r;
-            best_mv->as_mv.col = c;
-            bestaddress = check_here;
-          }
-        }
-
-        check_here++;
-        c++;
-      }
-    }
-
-    while ((c + 2) < col_max) {
-      int i;
-
-      fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array);
-
-      for (i = 0; i < 3; i++) {
-        thissad = sad_array[i];
-
-        if (thissad < bestsad) {
-          this_mv.as_mv.col = c;
-          thissad  += mvsad_err_cost(&this_mv, &fcenter_mv,
-                                     MVSADCOSTS, sad_per_bit);
-
-          if (thissad < bestsad) {
-            bestsad = thissad;
-            best_mv->as_mv.row = r;
-            best_mv->as_mv.col = c;
-            bestaddress = check_here;
-          }
-        }
-
-        check_here++;
-        c++;
-      }
-    }
-
-    while (c < col_max) {
-      thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
-
-      if (thissad < bestsad) {
-        this_mv.as_mv.col = c;
-        thissad  += mvsad_err_cost(&this_mv, &fcenter_mv,
-                                   MVSADCOSTS, sad_per_bit);
-
-        if (thissad < bestsad) {
-          bestsad = thissad;
-          best_mv->as_mv.row = r;
-          best_mv->as_mv.col = c;
-          bestaddress = check_here;
-        }
-      }
-
-      check_here++;
-      c++;
-    }
-  }
-
-  this_mv.as_mv.row = best_mv->as_mv.row << 3;
-  this_mv.as_mv.col = best_mv->as_mv.col << 3;
-
-  if (bestsad < INT_MAX)
-    return
-        fn_ptr->vf(what, what_stride, bestaddress, in_what_stride,
-                   (unsigned int *)(&thissad)) +
-        mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
-                    xd->allow_high_precision_mv);
-  else
-    return INT_MAX;
-}
-
-int vp9_refining_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
-                            int error_per_bit, int search_range,
-                            vp9_variance_fn_ptr_t *fn_ptr, DEC_MVCOSTS,
-                            int_mv *center_mv) {
-  MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
-  int i, j;
-  short this_row_offset, this_col_offset;
-
-  int what_stride = b->src_stride;
-  int in_what_stride = d->pre_stride;
-  unsigned char *what = (*(b->base_src) + b->src);
-  unsigned char *best_address = (unsigned char *)(*(d->base_pre) + d->pre +
-                                                  (ref_mv->as_mv.row * (d->pre_stride)) + ref_mv->as_mv.col);
-  unsigned char *check_here;
-  unsigned int thissad;
-  int_mv this_mv;
-  unsigned int bestsad = INT_MAX;
-  MACROBLOCKD *xd = &x->e_mbd;
-  int_mv fcenter_mv;
-
-  int *mvjsadcost = x->nmvjointsadcost;
-  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
-
-  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
-  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
-
-  bestsad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride, 0x7fffffff) +
-      mvsad_err_cost(ref_mv, &fcenter_mv, MVSADCOSTS, error_per_bit);
-
-  for (i = 0; i < search_range; i++) {
-    int best_site = -1;
-
-    for (j = 0; j < 4; j++) {
-      this_row_offset = ref_mv->as_mv.row + neighbors[j].row;
-      this_col_offset = ref_mv->as_mv.col + neighbors[j].col;
-
-      if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
-          (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) {
-        check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col + best_address;
-        thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
-
-        if (thissad < bestsad) {
-          this_mv.as_mv.row = this_row_offset;
-          this_mv.as_mv.col = this_col_offset;
-          thissad += mvsad_err_cost(&this_mv, &fcenter_mv, MVSADCOSTS, error_per_bit);
-
-          if (thissad < bestsad) {
-            bestsad = thissad;
-            best_site = j;
-          }
-        }
-      }
-    }
-
-    if (best_site == -1)
-      break;
-    else {
-      ref_mv->as_mv.row += neighbors[best_site].row;
-      ref_mv->as_mv.col += neighbors[best_site].col;
-      best_address += (neighbors[best_site].row) * in_what_stride + neighbors[best_site].col;
-    }
-  }
-
-  this_mv.as_mv.row = ref_mv->as_mv.row << 3;
-  this_mv.as_mv.col = ref_mv->as_mv.col << 3;
-
-  if (bestsad < INT_MAX)
-    return
-        fn_ptr->vf(what, what_stride, best_address, in_what_stride,
-                   (unsigned int *)(&thissad)) +
-        mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
-                    xd->allow_high_precision_mv);
-  else
-    return INT_MAX;
-}
-
-int vp9_refining_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
-                              int_mv *ref_mv, int error_per_bit,
-                              int search_range, vp9_variance_fn_ptr_t *fn_ptr,
-                              DEC_MVCOSTS, int_mv *center_mv) {
-  MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
-  int i, j;
-  short this_row_offset, this_col_offset;
-
-  int what_stride = b->src_stride;
-  int in_what_stride = d->pre_stride;
-  unsigned char *what = (*(b->base_src) + b->src);
-  unsigned char *best_address = (unsigned char *)(*(d->base_pre) + d->pre +
-                                                  (ref_mv->as_mv.row * (d->pre_stride)) + ref_mv->as_mv.col);
-  unsigned char *check_here;
-  unsigned int thissad;
-  int_mv this_mv;
-  unsigned int bestsad = INT_MAX;
-  MACROBLOCKD *xd = &x->e_mbd;
-  int_mv fcenter_mv;
-
-  int *mvjsadcost = x->nmvjointsadcost;
-  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
-
-  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
-  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
-
-  bestsad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride, 0x7fffffff) +
-      mvsad_err_cost(ref_mv, &fcenter_mv, MVSADCOSTS, error_per_bit);
-
-  for (i = 0; i < search_range; i++) {
-    int best_site = -1;
-    int all_in = 1;
-
-    all_in &= ((ref_mv->as_mv.row - 1) > x->mv_row_min);
-    all_in &= ((ref_mv->as_mv.row + 1) < x->mv_row_max);
-    all_in &= ((ref_mv->as_mv.col - 1) > x->mv_col_min);
-    all_in &= ((ref_mv->as_mv.col + 1) < x->mv_col_max);
-
-    if (all_in) {
-      unsigned int sad_array[4];
-      unsigned char *block_offset[4];
-      block_offset[0] = best_address - in_what_stride;
-      block_offset[1] = best_address - 1;
-      block_offset[2] = best_address + 1;
-      block_offset[3] = best_address + in_what_stride;
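-      // The 4 offsets correspond, in order, to the neighbors[] entries:
-      // up (-stride), left (-1), right (+1), down (+stride).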
-
-      fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, sad_array);
-
-      for (j = 0; j < 4; j++) {
-        if (sad_array[j] < bestsad) {
-          this_mv.as_mv.row = ref_mv->as_mv.row + neighbors[j].row;
-          this_mv.as_mv.col = ref_mv->as_mv.col + neighbors[j].col;
-          sad_array[j] += mvsad_err_cost(&this_mv, &fcenter_mv, MVSADCOSTS, error_per_bit);
-
-          if (sad_array[j] < bestsad) {
-            bestsad = sad_array[j];
-            best_site = j;
-          }
-        }
-      }
-    } else {
-      for (j = 0; j < 4; j++) {
-        this_row_offset = ref_mv->as_mv.row + neighbors[j].row;
-        this_col_offset = ref_mv->as_mv.col + neighbors[j].col;
-
-        if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
-            (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) {
-          check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col + best_address;
-          thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
-
-          if (thissad < bestsad) {
-            this_mv.as_mv.row = this_row_offset;
-            this_mv.as_mv.col = this_col_offset;
-            thissad += mvsad_err_cost(&this_mv, &fcenter_mv, MVSADCOSTS, error_per_bit);
-
-            if (thissad < bestsad) {
-              bestsad = thissad;
-              best_site = j;
-            }
-          }
-        }
-      }
-    }
-
-    if (best_site == -1)
-      break;
-    else {
-      ref_mv->as_mv.row += neighbors[best_site].row;
-      ref_mv->as_mv.col += neighbors[best_site].col;
-      best_address += (neighbors[best_site].row) * in_what_stride + neighbors[best_site].col;
-    }
-  }
-
-  this_mv.as_mv.row = ref_mv->as_mv.row << 3;
-  this_mv.as_mv.col = ref_mv->as_mv.col << 3;
-
-  if (bestsad < INT_MAX)
-    return
-        fn_ptr->vf(what, what_stride, best_address, in_what_stride,
-                   (unsigned int *)(&thissad)) +
-        mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
-                    xd->allow_high_precision_mv);
-  else
-    return INT_MAX;
-}
-
-#ifdef ENTROPY_STATS
-void print_mode_context(void) {
-  FILE *f = fopen("modecont.c", "a");
-  int i, j;
-
-  fprintf(f, "#include \"entropy.h\"\n");
-  fprintf(f, "const int vp9_mode_contexts[6][4] =");
-  fprintf(f, "{\n");
-  for (j = 0; j < 6; j++) {
-    fprintf(f, "  {/* %d */ ", j);
-    fprintf(f, "    ");
-    for (i = 0; i < 4; i++) {
-      int this_prob;
-      int count;
-
-      // context probs
-      count = mv_ref_ct[j][i][0] + mv_ref_ct[j][i][1];
-      if (count)
-        this_prob = 256 * mv_ref_ct[j][i][0] / count;
-      else
-        this_prob = 128;
-
-      if (this_prob == 0)
-        this_prob = 1;
-      fprintf(f, "%5d, ", this_prob);
-    }
-    fprintf(f, "  },\n");
-  }
-
-  fprintf(f, "};\n");
-  fclose(f);
-}
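-
-/* Editor's note: print_mode_context() above derives each probability as
- * 256 * count(event 0) / (count 0 + count 1), defaulting to 128 (an even
- * split) when no events were seen and clamping to a minimum of 1 so that no
- * symbol is ever coded as impossible. For example, counts of 60 vs 20 give
- * 256 * 60 / 80 = 192. */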
-
-/* MV ref count ENTROPY_STATS stats code */
-void init_mv_ref_counts() {
-  vpx_memset(mv_ref_ct, 0, sizeof(mv_ref_ct));
-  vpx_memset(mv_mode_cts, 0, sizeof(mv_mode_cts));
-}
-
-void accum_mv_refs(MB_PREDICTION_MODE m, const int ct[4]) {
-  if (m == ZEROMV) {
-    ++mv_ref_ct[ct[0]][0][0];
-    ++mv_mode_cts[0][0];
-  } else {
-    ++mv_ref_ct[ct[0]][0][1];
-    ++mv_mode_cts[0][1];
-
-    if (m == NEARESTMV) {
-      ++mv_ref_ct[ct[1]][1][0];
-      ++mv_mode_cts[1][0];
-    } else {
-      ++mv_ref_ct[ct[1]][1][1];
-      ++mv_mode_cts[1][1];
-
-      if (m == NEARMV) {
-        ++mv_ref_ct[ct[2]][2][0];
-        ++mv_mode_cts[2][0];
-      } else {
-        ++mv_ref_ct[ct[2]][2][1];
-        ++mv_mode_cts[2][1];
-
-        if (m == NEWMV) {
-          ++mv_ref_ct[ct[3]][3][0];
-          ++mv_mode_cts[3][0];
-        } else {
-          ++mv_ref_ct[ct[3]][3][1];
-          ++mv_mode_cts[3][1];
-        }
-      }
-    }
-  }
-}
-
-#endif  /* END MV ref count ENTROPY_STATS code */
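-
-/* Editor's note: accum_mv_refs() walks the inter-mode decision as a binary
- * tree -- ZEROMV vs the rest, then NEARESTMV, NEARMV, and finally NEWMV --
- * recording a hit/miss pair at each level, indexed by the matching context
- * count ct[n]. A sketch of the same logic as a loop (editorial, not in the
- * patch):
- *
- *   static const MB_PREDICTION_MODE order[4] =
- *       { ZEROMV, NEARESTMV, NEARMV, NEWMV };
- *   for (i = 0; i < 4; i++) {
- *     const int hit = (m == order[i]);
- *     ++mv_ref_ct[ct[i]][i][hit ? 0 : 1];
- *     ++mv_mode_cts[i][hit ? 0 : 1];
- *     if (hit) break;
- *   }
- */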
--- a/vp8/encoder/mcomp.h
+++ /dev/null
@@ -1,159 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_MCOMP_H
-#define __INC_MCOMP_H
-
-#include "block.h"
-#include "variance.h"
-
-#define MVCOSTS mvjcost, mvcost
-#define MVSADCOSTS mvjsadcost, mvsadcost
-#define DEC_MVCOSTS int *mvjcost, int *mvcost[2]
-#define DEC_MVSADCOSTS int *mvjsadcost, int *mvsadcost[2]
-#define NULLMVCOST NULL, NULL
-#define XMVCOST x->nmvjointcost, (x->e_mbd.allow_high_precision_mv?x->nmvcost_hp:x->nmvcost)
-
-#ifdef ENTROPY_STATS
-extern void init_mv_ref_counts();
-extern void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]);
-#endif
-
-
-#define MAX_MVSEARCH_STEPS 8                                    // The maximum number of steps in a step search given the largest allowed initial step
-#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS)) - 1)      // Max full pel mv specified in 1 pel units
-#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1))            // Maximum size of the first step in full pel units
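-
-/* Editor's note: with MAX_MVSEARCH_STEPS at 8 these work out to
- * MAX_FULL_PEL_VAL = (1 << 8) - 1 = 255 full pels and
- * MAX_FIRST_STEP = 1 << 7 = 128 full pels. */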
-
-extern void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv);
-extern int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, DEC_MVCOSTS,
-                           int Weight, int ishp);
-extern void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride);
-extern void vp9_init3smotion_compensation(MACROBLOCK *x,  int stride);
-// Runs sequence of diamond searches in smaller steps for RD
-struct VP9_COMP;
-int vp9_full_pixel_diamond(struct VP9_COMP *cpi, MACROBLOCK *x, BLOCK *b,
-                           BLOCKD *d, int_mv *mvp_full, int step_param,
-                           int sadpb, int further_steps, int do_refine,
-                           vp9_variance_fn_ptr_t *fn_ptr,
-                           int_mv *ref_mv, int_mv *dst_mv);
-
-extern int vp9_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
-                          int_mv *ref_mv, int_mv *best_mv,
-                          int search_param, int error_per_bit,
-                          const vp9_variance_fn_ptr_t *vf,
-                          DEC_MVSADCOSTS, DEC_MVCOSTS,
-                          int_mv *center_mv);
-
-typedef int (fractional_mv_step_fp)
-(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *bestmv, int_mv *ref_mv,
- int error_per_bit, const vp9_variance_fn_ptr_t *vfp, DEC_MVCOSTS,
- int *distortion, unsigned int *sse);
-extern fractional_mv_step_fp vp9_find_best_sub_pixel_step_iteratively;
-extern fractional_mv_step_fp vp9_find_best_sub_pixel_step;
-extern fractional_mv_step_fp vp9_find_best_half_pixel_step;
-
-#define prototype_full_search_sad(sym)\
-  int (sym)\
-  (\
-   MACROBLOCK *x, \
-   BLOCK *b, \
-   BLOCKD *d, \
-   int_mv *ref_mv, \
-   int sad_per_bit, \
-   int distance, \
-   vp9_variance_fn_ptr_t *fn_ptr, \
-   DEC_MVSADCOSTS, \
-   int_mv *center_mv \
-  )
-
-#define prototype_refining_search_sad(sym)\
-  int (sym)\
-  (\
-   MACROBLOCK *x, \
-   BLOCK *b, \
-   BLOCKD *d, \
-   int_mv *ref_mv, \
-   int sad_per_bit, \
-   int distance, \
-   vp9_variance_fn_ptr_t *fn_ptr, \
-   DEC_MVSADCOSTS, \
-   int_mv *center_mv \
-  )
-
-#define prototype_diamond_search_sad(sym)\
-  int (sym)\
-  (\
-   MACROBLOCK *x, \
-   BLOCK *b, \
-   BLOCKD *d, \
-   int_mv *ref_mv, \
-   int_mv *best_mv, \
-   int search_param, \
-   int sad_per_bit, \
-   int *num00, \
-   vp9_variance_fn_ptr_t *fn_ptr, \
-   DEC_MVSADCOSTS, \
-   int_mv *center_mv \
-  )
-
-#if ARCH_X86 || ARCH_X86_64
-#include "x86/mcomp_x86.h"
-#endif
-
-typedef prototype_full_search_sad(*vp9_full_search_fn_t);
-extern prototype_full_search_sad(vp9_full_search_sad);
-extern prototype_full_search_sad(vp9_full_search_sadx3);
-extern prototype_full_search_sad(vp9_full_search_sadx8);
-
-typedef prototype_refining_search_sad(*vp9_refining_search_fn_t);
-extern prototype_refining_search_sad(vp9_refining_search_sad);
-extern prototype_refining_search_sad(vp9_refining_search_sadx4);
-
-typedef prototype_diamond_search_sad(*vp9_diamond_search_fn_t);
-extern prototype_diamond_search_sad(vp9_diamond_search_sad);
-extern prototype_diamond_search_sad(vp9_diamond_search_sadx4);
-
-#ifndef vp9_search_full_search
-#define vp9_search_full_search vp9_full_search_sad
-#endif
-extern prototype_full_search_sad(vp9_search_full_search);
-
-#ifndef vp9_search_refining_search
-#define vp9_search_refining_search vp9_refining_search_sad
-#endif
-extern prototype_refining_search_sad(vp9_search_refining_search);
-
-#ifndef vp9_search_diamond_search
-#define vp9_search_diamond_search vp9_diamond_search_sad
-#endif
-extern prototype_diamond_search_sad(vp9_search_diamond_search);
-
-typedef struct {
-  prototype_full_search_sad(*full_search);
-  prototype_refining_search_sad(*refining_search);
-  prototype_diamond_search_sad(*diamond_search);
-} vp9_search_rtcd_vtable_t;
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define SEARCH_INVOKE(ctx,fn) (ctx)->fn
-#else
-#define SEARCH_INVOKE(ctx,fn) vp9_search_##fn
-#endif
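-
-/* Editor's note: SEARCH_INVOKE mirrors the RTCD pattern used elsewhere in
- * the codec: with runtime CPU detection the search function comes from a
- * vtable filled in at init time, otherwise the macro collapses to a direct
- * call resolved at compile time. A hypothetical call site, following the
- * prototype_full_search_sad signature above:
- *
- *   int sad = SEARCH_INVOKE(rtcd, full_search)(x, b, d, &ref_mv,
- *                                              sad_per_bit, distance,
- *                                              fn_ptr, mvjsadcost,
- *                                              mvsadcost, &center_mv);
- */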
-
-#endif
--- a/vp8/encoder/modecosts.c
+++ /dev/null
@@ -1,49 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/common/blockd.h"
-#include "onyx_int.h"
-#include "treewriter.h"
-#include "vp8/common/entropymode.h"
-
-
-void vp9_init_mode_costs(VP9_COMP *c) {
-  VP9_COMMON *x = &c->common;
-  const vp9_tree_p T = vp9_bmode_tree;
-  int i, j;
-
-  for (i = 0; i < VP9_BINTRAMODES; i++) {
-    for (j = 0; j < VP9_BINTRAMODES; j++) {
-      vp9_cost_tokens((int *)c->mb.bmode_costs[i][j],
-                      x->kf_bmode_prob[i][j], T);
-    }
-  }
-
-  vp9_cost_tokens((int *)c->mb.inter_bmode_costs, x->fc.bmode_prob, T);
-  vp9_cost_tokens((int *)c->mb.inter_bmode_costs,
-                  x->fc.sub_mv_ref_prob[0], vp9_sub_mv_ref_tree);
-
-  vp9_cost_tokens(c->mb.mbmode_cost[1], x->fc.ymode_prob, vp9_ymode_tree);
-  vp9_cost_tokens(c->mb.mbmode_cost[0],
-                  x->kf_ymode_prob[c->common.kf_ymode_probs_index],
-                  vp9_kf_ymode_tree);
-  vp9_cost_tokens(c->mb.intra_uv_mode_cost[1],
-                  x->fc.uv_mode_prob[VP9_YMODES - 1], vp9_uv_mode_tree);
-  vp9_cost_tokens(c->mb.intra_uv_mode_cost[0],
-                  x->kf_uv_mode_prob[VP9_YMODES - 1], vp9_uv_mode_tree);
-  vp9_cost_tokens(c->mb.i8x8_mode_costs,
-                  x->fc.i8x8_mode_prob, vp9_i8x8_mode_tree);
-
-  for (i = 0; i <= VP9_SWITCHABLE_FILTERS; ++i)
-    vp9_cost_tokens((int *)c->mb.switchable_interp_costs[i],
-                    x->fc.switchable_interp_prob[i],
-                    vp9_switchable_interp_tree);
-}
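-
-/* Editor's note: each vp9_cost_tokens() call above converts a probability
- * tree into a per-symbol cost table, so at RD time the rate of a mode is a
- * single lookup; the costs are bit counts scaled by 256, i.e. roughly
- * 256 * log2(1/p) for a symbol of probability p. */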
--- a/vp8/encoder/modecosts.h
+++ /dev/null
@@ -1,17 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_MODECOSTS_H
-#define __INC_MODECOSTS_H
-
-void vp9_init_mode_costs(VP9_COMP *x);
-
-#endif
--- a/vp8/encoder/onyx_if.c
+++ /dev/null
@@ -1,4486 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_config.h"
-#include "vp8/common/onyxc_int.h"
-#include "onyx_int.h"
-#include "vp8/common/systemdependent.h"
-#include "quantize.h"
-#include "vp8/common/alloccommon.h"
-#include "mcomp.h"
-#include "firstpass.h"
-#include "psnr.h"
-#include "vpx_scale/vpxscale.h"
-#include "vp8/common/extend.h"
-#include "ratectrl.h"
-#include "vp8/common/quant_common.h"
-#include "segmentation.h"
-#include "vpx_scale/yv12extend.h"
-#if CONFIG_POSTPROC
-#include "vp8/common/postproc.h"
-#endif
-#include "vpx_mem/vpx_mem.h"
-#include "vp8/common/swapyv12buffer.h"
-#include "vpx_ports/vpx_timer.h"
-#include "temporal_filter.h"
-
-#include "vp8/common/seg_common.h"
-#include "mbgraph.h"
-#include "vp8/common/pred_common.h"
-#include "vp8/encoder/rdopt.h"
-#include "bitstream.h"
-#include "ratectrl.h"
-
-#if CONFIG_NEWBESTREFMV
-#include "vp8/common/mvref_common.h"
-#endif
-
-#if ARCH_ARM
-#include "vpx_ports/arm.h"
-#endif
-
-#include <math.h>
-#include <stdio.h>
-#include <limits.h>
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define IF_RTCD(x) (x)
-#define RTCD(x) &cpi->common.rtcd.x
-#else
-#define IF_RTCD(x) NULL
-#define RTCD(x) NULL
-#endif
-
-extern void vp9_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi);
-
-extern void vp9_set_alt_lf_level(VP9_COMP *cpi, int filt_val);
-
-extern void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi);
-
-extern void vp9_cmachine_specific_config(VP9_COMP *cpi);
-
-extern void vp9_deblock_frame(YV12_BUFFER_CONFIG *source,
-                              YV12_BUFFER_CONFIG *post,
-                              int filt_lvl, int low_var_thresh, int flag);
-
-extern void print_tree_update_probs();
-
-#if HAVE_ARMV7
-extern void vp8_yv12_copy_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc,
-                                          YV12_BUFFER_CONFIG *dst_ybc);
-
-extern void vp8_yv12_copy_src_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc,
-                                              YV12_BUFFER_CONFIG *dst_ybc);
-#endif
-
-int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest);
-
-extern void vp9_temporal_filter_prepare_c(VP9_COMP *cpi, int distance);
-
-static void set_default_lf_deltas(VP9_COMP *cpi);
-
-#define DEFAULT_INTERP_FILTER EIGHTTAP  /* SWITCHABLE for better performance */
-#define SEARCH_BEST_FILTER 0            /* to search exhaustively for
-                                           best filter */
-#define RESET_FOREACH_FILTER 0          /* whether to reset the encoder state
-                                           before trying each new filter */
-#define SHARP_FILTER_QTHRESH 0          /* Q threshold for 8-tap sharp filter */
-
-#define ALTREF_HIGH_PRECISION_MV 1      /* whether to use high precision mv
-                                           for altref computation */
-#define HIGH_PRECISION_MV_QTHRESH 200   /* Q threshold for use of high precision
-                                           mv. Choose a very high value for
-                                           now so that HIGH_PRECISION is always
-                                           chosen */
-
-#if CONFIG_INTERNAL_STATS
-#include "math.h"
-
-extern double vp9_calc_ssim(YV12_BUFFER_CONFIG *source,
-                            YV12_BUFFER_CONFIG *dest, int lumamask,
-                            double *weight);
-
-
-extern double vp9_calc_ssimg(YV12_BUFFER_CONFIG *source,
-                             YV12_BUFFER_CONFIG *dest, double *ssim_y,
-                             double *ssim_u, double *ssim_v);
-
-
-#endif
-
-// #define OUTPUT_YUV_REC
-
-#ifdef OUTPUT_YUV_SRC
-FILE *yuv_file;
-#endif
-#ifdef OUTPUT_YUV_REC
-FILE *yuv_rec_file;
-#endif
-
-#if 0
-FILE *framepsnr;
-FILE *kf_list;
-FILE *keyfile;
-#endif
-
-#if 0
-extern int skip_true_count;
-extern int skip_false_count;
-#endif
-
-
-#ifdef ENTROPY_STATS
-extern int intra_mode_stats[VP9_BINTRAMODES][VP9_BINTRAMODES][VP9_BINTRAMODES];
-#endif
-
-#ifdef NMV_STATS
-extern void init_nmvstats();
-extern void print_nmvstats();
-#endif
-
-#ifdef SPEEDSTATS
-unsigned int frames_at_speed[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-#endif
-
-#if defined(SECTIONBITS_OUTPUT)
-extern unsigned __int64 Sectionbits[500];
-#endif
-#ifdef MODE_STATS
-extern INT64 Sectionbits[500];
-extern unsigned int y_modes[VP9_YMODES];
-extern unsigned int i8x8_modes[VP9_I8X8_MODES];
-extern unsigned int uv_modes[VP9_UV_MODES];
-extern unsigned int uv_modes_y[VP9_YMODES][VP9_UV_MODES];
-extern unsigned int b_modes[B_MODE_COUNT];
-extern unsigned int inter_y_modes[MB_MODE_COUNT];
-extern unsigned int inter_uv_modes[VP9_UV_MODES];
-extern unsigned int inter_b_modes[B_MODE_COUNT];
-#endif
-
-extern void vp9_init_quantizer(VP9_COMP *cpi);
-
-static int base_skip_false_prob[QINDEX_RANGE][3];
-
-// Tables relating active max Q to active min Q
-static int kf_low_motion_minq[QINDEX_RANGE];
-static int kf_high_motion_minq[QINDEX_RANGE];
-static int gf_low_motion_minq[QINDEX_RANGE];
-static int gf_high_motion_minq[QINDEX_RANGE];
-static int inter_minq[QINDEX_RANGE];
-
-// Functions to compute the active minq lookup table entries based on a
-// formulaic approach to facilitate easier adjustment of the Q tables.
-// The formulae were derived from computing a 3rd order polynomial best
-// fit to the original data (after plotting real maxq vs minq (not q index))
-static int calculate_minq_index(double maxq,
-                                double x3, double x2, double x, double c) {
-  int i;
-  double minqtarget;
-  double thisq;
-
-  minqtarget = ((x3 * maxq * maxq * maxq) +
-                (x2 * maxq * maxq) +
-                (x * maxq) +
-                c);
-
-  if (minqtarget > maxq)
-    minqtarget = maxq;
-
-  for (i = 0; i < QINDEX_RANGE; i++) {
-    thisq = vp9_convert_qindex_to_q(i);
-    if (minqtarget <= thisq)
-      return i;
-  }
-  return QINDEX_RANGE - 1;
-}
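-
-/* Editor's note: a worked example of the cubic fit, using the inter case
- * below (0.00000271, -0.00113, 0.697, 0.0) at maxq = 100:
- *   minqtarget = 0.00000271 * 1e6 - 0.00113 * 1e4 + 0.697 * 100
- *              = 2.71 - 11.3 + 69.7 = 61.11
- * and the loop then returns the first q index whose real Q reaches 61.11. */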
-
-static void init_minq_luts(void) {
-  int i;
-  double maxq;
-
-  for (i = 0; i < QINDEX_RANGE; i++) {
-    maxq = vp9_convert_qindex_to_q(i);
-
-
-    kf_low_motion_minq[i] = calculate_minq_index(maxq,
-                                                 0.0000003,
-                                                 -0.000015,
-                                                 0.074,
-                                                 0.0);
-    kf_high_motion_minq[i] = calculate_minq_index(maxq,
-                                                  0.0000004,
-                                                  -0.000125,
-                                                  0.14,
-                                                  0.0);
-    gf_low_motion_minq[i] = calculate_minq_index(maxq,
-                                                 0.0000015,
-                                                 -0.0009,
-                                                 0.33,
-                                                 0.0);
-    gf_high_motion_minq[i] = calculate_minq_index(maxq,
-                                                  0.0000021,
-                                                  -0.00125,
-                                                  0.45,
-                                                  0.0);
-    inter_minq[i] = calculate_minq_index(maxq,
-                                         0.00000271,
-                                         -0.00113,
-                                         0.697,
-                                         0.0);
-
-  }
-}
-
-static void init_base_skip_probs(void) {
-  int i;
-  double q;
-  int skip_prob, t;
-
-  for (i = 0; i < QINDEX_RANGE; i++) {
-    q = vp9_convert_qindex_to_q(i);
-
-    // Exponential decay calculation of baseline skip prob with clamping
-    // Based on crude best fit of old table.
-    t = (int)(564.25 * pow(2.71828, (-0.012 * q)));
-
-    skip_prob = t;
-    if (skip_prob < 1)
-      skip_prob = 1;
-    else if (skip_prob > 255)
-      skip_prob = 255;
-    base_skip_false_prob[i][1] = skip_prob;
-
-    skip_prob = t * 0.75;
-    if (skip_prob < 1)
-      skip_prob = 1;
-    else if (skip_prob > 255)
-      skip_prob = 255;
-    base_skip_false_prob[i][2] = skip_prob;
-
-    skip_prob = t * 1.25;
-    if (skip_prob < 1)
-      skip_prob = 1;
-    else if (skip_prob > 255)
-      skip_prob = 255;
-    base_skip_false_prob[i][0] = skip_prob;
-  }
-}
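-
-/* Editor's note: the three clamp blocks above are identical except for the
- * scale factor; a small helper would make that explicit (editorial sketch,
- * not part of the patch):
- *
- *   static int clamp_prob(double v) {
- *     return v < 1 ? 1 : (v > 255 ? 255 : (int)v);
- *   }
- *   base_skip_false_prob[i][1] = clamp_prob(t);
- *   base_skip_false_prob[i][2] = clamp_prob(t * 0.75);
- *   base_skip_false_prob[i][0] = clamp_prob(t * 1.25);
- */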
-
-static void update_base_skip_probs(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-
-  if (cm->frame_type != KEY_FRAME) {
-    vp9_update_skip_probs(cpi);
-
-    if (cm->refresh_alt_ref_frame) {
-      int k;
-      for (k = 0; k < MBSKIP_CONTEXTS; ++k)
-        cpi->last_skip_false_probs[2][k] = cm->mbskip_pred_probs[k];
-      cpi->last_skip_probs_q[2] = cm->base_qindex;
-    } else if (cpi->common.refresh_golden_frame) {
-      int k;
-      for (k = 0; k < MBSKIP_CONTEXTS; ++k)
-        cpi->last_skip_false_probs[1][k] = cm->mbskip_pred_probs[k];
-      cpi->last_skip_probs_q[1] = cm->base_qindex;
-    } else {
-      int k;
-      for (k = 0; k < MBSKIP_CONTEXTS; ++k)
-        cpi->last_skip_false_probs[0][k] = cm->mbskip_pred_probs[k];
-      cpi->last_skip_probs_q[0] = cm->base_qindex;
-
-      // update the baseline table for the current q
-      for (k = 0; k < MBSKIP_CONTEXTS; ++k)
-        cpi->base_skip_false_prob[cm->base_qindex][k] =
-          cm->mbskip_pred_probs[k];
-    }
-  }
-
-}
-
-void vp9_initialize_enc() {
-  static int init_done = 0;
-
-  if (!init_done) {
-    vp8_scale_machine_specific_config();
-    vp9_initialize_common();
-    vp9_tokenize_initialize();
-    vp9_init_quant_tables();
-    vp9_init_me_luts();
-    init_minq_luts();
-    init_base_skip_probs();
-    init_done = 1;
-  }
-}
-#ifdef PACKET_TESTING
-extern FILE *vpxlogc;
-#endif
-
-static void setup_features(VP9_COMP *cpi) {
-  MACROBLOCKD *xd = &cpi->mb.e_mbd;
-
-  // Set up default state for MB feature flags
-
-  xd->segmentation_enabled = 0;   // Default segmentation disabled
-
-  xd->update_mb_segmentation_map = 0;
-  xd->update_mb_segmentation_data = 0;
-  vpx_memset(xd->mb_segment_tree_probs, 255, sizeof(xd->mb_segment_tree_probs));
-
-  vp9_clearall_segfeatures(xd);
-
-  xd->mode_ref_lf_delta_enabled = 0;
-  xd->mode_ref_lf_delta_update = 0;
-  vpx_memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas));
-  vpx_memset(xd->mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas));
-  vpx_memset(xd->last_ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas));
-  vpx_memset(xd->last_mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas));
-
-  set_default_lf_deltas(cpi);
-
-}
-
-
-static void dealloc_compressor_data(VP9_COMP *cpi) {
-  vpx_free(cpi->tplist);
-  cpi->tplist = NULL;
-
-  // Delete last frame MV storage buffers
-  vpx_free(cpi->lfmv);
-  cpi->lfmv = 0;
-
-  vpx_free(cpi->lf_ref_frame_sign_bias);
-  cpi->lf_ref_frame_sign_bias = 0;
-
-  vpx_free(cpi->lf_ref_frame);
-  cpi->lf_ref_frame = 0;
-
-  // Delete segmentation map
-  vpx_free(cpi->segmentation_map);
-  cpi->segmentation_map = 0;
-  vpx_free(cpi->common.last_frame_seg_map);
-  cpi->common.last_frame_seg_map = 0;
-  vpx_free(cpi->coding_context.last_frame_seg_map_copy);
-  cpi->coding_context.last_frame_seg_map_copy = 0;
-
-  vpx_free(cpi->active_map);
-  cpi->active_map = 0;
-
-  vp9_de_alloc_frame_buffers(&cpi->common);
-
-  vp8_yv12_de_alloc_frame_buffer(&cpi->last_frame_uf);
-  vp8_yv12_de_alloc_frame_buffer(&cpi->scaled_source);
-#if VP9_TEMPORAL_ALT_REF
-  vp8_yv12_de_alloc_frame_buffer(&cpi->alt_ref_buffer);
-#endif
-  vp9_lookahead_destroy(cpi->lookahead);
-
-  vpx_free(cpi->tok);
-  cpi->tok = 0;
-
-  // Structure used to monitor GF usage
-  vpx_free(cpi->gf_active_flags);
-  cpi->gf_active_flags = 0;
-
-  // Activity mask based per mb zbin adjustments
-  vpx_free(cpi->mb_activity_map);
-  cpi->mb_activity_map = 0;
-  vpx_free(cpi->mb_norm_activity_map);
-  cpi->mb_norm_activity_map = 0;
-
-  vpx_free(cpi->mb.pip);
-  cpi->mb.pip = 0;
-
-  vpx_free(cpi->twopass.total_stats);
-  cpi->twopass.total_stats = 0;
-
-  vpx_free(cpi->twopass.total_left_stats);
-  cpi->twopass.total_left_stats = 0;
-
-  vpx_free(cpi->twopass.this_frame_stats);
-  cpi->twopass.this_frame_stats = 0;
-}
-
-// Computes a q delta (in "q index" terms) to get from a starting q value
-// to a target q value
-static int compute_qdelta(VP9_COMP *cpi, double qstart, double qtarget) {
-  int i;
-  int start_index = cpi->worst_quality;
-  int target_index = cpi->worst_quality;
-
-  // Convert the average q value to an index.
-  for (i = cpi->best_quality; i < cpi->worst_quality; i++) {
-    start_index = i;
-    if (vp9_convert_qindex_to_q(i) >= qstart)
-      break;
-  }
-
-  // Convert the q target to an index
-  for (i = cpi->best_quality; i < cpi->worst_quality; i++) {
-    target_index = i;
-    if (vp9_convert_qindex_to_q(i) >= qtarget)
-      break;
-  }
-
-  return target_index - start_index;
-}
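-
-/* Editor's note: compute_qdelta() returns a signed offset in q-index units;
- * for example, init_seg_features() below uses
- *   qi_delta = compute_qdelta(cpi, cpi->avg_q, cpi->avg_q * 0.875);
- * to find how many index steps drop the real Q by 12.5%. */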
-
-static void init_seg_features(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &cpi->mb.e_mbd;
-
-  int high_q = (int)(cpi->avg_q > 48.0);
-  int qi_delta;
-
-  // Disable and clear down for KF
-  if (cm->frame_type == KEY_FRAME) {
-    // Clear down the global segmentation map
-    vpx_memset(cpi->segmentation_map, 0, (cm->mb_rows * cm->mb_cols));
-    xd->update_mb_segmentation_map = 0;
-    xd->update_mb_segmentation_data = 0;
-    cpi->static_mb_pct = 0;
-
-    // Disable segmentation
-    vp9_disable_segmentation((VP9_PTR)cpi);
-
-    // Clear down the segment features.
-    vp9_clearall_segfeatures(xd);
-  }
-
-  // If this is an alt ref frame
-  else if (cm->refresh_alt_ref_frame) {
-    // Clear down the global segmentation map
-    vpx_memset(cpi->segmentation_map, 0, (cm->mb_rows * cm->mb_cols));
-    xd->update_mb_segmentation_map = 0;
-    xd->update_mb_segmentation_data = 0;
-    cpi->static_mb_pct = 0;
-
-    // Disable segmentation and individual segment features by default
-    vp9_disable_segmentation((VP9_PTR)cpi);
-    vp9_clearall_segfeatures(xd);
-
-    // Scan frames from current to arf frame.
-    // This function re-enables segmentation if appropriate.
-    vp9_update_mbgraph_stats(cpi);
-
-    // If segmentation was enabled set those features needed for the
-    // arf itself.
-    if (xd->segmentation_enabled) {
-      xd->update_mb_segmentation_map = 1;
-      xd->update_mb_segmentation_data = 1;
-
-      qi_delta = compute_qdelta(cpi, cpi->avg_q, (cpi->avg_q * 0.875));
-      vp9_set_segdata(xd, 1, SEG_LVL_ALT_Q, (qi_delta - 2));
-      vp9_set_segdata(xd, 1, SEG_LVL_ALT_LF, -2);
-
-      vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_Q);
-      vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_LF);
-
-      // Where relevant assume segment data is delta data
-      xd->mb_segment_abs_delta = SEGMENT_DELTADATA;
-
-    }
-  }
-  // All other frames if segmentation has been enabled
-  else if (xd->segmentation_enabled) {
-    // First normal frame in a valid gf or alt ref group
-    if (cpi->common.frames_since_golden == 0) {
-      // Set up segment features for normal frames in an arf group
-      if (cpi->source_alt_ref_active) {
-        xd->update_mb_segmentation_map = 0;
-        xd->update_mb_segmentation_data = 1;
-        xd->mb_segment_abs_delta = SEGMENT_DELTADATA;
-
-        qi_delta = compute_qdelta(cpi, cpi->avg_q,
-                                  (cpi->avg_q * 1.125));
-        vp9_set_segdata(xd, 1, SEG_LVL_ALT_Q, (qi_delta + 2));
-        vp9_set_segdata(xd, 1, SEG_LVL_ALT_Q, 0);
-        vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_Q);
-
-        vp9_set_segdata(xd, 1, SEG_LVL_ALT_LF, -2);
-        vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_LF);
-
-        // Segment coding disabled for compred testing
-        if (high_q || (cpi->static_mb_pct == 100)) {
-          // set_segref(xd, 1, LAST_FRAME);
-          vp9_set_segref(xd, 1, ALTREF_FRAME);
-          vp9_enable_segfeature(xd, 1, SEG_LVL_REF_FRAME);
-
-          vp9_set_segdata(xd, 1, SEG_LVL_MODE, ZEROMV);
-          vp9_enable_segfeature(xd, 1, SEG_LVL_MODE);
-
-          // EOB segment coding not fixed for 8x8 yet
-          vp9_set_segdata(xd, 1, SEG_LVL_EOB, 0);
-          vp9_enable_segfeature(xd, 1, SEG_LVL_EOB);
-        }
-      }
-      // Disable segmentation and clear down features if alt ref
-      // is not active for this group
-      else {
-        vp9_disable_segmentation((VP9_PTR)cpi);
-
-        vpx_memset(cpi->segmentation_map, 0,
-                   (cm->mb_rows * cm->mb_cols));
-
-        xd->update_mb_segmentation_map = 0;
-        xd->update_mb_segmentation_data = 0;
-
-        vp9_clearall_segfeatures(xd);
-      }
-    }
-
-    // Special case where we are coding over the top of a previous
-    // alt ref frame
-    // Segment coding disabled for compred testing
-    else if (cpi->is_src_frame_alt_ref) {
-      // Enable mode and ref frame features for segment 0 as well
-      vp9_enable_segfeature(xd, 0, SEG_LVL_REF_FRAME);
-      vp9_enable_segfeature(xd, 0, SEG_LVL_MODE);
-      vp9_enable_segfeature(xd, 1, SEG_LVL_REF_FRAME);
-      vp9_enable_segfeature(xd, 1, SEG_LVL_MODE);
-
-      // All mbs should use ALTREF_FRAME, ZEROMV exclusively
-      vp9_clear_segref(xd, 0);
-      vp9_set_segref(xd, 0, ALTREF_FRAME);
-      vp9_clear_segref(xd, 1);
-      vp9_set_segref(xd, 1, ALTREF_FRAME);
-      vp9_set_segdata(xd, 0, SEG_LVL_MODE, ZEROMV);
-      vp9_set_segdata(xd, 1, SEG_LVL_MODE, ZEROMV);
-
-      // Skip all MBs if high Q
-      if (high_q) {
-        vp9_enable_segfeature(xd, 0, SEG_LVL_EOB);
-        vp9_set_segdata(xd, 0, SEG_LVL_EOB, 0);
-        vp9_enable_segfeature(xd, 1, SEG_LVL_EOB);
-        vp9_set_segdata(xd, 1, SEG_LVL_EOB, 0);
-      }
-      // Enable data update
-      xd->update_mb_segmentation_data = 1;
-    }
-    // All other frames.
-    else {
-      // No updates.. leave things as they are.
-      xd->update_mb_segmentation_map = 0;
-      xd->update_mb_segmentation_data = 0;
-    }
-  }
-}
-
-// DEBUG: Print out the segment id of each MB in the current frame.
-static void print_seg_map(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-  int row, col;
-  int map_index = 0;
-  FILE *statsfile;
-
-  statsfile = fopen("segmap.stt", "a");
-
-  fprintf(statsfile, "%10d\n",
-          cm->current_video_frame);
-
-  for (row = 0; row < cpi->common.mb_rows; row++) {
-    for (col = 0; col < cpi->common.mb_cols; col++) {
-      fprintf(statsfile, "%10d",
-              cpi->segmentation_map[map_index]);
-      map_index++;
-    }
-    fprintf(statsfile, "\n");
-  }
-  fprintf(statsfile, "\n");
-
-  fclose(statsfile);
-}
-
-static void update_reference_segmentation_map(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-  int row, col;
-  int sb_rows = (cm->mb_rows + 1) >> 1;
-  int sb_cols = (cm->mb_cols + 1) >> 1;
-  MODE_INFO *mi = cm->mi;
-  uint8_t *segmap = cpi->segmentation_map;
-  uint8_t *segcache = cm->last_frame_seg_map;
-
-  for (row = 0; row < sb_rows; row++) {
-    for (col = 0; col < sb_cols; col++) {
-      MODE_INFO *miptr = mi + col * 2;
-      uint8_t *cache = segcache + col * 2;
-#if CONFIG_SUPERBLOCKS
-      if (miptr->mbmi.encoded_as_sb) {
-        cache[0] = miptr->mbmi.segment_id;
-        if (!(cm->mb_cols & 1) || col < sb_cols - 1)
-          cache[1] = miptr->mbmi.segment_id;
-        if (!(cm->mb_rows & 1) || row < sb_rows - 1) {
-          cache[cm->mb_cols] = miptr->mbmi.segment_id;
-          if (!(cm->mb_cols & 1) || col < sb_cols - 1)
-            cache[cm->mb_cols + 1] = miptr->mbmi.segment_id;
-        }
-      } else
-#endif
-      {
-        cache[0] = miptr[0].mbmi.segment_id;
-        if (!(cm->mb_cols & 1) || col < sb_cols - 1)
-          cache[1] = miptr[1].mbmi.segment_id;
-        if (!(cm->mb_rows & 1) || row < sb_rows - 1) {
-          cache[cm->mb_cols] = miptr[cm->mode_info_stride].mbmi.segment_id;
-          if (!(cm->mb_cols & 1) || col < sb_cols - 1)
-            cache[cm->mb_cols + 1] =
-                miptr[cm->mode_info_stride + 1].mbmi.segment_id;
-        }
-      }
-    }
-    segmap += 2 * cm->mb_cols;
-    segcache += 2 * cm->mb_cols;
-    mi += 2 * cm->mode_info_stride;
-  }
-}
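-
-/* Editor's note: each iteration above copies one 2x2 block of macroblock
- * segment ids (one superblock) into the reference map, guarding the right
- * column and bottom row against frames with odd macroblock dimensions. */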
-
-static void set_default_lf_deltas(VP9_COMP *cpi) {
-  cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 1;
-  cpi->mb.e_mbd.mode_ref_lf_delta_update = 1;
-
-  vpx_memset(cpi->mb.e_mbd.ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas));
-  vpx_memset(cpi->mb.e_mbd.mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas));
-
-  // Test of ref frame deltas
-  cpi->mb.e_mbd.ref_lf_deltas[INTRA_FRAME] = 2;
-  cpi->mb.e_mbd.ref_lf_deltas[LAST_FRAME] = 0;
-  cpi->mb.e_mbd.ref_lf_deltas[GOLDEN_FRAME] = -2;
-  cpi->mb.e_mbd.ref_lf_deltas[ALTREF_FRAME] = -2;
-
-  cpi->mb.e_mbd.mode_lf_deltas[0] = 4;               // BPRED
-  cpi->mb.e_mbd.mode_lf_deltas[1] = -2;              // Zero
-  cpi->mb.e_mbd.mode_lf_deltas[2] = 2;               // New mv
-  cpi->mb.e_mbd.mode_lf_deltas[3] = 4;               // Split mv
-}
-
-void vp9_set_speed_features(VP9_COMP *cpi) {
-  SPEED_FEATURES *sf = &cpi->sf;
-  int Mode = cpi->compressor_speed;
-  int Speed = cpi->Speed;
-  int i;
-  VP9_COMMON *cm = &cpi->common;
-
-  // Only modes 0 and 1 supported for now in experimental code base
-  if (Mode > 1)
-    Mode = 1;
-
-  // Initialise default mode frequency sampling variables
-  for (i = 0; i < MAX_MODES; i ++) {
-    cpi->mode_check_freq[i] = 0;
-    cpi->mode_test_hit_counts[i] = 0;
-    cpi->mode_chosen_counts[i] = 0;
-  }
-
-  // best quality defaults
-  sf->RD = 1;
-  sf->search_method = NSTEP;
-  sf->improved_dct = 1;
-  sf->auto_filter = 1;
-  sf->recode_loop = 1;
-  sf->quarter_pixel_search = 1;
-  sf->half_pixel_search = 1;
-  sf->iterative_sub_pixel = 1;
-#if CONFIG_LOSSLESS
-  sf->optimize_coefficients = 0;
-#else
-  sf->optimize_coefficients = 1;
-#endif
-  sf->no_skip_block4x4_search = 1;
-
-  sf->first_step = 0;
-  sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
-  sf->improved_mv_pred = 1;
-
-  // default thresholds to 0
-  for (i = 0; i < MAX_MODES; i++)
-    sf->thresh_mult[i] = 0;
-
-  switch (Mode) {
-    case 0: // best quality mode
-#if CONFIG_PRED_FILTER
-      sf->thresh_mult[THR_ZEROMV        ] = 0;
-      sf->thresh_mult[THR_ZEROMV_FILT   ] = 0;
-      sf->thresh_mult[THR_ZEROG         ] = 0;
-      sf->thresh_mult[THR_ZEROG_FILT    ] = 0;
-      sf->thresh_mult[THR_ZEROA         ] = 0;
-      sf->thresh_mult[THR_ZEROA_FILT    ] = 0;
-      sf->thresh_mult[THR_NEARESTMV     ] = 0;
-      sf->thresh_mult[THR_NEARESTMV_FILT] = 0;
-      sf->thresh_mult[THR_NEARESTG      ] = 0;
-      sf->thresh_mult[THR_NEARESTG_FILT ] = 0;
-      sf->thresh_mult[THR_NEARESTA      ] = 0;
-      sf->thresh_mult[THR_NEARESTA_FILT ] = 0;
-      sf->thresh_mult[THR_NEARMV        ] = 0;
-      sf->thresh_mult[THR_NEARMV_FILT   ] = 0;
-      sf->thresh_mult[THR_NEARG         ] = 0;
-      sf->thresh_mult[THR_NEARG_FILT    ] = 0;
-      sf->thresh_mult[THR_NEARA         ] = 0;
-      sf->thresh_mult[THR_NEARA_FILT    ] = 0;
-
-      sf->thresh_mult[THR_DC       ] = 0;
-
-      sf->thresh_mult[THR_V_PRED   ] = 1000;
-      sf->thresh_mult[THR_H_PRED   ] = 1000;
-      sf->thresh_mult[THR_D45_PRED ] = 1000;
-      sf->thresh_mult[THR_D135_PRED] = 1000;
-      sf->thresh_mult[THR_D117_PRED] = 1000;
-      sf->thresh_mult[THR_D153_PRED] = 1000;
-      sf->thresh_mult[THR_D27_PRED ] = 1000;
-      sf->thresh_mult[THR_D63_PRED ] = 1000;
-      sf->thresh_mult[THR_B_PRED   ] = 2000;
-      sf->thresh_mult[THR_I8X8_PRED] = 2000;
-      sf->thresh_mult[THR_TM       ] = 1000;
-
-      sf->thresh_mult[THR_NEWMV    ] = 1000;
-      sf->thresh_mult[THR_NEWG     ] = 1000;
-      sf->thresh_mult[THR_NEWA     ] = 1000;
-      sf->thresh_mult[THR_NEWMV_FILT    ] = 1000;
-      sf->thresh_mult[THR_NEWG_FILT     ] = 1000;
-      sf->thresh_mult[THR_NEWA_FILT     ] = 1000;
-#else
-      sf->thresh_mult[THR_ZEROMV   ] = 0;
-      sf->thresh_mult[THR_ZEROG    ] = 0;
-      sf->thresh_mult[THR_ZEROA    ] = 0;
-      sf->thresh_mult[THR_NEARESTMV] = 0;
-      sf->thresh_mult[THR_NEARESTG ] = 0;
-      sf->thresh_mult[THR_NEARESTA ] = 0;
-      sf->thresh_mult[THR_NEARMV   ] = 0;
-      sf->thresh_mult[THR_NEARG    ] = 0;
-      sf->thresh_mult[THR_NEARA    ] = 0;
-
-      sf->thresh_mult[THR_DC       ] = 0;
-
-      sf->thresh_mult[THR_V_PRED   ] = 1000;
-      sf->thresh_mult[THR_H_PRED   ] = 1000;
-      sf->thresh_mult[THR_D45_PRED ] = 1000;
-      sf->thresh_mult[THR_D135_PRED] = 1000;
-      sf->thresh_mult[THR_D117_PRED] = 1000;
-      sf->thresh_mult[THR_D153_PRED] = 1000;
-      sf->thresh_mult[THR_D27_PRED ] = 1000;
-      sf->thresh_mult[THR_D63_PRED ] = 1000;
-      sf->thresh_mult[THR_B_PRED   ] = 2000;
-      sf->thresh_mult[THR_I8X8_PRED] = 2000;
-      sf->thresh_mult[THR_TM       ] = 1000;
-
-      sf->thresh_mult[THR_NEWMV    ] = 1000;
-      sf->thresh_mult[THR_NEWG     ] = 1000;
-      sf->thresh_mult[THR_NEWA     ] = 1000;
-#endif
-      sf->thresh_mult[THR_SPLITMV  ] = 2500;
-      sf->thresh_mult[THR_SPLITG   ] = 5000;
-      sf->thresh_mult[THR_SPLITA   ] = 5000;
-
-      sf->thresh_mult[THR_COMP_ZEROLG   ] = 0;
-      sf->thresh_mult[THR_COMP_NEARESTLG] = 0;
-      sf->thresh_mult[THR_COMP_NEARLG   ] = 0;
-      sf->thresh_mult[THR_COMP_ZEROLA   ] = 0;
-      sf->thresh_mult[THR_COMP_NEARESTLA] = 0;
-      sf->thresh_mult[THR_COMP_NEARLA   ] = 0;
-      sf->thresh_mult[THR_COMP_ZEROGA   ] = 0;
-      sf->thresh_mult[THR_COMP_NEARESTGA] = 0;
-      sf->thresh_mult[THR_COMP_NEARGA   ] = 0;
-
-      sf->thresh_mult[THR_COMP_NEWLG    ] = 1000;
-      sf->thresh_mult[THR_COMP_NEWLA    ] = 1000;
-      sf->thresh_mult[THR_COMP_NEWGA    ] = 1000;
-
-      sf->thresh_mult[THR_COMP_SPLITLA  ] = 2500;
-      sf->thresh_mult[THR_COMP_SPLITGA  ] = 5000;
-      sf->thresh_mult[THR_COMP_SPLITLG  ] = 5000;
-
-      sf->first_step = 0;
-      sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
-      sf->search_best_filter = SEARCH_BEST_FILTER;
-      break;
-    case 1:
-#if CONFIG_PRED_FILTER
-      sf->thresh_mult[THR_NEARESTMV] = 0;
-      sf->thresh_mult[THR_NEARESTMV_FILT] = 0;
-      sf->thresh_mult[THR_ZEROMV   ] = 0;
-      sf->thresh_mult[THR_ZEROMV_FILT   ] = 0;
-      sf->thresh_mult[THR_DC       ] = 0;
-      sf->thresh_mult[THR_NEARMV   ] = 0;
-      sf->thresh_mult[THR_NEARMV_FILT   ] = 0;
-      sf->thresh_mult[THR_V_PRED   ] = 1000;
-      sf->thresh_mult[THR_H_PRED   ] = 1000;
-      sf->thresh_mult[THR_D45_PRED ] = 1000;
-      sf->thresh_mult[THR_D135_PRED] = 1000;
-      sf->thresh_mult[THR_D117_PRED] = 1000;
-      sf->thresh_mult[THR_D153_PRED] = 1000;
-      sf->thresh_mult[THR_D27_PRED ] = 1000;
-      sf->thresh_mult[THR_D63_PRED ] = 1000;
-      sf->thresh_mult[THR_B_PRED   ] = 2500;
-      sf->thresh_mult[THR_I8X8_PRED] = 2500;
-      sf->thresh_mult[THR_TM       ] = 1000;
-
-      sf->thresh_mult[THR_NEARESTG ] = 1000;
-      sf->thresh_mult[THR_NEARESTG_FILT ] = 1000;
-      sf->thresh_mult[THR_NEARESTA ] = 1000;
-      sf->thresh_mult[THR_NEARESTA_FILT ] = 1000;
-
-      sf->thresh_mult[THR_ZEROG    ] = 1000;
-      sf->thresh_mult[THR_ZEROA    ] = 1000;
-      sf->thresh_mult[THR_NEARG    ] = 1000;
-      sf->thresh_mult[THR_NEARA    ] = 1000;
-      sf->thresh_mult[THR_ZEROG_FILT    ] = 1000;
-      sf->thresh_mult[THR_ZEROA_FILT    ] = 1000;
-      sf->thresh_mult[THR_NEARG_FILT    ] = 1000;
-      sf->thresh_mult[THR_NEARA_FILT    ] = 1000;
-
-      sf->thresh_mult[THR_ZEROMV   ] = 0;
-      sf->thresh_mult[THR_ZEROG    ] = 0;
-      sf->thresh_mult[THR_ZEROA    ] = 0;
-      sf->thresh_mult[THR_NEARESTMV] = 0;
-      sf->thresh_mult[THR_NEARESTG ] = 0;
-      sf->thresh_mult[THR_NEARESTA ] = 0;
-      sf->thresh_mult[THR_NEARMV   ] = 0;
-      sf->thresh_mult[THR_NEARG    ] = 0;
-      sf->thresh_mult[THR_NEARA    ] = 0;
-      sf->thresh_mult[THR_ZEROMV_FILT   ] = 0;
-      sf->thresh_mult[THR_ZEROG_FILT    ] = 0;
-      sf->thresh_mult[THR_ZEROA_FILT    ] = 0;
-      sf->thresh_mult[THR_NEARESTMV_FILT] = 0;
-      sf->thresh_mult[THR_NEARESTG_FILT ] = 0;
-      sf->thresh_mult[THR_NEARESTA_FILT ] = 0;
-      sf->thresh_mult[THR_NEARMV_FILT   ] = 0;
-      sf->thresh_mult[THR_NEARG_FILT    ] = 0;
-      sf->thresh_mult[THR_NEARA_FILT    ] = 0;
-
-      sf->thresh_mult[THR_NEWMV    ] = 1000;
-      sf->thresh_mult[THR_NEWG     ] = 1000;
-      sf->thresh_mult[THR_NEWA     ] = 1000;
-      sf->thresh_mult[THR_NEWMV_FILT    ] = 1000;
-      sf->thresh_mult[THR_NEWG_FILT     ] = 1000;
-      sf->thresh_mult[THR_NEWA_FILT     ] = 1000;
-#else
-      sf->thresh_mult[THR_NEARESTMV] = 0;
-      sf->thresh_mult[THR_ZEROMV   ] = 0;
-      sf->thresh_mult[THR_DC       ] = 0;
-      sf->thresh_mult[THR_NEARMV   ] = 0;
-      sf->thresh_mult[THR_V_PRED   ] = 1000;
-      sf->thresh_mult[THR_H_PRED   ] = 1000;
-      sf->thresh_mult[THR_D45_PRED ] = 1000;
-      sf->thresh_mult[THR_D135_PRED] = 1000;
-      sf->thresh_mult[THR_D117_PRED] = 1000;
-      sf->thresh_mult[THR_D153_PRED] = 1000;
-      sf->thresh_mult[THR_D27_PRED ] = 1000;
-      sf->thresh_mult[THR_D63_PRED ] = 1000;
-      sf->thresh_mult[THR_B_PRED   ] = 2500;
-      sf->thresh_mult[THR_I8X8_PRED] = 2500;
-      sf->thresh_mult[THR_TM       ] = 1000;
-
-      sf->thresh_mult[THR_NEARESTG ] = 1000;
-      sf->thresh_mult[THR_NEARESTA ] = 1000;
-
-      sf->thresh_mult[THR_ZEROG    ] = 1000;
-      sf->thresh_mult[THR_ZEROA    ] = 1000;
-      sf->thresh_mult[THR_NEARG    ] = 1000;
-      sf->thresh_mult[THR_NEARA    ] = 1000;
-
-      sf->thresh_mult[THR_ZEROMV   ] = 0;
-      sf->thresh_mult[THR_ZEROG    ] = 0;
-      sf->thresh_mult[THR_ZEROA    ] = 0;
-      sf->thresh_mult[THR_NEARESTMV] = 0;
-      sf->thresh_mult[THR_NEARESTG ] = 0;
-      sf->thresh_mult[THR_NEARESTA ] = 0;
-      sf->thresh_mult[THR_NEARMV   ] = 0;
-      sf->thresh_mult[THR_NEARG    ] = 0;
-      sf->thresh_mult[THR_NEARA    ] = 0;
-
-      sf->thresh_mult[THR_NEWMV    ] = 1000;
-      sf->thresh_mult[THR_NEWG     ] = 1000;
-      sf->thresh_mult[THR_NEWA     ] = 1000;
-#endif
-      sf->thresh_mult[THR_SPLITMV  ] = 1700;
-      sf->thresh_mult[THR_SPLITG   ] = 4500;
-      sf->thresh_mult[THR_SPLITA   ] = 4500;
-
-      sf->thresh_mult[THR_COMP_ZEROLG   ] = 0;
-      sf->thresh_mult[THR_COMP_NEARESTLG] = 0;
-      sf->thresh_mult[THR_COMP_NEARLG   ] = 0;
-      sf->thresh_mult[THR_COMP_ZEROLA   ] = 0;
-      sf->thresh_mult[THR_COMP_NEARESTLA] = 0;
-      sf->thresh_mult[THR_COMP_NEARLA   ] = 0;
-      sf->thresh_mult[THR_COMP_ZEROGA   ] = 0;
-      sf->thresh_mult[THR_COMP_NEARESTGA] = 0;
-      sf->thresh_mult[THR_COMP_NEARGA   ] = 0;
-
-      sf->thresh_mult[THR_COMP_NEWLG    ] = 1000;
-      sf->thresh_mult[THR_COMP_NEWLA    ] = 1000;
-      sf->thresh_mult[THR_COMP_NEWGA    ] = 1000;
-
-      sf->thresh_mult[THR_COMP_SPLITLA  ] = 1700;
-      sf->thresh_mult[THR_COMP_SPLITGA  ] = 4500;
-      sf->thresh_mult[THR_COMP_SPLITLG  ] = 4500;
-
-      if (Speed > 0) {
-        /* Disable coefficient optimization above speed 0 */
-        sf->optimize_coefficients = 0;
-        sf->no_skip_block4x4_search = 0;
-
-        sf->first_step = 1;
-
-        cpi->mode_check_freq[THR_SPLITG] = 2;
-        cpi->mode_check_freq[THR_SPLITA] = 2;
-        cpi->mode_check_freq[THR_SPLITMV] = 0;
-
-        cpi->mode_check_freq[THR_COMP_SPLITGA] = 2;
-        cpi->mode_check_freq[THR_COMP_SPLITLG] = 2;
-        cpi->mode_check_freq[THR_COMP_SPLITLA] = 0;
-      }
-
-      if (Speed > 1) {
-        cpi->mode_check_freq[THR_SPLITG] = 4;
-        cpi->mode_check_freq[THR_SPLITA] = 4;
-        cpi->mode_check_freq[THR_SPLITMV] = 2;
-
-        cpi->mode_check_freq[THR_COMP_SPLITGA] = 4;
-        cpi->mode_check_freq[THR_COMP_SPLITLG] = 4;
-        cpi->mode_check_freq[THR_COMP_SPLITLA] = 2;
-
-        sf->thresh_mult[THR_TM       ] = 1500;
-        sf->thresh_mult[THR_V_PRED   ] = 1500;
-        sf->thresh_mult[THR_H_PRED   ] = 1500;
-        sf->thresh_mult[THR_D45_PRED ] = 1500;
-        sf->thresh_mult[THR_D135_PRED] = 1500;
-        sf->thresh_mult[THR_D117_PRED] = 1500;
-        sf->thresh_mult[THR_D153_PRED] = 1500;
-        sf->thresh_mult[THR_D27_PRED ] = 1500;
-        sf->thresh_mult[THR_D63_PRED ] = 1500;
-        sf->thresh_mult[THR_B_PRED   ] = 5000;
-        sf->thresh_mult[THR_I8X8_PRED] = 5000;
-
-        if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
-          sf->thresh_mult[THR_NEWMV    ] = 2000;
-#if CONFIG_PRED_FILTER
-          sf->thresh_mult[THR_NEWMV_FILT    ] = 2000;
-#endif
-          sf->thresh_mult[THR_SPLITMV  ] = 10000;
-          sf->thresh_mult[THR_COMP_SPLITLG  ] = 20000;
-        }
-
-        if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
-          sf->thresh_mult[THR_NEARESTG ] = 1500;
-          sf->thresh_mult[THR_ZEROG    ] = 1500;
-          sf->thresh_mult[THR_NEARG    ] = 1500;
-          sf->thresh_mult[THR_NEWG     ] = 2000;
-#if CONFIG_PRED_FILTER
-          sf->thresh_mult[THR_NEARESTG_FILT ] = 1500;
-          sf->thresh_mult[THR_ZEROG_FILT    ] = 1500;
-          sf->thresh_mult[THR_NEARG_FILT    ] = 1500;
-          sf->thresh_mult[THR_NEWG_FILT     ] = 2000;
-#endif
-          sf->thresh_mult[THR_SPLITG   ] = 20000;
-          sf->thresh_mult[THR_COMP_SPLITGA  ] = 20000;
-        }
-
-        if (cpi->ref_frame_flags & VP9_ALT_FLAG) {
-          sf->thresh_mult[THR_NEARESTA ] = 1500;
-          sf->thresh_mult[THR_ZEROA    ] = 1500;
-          sf->thresh_mult[THR_NEARA    ] = 1500;
-          sf->thresh_mult[THR_NEWA     ] = 2000;
-#if CONFIG_PRED_FILTER
-          sf->thresh_mult[THR_NEARESTA_FILT ] = 1500;
-          sf->thresh_mult[THR_ZEROA_FILT    ] = 1500;
-          sf->thresh_mult[THR_NEARA_FILT    ] = 1500;
-          sf->thresh_mult[THR_NEWA_FILT     ] = 2000;
-#endif
-          sf->thresh_mult[THR_SPLITA   ] = 20000;
-          sf->thresh_mult[THR_COMP_SPLITLA  ] = 10000;
-        }
-
-        sf->thresh_mult[THR_COMP_ZEROLG   ] = 1500;
-        sf->thresh_mult[THR_COMP_NEARESTLG] = 1500;
-        sf->thresh_mult[THR_COMP_NEARLG   ] = 1500;
-        sf->thresh_mult[THR_COMP_ZEROLA   ] = 1500;
-        sf->thresh_mult[THR_COMP_NEARESTLA] = 1500;
-        sf->thresh_mult[THR_COMP_NEARLA   ] = 1500;
-        sf->thresh_mult[THR_COMP_ZEROGA   ] = 1500;
-        sf->thresh_mult[THR_COMP_NEARESTGA] = 1500;
-        sf->thresh_mult[THR_COMP_NEARGA   ] = 1500;
-
-        sf->thresh_mult[THR_COMP_NEWLG    ] = 2000;
-        sf->thresh_mult[THR_COMP_NEWLA    ] = 2000;
-        sf->thresh_mult[THR_COMP_NEWGA    ] = 2000;
-      }
-
-      if (Speed > 2) {
-        cpi->mode_check_freq[THR_SPLITG] = 15;
-        cpi->mode_check_freq[THR_SPLITA] = 15;
-        cpi->mode_check_freq[THR_SPLITMV] = 7;
-
-        cpi->mode_check_freq[THR_COMP_SPLITGA] = 15;
-        cpi->mode_check_freq[THR_COMP_SPLITLG] = 15;
-        cpi->mode_check_freq[THR_COMP_SPLITLA] = 7;
-
-        sf->thresh_mult[THR_TM       ] = 2000;
-        sf->thresh_mult[THR_V_PRED   ] = 2000;
-        sf->thresh_mult[THR_H_PRED   ] = 2000;
-        sf->thresh_mult[THR_D45_PRED ] = 2000;
-        sf->thresh_mult[THR_D135_PRED] = 2000;
-        sf->thresh_mult[THR_D117_PRED] = 2000;
-        sf->thresh_mult[THR_D153_PRED] = 2000;
-        sf->thresh_mult[THR_D27_PRED ] = 2000;
-        sf->thresh_mult[THR_D63_PRED ] = 2000;
-        sf->thresh_mult[THR_B_PRED   ] = 7500;
-        sf->thresh_mult[THR_I8X8_PRED] = 7500;
-
-        if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
-          sf->thresh_mult[THR_NEWMV    ] = 2000;
-#if CONFIG_PRED_FILTER
-          sf->thresh_mult[THR_NEWMV_FILT    ] = 2000;
-#endif
-          sf->thresh_mult[THR_SPLITMV  ] = 25000;
-          sf->thresh_mult[THR_COMP_SPLITLG  ] = 50000;
-        }
-
-        if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
-          sf->thresh_mult[THR_NEARESTG ] = 2000;
-          sf->thresh_mult[THR_ZEROG    ] = 2000;
-          sf->thresh_mult[THR_NEARG    ] = 2000;
-          sf->thresh_mult[THR_NEWG     ] = 2500;
-#if CONFIG_PRED_FILTER
-          sf->thresh_mult[THR_NEARESTG_FILT ] = 2000;
-          sf->thresh_mult[THR_ZEROG_FILT    ] = 2000;
-          sf->thresh_mult[THR_NEARG_FILT    ] = 2000;
-          sf->thresh_mult[THR_NEWG_FILT     ] = 2500;
-#endif
-          sf->thresh_mult[THR_SPLITG   ] = 50000;
-          sf->thresh_mult[THR_COMP_SPLITGA  ] = 50000;
-        }
-
-        if (cpi->ref_frame_flags & VP9_ALT_FLAG) {
-          sf->thresh_mult[THR_NEARESTA ] = 2000;
-          sf->thresh_mult[THR_ZEROA    ] = 2000;
-          sf->thresh_mult[THR_NEARA    ] = 2000;
-          sf->thresh_mult[THR_NEWA     ] = 2500;
-#if CONFIG_PRED_FILTER
-          sf->thresh_mult[THR_NEARESTA_FILT ] = 2000;
-          sf->thresh_mult[THR_ZEROA_FILT    ] = 2000;
-          sf->thresh_mult[THR_NEARA_FILT    ] = 2000;
-          sf->thresh_mult[THR_NEWA_FILT     ] = 2500;
-#endif
-          sf->thresh_mult[THR_SPLITA   ] = 50000;
-          sf->thresh_mult[THR_COMP_SPLITLA  ] = 25000;
-        }
-
-        sf->thresh_mult[THR_COMP_ZEROLG   ] = 2000;
-        sf->thresh_mult[THR_COMP_NEARESTLG] = 2000;
-        sf->thresh_mult[THR_COMP_NEARLG   ] = 2000;
-        sf->thresh_mult[THR_COMP_ZEROLA   ] = 2000;
-        sf->thresh_mult[THR_COMP_NEARESTLA] = 2000;
-        sf->thresh_mult[THR_COMP_NEARLA   ] = 2000;
-        sf->thresh_mult[THR_COMP_ZEROGA   ] = 2000;
-        sf->thresh_mult[THR_COMP_NEARESTGA] = 2000;
-        sf->thresh_mult[THR_COMP_NEARGA   ] = 2000;
-
-        sf->thresh_mult[THR_COMP_NEWLG    ] = 2500;
-        sf->thresh_mult[THR_COMP_NEWLA    ] = 2500;
-        sf->thresh_mult[THR_COMP_NEWGA    ] = 2500;
-
-        sf->improved_dct = 0;
-
-        // Only do recode loop on key frames, golden frames and
-        // alt ref frames
-        sf->recode_loop = 2;
-
-      }
-
-      break;
-
-  } /* switch */
-
-  /* disable frame modes if flags not set */
-  if (!(cpi->ref_frame_flags & VP9_LAST_FLAG)) {
-    sf->thresh_mult[THR_NEWMV    ] = INT_MAX;
-    sf->thresh_mult[THR_NEARESTMV] = INT_MAX;
-    sf->thresh_mult[THR_ZEROMV   ] = INT_MAX;
-    sf->thresh_mult[THR_NEARMV   ] = INT_MAX;
-#if CONFIG_PRED_FILTER
-    sf->thresh_mult[THR_NEWMV_FILT    ] = INT_MAX;
-    sf->thresh_mult[THR_NEARESTMV_FILT] = INT_MAX;
-    sf->thresh_mult[THR_ZEROMV_FILT   ] = INT_MAX;
-    sf->thresh_mult[THR_NEARMV_FILT   ] = INT_MAX;
-#endif
-    sf->thresh_mult[THR_SPLITMV  ] = INT_MAX;
-  }
-
-  if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG)) {
-    sf->thresh_mult[THR_NEARESTG ] = INT_MAX;
-    sf->thresh_mult[THR_ZEROG    ] = INT_MAX;
-    sf->thresh_mult[THR_NEARG    ] = INT_MAX;
-    sf->thresh_mult[THR_NEWG     ] = INT_MAX;
-#if CONFIG_PRED_FILTER
-    sf->thresh_mult[THR_NEARESTG_FILT ] = INT_MAX;
-    sf->thresh_mult[THR_ZEROG_FILT    ] = INT_MAX;
-    sf->thresh_mult[THR_NEARG_FILT    ] = INT_MAX;
-    sf->thresh_mult[THR_NEWG_FILT     ] = INT_MAX;
-#endif
-    sf->thresh_mult[THR_SPLITG   ] = INT_MAX;
-  }
-
-  if (!(cpi->ref_frame_flags & VP9_ALT_FLAG)) {
-    sf->thresh_mult[THR_NEARESTA ] = INT_MAX;
-    sf->thresh_mult[THR_ZEROA    ] = INT_MAX;
-    sf->thresh_mult[THR_NEARA    ] = INT_MAX;
-    sf->thresh_mult[THR_NEWA     ] = INT_MAX;
-#if CONFIG_PRED_FILTER
-    sf->thresh_mult[THR_NEARESTA_FILT ] = INT_MAX;
-    sf->thresh_mult[THR_ZEROA_FILT    ] = INT_MAX;
-    sf->thresh_mult[THR_NEARA_FILT    ] = INT_MAX;
-    sf->thresh_mult[THR_NEWA_FILT     ] = INT_MAX;
-#endif
-    sf->thresh_mult[THR_SPLITA   ] = INT_MAX;
-  }
-
-  if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_GOLD_FLAG)) != (VP9_LAST_FLAG | VP9_GOLD_FLAG)) {
-    sf->thresh_mult[THR_COMP_ZEROLG   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEARESTLG] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEARLG   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEWLG    ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_SPLITLG  ] = INT_MAX;
-  }
-
-  if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_ALT_FLAG)) != (VP9_LAST_FLAG | VP9_ALT_FLAG)) {
-    sf->thresh_mult[THR_COMP_ZEROLA   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEARESTLA] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEARLA   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEWLA    ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_SPLITLA  ] = INT_MAX;
-  }
-
-  if ((cpi->ref_frame_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) != (VP9_GOLD_FLAG | VP9_ALT_FLAG)) {
-    sf->thresh_mult[THR_COMP_ZEROGA   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEARESTGA] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEARGA   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEWGA    ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_SPLITGA  ] = INT_MAX;
-  }
-
-  // Slow quant, dct and trellis not worthwhile for first pass
-  // so make sure they are always turned off.
-  if (cpi->pass == 1) {
-    sf->optimize_coefficients = 0;
-    sf->improved_dct = 0;
-  }
-
-  if (cpi->sf.search_method == NSTEP) {
-    vp9_init3smotion_compensation(&cpi->mb,
-                                  cm->yv12_fb[cm->lst_fb_idx].y_stride);
-  } else if (cpi->sf.search_method == DIAMOND) {
-    vp9_init_dsmotion_compensation(&cpi->mb,
-                                   cm->yv12_fb[cm->lst_fb_idx].y_stride);
-  }
-
-  cpi->mb.vp9_short_fdct16x16 = vp9_short_fdct16x16;
-  cpi->mb.vp9_short_fdct8x8 = vp9_short_fdct8x8;
-  cpi->mb.vp9_short_fdct8x4 = vp9_short_fdct8x4;
-  cpi->mb.vp9_short_fdct4x4 = vp9_short_fdct4x4;
-  cpi->mb.short_walsh4x4 = vp9_short_walsh4x4;
-  cpi->mb.short_fhaar2x2 = vp9_short_fhaar2x2;
-
-#if CONFIG_LOSSLESS
-  if (cpi->oxcf.lossless) {
-    cpi->mb.vp9_short_fdct8x4 = vp9_short_walsh8x4_x8;
-    cpi->mb.vp9_short_fdct4x4 = vp9_short_walsh4x4_x8;
-    cpi->mb.short_walsh4x4 = vp9_short_walsh4x4;
-    cpi->mb.short_fhaar2x2 = vp9_short_fhaar2x2;
-    cpi->mb.short_walsh4x4 = vp9_short_walsh4x4_lossless;
-  }
-#endif
-
-
-
-  cpi->mb.quantize_b_4x4      = vp9_regular_quantize_b_4x4;
-  cpi->mb.quantize_b_4x4_pair = vp9_regular_quantize_b_4x4_pair;
-  cpi->mb.quantize_b_8x8      = vp9_regular_quantize_b_8x8;
-  cpi->mb.quantize_b_16x16    = vp9_regular_quantize_b_16x16;
-  cpi->mb.quantize_b_2x2      = vp9_regular_quantize_b_2x2;
-
-  vp9_init_quantizer(cpi);
-
-#if CONFIG_RUNTIME_CPU_DETECT
-  cpi->mb.e_mbd.rtcd = &cpi->common.rtcd;
-#endif
-
-  if (cpi->sf.iterative_sub_pixel == 1) {
-    cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_step_iteratively;
-  } else if (cpi->sf.quarter_pixel_search) {
-    cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_step;
-  } else if (cpi->sf.half_pixel_search) {
-    cpi->find_fractional_mv_step = vp9_find_best_half_pixel_step;
-  }
-
-  if (cpi->sf.optimize_coefficients == 1 && cpi->pass != 1)
-    cpi->mb.optimize = 1;
-  else
-    cpi->mb.optimize = 0;
-
-#ifdef SPEEDSTATS
-  frames_at_speed[cpi->Speed]++;
-#endif
-}
-static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
-  int width = (cpi->oxcf.Width + 15) & ~15;
-  int height = (cpi->oxcf.Height + 15) & ~15;
-
-  cpi->lookahead = vp9_lookahead_init(cpi->oxcf.Width, cpi->oxcf.Height,
-                                      cpi->oxcf.lag_in_frames);
-  if (!cpi->lookahead)
-    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
-                       "Failed to allocate lag buffers");
-
-#if VP9_TEMPORAL_ALT_REF
-
-  if (vp8_yv12_alloc_frame_buffer(&cpi->alt_ref_buffer,
-                                  width, height, VP8BORDERINPIXELS))
-    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
-                       "Failed to allocate altref buffer");
-
-#endif
-}
-
-static int alloc_partition_data(VP9_COMP *cpi) {
-  vpx_free(cpi->mb.pip);
-
-  cpi->mb.pip = vpx_calloc((cpi->common.mb_cols + 1) *
-                           (cpi->common.mb_rows + 1),
-                           sizeof(PARTITION_INFO));
-  if (!cpi->mb.pip)
-    return 1;
-
-  cpi->mb.pi = cpi->mb.pip + cpi->common.mode_info_stride + 1;
-
-  return 0;
-}
-
-void vp9_alloc_compressor_data(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-
-  int width = cm->Width;
-  int height = cm->Height;
-
-  if (vp9_alloc_frame_buffers(cm, width, height))
-    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
-                       "Failed to allocate frame buffers");
-
-  if (alloc_partition_data(cpi))
-    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
-                       "Failed to allocate partition data");
-
-
-  if ((width & 0xf) != 0)
-    width += 16 - (width & 0xf);
-
-  if ((height & 0xf) != 0)
-    height += 16 - (height & 0xf);
-
-
-  if (vp8_yv12_alloc_frame_buffer(&cpi->last_frame_uf,
-                                  width, height, VP8BORDERINPIXELS))
-    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
-                       "Failed to allocate last frame buffer");
-
-  if (vp8_yv12_alloc_frame_buffer(&cpi->scaled_source,
-                                  width, height, VP8BORDERINPIXELS))
-    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
-                       "Failed to allocate scaled source buffer");
-
-
-  vpx_free(cpi->tok);
-
-  {
-    unsigned int tokens = cm->mb_rows * cm->mb_cols * 24 * 16;
-
-    CHECK_MEM_ERROR(cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok)));
-  }
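-
-  /* Editor's note: the token pool is sized at 24 * 16 entries per
-   * macroblock -- 24 4x4 blocks (16 luma + 8 chroma) times 16 coefficients,
-   * i.e. a worst case of one token per coefficient position. */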
-
-  // Data used for real time vc mode to see if gf needs refreshing
-  cpi->inter_zz_count = 0;
-  cpi->gf_bad_count = 0;
-  cpi->gf_update_recommended = 0;
-
-
-  // Structures used to monitor GF usage
-  vpx_free(cpi->gf_active_flags);
-  CHECK_MEM_ERROR(cpi->gf_active_flags,
-                  vpx_calloc(1, cm->mb_rows * cm->mb_cols));
-  cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
-
-  vpx_free(cpi->mb_activity_map);
-  CHECK_MEM_ERROR(cpi->mb_activity_map,
-                  vpx_calloc(sizeof(unsigned int),
-                             cm->mb_rows * cm->mb_cols));
-
-  vpx_free(cpi->mb_norm_activity_map);
-  CHECK_MEM_ERROR(cpi->mb_norm_activity_map,
-                  vpx_calloc(sizeof(unsigned int),
-                             cm->mb_rows * cm->mb_cols));
-
-  vpx_free(cpi->twopass.total_stats);
-
-  cpi->twopass.total_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
-
-  vpx_free(cpi->twopass.total_left_stats);
-  cpi->twopass.total_left_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
-
-  vpx_free(cpi->twopass.this_frame_stats);
-
-  cpi->twopass.this_frame_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
-
-  if (!cpi->twopass.total_stats ||
-      !cpi->twopass.total_left_stats ||
-      !cpi->twopass.this_frame_stats)
-    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
-                       "Failed to allocate firstpass stats");
-
-  vpx_free(cpi->tplist);
-
-  CHECK_MEM_ERROR(cpi->tplist,
-                  vpx_malloc(sizeof(TOKENLIST) * (cpi->common.mb_rows)));
-}
-
-
-// TODO: perhaps expose the number of steps to the outside world when setting
-// max and min limits. This will also likely need refining for the extended Q
-// range.
-//
-// Table that converts 0-63 Q range values passed in outside to the Qindex
-// range used internally.
-static const int q_trans[] = {
-  0,    4,   8,  12,  16,  20,  24,  28,
-  32,   36,  40,  44,  48,  52,  56,  60,
-  64,   68,  72,  76,  80,  84,  88,  92,
-  96,  100, 104, 108, 112, 116, 120, 124,
-  128, 132, 136, 140, 144, 148, 152, 156,
-  160, 164, 168, 172, 176, 180, 184, 188,
-  192, 196, 200, 204, 208, 212, 216, 220,
-  224, 228, 232, 236, 240, 244, 249, 255,
-};
-
-int vp9_reverse_trans(int x) {
-  int i;
-
-  for (i = 0; i < 64; i++)
-    if (q_trans[i] >= x)
-      return i;
-
-  return 63;
-}
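-// Worked example (for illustration): an external Q of 10 maps to
-// q_trans[10] == 40 internally, and vp9_reverse_trans(40) scans the
-// table for the first entry >= 40 and returns 10; in-between values
-// round up to the next external step.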
-void vp9_new_frame_rate(VP9_COMP *cpi, double framerate) {
-  if (framerate < .1)
-    framerate = 30;
-
-  cpi->oxcf.frame_rate = framerate;
-  cpi->output_frame_rate = cpi->oxcf.frame_rate;
-  cpi->per_frame_bandwidth =
-      (int)(cpi->oxcf.target_bandwidth / cpi->output_frame_rate);
-  cpi->av_per_frame_bandwidth =
-      (int)(cpi->oxcf.target_bandwidth / cpi->output_frame_rate);
-  cpi->min_frame_bandwidth =
-      (int)(cpi->av_per_frame_bandwidth *
-            cpi->oxcf.two_pass_vbrmin_section / 100);
-
-  if (cpi->min_frame_bandwidth < FRAME_OVERHEAD_BITS)
-    cpi->min_frame_bandwidth = FRAME_OVERHEAD_BITS;
-
-  // Set Maximum gf/arf interval
-  cpi->max_gf_interval = ((int)(cpi->output_frame_rate / 2.0) + 2);
-
-  if (cpi->max_gf_interval < 12)
-    cpi->max_gf_interval = 12;
-
-  // Extended interval for genuinely static scenes
-  cpi->twopass.static_scene_max_gf_interval = cpi->key_frame_frequency >> 1;
-
-  // Special conditions when alt ref frame enabled in lagged compress mode
-  if (cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames) {
-    if (cpi->max_gf_interval > cpi->oxcf.lag_in_frames - 1)
-      cpi->max_gf_interval = cpi->oxcf.lag_in_frames - 1;
-
-    if (cpi->twopass.static_scene_max_gf_interval > cpi->oxcf.lag_in_frames - 1)
-      cpi->twopass.static_scene_max_gf_interval = cpi->oxcf.lag_in_frames - 1;
-  }
-
-  if (cpi->max_gf_interval > cpi->twopass.static_scene_max_gf_interval)
-    cpi->max_gf_interval = cpi->twopass.static_scene_max_gf_interval;
-}
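-// For illustration (hypothetical numbers): at 30 fps with a 256 kbit/s
-// target, per_frame_bandwidth = 256000 / 30 ~= 8533 bits and
-// max_gf_interval = (int)(30 / 2.0) + 2 = 17, before the static scene
-// and lag limits above are applied.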
-
-
-static int rescale(int val, int num, int denom) {
-  int64_t llnum = num;
-  int64_t llden = denom;
-  int64_t llval = val;
-
-  // Do the arithmetic in 64 bits to avoid intermediate overflow.
-  return (int)(llval * llnum / llden);
-}
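-// For illustration (hypothetical values): a starting_buffer_level of
-// 5000 (ms) at a target_bandwidth of 800000 bit/s rescales to
-// 5000 * 800000 / 1000 = 4000000 bits; the 64-bit intermediates matter
-// because the raw product already exceeds INT_MAX.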
-
-
-static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
-  VP9_COMMON *cm = &cpi->common;
-
-  cpi->oxcf = *oxcf;
-
-  cpi->goldfreq = 7;
-
-  cm->version = oxcf->Version;
-  vp9_setup_version(cm);
-
-  // change includes all joint functionality
-  vp9_change_config(ptr, oxcf);
-
-  // Initialize active best and worst q and average q values.
-  cpi->active_worst_quality         = cpi->oxcf.worst_allowed_q;
-  cpi->active_best_quality          = cpi->oxcf.best_allowed_q;
-  cpi->avg_frame_qindex             = cpi->oxcf.worst_allowed_q;
-
-  // Initialise the starting buffer levels
-  cpi->buffer_level                 = cpi->oxcf.starting_buffer_level;
-  cpi->bits_off_target              = cpi->oxcf.starting_buffer_level;
-
-  cpi->rolling_target_bits          = cpi->av_per_frame_bandwidth;
-  cpi->rolling_actual_bits          = cpi->av_per_frame_bandwidth;
-  cpi->long_rolling_target_bits     = cpi->av_per_frame_bandwidth;
-  cpi->long_rolling_actual_bits     = cpi->av_per_frame_bandwidth;
-
-  cpi->total_actual_bits            = 0;
-  cpi->total_target_vs_actual       = 0;
-
-  cpi->static_mb_pct = 0;
-
-#if VP9_TEMPORAL_ALT_REF
-  {
-    int i;
-
-    cpi->fixed_divide[0] = 0;
-
-    for (i = 1; i < 512; i++)
-      cpi->fixed_divide[i] = 0x80000 / i;
-  }
-#endif
-}
-
-
-void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
-  VP9_COMMON *cm = &cpi->common;
-
-  if (!cpi)
-    return;
-
-  if (!oxcf)
-    return;
-
-  if (cm->version != oxcf->Version) {
-    cm->version = oxcf->Version;
-    vp9_setup_version(cm);
-  }
-
-  cpi->oxcf = *oxcf;
-
-  switch (cpi->oxcf.Mode) {
-      // Real time and one pass modes are deprecated in this test code base
-    case MODE_FIRSTPASS:
-      cpi->pass = 1;
-      cpi->compressor_speed = 1;
-      break;
-
-    case MODE_SECONDPASS:
-      cpi->pass = 2;
-      cpi->compressor_speed = 1;
-
-      if (cpi->oxcf.cpu_used < -5) {
-        cpi->oxcf.cpu_used = -5;
-      }
-
-      if (cpi->oxcf.cpu_used > 5)
-        cpi->oxcf.cpu_used = 5;
-
-      break;
-
-    case MODE_SECONDPASS_BEST:
-      cpi->pass = 2;
-      cpi->compressor_speed = 0;
-      break;
-  }
-
-  cpi->oxcf.worst_allowed_q = q_trans[oxcf->worst_allowed_q];
-  cpi->oxcf.best_allowed_q = q_trans[oxcf->best_allowed_q];
-  cpi->oxcf.cq_level = q_trans[cpi->oxcf.cq_level];
-
-#if CONFIG_LOSSLESS
-  cpi->oxcf.lossless = oxcf->lossless;
-  if (cpi->oxcf.lossless) {
-    cpi->common.rtcd.idct.idct1        = vp9_short_inv_walsh4x4_1_x8_c;
-    cpi->common.rtcd.idct.idct16       = vp9_short_inv_walsh4x4_x8_c;
-    cpi->common.rtcd.idct.idct1_scalar_add  = vp9_dc_only_inv_walsh_add_c;
-    cpi->common.rtcd.idct.iwalsh1      = vp9_short_inv_walsh4x4_1_c;
-    cpi->common.rtcd.idct.iwalsh16     = vp9_short_inv_walsh4x4_lossless_c;
-  }
-#endif
-
-  cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL;
-
-  cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
-
-  // cpi->use_golden_frame_only = 0;
-  // cpi->use_last_frame_only = 0;
-  cm->refresh_golden_frame = 0;
-  cm->refresh_last_frame = 1;
-  cm->refresh_entropy_probs = 1;
-
-  setup_features(cpi);
-  cpi->mb.e_mbd.allow_high_precision_mv = 0;   // Default mv precision adaptation
-
-  {
-    int i;
-
-    for (i = 0; i < MAX_MB_SEGMENTS; i++)
-      cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout;
-  }
-
-  // At the moment the first order values may not be > MAXQ
-  if (cpi->oxcf.fixed_q > MAXQ)
-    cpi->oxcf.fixed_q = MAXQ;
-
-  // local file playback mode == really big buffer
-  if (cpi->oxcf.end_usage == USAGE_LOCAL_FILE_PLAYBACK) {
-    cpi->oxcf.starting_buffer_level   = 60000;
-    cpi->oxcf.optimal_buffer_level    = 60000;
-    cpi->oxcf.maximum_buffer_size     = 240000;
-  }
-
-  // Convert target bandwidth from Kbit/s to Bit/s
-  cpi->oxcf.target_bandwidth       *= 1000;
-
-  cpi->oxcf.starting_buffer_level =
-    rescale(cpi->oxcf.starting_buffer_level,
-            cpi->oxcf.target_bandwidth, 1000);
-
-  // Set or reset optimal and maximum buffer levels.
-  if (cpi->oxcf.optimal_buffer_level == 0)
-    cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;
-  else
-    cpi->oxcf.optimal_buffer_level =
-      rescale(cpi->oxcf.optimal_buffer_level,
-              cpi->oxcf.target_bandwidth, 1000);
-
-  if (cpi->oxcf.maximum_buffer_size == 0)
-    cpi->oxcf.maximum_buffer_size = cpi->oxcf.target_bandwidth / 8;
-  else
-    cpi->oxcf.maximum_buffer_size =
-      rescale(cpi->oxcf.maximum_buffer_size,
-              cpi->oxcf.target_bandwidth, 1000);
-
-  // Set up frame rate and related parameters rate control values.
-  vp9_new_frame_rate(cpi, cpi->oxcf.frame_rate);
-
-  // Set absolute upper and lower quality limits
-  cpi->worst_quality               = cpi->oxcf.worst_allowed_q;
-  cpi->best_quality                = cpi->oxcf.best_allowed_q;
-
-  // active values should only be modified if out of new range
-  if (cpi->active_worst_quality > cpi->oxcf.worst_allowed_q) {
-    cpi->active_worst_quality = cpi->oxcf.worst_allowed_q;
-  }
-  // less likely
-  else if (cpi->active_worst_quality < cpi->oxcf.best_allowed_q) {
-    cpi->active_worst_quality = cpi->oxcf.best_allowed_q;
-  }
-  if (cpi->active_best_quality < cpi->oxcf.best_allowed_q) {
-    cpi->active_best_quality = cpi->oxcf.best_allowed_q;
-  }
-  // less likely
-  else if (cpi->active_best_quality > cpi->oxcf.worst_allowed_q) {
-    cpi->active_best_quality = cpi->oxcf.worst_allowed_q;
-  }
-
-  cpi->buffered_mode = (cpi->oxcf.optimal_buffer_level > 0) ? TRUE : FALSE;
-
-  cpi->cq_target_quality = cpi->oxcf.cq_level;
-
-  if (!cm->use_bilinear_mc_filter)
-    cm->mcomp_filter_type = DEFAULT_INTERP_FILTER;
-  else
-    cm->mcomp_filter_type = BILINEAR;
-
-  cpi->target_bandwidth = cpi->oxcf.target_bandwidth;
-
-  cm->Width       = cpi->oxcf.Width;
-  cm->Height      = cpi->oxcf.Height;
-
-  cm->horiz_scale  = cpi->horiz_scale;
-  cm->vert_scale   = cpi->vert_scale;
-
-  // VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs)
-  if (cpi->oxcf.Sharpness > 7)
-    cpi->oxcf.Sharpness = 7;
-
-  cm->sharpness_level = cpi->oxcf.Sharpness;
-
-  if (cm->horiz_scale != NORMAL || cm->vert_scale != NORMAL) {
-    int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs);
-    int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs);
-
-    Scale2Ratio(cm->horiz_scale, &hr, &hs);
-    Scale2Ratio(cm->vert_scale, &vr, &vs);
-
-    // always go to the next whole number
-    cm->Width = (hs - 1 + cpi->oxcf.Width * hr) / hs;
-    cm->Height = (vs - 1 + cpi->oxcf.Height * vr) / vs;
-  }
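-  // E.g. (illustrative) a 641 pixel width at a 1/2 horizontal scale:
-  // (hs - 1 + 641 * hr) / hs = (2 - 1 + 641) / 2 = 321, a ceiling
-  // divide, so no source pixels are dropped.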
-
-  if (((cm->Width + 15) & 0xfffffff0) !=
-      cm->yv12_fb[cm->lst_fb_idx].y_width ||
-      ((cm->Height + 15) & 0xfffffff0) !=
-      cm->yv12_fb[cm->lst_fb_idx].y_height ||
-      cm->yv12_fb[cm->lst_fb_idx].y_width == 0) {
-    alloc_raw_frame_buffers(cpi);
-    vp9_alloc_compressor_data(cpi);
-  }
-
-  if (cpi->oxcf.fixed_q >= 0) {
-    cpi->last_q[0] = cpi->oxcf.fixed_q;
-    cpi->last_q[1] = cpi->oxcf.fixed_q;
-    cpi->last_boosted_qindex = cpi->oxcf.fixed_q;
-  }
-
-  cpi->Speed = cpi->oxcf.cpu_used;
-
-  // Force allow_lag to 0 if lag_in_frames is 0
-  if (cpi->oxcf.lag_in_frames == 0) {
-    cpi->oxcf.allow_lag = 0;
-  }
-  // Limit on lag buffers as these are not currently dynamically allocated
-  else if (cpi->oxcf.lag_in_frames > MAX_LAG_BUFFERS)
-    cpi->oxcf.lag_in_frames = MAX_LAG_BUFFERS;
-
-  // YX Temp
-  cpi->alt_ref_source = NULL;
-  cpi->is_src_frame_alt_ref = 0;
-
-#if 0
-  // Experimental RD Code
-  cpi->frame_distortion = 0;
-  cpi->last_frame_distortion = 0;
-#endif
-
-}
-
-// Note: despite the name, this constant is ln(2) (i.e. M_LN2), so the
-// macro below correctly computes log2(x) = log(x) / ln(2).
-#define M_LOG2_E 0.693147180559945309417
-#define log2f(x) (log(x) / (float) M_LOG2_E)
-
-static void cal_nmvjointsadcost(int *mvjointsadcost) {
-  mvjointsadcost[0] = 600;
-  mvjointsadcost[1] = 300;
-  mvjointsadcost[2] = 300;
-  mvjointsadcost[3] = 300;
-}
-
-static void cal_nmvsadcosts(int *mvsadcost[2]) {
-  int i = 1;
-
-  mvsadcost[0][0] = 0;
-  mvsadcost[1][0] = 0;
-
-  do {
-    double z = 256 * (2 * (log2f(8 * i) + .6));
-    mvsadcost[0][i] = (int)z;
-    mvsadcost[1][i] = (int)z;
-    mvsadcost[0][-i] = (int)z;
-    mvsadcost[1][-i] = (int)z;
-  } while (++i <= MV_MAX);
-}
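-// The curve above (shared by the _hp variant below) is logarithmic in
-// the MV magnitude: e.g. i == 1 gives z = 256 * (2 * (log2(8) + .6))
-// = 256 * 7.2 = 1843, and each doubling of i adds a constant 512.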
-
-static void cal_nmvsadcosts_hp(int *mvsadcost[2]) {
-  int i = 1;
-
-  mvsadcost[0][0] = 0;
-  mvsadcost[1][0] = 0;
-
-  do {
-    double z = 256 * (2 * (log2f(8 * i) + .6));
-    mvsadcost[0][i] = (int)z;
-    mvsadcost[1][i] = (int)z;
-    mvsadcost[0][-i] = (int)z;
-    mvsadcost[1][-i] = (int)z;
-  } while (++i <= MV_MAX);
-}
-
-VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
-  int i;
-  volatile union {
-    VP9_COMP *cpi;
-    VP9_PTR   ptr;
-  } ctx;
-
-  VP9_COMP *cpi;
-  VP9_COMMON *cm;
-
-  cpi = ctx.cpi = vpx_memalign(32, sizeof(VP9_COMP));
-  // Check that allocation of the compressor instance succeeded
-  if (!cpi)
-    return 0;
-
-  cm = &cpi->common;
-
-  vpx_memset(cpi, 0, sizeof(VP9_COMP));
-
-  if (setjmp(cm->error.jmp)) {
-    VP9_PTR ptr = ctx.ptr;
-
-    ctx.cpi->common.error.setjmp = 0;
-    vp9_remove_compressor(&ptr);
-    return 0;
-  }
-
-  cpi->common.error.setjmp = 1;
-
-  CHECK_MEM_ERROR(cpi->mb.ss, vpx_calloc(sizeof(search_site), (MAX_MVSEARCH_STEPS * 8) + 1));
-
-  vp9_create_common(&cpi->common);
-  vp9_cmachine_specific_config(cpi);
-
-  init_config((VP9_PTR)cpi, oxcf);
-
-  memcpy(cpi->base_skip_false_prob, base_skip_false_prob, sizeof(base_skip_false_prob));
-  cpi->common.current_video_frame   = 0;
-  cpi->kf_overspend_bits            = 0;
-  cpi->kf_bitrate_adjustment        = 0;
-  cpi->frames_till_gf_update_due      = 0;
-  cpi->gf_overspend_bits            = 0;
-  cpi->non_gf_bitrate_adjustment     = 0;
-  cm->prob_last_coded               = 128;
-  cm->prob_gf_coded                 = 128;
-  cm->prob_intra_coded              = 63;
-#if CONFIG_SUPERBLOCKS
-  cm->sb_coded                      = 200;
-#endif
-  for (i = 0; i < COMP_PRED_CONTEXTS; i++)
-    cm->prob_comppred[i]         = 128;
-  for (i = 0; i < TX_SIZE_MAX - 1; i++)
-    cm->prob_tx[i]               = 128;
-
-  // Prime the recent reference frame usage counters.
-  // Hereafter they will be maintained as a sort of moving average
-  cpi->recent_ref_frame_usage[INTRA_FRAME]  = 1;
-  cpi->recent_ref_frame_usage[LAST_FRAME]   = 1;
-  cpi->recent_ref_frame_usage[GOLDEN_FRAME] = 1;
-  cpi->recent_ref_frame_usage[ALTREF_FRAME] = 1;
-
-  // Set reference frame sign bias for ALTREF frame to 1 (for now)
-  cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 1;
-
-  cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL;
-
-  cpi->gold_is_last = 0;
-  cpi->alt_is_last  = 0;
-  cpi->gold_is_alt  = 0;
-
-  // allocate memory for storing last frame's MVs for MV prediction.
-  CHECK_MEM_ERROR(cpi->lfmv, vpx_calloc((cpi->common.mb_rows + 2) * (cpi->common.mb_cols + 2), sizeof(int_mv)));
-  CHECK_MEM_ERROR(cpi->lf_ref_frame_sign_bias, vpx_calloc((cpi->common.mb_rows + 2) * (cpi->common.mb_cols + 2), sizeof(int)));
-  CHECK_MEM_ERROR(cpi->lf_ref_frame, vpx_calloc((cpi->common.mb_rows + 2) * (cpi->common.mb_cols + 2), sizeof(int)));
-
-  // Create the encoder segmentation map and set all entries to 0
-  CHECK_MEM_ERROR(cpi->segmentation_map, vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
-
-  // And a copy in common for temporal coding
-  CHECK_MEM_ERROR(cm->last_frame_seg_map,
-                  vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
-
-  // And a placeholder structure in the coding context
-  // for use if we want to save and restore it
-  CHECK_MEM_ERROR(cpi->coding_context.last_frame_seg_map_copy,
-                  vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
-
-  CHECK_MEM_ERROR(cpi->active_map, vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
-  vpx_memset(cpi->active_map, 1, (cpi->common.mb_rows * cpi->common.mb_cols));
-  cpi->active_map_enabled = 0;
-
-  for (i = 0; i < (sizeof(cpi->mbgraph_stats) /
-                   sizeof(cpi->mbgraph_stats[0])); i++) {
-    CHECK_MEM_ERROR(cpi->mbgraph_stats[i].mb_stats,
-                    vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols *
-                               sizeof(*cpi->mbgraph_stats[i].mb_stats),
-                               1));
-  }
-
-#ifdef ENTROPY_STATS
-  if (cpi->pass != 1)
-    init_context_counters();
-#endif
-#ifdef MODE_STATS
-  vp9_zero(y_modes);
-  vp9_zero(i8x8_modes);
-  vp9_zero(uv_modes);
-  vp9_zero(uv_modes_y);
-  vp9_zero(b_modes);
-  vp9_zero(inter_y_modes);
-  vp9_zero(inter_uv_modes);
-  vp9_zero(inter_b_modes);
-#endif
-#ifdef NMV_STATS
-  init_nmvstats();
-#endif
-
-  /* Initialize the feed-forward activity masking. */
-  cpi->activity_avg = 90 << 12;
-
-  cpi->frames_since_key = 8;        // Give a sensible default for the first frame.
-  cpi->key_frame_frequency = cpi->oxcf.key_freq;
-  cpi->this_key_frame_forced = FALSE;
-  cpi->next_key_frame_forced = FALSE;
-
-  cpi->source_alt_ref_pending = FALSE;
-  cpi->source_alt_ref_active = FALSE;
-  cpi->common.refresh_alt_ref_frame = 0;
-
-  cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
-#if CONFIG_INTERNAL_STATS
-  cpi->b_calculate_ssimg = 0;
-
-  cpi->count = 0;
-  cpi->bytes = 0;
-
-  if (cpi->b_calculate_psnr) {
-    cpi->total_sq_error = 0.0;
-    cpi->total_sq_error2 = 0.0;
-    cpi->total_y = 0.0;
-    cpi->total_u = 0.0;
-    cpi->total_v = 0.0;
-    cpi->total = 0.0;
-    cpi->totalp_y = 0.0;
-    cpi->totalp_u = 0.0;
-    cpi->totalp_v = 0.0;
-    cpi->totalp = 0.0;
-    cpi->tot_recode_hits = 0;
-    cpi->summed_quality = 0;
-    cpi->summed_weights = 0;
-  }
-
-  if (cpi->b_calculate_ssimg) {
-    cpi->total_ssimg_y = 0;
-    cpi->total_ssimg_u = 0;
-    cpi->total_ssimg_v = 0;
-    cpi->total_ssimg_all = 0;
-  }
-
-#endif
-
-#ifndef LLONG_MAX
-#define LLONG_MAX  9223372036854775807LL
-#endif
-  cpi->first_time_stamp_ever = LLONG_MAX;
-
-  cpi->frames_till_gf_update_due      = 0;
-  cpi->key_frame_count              = 1;
-
-  cpi->ni_av_qi                     = cpi->oxcf.worst_allowed_q;
-  cpi->ni_tot_qi                    = 0;
-  cpi->ni_frames                   = 0;
-  cpi->tot_q = 0.0;
-  cpi->avg_q = vp9_convert_qindex_to_q(cpi->oxcf.worst_allowed_q);
-  cpi->total_byte_count             = 0;
-
-  cpi->rate_correction_factor         = 1.0;
-  cpi->key_frame_rate_correction_factor = 1.0;
-  cpi->gf_rate_correction_factor  = 1.0;
-  cpi->twopass.est_max_qcorrection_factor  = 1.0;
-
-  cal_nmvjointsadcost(cpi->mb.nmvjointsadcost);
-  cpi->mb.nmvcost[0] = &cpi->mb.nmvcosts[0][MV_MAX];
-  cpi->mb.nmvcost[1] = &cpi->mb.nmvcosts[1][MV_MAX];
-  cpi->mb.nmvsadcost[0] = &cpi->mb.nmvsadcosts[0][MV_MAX];
-  cpi->mb.nmvsadcost[1] = &cpi->mb.nmvsadcosts[1][MV_MAX];
-  cal_nmvsadcosts(cpi->mb.nmvsadcost);
-
-  cpi->mb.nmvcost_hp[0] = &cpi->mb.nmvcosts_hp[0][MV_MAX];
-  cpi->mb.nmvcost_hp[1] = &cpi->mb.nmvcosts_hp[1][MV_MAX];
-  cpi->mb.nmvsadcost_hp[0] = &cpi->mb.nmvsadcosts_hp[0][MV_MAX];
-  cpi->mb.nmvsadcost_hp[1] = &cpi->mb.nmvsadcosts_hp[1][MV_MAX];
-  cal_nmvsadcosts_hp(cpi->mb.nmvsadcost_hp);
-
-  for (i = 0; i < KEY_FRAME_CONTEXT; i++) {
-    cpi->prior_key_frame_distance[i] = (int)cpi->output_frame_rate;
-  }
-
-#ifdef OUTPUT_YUV_SRC
-  yuv_file = fopen("bd.yuv", "ab");
-#endif
-#ifdef OUTPUT_YUV_REC
-  yuv_rec_file = fopen("rec.yuv", "wb");
-#endif
-
-#if 0
-  framepsnr = fopen("framepsnr.stt", "a");
-  kf_list = fopen("kf_list.stt", "w");
-#endif
-
-  cpi->output_pkt_list = oxcf->output_pkt_list;
-
-  if (cpi->pass == 1) {
-    vp9_init_first_pass(cpi);
-  } else if (cpi->pass == 2) {
-    size_t packet_sz = sizeof(FIRSTPASS_STATS);
-    int packets = oxcf->two_pass_stats_in.sz / packet_sz;
-
-    cpi->twopass.stats_in_start = oxcf->two_pass_stats_in.buf;
-    cpi->twopass.stats_in = cpi->twopass.stats_in_start;
-    cpi->twopass.stats_in_end = (void *)((char *)cpi->twopass.stats_in
-                                         + (packets - 1) * packet_sz);
-    vp9_init_second_pass(cpi);
-  }
-
-  vp9_set_speed_features(cpi);
-
-  // Set starting values of RD threshold multipliers (128 = *1)
-  for (i = 0; i < MAX_MODES; i++) {
-    cpi->rd_thresh_mult[i] = 128;
-  }
-
-#ifdef ENTROPY_STATS
-  init_mv_ref_counts();
-#endif
-
-#define BFP(BT, SDF, VF, SVF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF) \
-    cpi->fn_ptr[BT].sdf            = SDF; \
-    cpi->fn_ptr[BT].vf             = VF; \
-    cpi->fn_ptr[BT].svf            = SVF; \
-    cpi->fn_ptr[BT].svf_halfpix_h  = SVFHH; \
-    cpi->fn_ptr[BT].svf_halfpix_v  = SVFHV; \
-    cpi->fn_ptr[BT].svf_halfpix_hv = SVFHHV; \
-    cpi->fn_ptr[BT].sdx3f          = SDX3F; \
-    cpi->fn_ptr[BT].sdx8f          = SDX8F; \
-    cpi->fn_ptr[BT].sdx4df         = SDX4DF;
-
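-// BFP fills in the per-block-size SAD/variance function table, so e.g.
-// the BLOCK_16X16 line below points cpi->fn_ptr[BLOCK_16X16].sdf at
-// vp9_sad16x16; the half-pixel variance slots are left NULL for the
-// sizes that do not provide them.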
-
-#if CONFIG_SUPERBLOCKS
-  BFP(BLOCK_32X32, vp9_sad32x32, vp9_variance32x32, vp9_sub_pixel_variance32x32,
-      vp9_variance_halfpixvar32x32_h, vp9_variance_halfpixvar32x32_v,
-      vp9_variance_halfpixvar32x32_hv, vp9_sad32x32x3, vp9_sad32x32x8,
-      vp9_sad32x32x4d)
-#endif
-
-  BFP(BLOCK_16X16, vp9_sad16x16, vp9_variance16x16, vp9_sub_pixel_variance16x16,
-       vp9_variance_halfpixvar16x16_h, vp9_variance_halfpixvar16x16_v,
-       vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8,
-       vp9_sad16x16x4d)
-
-  BFP(BLOCK_16X8, vp9_sad16x8, vp9_variance16x8, vp9_sub_pixel_variance16x8,
-      NULL, NULL, NULL, vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d)
-
-  BFP(BLOCK_8X16, vp9_sad8x16, vp9_variance8x16, vp9_sub_pixel_variance8x16,
-      NULL, NULL, NULL, vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d)
-
-  BFP(BLOCK_8X8, vp9_sad8x8, vp9_variance8x8, vp9_sub_pixel_variance8x8,
-      NULL, NULL, NULL, vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
-
-  BFP(BLOCK_4X4, vp9_sad4x4, vp9_variance4x4, vp9_sub_pixel_variance4x4,
-      NULL, NULL, NULL, vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
-
-#if ARCH_X86 || ARCH_X86_64
-  cpi->fn_ptr[BLOCK_16X16].copymem  = vp9_copy32xn;
-  cpi->fn_ptr[BLOCK_16X8].copymem   = vp9_copy32xn;
-  cpi->fn_ptr[BLOCK_8X16].copymem   = vp9_copy32xn;
-  cpi->fn_ptr[BLOCK_8X8].copymem    = vp9_copy32xn;
-  cpi->fn_ptr[BLOCK_4X4].copymem    = vp9_copy32xn;
-#endif
-
-  cpi->full_search_sad = SEARCH_INVOKE(&cpi->rtcd.search, full_search);
-  cpi->diamond_search_sad = SEARCH_INVOKE(&cpi->rtcd.search, diamond_search);
-  cpi->refining_search_sad = SEARCH_INVOKE(&cpi->rtcd.search, refining_search);
-
-  // make sure frame 1 is okay
-  cpi->error_bins[0] = cpi->common.MBs;
-
-  /* vp9_init_quantizer() is first called here. Add check in
-   * vp9_frame_init_quantizer() so that vp9_init_quantizer is only
-   * called later when needed. This will avoid unnecessary calls of
-   * vp9_init_quantizer() for every frame.
-   */
-  vp9_init_quantizer(cpi);
-
-  vp9_loop_filter_init(cm);
-
-  cpi->common.error.setjmp = 0;
-
-  vp9_zero(cpi->y_uv_mode_count)
-
-  return (VP9_PTR) cpi;
-}
-
-void vp9_remove_compressor(VP9_PTR *ptr) {
-  VP9_COMP *cpi = (VP9_COMP *)(*ptr);
-  int i;
-
-  if (!cpi)
-    return;
-
-  if (cpi && (cpi->common.current_video_frame > 0)) {
-    if (cpi->pass == 2) {
-      vp9_end_second_pass(cpi);
-    }
-
-#ifdef ENTROPY_STATS
-    if (cpi->pass != 1) {
-      print_context_counters();
-      print_tree_update_probs();
-      print_mode_context();
-    }
-#endif
-#ifdef NMV_STATS
-    if (cpi->pass != 1)
-      print_nmvstats();
-#endif
-
-#if CONFIG_INTERNAL_STATS
-
-    vp9_clear_system_state();
-
-    // printf("\n8x8-4x4:%d-%d\n", cpi->t8x8_count, cpi->t4x4_count);
-    if (cpi->pass != 1) {
-      FILE *f = fopen("opsnr.stt", "a");
-      double time_encoded = (cpi->last_end_time_stamp_seen
-                             - cpi->first_time_stamp_ever) / 10000000.000;
-      double total_encode_time = (cpi->time_receive_data + cpi->time_compress_data)   / 1000.000;
-      double dr = (double)cpi->bytes * (double) 8 / (double)1000  / time_encoded;
-#if defined(MODE_STATS)
-      print_mode_contexts(&cpi->common);
-#endif
-      if (cpi->b_calculate_psnr) {
-        YV12_BUFFER_CONFIG *lst_yv12 = &cpi->common.yv12_fb[cpi->common.lst_fb_idx];
-        double samples = 3.0 / 2 * cpi->count * lst_yv12->y_width * lst_yv12->y_height;
-        double total_psnr = vp9_mse2psnr(samples, 255.0, cpi->total_sq_error);
-        double total_psnr2 = vp9_mse2psnr(samples, 255.0, cpi->total_sq_error2);
-        double total_ssim = 100 * pow(cpi->summed_quality / cpi->summed_weights, 8.0);
-
-        fprintf(f, "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\tVPXSSIM\t  Time(ms)\n");
-        fprintf(f, "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%8.0f\n",
-                dr, cpi->total / cpi->count, total_psnr, cpi->totalp / cpi->count, total_psnr2, total_ssim,
-                total_encode_time);
-//                fprintf(f, "%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%8.0f %10ld\n",
-//                        dr, cpi->total / cpi->count, total_psnr, cpi->totalp / cpi->count, total_psnr2, total_ssim,
-//                        total_encode_time, cpi->tot_recode_hits);
-      }
-
-      if (cpi->b_calculate_ssimg) {
-        fprintf(f, "BitRate\tSSIM_Y\tSSIM_U\tSSIM_V\tSSIM_A\t  Time(ms)\n");
-        fprintf(f, "%7.2f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%8.0f\n", dr,
-                cpi->total_ssimg_y / cpi->count, cpi->total_ssimg_u / cpi->count,
-                cpi->total_ssimg_v / cpi->count, cpi->total_ssimg_all / cpi->count, total_encode_time);
-//                fprintf(f, "%7.3f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%8.0f  %10ld\n", dr,
-//                        cpi->total_ssimg_y / cpi->count, cpi->total_ssimg_u / cpi->count,
-//                        cpi->total_ssimg_v / cpi->count, cpi->total_ssimg_all / cpi->count, total_encode_time, cpi->tot_recode_hits);
-      }
-
-      fclose(f);
-    }
-
-#endif
-
-
-#ifdef MODE_STATS
-    {
-      extern int count_mb_seg[4];
-      char modes_stats_file[250];
-      FILE *f;
-      double dr = (double)cpi->oxcf.frame_rate * (double)cpi->bytes * (double)8 / (double)cpi->count / (double)1000;
-      sprintf(modes_stats_file, "modes_q%03d.stt", cpi->common.base_qindex);
-      f = fopen(modes_stats_file, "w");
-      fprintf(f, "intra_mode in Intra Frames:\n");
-      {
-        int i;
-        fprintf(f, "Y: ");
-        for (i = 0; i < VP9_YMODES; i++) fprintf(f, " %8d,", y_modes[i]);
-        fprintf(f, "\n");
-      }
-      {
-        int i;
-        fprintf(f, "I8: ");
-        for (i = 0; i < VP9_I8X8_MODES; i++) fprintf(f, " %8d,", i8x8_modes[i]);
-        fprintf(f, "\n");
-      }
-      {
-        int i;
-        fprintf(f, "UV: ");
-        for (i = 0; i < VP9_UV_MODES; i++) fprintf(f, " %8d,", uv_modes[i]);
-        fprintf(f, "\n");
-      }
-      {
-        int i, j;
-        fprintf(f, "KeyFrame Y-UV:\n");
-        for (i = 0; i < VP9_YMODES; i++) {
-          fprintf(f, "%2d:", i);
-          for (j = 0; j < VP9_UV_MODES; j++) fprintf(f, "%8d, ", uv_modes_y[i][j]);
-          fprintf(f, "\n");
-        }
-      }
-      {
-        int i, j;
-        fprintf(f, "Inter Y-UV:\n");
-        for (i = 0; i < VP9_YMODES; i++) {
-          fprintf(f, "%2d:", i);
-          for (j = 0; j < VP9_UV_MODES; j++) fprintf(f, "%8d, ", cpi->y_uv_mode_count[i][j]);
-          fprintf(f, "\n");
-        }
-      }
-      {
-        int i;
-
-        fprintf(f, "B: ");
-        for (i = 0; i < VP9_BINTRAMODES; i++)
-          fprintf(f, "%8d, ", b_modes[i]);
-
-        fprintf(f, "\n");
-
-      }
-
-      fprintf(f, "Modes in Inter Frames:\n");
-      {
-        int i;
-        fprintf(f, "Y: ");
-        for (i = 0; i < MB_MODE_COUNT; i++) fprintf(f, " %8d,", inter_y_modes[i]);
-        fprintf(f, "\n");
-      }
-      {
-        int i;
-        fprintf(f, "UV: ");
-        for (i = 0; i < VP9_UV_MODES; i++) fprintf(f, " %8d,", inter_uv_modes[i]);
-        fprintf(f, "\n");
-      }
-      {
-        int i;
-        fprintf(f, "B: ");
-        for (i = 0; i < B_MODE_COUNT; i++) fprintf(f, "%8d, ", inter_b_modes[i]);
-        fprintf(f, "\n");
-      }
-      fprintf(f, "P:%8d, %8d, %8d, %8d\n", count_mb_seg[0], count_mb_seg[1], count_mb_seg[2], count_mb_seg[3]);
-      fprintf(f, "PB:%8d, %8d, %8d, %8d\n", inter_b_modes[LEFT4X4], inter_b_modes[ABOVE4X4], inter_b_modes[ZERO4X4], inter_b_modes[NEW4X4]);
-      fclose(f);
-    }
-#endif
-
-#ifdef ENTROPY_STATS
-    {
-      int i, j, k;
-      FILE *fmode = fopen("modecontext.c", "w");
-
-      fprintf(fmode, "\n#include \"entropymode.h\"\n\n");
-      fprintf(fmode, "const unsigned int vp9_kf_default_bmode_counts ");
-      fprintf(fmode, "[VP9_BINTRAMODES] [VP9_BINTRAMODES] [VP9_BINTRAMODES] =\n{\n");
-
-      for (i = 0; i < 10; i++) {
-
-        fprintf(fmode, "    { // Above Mode :  %d\n", i);
-
-        for (j = 0; j < 10; j++) {
-
-          fprintf(fmode, "        {");
-
-          for (k = 0; k < VP9_BINTRAMODES; k++) {
-            if (!intra_mode_stats[i][j][k])
-              fprintf(fmode, " %5d, ", 1);
-            else
-              fprintf(fmode, " %5d, ", intra_mode_stats[i][j][k]);
-          }
-
-          fprintf(fmode, "}, // left_mode %d\n", j);
-
-        }
-
-        fprintf(fmode, "    },\n");
-
-      }
-
-      fprintf(fmode, "};\n");
-      fclose(fmode);
-    }
-#endif
-
-
-#if defined(SECTIONBITS_OUTPUT)
-
-    if (0) {
-      int i;
-      FILE *f = fopen("tokenbits.stt", "a");
-
-      for (i = 0; i < 28; i++)
-        fprintf(f, "%8d", (int)(Sectionbits[i] / 256));
-
-      fprintf(f, "\n");
-      fclose(f);
-    }
-
-#endif
-
-#if 0
-    {
-      printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000);
-      printf("\n_frames recive_data encod_mb_row compress_frame  Total\n");
-      printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame, cpi->time_receive_data / 1000, cpi->time_encode_mb_row / 1000, cpi->time_compress_data / 1000, (cpi->time_receive_data + cpi->time_compress_data) / 1000);
-    }
-#endif
-
-  }
-
-  dealloc_compressor_data(cpi);
-  vpx_free(cpi->mb.ss);
-  vpx_free(cpi->tok);
-
-  for (i = 0; i < sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0]); i++) {
-    vpx_free(cpi->mbgraph_stats[i].mb_stats);
-  }
-
-  vp9_remove_common(&cpi->common);
-  vpx_free(cpi);
-  *ptr = 0;
-
-#ifdef OUTPUT_YUV_SRC
-  fclose(yuv_file);
-#endif
-#ifdef OUTPUT_YUV_REC
-  fclose(yuv_rec_file);
-#endif
-
-#if 0
-
-  if (keyfile)
-    fclose(keyfile);
-
-  if (framepsnr)
-    fclose(framepsnr);
-
-  if (kf_list)
-    fclose(kf_list);
-
-#endif
-
-}
-
-
-static uint64_t calc_plane_error(unsigned char *orig, int orig_stride,
-                                 unsigned char *recon, int recon_stride,
-                                 unsigned int cols, unsigned int rows) {
-  unsigned int row, col;
-  uint64_t total_sse = 0;
-  int diff;
-
-  for (row = 0; row + 16 <= rows; row += 16) {
-    for (col = 0; col + 16 <= cols; col += 16) {
-      unsigned int sse;
-
-      vp9_mse16x16(orig + col, orig_stride, recon + col, recon_stride, &sse);
-      total_sse += sse;
-    }
-
-    /* Handle odd-sized width */
-    if (col < cols) {
-      unsigned int   border_row, border_col;
-      unsigned char *border_orig = orig;
-      unsigned char *border_recon = recon;
-
-      for (border_row = 0; border_row < 16; border_row++) {
-        for (border_col = col; border_col < cols; border_col++) {
-          diff = border_orig[border_col] - border_recon[border_col];
-          total_sse += diff * diff;
-        }
-
-        border_orig += orig_stride;
-        border_recon += recon_stride;
-      }
-    }
-
-    orig += orig_stride * 16;
-    recon += recon_stride * 16;
-  }
-
-  /* Handle odd-sized height */
-  for (; row < rows; row++) {
-    for (col = 0; col < cols; col++) {
-      diff = orig[col] - recon[col];
-      total_sse += diff * diff;
-    }
-
-    orig += orig_stride;
-    recon += recon_stride;
-  }
-
-  return total_sse;
-}
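-// The plane error is accumulated in 16x16 tiles via vp9_mse16x16, with
-// scalar mop-up for the borders; e.g. (illustrative) a 24x20 plane uses
-// one 16x16 tile, an 8 pixel wide right border over those 16 rows, and
-// then 4 full scalar rows at the bottom.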
-
-
-static void generate_psnr_packet(VP9_COMP *cpi) {
-  YV12_BUFFER_CONFIG      *orig = cpi->Source;
-  YV12_BUFFER_CONFIG      *recon = cpi->common.frame_to_show;
-  struct vpx_codec_cx_pkt  pkt;
-  uint64_t                 sse;
-  int                      i;
-  unsigned int             width = cpi->common.Width;
-  unsigned int             height = cpi->common.Height;
-
-  pkt.kind = VPX_CODEC_PSNR_PKT;
-  sse = calc_plane_error(orig->y_buffer, orig->y_stride,
-                         recon->y_buffer, recon->y_stride,
-                         width, height);
-  pkt.data.psnr.sse[0] = sse;
-  pkt.data.psnr.sse[1] = sse;
-  pkt.data.psnr.samples[0] = width * height;
-  pkt.data.psnr.samples[1] = width * height;
-
-  width = (width + 1) / 2;
-  height = (height + 1) / 2;
-
-  sse = calc_plane_error(orig->u_buffer, orig->uv_stride,
-                         recon->u_buffer, recon->uv_stride,
-                         width, height);
-  pkt.data.psnr.sse[0] += sse;
-  pkt.data.psnr.sse[2] = sse;
-  pkt.data.psnr.samples[0] += width * height;
-  pkt.data.psnr.samples[2] = width * height;
-
-  sse = calc_plane_error(orig->v_buffer, orig->uv_stride,
-                         recon->v_buffer, recon->uv_stride,
-                         width, height);
-  pkt.data.psnr.sse[0] += sse;
-  pkt.data.psnr.sse[3] = sse;
-  pkt.data.psnr.samples[0] += width * height;
-  pkt.data.psnr.samples[3] = width * height;
-
-  for (i = 0; i < 4; i++)
-    pkt.data.psnr.psnr[i] = vp9_mse2psnr(pkt.data.psnr.samples[i], 255.0,
-                                         pkt.data.psnr.sse[i]);
-
-  vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
-}
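-// sse[0] / samples[0] hold the whole-frame totals and indices 1..3 the
-// Y, U and V planes; assuming vp9_mse2psnr computes the usual
-// 10 * log10(samples * 255^2 / sse), a frame with sse == samples (an
-// MSE of 1) reports roughly 48.13 dB.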
-
-
-int vp9_use_as_reference(VP9_PTR ptr, int ref_frame_flags) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
-
-  if (ref_frame_flags > 7)
-    return -1;
-
-  cpi->ref_frame_flags = ref_frame_flags;
-  return 0;
-}
-int vp9_update_reference(VP9_PTR ptr, int ref_frame_flags) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
-
-  if (ref_frame_flags > 7)
-    return -1;
-
-  cpi->common.refresh_golden_frame = 0;
-  cpi->common.refresh_alt_ref_frame = 0;
-  cpi->common.refresh_last_frame   = 0;
-
-  if (ref_frame_flags & VP9_LAST_FLAG)
-    cpi->common.refresh_last_frame = 1;
-
-  if (ref_frame_flags & VP9_GOLD_FLAG)
-    cpi->common.refresh_golden_frame = 1;
-
-  if (ref_frame_flags & VP9_ALT_FLAG)
-    cpi->common.refresh_alt_ref_frame = 1;
-
-  return 0;
-}
-
-int vp9_get_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag,
-                          YV12_BUFFER_CONFIG *sd) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
-  VP9_COMMON *cm = &cpi->common;
-  int ref_fb_idx;
-
-  if (ref_frame_flag == VP9_LAST_FLAG)
-    ref_fb_idx = cm->lst_fb_idx;
-  else if (ref_frame_flag == VP9_GOLD_FLAG)
-    ref_fb_idx = cm->gld_fb_idx;
-  else if (ref_frame_flag == VP9_ALT_FLAG)
-    ref_fb_idx = cm->alt_fb_idx;
-  else
-    return -1;
-
-  vp8_yv12_copy_frame_ptr(&cm->yv12_fb[ref_fb_idx], sd);
-
-  return 0;
-}
-
-int vp9_set_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag,
-                          YV12_BUFFER_CONFIG *sd) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
-  VP9_COMMON *cm = &cpi->common;
-
-  int ref_fb_idx;
-
-  if (ref_frame_flag == VP9_LAST_FLAG)
-    ref_fb_idx = cm->lst_fb_idx;
-  else if (ref_frame_flag == VP9_GOLD_FLAG)
-    ref_fb_idx = cm->gld_fb_idx;
-  else if (ref_frame_flag == VP9_ALT_FLAG)
-    ref_fb_idx = cm->alt_fb_idx;
-  else
-    return -1;
-
-  vp8_yv12_copy_frame_ptr(sd, &cm->yv12_fb[ref_fb_idx]);
-
-  return 0;
-}
-int vp9_update_entropy(VP9_PTR comp, int update) {
-  VP9_COMP *cpi = (VP9_COMP *) comp;
-  VP9_COMMON *cm = &cpi->common;
-  cm->refresh_entropy_probs = update;
-
-  return 0;
-}
-
-
-#ifdef OUTPUT_YUV_SRC
-void vp9_write_yuv_frame(YV12_BUFFER_CONFIG *s) {
-  unsigned char *src = s->y_buffer;
-  int h = s->y_height;
-
-  do {
-    fwrite(src, s->y_width, 1,  yuv_file);
-    src += s->y_stride;
-  } while (--h);
-
-  src = s->u_buffer;
-  h = s->uv_height;
-
-  do {
-    fwrite(src, s->uv_width, 1,  yuv_file);
-    src += s->uv_stride;
-  } while (--h);
-
-  src = s->v_buffer;
-  h = s->uv_height;
-
-  do {
-    fwrite(src, s->uv_width, 1, yuv_file);
-    src += s->uv_stride;
-  } while (--h);
-}
-#endif
-
-#ifdef OUTPUT_YUV_REC
-void vp9_write_yuv_rec_frame(VP9_COMMON *cm) {
-  YV12_BUFFER_CONFIG *s = cm->frame_to_show;
-  unsigned char *src = s->y_buffer;
-  int h = cm->Height;
-
-  do {
-    fwrite(src, s->y_width, 1,  yuv_rec_file);
-    src += s->y_stride;
-  } while (--h);
-
-  src = s->u_buffer;
-  h = (cm->Height + 1) / 2;
-
-  do {
-    fwrite(src, s->uv_width, 1,  yuv_rec_file);
-    src += s->uv_stride;
-  } while (--h);
-
-  src = s->v_buffer;
-  h = (cm->Height + 1) / 2;
-
-  do {
-    fwrite(src, s->uv_width, 1, yuv_rec_file);
-    src += s->uv_stride;
-  } while (--h);
-}
-#endif
-
-static void update_alt_ref_frame_stats(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-
-  // Update data structure that monitors level of reference to last GF
-  vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
-  cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
-
-  // This frame's refresh means subsequent frames don't refresh unless the user requests it
-  cpi->common.frames_since_golden = 0;
-
-  // Clear the alternate reference update pending flag.
-  cpi->source_alt_ref_pending = FALSE;
-
-  // Set the alternate reference frame active flag
-  cpi->source_alt_ref_active = TRUE;
-}
-
-static void update_golden_frame_stats(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-
-  // Update the Golden frame usage counts.
-  if (cm->refresh_golden_frame) {
-    // Update data structure that monitors level of reference to last GF
-    vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
-    cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
-
-    // This frame's refresh means subsequent frames don't refresh unless the user requests it
-    cm->refresh_golden_frame = 0;
-    cpi->common.frames_since_golden = 0;
-
-    // if ( cm->frame_type == KEY_FRAME )
-    // {
-    cpi->recent_ref_frame_usage[INTRA_FRAME] = 1;
-    cpi->recent_ref_frame_usage[LAST_FRAME] = 1;
-    cpi->recent_ref_frame_usage[GOLDEN_FRAME] = 1;
-    cpi->recent_ref_frame_usage[ALTREF_FRAME] = 1;
-    // }
-    // else
-    // {
-    //  // Carry a portion of count over to beginning of next gf sequence
-    //  cpi->recent_ref_frame_usage[INTRA_FRAME] >>= 5;
-    //  cpi->recent_ref_frame_usage[LAST_FRAME] >>= 5;
-    //  cpi->recent_ref_frame_usage[GOLDEN_FRAME] >>= 5;
-    //  cpi->recent_ref_frame_usage[ALTREF_FRAME] >>= 5;
-    // }
-
-    // ******** Fixed Q test code only ************
-    // If we are going to use the ALT reference for the next group of
-    // frames, set a flag to say so.
-    if (cpi->oxcf.fixed_q >= 0 &&
-        cpi->oxcf.play_alternate && !cpi->common.refresh_alt_ref_frame) {
-      cpi->source_alt_ref_pending = TRUE;
-      cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
-    }
-
-    if (!cpi->source_alt_ref_pending)
-      cpi->source_alt_ref_active = FALSE;
-
-    // Decrement count down till next gf
-    if (cpi->frames_till_gf_update_due > 0)
-      cpi->frames_till_gf_update_due--;
-
-  } else if (!cpi->common.refresh_alt_ref_frame) {
-    // Decrement count down till next gf
-    if (cpi->frames_till_gf_update_due > 0)
-      cpi->frames_till_gf_update_due--;
-
-    if (cpi->common.frames_till_alt_ref_frame)
-      cpi->common.frames_till_alt_ref_frame--;
-
-    cpi->common.frames_since_golden++;
-
-    if (cpi->common.frames_since_golden > 1) {
-      cpi->recent_ref_frame_usage[INTRA_FRAME] += cpi->count_mb_ref_frame_usage[INTRA_FRAME];
-      cpi->recent_ref_frame_usage[LAST_FRAME] += cpi->count_mb_ref_frame_usage[LAST_FRAME];
-      cpi->recent_ref_frame_usage[GOLDEN_FRAME] += cpi->count_mb_ref_frame_usage[GOLDEN_FRAME];
-      cpi->recent_ref_frame_usage[ALTREF_FRAME] += cpi->count_mb_ref_frame_usage[ALTREF_FRAME];
-    }
-  }
-}
-
-static int find_fp_qindex(void) {
-  int i;
-
-  for (i = 0; i < QINDEX_RANGE; i++) {
-    if (vp9_convert_qindex_to_q(i) >= 30.0) {
-      break;
-    }
-  }
-
-  if (i == QINDEX_RANGE)
-    i--;
-
-  return i;
-}
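-// I.e. the first pass runs at the lowest qindex whose real Q reaches
-// 30.0; since vp9_convert_qindex_to_q() is monotonic a linear scan
-// suffices, clamped to the top of QINDEX_RANGE if never reached.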
-
-static void Pass1Encode(VP9_COMP *cpi, unsigned long *size,
-                        unsigned char *dest, unsigned int *frame_flags) {
-  (void) size;
-  (void) dest;
-  (void) frame_flags;
-
-  vp9_set_quantizer(cpi, find_fp_qindex());
-  vp9_first_pass(cpi);
-}
-
-#define WRITE_RECON_BUFFER 0
-#if WRITE_RECON_BUFFER
-void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
-
-  // write the frame
-  FILE *yframe;
-  int i;
-  char filename[255];
-
-  sprintf(filename, "cx\\y%04d.raw", this_frame);
-  yframe = fopen(filename, "wb");
-
-  for (i = 0; i < frame->y_height; i++)
-    fwrite(frame->y_buffer + i * frame->y_stride,
-           frame->y_width, 1, yframe);
-
-  fclose(yframe);
-  sprintf(filename, "cx\\u%04d.raw", this_frame);
-  yframe = fopen(filename, "wb");
-
-  for (i = 0; i < frame->uv_height; i++)
-    fwrite(frame->u_buffer + i * frame->uv_stride,
-           frame->uv_width, 1, yframe);
-
-  fclose(yframe);
-  sprintf(filename, "cx\\v%04d.raw", this_frame);
-  yframe = fopen(filename, "wb");
-
-  for (i = 0; i < frame->uv_height; i++)
-    fwrite(frame->v_buffer + i * frame->uv_stride,
-           frame->uv_width, 1, yframe);
-
-  fclose(yframe);
-}
-#endif
-
-static double compute_edge_pixel_proportion(YV12_BUFFER_CONFIG *frame) {
-#define EDGE_THRESH 128
-  int i, j;
-  int num_edge_pels = 0;
-  int num_pels = (frame->y_height - 2) * (frame->y_width - 2);
-  unsigned char *prev = frame->y_buffer + 1;
-  unsigned char *curr = frame->y_buffer + 1 + frame->y_stride;
-  unsigned char *next = frame->y_buffer + 1 + 2 * frame->y_stride;
-  for (i = 1; i < frame->y_height - 1; i++) {
-    for (j = 1; j < frame->y_width - 1; j++) {
-      /* Sobel hor and ver gradients */
-      int v = 2 * (curr[1] - curr[-1]) + (prev[1] - prev[-1]) + (next[1] - next[-1]);
-      int h = 2 * (prev[0] - next[0]) + (prev[1] - next[1]) + (prev[-1] - next[-1]);
-      h = (h < 0 ? -h : h);
-      v = (v < 0 ? -v : v);
-      if (h > EDGE_THRESH || v > EDGE_THRESH) num_edge_pels++;
-      curr++;
-      prev++;
-      next++;
-    }
-    curr += frame->y_stride - frame->y_width + 2;
-    prev += frame->y_stride - frame->y_width + 2;
-    next += frame->y_stride - frame->y_width + 2;
-  }
-  return (double)num_edge_pels / (double)num_pels;
-}
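-// h and v above are 3x3 Sobel responses. As a sanity check
-// (illustrative): an ideal vertical step of amplitude a yields
-// |v| = 2a + a + a = 4a, so with EDGE_THRESH 128 a step must exceed
-// 32 grey levels for the pixel to count as an edge.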
-
-// Function to test for conditions that indicate we should loop
-// back and recode a frame.
-static BOOL recode_loop_test(VP9_COMP *cpi,
-                             int high_limit, int low_limit,
-                             int q, int maxq, int minq) {
-  BOOL    force_recode = FALSE;
-  VP9_COMMON *cm = &cpi->common;
-
-  // Is frame recode allowed at all?
-  // Yes if either recode mode 1 is selected, or if mode 2 is selected
-  // and the frame is a key frame, golden frame or alt_ref frame
-  if ((cpi->sf.recode_loop == 1) ||
-      ((cpi->sf.recode_loop == 2) &&
-       ((cm->frame_type == KEY_FRAME) ||
-        cm->refresh_golden_frame ||
-        cm->refresh_alt_ref_frame))) {
-    // General over and under shoot tests
-    if (((cpi->projected_frame_size > high_limit) && (q < maxq)) ||
-        ((cpi->projected_frame_size < low_limit) && (q > minq))) {
-      force_recode = TRUE;
-    }
-    // Special Constrained quality tests
-    else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
-      // Undershoot and below auto cq level
-      if ((q > cpi->cq_target_quality) &&
-          (cpi->projected_frame_size <
-           ((cpi->this_frame_target * 7) >> 3))) {
-        force_recode = TRUE;
-      }
-      // Severe undershoot and between auto and user cq level
-      else if ((q > cpi->oxcf.cq_level) &&
-               (cpi->projected_frame_size < cpi->min_frame_bandwidth) &&
-               (cpi->active_best_quality > cpi->oxcf.cq_level)) {
-        force_recode = TRUE;
-        cpi->active_best_quality = cpi->oxcf.cq_level;
-      }
-    }
-  }
-
-  return force_recode;
-}
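-// For illustration (hypothetical numbers): with a 12000 bit target and
-// bounds of roughly +/-25%, a projected size of 20000 bits while
-// q < maxq trips the overshoot test and the frame is re-encoded at a
-// higher q.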
-
-static void update_reference_frames(VP9_COMMON *cm) {
-  YV12_BUFFER_CONFIG *yv12_fb = cm->yv12_fb;
-
-  // At this point the new frame has been encoded.
-  // If any buffer copy / swapping is signaled it should be done here.
-
-  if (cm->frame_type == KEY_FRAME) {
-    yv12_fb[cm->new_fb_idx].flags |= VP9_GOLD_FLAG | VP9_ALT_FLAG;
-
-    yv12_fb[cm->gld_fb_idx].flags &= ~VP9_GOLD_FLAG;
-    yv12_fb[cm->alt_fb_idx].flags &= ~VP9_ALT_FLAG;
-
-    cm->alt_fb_idx = cm->gld_fb_idx = cm->new_fb_idx;
-  } else { /* For non key frames */
-    if (cm->refresh_alt_ref_frame) {
-      assert(!cm->copy_buffer_to_arf);
-
-      cm->yv12_fb[cm->new_fb_idx].flags |= VP9_ALT_FLAG;
-      cm->yv12_fb[cm->alt_fb_idx].flags &= ~VP9_ALT_FLAG;
-      cm->alt_fb_idx = cm->new_fb_idx;
-    } else if (cm->copy_buffer_to_arf) {
-      assert(!(cm->copy_buffer_to_arf & ~0x3));
-
-      if (cm->copy_buffer_to_arf == 1) {
-        if (cm->alt_fb_idx != cm->lst_fb_idx) {
-          yv12_fb[cm->lst_fb_idx].flags |= VP9_ALT_FLAG;
-          yv12_fb[cm->alt_fb_idx].flags &= ~VP9_ALT_FLAG;
-          cm->alt_fb_idx = cm->lst_fb_idx;
-        }
-      } else { /* if (cm->copy_buffer_to_arf == 2) */
-        if (cm->alt_fb_idx != cm->gld_fb_idx) {
-          yv12_fb[cm->gld_fb_idx].flags |= VP9_ALT_FLAG;
-          yv12_fb[cm->alt_fb_idx].flags &= ~VP9_ALT_FLAG;
-          cm->alt_fb_idx = cm->gld_fb_idx;
-        }
-      }
-    }
-
-    if (cm->refresh_golden_frame) {
-      assert(!cm->copy_buffer_to_gf);
-
-      cm->yv12_fb[cm->new_fb_idx].flags |= VP9_GOLD_FLAG;
-      cm->yv12_fb[cm->gld_fb_idx].flags &= ~VP9_GOLD_FLAG;
-      cm->gld_fb_idx = cm->new_fb_idx;
-    } else if (cm->copy_buffer_to_gf) {
-      assert(!(cm->copy_buffer_to_gf & ~0x3));
-
-      if (cm->copy_buffer_to_gf == 1) {
-        if (cm->gld_fb_idx != cm->lst_fb_idx) {
-          yv12_fb[cm->lst_fb_idx].flags |= VP9_GOLD_FLAG;
-          yv12_fb[cm->gld_fb_idx].flags &= ~VP9_GOLD_FLAG;
-          cm->gld_fb_idx = cm->lst_fb_idx;
-        }
-      } else { /* if (cm->copy_buffer_to_gf == 2) */
-        if (cm->alt_fb_idx != cm->gld_fb_idx) {
-          yv12_fb[cm->alt_fb_idx].flags |= VP9_GOLD_FLAG;
-          yv12_fb[cm->gld_fb_idx].flags &= ~VP9_GOLD_FLAG;
-          cm->gld_fb_idx = cm->alt_fb_idx;
-        }
-      }
-    }
-  }
-
-  if (cm->refresh_last_frame) {
-    cm->yv12_fb[cm->new_fb_idx].flags |= VP9_LAST_FLAG;
-    cm->yv12_fb[cm->lst_fb_idx].flags &= ~VP9_LAST_FLAG;
-    cm->lst_fb_idx = cm->new_fb_idx;
-  }
-}
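-// Note the bookkeeping above only swaps indices and per-buffer flags;
-// e.g. on a key frame new_fb_idx picks up VP9_GOLD_FLAG | VP9_ALT_FLAG
-// and both gld_fb_idx and alt_fb_idx are pointed at it, with no frame
-// data copied.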
-
-static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
-  if (cm->no_lpf) {
-    cm->filter_level = 0;
-  }
-#if CONFIG_LOSSLESS
-  else if (cpi->oxcf.lossless) {
-    cm->filter_level = 0;
-  }
-#endif
-  else {
-    struct vpx_usec_timer timer;
-
-    vp9_clear_system_state();
-
-    vpx_usec_timer_start(&timer);
-    if (cpi->sf.auto_filter == 0)
-      vp9_pick_filter_level_fast(cpi->Source, cpi);
-    else
-      vp9_pick_filter_level(cpi->Source, cpi);
-
-    vpx_usec_timer_mark(&timer);
-    cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer);
-  }
-
-  if (cm->filter_level > 0) {
-    vp9_set_alt_lf_level(cpi, cm->filter_level);
-    vp9_loop_filter_frame(cm, &cpi->mb.e_mbd);
-  }
-
-  vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);
-}
-
-#if CONFIG_PRED_FILTER
-void select_pred_filter_mode(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-
-  int prob_pred_filter_off = cm->prob_pred_filter_off;
-
-  // Force filter on/off if probability is extreme
-  if (prob_pred_filter_off >= 255 * 0.95)
-    cm->pred_filter_mode = 0;   // Off at the frame level
-  else if (prob_pred_filter_off <= 255 * 0.05)
-    cm->pred_filter_mode = 1;   // On at the frame level
-  else
-    cm->pred_filter_mode = 2;   // Selectable at the MB level
-}
-
-void update_pred_filt_prob(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-  int prob_pred_filter_off;
-
-  // Based on the selection in the previous frame determine what mode
-  // to use for the current frame and work out the signaling probability
-  if (cpi->pred_filter_on_count + cpi->pred_filter_off_count) {
-    prob_pred_filter_off = cpi->pred_filter_off_count * 256 /
-                           (cpi->pred_filter_on_count + cpi->pred_filter_off_count);
-
-    if (prob_pred_filter_off < 1)
-      prob_pred_filter_off = 1;
-
-    if (prob_pred_filter_off > 255)
-      prob_pred_filter_off = 255;
-
-    cm->prob_pred_filter_off = prob_pred_filter_off;
-  } else
-    cm->prob_pred_filter_off = 128;
-  /*
-      {
-        FILE *fp = fopen("filt_use.txt", "a");
-        fprintf (fp, "%d %d prob=%d\n", cpi->pred_filter_off_count,
-                 cpi->pred_filter_on_count, cm->prob_pred_filter_off);
-        fclose(fp);
-      }
-  */
-}
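-// E.g. (illustrative) 30 "off" and 10 "on" selections give
-// 30 * 256 / 40 = 192, clamped to [1, 255]; with no history the
-// probability falls back to a neutral 128.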
-#endif
-
-static void encode_frame_to_data_rate(VP9_COMP *cpi,
-                                      unsigned long *size,
-                                      unsigned char *dest,
-                                      unsigned int *frame_flags) {
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &cpi->mb.e_mbd;
-
-  int Q;
-  int frame_over_shoot_limit;
-  int frame_under_shoot_limit;
-
-  int Loop = FALSE;
-  int loop_count;
-  int this_q;
-  int last_zbin_oq;
-
-  int q_low;
-  int q_high;
-  int zbin_oq_high;
-  int zbin_oq_low = 0;
-
-  int top_index;
-  int bottom_index;
-  int active_worst_qchanged = FALSE;
-
-  int overshoot_seen = FALSE;
-  int undershoot_seen = FALSE;
-
-  int loop_size_estimate = 0;
-
-  SPEED_FEATURES *sf = &cpi->sf;
-#if RESET_FOREACH_FILTER
-  int q_low0;
-  int q_high0;
-  int zbin_oq_high0;
-  int zbin_oq_low0 = 0;
-  int Q0;
-  int last_zbin_oq0;
-  int active_best_quality0;
-  int active_worst_quality0;
-  double rate_correction_factor0;
-  double gf_rate_correction_factor0;
-#endif
-
-  /* list of filters to search over */
-  int mcomp_filters_to_search[] = {
-    EIGHTTAP, EIGHTTAP_SHARP, SIXTAP, SWITCHABLE
-  };
-  int mcomp_filters = sizeof(mcomp_filters_to_search) /
-      sizeof(*mcomp_filters_to_search);
-  int mcomp_filter_index = 0;
-  INT64 mcomp_filter_cost[4];
-
-  // Clear down mmx registers to allow floating point in what follows
-  vp9_clear_system_state();
-
-  // For an alt ref frame in 2 pass we skip the call to the second
-  // pass function that sets the target bandwidth so must set it here
-  if (cpi->common.refresh_alt_ref_frame) {
-    // Per frame bit target for the alt ref frame
-    cpi->per_frame_bandwidth = cpi->twopass.gf_bits;
-    // Per second target bitrate
-    cpi->target_bandwidth = cpi->twopass.gf_bits * cpi->output_frame_rate;
-  }
-
-  // Default turn off buffer to buffer copying
-  cm->copy_buffer_to_gf = 0;
-  cm->copy_buffer_to_arf = 0;
-
-  // Clear zbin over-quant value and mode boost values.
-  cpi->zbin_over_quant = 0;
-  cpi->zbin_mode_boost = 0;
-
-  // Enable or disable mode based tweaking of the zbin.
-  // For 2 pass this is only used where GF/ARF prediction quality
-  // is above a threshold.
-  cpi->zbin_mode_boost = 0;
-#if CONFIG_LOSSLESS
-  cpi->zbin_mode_boost_enabled = FALSE;
-#else
-  cpi->zbin_mode_boost_enabled = TRUE;
-#endif
-  if (cpi->gfu_boost <= 400) {
-    cpi->zbin_mode_boost_enabled = FALSE;
-  }
-
-  // Current default encoder behaviour for the altref sign bias
-  if (cpi->source_alt_ref_active)
-    cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 1;
-  else
-    cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 0;
-
-  // Check to see if a key frame is signalled
-  // For two pass with auto key frame enabled, cm->frame_type may already
-  // be set, but not for one pass.
-  if ((cm->current_video_frame == 0) ||
-      (cm->frame_flags & FRAMEFLAGS_KEY) ||
-      (cpi->oxcf.auto_key && (cpi->frames_since_key % cpi->key_frame_frequency == 0))) {
-    // Key frame from VFW/auto-keyframe/first frame
-    cm->frame_type = KEY_FRAME;
-  }
-
-  // Set default state for segment based loop filter update flags
-  xd->mode_ref_lf_delta_update = 0;
-
-  // Set various flags etc to special state if it is a key frame
-  if (cm->frame_type == KEY_FRAME) {
-    int i;
-
-    // Reset the loop filter deltas and segmentation map
-    setup_features(cpi);
-
-    // If segmentation is enabled force a map update for key frames
-    if (xd->segmentation_enabled) {
-      xd->update_mb_segmentation_map = 1;
-      xd->update_mb_segmentation_data = 1;
-    }
-
-    // The alternate reference frame cannot be active for a key frame
-    cpi->source_alt_ref_active = FALSE;
-
-    // Reset the RD threshold multipliers to default of * 1 (128)
-    for (i = 0; i < MAX_MODES; i++) {
-      cpi->rd_thresh_mult[i] = 128;
-    }
-  }
-
-  // Test code for new segment features
-  init_seg_features(cpi);
-
-  // Decide how big to make the frame
-  vp9_pick_frame_size(cpi);
-
-  vp9_clear_system_state();
-
-  // Set an active best quality and if necessary active worst quality
-  Q = cpi->active_worst_quality;
-
-  if (cm->frame_type == KEY_FRAME) {
-    int high = 2000;
-    int low = 400;
-
-    if (cpi->kf_boost > high)
-      cpi->active_best_quality = kf_low_motion_minq[Q];
-    else if (cpi->kf_boost < low)
-      cpi->active_best_quality = kf_high_motion_minq[Q];
-    else {
-      int gap = high - low;
-      int offset = high - cpi->kf_boost;
-      int qdiff = kf_high_motion_minq[Q] - kf_low_motion_minq[Q];
-      int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
-
-      cpi->active_best_quality = kf_low_motion_minq[Q] + adjustment;
-    }
-
-    // Make an adjustment based on the % of the image that is static.
-    // The main impact of this is at lower Q to prevent overly large key
-    // frames unless a lot of the image is static.
-    if (cpi->kf_zeromotion_pct < 64)
-      cpi->active_best_quality += 4 - (cpi->kf_zeromotion_pct >> 4);
-
-    // Special case for key frames forced because we have reached
-    // the maximum key frame interval. Here force the Q to a range
-    // based on the ambient Q to reduce the risk of popping
-    if (cpi->this_key_frame_forced) {
-      int delta_qindex;
-      int qindex = cpi->last_boosted_qindex;
-
-      delta_qindex = compute_qdelta(cpi, qindex,
-                                    (qindex * 0.75));
-
-      cpi->active_best_quality = qindex + delta_qindex;
-      if (cpi->active_best_quality < cpi->best_quality)
-        cpi->active_best_quality = cpi->best_quality;
-    }
-  } else if (cm->refresh_golden_frame || cpi->common.refresh_alt_ref_frame) {
-    int high = 2000;
-    int low = 400;
-
-    // Use the lower of cpi->active_worst_quality and recent
-    // average Q as basis for GF/ARF Q limit unless last frame was
-    // a key frame.
-    if ((cpi->frames_since_key > 1) &&
-        (cpi->avg_frame_qindex < cpi->active_worst_quality)) {
-      Q = cpi->avg_frame_qindex;
-    }
-
-    // For constrained quality don't allow Q less than the cq level
-    if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
-        (Q < cpi->cq_target_quality)) {
-      Q = cpi->cq_target_quality;
-    }
-
-    if (cpi->gfu_boost > high)
-      cpi->active_best_quality = gf_low_motion_minq[Q];
-    else if (cpi->gfu_boost < low)
-      cpi->active_best_quality = gf_high_motion_minq[Q];
-    else {
-      int gap = high - low;
-      int offset = high - cpi->gfu_boost;
-      int qdiff = gf_high_motion_minq[Q] - gf_low_motion_minq[Q];
-      int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
-
-      cpi->active_best_quality = gf_low_motion_minq[Q] + adjustment;
-    }
-
-    // Constrained quality uses a slightly lower active best.
-    if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
-      cpi->active_best_quality =
-        cpi->active_best_quality * 15 / 16;
-    }
-  } else {
-    cpi->active_best_quality = inter_minq[Q];
-
-    // For the constant/constrained quality mode we don't want
-    // q to fall below the cq level.
-    if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
-        (cpi->active_best_quality < cpi->cq_target_quality)) {
-      // If we are strongly undershooting the target rate in the last
-      // frames then use the user passed in cq value not the auto
-      // cq value.
-      if (cpi->rolling_actual_bits < cpi->min_frame_bandwidth)
-        cpi->active_best_quality = cpi->oxcf.cq_level;
-      else
-        cpi->active_best_quality = cpi->cq_target_quality;
-    }
-  }
-
-  // Clip the active best and worst quality values to limits
-  if (cpi->active_worst_quality > cpi->worst_quality)
-    cpi->active_worst_quality = cpi->worst_quality;
-
-  if (cpi->active_best_quality < cpi->best_quality)
-    cpi->active_best_quality = cpi->best_quality;
-
-  if (cpi->active_best_quality > cpi->worst_quality)
-    cpi->active_best_quality = cpi->worst_quality;
-
-  if (cpi->active_worst_quality < cpi->active_best_quality)
-    cpi->active_worst_quality = cpi->active_best_quality;
-
-  // Special case code to try and match quality with forced key frames
-  if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
-    Q = cpi->last_boosted_qindex;
-  } else {
-    // Determine initial Q to try
-    Q = vp9_regulate_q(cpi, cpi->this_frame_target);
-  }
-  last_zbin_oq = cpi->zbin_over_quant;
-
-  // Set highest allowed value for Zbin over quant
-  if (cm->frame_type == KEY_FRAME)
-    zbin_oq_high = 0; // ZBIN_OQ_MAX/16
-  else if (cm->refresh_alt_ref_frame || (cm->refresh_golden_frame && !cpi->source_alt_ref_active))
-    zbin_oq_high = 16;
-  else
-    zbin_oq_high = ZBIN_OQ_MAX;
-
-  vp9_compute_frame_size_bounds(cpi, &frame_under_shoot_limit,
-                                &frame_over_shoot_limit);
-
-  // Limit Q range for the adaptive loop.
-  bottom_index = cpi->active_best_quality;
-  top_index    = cpi->active_worst_quality;
-  q_low  = cpi->active_best_quality;
-  q_high = cpi->active_worst_quality;
-
-  loop_count = 0;
-
-  if (cm->frame_type != KEY_FRAME) {
-    /* TODO: Decide this more intelligently */
-    if (sf->search_best_filter) {
-      cm->mcomp_filter_type = mcomp_filters_to_search[0];
-      mcomp_filter_index = 0;
-    } else {
-      cm->mcomp_filter_type = DEFAULT_INTERP_FILTER;
-    }
-    /* TODO: Decide this more intelligently */
-    xd->allow_high_precision_mv = (Q < HIGH_PRECISION_MV_QTHRESH);
-  }
-
-#if CONFIG_POSTPROC
-
-  if (cpi->oxcf.noise_sensitivity > 0) {
-    unsigned char *src;
-    int l = 0;
-
-    switch (cpi->oxcf.noise_sensitivity) {
-      case 1:
-        l = 20;
-        break;
-      case 2:
-        l = 40;
-        break;
-      case 3:
-        l = 60;
-        break;
-      case 4:  // fall through: 4 and 5 share the same level
-      case 5:
-        l = 100;
-        break;
-      case 6:
-        l = 150;
-        break;
-    }
-
-    if (cm->frame_type == KEY_FRAME) {
-      vp9_de_noise(cpi->Source, cpi->Source, l, 1,  0, RTCD(postproc));
-    } else {
-      vp9_de_noise(cpi->Source, cpi->Source, l, 1,  0, RTCD(postproc));
-
-      src = cpi->Source->y_buffer;
-
-      if (cpi->Source->y_stride < 0) {
-        src += cpi->Source->y_stride * (cpi->Source->y_height - 1);
-      }
-    }
-  }
-
-#endif
-
-#ifdef OUTPUT_YUV_SRC
-  vp9_write_yuv_frame(cpi->Source);
-#endif
-
-#if RESET_FOREACH_FILTER
-  if (sf->search_best_filter) {
-    q_low0 = q_low;
-    q_high0 = q_high;
-    Q0 = Q;
-    zbin_oq_low0 = zbin_oq_low;
-    zbin_oq_high0 = zbin_oq_high;
-    last_zbin_oq0 = last_zbin_oq;
-    rate_correction_factor0 = cpi->rate_correction_factor;
-    gf_rate_correction_factor0 = cpi->gf_rate_correction_factor;
-    active_best_quality0 = cpi->active_best_quality;
-    active_worst_quality0 = cpi->active_worst_quality;
-  }
-#endif
-  do {
-    vp9_clear_system_state();  // __asm emms;
-
-    vp9_set_quantizer(cpi, Q);
-    this_q = Q;
-
-    if (loop_count == 0) {
-
-      // setup skip prob for costing in mode/mv decision
-      if (cpi->common.mb_no_coeff_skip) {
-        int k;
-        for (k = 0; k < MBSKIP_CONTEXTS; k++)
-          cm->mbskip_pred_probs[k] = cpi->base_skip_false_prob[Q][k];
-
-        if (cm->frame_type != KEY_FRAME) {
-          if (cpi->common.refresh_alt_ref_frame) {
-            for (k = 0; k < MBSKIP_CONTEXTS; k++) {
-              if (cpi->last_skip_false_probs[2][k] != 0)
-                cm->mbskip_pred_probs[k] = cpi->last_skip_false_probs[2][k];
-            }
-          } else if (cpi->common.refresh_golden_frame) {
-            for (k = 0; k < MBSKIP_CONTEXTS; k++) {
-              if (cpi->last_skip_false_probs[1][k] != 0)
-                cm->mbskip_pred_probs[k] = cpi->last_skip_false_probs[1][k];
-            }
-          } else {
-            int k;
-            for (k = 0; k < MBSKIP_CONTEXTS; k++) {
-              if (cpi->last_skip_false_probs[0][k] != 0)
-                cm->mbskip_pred_probs[k] = cpi->last_skip_false_probs[0][k];
-            }
-          }
-
-          // As this is for the cost estimate, make sure it does not
-          // get extreme either way.
-          {
-            int k;
-            for (k = 0; k < MBSKIP_CONTEXTS; ++k) {
-              if (cm->mbskip_pred_probs[k] < 5)
-                cm->mbskip_pred_probs[k] = 5;
-
-              if (cm->mbskip_pred_probs[k] > 250)
-                cm->mbskip_pred_probs[k] = 250;
-
-              if (cpi->is_src_frame_alt_ref)
-                cm->mbskip_pred_probs[k] = 1;
-            }
-          }
-        }
-      }
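-      // [Editor's note] The clamp of mbskip_pred_probs to [5, 250]
-      // above keeps the skip-flag cost model away from near-certain
-      // probabilities: an 8-bit probability of 1 or 255 would make the
-      // estimated cost of the unlikely branch enormous, skewing mode
-      // decisions off a single stale count. The alt-ref-source special
-      // case pins the not-skipped probability to 1, presumably because
-      // almost every MB is expected to be skipped for such frames.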
-
-      // Set up entropy depending on frame type.
-      if (cm->frame_type == KEY_FRAME)
-        vp9_setup_key_frame(cpi);
-      else
-        vp9_setup_inter_frame(cpi);
-    }
-
-    // Transform / motion compensation: build the reconstruction frame.
-
-    vp9_encode_frame(cpi);
-
-    // Update the skip mb flag probabilities based on the distribution
-    // seen in the last encoder iteration.
-    update_base_skip_probs(cpi);
-
-    vp9_clear_system_state();  // __asm emms;
-
-#if CONFIG_PRED_FILTER
-    // Update prediction filter on/off probability based on
-    // selection made for the current frame
-    if (cm->frame_type != KEY_FRAME)
-      update_pred_filt_prob(cpi);
-#endif
-
-    // Dummy pack of the bitstream using up to date stats to get an
-    // accurate estimate of output frame size to determine if we need
-    // to recode.
-    vp9_save_coding_context(cpi);
-    cpi->dummy_packing = 1;
-    vp9_pack_bitstream(cpi, dest, size);
-    cpi->projected_frame_size = (*size) << 3;
-    vp9_restore_coding_context(cpi);
-
-    if (frame_over_shoot_limit == 0)
-      frame_over_shoot_limit = 1;
-    active_worst_qchanged = FALSE;
-
-    // Special case handling for forced key frames
-    if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
-      int last_q = Q;
-      int kf_err = vp9_calc_ss_err(cpi->Source,
-                                   &cm->yv12_fb[cm->new_fb_idx]);
-
-      int high_err_target = cpi->ambient_err;
-      int low_err_target = (cpi->ambient_err >> 1);
-
-      // Prevent possible divide by zero error below for perfect KF
-      kf_err += (!kf_err);
-
-      // The key frame is not good enough or we can afford
-      // to make it better without undue risk of popping.
-      if (((kf_err > high_err_target) &&
-           (cpi->projected_frame_size <= frame_over_shoot_limit)) ||
-          ((kf_err > low_err_target) &&
-           (cpi->projected_frame_size <= frame_under_shoot_limit))) {
-        // Lower q_high
-        q_high = (Q > q_low) ? (Q - 1) : q_low;
-
-        // Adjust Q
-        Q = (Q * high_err_target) / kf_err;
-        if (Q < ((q_high + q_low) >> 1))
-          Q = (q_high + q_low) >> 1;
-      }
-      // The key frame is much better than the previous frame
-      else if ((kf_err < low_err_target) &&
-               (cpi->projected_frame_size >= frame_under_shoot_limit)) {
-        // Raise q_low
-        q_low = (Q < q_high) ? (Q + 1) : q_high;
-
-        // Adjust Q
-        Q = (Q * low_err_target) / kf_err;
-        if (Q > ((q_high + q_low + 1) >> 1))
-          Q = (q_high + q_low + 1) >> 1;
-      }
-
-      // Clamp Q to upper and lower limits:
-      if (Q > q_high)
-        Q = q_high;
-      else if (Q < q_low)
-        Q = q_low;
-
-      Loop = (Q != last_q) ? TRUE : FALSE;
-    }
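-    // [Editor's note] Worked example of the forced-key-frame search
-    // above (editor's illustration, not code from the tree): with
-    // high_err_target = 4000, low_err_target = 2000, Q = 40,
-    // q_low = 20, q_high = 60, and a measured kf_err = 8000 on a frame
-    // inside the overshoot limit, q_high drops to 39, the scaled guess
-    // is Q = 40 * 4000 / 8000 = 20, and since 20 < (39 + 20) >> 1 = 29
-    // Q is raised to 29. The proportional step is thus bounded by an
-    // interval-halving step, so the loop converges even when kf_err is
-    // noisy.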
-
-    // Is the projected frame size out of range, and are we allowed to
-    // attempt a recode?
-    else if (recode_loop_test(cpi,
-                              frame_over_shoot_limit, frame_under_shoot_limit,
-                              Q, top_index, bottom_index)) {
-      int last_q = Q;
-      int Retries = 0;
-
-      // Frame size out of permitted range:
-      // Update correction factor & compute new Q to try...
-
-      // Frame is too large
-      if (cpi->projected_frame_size > cpi->this_frame_target) {
-        // Raise q_low to just above the current Q.
-        q_low = (Q < q_high) ? (Q + 1) : q_high;
-
-        // If we are using over quant, do the same for zbin_oq_low.
-        if (cpi->zbin_over_quant > 0)
-          zbin_oq_low = (cpi->zbin_over_quant < zbin_oq_high) ?
-              (cpi->zbin_over_quant + 1) : zbin_oq_high;
-
-        if (undershoot_seen || (loop_count > 1)) {
-          // Update rate_correction_factor unless cpi->active_worst_quality has changed.
-          if (!active_worst_qchanged)
-            vp9_update_rate_correction_factors(cpi, 1);
-
-          Q = (q_high + q_low + 1) / 2;
-
-          // Adjust cpi->zbin_over_quant (only allowed when Q is max)
-          if (Q < MAXQ)
-            cpi->zbin_over_quant = 0;
-          else {
-            zbin_oq_low = (cpi->zbin_over_quant < zbin_oq_high) ? (cpi->zbin_over_quant + 1) : zbin_oq_high;
-            cpi->zbin_over_quant = (zbin_oq_high + zbin_oq_low) / 2;
-          }
-        } else {
-          // Update rate_correction_factor unless cpi->active_worst_quality has changed.
-          if (!active_worst_qchanged)
-            vp9_update_rate_correction_factors(cpi, 0);
-
-          Q = vp9_regulate_q(cpi, cpi->this_frame_target);
-
-          while (((Q < q_low) || (cpi->zbin_over_quant < zbin_oq_low)) && (Retries < 10)) {
-            vp9_update_rate_correction_factors(cpi, 0);
-            Q = vp9_regulate_q(cpi, cpi->this_frame_target);
-            Retries++;
-          }
-        }
-
-        overshoot_seen = TRUE;
-      }
-      // Frame is too small
-      else {
-        // Lower q_high if not using over quant; otherwise lower zbin_oq_high.
-        if (cpi->zbin_over_quant == 0)
-          q_high = (Q > q_low) ? (Q - 1) : q_low;
-        else
-          zbin_oq_high = (cpi->zbin_over_quant > zbin_oq_low) ?
-              (cpi->zbin_over_quant - 1) : zbin_oq_low;
-
-        if (overshoot_seen || (loop_count > 1)) {
-          // Update rate_correction_factor unless cpi->active_worst_quality has changed.
-          if (!active_worst_qchanged)
-            vp9_update_rate_correction_factors(cpi, 1);
-
-          Q = (q_high + q_low) / 2;
-
-          // Adjust cpi->zbin_over_quant (only allowed when Q is max)
-          if (Q < MAXQ)
-            cpi->zbin_over_quant = 0;
-          else
-            cpi->zbin_over_quant = (zbin_oq_high + zbin_oq_low) / 2;
-        } else {
-          // Update rate_correction_factor unless cpi->active_worst_quality has changed.
-          if (!active_worst_qchanged)
-            vp9_update_rate_correction_factors(cpi, 0);
-
-          Q = vp9_regulate_q(cpi, cpi->this_frame_target);
-
-          // Special case reset for q_low for constrained quality.
-          // This should only trigger where there is very substantial
-          // undershoot on a frame and the auto cq level is above
-          // the user-passed-in value.
-          if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
-              (Q < q_low)) {
-            q_low = Q;
-          }
-
-          while (((Q > q_high) || (cpi->zbin_over_quant > zbin_oq_high)) && (Retries < 10)) {
-            vp9_update_rate_correction_factors(cpi, 0);
-            Q = vp9_regulate_q(cpi, cpi->this_frame_target);
-            Retries++;
-          }
-        }
-
-        undershoot_seen = TRUE;
-      }
-
-      // Clamp Q to upper and lower limits:
-      if (Q > q_high)
-        Q = q_high;
-      else if (Q < q_low)
-        Q = q_low;
-
-      // Clamp cpi->zbin_over_quant
-      cpi->zbin_over_quant = (cpi->zbin_over_quant < zbin_oq_low) ?
-          zbin_oq_low : (cpi->zbin_over_quant > zbin_oq_high) ?
-          zbin_oq_high : cpi->zbin_over_quant;
-
-      // Loop = ((Q != last_q) || (last_zbin_oq != cpi->zbin_over_quant)) ? TRUE : FALSE;
-      Loop = (Q != last_q) ? TRUE : FALSE;
-      last_zbin_oq = cpi->zbin_over_quant;
-    } else
-      Loop = FALSE;
-
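-    // [Editor's note] The overshoot/undershoot handling above is in
-    // effect a guarded binary search on Q: the first miss in each
-    // direction re-derives Q from the rate-correction model via
-    // vp9_regulate_q(), and only once both directions (or a second
-    // pass) have been seen does it fall back to plain interval
-    // halving. A skeleton sketch with hypothetical names (editor's
-    // illustration only):
-    //
-    //   if (too_big) {
-    //     q_low = Q + 1;                              // bits must drop
-    //     Q = undershoot_seen ? (q_high + q_low + 1) / 2
-    //                         : model_q(target);      // re-model
-    //     overshoot_seen = 1;
-    //   } else {
-    //     q_high = Q - 1;                             // bits may rise
-    //     Q = overshoot_seen ? (q_high + q_low) / 2
-    //                        : model_q(target);
-    //     undershoot_seen = 1;
-    //   }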
-    if (cpi->is_src_frame_alt_ref)
-      Loop = FALSE;
-
-    if (cm->frame_type != KEY_FRAME &&
-        !sf->search_best_filter &&
-        cm->mcomp_filter_type == SWITCHABLE) {
-      int interp_factor = Q / 3;  /* denominator is 256 */
-      int count[VP9_SWITCHABLE_FILTERS];
-      int tot_count = 0, c = 0, thr;
-      int i, j;
-      for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
-        count[i] = 0;
-        for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) {
-          count[i] += cpi->switchable_interp_count[j][i];
-        }
-        tot_count += count[i];
-      }
-
-      thr = ((tot_count * interp_factor + 128) >> 8);
-      for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
-        c += (count[i] >= thr);
-      }
-      if (c == 1) {
-        /* Predominantly one filter is used, so set it at the frame level */
-        for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
-          if (count[i]) {
-            cm->mcomp_filter_type = vp9_switchable_interp[i];
-            Loop = TRUE;  /* Make sure to loop since the filter changed */
-            break;
-          }
-        }
-      }
-    }
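-    // [Editor's note] thr above is a fixed-point fraction of total
-    // filter usage: interp_factor = Q / 3 out of a denominator of 256,
-    // so at Q = 96 a filter must account for at least 32/256 = 12.5%
-    // of all selections to clear the bar. If exactly one filter does,
-    // the frame is re-encoded with that filter fixed at the frame
-    // level, which is cheaper to signal than per-MB switching.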
-
-    if (Loop == FALSE && cm->frame_type != KEY_FRAME && sf->search_best_filter) {
-      if (mcomp_filter_index < mcomp_filters) {
-        INT64 err = vp9_calc_ss_err(cpi->Source,
-                                    &cm->yv12_fb[cm->new_fb_idx]);
-        INT64 rate = cpi->projected_frame_size << 8;
-        mcomp_filter_cost[mcomp_filter_index] =
-          (RDCOST(cpi->RDMULT, cpi->RDDIV, rate, err));
-        mcomp_filter_index++;
-        if (mcomp_filter_index < mcomp_filters) {
-          cm->mcomp_filter_type = mcomp_filters_to_search[mcomp_filter_index];
-          loop_count = -1;
-          Loop = TRUE;
-        } else {
-          int f;
-          INT64 best_cost = mcomp_filter_cost[0];
-          int mcomp_best_filter = mcomp_filters_to_search[0];
-          for (f = 1; f < mcomp_filters; f++) {
-            if (mcomp_filter_cost[f] < best_cost) {
-              mcomp_best_filter = mcomp_filters_to_search[f];
-              best_cost = mcomp_filter_cost[f];
-            }
-          }
-          if (mcomp_best_filter != mcomp_filters_to_search[mcomp_filters - 1]) {
-            loop_count = -1;
-            Loop = TRUE;
-            cm->mcomp_filter_type = mcomp_best_filter;
-          }
-          /*
-          printf("  best filter = %d, ( ", mcomp_best_filter);
-          for (f=0;f<mcomp_filters; f++) printf("%d ",  mcomp_filter_cost[f]);
-          printf(")\n");
-          */
-        }
-#if RESET_FOREACH_FILTER
-        if (Loop == TRUE) {
-          overshoot_seen = FALSE;
-          undershoot_seen = FALSE;
-          zbin_oq_low = zbin_oq_low0;
-          zbin_oq_high = zbin_oq_high0;
-          q_low = q_low0;
-          q_high = q_high0;
-          Q = Q0;
-          cpi->zbin_over_quant = last_zbin_oq = last_zbin_oq0;
-          cpi->rate_correction_factor = rate_correction_factor0;
-          cpi->gf_rate_correction_factor = gf_rate_correction_factor0;
-          cpi->active_best_quality = active_best_quality0;
-          cpi->active_worst_quality = active_worst_quality0;
-        }
-#endif
-      }
-    }
-
-    if (Loop == TRUE) {
-      loop_count++;
-#if CONFIG_INTERNAL_STATS
-      cpi->tot_recode_hits++;
-#endif
-    }
-  } while (Loop == TRUE);
-
-  // Special case code to reduce pulsing when key frames are forced at a
-  // fixed interval. Note the reconstruction error if it is the frame before
-  // the forced key frame.
-  if (cpi->next_key_frame_forced && (cpi->twopass.frames_to_key == 0)) {
-    cpi->ambient_err = vp9_calc_ss_err(cpi->Source,
-                                       &cm->yv12_fb[cm->new_fb_idx]);
-  }
-
-  // This frame's MVs are saved and will be used in the next frame's MV
-  // prediction. Last frame has one more line (added to the bottom) and one
-  // more column (added to the right) than cm->mip. The edge elements are
-  // initialized to 0.
-  if (cm->show_frame) { // do not save for altref frame
-    int mb_row;
-    int mb_col;
-    MODE_INFO *tmp = cm->mip;
-
-    if (cm->frame_type != KEY_FRAME) {
-      for (mb_row = 0; mb_row < cm->mb_rows + 1; mb_row++) {
-        for (mb_col = 0; mb_col < cm->mb_cols + 1; mb_col++) {
-          const int mi = mb_col + mb_row * (cm->mode_info_stride + 1);
-
-          if (tmp->mbmi.ref_frame != INTRA_FRAME)
-            cpi->lfmv[mi].as_int = tmp->mbmi.mv[0].as_int;
-
-          cpi->lf_ref_frame_sign_bias[mi] =
-              cm->ref_frame_sign_bias[tmp->mbmi.ref_frame];
-          cpi->lf_ref_frame[mi] = tmp->mbmi.ref_frame;
-          tmp++;
-        }
-      }
-    }
-  }
-
-  // Update the GF usage maps.
-  // This is done after completing the compression of a frame when all modes
-  // etc. are finalized, but before the loop filter.
-  vp9_update_gf_useage_maps(cpi, cm, &cpi->mb);
-
-  if (cm->frame_type == KEY_FRAME)
-    cm->refresh_last_frame = 1;
-
-#if 0
-  {
-    FILE *f = fopen("gfactive.stt", "a");
-    fprintf(f, "%8d %8d %8d %8d %8d\n", cm->current_video_frame,
-            (100 * cpi->gf_active_count) /
-                (cpi->common.mb_rows * cpi->common.mb_cols),
-            cpi->this_iiratio, cpi->next_iiratio, cm->refresh_golden_frame);
-    fclose(f);
-  }
-#endif
-
-  cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];
-
-#if WRITE_RECON_BUFFER
-  if (cm->show_frame)
-    write_cx_frame_to_file(cm->frame_to_show,
-                           cm->current_video_frame);
-  else
-    write_cx_frame_to_file(cm->frame_to_show,
-                           cm->current_video_frame + 1000);
-#endif
-
-  // Pick the loop filter level for the frame.
-  loopfilter_frame(cpi, cm);
-
-  // build the bitstream
-  cpi->dummy_packing = 0;
-  vp9_pack_bitstream(cpi, dest, size);
-
-  if (cpi->mb.e_mbd.update_mb_segmentation_map) {
-    update_reference_segmentation_map(cpi);
-  }
-
-#if CONFIG_PRED_FILTER
-  // Select the prediction filtering mode to use for the
-  // next frame based on the current frame selections
-  if (cm->frame_type != KEY_FRAME)
-    select_pred_filter_mode(cpi);
-#endif
-
-  update_reference_frames(cm);
-  vp9_copy(cpi->common.fc.coef_counts, cpi->coef_counts);
-  vp9_copy(cpi->common.fc.hybrid_coef_counts, cpi->hybrid_coef_counts);
-  vp9_copy(cpi->common.fc.coef_counts_8x8, cpi->coef_counts_8x8);
-  vp9_copy(cpi->common.fc.hybrid_coef_counts_8x8, cpi->hybrid_coef_counts_8x8);
-  vp9_copy(cpi->common.fc.coef_counts_16x16, cpi->coef_counts_16x16);
-  vp9_copy(cpi->common.fc.hybrid_coef_counts_16x16,
-           cpi->hybrid_coef_counts_16x16);
-  vp9_adapt_coef_probs(&cpi->common);
-  if (cpi->common.frame_type != KEY_FRAME) {
-    vp9_copy(cpi->common.fc.ymode_counts, cpi->ymode_count);
-    vp9_copy(cpi->common.fc.uv_mode_counts, cpi->y_uv_mode_count);
-    vp9_copy(cpi->common.fc.bmode_counts, cpi->bmode_count);
-    vp9_copy(cpi->common.fc.i8x8_mode_counts, cpi->i8x8_mode_count);
-    vp9_copy(cpi->common.fc.sub_mv_ref_counts, cpi->sub_mv_ref_count);
-    vp9_copy(cpi->common.fc.mbsplit_counts, cpi->mbsplit_count);
-    vp9_adapt_mode_probs(&cpi->common);
-
-    cpi->common.fc.NMVcount = cpi->NMVcount;
-    vp9_adapt_nmv_probs(&cpi->common, cpi->mb.e_mbd.allow_high_precision_mv);
-    vp9_update_mode_context(&cpi->common);
-  }
-
-  /* Move storing frame_type out of the above loop since it is also
-   * needed in motion search besides loopfilter */
-  cm->last_frame_type = cm->frame_type;
-
-  // Keep a copy of the size estimate used in the loop
-  loop_size_estimate = cpi->projected_frame_size;
-
-  // Update rate control heuristics
-  cpi->total_byte_count += (*size);
-  cpi->projected_frame_size = (*size) << 3;
-
-  if (!active_worst_qchanged)
-    vp9_update_rate_correction_factors(cpi, 2);
-
-  cpi->last_q[cm->frame_type] = cm->base_qindex;
-
-  // Keep a record of the last boosted (KF/GF/ARF) Q value.
-  // If the current frame is coded at a lower Q we also update it.
-  // If all MBs in this group are skipped, only update if the Q value is
-  // better than that already stored.
-  // This is used to help set quality in forced key frames to reduce popping.
-  if ((cm->base_qindex < cpi->last_boosted_qindex) ||
-      ((cpi->static_mb_pct < 100) &&
-       ((cm->frame_type == KEY_FRAME) ||
-        cm->refresh_alt_ref_frame ||
-        (cm->refresh_golden_frame && !cpi->is_src_frame_alt_ref)))) {
-    cpi->last_boosted_qindex = cm->base_qindex;
-  }
-
-  if (cm->frame_type == KEY_FRAME) {
-    vp9_adjust_key_frame_context(cpi);
-  }
-
-  // Keep a record of ambient average Q.
-  if (cm->frame_type != KEY_FRAME)
-    cpi->avg_frame_qindex = (2 + 3 * cpi->avg_frame_qindex + cm->base_qindex) >> 2;
-
-  // Keep a record from which we can calculate the average Q excluding
-  // GF updates and key frames.
-  if ((cm->frame_type != KEY_FRAME) &&
-      !cm->refresh_golden_frame && !cm->refresh_alt_ref_frame) {
-    cpi->ni_frames++;
-    cpi->tot_q += vp9_convert_qindex_to_q(Q);
-    cpi->avg_q = cpi->tot_q / (double)cpi->ni_frames;
-
-    // Calculate the average Q for normal inter frames (not key or GFU
-    // frames).
-    cpi->ni_tot_qi += Q;
-    cpi->ni_av_qi = (cpi->ni_tot_qi / cpi->ni_frames);
-  }
-
-  // Update the buffer level variable.
-  // Non-viewable frames are a special case and are treated as pure overhead.
-  if (!cm->show_frame)
-    cpi->bits_off_target -= cpi->projected_frame_size;
-  else
-    cpi->bits_off_target += cpi->av_per_frame_bandwidth - cpi->projected_frame_size;
-
-  // Clip the buffer level at the maximum buffer size
-  if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size)
-    cpi->bits_off_target = cpi->oxcf.maximum_buffer_size;
-
-  // Rolling monitors of whether we are over- or under-spending, used to
-  // help regulate min and max Q in two-pass mode.
-  cpi->rolling_target_bits =
-      ((cpi->rolling_target_bits * 3) + cpi->this_frame_target + 2) / 4;
-  cpi->rolling_actual_bits =
-      ((cpi->rolling_actual_bits * 3) + cpi->projected_frame_size + 2) / 4;
-  cpi->long_rolling_target_bits =
-      ((cpi->long_rolling_target_bits * 31) + cpi->this_frame_target + 16) / 32;
-  cpi->long_rolling_actual_bits =
-      ((cpi->long_rolling_actual_bits * 31) + cpi->projected_frame_size + 16) / 32;
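-  // [Editor's note] Each rolling monitor above is an exponentially
-  // weighted moving average, with weight 3/4 (or 31/32 for the long
-  // variants) on history and rounding to nearest. A standalone sketch
-  // with a hypothetical helper (not part of libvpx):
-  //
-  //   static int ema_update(int avg, int sample, int num, int den) {
-  //     // Valid when den - num == 1, as in the 3/4 and 31/32 cases:
-  //     // new_avg = (num * avg + sample) / den, rounded to nearest.
-  //     return (num * avg + sample + den / 2) / den;
-  //   }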
-
-  // Actual bits spent
-  cpi->total_actual_bits    += cpi->projected_frame_size;
-
-  // Debug stats
-  cpi->total_target_vs_actual += (cpi->this_frame_target - cpi->projected_frame_size);
-
-  cpi->buffer_level = cpi->bits_off_target;
-
-  // Update bits left to the kf and gf groups to account for overshoot
-  // or undershoot on these frames.
-  if (cm->frame_type == KEY_FRAME) {
-    cpi->twopass.kf_group_bits += cpi->this_frame_target - cpi->projected_frame_size;
-
-    if (cpi->twopass.kf_group_bits < 0)
-      cpi->twopass.kf_group_bits = 0;
-  } else if (cm->refresh_golden_frame || cm->refresh_alt_ref_frame) {
-    cpi->twopass.gf_group_bits += cpi->this_frame_target - cpi->projected_frame_size;
-
-    if (cpi->twopass.gf_group_bits < 0)
-      cpi->twopass.gf_group_bits = 0;
-  }
-
-  // Update the skip mb flag probabilities based on the distribution seen
-  // in this frame.
-  update_base_skip_probs(cpi);
-
-#if 0  // CONFIG_NEW_MVREF && CONFIG_INTERNAL_STATS
-  {
-    FILE *f = fopen("mv_ref_dist.stt", "a");
-    unsigned int i;
-    for (i = 0; i < MAX_MV_REFS; ++i) {
-      fprintf(f, "%10d", cpi->best_ref_index_counts[0][i]);
-    }
-    fprintf(f, "\n" );
-
-    fclose(f);
-  }
-#endif
-
-#if 0  // 1 && CONFIG_INTERNAL_STATS
-  {
-    FILE *f = fopen("tmp.stt", "a");
-    int recon_err;
-
-    vp9_clear_system_state();  // __asm emms;
-
-    recon_err = vp9_calc_ss_err(cpi->Source,
-                                &cm->yv12_fb[cm->new_fb_idx]);
-
-    if (cpi->twopass.total_left_stats->coded_error != 0.0)
-      fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d"
-              "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"
-              "%6d %5d %5d %5d %8d %8.2f %10d %10.3f"
-              "%10.3f %8d %10d %10d %10d\n",
-              cpi->common.current_video_frame, cpi->this_frame_target,
-              cpi->projected_frame_size, loop_size_estimate,
-              (cpi->projected_frame_size - cpi->this_frame_target),
-              (int)cpi->total_target_vs_actual,
-              (cpi->oxcf.starting_buffer_level - cpi->bits_off_target),
-              (int)cpi->total_actual_bits,
-              vp9_convert_qindex_to_q(cm->base_qindex),
-              (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0,
-              vp9_convert_qindex_to_q(cpi->active_best_quality),
-              vp9_convert_qindex_to_q(cpi->active_worst_quality),
-              cpi->avg_q,
-              vp9_convert_qindex_to_q(cpi->ni_av_qi),
-              vp9_convert_qindex_to_q(cpi->cq_target_quality),
-              cpi->zbin_over_quant,
-              // cpi->avg_frame_qindex, cpi->zbin_over_quant,
-              cm->refresh_golden_frame, cm->refresh_alt_ref_frame,
-              cm->frame_type, cpi->gfu_boost,
-              cpi->twopass.est_max_qcorrection_factor,
-              (int)cpi->twopass.bits_left,
-              cpi->twopass.total_left_stats->coded_error,
-              (double)cpi->twopass.bits_left /
-              cpi->twopass.total_left_stats->coded_error,
-              cpi->tot_recode_hits, recon_err, cpi->kf_boost,
-              cpi->kf_zeromotion_pct);
-    else
-      fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d"
-              "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"
-              "%6d %5d %5d %5d %8d %8.2f %10d %10.3f"
-              "%8d %10d %10d %10d\n",
-              cpi->common.current_video_frame,
-              cpi->this_frame_target, cpi->projected_frame_size,
-              loop_size_estimate,
-              (cpi->projected_frame_size - cpi->this_frame_target),
-              (int)cpi->total_target_vs_actual,
-              (cpi->oxcf.starting_buffer_level - cpi->bits_off_target),
-              (int)cpi->total_actual_bits,
-              vp9_convert_qindex_to_q(cm->base_qindex),
-              (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0,
-              vp9_convert_qindex_to_q(cpi->active_best_quality),
-              vp9_convert_qindex_to_q(cpi->active_worst_quality),
-              cpi->avg_q,
-              vp9_convert_qindex_to_q(cpi->ni_av_qi),
-              vp9_convert_qindex_to_q(cpi->cq_target_quality),
-              cpi->zbin_over_quant,
-              // cpi->avg_frame_qindex, cpi->zbin_over_quant,
-              cm->refresh_golden_frame, cm->refresh_alt_ref_frame,
-              cm->frame_type, cpi->gfu_boost,
-              cpi->twopass.est_max_qcorrection_factor,
-              (int)cpi->twopass.bits_left,
-              cpi->twopass.total_left_stats->coded_error,
-              cpi->tot_recode_hits, recon_err, cpi->kf_boost,
-              cpi->kf_zeromotion_pct);
-
-    fclose(f);
-
-    if (0) {
-      FILE *fmodes = fopen("Modes.stt", "a");
-      int i;
-
-      fprintf(fmodes, "%6d:%1d:%1d:%1d ",
-              cpi->common.current_video_frame,
-              cm->frame_type, cm->refresh_golden_frame,
-              cm->refresh_alt_ref_frame);
-
-      for (i = 0; i < MAX_MODES; i++)
-        fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]);
-
-      fprintf(fmodes, "\n");
-
-      fclose(fmodes);
-    }
-  }
-
-#endif
-
-#if 0
-  // Debug stats for segment feature experiments.
-  print_seg_map(cpi);
-#endif
-
-  // If this was a KF or GF, note the Q.
-  if ((cm->frame_type == KEY_FRAME) || cm->refresh_golden_frame || cm->refresh_alt_ref_frame)
-    cm->last_kf_gf_q = cm->base_qindex;
-
-  if (cm->refresh_golden_frame == 1)
-    cm->frame_flags |= FRAMEFLAGS_GOLDEN;
-  else
-    cm->frame_flags &= ~FRAMEFLAGS_GOLDEN;
-
-  if (cm->refresh_alt_ref_frame == 1)
-    cm->frame_flags |= FRAMEFLAGS_ALTREF;
-  else
-    cm->frame_flags &= ~FRAMEFLAGS_ALTREF;
-
-
-  if (cm->refresh_last_frame & cm->refresh_golden_frame)  // both refreshed
-    cpi->gold_is_last = 1;
-  else if (cm->refresh_last_frame ^ cm->refresh_golden_frame)
-    cpi->gold_is_last = 0;  // one refreshed but not the other
-
-  if (cm->refresh_last_frame & cm->refresh_alt_ref_frame)  // both refreshed
-    cpi->alt_is_last = 1;
-  else if (cm->refresh_last_frame ^ cm->refresh_alt_ref_frame)
-    cpi->alt_is_last = 0;  // one refreshed but not the other
-
-  if (cm->refresh_alt_ref_frame & cm->refresh_golden_frame)  // both refreshed
-    cpi->gold_is_alt = 1;
-  else if (cm->refresh_alt_ref_frame ^ cm->refresh_golden_frame)
-    cpi->gold_is_alt = 0;  // one refreshed but not the other
-
-  cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
-
-  if (cpi->gold_is_last)
-    cpi->ref_frame_flags &= ~VP9_GOLD_FLAG;
-
-  if (cpi->alt_is_last)
-    cpi->ref_frame_flags &= ~VP9_ALT_FLAG;
-
-  if (cpi->gold_is_alt)
-    cpi->ref_frame_flags &= ~VP9_ALT_FLAG;
-
-  if (cpi->oxcf.play_alternate && cm->refresh_alt_ref_frame &&
-      (cm->frame_type != KEY_FRAME))
-    // Update the alternate reference frame stats as appropriate.
-    update_alt_ref_frame_stats(cpi);
-  else
-    // Update the Golden frame stats as appropriate.
-    update_golden_frame_stats(cpi);
-
-  if (cm->frame_type == KEY_FRAME) {
-    // Tell the caller that the frame was coded as a key frame
-    *frame_flags = cm->frame_flags | FRAMEFLAGS_KEY;
-
-    // As this frame is a key frame, the next defaults to an inter frame.
-    cm->frame_type = INTER_FRAME;
-  } else {
-    *frame_flags = cm->frame_flags & ~FRAMEFLAGS_KEY;
-  }
-
-  // Clear the one-shot update flags for the segmentation map and the
-  // mode/ref loop filter deltas.
-  xd->update_mb_segmentation_map = 0;
-  xd->update_mb_segmentation_data = 0;
-  xd->mode_ref_lf_delta_update = 0;
-
-
-  // Don't increment frame counters if this was an altref buffer update,
-  // not a real frame.
-  if (cm->show_frame) {
-    cm->current_video_frame++;
-    cpi->frames_since_key++;
-  }
-
-  // Reset to normal state now that we are done.
-
-#if 0
-  {
-    char filename[512];
-    FILE *recon_file;
-    sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame);
-    recon_file = fopen(filename, "wb");
-    fwrite(cm->yv12_fb[cm->lst_fb_idx].buffer_alloc,
-           cm->yv12_fb[cm->lst_fb_idx].frame_size, 1, recon_file);
-    fclose(recon_file);
-  }
-#endif
-#ifdef OUTPUT_YUV_REC
-  vp9_write_yuv_rec_frame(cm);
-#endif
-
-  if (cm->show_frame) {
-    vpx_memcpy(cm->prev_mip, cm->mip,
-               (cm->mb_cols + 1) * (cm->mb_rows + 1) * sizeof(MODE_INFO));
-  } else {
-    vpx_memset(cm->prev_mip, 0,
-               (cm->mb_cols + 1) * (cm->mb_rows + 1) * sizeof(MODE_INFO));
-  }
-}
-
-static void Pass2Encode(VP9_COMP *cpi, unsigned long *size,
-                        unsigned char *dest, unsigned int *frame_flags) {
-
-  if (!cpi->common.refresh_alt_ref_frame)
-    vp9_second_pass(cpi);
-
-  encode_frame_to_data_rate(cpi, size, dest, frame_flags);
-  cpi->twopass.bits_left -= 8 * *size;
-
-  if (!cpi->common.refresh_alt_ref_frame) {
-    double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.frame_rate;
-    double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth
-                                        * cpi->oxcf.two_pass_vbrmin_section / 100);
-
-    if (two_pass_min_rate < lower_bounds_min_rate)
-      two_pass_min_rate = lower_bounds_min_rate;
-
-    cpi->twopass.bits_left += (int64_t)(two_pass_min_rate / cpi->oxcf.frame_rate);
-  }
-}
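-// [Editor's note] The bits_left top-up above credits each non-ARF frame
-// with the guaranteed minimum rate, so the running two-pass budget never
-// assumes less than two_pass_vbrmin_section percent of the target
-// bandwidth. Example (editor's illustration): at a 1,000,000 bps target,
-// 30 fps and two_pass_vbrmin_section = 5, each such frame adds back
-// 50000 / 30 = 1666 bits, unless the FRAME_OVERHEAD_BITS floor works
-// out larger.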
-
-// For ARM NEON, d8-d15 are callee-saved registers, and need to be saved by us.
-#if HAVE_ARMV7
-extern void vp9_push_neon(int64_t *store);
-extern void vp9_pop_neon(int64_t *store);
-#endif
-
-
-int vp9_receive_raw_frame(VP9_PTR ptr, unsigned int frame_flags,
-                          YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
-                          int64_t end_time) {
-#if HAVE_ARMV7
-  int64_t store_reg[8];
-#endif
-  VP9_COMP              *cpi = (VP9_COMP *) ptr;
-  VP9_COMMON            *cm = &cpi->common;
-  struct vpx_usec_timer  timer;
-  int                    res = 0;
-
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
-  if (cm->rtcd.flags & HAS_NEON)
-#endif
-  {
-    vp9_push_neon(store_reg);
-  }
-#endif
-
-  vpx_usec_timer_start(&timer);
-  if (vp9_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, frame_flags,
-                         cpi->active_map_enabled ? cpi->active_map : NULL))
-    res = -1;
-  cm->clr_type = sd->clrtype;
-  vpx_usec_timer_mark(&timer);
-  cpi->time_receive_data += vpx_usec_timer_elapsed(&timer);
-
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
-  if (cm->rtcd.flags & HAS_NEON)
-#endif
-  {
-    vp9_pop_neon(store_reg);
-  }
-#endif
-
-  return res;
-}
-
-
-static int frame_is_reference(const VP9_COMP *cpi) {
-  const VP9_COMMON *cm = &cpi->common;
-  const MACROBLOCKD *xd = &cpi->mb.e_mbd;
-
-  return cm->frame_type == KEY_FRAME || cm->refresh_last_frame
-         || cm->refresh_golden_frame || cm->refresh_alt_ref_frame
-         || cm->copy_buffer_to_gf || cm->copy_buffer_to_arf
-         || cm->refresh_entropy_probs
-         || xd->mode_ref_lf_delta_update
-         || xd->update_mb_segmentation_map || xd->update_mb_segmentation_data;
-}
-
-
-int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
-                            unsigned long *size, unsigned char *dest,
-                            int64_t *time_stamp, int64_t *time_end, int flush) {
-#if HAVE_ARMV7
-  int64_t store_reg[8];
-#endif
-  VP9_COMP *cpi = (VP9_COMP *) ptr;
-  VP9_COMMON *cm = &cpi->common;
-  struct vpx_usec_timer  cmptimer;
-  YV12_BUFFER_CONFIG    *force_src_buffer = NULL;
-
-  if (!cpi)
-    return -1;
-
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
-  if (cm->rtcd.flags & HAS_NEON)
-#endif
-  {
-    vp9_push_neon(store_reg);
-  }
-#endif
-
-  vpx_usec_timer_start(&cmptimer);
-
-  cpi->source = NULL;
-
-  cpi->mb.e_mbd.allow_high_precision_mv = ALTREF_HIGH_PRECISION_MV;
-  // Should we code an alternate reference frame?
-  if (cpi->oxcf.play_alternate &&
-      cpi->source_alt_ref_pending) {
-    if ((cpi->source = vp9_lookahead_peek(cpi->lookahead,
-                                          cpi->frames_till_gf_update_due))) {
-      cpi->alt_ref_source = cpi->source;
-      if (cpi->oxcf.arnr_max_frames > 0) {
-        vp9_temporal_filter_prepare_c(cpi,
-                                      cpi->frames_till_gf_update_due);
-        force_src_buffer = &cpi->alt_ref_buffer;
-      }
-      cm->frames_till_alt_ref_frame = cpi->frames_till_gf_update_due;
-      cm->refresh_alt_ref_frame = 1;
-      cm->refresh_golden_frame = 0;
-      cm->refresh_last_frame = 0;
-      cm->show_frame = 0;
-      cpi->source_alt_ref_pending = FALSE;   // Clear pending alt ref flag.
-      cpi->is_src_frame_alt_ref = 0;
-    }
-  }
-
-  if (!cpi->source) {
-    if ((cpi->source = vp9_lookahead_pop(cpi->lookahead, flush))) {
-      cm->show_frame = 1;
-
-      cpi->is_src_frame_alt_ref = cpi->alt_ref_source
-                                  && (cpi->source == cpi->alt_ref_source);
-
-      if (cpi->is_src_frame_alt_ref)
-        cpi->alt_ref_source = NULL;
-    }
-  }
-
-  if (cpi->source) {
-    cpi->un_scaled_source =
-      cpi->Source = force_src_buffer ? force_src_buffer : &cpi->source->img;
-    *time_stamp = cpi->source->ts_start;
-    *time_end = cpi->source->ts_end;
-    *frame_flags = cpi->source->flags;
-  } else {
-    *size = 0;
-    if (flush && cpi->pass == 1 && !cpi->twopass.first_pass_done) {
-      vp9_end_first_pass(cpi);    /* get last stats packet */
-      cpi->twopass.first_pass_done = 1;
-    }
-
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
-    if (cm->rtcd.flags & HAS_NEON)
-#endif
-    {
-      vp9_pop_neon(store_reg);
-    }
-#endif
-    return -1;
-  }
-
-  if (cpi->source->ts_start < cpi->first_time_stamp_ever) {
-    cpi->first_time_stamp_ever = cpi->source->ts_start;
-    cpi->last_end_time_stamp_seen = cpi->source->ts_start;
-  }
-
-  // Adjust the frame rate based on the timestamps given.
-  if (!cm->refresh_alt_ref_frame) {
-    int64_t this_duration;
-    int step = 0;
-
-    if (cpi->source->ts_start == cpi->first_time_stamp_ever) {
-      this_duration = cpi->source->ts_end - cpi->source->ts_start;
-      step = 1;
-    } else {
-      int64_t last_duration;
-
-      this_duration = cpi->source->ts_end - cpi->last_end_time_stamp_seen;
-      last_duration = cpi->last_end_time_stamp_seen
-                      - cpi->last_time_stamp_seen;
-      // do a step update if the duration changes by 10%
-      if (last_duration)
-        step = ((this_duration - last_duration) * 10 / last_duration);
-    }
-
-    if (this_duration) {
-      if (step)
-        vp9_new_frame_rate(cpi, 10000000.0 / this_duration);
-      else {
-        double avg_duration, interval;
-
-        /* Average this frame's rate into the last second's average
-         * frame rate. If we haven't seen 1 second yet, then average
-         * over the whole interval seen.
-         */
-        interval = cpi->source->ts_end - cpi->first_time_stamp_ever;
-        if (interval > 10000000.0)
-          interval = 10000000;
-
-        avg_duration = 10000000.0 / cpi->oxcf.frame_rate;
-        avg_duration *= (interval - avg_duration + this_duration);
-        avg_duration /= interval;
-
-        vp9_new_frame_rate(cpi, 10000000.0 / avg_duration);
-      }
-    }
-
-    cpi->last_time_stamp_seen = cpi->source->ts_start;
-    cpi->last_end_time_stamp_seen = cpi->source->ts_end;
-  }
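-  // [Editor's note] Timestamps here are in 1/10000000-second units, so
-  // 10000000.0 / duration converts a frame duration straight to a rate.
-  // Example (editor's illustration): a steady 333333-tick duration is
-  // ~30 fps; if one frame then arrives with this_duration = 400000
-  // (25 fps), step = (400000 - 333333) * 10 / 333333 = 2 (a >= 10%
-  // change), so the rate is stepped immediately rather than averaged
-  // into the last second's window.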
-
-  // start with a 0 size frame
-  *size = 0;
-
-  // Clear down mmx registers
-  vp9_clear_system_state();  // __asm emms;
-
-  cm->frame_type = INTER_FRAME;
-  cm->frame_flags = *frame_flags;
-
-#if 0
-
-  if (cm->refresh_alt_ref_frame) {
-    // cm->refresh_golden_frame = 1;
-    cm->refresh_golden_frame = 0;
-    cm->refresh_last_frame = 0;
-  } else {
-    cm->refresh_golden_frame = 0;
-    cm->refresh_last_frame = 1;
-  }
-
-#endif
-  /* find a free buffer for the new frame */
-  {
-    int i = 0;
-    for (; i < NUM_YV12_BUFFERS; i++) {
-      if (!cm->yv12_fb[i].flags) {
-        cm->new_fb_idx = i;
-        break;
-      }
-    }
-
-    assert(i < NUM_YV12_BUFFERS);
-  }
-  if (cpi->pass == 1) {
-    Pass1Encode(cpi, size, dest, frame_flags);
-  } else if (cpi->pass == 2) {
-    Pass2Encode(cpi, size, dest, frame_flags);
-  } else {
-    encode_frame_to_data_rate(cpi, size, dest, frame_flags);
-  }
-
-  if (cm->refresh_entropy_probs) {
-    if (cm->refresh_alt_ref_frame)
-      vpx_memcpy(&cm->lfc_a, &cm->fc, sizeof(cm->fc));
-    else
-      vpx_memcpy(&cm->lfc, &cm->fc, sizeof(cm->fc));
-  }
-
-  // If it's a dropped frame, honor the requests on subsequent frames.
-  if (*size > 0) {
-    cpi->droppable = !frame_is_reference(cpi);
-
-    // return to normal state
-    cm->refresh_entropy_probs = 1;
-    cm->refresh_alt_ref_frame = 0;
-    cm->refresh_golden_frame = 0;
-    cm->refresh_last_frame = 1;
-    cm->frame_type = INTER_FRAME;
-
-  }
-
-  vpx_usec_timer_mark(&cmptimer);
-  cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer);
-
-  if (cpi->b_calculate_psnr && cpi->pass != 1 && cm->show_frame) {
-    generate_psnr_packet(cpi);
-  }
-
-#if CONFIG_INTERNAL_STATS
-
-  if (cpi->pass != 1) {
-    cpi->bytes += *size;
-
-    if (cm->show_frame) {
-
-      cpi->count++;
-
-      if (cpi->b_calculate_psnr) {
-        double ye, ue, ve;
-        double frame_psnr;
-        YV12_BUFFER_CONFIG      *orig = cpi->Source;
-        YV12_BUFFER_CONFIG      *recon = cpi->common.frame_to_show;
-        YV12_BUFFER_CONFIG      *pp = &cm->post_proc_buffer;
-        int y_samples = orig->y_height * orig->y_width;
-        int uv_samples = orig->uv_height * orig->uv_width;
-        int t_samples = y_samples + 2 * uv_samples;
-        int64_t sq_error;
-
-        ye = calc_plane_error(orig->y_buffer, orig->y_stride,
-                              recon->y_buffer, recon->y_stride, orig->y_width,
-                              orig->y_height);
-
-        ue = calc_plane_error(orig->u_buffer, orig->uv_stride,
-                              recon->u_buffer, recon->uv_stride, orig->uv_width,
-                              orig->uv_height);
-
-        ve = calc_plane_error(orig->v_buffer, orig->uv_stride,
-                              recon->v_buffer, recon->uv_stride, orig->uv_width,
-                              orig->uv_height);
-
-        sq_error = ye + ue + ve;
-
-        frame_psnr = vp9_mse2psnr(t_samples, 255.0, sq_error);
-
-        cpi->total_y += vp9_mse2psnr(y_samples, 255.0, ye);
-        cpi->total_u += vp9_mse2psnr(uv_samples, 255.0, ue);
-        cpi->total_v += vp9_mse2psnr(uv_samples, 255.0, ve);
-        cpi->total_sq_error += sq_error;
-        cpi->total  += frame_psnr;
-        {
-          double frame_psnr2, frame_ssim2 = 0;
-          double weight = 0;
-#if CONFIG_POSTPROC
-          vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer, cm->filter_level * 10 / 6, 1, 0, IF_RTCD(&cm->rtcd.postproc));
-#endif
-          vp9_clear_system_state();
-
-          ye = calc_plane_error(orig->y_buffer, orig->y_stride,
-                                pp->y_buffer, pp->y_stride, orig->y_width,
-                                orig->y_height);
-
-          ue = calc_plane_error(orig->u_buffer, orig->uv_stride,
-                                pp->u_buffer, pp->uv_stride, orig->uv_width,
-                                orig->uv_height);
-
-          ve = calc_plane_error(orig->v_buffer, orig->uv_stride,
-                                pp->v_buffer, pp->uv_stride, orig->uv_width,
-                                orig->uv_height);
-
-          sq_error = ye + ue + ve;
-
-          frame_psnr2 = vp9_mse2psnr(t_samples, 255.0, sq_error);
-
-          cpi->totalp_y += vp9_mse2psnr(y_samples, 255.0, ye);
-          cpi->totalp_u += vp9_mse2psnr(uv_samples, 255.0, ue);
-          cpi->totalp_v += vp9_mse2psnr(uv_samples, 255.0, ve);
-          cpi->total_sq_error2 += sq_error;
-          cpi->totalp  += frame_psnr2;
-
-          frame_ssim2 = vp9_calc_ssim(cpi->Source,
-                                      &cm->post_proc_buffer, 1, &weight);
-
-          cpi->summed_quality += frame_ssim2 * weight;
-          cpi->summed_weights += weight;
-#if 0
-          {
-            FILE *f = fopen("q_used.stt", "a");
-            fprintf(f, "%5d : Y%f7.3:U%f7.3:V%f7.3:F%f7.3:S%7.3f\n",
-                    cpi->common.current_video_frame, y2, u2, v2,
-                    frame_psnr2, frame_ssim2);
-            fclose(f);
-          }
-#endif
-        }
-      }
-
-      if (cpi->b_calculate_ssimg) {
-        double y, u, v, frame_all;
-        frame_all =  vp9_calc_ssimg(cpi->Source, cm->frame_to_show,
-                                    &y, &u, &v);
-        cpi->total_ssimg_y += y;
-        cpi->total_ssimg_u += u;
-        cpi->total_ssimg_v += v;
-        cpi->total_ssimg_all += frame_all;
-      }
-
-    }
-  }
-
-#endif
-
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
-  if (cm->rtcd.flags & HAS_NEON)
-#endif
-  {
-    vp9_pop_neon(store_reg);
-  }
-#endif
-
-  return 0;
-}
-
-int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest,
-                              vp9_ppflags_t *flags) {
-  VP9_COMP *cpi = (VP9_COMP *) comp;
-
-  if (cpi->common.refresh_alt_ref_frame)
-    return -1;
-  else {
-    int ret;
-#if CONFIG_POSTPROC
-    ret = vp9_post_proc_frame(&cpi->common, dest, flags);
-#else
-
-    if (cpi->common.frame_to_show) {
-      *dest = *cpi->common.frame_to_show;
-      dest->y_width = cpi->common.Width;
-      dest->y_height = cpi->common.Height;
-      dest->uv_height = cpi->common.Height / 2;
-      ret = 0;
-    } else {
-      ret = -1;
-    }
-
-#endif // !CONFIG_POSTPROC
-    vp9_clear_system_state();
-    return ret;
-  }
-}
-
-int vp9_set_roimap(VP9_PTR comp, unsigned char *map, unsigned int rows,
-                   unsigned int cols, int delta_q[4], int delta_lf[4],
-                   unsigned int threshold[4]) {
-  VP9_COMP *cpi = (VP9_COMP *) comp;
-  signed char feature_data[SEG_LVL_MAX][MAX_MB_SEGMENTS];
-  MACROBLOCKD *xd = &cpi->mb.e_mbd;
-  int i;
-
-  if (cpi->common.mb_rows != rows || cpi->common.mb_cols != cols)
-    return -1;
-
-  if (!map) {
-    vp9_disable_segmentation((VP9_PTR)cpi);
-    return 0;
-  }
-
-  // Set the segmentation Map
-  vp9_set_segmentation_map((VP9_PTR)cpi, map);
-
-  // Activate segmentation.
-  vp9_enable_segmentation((VP9_PTR)cpi);
-
-  // Set up the quant segment data
-  feature_data[SEG_LVL_ALT_Q][0] = delta_q[0];
-  feature_data[SEG_LVL_ALT_Q][1] = delta_q[1];
-  feature_data[SEG_LVL_ALT_Q][2] = delta_q[2];
-  feature_data[SEG_LVL_ALT_Q][3] = delta_q[3];
-
-  // Set up the loop filter segment data.
-  feature_data[SEG_LVL_ALT_LF][0] = delta_lf[0];
-  feature_data[SEG_LVL_ALT_LF][1] = delta_lf[1];
-  feature_data[SEG_LVL_ALT_LF][2] = delta_lf[2];
-  feature_data[SEG_LVL_ALT_LF][3] = delta_lf[3];
-
-  cpi->segment_encode_breakout[0] = threshold[0];
-  cpi->segment_encode_breakout[1] = threshold[1];
-  cpi->segment_encode_breakout[2] = threshold[2];
-  cpi->segment_encode_breakout[3] = threshold[3];
-
-  // Enable the loop and quant changes in the feature mask
-  for (i = 0; i < 4; i++) {
-    if (delta_q[i])
-      vp9_enable_segfeature(xd, i, SEG_LVL_ALT_Q);
-    else
-      vp9_disable_segfeature(xd, i, SEG_LVL_ALT_Q);
-
-    if (delta_lf[i])
-      vp9_enable_segfeature(xd, i, SEG_LVL_ALT_LF);
-    else
-      vp9_disable_segfeature(xd, i, SEG_LVL_ALT_LF);
-  }
-
-  // Initialise the feature data structure
-  // (SEGMENT_DELTADATA = 0, SEGMENT_ABSDATA = 1).
-  vp9_set_segment_data((VP9_PTR)cpi, &feature_data[0][0], SEGMENT_DELTADATA);
-
-  return 0;
-}
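-// [Editor's note] Minimal usage sketch for vp9_set_roimap (editor's
-// illustration; `encoder` is a hypothetical VP9_PTR and the dimensions
-// must match the encoder's mb_rows/mb_cols; map entries are segment
-// ids 0..3, one byte per MB):
-//
-//   unsigned char map[rows * cols];
-//   memset(map, 0, rows * cols);         // everything in segment 0 ...
-//   map[0] = 1;                          // ... except one MB in segment 1
-//   int delta_q[4]  = { 0, -10, 0, 0 };  // code segment 1 at lower Q
-//   int delta_lf[4] = { 0, 0, 0, 0 };
-//   unsigned int threshold[4] = { 0, 0, 0, 0 };
-//   vp9_set_roimap(encoder, map, rows, cols, delta_q, delta_lf,
-//                  threshold);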
-
-int vp9_set_active_map(VP9_PTR comp, unsigned char *map,
-                       unsigned int rows, unsigned int cols) {
-  VP9_COMP *cpi = (VP9_COMP *) comp;
-
-  if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) {
-    if (map) {
-      vpx_memcpy(cpi->active_map, map, rows * cols);
-      cpi->active_map_enabled = 1;
-    } else
-      cpi->active_map_enabled = 0;
-
-    return 0;
-  } else {
-    // cpi->active_map_enabled = 0;
-    return -1;
-  }
-}
-
-int vp9_set_internal_size(VP9_PTR comp,
-                          VPX_SCALING horiz_mode, VPX_SCALING vert_mode) {
-  VP9_COMP *cpi = (VP9_COMP *) comp;
-
-  if (horiz_mode <= ONETWO)
-    cpi->common.horiz_scale = horiz_mode;
-  else
-    return -1;
-
-  if (vert_mode <= ONETWO)
-    cpi->common.vert_scale  = vert_mode;
-  else
-    return -1;
-
-  return 0;
-}
-
-
-
-int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest) {
-  int i, j;
-  int Total = 0;
-
-  unsigned char *src = source->y_buffer;
-  unsigned char *dst = dest->y_buffer;
-
-  // Loop through the Y plane of the raw and reconstructed data, summing
-  // squared differences.
-  for (i = 0; i < source->y_height; i += 16) {
-    for (j = 0; j < source->y_width; j += 16) {
-      unsigned int sse;
-      Total += vp9_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride,
-                            &sse);
-    }
-
-    src += 16 * source->y_stride;
-    dst += 16 * dest->y_stride;
-  }
-
-  return Total;
-}
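-// [Editor's note] The value returned above is a raw sum of squared
-// differences over the Y plane. To express such a sum as PSNR (as the
-// internal stats code does via vp9_mse2psnr), the standard conversion
-// is, as a sketch:
-//
-//   double sse_to_psnr(double samples, double peak, double sse) {
-//     // PSNR = 10 * log10(peak^2 / MSE), with MSE = sse / samples.
-//     return sse > 0.0 ? 10.0 * log10(peak * peak * samples / sse)
-//                      : 99.0;  // arbitrary cap for a perfect match
-//   }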
-
-
-int vp9_get_quantizer(VP9_PTR c) {
-  VP9_COMP   *cpi = (VP9_COMP *) c;
-  return cpi->common.base_qindex;
-}
--- a/vp8/encoder/onyx_int.h
+++ /dev/null
@@ -1,788 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_ONYX_INT_H
-#define __INC_ONYX_INT_H
-
-#include <stdio.h>
-#include "vpx_ports/config.h"
-#include "vp8/common/onyx.h"
-#include "treewriter.h"
-#include "tokenize.h"
-#include "vp8/common/onyxc_int.h"
-#include "variance.h"
-#include "encodemb.h"
-#include "quantize.h"
-#include "vp8/common/entropy.h"
-#include "vp8/common/entropymode.h"
-#include "vpx_ports/mem.h"
-#include "vpx/internal/vpx_codec_internal.h"
-#include "mcomp.h"
-#include "temporal_filter.h"
-#include "vp8/common/findnearmv.h"
-#include "lookahead.h"
-
-// #define SPEEDSTATS 1
-#define MIN_GF_INTERVAL             4
-#define DEFAULT_GF_INTERVAL         7
-
-#define KEY_FRAME_CONTEXT 5
-
-#define MAX_LAG_BUFFERS 25
-
-#define AF_THRESH   25
-#define AF_THRESH2  100
-#define ARF_DECAY_THRESH 12
-
-#if CONFIG_PRED_FILTER
-#define MAX_MODES 54
-#else  // CONFIG_PRED_FILTER
-#define MAX_MODES 42
-#endif  // CONFIG_PRED_FILTER
-
-#define MIN_THRESHMULT  32
-#define MAX_THRESHMULT  512
-
-#define GF_ZEROMV_ZBIN_BOOST 12
-#define LF_ZEROMV_ZBIN_BOOST 6
-#define MV_ZBIN_BOOST        4
-#define ZBIN_OQ_MAX 192
-
-#define VP9_TEMPORAL_ALT_REF 1
-
-typedef struct {
-  nmv_context nmvc;
-  int nmvjointcost[MV_JOINTS];
-  int nmvcosts[2][MV_VALS];
-  int nmvcosts_hp[2][MV_VALS];
-
-#ifdef MODE_STATS
-  // Stats
-  int y_modes[VP9_YMODES];
-  int uv_modes[VP9_UV_MODES];
-  int i8x8_modes[VP9_I8X8_MODES];
-  int b_modes[B_MODE_COUNT];
-  int inter_y_modes[MB_MODE_COUNT];
-  int inter_uv_modes[VP9_UV_MODES];
-  int inter_b_modes[B_MODE_COUNT];
-#endif
-
-  vp9_prob segment_pred_probs[PREDICTION_PROBS];
-  unsigned char ref_pred_probs_update[PREDICTION_PROBS];
-  vp9_prob ref_pred_probs[PREDICTION_PROBS];
-  vp9_prob prob_comppred[COMP_PRED_CONTEXTS];
-
-  unsigned char *last_frame_seg_map_copy;
-
-  // 0 = Intra, Last, GF, ARF
-  signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS];
-  // 0 = BPRED, ZERO_MV, MV, SPLIT
-  signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];
-
-  vp9_prob coef_probs[BLOCK_TYPES]
-      [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
-  vp9_prob hybrid_coef_probs[BLOCK_TYPES]
-      [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
-
-  vp9_prob coef_probs_8x8[BLOCK_TYPES_8X8]
-      [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
-  vp9_prob hybrid_coef_probs_8x8[BLOCK_TYPES_8X8]
-      [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
-
-  vp9_prob coef_probs_16x16[BLOCK_TYPES_16X16]
-      [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
-  vp9_prob hybrid_coef_probs_16x16[BLOCK_TYPES_16X16]
-      [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
-
-  vp9_prob ymode_prob [VP9_YMODES - 1]; /* interframe intra mode probs */
-  vp9_prob uv_mode_prob [VP9_YMODES][VP9_UV_MODES - 1];
-  vp9_prob bmode_prob [VP9_BINTRAMODES - 1];
-  vp9_prob i8x8_mode_prob [VP9_I8X8_MODES - 1];
-  vp9_prob sub_mv_ref_prob [SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
-  vp9_prob mbsplit_prob [VP9_NUMMBSPLITS - 1];
-
-  vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
-                                 [VP9_SWITCHABLE_FILTERS - 1];
-
-  int mv_ref_ct[6][4][2];
-  int mode_context[6][4];
-  int mv_ref_ct_a[6][4][2];
-  int mode_context_a[6][4];
-
-} CODING_CONTEXT;
-
-typedef struct {
-  double frame;
-  double intra_error;
-  double coded_error;
-  double sr_coded_error;
-  double ssim_weighted_pred_err;
-  double pcnt_inter;
-  double pcnt_motion;
-  double pcnt_second_ref;
-  double pcnt_neutral;
-  double MVr;
-  double mvr_abs;
-  double MVc;
-  double mvc_abs;
-  double MVrv;
-  double MVcv;
-  double mv_in_out_count;
-  double new_mv_count;
-  double duration;
-  double count;
-}
-FIRSTPASS_STATS;
-
-typedef struct {
-  int frames_so_far;
-  double frame_intra_error;
-  double frame_coded_error;
-  double frame_pcnt_inter;
-  double frame_pcnt_motion;
-  double frame_mvr;
-  double frame_mvr_abs;
-  double frame_mvc;
-  double frame_mvc_abs;
-
-} ONEPASS_FRAMESTATS;
-
-typedef struct {
-  struct {
-    int err;
-    union {
-      int_mv mv;
-      MB_PREDICTION_MODE mode;
-    } m;
-  } ref[MAX_REF_FRAMES];
-} MBGRAPH_MB_STATS;
-
-typedef struct {
-  MBGRAPH_MB_STATS *mb_stats;
-} MBGRAPH_FRAME_STATS;
-
-#if CONFIG_PRED_FILTER
-typedef enum {
-  THR_ZEROMV,
-  THR_ZEROMV_FILT,
-  THR_DC,
-
-  THR_NEARESTMV,
-  THR_NEARESTMV_FILT,
-  THR_NEARMV,
-  THR_NEARMV_FILT,
-
-  THR_ZEROG,
-  THR_ZEROG_FILT,
-  THR_NEARESTG,
-  THR_NEARESTG_FILT,
-
-  THR_ZEROA,
-  THR_ZEROA_FILT,
-  THR_NEARESTA,
-  THR_NEARESTA_FILT,
-
-  THR_NEARG,
-  THR_NEARG_FILT,
-  THR_NEARA,
-  THR_NEARA_FILT,
-
-  THR_V_PRED,
-  THR_H_PRED,
-  THR_D45_PRED,
-  THR_D135_PRED,
-  THR_D117_PRED,
-  THR_D153_PRED,
-  THR_D27_PRED,
-  THR_D63_PRED,
-  THR_TM,
-
-  THR_NEWMV,
-  THR_NEWMV_FILT,
-  THR_NEWG,
-  THR_NEWG_FILT,
-  THR_NEWA,
-  THR_NEWA_FILT,
-
-  THR_SPLITMV,
-  THR_SPLITG,
-  THR_SPLITA,
-
-  THR_B_PRED,
-  THR_I8X8_PRED,
-
-  THR_COMP_ZEROLG,
-  THR_COMP_NEARESTLG,
-  THR_COMP_NEARLG,
-
-  THR_COMP_ZEROLA,
-  THR_COMP_NEARESTLA,
-  THR_COMP_NEARLA,
-
-  THR_COMP_ZEROGA,
-  THR_COMP_NEARESTGA,
-  THR_COMP_NEARGA,
-
-  THR_COMP_NEWLG,
-  THR_COMP_NEWLA,
-  THR_COMP_NEWGA,
-
-  THR_COMP_SPLITLG,
-  THR_COMP_SPLITLA,
-  THR_COMP_SPLITGA,
-}
-THR_MODES;
-#else
-typedef enum {
-  THR_ZEROMV,
-  THR_DC,
-
-  THR_NEARESTMV,
-  THR_NEARMV,
-
-  THR_ZEROG,
-  THR_NEARESTG,
-
-  THR_ZEROA,
-  THR_NEARESTA,
-
-  THR_NEARG,
-  THR_NEARA,
-
-  THR_V_PRED,
-  THR_H_PRED,
-  THR_D45_PRED,
-  THR_D135_PRED,
-  THR_D117_PRED,
-  THR_D153_PRED,
-  THR_D27_PRED,
-  THR_D63_PRED,
-  THR_TM,
-
-  THR_NEWMV,
-  THR_NEWG,
-  THR_NEWA,
-
-  THR_SPLITMV,
-  THR_SPLITG,
-  THR_SPLITA,
-
-  THR_B_PRED,
-  THR_I8X8_PRED,
-
-  THR_COMP_ZEROLG,
-  THR_COMP_NEARESTLG,
-  THR_COMP_NEARLG,
-
-  THR_COMP_ZEROLA,
-  THR_COMP_NEARESTLA,
-  THR_COMP_NEARLA,
-
-  THR_COMP_ZEROGA,
-  THR_COMP_NEARESTGA,
-  THR_COMP_NEARGA,
-
-  THR_COMP_NEWLG,
-  THR_COMP_NEWLA,
-  THR_COMP_NEWGA,
-
-  THR_COMP_SPLITLG,
-  THR_COMP_SPLITLA,
-  THR_COMP_SPLITGA
-}
-THR_MODES;
-#endif
-
-typedef enum {
-  DIAMOND = 0,
-  NSTEP = 1,
-  HEX = 2
-} SEARCH_METHODS;
-
-typedef struct {
-  int RD;
-  SEARCH_METHODS search_method;
-  int improved_dct;
-  int auto_filter;
-  int recode_loop;
-  int iterative_sub_pixel;
-  int half_pixel_search;
-  int quarter_pixel_search;
-  int thresh_mult[MAX_MODES];
-  int max_step_search_steps;
-  int first_step;
-  int optimize_coefficients;
-  int no_skip_block4x4_search;
-  int improved_mv_pred;
-  int search_best_filter;
-
-} SPEED_FEATURES;
-
-typedef struct {
-  MACROBLOCK  mb;
-  int totalrate;
-} MB_ROW_COMP;
-
-typedef struct {
-  TOKENEXTRA *start;
-  TOKENEXTRA *stop;
-} TOKENLIST;
-
-typedef struct {
-  int ithread;
-  void *ptr1;
-  void *ptr2;
-} ENCODETHREAD_DATA;
-typedef struct {
-  int ithread;
-  void *ptr1;
-} LPFTHREAD_DATA;
-
-
-typedef struct VP9_ENCODER_RTCD {
-  VP9_COMMON_RTCD            *common;
-  vp9_search_rtcd_vtable_t    search;
-  vp9_temporal_rtcd_vtable_t  temporal;
-} VP9_ENCODER_RTCD;
-
-enum BlockSize {
-  BLOCK_16X8 = PARTITIONING_16X8,
-  BLOCK_8X16 = PARTITIONING_8X16,
-  BLOCK_8X8 = PARTITIONING_8X8,
-  BLOCK_4X4 = PARTITIONING_4X4,
-  BLOCK_16X16,
-  BLOCK_MAX_SEGMENTS,
-  BLOCK_32X32 = BLOCK_MAX_SEGMENTS,
-  BLOCK_MAX_SB_SEGMENTS,
-};
-
-typedef struct VP9_COMP {
-
-  DECLARE_ALIGNED(16, short, Y1quant[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, unsigned char, Y1quant_shift[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, short, Y1zbin[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, short, Y1round[QINDEX_RANGE][16]);
-
-  DECLARE_ALIGNED(16, short, Y2quant[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, unsigned char, Y2quant_shift[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, short, Y2zbin[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, short, Y2round[QINDEX_RANGE][16]);
-
-  DECLARE_ALIGNED(16, short, UVquant[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, unsigned char, UVquant_shift[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, short, UVzbin[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, short, UVround[QINDEX_RANGE][16]);
-
-  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv[QINDEX_RANGE][16]);
-
-  DECLARE_ALIGNED(64, short, Y1zbin_8x8[QINDEX_RANGE][64]);
-  DECLARE_ALIGNED(64, short, Y2zbin_8x8[QINDEX_RANGE][64]);
-  DECLARE_ALIGNED(64, short, UVzbin_8x8[QINDEX_RANGE][64]);
-  DECLARE_ALIGNED(64, short, zrun_zbin_boost_y1_8x8[QINDEX_RANGE][64]);
-  DECLARE_ALIGNED(64, short, zrun_zbin_boost_y2_8x8[QINDEX_RANGE][64]);
-  DECLARE_ALIGNED(64, short, zrun_zbin_boost_uv_8x8[QINDEX_RANGE][64]);
-
-  DECLARE_ALIGNED(16, short, Y1zbin_16x16[QINDEX_RANGE][256]);
-  DECLARE_ALIGNED(16, short, Y2zbin_16x16[QINDEX_RANGE][256]);
-  DECLARE_ALIGNED(16, short, UVzbin_16x16[QINDEX_RANGE][256]);
-  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1_16x16[QINDEX_RANGE][256]);
-  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2_16x16[QINDEX_RANGE][256]);
-  DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv_16x16[QINDEX_RANGE][256]);
-
-  MACROBLOCK mb;
-  VP9_COMMON common;
-  VP9_CONFIG oxcf;
-
-  struct lookahead_ctx    *lookahead;
-  struct lookahead_entry  *source;
-  struct lookahead_entry  *alt_ref_source;
-
-  YV12_BUFFER_CONFIG *Source;
-  YV12_BUFFER_CONFIG *un_scaled_source;
-  YV12_BUFFER_CONFIG scaled_source;
-
-  int source_alt_ref_pending; // frame in src_buffers identified for coding as an alt ref
-  int source_alt_ref_active;  // an alt ref frame has been encoded and is usable
-
-  int is_src_frame_alt_ref;   // source frame is an exact copy of an alt ref frame
-
-  int gold_is_last; // golden frame same as last frame (short-circuit gold searches)
-  int alt_is_last;  // alt reference frame same as last (short-circuit altref search)
-  int gold_is_alt;  // don't do both alt and gold search (just do gold)
-
-  // int refresh_alt_ref_frame;
-  YV12_BUFFER_CONFIG last_frame_uf;
-
-  TOKENEXTRA *tok;
-  unsigned int tok_count;
-
-
-  unsigned int frames_since_key;
-  unsigned int key_frame_frequency;
-  unsigned int this_key_frame_forced;
-  unsigned int next_key_frame_forced;
-
-  // Ambient reconstruction err target for force key frames
-  int ambient_err;
-
-  unsigned int mode_check_freq[MAX_MODES];
-  unsigned int mode_test_hit_counts[MAX_MODES];
-  unsigned int mode_chosen_counts[MAX_MODES];
-
-  int rd_thresh_mult[MAX_MODES];
-  int rd_baseline_thresh[MAX_MODES];
-  int rd_threshes[MAX_MODES];
-  int64_t rd_comp_pred_diff[NB_PREDICTION_TYPES];
-  int rd_prediction_type_threshes[4][NB_PREDICTION_TYPES];
-  int comp_pred_count[COMP_PRED_CONTEXTS];
-  int single_pred_count[COMP_PRED_CONTEXTS];
-  // FIXME contextualize
-  int txfm_count[TX_SIZE_MAX];
-  int txfm_count_8x8p[TX_SIZE_MAX - 1];
-  int64_t rd_tx_select_diff[NB_TXFM_MODES];
-  int rd_tx_select_threshes[4][NB_TXFM_MODES];
-
-  int RDMULT;
-  int RDDIV;
-
-  CODING_CONTEXT coding_context;
-
-  // Rate targeting variables
-  int64_t prediction_error;
-  int64_t last_prediction_error;
-  int64_t intra_error;
-  int64_t last_intra_error;
-
-  int this_frame_target;
-  int projected_frame_size;
-  int last_q[2];                   // Separate values for Intra/Inter
-  int last_boosted_qindex;         // Last boosted GF/KF/ARF q
-
-  double rate_correction_factor;
-  double key_frame_rate_correction_factor;
-  double gf_rate_correction_factor;
-
-  int frames_till_gf_update_due;      // Count down till next GF
-  int current_gf_interval;          // GF interval chosen when we coded the last GF
-
-  int gf_overspend_bits;            // Total bits overspent because of GF boost (cumulative)
-
-  int non_gf_bitrate_adjustment;     // Used in the few frames following a GF to recover the extra bits spent in that GF
-
-  int kf_overspend_bits;            // Extra bits spent on key frames that need to be recovered on inter frames
-  int kf_bitrate_adjustment;        // Current number of bits to try to recover on each inter frame.
-  int max_gf_interval;
-  int baseline_gf_interval;
-  int active_arnr_frames;           // <= cpi->oxcf.arnr_max_frames
-
-  int64_t key_frame_count;
-  int prior_key_frame_distance[KEY_FRAME_CONTEXT];
-  int per_frame_bandwidth;          // Current section per frame bandwidth target
-  int av_per_frame_bandwidth;        // Average frame size target for clip
-  int min_frame_bandwidth;          // Minimum allocation that should be used for any frame
-  int inter_frame_target;
-  double output_frame_rate;
-  int64_t last_time_stamp_seen;
-  int64_t last_end_time_stamp_seen;
-  int64_t first_time_stamp_ever;
-
-  int ni_av_qi;
-  int ni_tot_qi;
-  int ni_frames;
-  int avg_frame_qindex;
-  double tot_q;
-  double avg_q;
-
-  int zbin_over_quant;
-  int zbin_mode_boost;
-  int zbin_mode_boost_enabled;
-
-  int64_t total_byte_count;
-
-  int buffered_mode;
-
-  int buffer_level;
-  int bits_off_target;
-
-  int rolling_target_bits;
-  int rolling_actual_bits;
-
-  int long_rolling_target_bits;
-  int long_rolling_actual_bits;
-
-  int64_t total_actual_bits;
-  int total_target_vs_actual;        // debug stats
-
-  int worst_quality;
-  int active_worst_quality;
-  int best_quality;
-  int active_best_quality;
-
-  int cq_target_quality;
-
-#if CONFIG_SUPERBLOCKS
-  int sb_count;
-  int sb_ymode_count [VP9_I32X32_MODES];
-#endif
-  int ymode_count [VP9_YMODES];        /* intra MB type cts this frame */
-  int bmode_count [VP9_BINTRAMODES];
-  int i8x8_mode_count [VP9_I8X8_MODES];
-  int sub_mv_ref_count [SUBMVREF_COUNT][VP9_SUBMVREFS];
-  int mbsplit_count [VP9_NUMMBSPLITS];
-  // int uv_mode_count[VP9_UV_MODES];       /* intra MB type cts this frame */
-  int y_uv_mode_count[VP9_YMODES][VP9_UV_MODES];
-
-  nmv_context_counts NMVcount;
-
-  unsigned int coef_counts [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];  /* for this frame */
-  vp9_prob frame_coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  unsigned int frame_branch_ct [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
-  unsigned int hybrid_coef_counts [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];  /* for this frame */
-  vp9_prob frame_hybrid_coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  unsigned int frame_hybrid_branch_ct [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
-
-  unsigned int coef_counts_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];  /* for this frame */
-  vp9_prob frame_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  unsigned int frame_branch_ct_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
-  unsigned int hybrid_coef_counts_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];  /* for this frame */
-  vp9_prob frame_hybrid_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  unsigned int frame_hybrid_branch_ct_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
-
-  unsigned int coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];  /* for this frame */
-  vp9_prob frame_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  unsigned int frame_branch_ct_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
-  unsigned int hybrid_coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];  /* for this frame */
-  vp9_prob frame_hybrid_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  unsigned int frame_hybrid_branch_ct_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
-
-  int gfu_boost;
-  int last_boost;
-  int kf_boost;
-  int kf_zeromotion_pct;
-
-  int target_bandwidth;
-  struct vpx_codec_pkt_list  *output_pkt_list;
-
-#if 0
-  // Experimental code for lagged and one pass
-  ONEPASS_FRAMESTATS one_pass_frame_stats[MAX_LAG_BUFFERS];
-  int one_pass_frame_index;
-#endif
-  MBGRAPH_FRAME_STATS mbgraph_stats[MAX_LAG_BUFFERS];
-  int mbgraph_n_frames;             // number of frames filled in the above
-  int static_mb_pct;                // % forced skip mbs by segmentation
-  int seg0_progress, seg0_idx, seg0_cnt;
-  int ref_pred_count[3][2];
-
-  int decimation_factor;
-  int decimation_count;
-
-  // for real time encoding
-  int avg_encode_time;              // microsecond
-  int avg_pick_mode_time;            // microsecond
-  int Speed;
-  unsigned int cpu_freq;           // MHz
-  int compressor_speed;
-
-  int interquantizer;
-  int goldfreq;
-  int auto_worst_q;
-  int cpu_used;
-  int horiz_scale;
-  int vert_scale;
-  int pass;
-
-  vp9_prob last_skip_false_probs[3][MBSKIP_CONTEXTS];
-  int last_skip_probs_q[3];
-
-  int recent_ref_frame_usage[MAX_REF_FRAMES];
-  int count_mb_ref_frame_usage[MAX_REF_FRAMES];
-  int ref_frame_flags;
-
-  unsigned char ref_pred_probs_update[PREDICTION_PROBS];
-
-  SPEED_FEATURES sf;
-  int error_bins[1024];
-
-  // Data used for real time conferencing mode to help determine if it would be good to update the gf
-  int inter_zz_count;
-  int gf_bad_count;
-  int gf_update_recommended;
-  int skip_true_count[3];
-  int skip_false_count[3];
-
-  unsigned char *segmentation_map;
-
-  // segment threshold for encode breakout
-  int  segment_encode_breakout[MAX_MB_SEGMENTS];
-
-  unsigned char *active_map;
-  unsigned int active_map_enabled;
-
-  TOKENLIST *tplist;
-
-  fractional_mv_step_fp *find_fractional_mv_step;
-  vp9_full_search_fn_t full_search_sad;
-  vp9_refining_search_fn_t refining_search_sad;
-  vp9_diamond_search_fn_t diamond_search_sad;
-  vp9_variance_fn_ptr_t fn_ptr[BLOCK_MAX_SB_SEGMENTS];
-  uint64_t time_receive_data;
-  uint64_t time_compress_data;
-  uint64_t time_pick_lpf;
-  uint64_t time_encode_mb_row;
-
-  int base_skip_false_prob[QINDEX_RANGE][3];
-
-  struct twopass_rc {
-    unsigned int section_intra_rating;
-    unsigned int next_iiratio;
-    unsigned int this_iiratio;
-    FIRSTPASS_STATS *total_stats;
-    FIRSTPASS_STATS *this_frame_stats;
-    FIRSTPASS_STATS *stats_in, *stats_in_end, *stats_in_start;
-    FIRSTPASS_STATS *total_left_stats;
-    int first_pass_done;
-    int64_t bits_left;
-    int64_t clip_bits_total;
-    double avg_iiratio;
-    double modified_error_total;
-    double modified_error_used;
-    double modified_error_left;
-    double kf_intra_err_min;
-    double gf_intra_err_min;
-    int frames_to_key;
-    int maxq_max_limit;
-    int maxq_min_limit;
-    int static_scene_max_gf_interval;
-    int kf_bits;
-    int gf_group_error_left;           // Remaining error from uncoded frames in a gf group. Two pass use only
-
-    // Projected total bits available for a key frame group of frames
-    int64_t kf_group_bits;
-
-    // Error score of frames still to be coded in kf group
-    int64_t kf_group_error_left;
-
-    int gf_group_bits;                // Projected Bits available for a group of frames including 1 GF or ARF
-    int gf_bits;                     // Bits for the golden frame or ARF - 2 pass only
-    int alt_extra_bits;
-
-    int sr_update_lag;
-    double est_max_qcorrection_factor;
-  } twopass;
-
-#if CONFIG_RUNTIME_CPU_DETECT
-  VP9_ENCODER_RTCD            rtcd;
-#endif
-#if VP9_TEMPORAL_ALT_REF
-  YV12_BUFFER_CONFIG alt_ref_buffer;
-  YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS];
-  int fixed_divide[512];
-#endif
-
-#if CONFIG_INTERNAL_STATS
-  int    count;
-  double total_y;
-  double total_u;
-  double total_v;
-  double total;
-  double total_sq_error;
-  double totalp_y;
-  double totalp_u;
-  double totalp_v;
-  double totalp;
-  double total_sq_error2;
-  int    bytes;
-  double summed_quality;
-  double summed_weights;
-  unsigned int tot_recode_hits;
-
-
-  double total_ssimg_y;
-  double total_ssimg_u;
-  double total_ssimg_v;
-  double total_ssimg_all;
-
-  int b_calculate_ssimg;
-#endif
-  int b_calculate_psnr;
-
-  // Per MB activity measurement
-  unsigned int activity_avg;
-  unsigned int *mb_activity_map;
-  int *mb_norm_activity_map;
-
-  // Record of which MBs still refer to last golden frame either
-  // directly or through 0,0
-  unsigned char *gf_active_flags;
-  int gf_active_count;
-
-  int output_partition;
-
-  // Store last frame's MV info for next frame MV prediction
-  int_mv *lfmv;
-  int *lf_ref_frame_sign_bias;
-  int *lf_ref_frame;
-
-  /* force next frame to intra when kf_auto says so */
-  int force_next_frame_intra;
-
-  int droppable;
-
-  // TODO Do we still need this??
-  int update_context;
-
-  int dummy_packing;    /* flag to indicate if packing is dummy */
-
-#if CONFIG_PRED_FILTER
-  int pred_filter_on_count;
-  int pred_filter_off_count;
-#endif
-  unsigned int switchable_interp_count[VP9_SWITCHABLE_FILTERS + 1]
-                                      [VP9_SWITCHABLE_FILTERS];
-
-#if CONFIG_NEW_MVREF
-  unsigned int best_ref_index_counts[MAX_REF_FRAMES][MAX_MV_REFS];
-#endif
-
-} VP9_COMP;
-
-void vp9_encode_frame(VP9_COMP *cpi);
-
-void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
-                        unsigned long *size);
-
-void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x);
-
-void vp9_tokenize_mb(VP9_COMP *, MACROBLOCKD *, TOKENEXTRA **, int dry_run);
-void vp9_stuff_mb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run);
-
-void vp9_set_speed_features(VP9_COMP *cpi);
-
-#if CONFIG_DEBUG
-#define CHECK_MEM_ERROR(lval,expr) do {\
-    lval = (expr); \
-    if(!lval) \
-      vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,\
-                         "Failed to allocate "#lval" at %s:%d", \
-                         __FILE__,__LINE__);\
-  } while(0)
-#else
-#define CHECK_MEM_ERROR(lval,expr) do {\
-    lval = (expr); \
-    if(!lval) \
-      vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,\
-                         "Failed to allocate "#lval);\
-  } while(0)
-#endif
-#endif  // __INC_ONYX_INT_H
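
A minimal usage sketch of the CHECK_MEM_ERROR macro above, assuming a VP9_COMP *cpi is in scope (the macro body references cpi->common.error); the count n_tokens is a hypothetical value for illustration only:

    /* Hypothetical allocation guarded by CHECK_MEM_ERROR: the macro assigns,
     * then reports through vpx_internal_error() if the result is NULL,
     * naming the lvalue in the error message. */
    CHECK_MEM_ERROR(cpi->tok, vpx_calloc(n_tokens, sizeof(*cpi->tok)));
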
--- a/vp8/encoder/picklpf.c
+++ /dev/null
@@ -1,420 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/common/onyxc_int.h"
-#include "onyx_int.h"
-#include "quantize.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vpx_scale/yv12extend.h"
-#include "vpx_scale/vpxscale.h"
-#include "vp8/common/alloccommon.h"
-#include "vp8/common/loopfilter.h"
-#if ARCH_ARM
-#include "vpx_ports/arm.h"
-#endif
-
-extern int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source,
-                           YV12_BUFFER_CONFIG *dest);
-#if HAVE_ARMV7
-extern void vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
-#endif
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define IF_RTCD(x) (x)
-#else
-#define IF_RTCD(x) NULL
-#endif
-
-extern void(*vp9_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc,
-                                              YV12_BUFFER_CONFIG *dst_ybc,
-                                              int fraction);
-
-void vp9_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc,
-                                 YV12_BUFFER_CONFIG *dst_ybc, int Fraction) {
-  unsigned char *src_y, *dst_y;
-  int yheight;
-  int ystride;
-  int border;
-  int yoffset;
-  int linestocopy;
-
-  border   = src_ybc->border;
-  yheight  = src_ybc->y_height;
-  ystride  = src_ybc->y_stride;
-
-  linestocopy = (yheight >> (Fraction + 4));
-
-  if (linestocopy < 1)
-    linestocopy = 1;
-
-  linestocopy <<= 4;
-
-  yoffset  = ystride * ((yheight >> 5) * 16 - 8);
-  src_y = src_ybc->y_buffer + yoffset;
-  dst_y = dst_ybc->y_buffer + yoffset;
-
-  vpx_memcpy(dst_y, src_y, ystride * (linestocopy + 16));
-}
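
The Fraction argument above controls how large a horizontal band gets copied: linestocopy is rounded down to a multiple of 16 rows with a floor of one macroblock row, and yoffset places the band around the middle of the luma plane. A self-contained worked example of that arithmetic, with assumed frame dimensions:

    /* Worked example of the partial-copy arithmetic (values assumed). */
    #include <stdio.h>

    int main(void) {
      int yheight = 720, ystride = 768, Fraction = 3;
      int linestocopy = (yheight >> (Fraction + 4));      /* 720 >> 7 = 5 */
      if (linestocopy < 1)
        linestocopy = 1;
      linestocopy <<= 4;                                  /* 80 rows */
      int yoffset = ystride * ((yheight >> 5) * 16 - 8);  /* starts at row 344 */
      printf("rows=%d offset=%d bytes=%d\n",
             linestocopy, yoffset, ystride * (linestocopy + 16));
      return 0;  /* prints rows=80 offset=264192 bytes=73728 */
    }
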
-
-static int calc_partial_ssl_err(YV12_BUFFER_CONFIG *source,
-                                YV12_BUFFER_CONFIG *dest, int Fraction) {
-  int i, j;
-  int Total = 0;
-  int srcoffset, dstoffset;
-  unsigned char *src = source->y_buffer;
-  unsigned char *dst = dest->y_buffer;
-
-  int linestocopy = (source->y_height >> (Fraction + 4));
-
-  if (linestocopy < 1)
-    linestocopy = 1;
-
-  linestocopy <<= 4;
-
-
-  srcoffset = source->y_stride   * (dest->y_height >> 5) * 16;
-  dstoffset = dest->y_stride     * (dest->y_height >> 5) * 16;
-
-  src += srcoffset;
-  dst += dstoffset;
-
-  // Loop through the Y plane raw and reconstruction data, summing squared differences
-  for (i = 0; i < linestocopy; i += 16) {
-    for (j = 0; j < source->y_width; j += 16) {
-      unsigned int sse;
-      Total += vp9_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride,
-                            &sse);
-    }
-
-    src += 16 * source->y_stride;
-    dst += 16 * dest->y_stride;
-  }
-
-  return Total;
-}
-
-// Enforce a minimum filter level based upon baseline Q
-static int get_min_filter_level(VP9_COMP *cpi, int base_qindex) {
-  int min_filter_level;
-  /*int q = (int) vp9_convert_qindex_to_q(base_qindex);
-
-  if (cpi->source_alt_ref_active && cpi->common.refresh_golden_frame && !cpi->common.refresh_alt_ref_frame)
-      min_filter_level = 0;
-  else
-  {
-      if (q <= 10)
-          min_filter_level = 0;
-      else if (q <= 64)
-          min_filter_level = 1;
-      else
-          min_filter_level = (q >> 6);
-  }
-  */
-  min_filter_level = 0;
-
-  return min_filter_level;
-}
-
-// Enforce a maximum filter level based upon baseline Q
-static int get_max_filter_level(VP9_COMP *cpi, int base_qindex) {
-  // PGW August 2006: The highest filter values are almost always a bad idea
-
-  // jbb chg: 20100118 - not so any more with this overquant stuff; allow high
-  // values when lots of intra is coming in.
-  int max_filter_level = MAX_LOOP_FILTER;// * 3 / 4;
-  (void)base_qindex;
-
-  if (cpi->twopass.section_intra_rating > 8)
-    max_filter_level = MAX_LOOP_FILTER * 3 / 4;
-
-  return max_filter_level;
-}
-
-void vp9_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-
-  int best_err = 0;
-  int filt_err = 0;
-  int min_filter_level = get_min_filter_level(cpi, cm->base_qindex);
-  int max_filter_level = get_max_filter_level(cpi, cm->base_qindex);
-  int filt_val;
-  int best_filt_val = cm->filter_level;
-
-  //  Make a copy of the unfiltered / processed recon buffer
-  vp9_yv12_copy_partial_frame_ptr(cm->frame_to_show, &cpi->last_frame_uf, 3);
-
-  if (cm->frame_type == KEY_FRAME)
-    cm->sharpness_level = 0;
-  else
-    cm->sharpness_level = cpi->oxcf.Sharpness;
-
-  if (cm->sharpness_level != cm->last_sharpness_level) {
-    vp9_loop_filter_update_sharpness(&cm->lf_info, cm->sharpness_level);
-    cm->last_sharpness_level = cm->sharpness_level;
-  }
-
-  // Start the search at the previous frame filter level unless it is now out of range.
-  if (cm->filter_level < min_filter_level)
-    cm->filter_level = min_filter_level;
-  else if (cm->filter_level > max_filter_level)
-    cm->filter_level = max_filter_level;
-
-  filt_val = cm->filter_level;
-  best_filt_val = filt_val;
-
-  // Get the err using the previous frame's filter value.
-  vp9_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
-
-  best_err = calc_partial_ssl_err(sd, cm->frame_to_show, 3);
-
-  //  Re-instate the unfiltered frame
-  vp9_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show, 3);
-
-  filt_val -= (1 + ((filt_val > 10) ? 1 : 0));
-
-  // Search lower filter levels
-  while (filt_val >= min_filter_level) {
-    // Apply the loop filter
-    vp9_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
-
-    // Get the err for filtered frame
-    filt_err = calc_partial_ssl_err(sd, cm->frame_to_show, 3);
-
-    //  Re-instate the unfiltered frame
-    vp9_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show, 3);
-
-
-    // Update the best case record or exit loop.
-    if (filt_err < best_err) {
-      best_err = filt_err;
-      best_filt_val = filt_val;
-    } else
-      break;
-
-    // Adjust filter level
-    filt_val -= (1 + ((filt_val > 10) ? 1 : 0));
-  }
-
-  // Search up (note that we have already done filt_val = cm->filter_level)
-  filt_val = cm->filter_level + (1 + ((filt_val > 10) ? 1 : 0));
-
-  if (best_filt_val == cm->filter_level) {
-    // Resist raising filter level for very small gains
-    best_err -= (best_err >> 10);
-
-    while (filt_val < max_filter_level) {
-      // Apply the loop filter
-      vp9_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
-
-      // Get the err for filtered frame
-      filt_err = calc_partial_ssl_err(sd, cm->frame_to_show, 3);
-
-      //  Re-instate the unfiltered frame
-      vp9_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf,
-                                      cm->frame_to_show, 3);
-
-      // Update the best case record or exit loop.
-      if (filt_err < best_err) {
-        // Do not raise filter level if improvement is < 1 part in 1024
-        best_err = filt_err - (filt_err >> 10);
-
-        best_filt_val = filt_val;
-      } else
-        break;
-
-      // Adjust filter level
-      filt_val += (1 + ((filt_val > 10) ? 1 : 0));
-    }
-  }
-
-  cm->filter_level = best_filt_val;
-
-  if (cm->filter_level < min_filter_level)
-    cm->filter_level = min_filter_level;
-
-  if (cm->filter_level > max_filter_level)
-    cm->filter_level = max_filter_level;
-}
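
The function above is a hill climb: evaluate the previous frame's level, walk downward while the partial-frame error keeps improving, and only when nothing below wins, walk upward with a penalty that resists raising the level for marginal gains. A control-flow sketch under those assumptions (err() stands in for the filter/measure/restore sequence; it is not the project API):

    /* Hedged sketch of the fast level search; err(level) abstracts
     * "loop-filter the partial frame, measure SSE vs. source, restore". */
    static int lf_step(int v) { return 1 + (v > 10); }

    static int pick_level_fast(int prev, int lo, int hi, int (*err)(int)) {
      int best = prev, best_err = err(prev), v;
      for (v = prev - lf_step(prev); v >= lo; v -= lf_step(v)) {  /* down */
        int e = err(v);
        if (e >= best_err) break;
        best_err = e;
        best = v;
      }
      if (best == prev) {                                         /* up */
        best_err -= best_err >> 10;  /* resist raising for tiny gains */
        for (v = prev + lf_step(prev); v < hi; v += lf_step(v)) {
          int e = err(v);
          if (e >= best_err) break;
          best_err = e - (e >> 10);  /* demand > 1/1024 improvement */
          best = v;
        }
      }
      return best;
    }
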
-
-// Stub function for now Alt LF not used
-void vp9_set_alt_lf_level(VP9_COMP *cpi, int filt_val) {
-}
-
-void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-
-  int best_err = 0;
-  int filt_err = 0;
-  int min_filter_level = get_min_filter_level(cpi, cm->base_qindex);
-  int max_filter_level = get_max_filter_level(cpi, cm->base_qindex);
-
-  int filter_step;
-  int filt_high = 0;
-  int filt_mid = cm->filter_level;      // Start search at previous frame filter level
-  int filt_low = 0;
-  int filt_best;
-  int filt_direction = 0;
-
-  int Bias = 0;                       // Bias against raising loop filter and in favour of lowering it
-
-  //  Make a copy of the unfiltered / processed recon buffer
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
-  if (cm->rtcd.flags & HAS_NEON)
-#endif
-  {
-    vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(cm->frame_to_show, &cpi->last_frame_uf);
-  }
-#if CONFIG_RUNTIME_CPU_DETECT
-  else
-#endif
-#endif
-#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT
-  {
-    vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cpi->last_frame_uf);
-  }
-#endif
-
-  if (cm->frame_type == KEY_FRAME)
-    cm->sharpness_level = 0;
-  else
-    cm->sharpness_level = cpi->oxcf.Sharpness;
-
-  // Start the search at the previous frame filter level unless it is now out of range.
-  filt_mid = cm->filter_level;
-
-  if (filt_mid < min_filter_level)
-    filt_mid = min_filter_level;
-  else if (filt_mid > max_filter_level)
-    filt_mid = max_filter_level;
-
-  // Define the initial step size
-  filter_step = (filt_mid < 16) ? 4 : filt_mid / 4;
-
-  // Get baseline error score
-  vp9_set_alt_lf_level(cpi, filt_mid);
-  vp9_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_mid);
-
-  best_err = vp9_calc_ss_err(sd, cm->frame_to_show);
-  filt_best = filt_mid;
-
-  //  Re-instate the unfiltered frame
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
-  if (cm->rtcd.flags & HAS_NEON)
-#endif
-  {
-    vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show);
-  }
-#if CONFIG_RUNTIME_CPU_DETECT
-  else
-#endif
-#endif
-#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT
-  {
-    vp8_yv12_copy_frame_yonly_ptr(&cpi->last_frame_uf, cm->frame_to_show);
-  }
-#endif
-
-  while (filter_step > 0) {
-    Bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; // PGW change 12/12/06 for small images
-
-    // jbb chg: 20100118 - in sections with lots of new material coming in, don't bias as much towards a low filter value
-    if (cpi->twopass.section_intra_rating < 20)
-      Bias = Bias * cpi->twopass.section_intra_rating / 20;
-
-    // yx, bias less for large block size
-    if (cpi->common.txfm_mode != ONLY_4X4)
-      Bias >>= 1;
-
-    filt_high = ((filt_mid + filter_step) > max_filter_level) ? max_filter_level : (filt_mid + filter_step);
-    filt_low = ((filt_mid - filter_step) < min_filter_level) ? min_filter_level : (filt_mid - filter_step);
-
-    if ((filt_direction <= 0) && (filt_low != filt_mid)) {
-      // Get Low filter error score
-      vp9_set_alt_lf_level(cpi, filt_low);
-      vp9_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_low);
-
-      filt_err = vp9_calc_ss_err(sd, cm->frame_to_show);
-
-      //  Re-instate the unfiltered frame
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
-      if (cm->rtcd.flags & HAS_NEON)
-#endif
-      {
-        vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show);
-      }
-#if CONFIG_RUNTIME_CPU_DETECT
-      else
-#endif
-#endif
-#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT
-      {
-        vp8_yv12_copy_frame_yonly_ptr(&cpi->last_frame_uf, cm->frame_to_show);
-      }
-#endif
-
-      // If value is close to the best so far then bias towards a lower loop filter value.
-      if ((filt_err - Bias) < best_err) {
-        // Was it actually better than the previous best?
-        if (filt_err < best_err)
-          best_err = filt_err;
-
-        filt_best = filt_low;
-      }
-    }
-
-    // Now look at filt_high
-    if ((filt_direction >= 0) && (filt_high != filt_mid)) {
-      vp9_set_alt_lf_level(cpi, filt_high);
-      vp9_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_high);
-
-      filt_err = vp9_calc_ss_err(sd, cm->frame_to_show);
-
-      //  Re-instate the unfiltered frame
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
-      if (cm->rtcd.flags & HAS_NEON)
-#endif
-      {
-        vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show);
-      }
-#if CONFIG_RUNTIME_CPU_DETECT
-      else
-#endif
-#endif
-#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT
-      {
-        vp8_yv12_copy_frame_yonly_ptr(&cpi->last_frame_uf, cm->frame_to_show);
-      }
-#endif
-
-      // Was it better than the previous best?
-      if (filt_err < (best_err - Bias)) {
-        best_err = filt_err;
-        filt_best = filt_high;
-      }
-    }
-
-    // Halve the step distance if the best filter value was the same as last time
-    if (filt_best == filt_mid) {
-      filter_step = filter_step / 2;
-      filt_direction = 0;
-    } else {
-      filt_direction = (filt_best < filt_mid) ? -1 : 1;
-      filt_mid = filt_best;
-    }
-  }
-
-  cm->filter_level = filt_best;
-}
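
The loop above is a bisection around filt_mid with an asymmetric acceptance rule: a lower level may be up to Bias worse and still win, while a higher level must beat the best by more than Bias; when the midpoint survives, the step halves. A worked Bias computation with assumed values:

    /* Illustrative Bias arithmetic from the search loop (values assumed). */
    #include <stdio.h>

    int main(void) {
      int best_err = 400000, filt_mid = 24, filter_step = 6;
      int Bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
      /* (400000 >> 12) * 6 = 97 * 6 = 582 */
      Bias = Bias * 10 / 20;  /* section_intra_rating = 10 (< 20): 291 */
      Bias >>= 1;             /* txfm_mode != ONLY_4X4: 145 */
      printf("Bias = %d\n", Bias);
      return 0;
    }
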
-
--- a/vp8/encoder/ppc/csystemdependent.c
+++ /dev/null
@@ -1,155 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/encoder/variance.h"
-#include "vp8/encoder/onyx_int.h"
-
-SADFunction *vp9_sad16x16;
-SADFunction *vp9_sad16x8;
-SADFunction *vp9_sad8x16;
-SADFunction *vp9_sad8x8;
-SADFunction *vp9_sad4x4;
-
-variance_function *vp9_variance4x4;
-variance_function *vp9_variance8x8;
-variance_function *vp9_variance8x16;
-variance_function *vp9_variance16x8;
-variance_function *vp9_variance16x16;
-
-variance_function *vp9_mse16x16;
-
-sub_pixel_variance_function *vp9_sub_pixel_variance4x4;
-sub_pixel_variance_function *vp9_sub_pixel_variance8x8;
-sub_pixel_variance_function *vp9_sub_pixel_variance8x16;
-sub_pixel_variance_function *vp9_sub_pixel_variance16x8;
-sub_pixel_variance_function *vp9_sub_pixel_variance16x16;
-
-int (*vp9_block_error)(short *coeff, short *dqcoeff);
-int (*vp9_mbblock_error)(MACROBLOCK *mb, int dc);
-
-int (*vp9_mbuverror)(MACROBLOCK *mb);
-unsigned int (*vp9_get_mb_ss)(short *);
-void (*vp9_short_fdct4x4)(short *input, short *output, int pitch);
-void (*vp9_short_fdct8x4)(short *input, short *output, int pitch);
-void (*vp8_fast_fdct4x4)(short *input, short *output, int pitch);
-void (*vp8_fast_fdct8x4)(short *input, short *output, int pitch);
-void (*short_walsh4x4)(short *input, short *output, int pitch);
-
-void (*vp9_subtract_b)(BLOCK *be, BLOCKD *bd, int pitch);
-void (*vp9_subtract_mby)(short *diff, unsigned char *src, unsigned char *pred, int stride);
-void (*vp9_subtract_mbuv)(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
-void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d);
-
-// c imports
-extern int block_error_c(short *coeff, short *dqcoeff);
-extern int vp9_mbblock_error_c(MACROBLOCK *mb, int dc);
-
-extern int vp9_mbuverror_c(MACROBLOCK *mb);
-extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
-extern void short_fdct4x4_c(short *input, short *output, int pitch);
-extern void short_fdct8x4_c(short *input, short *output, int pitch);
-extern void vp9_short_walsh4x4_c(short *input, short *output, int pitch);
-
-extern void vp9_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch);
-extern void subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride);
-extern void subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
-extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d);
-
-extern SADFunction sad16x16_c;
-extern SADFunction sad16x8_c;
-extern SADFunction sad8x16_c;
-extern SADFunction sad8x8_c;
-extern SADFunction sad4x4_c;
-
-extern variance_function variance16x16_c;
-extern variance_function variance8x16_c;
-extern variance_function variance16x8_c;
-extern variance_function variance8x8_c;
-extern variance_function variance4x4_c;
-extern variance_function mse16x16_c;
-
-extern sub_pixel_variance_function sub_pixel_variance4x4_c;
-extern sub_pixel_variance_function sub_pixel_variance8x8_c;
-extern sub_pixel_variance_function sub_pixel_variance8x16_c;
-extern sub_pixel_variance_function sub_pixel_variance16x8_c;
-extern sub_pixel_variance_function sub_pixel_variance16x16_c;
-
-extern unsigned int vp9_get_mb_ss_c(short *);
-
-// ppc
-extern int vp9_block_error_ppc(short *coeff, short *dqcoeff);
-
-extern void vp9_short_fdct4x4_ppc(short *input, short *output, int pitch);
-extern void vp9_short_fdct8x4_ppc(short *input, short *output, int pitch);
-
-extern void vp9_subtract_mby_ppc(short *diff, unsigned char *src, unsigned char *pred, int stride);
-extern void vp9_subtract_mbuv_ppc(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
-
-extern SADFunction vp9_sad16x16_ppc;
-extern SADFunction vp9_sad16x8_ppc;
-extern SADFunction vp9_sad8x16_ppc;
-extern SADFunction vp9_sad8x8_ppc;
-extern SADFunction vp9_sad4x4_ppc;
-
-extern variance_function vp9_variance16x16_ppc;
-extern variance_function vp9_variance8x16_ppc;
-extern variance_function vp9_variance16x8_ppc;
-extern variance_function vp9_variance8x8_ppc;
-extern variance_function vp9_variance4x4_ppc;
-extern variance_function vp9_mse16x16_ppc;
-
-extern sub_pixel_variance_function vp9_sub_pixel_variance4x4_ppc;
-extern sub_pixel_variance_function vp9_sub_pixel_variance8x8_ppc;
-extern sub_pixel_variance_function vp9_sub_pixel_variance8x16_ppc;
-extern sub_pixel_variance_function vp9_sub_pixel_variance16x8_ppc;
-extern sub_pixel_variance_function vp9_sub_pixel_variance16x16_ppc;
-
-extern unsigned int vp8_get8x8var_ppc(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
-extern unsigned int vp8_get16x16var_ppc(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
-
-void vp9_cmachine_specific_config(void) {
-  // Pure C:
-  vp9_mbuverror               = vp9_mbuverror_c;
-  vp8_fast_quantize_b           = vp8_fast_quantize_b_c;
-  vp9_short_fdct4x4            = vp9_short_fdct4x4_ppc;
-  vp9_short_fdct8x4            = vp9_short_fdct8x4_ppc;
-  vp8_fast_fdct4x4             = vp9_short_fdct4x4_ppc;
-  vp8_fast_fdct8x4             = vp9_short_fdct8x4_ppc;
-  short_walsh4x4               = vp9_short_walsh4x4_c;
-
-  vp9_variance4x4             = vp9_variance4x4_ppc;
-  vp9_variance8x8             = vp9_variance8x8_ppc;
-  vp9_variance8x16            = vp9_variance8x16_ppc;
-  vp9_variance16x8            = vp9_variance16x8_ppc;
-  vp9_variance16x16           = vp9_variance16x16_ppc;
-  vp9_mse16x16                = vp9_mse16x16_ppc;
-
-  vp9_sub_pixel_variance4x4     = vp9_sub_pixel_variance4x4_ppc;
-  vp9_sub_pixel_variance8x8     = vp9_sub_pixel_variance8x8_ppc;
-  vp9_sub_pixel_variance8x16    = vp9_sub_pixel_variance8x16_ppc;
-  vp9_sub_pixel_variance16x8    = vp9_sub_pixel_variance16x8_ppc;
-  vp9_sub_pixel_variance16x16   = vp9_sub_pixel_variance16x16_ppc;
-
-  vp9_get_mb_ss                 = vp9_get_mb_ss_c;
-
-  vp9_sad16x16                = vp9_sad16x16_ppc;
-  vp9_sad16x8                 = vp9_sad16x8_ppc;
-  vp9_sad8x16                 = vp9_sad8x16_ppc;
-  vp9_sad8x8                  = vp9_sad8x8_ppc;
-  vp9_sad4x4                  = vp9_sad4x4_ppc;
-
-  vp9_block_error              = vp9_block_error_ppc;
-  vp9_mbblock_error            = vp9_mbblock_error_c;
-
-  vp9_subtract_b               = vp9_subtract_b_c;
-  vp9_subtract_mby             = vp9_subtract_mby_ppc;
-  vp9_subtract_mbuv            = vp9_subtract_mbuv_ppc;
-}
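
csystemdependent.c is the pre-RTCD dispatch pattern: one global function pointer per SIMD-able primitive, pointed at the best available implementation by a single config call. A stripped-down sketch of the same pattern with hypothetical names:

    /* Hedged sketch of the dispatch pattern; block_error_ptr and
     * machine_config_sketch are illustrative names, not project symbols. */
    typedef int (*block_error_fn)(short *coeff, short *dqcoeff);

    static int block_error_ref(short *coeff, short *dqcoeff) {
      int i, err = 0;
      for (i = 0; i < 16; i++) {       /* one 4x4 block of coefficients */
        int d = coeff[i] - dqcoeff[i];
        err += d * d;
      }
      return err;
    }

    block_error_fn block_error_ptr;

    void machine_config_sketch(void) {
      /* On PPC this slot would get vp9_block_error_ppc instead. */
      block_error_ptr = block_error_ref;
    }
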
--- a/vp8/encoder/ppc/encodemb_altivec.asm
+++ /dev/null
@@ -1,153 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl vp8_subtract_mbuv_ppc
-    .globl vp8_subtract_mby_ppc
-
-;# r3 short *diff
-;# r4 unsigned char *usrc
-;# r5 unsigned char *vsrc
-;# r6 unsigned char *pred
-;# r7 int stride
-vp8_subtract_mbuv_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xf000
-    mtspr   256, r12            ;# set VRSAVE
-
-    li      r9, 256
-    add     r3, r3, r9
-    add     r3, r3, r9
-    add     r6, r6, r9
-
-    li      r10, 16
-    li      r9,  4
-    mtctr   r9
-
-    vspltisw v0, 0
-
-mbu_loop:
-    lvsl    v5, 0, r4           ;# permutation value for alignment
-    lvx     v1, 0, r4           ;# src
-    lvx     v2, 0, r6           ;# pred
-
-    add     r4, r4, r7
-    addi    r6, r6, 16
-
-    vperm   v1, v1, v0, v5
-
-    vmrghb  v3, v0, v1          ;# unpack high src  to short
-    vmrghb  v4, v0, v2          ;# unpack high pred to short
-
-    lvsl    v5, 0, r4           ;# permutation value for alignment
-    lvx     v1, 0, r4           ;# src
-
-    add     r4, r4, r7
-
-    vsubshs v3, v3, v4
-
-    stvx    v3, 0, r3           ;# store out diff
-
-    vperm   v1, v1, v0, v5
-
-    vmrghb  v3, v0, v1          ;# unpack high src  to short
-    vmrglb  v4, v0, v2          ;# unpack high pred to short
-
-    vsubshs v3, v3, v4
-
-    stvx    v3, r10, r3         ;# store out diff
-
-    addi    r3, r3, 32
-
-    bdnz    mbu_loop
-
-    mtctr   r9
-
-mbv_loop:
-    lvsl    v5, 0, r5           ;# permutation value for alignment
-    lvx     v1, 0, r5           ;# src
-    lvx     v2, 0, r6           ;# pred
-
-    add     r5, r5, r7
-    addi    r6, r6, 16
-
-    vperm   v1, v1, v0, v5
-
-    vmrghb  v3, v0, v1          ;# unpack high src  to short
-    vmrghb  v4, v0, v2          ;# unpack high pred to short
-
-    lvsl    v5, 0, r5           ;# permutation value for alignment
-    lvx     v1, 0, r5           ;# src
-
-    add     r5, r5, r7
-
-    vsubshs v3, v3, v4
-
-    stvx    v3, 0, r3           ;# store out diff
-
-    vperm   v1, v1, v0, v5
-
-    vmrghb  v3, v0, v1          ;# unpack high src  to short
-    vmrglb  v4, v0, v2          ;# unpack high pred to short
-
-    vsubshs v3, v3, v4
-
-    stvx    v3, r10, r3         ;# store out diff
-
-    addi    r3, r3, 32
-
-    bdnz    mbv_loop
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-;# r3 short *diff
-;# r4 unsigned char *src
-;# r5 unsigned char *pred
-;# r6 int stride
-vp8_subtract_mby_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xf800
-    mtspr   256, r12            ;# set VRSAVE
-
-    li      r10, 16
-    mtctr   r10
-
-    vspltisw v0, 0
-
-mby_loop:
-    lvx     v1, 0, r4           ;# src
-    lvx     v2, 0, r5           ;# pred
-
-    add     r4, r4, r6
-    addi    r5, r5, 16
-
-    vmrghb  v3, v0, v1          ;# unpack high src  to short
-    vmrghb  v4, v0, v2          ;# unpack high pred to short
-
-    vsubshs v3, v3, v4
-
-    stvx    v3, 0, r3           ;# store out diff
-
-    vmrglb  v3, v0, v1          ;# unpack low src  to short
-    vmrglb  v4, v0, v2          ;# unpack low pred to short
-
-    vsubshs v3, v3, v4
-
-    stvx    v3, r10, r3         ;# store out diff
-
-    addi    r3, r3, 32
-
-    bdnz    mby_loop
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
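
Both routines above compute the prediction residual with widening byte subtracts (vmrghb/vmrglb against a zero vector, then vsubshs). A plain-C reference of the luma case, matching the calling convention read off the register comments (the prediction buffer is packed 16 bytes per row):

    /* Scalar reference for vp8_subtract_mby_ppc: diff = src - pred over a
     * 16x16 luma macroblock, widened to 16-bit (my reading of the asm). */
    void subtract_mby_ref(short *diff, const unsigned char *src,
                          const unsigned char *pred, int stride) {
      int r, c;
      for (r = 0; r < 16; r++) {
        for (c = 0; c < 16; c++)
          diff[c] = (short)(src[c] - pred[c]);
        diff += 16;
        src += stride;
        pred += 16;   /* prediction buffer advances 16 bytes per row */
      }
    }
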
--- a/vp8/encoder/ppc/fdct_altivec.asm
+++ /dev/null
@@ -1,205 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl vp8_short_fdct4x4_ppc
-    .globl vp8_short_fdct8x4_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
-    lis     \R0, \LABEL@ha
-    la      \R1, \LABEL@l(\R0)
-    lvx     \V, \OFF, \R1
-.endm
-
-;# Forward and inverse DCTs are nearly identical; only differences are
-;#   in normalization (fwd is twice unitary, inv is half unitary)
-;#   and that they are of course transposes of each other.
-;#
-;#   The following three accomplish most of the implementation and
-;#   are used only by ppc_idct.c and ppc_fdct.c.
-.macro prologue
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xfffc
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    li      r6, 16
-
-    load_c v0, dct_tab, 0, r9, r10
-    lvx     v1,   r6, r10
-    addi    r10, r10, 32
-    lvx     v2,    0, r10
-    lvx     v3,   r6, r10
-
-    load_c v4, ppc_dctperm_tab,  0, r9, r10
-    load_c v5, ppc_dctperm_tab, r6, r9, r10
-
-    load_c v6, round_tab, 0, r10, r9
-.endm
-
-.macro epilogue
-    addi    r1, r1, 32          ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-.endm
-
-;# Do horiz xf on two rows of coeffs  v8 = a0 a1 a2 a3  b0 b1 b2 b3.
-;#   a/A are the even rows 0,2   b/B are the odd rows 1,3
-;#   For fwd transform, indices are horizontal positions, then frequencies.
-;#   For inverse transform, frequencies then positions.
-;#   The two resulting  A0..A3  B0..B3  are later combined
-;#   and vertically transformed.
-
-.macro two_rows_horiz Dst
-    vperm   v9, v8, v8, v4      ;# v9 = a2 a3 a0 a1  b2 b3 b0 b1
-
-    vmsumshm v10, v0, v8, v6
-    vmsumshm v10, v1, v9, v10
-    vsraw   v10, v10, v7        ;# v10 = A0 A1  B0 B1
-
-    vmsumshm v11, v2, v8, v6
-    vmsumshm v11, v3, v9, v11
-    vsraw   v11, v11, v7        ;# v11 = A2 A3  B2 B3
-
-    vpkuwum v10, v10, v11       ;# v10  = A0 A1  B0 B1  A2 A3  B2 B3
-    vperm   \Dst, v10, v10, v5  ;# Dest = A0 B0  A1 B1  A2 B2  A3 B3
-.endm
-
-;# Vertical xf on two rows. DCT values in comments are for inverse transform;
-;#   forward transform uses transpose.
-
-.macro two_rows_vert Ceven, Codd
-    vspltw  v8, \Ceven, 0       ;# v8 = c00 c10  or  c02 c12 four times
-    vspltw  v9, \Codd,  0       ;# v9 = c20 c30  or  c22 c32 ""
-    vmsumshm v8, v8, v12, v6
-    vmsumshm v8, v9, v13, v8
-    vsraw   v10, v8, v7
-
-    vspltw  v8, \Codd,  1       ;# v8 = c01 c11  or  c03 c13
-    vspltw  v9, \Ceven, 1       ;# v9 = c21 c31  or  c23 c33
-    vmsumshm v8, v8, v12, v6
-    vmsumshm v8, v9, v13, v8
-    vsraw   v8, v8, v7
-
-    vpkuwum v8, v10, v8         ;# v8 = rows 0,1  or 2,3
-.endm
-
-.macro two_rows_h Dest
-    stw     r0,  0(r8)
-    lwz     r0,  4(r3)
-    stw     r0,  4(r8)
-    lwzux   r0, r3,r5
-    stw     r0,  8(r8)
-    lwz     r0,  4(r3)
-    stw     r0, 12(r8)
-    lvx     v8,  0,r8
-    two_rows_horiz \Dest
-.endm
-
-    .align 2
-;# r3 short *input
-;# r4 short *output
-;# r5 int pitch
-vp8_short_fdct4x4_ppc:
-
-    prologue
-
-    vspltisw v7, 14             ;# == 14, fits in 5 signed bits
-    addi    r8, r1, 0
-
-
-    lwz     r0, 0(r3)
-    two_rows_h v12                ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13
-
-    lwzux   r0, r3, r5
-    two_rows_h v13                ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33
-
-    lvx     v6, r6, r9          ;# v6 = Vround
-    vspltisw v7, -16            ;# == 16 == -16, only low 5 bits matter
-
-    two_rows_vert v0, v1
-    stvx    v8, 0, r4
-    two_rows_vert v2, v3
-    stvx    v8, r6, r4
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 short *input
-;# r4 short *output
-;# r5 int pitch
-vp8_short_fdct8x4_ppc:
-    prologue
-
-    vspltisw v7, 14             ;# == 14, fits in 5 signed bits
-    addi    r8,  r1, 0
-    addi    r10, r3, 0
-
-    lwz     r0, 0(r3)
-    two_rows_h v12                ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13
-
-    lwzux   r0, r3, r5
-    two_rows_h v13                ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33
-
-    lvx     v6, r6, r9          ;# v6 = Vround
-    vspltisw v7, -16            ;# == 16 == -16, only low 5 bits matter
-
-    two_rows_vert v0, v1
-    stvx    v8, 0, r4
-    two_rows_vert v2, v3
-    stvx    v8, r6, r4
-
-    ;# Next block
-    addi    r3, r10, 8
-    addi    r4, r4, 32
-    lvx     v6, 0, r9           ;# v6 = Hround
-
-    vspltisw v7, 14             ;# == 14, fits in 5 signed bits
-    addi    r8, r1, 0
-
-    lwz     r0, 0(r3)
-    two_rows_h v12                ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13
-
-    lwzux   r0, r3, r5
-    two_rows_h v13                ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33
-
-    lvx     v6, r6, r9          ;# v6 = Vround
-    vspltisw v7, -16            ;# == 16 == -16, only low 5 bits matter
-
-    two_rows_vert v0, v1
-    stvx    v8, 0, r4
-    two_rows_vert v2, v3
-    stvx    v8, r6, r4
-
-    epilogue
-
-    blr
-
-    .data
-    .align 4
-ppc_dctperm_tab:
-    .byte 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11
-    .byte 0,1,4,5, 2,3,6,7, 8,9,12,13, 10,11,14,15
-
-    .align 4
-dct_tab:
-    .short  23170, 23170,-12540,-30274, 23170, 23170,-12540,-30274
-    .short  23170, 23170, 30274, 12540, 23170, 23170, 30274, 12540
-
-    .short  23170,-23170, 30274,-12540, 23170,-23170, 30274,-12540
-    .short -23170, 23170, 12540,-30274,-23170, 23170, 12540,-30274
-
-    .align 4
-round_tab:
-    .long (1 << (14-1)), (1 << (14-1)), (1 << (14-1)), (1 << (14-1))
-    .long (1 << (16-1)), (1 << (16-1)), (1 << (16-1)), (1 << (16-1))
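
The tables read as Q15 fixed-point DCT basis values: 23170 ≈ cos(π/4)·2^15, 30274 ≈ cos(π/8)·2^15, 12540 ≈ sin(π/8)·2^15, and round_tab holds the half-LSB offsets 1<<(14-1) and 1<<(16-1) matching the 14-bit horizontal and 16-bit vertical downshifts. A quick check of that reading (my interpretation of the table, not project documentation):

    /* Recompute the Q15 constants in dct_tab; link with -lm. */
    #include <math.h>
    #include <stdio.h>

    int main(void) {
      printf("%ld %ld %ld\n",
             lround(cos(M_PI / 4) * 32768),   /* 23170 */
             lround(cos(M_PI / 8) * 32768),   /* 30274 */
             lround(sin(M_PI / 8) * 32768));  /* 12540 */
      return 0;
    }
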
--- a/vp8/encoder/ppc/rdopt_altivec.asm
+++ /dev/null
@@ -1,51 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl vp8_block_error_ppc
-
-    .align 2
-;# r3 short *Coeff
-;# r4 short *dqcoeff
-vp8_block_error_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xf800
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    stw     r5, 12(r1)          ;# transfer dc to vector register
-
-    lvx     v0, 0, r3           ;# Coeff
-    lvx     v1, 0, r4           ;# dqcoeff
-
-    li      r10, 16
-
-    vspltisw v3, 0
-
-    vsubshs v0, v0, v1
-
-    vmsumshm v2, v0, v0, v3     ;# multiply differences
-
-    lvx     v0, r10, r3         ;# Coeff
-    lvx     v1, r10, r4         ;# dqcoeff
-
-    vsubshs v0, v0, v1
-
-    vmsumshm v1, v0, v0, v2     ;# multiply differences
-    vsumsws v1, v1, v3          ;# sum up
-
-    stvx    v1, 0, r1
-    lwz     r3, 12(r1)          ;# return value
-
-    addi    r1, r1, 32          ;# recover stack
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
--- a/vp8/encoder/ppc/sad_altivec.asm
+++ /dev/null
@@ -1,277 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl vp8_sad16x16_ppc
-    .globl vp8_sad16x8_ppc
-    .globl vp8_sad8x16_ppc
-    .globl vp8_sad8x8_ppc
-    .globl vp8_sad4x4_ppc
-
-.macro load_aligned_16 V R O
-    lvsl    v3,  0, \R          ;# permutation value for alignment
-
-    lvx     v1,  0, \R
-    lvx     v2, \O, \R
-
-    vperm   \V, v1, v2, v3
-.endm
-
-.macro prologue
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffc0
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1, -32(r1)         ;# create space on the stack
-
-    li      r10, 16             ;# load offset and loop counter
-
-    vspltisw v8, 0              ;# zero out total to start
-.endm
-
-.macro epilogue
-    addi    r1, r1, 32          ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-.endm
-
-.macro SAD_16
-    ;# v6 = abs (v4 - v5)
-    vsububs v6, v4, v5
-    vsububs v7, v5, v4
-    vor     v6, v6, v7
-
-    ;# v8 += abs (v4 - v5)
-    vsum4ubs v8, v6, v8
-.endm
-
-.macro sad_16_loop loop_label
-    lvsl    v3,  0, r5          ;# only needs to be done once per block
-
-    ;# preload a line of data before getting into the loop
-    lvx     v4, 0, r3
-    lvx     v1,  0, r5
-    lvx     v2, r10, r5
-
-    add     r5, r5, r6
-    add     r3, r3, r4
-
-    vperm   v5, v1, v2, v3
-
-    .align 4
-\loop_label:
-    ;# compute difference on first row
-    vsububs v6, v4, v5
-    vsububs v7, v5, v4
-
-    ;# load up next set of data
-    lvx     v9, 0, r3
-    lvx     v1,  0, r5
-    lvx     v2, r10, r5
-
-    ;# perform abs() of difference
-    vor     v6, v6, v7
-    add     r3, r3, r4
-
-    ;# add to the running tally
-    vsum4ubs v8, v6, v8
-
-    ;# now onto the next line
-    vperm   v5, v1, v2, v3
-    add     r5, r5, r6
-    lvx     v4, 0, r3
-
-    ;# compute difference on second row
-    vsububs v6, v9, v5
-    lvx     v1,  0, r5
-    vsububs v7, v5, v9
-    lvx     v2, r10, r5
-    vor     v6, v6, v7
-    add     r3, r3, r4
-    vsum4ubs v8, v6, v8
-    vperm   v5, v1, v2, v3
-    add     r5, r5, r6
-
-    bdnz    \loop_label
-
-    vspltisw v7, 0
-
-    vsumsws v8, v8, v7
-
-    stvx    v8, 0, r1
-    lwz     r3, 12(r1)
-.endm
-
-.macro sad_8_loop loop_label
-    .align 4
-\loop_label:
-    ;# only one of the inputs should need to be aligned.
-    load_aligned_16 v4, r3, r10
-    load_aligned_16 v5, r5, r10
-
-    ;# move onto the next line
-    add     r3, r3, r4
-    add     r5, r5, r6
-
-    ;# only one of the inputs should need to be aligned.
-    load_aligned_16 v6, r3, r10
-    load_aligned_16 v7, r5, r10
-
-    ;# move onto the next line
-    add     r3, r3, r4
-    add     r5, r5, r6
-
-    vmrghb  v4, v4, v6
-    vmrghb  v5, v5, v7
-
-    SAD_16
-
-    bdnz    \loop_label
-
-    vspltisw v7, 0
-
-    vsumsws v8, v8, v7
-
-    stvx    v8, 0, r1
-    lwz     r3, 12(r1)
-.endm
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  ref_stride
-;#
-;# r3 return value
-vp8_sad16x16_ppc:
-
-    prologue
-
-    li      r9, 8
-    mtctr   r9
-
-    sad_16_loop sad16x16_loop
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  ref_stride
-;#
-;# r3 return value
-vp8_sad16x8_ppc:
-
-    prologue
-
-    li      r9, 4
-    mtctr   r9
-
-    sad_16_loop sad16x8_loop
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  ref_stride
-;#
-;# r3 return value
-vp8_sad8x16_ppc:
-
-    prologue
-
-    li      r9, 8
-    mtctr   r9
-
-    sad_8_loop sad8x16_loop
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  ref_stride
-;#
-;# r3 return value
-vp8_sad8x8_ppc:
-
-    prologue
-
-    li      r9, 4
-    mtctr   r9
-
-    sad_8_loop sad8x8_loop
-
-    epilogue
-
-    blr
-
-.macro transfer_4x4 I P
-    lwz     r0, 0(\I)
-    add     \I, \I, \P
-
-    lwz     r7, 0(\I)
-    add     \I, \I, \P
-
-    lwz     r8, 0(\I)
-    add     \I, \I, \P
-
-    lwz     r9, 0(\I)
-
-    stw     r0,  0(r1)
-    stw     r7,  4(r1)
-    stw     r8,  8(r1)
-    stw     r9, 12(r1)
-.endm
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  ref_stride
-;#
-;# r3 return value
-vp8_sad4x4_ppc:
-
-    prologue
-
-    transfer_4x4 r3, r4
-    lvx     v4, 0, r1
-
-    transfer_4x4 r5, r6
-    lvx     v5, 0, r1
-
-    vspltisw v8, 0              ;# zero out total to start
-
-    ;# v6 = abs (v4 - v5)
-    vsububs v6, v4, v5
-    vsububs v7, v5, v4
-    vor     v6, v6, v7
-
-    ;# v8 += abs (v4 - v5)
-    vsum4ubs v7, v6, v8
-    vsumsws v7, v7, v8
-
-    stvx    v7, 0, r1
-    lwz     r3, 12(r1)
-
-    epilogue
-
-    blr
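
Every entry point above computes the same sum of absolute differences; the vector trick is that |a-b| for unsigned bytes falls out of two saturating subtracts OR'd together (vsububs both ways, then vor), accumulated by vsum4ubs. A scalar reference of the 16x16 case:

    /* Scalar reference for vp8_sad16x16_ppc above. */
    unsigned int sad16x16_ref(const unsigned char *src, int src_stride,
                              const unsigned char *ref, int ref_stride) {
      unsigned int sad = 0;
      int r, c;
      for (r = 0; r < 16; r++, src += src_stride, ref += ref_stride)
        for (c = 0; c < 16; c++)
          sad += src[c] > ref[c] ? src[c] - ref[c] : ref[c] - src[c];
      return sad;
    }
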
--- a/vp8/encoder/ppc/variance_altivec.asm
+++ /dev/null
@@ -1,375 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl vp8_get8x8var_ppc
-    .globl vp8_get16x16var_ppc
-    .globl vp8_mse16x16_ppc
-    .globl vp9_variance16x16_ppc
-    .globl vp9_variance16x8_ppc
-    .globl vp9_variance8x16_ppc
-    .globl vp9_variance8x8_ppc
-    .globl vp9_variance4x4_ppc
-
-.macro load_aligned_16 V R O
-    lvsl    v3,  0, \R          ;# permutation value for alignment
-
-    lvx     v1,  0, \R
-    lvx     v2, \O, \R
-
-    vperm   \V, v1, v2, v3
-.endm
-
-.macro prologue
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffc0
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1, -32(r1)         ;# create space on the stack
-
-    li      r10, 16             ;# load offset and loop counter
-
-    vspltisw v7, 0              ;# zero for merging
-    vspltisw v8, 0              ;# zero out total to start
-    vspltisw v9, 0              ;# zero out total for dif^2
-.endm
-
-.macro epilogue
-    addi    r1, r1, 32          ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-.endm
-
-.macro compute_sum_sse
-    ;# Compute sum first.  Unpack so that a signed subtract
-    ;#  can be used.  Only a half-word signed subtract
-    ;#  is available.  Do high, then low.
-    vmrghb  v2, v7, v4
-    vmrghb  v3, v7, v5
-    vsubshs v2, v2, v3
-    vsum4shs v8, v2, v8
-
-    vmrglb  v2, v7, v4
-    vmrglb  v3, v7, v5
-    vsubshs v2, v2, v3
-    vsum4shs v8, v2, v8
-
-    ;# Now compute sse.
-    vsububs v2, v4, v5
-    vsububs v3, v5, v4
-    vor     v2, v2, v3
-
-    vmsumubm v9, v2, v2, v9
-.endm
-
-.macro variance_16 DS loop_label store_sum
-\loop_label:
-    ;# only one of the inputs should need to be aligned.
-    load_aligned_16 v4, r3, r10
-    load_aligned_16 v5, r5, r10
-
-    ;# move onto the next line
-    add     r3, r3, r4
-    add     r5, r5, r6
-
-    compute_sum_sse
-
-    bdnz    \loop_label
-
-    vsumsws v8, v8, v7
-    vsumsws v9, v9, v7
-
-    stvx    v8, 0, r1
-    lwz     r3, 12(r1)
-
-    stvx    v9, 0, r1
-    lwz     r4, 12(r1)
-
-.if \store_sum
-    stw     r3, 0(r8)           ;# sum
-.endif
-    stw     r4, 0(r7)           ;# sse
-
-    mullw   r3, r3, r3          ;# sum*sum
-    srawi   r3, r3, \DS         ;# (sum*sum) >> DS
-    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
-.endm
-
-.macro variance_8 DS loop_label store_sum
-\loop_label:
-    ;# only one of the inputs should need to be aligned.
-    load_aligned_16 v4, r3, r10
-    load_aligned_16 v5, r5, r10
-
-    ;# move onto the next line
-    add     r3, r3, r4
-    add     r5, r5, r6
-
-    ;# only one of the inputs should need to be aligned.
-    load_aligned_16 v6, r3, r10
-    load_aligned_16 v0, r5, r10
-
-    ;# move onto the next line
-    add     r3, r3, r4
-    add     r5, r5, r6
-
-    vmrghb  v4, v4, v6
-    vmrghb  v5, v5, v0
-
-    compute_sum_sse
-
-    bdnz    \loop_label
-
-    vsumsws v8, v8, v7
-    vsumsws v9, v9, v7
-
-    stvx    v8, 0, r1
-    lwz     r3, 12(r1)
-
-    stvx    v9, 0, r1
-    lwz     r4, 12(r1)
-
-.if \store_sum
-    stw     r3, 0(r8)           ;# sum
-.endif
-    stw     r4, 0(r7)           ;# sse
-
-    mullw   r3, r3, r3          ;# sum*sum
-    srawi   r3, r3, \DS         ;# (sum*sum) >> DS
-    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
-.endm
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  recon_stride
-;# r7 unsigned int *SSE
-;# r8 int *Sum
-;#
-;# r3 return value
-vp8_get8x8var_ppc:
-
-    prologue
-
-    li      r9, 4
-    mtctr   r9
-
-    variance_8 6, get8x8var_loop, 1
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  recon_stride
-;# r7 unsigned int *SSE
-;# r8 int *Sum
-;#
-;# r3 return value
-vp8_get16x16var_ppc:
-
-    prologue
-
-    mtctr   r10
-
-    variance_16 8, get16x16var_loop, 1
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp8_mse16x16_ppc:
-    prologue
-
-    mtctr   r10
-
-mse16x16_loop:
-    ;# only one of the inputs should need to be aligned.
-    load_aligned_16 v4, r3, r10
-    load_aligned_16 v5, r5, r10
-
-    ;# move onto the next line
-    add     r3, r3, r4
-    add     r5, r5, r6
-
-    ;# Now compute sse.
-    vsububs v2, v4, v5
-    vsububs v3, v5, v4
-    vor     v2, v2, v3
-
-    vmsumubm v9, v2, v2, v9
-
-    bdnz    mse16x16_loop
-
-    vsumsws v9, v9, v7
-
-    stvx    v9, 0, r1
-    lwz     r3, 12(r1)
-
-    stvx    v9, 0, r1
-    lwz     r3, 12(r1)
-
-    stw     r3, 0(r7)           ;# sse
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp9_variance16x16_ppc:
-
-    prologue
-
-    mtctr   r10
-
-    variance_16 8, variance16x16_loop, 0
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp9_variance16x8_ppc:
-
-    prologue
-
-    li      r9, 8
-    mtctr   r9
-
-    variance_16 7, variance16x8_loop, 0
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp9_variance8x16_ppc:
-
-    prologue
-
-    li      r9, 8
-    mtctr   r9
-
-    variance_8 7, variance8x16_loop, 0
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp9_variance8x8_ppc:
-
-    prologue
-
-    li      r9, 4
-    mtctr   r9
-
-    variance_8 6, variance8x8_loop, 0
-
-    epilogue
-
-    blr
-
-.macro transfer_4x4 I P
-    lwz     r0, 0(\I)
-    add     \I, \I, \P
-
-    lwz     r10,0(\I)
-    add     \I, \I, \P
-
-    lwz     r8, 0(\I)
-    add     \I, \I, \P
-
-    lwz     r9, 0(\I)
-
-    stw     r0,  0(r1)
-    stw     r10, 4(r1)
-    stw     r8,  8(r1)
-    stw     r9, 12(r1)
-.endm
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp9_variance4x4_ppc:
-
-    prologue
-
-    transfer_4x4 r3, r4
-    lvx     v4, 0, r1
-
-    transfer_4x4 r5, r6
-    lvx     v5, 0, r1
-
-    compute_sum_sse
-
-    vsumsws v8, v8, v7
-    vsumsws v9, v9, v7
-
-    stvx    v8, 0, r1
-    lwz     r3, 12(r1)
-
-    stvx    v9, 0, r1
-    lwz     r4, 12(r1)
-
-    stw     r4, 0(r7)           ;# sse
-
-    mullw   r3, r3, r3          ;# sum*sum
-    srawi   r3, r3, 4           ;# (sum*sum) >> 4
-    subf    r3, r3, r4          ;# sse - ((sum*sum) >> 4)
-
-    epilogue
-
-    blr
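
Each variance entry point returns sse - ((sum*sum) >> DS), i.e. the sum of squared differences minus the squared mean scaled by the pixel count, with DS = log2(w*h): 8 for 16x16, 7 for 16x8 and 8x16, 6 for 8x8, and 4 for 4x4. A scalar reference (using a 64-bit product where the asm's 32-bit mullw could wrap for extreme sums):

    /* Scalar reference for the vp9_variance*_ppc routines above. */
    #include <stdint.h>

    unsigned int variance_ref(const unsigned char *src, int src_stride,
                              const unsigned char *ref, int ref_stride,
                              int w, int h, unsigned int *sse) {
      int64_t sum = 0;
      uint32_t sse_acc = 0;
      int r, c, ds = 0;
      while ((1 << ds) < w * h)
        ds++;                                   /* DS = log2(w * h) */
      for (r = 0; r < h; r++, src += src_stride, ref += ref_stride)
        for (c = 0; c < w; c++) {
          int d = src[c] - ref[c];
          sum += d;
          sse_acc += (uint32_t)(d * d);
        }
      *sse = sse_acc;
      return sse_acc - (unsigned int)((sum * sum) >> ds);
    }
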
--- a/vp8/encoder/ppc/variance_subpixel_altivec.asm
+++ /dev/null
@@ -1,865 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl vp9_sub_pixel_variance4x4_ppc
-    .globl vp9_sub_pixel_variance8x8_ppc
-    .globl vp9_sub_pixel_variance8x16_ppc
-    .globl vp9_sub_pixel_variance16x8_ppc
-    .globl vp9_sub_pixel_variance16x16_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
-    lis     \R0, \LABEL@ha
-    la      \R1, \LABEL@l(\R0)
-    lvx     \V, \OFF, \R1
-.endm
-
-.macro load_vfilter V0, V1
-    load_c \V0, vfilter_b, r6, r12, r10
-
-    addi    r6,  r6, 16
-    lvx     \V1, r6, r10
-.endm
-
-.macro HProlog jump_label
-    ;# load up horizontal filter
-    slwi.   r5, r5, 4           ;# index into horizontal filter array
-
-    ;# index to the next set of vectors in the row.
-    li      r10, 16
-
-    ;# downshift by 7 (divide by 128) at the end
-    vspltish v19, 7
-
-    ;# If there isn't any filtering to be done for the horizontal, then
-    ;#  just skip to the second pass.
-    beq     \jump_label
-
-    load_c v20, hfilter_b, r5, r12, r0
-
-    ;# setup constants
-    ;# v14 permutation value for alignment
-    load_c v28, b_hperm_b, 0, r12, r0
-
-    ;# index to the next set of vectors in the row.
-    li      r12, 32
-
-    ;# rounding added in on the multiply
-    vspltisw v21, 8
-    vspltisw v18, 3
-    vslw    v18, v21, v18       ;# 0x00000040000000400000004000000040
-
-    slwi.   r6, r6, 5           ;# index into vertical filter array
-.endm
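
HProlog selects the first-pass (horizontal) taps by x offset and the second-pass (vertical) taps by y offset; each pair of taps sums to 128, with +64 rounding folded into the multiply and a downshift of 7. A scalar sketch of one tap pair, assuming the usual VP8-style bilinear table {128-16k, 16k} indexed by the offset k in eighths (the actual hfilter_b/vfilter_b contents are not shown in this patch):

    /* Hedged scalar sketch of one bilinear filter step; the tap table
     * {128 - 16*k, 16*k} for k = 0..7 is assumed, not read from hfilter_b. */
    static unsigned char bilinear_tap(unsigned char a, unsigned char b,
                                      int k) {
      int tap0 = 128 - 16 * k;
      int tap1 = 16 * k;
      return (unsigned char)((a * tap0 + b * tap1 + 64) >> 7);
    }
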
-
-;# Filters a horizontal line
-;# expects:
-;#  r3  src_ptr
-;#  r4  pitch
-;#  r10 16
-;#  r12 32
-;#  v17 perm input
-;#  v18 rounding
-;#  v19 shift
-;#  v20 filter taps
-;#  v21 tmp
-;#  v22 tmp
-;#  v23 tmp
-;#  v24 tmp
-;#  v25 tmp
-;#  v26 tmp
-;#  v27 tmp
-;#  v28 perm output
-;#
-
-.macro hfilter_8 V, hp, lp, increment_counter
-    lvsl    v17,  0, r3         ;# permutation value for alignment
-
-    ;# input to filter is 9 bytes wide, output is 8 bytes.
-    lvx     v21,   0, r3
-    lvx     v22, r10, r3
-
-.if \increment_counter
-    add     r3, r3, r4
-.endif
-    vperm   v21, v21, v22, v17
-
-    vperm   v24, v21, v21, \hp  ;# v24 = 0123 1234 2345 3456
-    vperm   v25, v21, v21, \lp  ;# v25 = 4567 5678 6789 789A
-
-    vmsummbm v24, v20, v24, v18
-    vmsummbm v25, v20, v25, v18
-
-    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
-
-    vsrh    v24, v24, v19       ;# divide v0, v1 by 128
-
-    vpkuhus \V, v24, v24        ;# \V = scrambled 8-bit result
-.endm
-
-.macro vfilter_16 P0 P1
-    vmuleub v22, \P0, v20       ;# 64 + 4 positive taps
-    vadduhm v22, v18, v22
-    vmuloub v23, \P0, v20
-    vadduhm v23, v18, v23
-
-    vmuleub v24, \P1, v21
-    vadduhm v22, v22, v24       ;# Re = evens, saturation unnecessary
-    vmuloub v25, \P1, v21
-    vadduhm v23, v23, v25       ;# Ro = odds
-
-    vsrh    v22, v22, v19       ;# divide by 128
-    vsrh    v23, v23, v19       ;# v22 v23 = evens, odds
-    vmrghh  \P0, v22, v23       ;# merge to 16-bit result in order
-    vmrglh  v23, v22, v23
-    vpkuhus \P0, \P0, v23       ;# P0 = 8-bit result
-.endm
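
Both hfilter_8 above and vfilter_16 implement the same two-tap bilinear step; a scalar model of one output pixel, assuming taps (128 - f, f) as laid out in hfilter_b/vfilter_b further below, with the +64 rounding constant and shift-by-7 the code builds in v18/v19 (function name is illustrative):

    /* Scalar model of one bilinear tap: (a*(128-f) + b*f + 64) >> 7. */
    static unsigned char bilinear_tap_model(unsigned char a, unsigned char b,
                                            int f) {
      int v = (a * (128 - f) + b * f + 64) >> 7;
      return v > 255 ? 255 : (unsigned char)v;  /* vpkuhus/vpkswus saturate */
    }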
-
-.macro compute_sum_sse src, ref, sum, sse, t1, t2, z0
-    ;# Compute sum first.  Unpack so a signed subtract
-    ;#  can be used.  Only a halfword signed subtract
-    ;#  is available.  Do high, then low.
-    vmrghb  \t1, \z0, \src
-    vmrghb  \t2, \z0, \ref
-    vsubshs \t1, \t1, \t2
-    vsum4shs \sum, \t1, \sum
-
-    vmrglb  \t1, \z0, \src
-    vmrglb  \t2, \z0, \ref
-    vsubshs \t1, \t1, \t2
-    vsum4shs \sum, \t1, \sum
-
-    ;# Now compute sse.
-    vsububs \t1, \src, \ref
-    vsububs \t2, \ref, \src
-    vor     \t1, \t1, \t2
-
-    vmsumubm \sse, \t1, \t1, \sse
-.endm
-
-.macro variance_final sum, sse, z0, DS
-    vsumsws \sum, \sum, \z0
-    vsumsws \sse, \sse, \z0
-
-    stvx    \sum, 0, r1
-    lwz     r3, 12(r1)
-
-    stvx    \sse, 0, r1
-    lwz     r4, 12(r1)
-
-    stw     r4, 0(r9)           ;# sse
-
-    mullw   r3, r3, r3          ;# sum*sum
-    srawi   r3, r3, \DS         ;# (sum*sum) >> DS
-    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
-.endm
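
The DS argument of variance_final is log2 of the block's pixel count, so the subtraction implements the usual variance identity with a shift in place of a division:

    variance = sse - (sum * sum) >> DS,   DS = log2(width * height)

The call sites below pass 4 for 4x4 (16 pixels), 6 for 8x8 (64), 7 for 8x16 and 16x8 (128), and 8 for 16x16 (256).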
-
-.macro compute_sum_sse_16 V, increment_counter
-    load_and_align_16  v16, r7, r8, \increment_counter
-    compute_sum_sse \V, v16, v18, v19, v20, v21, v23
-.endm
-
-.macro load_and_align_16 V, R, P, increment_counter
-    lvsl    v17,  0, \R         ;# permute vector for alignment
-
-    ;# input is 16 bytes wide and can span two vectors
-    ;#  if not aligned correctly.
-    lvx     v21,   0, \R
-    lvx     v22, r10, \R
-
-.if \increment_counter
-    add     \R, \R, \P
-.endif
-
-    vperm   \V, v21, v22, v17
-.endm
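
load_and_align_16 is the classic AltiVec unaligned-load idiom: lvx ignores the low four address bits, so the macro fetches the two enclosing aligned vectors and merges them with the byte-shift pattern lvsl produces. A scalar C model (illustrative, not from the tree):

    #include <stdint.h>
    #include <string.h>

    static void load_and_align_16_model(const uint8_t *p, uint8_t out[16]) {
      const uint8_t *base = (const uint8_t *)((uintptr_t)p & ~(uintptr_t)15);
      unsigned shift = (unsigned)((uintptr_t)p & 15); /* lvsl v17, 0, \R  */
      uint8_t lo[16], hi[16];
      memcpy(lo, base, 16);                           /* lvx v21,   0, \R */
      memcpy(hi, base + 16, 16);                      /* lvx v22, r10, \R */
      for (int i = 0; i < 16; i++)                    /* vperm \V, v21, v22, v17 */
        out[i] = (shift + i < 16) ? lo[shift + i] : hi[shift + i - 16];
    }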
-
-    .align 2
-;# r3 unsigned char  *src_ptr
-;# r4 int  src_pixels_per_line
-;# r5 int  xoffset
-;# r6 int  yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp9_sub_pixel_variance4x4_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xf830
-    ori     r12, r12, 0xfff8
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    HProlog second_pass_4x4_pre_copy_b
-
-    ;# Load up permutation constants
-    load_c v10, b_0123_b, 0, r12, r0
-    load_c v11, b_4567_b, 0, r12, r0
-
-    hfilter_8 v0, v10, v11, 1
-    hfilter_8 v1, v10, v11, 1
-    hfilter_8 v2, v10, v11, 1
-    hfilter_8 v3, v10, v11, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     compute_sum_sse_4x4_b
-
-    hfilter_8 v4, v10, v11, 0
-
-    b   second_pass_4x4_b
-
-second_pass_4x4_pre_copy_b:
-    slwi    r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_16 v0, r3, r4, 1
-    load_and_align_16 v1, r3, r4, 1
-    load_and_align_16 v2, r3, r4, 1
-    load_and_align_16 v3, r3, r4, 1
-    load_and_align_16 v4, r3, r4, 0
-
-second_pass_4x4_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-
-compute_sum_sse_4x4_b:
-    vspltish v18, 0             ;# sum
-    vspltish v19, 0             ;# sse
-    vspltish v23, 0             ;# unpack
-    li      r10, 16
-
-    load_and_align_16 v4, r7, r8, 1
-    load_and_align_16 v5, r7, r8, 1
-    load_and_align_16 v6, r7, r8, 1
-    load_and_align_16 v7, r7, r8, 1
-
-    vmrghb  v0, v0, v1
-    vmrghb  v1, v2, v3
-
-    vmrghb  v2, v4, v5
-    vmrghb  v3, v6, v7
-
-    load_c v10, b_hilo_b, 0, r12, r0
-
-    vperm   v0, v0, v1, v10
-    vperm   v1, v2, v3, v10
-
-    compute_sum_sse v0, v1, v18, v19, v20, v21, v23
-
-    variance_final v18, v19, v23, 4
-
-    addi    r1, r1, 32          ;# recover stack
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;# r3 unsigned char  *src_ptr
-;# r4 int  src_pixels_per_line
-;# r5 int  xoffset
-;# r6 int  yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp9_sub_pixel_variance8x8_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xfff0
-    ori     r12, r12, 0xffff
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    HProlog second_pass_8x8_pre_copy_b
-
-    ;# Load up permutation constants
-    load_c v10, b_0123_b, 0, r12, r0
-    load_c v11, b_4567_b, 0, r12, r0
-
-    hfilter_8 v0, v10, v11, 1
-    hfilter_8 v1, v10, v11, 1
-    hfilter_8 v2, v10, v11, 1
-    hfilter_8 v3, v10, v11, 1
-    hfilter_8 v4, v10, v11, 1
-    hfilter_8 v5, v10, v11, 1
-    hfilter_8 v6, v10, v11, 1
-    hfilter_8 v7, v10, v11, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     compute_sum_sse_8x8_b
-
-    hfilter_8 v8, v10, v11, 0
-
-    b   second_pass_8x8_b
-
-second_pass_8x8_pre_copy_b:
-    slwi.   r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_16 v0, r3, r4, 1
-    load_and_align_16 v1, r3, r4, 1
-    load_and_align_16 v2, r3, r4, 1
-    load_and_align_16 v3, r3, r4, 1
-    load_and_align_16 v4, r3, r4, 1
-    load_and_align_16 v5, r3, r4, 1
-    load_and_align_16 v6, r3, r4, 1
-    load_and_align_16 v7, r3, r4, 1
-    load_and_align_16 v8, r3, r4, 0
-
-    beq     compute_sum_sse_8x8_b
-
-second_pass_8x8_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0, v1
-    vfilter_16 v1, v2
-    vfilter_16 v2, v3
-    vfilter_16 v3, v4
-    vfilter_16 v4, v5
-    vfilter_16 v5, v6
-    vfilter_16 v6, v7
-    vfilter_16 v7, v8
-
-compute_sum_sse_8x8_b:
-    vspltish v18, 0             ;# sum
-    vspltish v19, 0             ;# sse
-    vspltish v23, 0             ;# unpack
-    li      r10, 16
-
-    vmrghb  v0, v0, v1
-    vmrghb  v1, v2, v3
-    vmrghb  v2, v4, v5
-    vmrghb  v3, v6, v7
-
-    load_and_align_16 v4,  r7, r8, 1
-    load_and_align_16 v5,  r7, r8, 1
-    load_and_align_16 v6,  r7, r8, 1
-    load_and_align_16 v7,  r7, r8, 1
-    load_and_align_16 v8,  r7, r8, 1
-    load_and_align_16 v9,  r7, r8, 1
-    load_and_align_16 v10, r7, r8, 1
-    load_and_align_16 v11, r7, r8, 0
-
-    vmrghb  v4, v4,  v5
-    vmrghb  v5, v6,  v7
-    vmrghb  v6, v8,  v9
-    vmrghb  v7, v10, v11
-
-    compute_sum_sse v0, v4, v18, v19, v20, v21, v23
-    compute_sum_sse v1, v5, v18, v19, v20, v21, v23
-    compute_sum_sse v2, v6, v18, v19, v20, v21, v23
-    compute_sum_sse v3, v7, v18, v19, v20, v21, v23
-
-    variance_final v18, v19, v23, 6
-
-    addi    r1, r1, 32          ;# recover stack
-    mtspr   256, r11            ;# reset old VRSAVE
-    blr
-
-    .align 2
-;# r3 unsigned char  *src_ptr
-;# r4 int  src_pixels_per_line
-;# r5 int  xoffset
-;# r6 int  yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp9_sub_pixel_variance8x16_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xfffc
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    HProlog second_pass_8x16_pre_copy_b
-
-    ;# Load up permutation constants
-    load_c v29, b_0123_b, 0, r12, r0
-    load_c v30, b_4567_b, 0, r12, r0
-
-    hfilter_8 v0,  v29, v30, 1
-    hfilter_8 v1,  v29, v30, 1
-    hfilter_8 v2,  v29, v30, 1
-    hfilter_8 v3,  v29, v30, 1
-    hfilter_8 v4,  v29, v30, 1
-    hfilter_8 v5,  v29, v30, 1
-    hfilter_8 v6,  v29, v30, 1
-    hfilter_8 v7,  v29, v30, 1
-    hfilter_8 v8,  v29, v30, 1
-    hfilter_8 v9,  v29, v30, 1
-    hfilter_8 v10, v29, v30, 1
-    hfilter_8 v11, v29, v30, 1
-    hfilter_8 v12, v29, v30, 1
-    hfilter_8 v13, v29, v30, 1
-    hfilter_8 v14, v29, v30, 1
-    hfilter_8 v15, v29, v30, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     compute_sum_sse_8x16_b
-
-    hfilter_8 v16, v29, v30, 0
-
-    b   second_pass_8x16_b
-
-second_pass_8x16_pre_copy_b:
-    slwi.   r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_16 v0,  r3, r4, 1
-    load_and_align_16 v1,  r3, r4, 1
-    load_and_align_16 v2,  r3, r4, 1
-    load_and_align_16 v3,  r3, r4, 1
-    load_and_align_16 v4,  r3, r4, 1
-    load_and_align_16 v5,  r3, r4, 1
-    load_and_align_16 v6,  r3, r4, 1
-    load_and_align_16 v7,  r3, r4, 1
-    load_and_align_16 v8,  r3, r4, 1
-    load_and_align_16 v9,  r3, r4, 1
-    load_and_align_16 v10, r3, r4, 1
-    load_and_align_16 v11, r3, r4, 1
-    load_and_align_16 v12, r3, r4, 1
-    load_and_align_16 v13, r3, r4, 1
-    load_and_align_16 v14, r3, r4, 1
-    load_and_align_16 v15, r3, r4, 1
-    load_and_align_16 v16, r3, r4, 0
-
-    beq     compute_sum_sse_8x16_b
-
-second_pass_8x16_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-    vfilter_16 v4,  v5
-    vfilter_16 v5,  v6
-    vfilter_16 v6,  v7
-    vfilter_16 v7,  v8
-    vfilter_16 v8,  v9
-    vfilter_16 v9,  v10
-    vfilter_16 v10, v11
-    vfilter_16 v11, v12
-    vfilter_16 v12, v13
-    vfilter_16 v13, v14
-    vfilter_16 v14, v15
-    vfilter_16 v15, v16
-
-compute_sum_sse_8x16_b:
-    vspltish v18, 0             ;# sum
-    vspltish v19, 0             ;# sse
-    vspltish v23, 0             ;# unpack
-    li      r10, 16
-
-    vmrghb  v0, v0,  v1
-    vmrghb  v1, v2,  v3
-    vmrghb  v2, v4,  v5
-    vmrghb  v3, v6,  v7
-    vmrghb  v4, v8,  v9
-    vmrghb  v5, v10, v11
-    vmrghb  v6, v12, v13
-    vmrghb  v7, v14, v15
-
-    load_and_align_16 v8,  r7, r8, 1
-    load_and_align_16 v9,  r7, r8, 1
-    load_and_align_16 v10, r7, r8, 1
-    load_and_align_16 v11, r7, r8, 1
-    load_and_align_16 v12, r7, r8, 1
-    load_and_align_16 v13, r7, r8, 1
-    load_and_align_16 v14, r7, r8, 1
-    load_and_align_16 v15, r7, r8, 1
-
-    vmrghb  v8,  v8,  v9
-    vmrghb  v9,  v10, v11
-    vmrghb  v10, v12, v13
-    vmrghb  v11, v14, v15
-
-    compute_sum_sse v0, v8,  v18, v19, v20, v21, v23
-    compute_sum_sse v1, v9,  v18, v19, v20, v21, v23
-    compute_sum_sse v2, v10, v18, v19, v20, v21, v23
-    compute_sum_sse v3, v11, v18, v19, v20, v21, v23
-
-    load_and_align_16 v8,  r7, r8, 1
-    load_and_align_16 v9,  r7, r8, 1
-    load_and_align_16 v10, r7, r8, 1
-    load_and_align_16 v11, r7, r8, 1
-    load_and_align_16 v12, r7, r8, 1
-    load_and_align_16 v13, r7, r8, 1
-    load_and_align_16 v14, r7, r8, 1
-    load_and_align_16 v15, r7, r8, 0
-
-    vmrghb  v8,  v8,  v9
-    vmrghb  v9,  v10, v11
-    vmrghb  v10, v12, v13
-    vmrghb  v11, v14, v15
-
-    compute_sum_sse v4, v8,  v18, v19, v20, v21, v23
-    compute_sum_sse v5, v9,  v18, v19, v20, v21, v23
-    compute_sum_sse v6, v10, v18, v19, v20, v21, v23
-    compute_sum_sse v7, v11, v18, v19, v20, v21, v23
-
-    variance_final v18, v19, v23, 7
-
-    addi    r1, r1, 32          ;# recover stack
-    mtspr   256, r11            ;# reset old VRSAVE
-    blr
-
-;# Filters a horizontal line
-;# expects:
-;#  r3  src_ptr
-;#  r4  pitch
-;#  r10 16
-;#  r12 32
-;#  v17 perm input
-;#  v18 rounding
-;#  v19 shift
-;#  v20 filter taps
-;#  v21 tmp
-;#  v22 tmp
-;#  v23 tmp
-;#  v24 tmp
-;#  v25 tmp
-;#  v26 tmp
-;#  v27 tmp
-;#  v28 perm output
-;#
-.macro hfilter_16 V, increment_counter
-
-    lvsl    v17,  0, r3         ;# permute vector for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;#  input can span three vectors if not aligned correctly.
-    lvx     v21,   0, r3
-    lvx     v22, r10, r3
-    lvx     v23, r12, r3
-
-.if \increment_counter
-    add     r3, r3, r4
-.endif
-    vperm   v21, v21, v22, v17
-    vperm   v22, v22, v23, v17  ;# v21 v22 = 21 input pixels left-justified
-
-    ;# set 0
-    vmsummbm v24, v20, v21, v18 ;# taps times elements
-
-    ;# set 1
-    vsldoi  v23, v21, v22, 1
-    vmsummbm v25, v20, v23, v18
-
-    ;# set 2
-    vsldoi  v23, v21, v22, 2
-    vmsummbm v26, v20, v23, v18
-
-    ;# set 3
-    vsldoi  v23, v21, v22, 3
-    vmsummbm v27, v20, v23, v18
-
-    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
-    vpkswus v25, v26, v27       ;# v25 = 2 6 A E 3 7 B F
-
-    vsrh    v24, v24, v19       ;# divide by 128
-    vsrh    v25, v25, v19
-
-    vpkuhus \V, v24, v25        ;# \V = scrambled 8-bit result
-    vperm   \V, \V, v0, v28     ;# \V = correctly-ordered result
-.endm
-
-    .align 2
-;# r3 unsigned char  *src_ptr
-;# r4 int  src_pixels_per_line
-;# r5 int  xoffset
-;# r6 int  yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp9_sub_pixel_variance16x8_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xfff8
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1, -32(r1)         ;# create space on the stack
-
-    HProlog second_pass_16x8_pre_copy_b
-
-    hfilter_16 v0, 1
-    hfilter_16 v1, 1
-    hfilter_16 v2, 1
-    hfilter_16 v3, 1
-    hfilter_16 v4, 1
-    hfilter_16 v5, 1
-    hfilter_16 v6, 1
-    hfilter_16 v7, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     compute_sum_sse_16x8_b
-
-    hfilter_16 v8, 0
-
-    b   second_pass_16x8_b
-
-second_pass_16x8_pre_copy_b:
-    slwi.   r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_16  v0,  r3, r4, 1
-    load_and_align_16  v1,  r3, r4, 1
-    load_and_align_16  v2,  r3, r4, 1
-    load_and_align_16  v3,  r3, r4, 1
-    load_and_align_16  v4,  r3, r4, 1
-    load_and_align_16  v5,  r3, r4, 1
-    load_and_align_16  v6,  r3, r4, 1
-    load_and_align_16  v7,  r3, r4, 1
-    load_and_align_16  v8,  r3, r4, 1
-
-    beq     compute_sum_sse_16x8_b
-
-second_pass_16x8_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-    vfilter_16 v4,  v5
-    vfilter_16 v5,  v6
-    vfilter_16 v6,  v7
-    vfilter_16 v7,  v8
-
-compute_sum_sse_16x8_b:
-    vspltish v18, 0             ;# sum
-    vspltish v19, 0             ;# sse
-    vspltish v23, 0             ;# unpack
-    li      r10, 16
-
-    compute_sum_sse_16 v0, 1
-    compute_sum_sse_16 v1, 1
-    compute_sum_sse_16 v2, 1
-    compute_sum_sse_16 v3, 1
-    compute_sum_sse_16 v4, 1
-    compute_sum_sse_16 v5, 1
-    compute_sum_sse_16 v6, 1
-    compute_sum_sse_16 v7, 0
-
-    variance_final v18, v19, v23, 7
-
-    addi    r1, r1, 32          ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;# r3 unsigned char  *src_ptr
-;# r4 int  src_pixels_per_line
-;# r5 int  xoffset
-;# r6 int  yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp9_sub_pixel_variance16x16_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xfff8
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1, -32(r1)         ;# create space on the stack
-
-    HProlog second_pass_16x16_pre_copy_b
-
-    hfilter_16 v0,  1
-    hfilter_16 v1,  1
-    hfilter_16 v2,  1
-    hfilter_16 v3,  1
-    hfilter_16 v4,  1
-    hfilter_16 v5,  1
-    hfilter_16 v6,  1
-    hfilter_16 v7,  1
-    hfilter_16 v8,  1
-    hfilter_16 v9,  1
-    hfilter_16 v10, 1
-    hfilter_16 v11, 1
-    hfilter_16 v12, 1
-    hfilter_16 v13, 1
-    hfilter_16 v14, 1
-    hfilter_16 v15, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     compute_sum_sse_16x16_b
-
-    hfilter_16 v16, 0
-
-    b   second_pass_16x16_b
-
-second_pass_16x16_pre_copy_b:
-    slwi.   r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_16  v0,  r3, r4, 1
-    load_and_align_16  v1,  r3, r4, 1
-    load_and_align_16  v2,  r3, r4, 1
-    load_and_align_16  v3,  r3, r4, 1
-    load_and_align_16  v4,  r3, r4, 1
-    load_and_align_16  v5,  r3, r4, 1
-    load_and_align_16  v6,  r3, r4, 1
-    load_and_align_16  v7,  r3, r4, 1
-    load_and_align_16  v8,  r3, r4, 1
-    load_and_align_16  v9,  r3, r4, 1
-    load_and_align_16  v10, r3, r4, 1
-    load_and_align_16  v11, r3, r4, 1
-    load_and_align_16  v12, r3, r4, 1
-    load_and_align_16  v13, r3, r4, 1
-    load_and_align_16  v14, r3, r4, 1
-    load_and_align_16  v15, r3, r4, 1
-    load_and_align_16  v16, r3, r4, 0
-
-    beq     compute_sum_sse_16x16_b
-
-second_pass_16x16_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-    vfilter_16 v4,  v5
-    vfilter_16 v5,  v6
-    vfilter_16 v6,  v7
-    vfilter_16 v7,  v8
-    vfilter_16 v8,  v9
-    vfilter_16 v9,  v10
-    vfilter_16 v10, v11
-    vfilter_16 v11, v12
-    vfilter_16 v12, v13
-    vfilter_16 v13, v14
-    vfilter_16 v14, v15
-    vfilter_16 v15, v16
-
-compute_sum_sse_16x16_b:
-    vspltish v18, 0             ;# sum
-    vspltish v19, 0             ;# sse
-    vspltish v23, 0             ;# unpack
-    li      r10, 16
-
-    compute_sum_sse_16 v0,  1
-    compute_sum_sse_16 v1,  1
-    compute_sum_sse_16 v2,  1
-    compute_sum_sse_16 v3,  1
-    compute_sum_sse_16 v4,  1
-    compute_sum_sse_16 v5,  1
-    compute_sum_sse_16 v6,  1
-    compute_sum_sse_16 v7,  1
-    compute_sum_sse_16 v8,  1
-    compute_sum_sse_16 v9,  1
-    compute_sum_sse_16 v10, 1
-    compute_sum_sse_16 v11, 1
-    compute_sum_sse_16 v12, 1
-    compute_sum_sse_16 v13, 1
-    compute_sum_sse_16 v14, 1
-    compute_sum_sse_16 v15, 0
-
-    variance_final v18, v19, v23, 8
-
-    addi    r1, r1, 32          ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .data
-
-    .align 4
-hfilter_b:
-    .byte   128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0
-    .byte   112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0
-    .byte    96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0
-    .byte    80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0
-    .byte    64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0
-    .byte    48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0
-    .byte    32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0
-    .byte    16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0
-
-    .align 4
-vfilter_b:
-    .byte   128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
-    .byte     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
-    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
-    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
-    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
-    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
-    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
-    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
-    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
-    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
-    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
-    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
-    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
-
-    .align 4
-b_hperm_b:
-    .byte     0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
-
-    .align 4
-b_0123_b:
-    .byte     0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
-
-    .align 4
-b_4567_b:
-    .byte     4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
-
-b_hilo_b:
-    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
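
The tables above are indexed by the subpel offsets: HProlog shifts xoffset left by 4 to select one 16-byte row of hfilter_b, and yoffset left by 5 to select one 32-byte (two-vector) row of vfilter_b. As a C sketch (helper names are illustrative):

    static const unsigned char *hfilter_row(const unsigned char *tab,
                                            int xoffset) {
      return tab + (xoffset << 4);   /* slwi. r5, r5, 4 */
    }

    static const unsigned char *vfilter_row(const unsigned char *tab,
                                            int yoffset) {
      return tab + (yoffset << 5);   /* slwi. r6, r6, 5 */
    }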
--- a/vp8/encoder/psnr.c
+++ /dev/null
@@ -1,30 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_scale/yv12config.h"
-#include "math.h"
-#include "vp8/common/systemdependent.h" /* for vp9_clear_system_state() */
-
-#define MAX_PSNR 100
-
-double vp9_mse2psnr(double Samples, double Peak, double Mse) {
-  double psnr;
-
-  if ((double)Mse > 0.0)
-    psnr = 10.0 * log10(Peak * Peak * Samples / Mse);
-  else
-    psnr = MAX_PSNR;      // Limit to prevent division by zero
-
-  if (psnr > MAX_PSNR)
-    psnr = MAX_PSNR;
-
-  return psnr;
-}
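
Note that Mse here is the total squared error and Samples the sample count, so Peak * Peak * Samples / Mse reduces to Peak^2 over the per-sample mean squared error. A worked call (values are illustrative):

    /* 64x64 plane, mean squared error of 25 per sample, 8-bit peak: */
    double psnr = vp9_mse2psnr(64 * 64, 255.0, 25.0 * 64 * 64);
    /* = 10 * log10(65025 / 25) = 10 * log10(2601) ~= 34.2 dB */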
--- a/vp8/encoder/psnr.h
+++ /dev/null
@@ -1,17 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_PSNR_H
-#define __INC_PSNR_H
-
-extern double vp9_mse2psnr(double Samples, double Peak, double Mse);
-
-#endif
--- a/vp8/encoder/quantize.c
+++ /dev/null
@@ -1,716 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <math.h>
-#include "vpx_mem/vpx_mem.h"
-
-#include "onyx_int.h"
-#include "quantize.h"
-#include "vp8/common/quant_common.h"
-
-#include "vp8/common/seg_common.h"
-
-#ifdef ENC_DEBUG
-extern int enc_debug;
-#endif
-
-void vp9_ht_quantize_b_4x4(BLOCK *b, BLOCKD *d, TX_TYPE tx_type) {
-  int i, rc, eob;
-  int zbin;
-  int x, y, z, sz;
-  short *zbin_boost_ptr  = b->zrun_zbin_boost;
-  short *coeff_ptr       = b->coeff;
-  short *zbin_ptr        = b->zbin;
-  short *round_ptr       = b->round;
-  short *quant_ptr       = b->quant;
-  unsigned char *quant_shift_ptr = b->quant_shift;
-  short *qcoeff_ptr      = d->qcoeff;
-  short *dqcoeff_ptr     = d->dqcoeff;
-  short *dequant_ptr     = d->dequant;
-  short zbin_oq_value    = b->zbin_extra;
-
-  const int *pt_scan;
-
-  switch (tx_type) {
-    case ADST_DCT :
-      pt_scan = vp9_row_scan;
-      break;
-
-    case DCT_ADST :
-      pt_scan = vp9_col_scan;
-      break;
-
-    default :
-      pt_scan = vp9_default_zig_zag1d;
-      break;
-  }
-
-  vpx_memset(qcoeff_ptr, 0, 32);
-  vpx_memset(dqcoeff_ptr, 0, 32);
-
-  eob = -1;
-
-  for (i = 0; i < b->eob_max_offset; i++) {
-    rc   = pt_scan[i];
-    z    = coeff_ptr[rc];
-
-    zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;
-    zbin_boost_ptr ++;
-
-    sz = (z >> 31);                                 // sign of z
-    x  = (z ^ sz) - sz;                             // x = abs(z)
-
-    if (x >= zbin) {
-      x += round_ptr[rc];
-      y  = (((x * quant_ptr[rc]) >> 16) + x)
-           >> quant_shift_ptr[rc];                // quantize (x)
-      x  = (y ^ sz) - sz;                         // get the sign back
-      qcoeff_ptr[rc]  = x;                        // write to destination
-      dqcoeff_ptr[rc] = x * dequant_ptr[rc];      // dequantized value
-
-      if (y) {
-        eob = i;                                // last nonzero coeffs
-        zbin_boost_ptr = b->zrun_zbin_boost;    // reset zero runlength
-      }
-    }
-  }
-
-  d->eob = eob + 1;
-}
-
-void vp9_regular_quantize_b_4x4(BLOCK *b, BLOCKD *d) {
-  int i, rc, eob;
-  int zbin;
-  int x, y, z, sz;
-  short *zbin_boost_ptr  = b->zrun_zbin_boost;
-  short *coeff_ptr       = b->coeff;
-  short *zbin_ptr        = b->zbin;
-  short *round_ptr       = b->round;
-  short *quant_ptr       = b->quant;
-  unsigned char *quant_shift_ptr = b->quant_shift;
-  short *qcoeff_ptr      = d->qcoeff;
-  short *dqcoeff_ptr     = d->dqcoeff;
-  short *dequant_ptr     = d->dequant;
-  short zbin_oq_value    = b->zbin_extra;
-
-  vpx_memset(qcoeff_ptr, 0, 32);
-  vpx_memset(dqcoeff_ptr, 0, 32);
-
-  eob = -1;
-
-  for (i = 0; i < b->eob_max_offset; i++) {
-    rc   = vp9_default_zig_zag1d[i];
-    z    = coeff_ptr[rc];
-
-    zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;
-    zbin_boost_ptr ++;
-
-    sz = (z >> 31);                                 // sign of z
-    x  = (z ^ sz) - sz;                             // x = abs(z)
-
-    if (x >= zbin) {
-      x += round_ptr[rc];
-
-      y  = (((x * quant_ptr[rc]) >> 16) + x)
-           >> quant_shift_ptr[rc];                // quantize (x)
-      x  = (y ^ sz) - sz;                         // get the sign back
-      qcoeff_ptr[rc]  = x;                        // write to destination
-      dqcoeff_ptr[rc] = x * dequant_ptr[rc];      // dequantized value
-
-      if (y) {
-        eob = i;                                // last nonzero coeffs
-        zbin_boost_ptr = b->zrun_zbin_boost;    // reset zero runlength
-      }
-    }
-  }
-
-  d->eob = eob + 1;
-}
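
A worked pass through the loop above, with illustrative values: z = -53, zbin = 24, round = 4, quant = 1, quant_shift = 3, dequant = 8 (this quant/shift pair is what invert_quant, further below, derives for a divisor of 8). Then sz = -1 and x = 53 >= 24, so x += 4 gives 57; y = (((57 * 1) >> 16) + 57) >> 3 = 57 >> 3 = 7; restoring the sign yields qcoeff = -7 and dqcoeff = -7 * 8 = -56. The net effect is y ~= abs(z) / dequant, applied only outside the zero bin, with the run-length zbin boost raising the bin after each zero.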
-
-void vp9_quantize_mby_4x4_c(MACROBLOCK *x) {
-  int i;
-  int has_2nd_order = x->e_mbd.mode_info_context->mbmi.mode != SPLITMV;
-
-  for (i = 0; i < 16; i++)
-    x->quantize_b_4x4(&x->block[i], &x->e_mbd.block[i]);
-
-  if (has_2nd_order)
-    x->quantize_b_4x4(&x->block[24], &x->e_mbd.block[24]);
-}
-
-void vp9_quantize_mbuv_4x4_c(MACROBLOCK *x) {
-  int i;
-
-  for (i = 16; i < 24; i++)
-    x->quantize_b_4x4(&x->block[i], &x->e_mbd.block[i]);
-}
-
-void vp9_quantize_mb_4x4_c(MACROBLOCK *x) {
-  vp9_quantize_mby_4x4_c(x);
-  vp9_quantize_mbuv_4x4_c(x);
-}
-
-void vp9_regular_quantize_b_2x2(BLOCK *b, BLOCKD *d) {
-  int i, rc, eob;
-  int zbin;
-  int x, y, z, sz;
-  short *zbin_boost_ptr = b->zrun_zbin_boost;
-  int zbin_zrun_index = 0;
-  short *coeff_ptr  = b->coeff;
-  short *zbin_ptr   = b->zbin;
-  short *round_ptr  = b->round;
-  short *quant_ptr  = b->quant;
-  unsigned char *quant_shift_ptr = b->quant_shift;
-  short *qcoeff_ptr = d->qcoeff;
-  short *dqcoeff_ptr = d->dqcoeff;
-  short *dequant_ptr = d->dequant;
-  short zbin_oq_value = b->zbin_extra;
-  // double q2nd = 4;
-  vpx_memset(qcoeff_ptr, 0, 32);
-  vpx_memset(dqcoeff_ptr, 0, 32);
-
-  eob = -1;
-
-  for (i = 0; i < b->eob_max_offset_8x8; i++) {
-    rc   = vp9_default_zig_zag1d[i];
-    z    = coeff_ptr[rc];
-
-    zbin_boost_ptr = &b->zrun_zbin_boost[zbin_zrun_index];
-    zbin_zrun_index += 4;
-    zbin = (zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value);
-
-    sz = (z >> 31);                               // sign of z
-    x  = (z ^ sz) - sz;                           // x = abs(z)
-
-    if (x >= zbin) {
-      x += (round_ptr[rc]);
-      y  = ((int)((int)(x * quant_ptr[rc]) >> 16) + x)
-           >> quant_shift_ptr[rc];                // quantize (x)
-      x  = (y ^ sz) - sz;                         // get the sign back
-      qcoeff_ptr[rc]  = x;                        // write to destination
-      dqcoeff_ptr[rc] = x * dequant_ptr[rc];      // dequantized value
-
-      if (y) {
-        eob = i;                                  // last nonzero coeffs
-        zbin_zrun_index = 0;
-      }
-    }
-  }
-
-  d->eob = eob + 1;
-}
-
-void vp9_regular_quantize_b_8x8(BLOCK *b, BLOCKD *d) {
-  int i, rc, eob;
-  int zbin;
-  int x, y, z, sz;
-  short *zbin_boost_ptr = b->zrun_zbin_boost_8x8;
-  short *coeff_ptr  = b->coeff;
-  short *zbin_ptr   = b->zbin_8x8;
-  short *round_ptr  = b->round;
-  short *quant_ptr  = b->quant;
-  unsigned char *quant_shift_ptr = b->quant_shift;
-  short *qcoeff_ptr = d->qcoeff;
-  short *dqcoeff_ptr = d->dqcoeff;
-  short *dequant_ptr = d->dequant;
-  short zbin_oq_value = b->zbin_extra;
-
-  vpx_memset(qcoeff_ptr, 0, 64 * sizeof(short));
-  vpx_memset(dqcoeff_ptr, 0, 64 * sizeof(short));
-
-  eob = -1;
-
-  for (i = 0; i < b->eob_max_offset_8x8; i++) {
-    rc   = vp9_default_zig_zag1d_8x8[i];
-    z    = coeff_ptr[rc];
-
-    zbin = (zbin_ptr[rc != 0] + *zbin_boost_ptr + zbin_oq_value);
-    zbin_boost_ptr++;
-
-    sz = (z >> 31);                               // sign of z
-    x  = (z ^ sz) - sz;                           // x = abs(z)
-
-    if (x >= zbin) {
-      x += (round_ptr[rc != 0]);
-      y  = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x))
-           >> quant_shift_ptr[rc != 0];            // quantize (x)
-      x  = (y ^ sz) - sz;                         // get the sign back
-      qcoeff_ptr[rc]  = x;                        // write to destination
-      dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0]; // dequantized value
-
-      if (y) {
-        eob = i;                                  // last nonzero coeffs
-        zbin_boost_ptr = b->zrun_zbin_boost_8x8;
-      }
-    }
-  }
-
-  d->eob = eob + 1;
-}
-
-void vp9_quantize_mby_8x8(MACROBLOCK *x) {
-  int i;
-  int has_2nd_order = x->e_mbd.mode_info_context->mbmi.mode != SPLITMV;
-
-  for (i = 0; i < 16; i ++) {
-    x->e_mbd.block[i].eob = 0;
-  }
-  x->e_mbd.block[24].eob = 0;
-  for (i = 0; i < 16; i += 4)
-    x->quantize_b_8x8(&x->block[i], &x->e_mbd.block[i]);
-
-  if (has_2nd_order)
-    x->quantize_b_2x2(&x->block[24], &x->e_mbd.block[24]);
-}
-
-void vp9_quantize_mbuv_8x8(MACROBLOCK *x) {
-  int i;
-
-  for (i = 16; i < 24; i ++)
-    x->e_mbd.block[i].eob = 0;
-  for (i = 16; i < 24; i += 4)
-    x->quantize_b_8x8(&x->block[i], &x->e_mbd.block[i]);
-}
-
-void vp9_quantize_mb_8x8(MACROBLOCK *x) {
-  vp9_quantize_mby_8x8(x);
-  vp9_quantize_mbuv_8x8(x);
-}
-
-void vp9_quantize_mby_16x16(MACROBLOCK *x) {
-  int i;
-
-  for (i = 0; i < 16; i++)
-    x->e_mbd.block[i].eob = 0;
-  x->e_mbd.block[24].eob = 0;
-  x->quantize_b_16x16(&x->block[0], &x->e_mbd.block[0]);
-}
-
-void vp9_quantize_mb_16x16(MACROBLOCK *x) {
-  vp9_quantize_mby_16x16(x);
-  vp9_quantize_mbuv_8x8(x);
-}
-
-void vp9_regular_quantize_b_16x16(BLOCK *b, BLOCKD *d) {
-  int i, rc, eob;
-  int zbin;
-  int x, y, z, sz;
-  short *zbin_boost_ptr = b->zrun_zbin_boost_16x16;
-  short *coeff_ptr  = b->coeff;
-  short *zbin_ptr   = b->zbin_16x16;
-  short *round_ptr  = b->round;
-  short *quant_ptr  = b->quant;
-  unsigned char *quant_shift_ptr = b->quant_shift;
-  short *qcoeff_ptr = d->qcoeff;
-  short *dqcoeff_ptr = d->dqcoeff;
-  short *dequant_ptr = d->dequant;
-  short zbin_oq_value = b->zbin_extra;
-
-  vpx_memset(qcoeff_ptr, 0, 256*sizeof(short));
-  vpx_memset(dqcoeff_ptr, 0, 256*sizeof(short));
-
-  eob = -1;
-  for (i = 0; i < b->eob_max_offset_16x16; i++) {
-    rc   = vp9_default_zig_zag1d_16x16[i];
-    z    = coeff_ptr[rc];
-
-    zbin = (zbin_ptr[rc!=0] + *zbin_boost_ptr + zbin_oq_value);
-    zbin_boost_ptr ++;
-
-    sz = (z >> 31);                               // sign of z
-    x  = (z ^ sz) - sz;                           // x = abs(z)
-
-    if (x >= zbin) {
-      x += (round_ptr[rc!=0]);
-      y  = ((int)(((int)(x * quant_ptr[rc!=0]) >> 16) + x))
-          >> quant_shift_ptr[rc!=0];              // quantize (x)
-      x  = (y ^ sz) - sz;                         // get the sign back
-      qcoeff_ptr[rc]  = x;                        // write to destination
-      dqcoeff_ptr[rc] = x * dequant_ptr[rc!=0];   // dequantized value
-
-      if (y) {
-        eob = i;                                  // last nonzero coeffs
-        zbin_boost_ptr = b->zrun_zbin_boost_16x16;
-      }
-    }
-  }
-
-  d->eob = eob + 1;
-}
-
-/* The quantize_b_pair function pointer in the MACROBLOCK structure is set to
- * this C function if the corresponding optimized routine is not available.
- * The NEON optimized version currently implements fast quantization for a
- * pair of blocks. */
-void vp9_regular_quantize_b_4x4_pair(BLOCK *b1, BLOCK *b2,
-                                     BLOCKD *d1, BLOCKD *d2) {
-  vp9_regular_quantize_b_4x4(b1, d1);
-  vp9_regular_quantize_b_4x4(b2, d2);
-}
-
-static void invert_quant(short *quant,
-                         unsigned char *shift, short d) {
-  unsigned t;
-  int l;
-  t = d;
-  for (l = 0; t > 1; l++)
-    t >>= 1;
-  t = 1 + (1 << (16 + l)) / d;
-  *quant = (short)(t - (1 << 16));
-  *shift = l;
-}
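
invert_quant turns the division by the dequant step into a multiply-and-shift: for a step d with top bit at position l, it stores quant = (1 << (16 + l)) / d + 1 - (1 << 16) and shift = l, so the quantizer's (((x * quant) >> 16) + x) >> shift computes approximately x / d. Worked for d = 8: the loop finds l = 3, t = 1 + (1 << 19) / 8 = 65537, hence quant = 1 and shift = 3, and the expression reduces to x >> 3 = x / 8.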
-
-void vp9_init_quantizer(VP9_COMP *cpi) {
-  int i;
-  int quant_val;
-  int Q;
-  static const int zbin_boost[16] = {  0,  0,  8, 10, 12, 14, 16, 20,
-                                      24, 28, 32, 36, 40, 44, 44, 44
-                                    };
-
-  static const int zbin_boost_8x8[64] = {  0,  0,  0,  8,  8,  8, 10, 12,
-                                          14, 16, 18, 20, 22, 24, 26, 28,
-                                          30, 32, 34, 36, 38, 40, 42, 44,
-                                          46, 48, 48, 48, 48, 48, 48, 48,
-                                          48, 48, 48, 48, 48, 48, 48, 48,
-                                          48, 48, 48, 48, 48, 48, 48, 48,
-                                          48, 48, 48, 48, 48, 48, 48, 48,
-                                          48, 48, 48, 48, 48, 48, 48, 48
-                                        };
-  static const int zbin_boost_16x16[256] = {
-     0,  0,  0,  8,  8,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28,
-    30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-  };
-  int qrounding_factor = 48;
-
-
-  for (Q = 0; Q < QINDEX_RANGE; Q++) {
-    int qzbin_factor = (vp9_dc_quant(Q, 0) < 148) ? 84 : 80;
-
-#if CONFIG_LOSSLESS
-    if (cpi->oxcf.lossless) {
-      if (Q == 0) {
-        qzbin_factor = 64;
-        qrounding_factor = 64;
-      }
-    }
-#endif
-
-    // dc values
-    quant_val = vp9_dc_quant(Q, cpi->common.y1dc_delta_q);
-    invert_quant(cpi->Y1quant[Q] + 0,
-                 cpi->Y1quant_shift[Q] + 0, quant_val);
-    cpi->Y1zbin[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
-    cpi->Y1zbin_8x8[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
-    cpi->Y1zbin_16x16[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
-    cpi->Y1round[Q][0] = (qrounding_factor * quant_val) >> 7;
-    cpi->common.Y1dequant[Q][0] = quant_val;
-    cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7;
-    cpi->zrun_zbin_boost_y1_8x8[Q][0] =
-      ((quant_val * zbin_boost_8x8[0]) + 64) >> 7;
-    cpi->zrun_zbin_boost_y1_16x16[Q][0] = ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
-
-
-    quant_val = vp9_dc2quant(Q, cpi->common.y2dc_delta_q);
-    invert_quant(cpi->Y2quant[Q] + 0,
-                 cpi->Y2quant_shift[Q] + 0, quant_val);
-    cpi->Y2zbin[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
-    cpi->Y2zbin_8x8[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
-    cpi->Y2zbin_16x16[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
-    cpi->Y2round[Q][0] = (qrounding_factor * quant_val) >> 7;
-    cpi->common.Y2dequant[Q][0] = quant_val;
-    cpi->zrun_zbin_boost_y2[Q][0] = (quant_val * zbin_boost[0]) >> 7;
-    cpi->zrun_zbin_boost_y2_8x8[Q][0] =
-      ((quant_val * zbin_boost_8x8[0]) + 64) >> 7;
-    cpi->zrun_zbin_boost_y2_16x16[Q][0] = ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
-
-    quant_val = vp9_dc_uv_quant(Q, cpi->common.uvdc_delta_q);
-    invert_quant(cpi->UVquant[Q] + 0,
-                 cpi->UVquant_shift[Q] + 0, quant_val);
-    cpi->UVzbin[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
-    cpi->UVzbin_8x8[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
-    cpi->UVzbin_16x16[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
-    cpi->UVround[Q][0] = (qrounding_factor * quant_val) >> 7;
-    cpi->common.UVdequant[Q][0] = quant_val;
-    cpi->zrun_zbin_boost_uv[Q][0] = (quant_val * zbin_boost[0]) >> 7;
-    cpi->zrun_zbin_boost_uv_8x8[Q][0] =
-      ((quant_val * zbin_boost_8x8[0]) + 64) >> 7;
-    cpi->zrun_zbin_boost_uv_16x16[Q][0] = ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
-
-    // all the 4x4 ac values
-    for (i = 1; i < 16; i++) {
-      int rc = vp9_default_zig_zag1d[i];
-
-      quant_val = vp9_ac_yquant(Q);
-      invert_quant(cpi->Y1quant[Q] + rc,
-                   cpi->Y1quant_shift[Q] + rc, quant_val);
-      cpi->Y1zbin[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
-      cpi->Y1round[Q][rc] = (qrounding_factor * quant_val) >> 7;
-      cpi->common.Y1dequant[Q][rc] = quant_val;
-      cpi->zrun_zbin_boost_y1[Q][i] =
-        ((quant_val * zbin_boost[i]) + 64) >> 7;
-
-      quant_val = vp9_ac2quant(Q, cpi->common.y2ac_delta_q);
-      invert_quant(cpi->Y2quant[Q] + rc,
-                   cpi->Y2quant_shift[Q] + rc, quant_val);
-      cpi->Y2zbin[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
-      cpi->Y2round[Q][rc] = (qrounding_factor * quant_val) >> 7;
-      cpi->common.Y2dequant[Q][rc] = quant_val;
-      cpi->zrun_zbin_boost_y2[Q][i] =
-        ((quant_val * zbin_boost[i]) + 64) >> 7;
-
-      quant_val = vp9_ac_uv_quant(Q, cpi->common.uvac_delta_q);
-      invert_quant(cpi->UVquant[Q] + rc,
-                   cpi->UVquant_shift[Q] + rc, quant_val);
-      cpi->UVzbin[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
-      cpi->UVround[Q][rc] = (qrounding_factor * quant_val) >> 7;
-      cpi->common.UVdequant[Q][rc] = quant_val;
-      cpi->zrun_zbin_boost_uv[Q][i] =
-        ((quant_val * zbin_boost[i]) + 64) >> 7;
-    }
-
-    // 8x8 structures... only zbin separated out for now.
-    // This needs cleaning up for 8x8, especially if we are to add
-    // support for non-flat Q matrices.
-    for (i = 1; i < 64; i++) {
-      int rc = vp9_default_zig_zag1d_8x8[i];
-
-      quant_val = vp9_ac_yquant(Q);
-      cpi->Y1zbin_8x8[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
-      cpi->zrun_zbin_boost_y1_8x8[Q][i] =
-        ((quant_val * zbin_boost_8x8[i]) + 64) >> 7;
-
-      quant_val = vp9_ac2quant(Q, cpi->common.y2ac_delta_q);
-      cpi->Y2zbin_8x8[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
-      cpi->zrun_zbin_boost_y2_8x8[Q][i] =
-        ((quant_val * zbin_boost_8x8[i]) + 64) >> 7;
-
-      quant_val = vp9_ac_uv_quant(Q, cpi->common.uvac_delta_q);
-      cpi->UVzbin_8x8[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
-      cpi->zrun_zbin_boost_uv_8x8[Q][i] =
-        ((quant_val * zbin_boost_8x8[i]) + 64) >> 7;
-    }
-
-    // 16x16 structures. Same comment above applies.
-    for (i = 1; i < 256; i++) {
-      int rc = vp9_default_zig_zag1d_16x16[i];
-
-      quant_val = vp9_ac_yquant(Q);
-      cpi->Y1zbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
-      cpi->zrun_zbin_boost_y1_16x16[Q][i] = ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
-
-      quant_val = vp9_ac2quant(Q, cpi->common.y2ac_delta_q);
-      cpi->Y2zbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
-      cpi->zrun_zbin_boost_y2_16x16[Q][i] = ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
-
-      quant_val = vp9_ac_uv_quant(Q, cpi->common.uvac_delta_q);
-      cpi->UVzbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
-      cpi->zrun_zbin_boost_uv_16x16[Q][i] = ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
-    }
-  }
-}
-
-void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
-  int i;
-  int QIndex;
-  MACROBLOCKD *xd = &x->e_mbd;
-  int zbin_extra;
-  int segment_id = xd->mode_info_context->mbmi.segment_id;
-
-  // Select the baseline MB Q index allowing for any segment level change.
-  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_ALT_Q)) {
-    // Abs Value
-    if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA)
-      QIndex = vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q);
-
-    // Delta Value
-    else {
-      QIndex = cpi->common.base_qindex +
-               vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q);
-
-      // Clamp to valid range
-      QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0;
-    }
-  } else
-    QIndex = cpi->common.base_qindex;
-
-  // Y
-  zbin_extra = (cpi->common.Y1dequant[QIndex][1] *
-                (cpi->zbin_over_quant +
-                 cpi->zbin_mode_boost +
-                 x->act_zbin_adj)) >> 7;
-
-  for (i = 0; i < 16; i++) {
-    x->block[i].quant = cpi->Y1quant[QIndex];
-    x->block[i].quant_shift = cpi->Y1quant_shift[QIndex];
-    x->block[i].zbin = cpi->Y1zbin[QIndex];
-    x->block[i].zbin_8x8 = cpi->Y1zbin_8x8[QIndex];
-    x->block[i].zbin_16x16 = cpi->Y1zbin_16x16[QIndex];
-    x->block[i].round = cpi->Y1round[QIndex];
-    x->e_mbd.block[i].dequant = cpi->common.Y1dequant[QIndex];
-    x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_y1[QIndex];
-    x->block[i].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_y1_8x8[QIndex];
-    x->block[i].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_y1_16x16[QIndex];
-    x->block[i].zbin_extra = (short)zbin_extra;
-
-    // Segment max eob offset feature.
-    if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB)) {
-      x->block[i].eob_max_offset =
-        vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-      x->block[i].eob_max_offset_8x8 =
-        vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-      x->block[i].eob_max_offset_16x16 =
-        vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-    } else {
-      x->block[i].eob_max_offset = 16;
-      x->block[i].eob_max_offset_8x8 = 64;
-      x->block[i].eob_max_offset_16x16 = 256;
-    }
-  }
-
-  // UV
-  zbin_extra = (cpi->common.UVdequant[QIndex][1] *
-                (cpi->zbin_over_quant +
-                 cpi->zbin_mode_boost +
-                 x->act_zbin_adj)) >> 7;
-
-  for (i = 16; i < 24; i++) {
-    x->block[i].quant = cpi->UVquant[QIndex];
-    x->block[i].quant_shift = cpi->UVquant_shift[QIndex];
-    x->block[i].zbin = cpi->UVzbin[QIndex];
-    x->block[i].zbin_8x8 = cpi->UVzbin_8x8[QIndex];
-    x->block[i].zbin_16x16 = cpi->UVzbin_16x16[QIndex];
-    x->block[i].round = cpi->UVround[QIndex];
-    x->e_mbd.block[i].dequant = cpi->common.UVdequant[QIndex];
-    x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_uv[QIndex];
-    x->block[i].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_uv_8x8[QIndex];
-    x->block[i].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_uv_16x16[QIndex];
-
-    x->block[i].zbin_extra = (short)zbin_extra;
-
-    // Segment max eob offset feature.
-    if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB)) {
-      x->block[i].eob_max_offset =
-        vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-      x->block[i].eob_max_offset_8x8 =
-        vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-    } else {
-      x->block[i].eob_max_offset = 16;
-      x->block[i].eob_max_offset_8x8 = 64;
-    }
-  }
-
-  // Y2
-  zbin_extra = (cpi->common.Y2dequant[QIndex][1] *
-                ((cpi->zbin_over_quant / 2) +
-                 cpi->zbin_mode_boost +
-                 x->act_zbin_adj)) >> 7;
-
-  x->block[24].quant = cpi->Y2quant[QIndex];
-  x->block[24].quant_shift = cpi->Y2quant_shift[QIndex];
-  x->block[24].zbin = cpi->Y2zbin[QIndex];
-  x->block[24].zbin_8x8 = cpi->Y2zbin_8x8[QIndex];
-  x->block[24].zbin_16x16 = cpi->Y2zbin_16x16[QIndex];
-  x->block[24].round = cpi->Y2round[QIndex];
-  x->e_mbd.block[24].dequant = cpi->common.Y2dequant[QIndex];
-  x->block[24].zrun_zbin_boost = cpi->zrun_zbin_boost_y2[QIndex];
-  x->block[24].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_y2_8x8[QIndex];
-  x->block[24].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_y2_16x16[QIndex];
-  x->block[24].zbin_extra = (short)zbin_extra;
-
-  // TBD: perhaps do not use for Y2
-  // Segment max eob offset feature.
-  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB)) {
-    x->block[24].eob_max_offset =
-      vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-    x->block[24].eob_max_offset_8x8 =
-      vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-  } else {
-    x->block[24].eob_max_offset = 16;
-    x->block[24].eob_max_offset_8x8 = 4;
-  }
-
-  /* save this macroblock QIndex for vp9_update_zbin_extra() */
-  x->e_mbd.q_index = QIndex;
-}
-
-void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x) {
-  int i;
-  int QIndex = x->e_mbd.q_index;
-  int zbin_extra;
-
-  // Y
-  zbin_extra = (cpi->common.Y1dequant[QIndex][1] *
-                (cpi->zbin_over_quant +
-                 cpi->zbin_mode_boost +
-                 x->act_zbin_adj)) >> 7;
-  for (i = 0; i < 16; i++) {
-    x->block[i].zbin_extra = (short)zbin_extra;
-  }
-
-  // UV
-  zbin_extra = (cpi->common.UVdequant[QIndex][1] *
-                (cpi->zbin_over_quant +
-                 cpi->zbin_mode_boost +
-                 x->act_zbin_adj)) >> 7;
-
-  for (i = 16; i < 24; i++) {
-    x->block[i].zbin_extra = (short)zbin_extra;
-  }
-
-  // Y2
-  zbin_extra = (cpi->common.Y2dequant[QIndex][1] *
-                ((cpi->zbin_over_quant / 2) +
-                 cpi->zbin_mode_boost +
-                 x->act_zbin_adj)) >> 7;
-
-  x->block[24].zbin_extra = (short)zbin_extra;
-}
-
-void vp9_frame_init_quantizer(VP9_COMP *cpi) {
-  // Clear Zbin mode boost for default case
-  cpi->zbin_mode_boost = 0;
-
-  // MB level quantizer setup
-  vp9_mb_init_quantizer(cpi, &cpi->mb);
-}
-
-void vp9_set_quantizer(struct VP9_COMP *cpi, int Q) {
-  VP9_COMMON *cm = &cpi->common;
-
-  cm->base_qindex = Q;
-
-  // If any of the delta_q values change, the update flag will
-  // have to be set.
-  cm->y1dc_delta_q = 0;
-  cm->y2ac_delta_q = 0;
-  cm->uvdc_delta_q = 0;
-  cm->uvac_delta_q = 0;
-  cm->y2dc_delta_q = 0;
-
-  // The quantizer has to be reinitialized if any delta_q changes.
-  // As none change here for now, this code is inactive.
-  // if(update)
-  //    vp9_init_quantizer(cpi);
-}
--- a/vp8/encoder/quantize.h
+++ /dev/null
@@ -1,97 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef __INC_QUANTIZE_H
-#define __INC_QUANTIZE_H
-
-#include "block.h"
-
-#define prototype_quantize_block(sym) \
-  void (sym)(BLOCK *b,BLOCKD *d)
-
-#define prototype_quantize_block_pair(sym) \
-  void (sym)(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2)
-
-#define prototype_quantize_mb(sym) \
-  void (sym)(MACROBLOCK *x)
-
-#if ARCH_X86 || ARCH_X86_64
-#include "x86/quantize_x86.h"
-#endif
-
-#if ARCH_ARM
-#include "arm/quantize_arm.h"
-#endif
-
-#define prototype_quantize_block_type(sym) \
-  void (sym)(BLOCK *b, BLOCKD *d, TX_TYPE type)
-extern prototype_quantize_block_type(vp9_ht_quantize_b_4x4);
-
-#ifndef vp9_quantize_quantb_4x4
-#define vp9_quantize_quantb_4x4 vp9_regular_quantize_b_4x4
-#endif
-extern prototype_quantize_block(vp9_quantize_quantb_4x4);
-
-#ifndef vp9_quantize_quantb_4x4_pair
-#define vp9_quantize_quantb_4x4_pair vp9_regular_quantize_b_4x4_pair
-#endif
-extern prototype_quantize_block_pair(vp9_quantize_quantb_4x4_pair);
-
-#ifndef vp9_quantize_quantb_8x8
-#define vp9_quantize_quantb_8x8 vp9_regular_quantize_b_8x8
-#endif
-extern prototype_quantize_block(vp9_quantize_quantb_8x8);
-
-#ifndef vp9_quantize_quantb_16x16
-#define vp9_quantize_quantb_16x16 vp9_regular_quantize_b_16x16
-#endif
-extern prototype_quantize_block(vp9_quantize_quantb_16x16);
-
-#ifndef vp9_quantize_quantb_2x2
-#define vp9_quantize_quantb_2x2 vp9_regular_quantize_b_2x2
-#endif
-extern prototype_quantize_block(vp9_quantize_quantb_2x2);
-
-#ifndef vp9_quantize_mb_4x4
-#define vp9_quantize_mb_4x4 vp9_quantize_mb_4x4_c
-#endif
-extern prototype_quantize_mb(vp9_quantize_mb_4x4);
-void vp9_quantize_mb_8x8(MACROBLOCK *x);
-
-#ifndef vp9_quantize_mbuv_4x4
-#define vp9_quantize_mbuv_4x4 vp9_quantize_mbuv_4x4_c
-#endif
-extern prototype_quantize_mb(vp9_quantize_mbuv_4x4);
-
-#ifndef vp9_quantize_mby_4x4
-#define vp9_quantize_mby_4x4 vp9_quantize_mby_4x4_c
-#endif
-extern prototype_quantize_mb(vp9_quantize_mby_4x4);
-
-extern prototype_quantize_mb(vp9_quantize_mby_8x8);
-extern prototype_quantize_mb(vp9_quantize_mbuv_8x8);
-
-void vp9_quantize_mb_16x16(MACROBLOCK *x);
-extern prototype_quantize_block(vp9_quantize_quantb_16x16);
-extern prototype_quantize_mb(vp9_quantize_mby_16x16);
-
-struct VP9_COMP;
-
-extern void vp9_set_quantizer(struct VP9_COMP *cpi, int Q);
-
-extern void vp9_frame_init_quantizer(struct VP9_COMP *cpi);
-
-extern void vp9_update_zbin_extra(struct VP9_COMP *cpi, MACROBLOCK *x);
-
-extern void vp9_mb_init_quantizer(struct VP9_COMP *cpi, MACROBLOCK *x);
-
-extern void vp9_init_quantizer(struct VP9_COMP *cpi);
-
-#endif
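
The prototype_* macros above expand to plain declarations; for example, the line

    extern prototype_quantize_block(vp9_quantize_quantb_4x4);

becomes

    extern void (vp9_quantize_quantb_4x4)(BLOCK *b, BLOCKD *d);

and, via the #ifndef default above, actually declares vp9_regular_quantize_b_4x4 unless an architecture header overrides the name.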
--- a/vp8/encoder/ratectrl.c
+++ /dev/null
@@ -1,698 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <limits.h>
-#include <assert.h>
-
-#include "math.h"
-#include "vp8/common/alloccommon.h"
-#include "vp8/common/common.h"
-#include "ratectrl.h"
-#include "vp8/common/entropymode.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp8/common/systemdependent.h"
-#include "encodemv.h"
-#include "vp8/common/quant_common.h"
-
-#define MIN_BPB_FACTOR          0.005
-#define MAX_BPB_FACTOR          50
-
-#ifdef MODE_STATS
-extern unsigned int y_modes[VP9_YMODES];
-extern unsigned int uv_modes[VP9_UV_MODES];
-extern unsigned int b_modes[B_MODE_COUNT];
-
-extern unsigned int inter_y_modes[MB_MODE_COUNT];
-extern unsigned int inter_uv_modes[VP9_UV_MODES];
-extern unsigned int inter_b_modes[B_MODE_COUNT];
-#endif
-
-// Bits Per MB at different Q (Multiplied by 512)
-#define BPER_MB_NORMBITS    9
-
-// % adjustment to target kf size based on separation from previous frame
-static const int kf_boost_seperation_adjustment[16] = {
-  30,   40,   50,   55,   60,   65,   70,   75,
-  80,   85,   90,   95,  100,  100,  100,  100,
-};
-
-static const int gf_adjust_table[101] = {
-  100,
-  115, 130, 145, 160, 175, 190, 200, 210, 220, 230,
-  240, 260, 270, 280, 290, 300, 310, 320, 330, 340,
-  350, 360, 370, 380, 390, 400, 400, 400, 400, 400,
-  400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
-  400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
-  400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
-  400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
-  400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
-  400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
-  400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
-};
-
-static const int gf_intra_usage_adjustment[20] = {
-  125, 120, 115, 110, 105, 100,  95,  85,  80,  75,
-  70,  65,  60,  55,  50,  50,  50,  50,  50,  50,
-};
-
-static const int gf_interval_table[101] = {
-  7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-  8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
-};
-
-static const unsigned int prior_key_frame_weight[KEY_FRAME_CONTEXT] = { 1, 2, 3, 4, 5 };
-
-// These functions use formulaic calculations to make playing with the
-// quantizer tables easier. If necessary they can be replaced by lookup
-// tables if and when things settle down in the experimental bitstream.
-double vp9_convert_qindex_to_q(int qindex) {
-  // Convert the index to a real Q value (scaled down to match old Q values)
-  return (double)vp9_ac_yquant(qindex) / 4.0;
-}
-
-int vp9_gfboost_qadjust(int qindex) {
-  int retval;
-  double q;
-
-  q = vp9_convert_qindex_to_q(qindex);
-  retval = (int)((0.00000828 * q * q * q) +
-                 (-0.0055 * q * q) +
-                 (1.32 * q) + 79.3);
-  return retval;
-}
-
-static int kfboost_qadjust(int qindex) {
-  int retval;
-  double q;
-
-  q = vp9_convert_qindex_to_q(qindex);
-  retval = (int)((0.00000973 * q * q * q) +
-                 (-0.00613 * q * q) +
-                 (1.316 * q) + 121.2);
-  return retval;
-}
-
-int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex) {
-  if (frame_type == KEY_FRAME)
-    return (int)(4500000 / vp9_convert_qindex_to_q(qindex));
-  else
-    return (int)(2850000 / vp9_convert_qindex_to_q(qindex));
-}
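
A worked reading of the two constants above: at the same q, a key frame is budgeted 4500000 / 2850000 ~= 1.58 times the per-macroblock bits of an inter frame; e.g. at q = 30, 150000 versus 95000 in the function's (pre-normalization) units.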
-
-
-void vp9_save_coding_context(VP9_COMP *cpi) {
-  CODING_CONTEXT *const cc = &cpi->coding_context;
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &cpi->mb.e_mbd;
-
-  // Stores a snapshot of key state variables which can subsequently be
-  // restored with a call to vp9_restore_coding_context. These functions are
-  // intended for use in a re-code loop in vp9_compress_frame where the
-  // quantizer value is adjusted between loop iterations.
-
-  cc->nmvc = cm->fc.nmvc;
-  vp9_copy(cc->nmvjointcost,  cpi->mb.nmvjointcost);
-  vp9_copy(cc->nmvcosts,  cpi->mb.nmvcosts);
-  vp9_copy(cc->nmvcosts_hp,  cpi->mb.nmvcosts_hp);
-
-  vp9_copy(cc->mv_ref_ct, cm->fc.mv_ref_ct);
-  vp9_copy(cc->mode_context, cm->fc.mode_context);
-  vp9_copy(cc->mv_ref_ct_a, cm->fc.mv_ref_ct_a);
-  vp9_copy(cc->mode_context_a, cm->fc.mode_context_a);
-
-  vp9_copy(cc->ymode_prob, cm->fc.ymode_prob);
-  vp9_copy(cc->bmode_prob, cm->fc.bmode_prob);
-  vp9_copy(cc->uv_mode_prob, cm->fc.uv_mode_prob);
-  vp9_copy(cc->i8x8_mode_prob, cm->fc.i8x8_mode_prob);
-  vp9_copy(cc->sub_mv_ref_prob, cm->fc.sub_mv_ref_prob);
-  vp9_copy(cc->mbsplit_prob, cm->fc.mbsplit_prob);
-
-  // Stats
-#ifdef MODE_STATS
-  vp9_copy(cc->y_modes,       y_modes);
-  vp9_copy(cc->uv_modes,      uv_modes);
-  vp9_copy(cc->b_modes,       b_modes);
-  vp9_copy(cc->inter_y_modes,  inter_y_modes);
-  vp9_copy(cc->inter_uv_modes, inter_uv_modes);
-  vp9_copy(cc->inter_b_modes,  inter_b_modes);
-#endif
-
-  vp9_copy(cc->segment_pred_probs, cm->segment_pred_probs);
-  vp9_copy(cc->ref_pred_probs_update, cpi->ref_pred_probs_update);
-  vp9_copy(cc->ref_pred_probs, cm->ref_pred_probs);
-  vp9_copy(cc->prob_comppred, cm->prob_comppred);
-
-  vpx_memcpy(cpi->coding_context.last_frame_seg_map_copy,
-             cm->last_frame_seg_map, (cm->mb_rows * cm->mb_cols));
-
-  vp9_copy(cc->last_ref_lf_deltas, xd->last_ref_lf_deltas);
-  vp9_copy(cc->last_mode_lf_deltas, xd->last_mode_lf_deltas);
-
-  vp9_copy(cc->coef_probs, cm->fc.coef_probs);
-  vp9_copy(cc->hybrid_coef_probs, cm->fc.hybrid_coef_probs);
-  vp9_copy(cc->coef_probs_8x8, cm->fc.coef_probs_8x8);
-  vp9_copy(cc->hybrid_coef_probs_8x8, cm->fc.hybrid_coef_probs_8x8);
-  vp9_copy(cc->coef_probs_16x16, cm->fc.coef_probs_16x16);
-  vp9_copy(cc->hybrid_coef_probs_16x16, cm->fc.hybrid_coef_probs_16x16);
-  vp9_copy(cc->switchable_interp_prob, cm->fc.switchable_interp_prob);
-}
-
-void vp9_restore_coding_context(VP9_COMP *cpi) {
-  CODING_CONTEXT *const cc = &cpi->coding_context;
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &cpi->mb.e_mbd;
-
-  // Restore key state variables to the snapshot state stored in the
-  // previous call to vp9_save_coding_context.
-
-  cm->fc.nmvc = cc->nmvc;
-  vp9_copy(cpi->mb.nmvjointcost, cc->nmvjointcost);
-  vp9_copy(cpi->mb.nmvcosts, cc->nmvcosts);
-  vp9_copy(cpi->mb.nmvcosts_hp, cc->nmvcosts_hp);
-
-  vp9_copy(cm->fc.mv_ref_ct, cc->mv_ref_ct);
-  vp9_copy(cm->fc.mode_context, cc->mode_context);
-  vp9_copy(cm->fc.mv_ref_ct_a, cc->mv_ref_ct_a);
-  vp9_copy(cm->fc.mode_context_a, cc->mode_context_a);
-
-  vp9_copy(cm->fc.ymode_prob, cc->ymode_prob);
-  vp9_copy(cm->fc.bmode_prob, cc->bmode_prob);
-  vp9_copy(cm->fc.i8x8_mode_prob, cc->i8x8_mode_prob);
-  vp9_copy(cm->fc.uv_mode_prob, cc->uv_mode_prob);
-  vp9_copy(cm->fc.sub_mv_ref_prob, cc->sub_mv_ref_prob);
-  vp9_copy(cm->fc.mbsplit_prob, cc->mbsplit_prob);
-
-  // Stats
-#ifdef MODE_STATS
-  vp9_copy(y_modes, cc->y_modes);
-  vp9_copy(uv_modes, cc->uv_modes);
-  vp9_copy(b_modes, cc->b_modes);
-  vp9_copy(inter_y_modes, cc->inter_y_modes);
-  vp9_copy(inter_uv_modes, cc->inter_uv_modes);
-  vp9_copy(inter_b_modes, cc->inter_b_modes);
-#endif
-
-  vp9_copy(cm->segment_pred_probs, cc->segment_pred_probs);
-  vp9_copy(cpi->ref_pred_probs_update, cc->ref_pred_probs_update);
-  vp9_copy(cm->ref_pred_probs, cc->ref_pred_probs);
-  vp9_copy(cm->prob_comppred, cc->prob_comppred);
-
-  vpx_memcpy(cm->last_frame_seg_map,
-             cpi->coding_context.last_frame_seg_map_copy,
-             (cm->mb_rows * cm->mb_cols));
-
-  vp9_copy(xd->last_ref_lf_deltas, cc->last_ref_lf_deltas);
-  vp9_copy(xd->last_mode_lf_deltas, cc->last_mode_lf_deltas);
-
-  vp9_copy(cm->fc.coef_probs, cc->coef_probs);
-  vp9_copy(cm->fc.hybrid_coef_probs, cc->hybrid_coef_probs);
-  vp9_copy(cm->fc.coef_probs_8x8, cc->coef_probs_8x8);
-  vp9_copy(cm->fc.hybrid_coef_probs_8x8, cc->hybrid_coef_probs_8x8);
-  vp9_copy(cm->fc.coef_probs_16x16, cc->coef_probs_16x16);
-  vp9_copy(cm->fc.hybrid_coef_probs_16x16, cc->hybrid_coef_probs_16x16);
-  vp9_copy(cm->fc.switchable_interp_prob, cc->switchable_interp_prob);
-}
-
-
-void vp9_setup_key_frame(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-  // Setup for Key frame:
-  vp9_default_coef_probs(&cpi->common);
-  vp9_kf_default_bmode_probs(cpi->common.kf_bmode_prob);
-  vp9_init_mbmode_probs(&cpi->common);
-  vp9_default_bmode_probs(cm->fc.bmode_prob);
-
-  vp9_init_mv_probs(&cpi->common);
-
-  // cpi->common.filter_level = 0;      // Reset every key frame.
-  cpi->common.filter_level = cpi->common.base_qindex * 3 / 8;
-
-  // interval before next GF
-  cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
-
-  cpi->common.refresh_golden_frame = TRUE;
-  cpi->common.refresh_alt_ref_frame = TRUE;
-
-  vp9_init_mode_contexts(&cpi->common);
-  vpx_memcpy(&cpi->common.lfc, &cpi->common.fc, sizeof(cpi->common.fc));
-  vpx_memcpy(&cpi->common.lfc_a, &cpi->common.fc, sizeof(cpi->common.fc));
-
-  vpx_memset(cm->prev_mip, 0,
-             (cm->mb_cols + 1) * (cm->mb_rows + 1) * sizeof(MODE_INFO));
-  vpx_memset(cm->mip, 0,
-             (cm->mb_cols + 1) * (cm->mb_rows + 1) * sizeof(MODE_INFO));
-
-  vp9_update_mode_info_border(cm, cm->mip);
-  vp9_update_mode_info_in_image(cm, cm->mi);
-}
-
-void vp9_setup_inter_frame(VP9_COMP *cpi) {
-  if (cpi->common.refresh_alt_ref_frame) {
-    vpx_memcpy(&cpi->common.fc,
-               &cpi->common.lfc_a,
-               sizeof(cpi->common.fc));
-    vpx_memcpy(cpi->common.fc.vp8_mode_contexts,
-               cpi->common.fc.mode_context_a,
-               sizeof(cpi->common.fc.vp8_mode_contexts));
-  } else {
-    vpx_memcpy(&cpi->common.fc,
-               &cpi->common.lfc,
-               sizeof(cpi->common.fc));
-    vpx_memcpy(cpi->common.fc.vp8_mode_contexts,
-               cpi->common.fc.mode_context,
-               sizeof(cpi->common.fc.vp8_mode_contexts));
-  }
-}
-
-
-static int estimate_bits_at_q(int frame_kind, int Q, int MBs,
-                              double correction_factor) {
-  int Bpm = (int)(.5 + correction_factor * vp9_bits_per_mb(frame_kind, Q));
-
-  /* Attempt to retain reasonable accuracy without overflow. The cutoff is
-   * chosen such that the maximum product of Bpm and MBs fits 31 bits. The
-   * largest Bpm takes 20 bits.
-   */
-  if (MBs > (1 << 11))
-    return (Bpm >> BPER_MB_NORMBITS) * MBs;
-  else
-    return (Bpm * MBs) >> BPER_MB_NORMBITS;
-}
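
The branch above is purely an overflow guard: for frames larger than 2048 MBs, the normalization shift is applied to Bpm before the multiply, trading a little per-MB precision for a product that stays within 31 bits. A self-contained sketch of the same guard, assuming BPER_MB_NORMBITS is 9 (its real value is defined elsewhere in the encoder):

#define BPER_MB_NORMBITS 9  /* assumed value for this sketch */

/* Same shape as the guard in estimate_bits_at_q(). */
static int scaled_bits(int Bpm, int MBs) {
  if (MBs > (1 << 11))
    return (Bpm >> BPER_MB_NORMBITS) * MBs;   /* coarse but overflow-safe */
  else
    return (Bpm * MBs) >> BPER_MB_NORMBITS;   /* full precision */
}
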
-
-
-static void calc_iframe_target_size(VP9_COMP *cpi) {
-  // boost defaults to half second
-  int target;
-
-  // Clear down mmx registers to allow floating point in what follows
-  vp9_clear_system_state();  // __asm emms;
-
-  // New Two pass RC
-  target = cpi->per_frame_bandwidth;
-
-  if (cpi->oxcf.rc_max_intra_bitrate_pct) {
-    unsigned int max_rate = cpi->per_frame_bandwidth
-                            * cpi->oxcf.rc_max_intra_bitrate_pct / 100;
-
-    if (target > max_rate)
-      target = max_rate;
-  }
-
-  cpi->this_frame_target = target;
-
-}
-
-
-//  Do the best we can to define the parameters for the next GF based
-//  on what information we have available.
-//
-//  In this experimental code only two pass is supported
-//  so we just use the interval determined in the two pass code.
-static void calc_gf_params(VP9_COMP *cpi) {
-  // Set the gf interval
-  cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
-}
-
-
-static void calc_pframe_target_size(VP9_COMP *cpi) {
-  int min_frame_target;
-
-  min_frame_target = cpi->min_frame_bandwidth;
-
-  if (min_frame_target < (cpi->av_per_frame_bandwidth >> 5))
-    min_frame_target = cpi->av_per_frame_bandwidth >> 5;
-
-  // Special alt reference frame case
-  if (cpi->common.refresh_alt_ref_frame) {
-    // Per frame bit target for the alt ref frame
-    cpi->per_frame_bandwidth = cpi->twopass.gf_bits;
-    cpi->this_frame_target = cpi->per_frame_bandwidth;
-  }
-
-  // Normal frames (gf and inter)
-  else {
-    cpi->this_frame_target = cpi->per_frame_bandwidth;
-  }
-
-  // Sanity check that the total sum of adjustments is not above the maximum
-  // allowed. That is, having allowed for KF and GF penalties, we have not
-  // pushed the current interframe target too low. If the adjustment we apply
-  // here cannot recover all the extra bits spent in the KF or GF, the
-  // remainder will have to be recovered over a longer time span via other
-  // buffer / rate control mechanisms.
-  if (cpi->this_frame_target < min_frame_target)
-    cpi->this_frame_target = min_frame_target;
-
-  if (!cpi->common.refresh_alt_ref_frame)
-    // Note the baseline target data rate for this inter frame.
-    cpi->inter_frame_target = cpi->this_frame_target;
-
-  // Adjust target frame size for Golden Frames:
-  if (cpi->frames_till_gf_update_due == 0) {
-    // int Boost = 0;
-    int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
-
-    cpi->common.refresh_golden_frame = TRUE;
-
-    calc_gf_params(cpi);
-
-    // If we are using an alternate ref instead of a gf then do not apply the
-    // boost here; it will instead be applied to the altref update.
-    // Jim's modified boost.
-    if (!cpi->source_alt_ref_active) {
-      if (cpi->oxcf.fixed_q < 0) {
-        // The spend on the GF is defined in the two pass code
-        // for two pass encodes
-        cpi->this_frame_target = cpi->per_frame_bandwidth;
-      } else
-        cpi->this_frame_target =
-          (estimate_bits_at_q(1, Q, cpi->common.MBs, 1.0)
-           * cpi->last_boost) / 100;
-
-    }
-    // If there is an active ARF at this location use the minimum
-    // bits on this frame even if it is a constructed arf.
-    // The active maximum quantizer ensures that an appropriate
-    // number of bits will be spent if needed for constructed ARFs.
-    else {
-      cpi->this_frame_target = 0;
-    }
-
-    cpi->current_gf_interval = cpi->frames_till_gf_update_due;
-  }
-}
-
-
-void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
-  int    Q = cpi->common.base_qindex;
-  int    correction_factor = 100;
-  double rate_correction_factor;
-  double adjustment_limit;
-
-  int    projected_size_based_on_q = 0;
-
-  // Clear down mmx registers to allow floating point in what follows
-  vp9_clear_system_state();  // __asm emms;
-
-  if (cpi->common.frame_type == KEY_FRAME) {
-    rate_correction_factor = cpi->key_frame_rate_correction_factor;
-  } else {
-    if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
-      rate_correction_factor = cpi->gf_rate_correction_factor;
-    else
-      rate_correction_factor = cpi->rate_correction_factor;
-  }
-
-  // Work out how big we would have expected the frame to be at this Q given
-  // the current correction factor. Stay in double to avoid int overflow when
-  // values are large.
-  projected_size_based_on_q =
-    (int)(((.5 + rate_correction_factor *
-            vp9_bits_per_mb(cpi->common.frame_type, Q)) *
-           cpi->common.MBs) / (1 << BPER_MB_NORMBITS));
-
-  // Make some allowance for cpi->zbin_over_quant
-  if (cpi->zbin_over_quant > 0) {
-    int Z = cpi->zbin_over_quant;
-    double Factor = 0.99;
-    double factor_adjustment = 0.01 / 256.0; // (double)ZBIN_OQ_MAX;
-
-    while (Z > 0) {
-      Z--;
-      projected_size_based_on_q =
-        (int)(Factor * projected_size_based_on_q);
-      Factor += factor_adjustment;
-
-      if (Factor >= 0.999)
-        Factor = 0.999;
-    }
-  }
-
-  // Work out a size correction factor.
-  // if ( cpi->this_frame_target > 0 )
-  //  correction_factor = (100 * cpi->projected_frame_size) / cpi->this_frame_target;
-  if (projected_size_based_on_q > 0)
-    correction_factor = (100 * cpi->projected_frame_size) / projected_size_based_on_q;
-
-  // A more heavily damped adjustment is used if we have been oscillating
-  // either side of the target.
-  switch (damp_var) {
-    case 0:
-      adjustment_limit = 0.75;
-      break;
-    case 1:
-      adjustment_limit = 0.375;
-      break;
-    case 2:
-    default:
-      adjustment_limit = 0.25;
-      break;
-  }
-
-  // if ( (correction_factor > 102) && (Q < cpi->active_worst_quality) )
-  if (correction_factor > 102) {
-    // We are not already at the worst allowable quality
-    correction_factor = (int)(100.5 + ((correction_factor - 100) * adjustment_limit));
-    rate_correction_factor = ((rate_correction_factor * correction_factor) / 100);
-
-    // Keep rate_correction_factor within limits
-    if (rate_correction_factor > MAX_BPB_FACTOR)
-      rate_correction_factor = MAX_BPB_FACTOR;
-  }
-  // else if ( (correction_factor < 99) && (Q > cpi->active_best_quality) )
-  else if (correction_factor < 99) {
-    // We are not already at the best allowable quality
-    correction_factor = (int)(100.5 - ((100 - correction_factor) * adjustment_limit));
-    rate_correction_factor = ((rate_correction_factor * correction_factor) / 100);
-
-    // Keep rate_correction_factor within limits
-    if (rate_correction_factor < MIN_BPB_FACTOR)
-      rate_correction_factor = MIN_BPB_FACTOR;
-  }
-
-  if (cpi->common.frame_type == KEY_FRAME)
-    cpi->key_frame_rate_correction_factor = rate_correction_factor;
-  else {
-    if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
-      cpi->gf_rate_correction_factor = rate_correction_factor;
-    else
-      cpi->rate_correction_factor = rate_correction_factor;
-  }
-}
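
As a worked example of the damping above: a frame that lands 20% over target (correction_factor == 120) with damp_var == 0 is pulled back by the 0.75 limit, so the running factor grows by 15% rather than the full 20%. A minimal sketch with illustrative values:

static double damped_update_example(void) {
  double rate_correction_factor = 1.0;  /* illustrative running factor */
  int correction_factor = 120;          /* frame landed 20% over target */
  double adjustment_limit = 0.75;       /* damp_var == 0 */

  correction_factor =
    (int)(100.5 + (correction_factor - 100) * adjustment_limit);  /* 115 */
  return rate_correction_factor * correction_factor / 100;        /* 1.15 */
}
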
-
-
-int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame) {
-  int Q = cpi->active_worst_quality;
-
-  int i;
-  int last_error = INT_MAX;
-  int target_bits_per_mb;
-  int bits_per_mb_at_this_q;
-  double correction_factor;
-
-  // Reset Zbin OQ value
-  cpi->zbin_over_quant = 0;
-
-  // Select the appropriate correction factor based upon type of frame.
-  if (cpi->common.frame_type == KEY_FRAME)
-    correction_factor = cpi->key_frame_rate_correction_factor;
-  else {
-    if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
-      correction_factor = cpi->gf_rate_correction_factor;
-    else
-      correction_factor = cpi->rate_correction_factor;
-  }
-
-  // Calculate the required scaling factor based on the target frame size and
-  // the size of the frame produced using the previous Q.
-  if (target_bits_per_frame >= (INT_MAX >> BPER_MB_NORMBITS))
-    // Case where we would overflow int
-    target_bits_per_mb =
-      (target_bits_per_frame / cpi->common.MBs) << BPER_MB_NORMBITS;
-  else
-    target_bits_per_mb =
-      (target_bits_per_frame << BPER_MB_NORMBITS) / cpi->common.MBs;
-
-  i = cpi->active_best_quality;
-
-  do {
-    bits_per_mb_at_this_q =
-      (int)(.5 + correction_factor *
-            vp9_bits_per_mb(cpi->common.frame_type, i));
-
-    if (bits_per_mb_at_this_q <= target_bits_per_mb) {
-      if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error)
-        Q = i;
-      else
-        Q = i - 1;
-
-      break;
-    } else
-      last_error = bits_per_mb_at_this_q - target_bits_per_mb;
-  } while (++i <= cpi->active_worst_quality);
-
-  // If we are at MAXQ then enable Q over-run, which seeks to claw back
-  // additional bits through things like the RD multiplier and zero bin size.
-  if (Q >= MAXQ) {
-    int zbin_oqmax;
-
-    double Factor = 0.99;
-    double factor_adjustment = 0.01 / 256.0; // (double)ZBIN_OQ_MAX;
-
-    if (cpi->common.frame_type == KEY_FRAME)
-      zbin_oqmax = 0; // ZBIN_OQ_MAX/16
-    else if (cpi->common.refresh_alt_ref_frame || (cpi->common.refresh_golden_frame && !cpi->source_alt_ref_active))
-      zbin_oqmax = 16;
-    else
-      zbin_oqmax = ZBIN_OQ_MAX;
-
-    // Each increment in the zbin is assumed to have a fixed effect on
-    // bitrate. This is of course not true: the effect will be highly clip
-    // dependent and may well have sudden steps. The idea here is to achieve
-    // higher effective quantizers than the normal maximum by expanding the
-    // zero bin and hence decreasing the number of low magnitude non-zero
-    // coefficients.
-    while (cpi->zbin_over_quant < zbin_oqmax) {
-      cpi->zbin_over_quant++;
-
-      if (cpi->zbin_over_quant > zbin_oqmax)
-        cpi->zbin_over_quant = zbin_oqmax;
-
-      // Adjust bits_per_mb_at_this_q estimate
-      bits_per_mb_at_this_q = (int)(Factor * bits_per_mb_at_this_q);
-      Factor += factor_adjustment;
-
-      if (Factor >= 0.999)
-        Factor = 0.999;
-
-      // Break out if we get down to the target rate.
-      if (bits_per_mb_at_this_q <= target_bits_per_mb)
-        break;
-    }
-
-  }
-
-  return Q;
-}
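
Stripped of the zbin over-run handling, the search above is a linear scan from the best (lowest) to the worst (highest) allowed quality, stopping at the first Q whose projected bits/MB drops to the target and stepping back one if the previous Q was closer. A condensed sketch, with bits_at() as a hypothetical stand-in for the corrected vp9_bits_per_mb() estimate:

#include <limits.h>

static int pick_q_example(int best_q, int worst_q, int target_bits_per_mb,
                          int (*bits_at)(int)) {
  int q = worst_q;            /* fall back to worst quality if never hit */
  int last_error = INT_MAX;
  int i;

  for (i = best_q; i <= worst_q; i++) {
    int bits = bits_at(i);
    if (bits <= target_bits_per_mb) {
      /* Pick whichever of i and i - 1 projects closer to the target. */
      q = (target_bits_per_mb - bits <= last_error) ? i : i - 1;
      break;
    }
    last_error = bits - target_bits_per_mb;
  }
  return q;
}
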
-
-
-static int estimate_keyframe_frequency(VP9_COMP *cpi) {
-  int i;
-
-  // Average key frame frequency
-  int av_key_frame_frequency = 0;
-
-  /* First key frame at start of sequence is a special case. We have no
-   * frequency data.
-   */
-  if (cpi->key_frame_count == 1) {
-    /* Assume a default of 1 kf every 2 seconds, or the max kf interval,
-     * whichever is smaller.
-     */
-    int key_freq = cpi->oxcf.key_freq > 0 ? cpi->oxcf.key_freq : 1;
-    av_key_frame_frequency = (int)cpi->output_frame_rate * 2;
-
-    if (cpi->oxcf.auto_key && av_key_frame_frequency > key_freq)
-      av_key_frame_frequency = key_freq;
-
-    cpi->prior_key_frame_distance[KEY_FRAME_CONTEXT - 1]
-      = av_key_frame_frequency;
-  } else {
-    unsigned int total_weight = 0;
-    int last_kf_interval =
-      (cpi->frames_since_key > 0) ? cpi->frames_since_key : 1;
-
-    /* reset keyframe context and calculate weighted average of last
-     * KEY_FRAME_CONTEXT keyframes
-     */
-    for (i = 0; i < KEY_FRAME_CONTEXT; i++) {
-      if (i < KEY_FRAME_CONTEXT - 1)
-        cpi->prior_key_frame_distance[i]
-          = cpi->prior_key_frame_distance[i + 1];
-      else
-        cpi->prior_key_frame_distance[i] = last_kf_interval;
-
-      av_key_frame_frequency += prior_key_frame_weight[i]
-                                * cpi->prior_key_frame_distance[i];
-      total_weight += prior_key_frame_weight[i];
-    }
-
-    av_key_frame_frequency /= total_weight;
-  }
-  return av_key_frame_frequency;
-}
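
With KEY_FRAME_CONTEXT == 5 and the weights { 1, 2, 3, 4, 5 } declared earlier, the average leans toward the most recent intervals. A worked example with illustrative interval values:

static int weighted_kf_frequency_example(void) {
  static const unsigned int weight[5] = { 1, 2, 3, 4, 5 };
  static const int distance[5] = { 90, 90, 60, 60, 30 };  /* oldest first */
  unsigned int total_weight = 0;
  int sum = 0, i;

  for (i = 0; i < 5; i++) {
    sum += (int)weight[i] * distance[i];
    total_weight += weight[i];
  }
  /* (90 + 180 + 180 + 240 + 150) / 15 == 56 frames between key frames */
  return sum / (int)total_weight;
}
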
-
-
-void vp9_adjust_key_frame_context(VP9_COMP *cpi) {
-  // Clear down mmx registers to allow floating point in what follows
-  vp9_clear_system_state();
-
-  cpi->frames_since_key = 0;
-  cpi->key_frame_count++;
-}
-
-
-void vp9_compute_frame_size_bounds(VP9_COMP *cpi, int *frame_under_shoot_limit,
-                                   int *frame_over_shoot_limit) {
-  // Set-up bounds on acceptable frame size:
-  if (cpi->oxcf.fixed_q >= 0) {
-    // Fixed Q scenario: the frame size can never be out of range
-    // (there is no target!).
-    *frame_under_shoot_limit = 0;
-    *frame_over_shoot_limit  = INT_MAX;
-  } else {
-    if (cpi->common.frame_type == KEY_FRAME) {
-      *frame_over_shoot_limit  = cpi->this_frame_target * 9 / 8;
-      *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8;
-    } else {
-      if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame) {
-        *frame_over_shoot_limit  = cpi->this_frame_target * 9 / 8;
-        *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8;
-      } else {
-        // Strong overshoot limit for constrained quality
-        if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
-          *frame_over_shoot_limit  = cpi->this_frame_target * 11 / 8;
-          *frame_under_shoot_limit = cpi->this_frame_target * 2 / 8;
-        } else {
-          *frame_over_shoot_limit  = cpi->this_frame_target * 11 / 8;
-          *frame_under_shoot_limit = cpi->this_frame_target * 5 / 8;
-        }
-      }
-    }
-
-    // For very small rate targets, where the fractional adjustment
-    // (e.g. * 7/8) may be tiny, make sure there is at least a minimum
-    // range.
-    *frame_over_shoot_limit += 200;
-    *frame_under_shoot_limit -= 200;
-    if (*frame_under_shoot_limit < 0)
-      *frame_under_shoot_limit = 0;
-  }
-}
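
For a key frame with a 10000-bit target, the bounds above work out to [7/8, 9/8] of the target widened by the 200-bit minimum range:

static void frame_bounds_example(int *under, int *over) {
  int target = 10000;              /* illustrative per-frame target, in bits */
  *over  = target * 9 / 8 + 200;   /* 11450 */
  *under = target * 7 / 8 - 200;   /* 8550 (clamped to 0 if negative) */
}
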
-
-
-// A return value of 0 means drop the frame.
-int vp9_pick_frame_size(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-
-  if (cm->frame_type == KEY_FRAME)
-    calc_iframe_target_size(cpi);
-  else
-    calc_pframe_target_size(cpi);
-
-  return 1;
-}
--- a/vp8/encoder/ratectrl.h
+++ /dev/null
@@ -1,37 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_RATECTRL_H
-#define __INC_RATECTRL_H
-
-#include "onyx_int.h"
-
-#define FRAME_OVERHEAD_BITS 200
-
-extern void vp9_save_coding_context(VP9_COMP *cpi);
-extern void vp9_restore_coding_context(VP9_COMP *cpi);
-
-extern void vp9_setup_key_frame(VP9_COMP *cpi);
-extern void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var);
-extern int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame);
-extern void vp9_adjust_key_frame_context(VP9_COMP *cpi);
-extern void vp9_compute_frame_size_bounds(VP9_COMP *cpi,
-                                          int *frame_under_shoot_limit,
-                                          int *frame_over_shoot_limit);
-
-// A return value of 0 means drop the frame.
-extern int vp9_pick_frame_size(VP9_COMP *cpi);
-
-extern double vp9_convert_qindex_to_q(int qindex);
-extern int vp9_gfboost_qadjust(int qindex);
-extern int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex);
-void vp9_setup_inter_frame(VP9_COMP *cpi);
-
-#endif
--- a/vp8/encoder/rdopt.c
+++ /dev/null
@@ -1,4854 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <stdio.h>
-#include <math.h>
-#include <limits.h>
-#include <assert.h>
-#include "vp8/common/pragmas.h"
-
-#include "tokenize.h"
-#include "treewriter.h"
-#include "onyx_int.h"
-#include "modecosts.h"
-#include "encodeintra.h"
-#include "vp8/common/entropymode.h"
-#include "vp8/common/reconinter.h"
-#include "vp8/common/reconintra.h"
-#include "vp8/common/reconintra4x4.h"
-#include "vp8/common/findnearmv.h"
-#include "vp8/common/quant_common.h"
-#include "encodemb.h"
-#include "quantize.h"
-#include "vp8/common/idct.h"
-#include "variance.h"
-#include "mcomp.h"
-#include "rdopt.h"
-#include "ratectrl.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp8/common/systemdependent.h"
-#include "vp8/encoder/encodemv.h"
-
-#include "vp8/common/seg_common.h"
-#include "vp8/common/pred_common.h"
-#include "vp8/common/entropy.h"
-#include "vpx_rtcd.h"
-#if CONFIG_NEWBESTREFMV
-#include "vp8/common/mvref_common.h"
-#endif
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define IF_RTCD(x)  (x)
-#else
-#define IF_RTCD(x)  NULL
-#endif
-
-extern void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x);
-extern void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x);
-
-#define MAXF(a,b)            (((a) > (b)) ? (a) : (b))
-
-#define INVALID_MV 0x80008000
-
-/* Factor to weigh the rate for switchable interp filters */
-#define SWITCHABLE_INTERP_RATE_FACTOR 1
-
-static const int auto_speed_thresh[17] = {
-  1000,
-  200,
-  150,
-  130,
-  150,
-  125,
-  120,
-  115,
-  115,
-  115,
-  115,
-  115,
-  115,
-  115,
-  115,
-  115,
-  105
-};
-
-#if CONFIG_PRED_FILTER
-const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
-  {ZEROMV,    LAST_FRAME,   0,  0},
-  {ZEROMV,    LAST_FRAME,   0,  1},
-  {DC_PRED,   INTRA_FRAME,  0,  0},
-
-  {NEARESTMV, LAST_FRAME,   0,  0},
-  {NEARESTMV, LAST_FRAME,   0,  1},
-  {NEARMV,    LAST_FRAME,   0,  0},
-  {NEARMV,    LAST_FRAME,   0,  1},
-
-  {ZEROMV,    GOLDEN_FRAME, 0,  0},
-  {ZEROMV,    GOLDEN_FRAME, 0,  1},
-  {NEARESTMV, GOLDEN_FRAME, 0,  0},
-  {NEARESTMV, GOLDEN_FRAME, 0,  1},
-
-  {ZEROMV,    ALTREF_FRAME, 0,  0},
-  {ZEROMV,    ALTREF_FRAME, 0,  1},
-  {NEARESTMV, ALTREF_FRAME, 0,  0},
-  {NEARESTMV, ALTREF_FRAME, 0,  1},
-
-  {NEARMV,    GOLDEN_FRAME, 0,  0},
-  {NEARMV,    GOLDEN_FRAME, 0,  1},
-  {NEARMV,    ALTREF_FRAME, 0,  0},
-  {NEARMV,    ALTREF_FRAME, 0,  1},
-
-  {V_PRED,    INTRA_FRAME,  0,  0},
-  {H_PRED,    INTRA_FRAME,  0,  0},
-  {D45_PRED,  INTRA_FRAME,  0,  0},
-  {D135_PRED, INTRA_FRAME,  0,  0},
-  {D117_PRED, INTRA_FRAME,  0,  0},
-  {D153_PRED, INTRA_FRAME,  0,  0},
-  {D27_PRED,  INTRA_FRAME,  0,  0},
-  {D63_PRED,  INTRA_FRAME,  0,  0},
-
-  {TM_PRED,   INTRA_FRAME,  0,  0},
-
-  {NEWMV,     LAST_FRAME,   0,  0},
-  {NEWMV,     LAST_FRAME,   0,  1},
-  {NEWMV,     GOLDEN_FRAME, 0,  0},
-  {NEWMV,     GOLDEN_FRAME, 0,  1},
-  {NEWMV,     ALTREF_FRAME, 0,  0},
-  {NEWMV,     ALTREF_FRAME, 0,  1},
-
-  {SPLITMV,   LAST_FRAME,   0,  0},
-  {SPLITMV,   GOLDEN_FRAME, 0,  0},
-  {SPLITMV,   ALTREF_FRAME, 0,  0},
-
-  {B_PRED,    INTRA_FRAME,  0,  0},
-  {I8X8_PRED, INTRA_FRAME,  0,  0},
-
-  /* compound prediction modes */
-  {ZEROMV,    LAST_FRAME,   GOLDEN_FRAME, 0},
-  {NEARESTMV, LAST_FRAME,   GOLDEN_FRAME, 0},
-  {NEARMV,    LAST_FRAME,   GOLDEN_FRAME, 0},
-
-  {ZEROMV,    ALTREF_FRAME, LAST_FRAME,   0},
-  {NEARESTMV, ALTREF_FRAME, LAST_FRAME,   0},
-  {NEARMV,    ALTREF_FRAME, LAST_FRAME,   0},
-
-  {ZEROMV,    GOLDEN_FRAME, ALTREF_FRAME, 0},
-  {NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME, 0},
-  {NEARMV,    GOLDEN_FRAME, ALTREF_FRAME, 0},
-
-  {NEWMV,     LAST_FRAME,   GOLDEN_FRAME, 0},
-  {NEWMV,     ALTREF_FRAME, LAST_FRAME,   0},
-  {NEWMV,     GOLDEN_FRAME, ALTREF_FRAME, 0},
-
-  {SPLITMV,   LAST_FRAME,   GOLDEN_FRAME, 0},
-  {SPLITMV,   ALTREF_FRAME, LAST_FRAME,   0},
-  {SPLITMV,   GOLDEN_FRAME, ALTREF_FRAME, 0}
-};
-#else
-const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
-  {ZEROMV,    LAST_FRAME,   0},
-  {DC_PRED,   INTRA_FRAME,  0},
-
-  {NEARESTMV, LAST_FRAME,   0},
-  {NEARMV,    LAST_FRAME,   0},
-
-  {ZEROMV,    GOLDEN_FRAME, 0},
-  {NEARESTMV, GOLDEN_FRAME, 0},
-
-  {ZEROMV,    ALTREF_FRAME, 0},
-  {NEARESTMV, ALTREF_FRAME, 0},
-
-  {NEARMV,    GOLDEN_FRAME, 0},
-  {NEARMV,    ALTREF_FRAME, 0},
-
-  {V_PRED,    INTRA_FRAME,  0},
-  {H_PRED,    INTRA_FRAME,  0},
-  {D45_PRED,  INTRA_FRAME,  0},
-  {D135_PRED, INTRA_FRAME,  0},
-  {D117_PRED, INTRA_FRAME,  0},
-  {D153_PRED, INTRA_FRAME,  0},
-  {D27_PRED,  INTRA_FRAME,  0},
-  {D63_PRED,  INTRA_FRAME,  0},
-
-  {TM_PRED,   INTRA_FRAME,  0},
-
-  {NEWMV,     LAST_FRAME,   0},
-  {NEWMV,     GOLDEN_FRAME, 0},
-  {NEWMV,     ALTREF_FRAME, 0},
-
-  {SPLITMV,   LAST_FRAME,   0},
-  {SPLITMV,   GOLDEN_FRAME, 0},
-  {SPLITMV,   ALTREF_FRAME, 0},
-
-  {B_PRED,    INTRA_FRAME,  0},
-  {I8X8_PRED, INTRA_FRAME,  0},
-
-  /* compound prediction modes */
-  {ZEROMV,    LAST_FRAME,   GOLDEN_FRAME},
-  {NEARESTMV, LAST_FRAME,   GOLDEN_FRAME},
-  {NEARMV,    LAST_FRAME,   GOLDEN_FRAME},
-
-  {ZEROMV,    ALTREF_FRAME, LAST_FRAME},
-  {NEARESTMV, ALTREF_FRAME, LAST_FRAME},
-  {NEARMV,    ALTREF_FRAME, LAST_FRAME},
-
-  {ZEROMV,    GOLDEN_FRAME, ALTREF_FRAME},
-  {NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME},
-  {NEARMV,    GOLDEN_FRAME, ALTREF_FRAME},
-
-  {NEWMV,     LAST_FRAME,   GOLDEN_FRAME},
-  {NEWMV,     ALTREF_FRAME, LAST_FRAME  },
-  {NEWMV,     GOLDEN_FRAME, ALTREF_FRAME},
-
-  {SPLITMV,   LAST_FRAME,   GOLDEN_FRAME},
-  {SPLITMV,   ALTREF_FRAME, LAST_FRAME  },
-  {SPLITMV,   GOLDEN_FRAME, ALTREF_FRAME}
-};
-#endif
-
-static void fill_token_costs(
-  unsigned int (*c)[COEF_BANDS][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS],
-  const vp9_prob(*p)[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES],
-  int block_type_counts) {
-  int i, j, k;
-
-  for (i = 0; i < block_type_counts; i++)
-    for (j = 0; j < COEF_BANDS; j++)
-      for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
-        if (k == 0 && ((j > 0 && i > 0) || (j > 1 && i == 0)))
-          vp9_cost_tokens_skip((int *)(c[i][j][k]),
-                               p[i][j][k],
-                               vp9_coef_tree);
-        else
-          vp9_cost_tokens((int *)(c[i][j][k]),
-                          p[i][j][k],
-                          vp9_coef_tree);
-      }
-}
-
-
-static int rd_iifactor[32] =  { 4, 4, 3, 2, 1, 0, 0, 0,
-                                0, 0, 0, 0, 0, 0, 0, 0,
-                                0, 0, 0, 0, 0, 0, 0, 0,
-                                0, 0, 0, 0, 0, 0, 0, 0, };
-
-// 3* dc_qlookup[Q]*dc_qlookup[Q];
-
-/* values are now correlated to quantizer */
-static int sad_per_bit16lut[QINDEX_RANGE];
-static int sad_per_bit4lut[QINDEX_RANGE];
-
-void vp9_init_me_luts() {
-  int i;
-
-  // Initialize the sad lut tables using a formulaic calculation for now.
-  // This is to make it easier to resolve the impact of experimental changes
-  // to the quantizer tables.
-  for (i = 0; i < QINDEX_RANGE; i++) {
-    sad_per_bit16lut[i] =
-      (int)((0.0418 * vp9_convert_qindex_to_q(i)) + 2.4107);
-    sad_per_bit4lut[i] = (int)((0.063 * vp9_convert_qindex_to_q(i)) + 2.742);
-  }
-}
-
-static int compute_rd_mult(int qindex) {
-  int q;
-
-  q = vp9_dc_quant(qindex, 0);
-  return (11 * q * q) >> 6;
-}
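
As a worked example, a DC quantizer step of 40 gives RDMULT == (11 * 40 * 40) >> 6 == 275, so the Lagrangian weight on rate grows roughly with the square of the step size:

static int rd_mult_example(void) {
  int q = 40;                /* illustrative DC quantizer step size */
  return (11 * q * q) >> 6;  /* 17600 >> 6 == 275 */
}
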
-
-void vp9_initialize_me_consts(VP9_COMP *cpi, int QIndex) {
-  cpi->mb.sadperbit16 =  sad_per_bit16lut[QIndex];
-  cpi->mb.sadperbit4  =  sad_per_bit4lut[QIndex];
-}
-
-
-void vp9_initialize_rd_consts(VP9_COMP *cpi, int QIndex) {
-  int q, i;
-
-  vp9_clear_system_state();  // __asm emms;
-
-  // Further tests required to see if optimum is different
-  // for key frames, golden frames and arf frames.
-  // if (cpi->common.refresh_golden_frame ||
-  //     cpi->common.refresh_alt_ref_frame)
-  QIndex = (QIndex < 0) ? 0 : ((QIndex > MAXQ) ? MAXQ : QIndex);
-
-  cpi->RDMULT = compute_rd_mult(QIndex);
-
-  // Extend the rate multiplier alongside quantizer zbin increases.
-  if (cpi->zbin_over_quant  > 0) {
-    double oq_factor;
-
-    // Experimental code using the same basic equation as used for Q above.
-    // The units of cpi->zbin_over_quant are 1/128 of the Q bin size.
-    oq_factor = 1.0 + ((double)0.0015625 * cpi->zbin_over_quant);
-    cpi->RDMULT = (int)((double)cpi->RDMULT * oq_factor * oq_factor);
-  }
-
-  if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
-    if (cpi->twopass.next_iiratio > 31)
-      cpi->RDMULT += (cpi->RDMULT * rd_iifactor[31]) >> 4;
-    else
-      cpi->RDMULT +=
-        (cpi->RDMULT * rd_iifactor[cpi->twopass.next_iiratio]) >> 4;
-  }
-
-  if (cpi->RDMULT < 7)
-    cpi->RDMULT = 7;
-
-  cpi->mb.errorperbit = (cpi->RDMULT / 110);
-  cpi->mb.errorperbit += (cpi->mb.errorperbit == 0);
-
-  vp9_set_speed_features(cpi);
-
-  q = (int)pow(vp9_dc_quant(QIndex, 0) >> 2, 1.25);
-  q = q << 2;
-  cpi->RDMULT = cpi->RDMULT << 4;
-
-  if (q < 8)
-    q = 8;
-
-  if (cpi->RDMULT > 1000) {
-    cpi->RDDIV = 1;
-    cpi->RDMULT /= 100;
-
-    for (i = 0; i < MAX_MODES; i++) {
-      if (cpi->sf.thresh_mult[i] < INT_MAX) {
-        cpi->rd_threshes[i] = cpi->sf.thresh_mult[i] * q / 100;
-      } else {
-        cpi->rd_threshes[i] = INT_MAX;
-      }
-
-      cpi->rd_baseline_thresh[i] = cpi->rd_threshes[i];
-    }
-  } else {
-    cpi->RDDIV = 100;
-
-    for (i = 0; i < MAX_MODES; i++) {
-      if (cpi->sf.thresh_mult[i] < (INT_MAX / q)) {
-        cpi->rd_threshes[i] = cpi->sf.thresh_mult[i] * q;
-      } else {
-        cpi->rd_threshes[i] = INT_MAX;
-      }
-
-      cpi->rd_baseline_thresh[i] = cpi->rd_threshes[i];
-    }
-  }
-
-  fill_token_costs(
-    cpi->mb.token_costs[TX_4X4],
-    (const vp9_prob( *)[8][PREV_COEF_CONTEXTS][11]) cpi->common.fc.coef_probs,
-    BLOCK_TYPES);
-  fill_token_costs(
-    cpi->mb.hybrid_token_costs[TX_4X4],
-    (const vp9_prob( *)[8][PREV_COEF_CONTEXTS][11])
-    cpi->common.fc.hybrid_coef_probs,
-    BLOCK_TYPES);
-
-  fill_token_costs(
-    cpi->mb.token_costs[TX_8X8],
-    (const vp9_prob( *)[8][PREV_COEF_CONTEXTS][11]) cpi->common.fc.coef_probs_8x8,
-    BLOCK_TYPES_8X8);
-  fill_token_costs(
-    cpi->mb.hybrid_token_costs[TX_8X8],
-    (const vp9_prob( *)[8][PREV_COEF_CONTEXTS][11])
-    cpi->common.fc.hybrid_coef_probs_8x8,
-    BLOCK_TYPES_8X8);
-
-  fill_token_costs(
-    cpi->mb.token_costs[TX_16X16],
-    (const vp9_prob(*)[8][PREV_COEF_CONTEXTS][11]) cpi->common.fc.coef_probs_16x16,
-    BLOCK_TYPES_16X16);
-  fill_token_costs(
-    cpi->mb.hybrid_token_costs[TX_16X16],
-    (const vp9_prob(*)[8][PREV_COEF_CONTEXTS][11])
-    cpi->common.fc.hybrid_coef_probs_16x16,
-    BLOCK_TYPES_16X16);
-
-  /* rough estimate for costing */
-  cpi->common.kf_ymode_probs_index = cpi->common.base_qindex >> 4;
-  vp9_init_mode_costs(cpi);
-
-  if (cpi->common.frame_type != KEY_FRAME) {
-    vp9_build_nmv_cost_table(
-        cpi->mb.nmvjointcost,
-        cpi->mb.e_mbd.allow_high_precision_mv ?
-        cpi->mb.nmvcost_hp : cpi->mb.nmvcost,
-        &cpi->common.fc.nmvc,
-        cpi->mb.e_mbd.allow_high_precision_mv, 1, 1);
-  }
-}
-
-void vp9_auto_select_speed(VP9_COMP *cpi) {
-  int milliseconds_for_compress = (int)(1000000 / cpi->oxcf.frame_rate);
-
-  milliseconds_for_compress =
-    milliseconds_for_compress * (16 - cpi->oxcf.cpu_used) / 16;
-
-  /*
-  // this is done during parameter valid check
-  if( cpi->oxcf.cpu_used > 16)
-      cpi->oxcf.cpu_used = 16;
-  if( cpi->oxcf.cpu_used < -16)
-      cpi->oxcf.cpu_used = -16;
-  */
-
-  if (cpi->avg_pick_mode_time < milliseconds_for_compress &&
-      (cpi->avg_encode_time - cpi->avg_pick_mode_time) <
-      milliseconds_for_compress) {
-    if (cpi->avg_pick_mode_time == 0) {
-      cpi->Speed = 4;
-    } else {
-      if (milliseconds_for_compress * 100 < cpi->avg_encode_time * 95) {
-        cpi->Speed          += 2;
-        cpi->avg_pick_mode_time = 0;
-        cpi->avg_encode_time = 0;
-
-        if (cpi->Speed > 16) {
-          cpi->Speed = 16;
-        }
-      }
-
-      if (milliseconds_for_compress * 100 >
-          cpi->avg_encode_time * auto_speed_thresh[cpi->Speed]) {
-        cpi->Speed          -= 1;
-        cpi->avg_pick_mode_time = 0;
-        cpi->avg_encode_time = 0;
-
-        // In real-time mode, cpi->speed is in [4, 16].
-        if (cpi->Speed < 4) {      // if ( cpi->Speed < 0 )
-          cpi->Speed = 4;        // cpi->Speed = 0;
-        }
-      }
-    }
-  } else {
-    cpi->Speed += 4;
-
-    if (cpi->Speed > 16)
-      cpi->Speed = 16;
-
-
-    cpi->avg_pick_mode_time = 0;
-    cpi->avg_encode_time = 0;
-  }
-}
-
-int vp9_block_error_c(short *coeff, short *dqcoeff, int block_size) {
-  int i, error = 0;
-
-  for (i = 0; i < block_size; i++) {
-    int this_diff = coeff[i] - dqcoeff[i];
-    error += this_diff * this_diff;
-  }
-
-  return error;
-}
-
-int vp9_mbblock_error_c(MACROBLOCK *mb, int dc) {
-  BLOCK  *be;
-  BLOCKD *bd;
-  int i, j;
-  int berror, error = 0;
-
-  for (i = 0; i < 16; i++) {
-    be = &mb->block[i];
-    bd = &mb->e_mbd.block[i];
-
-    berror = 0;
-
-    for (j = dc; j < 16; j++) {
-      int this_diff = be->coeff[j] - bd->dqcoeff[j];
-      berror += this_diff * this_diff;
-    }
-
-    error += berror;
-  }
-
-  return error;
-}
-
-int vp9_mbuverror_c(MACROBLOCK *mb) {
-  BLOCK  *be;
-  BLOCKD *bd;
-
-  int i, error = 0;
-
-  for (i = 16; i < 24; i++) {
-    be = &mb->block[i];
-    bd = &mb->e_mbd.block[i];
-
-    error += vp9_block_error_c(be->coeff, bd->dqcoeff, 16);
-  }
-
-  return error;
-}
-
-int vp9_uvsse(MACROBLOCK *x) {
-  unsigned char *uptr, *vptr;
-  unsigned char *upred_ptr = (*(x->block[16].base_src) + x->block[16].src);
-  unsigned char *vpred_ptr = (*(x->block[20].base_src) + x->block[20].src);
-  int uv_stride = x->block[16].src_stride;
-
-  unsigned int sse1 = 0;
-  unsigned int sse2 = 0;
-  int mv_row = x->e_mbd.mode_info_context->mbmi.mv[0].as_mv.row;
-  int mv_col = x->e_mbd.mode_info_context->mbmi.mv[0].as_mv.col;
-  int offset;
-  int pre_stride = x->e_mbd.block[16].pre_stride;
-
-  if (mv_row < 0)
-    mv_row -= 1;
-  else
-    mv_row += 1;
-
-  if (mv_col < 0)
-    mv_col -= 1;
-  else
-    mv_col += 1;
-
-  mv_row /= 2;
-  mv_col /= 2;
-
-  offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
-  uptr = x->e_mbd.pre.u_buffer + offset;
-  vptr = x->e_mbd.pre.v_buffer + offset;
-
-  if ((mv_row | mv_col) & 7) {
-    vp9_sub_pixel_variance8x8(uptr, pre_stride, (mv_col & 7) << 1,
-                              (mv_row & 7) << 1, upred_ptr, uv_stride, &sse2);
-    vp9_sub_pixel_variance8x8(vptr, pre_stride, (mv_col & 7) << 1,
-                              (mv_row & 7) << 1, vpred_ptr, uv_stride, &sse1);
-    sse2 += sse1;
-  } else {
-    vp9_variance8x8(uptr, pre_stride, upred_ptr, uv_stride, &sse2);
-    vp9_variance8x8(vptr, pre_stride, vpred_ptr, uv_stride, &sse1);
-    sse2 += sse1;
-  }
-  return sse2;
-
-}
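
The MV arithmetic at the top of vp9_uvsse() halves the luma motion vector for the half-resolution chroma planes, rounding away from zero: a component of -5 becomes (-5 - 1) / 2 == -3, and +5 becomes (5 + 1) / 2 == 3. Condensed into a single helper (a sketch, not the encoder's API):

static int chroma_mv_component(int luma_mv) {
  /* Round away from zero before halving, as above. */
  return (luma_mv + (luma_mv < 0 ? -1 : 1)) / 2;
}
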
-
-static int cost_coeffs_2x2(MACROBLOCK *mb,
-                           BLOCKD *b, PLANE_TYPE type,
-                           ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) {
-  int c = (type == PLANE_TYPE_Y_NO_DC); /* start at coef 0, unless Y with Y2 */
-  int eob = b->eob;
-  int pt;    /* surrounding block/prev coef predictor */
-  int cost = 0;
-  short *qcoeff_ptr = b->qcoeff;
-
-  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
-  assert(eob <= 4);
-
-  for (; c < eob; c++) {
-    int v = qcoeff_ptr[vp9_default_zig_zag1d[c]];
-    int t = vp9_dct_value_tokens_ptr[v].Token;
-    cost += mb->token_costs[TX_8X8][type][vp9_coef_bands[c]][pt][t];
-    cost += vp9_dct_value_cost_ptr[v];
-    pt = vp9_prev_token_class[t];
-  }
-
-  if (c < 4)
-    cost += mb->token_costs[TX_8X8][type][vp9_coef_bands[c]]
-            [pt] [DCT_EOB_TOKEN];
-
-  pt = (c != !type); // is the eob at the first coefficient?
-  *a = *l = pt;
-  return cost;
-}
-
-static int cost_coeffs(MACROBLOCK *mb, BLOCKD *b, PLANE_TYPE type,
-                       ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
-                       int tx_size) {
-  const int eob = b->eob;
-  int c = (type == PLANE_TYPE_Y_NO_DC); /* start at coef 0, unless Y with Y2 */
-  int cost = 0, default_eob, seg_eob;
-  int pt;                     /* surrounding block/prev coef predictor */
-  int const *scan, *band;
-  short *qcoeff_ptr = b->qcoeff;
-  MACROBLOCKD *xd = &mb->e_mbd;
-  MB_MODE_INFO *mbmi = &mb->e_mbd.mode_info_context->mbmi;
-  TX_TYPE tx_type = DCT_DCT;
-  int segment_id = mbmi->segment_id;
-
-  switch (tx_size) {
-    case TX_4X4:
-      scan = vp9_default_zig_zag1d;
-      band = vp9_coef_bands;
-      default_eob = 16;
-      if (type == PLANE_TYPE_Y_WITH_DC) {
-        tx_type = get_tx_type_4x4(xd, b);
-        if (tx_type != DCT_DCT) {
-          switch (tx_type) {
-            case ADST_DCT:
-              scan = vp9_row_scan;
-              break;
-
-            case DCT_ADST:
-              scan = vp9_col_scan;
-              break;
-
-            default:
-              scan = vp9_default_zig_zag1d;
-              break;
-          }
-        }
-      }
-
-      break;
-    case TX_8X8:
-      scan = vp9_default_zig_zag1d_8x8;
-      band = vp9_coef_bands_8x8;
-      default_eob = 64;
-      if (type == PLANE_TYPE_Y_WITH_DC) {
-        BLOCKD *bb;
-        int ib = (b - xd->block);
-        if (ib < 16) {
-          ib = (ib & 8) + ((ib & 4) >> 1);
-          bb = xd->block + ib;
-          tx_type = get_tx_type_8x8(xd, bb);
-        }
-      }
-      break;
-    case TX_16X16:
-      scan = vp9_default_zig_zag1d_16x16;
-      band = vp9_coef_bands_16x16;
-      default_eob = 256;
-      if (type == PLANE_TYPE_Y_WITH_DC) {
-        tx_type = get_tx_type_16x16(xd, b);
-      }
-      break;
-    default:
-      break;
-  }
-  if (vp9_segfeature_active(&mb->e_mbd, segment_id, SEG_LVL_EOB))
-    seg_eob = vp9_get_segdata(&mb->e_mbd, segment_id, SEG_LVL_EOB);
-  else
-    seg_eob = default_eob;
-
-  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
-
-  if (tx_type != DCT_DCT) {
-    for (; c < eob; c++) {
-      int v = qcoeff_ptr[scan[c]];
-      int t = vp9_dct_value_tokens_ptr[v].Token;
-      cost += mb->hybrid_token_costs[tx_size][type][band[c]][pt][t];
-      cost += vp9_dct_value_cost_ptr[v];
-      pt = vp9_prev_token_class[t];
-    }
-    if (c < seg_eob)
-      cost += mb->hybrid_token_costs[tx_size][type][band[c]]
-          [pt][DCT_EOB_TOKEN];
-  } else {
-    for (; c < eob; c++) {
-      int v = qcoeff_ptr[scan[c]];
-      int t = vp9_dct_value_tokens_ptr[v].Token;
-      cost += mb->token_costs[tx_size][type][band[c]][pt][t];
-      cost += vp9_dct_value_cost_ptr[v];
-      pt = vp9_prev_token_class[t];
-    }
-    if (c < seg_eob)
-      cost += mb->token_costs[tx_size][type][band[c]]
-          [pt][DCT_EOB_TOKEN];
-  }
-
-  pt = (c != !type); // is the eob at the first coefficient?
-  *a = *l = pt;
-  return cost;
-}
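
Whatever the transform size, the loop in cost_coeffs() has one shape: accumulate a context-conditioned token cost for each coded coefficient along the scan order, then charge one EOB token if the block ended before the segment EOB limit. A simplified skeleton, where token_of() and token_cost() are hypothetical stand-ins for the table lookups and the band is indexed directly by position:

static int cost_coeffs_skeleton(const short *qcoeff, const int *scan,
                                int eob, int seg_eob, int pt,
                                int (*token_of)(int coeff_value),
                                int (*token_cost)(int pos, int ctx, int token),
                                int eob_token) {
  int c, cost = 0;

  for (c = 0; c < eob; c++) {
    int t = token_of(qcoeff[scan[c]]);
    cost += token_cost(c, pt, t);  /* cost conditioned on prior token */
    pt = t;                        /* simplified previous-token context */
  }
  if (c < seg_eob)
    cost += token_cost(c, pt, eob_token);
  return cost;
}
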
-
-static int rdcost_mby_4x4(MACROBLOCK *mb) {
-  int cost = 0;
-  int b;
-  MACROBLOCKD *xd = &mb->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta;
-  ENTROPY_CONTEXT *tl;
-
-  vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
-
-  for (b = 0; b < 16; b++)
-    cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_Y_NO_DC,
-                        ta + vp9_block2above[b], tl + vp9_block2left[b],
-                        TX_4X4);
-
-  cost += cost_coeffs(mb, xd->block + 24, PLANE_TYPE_Y2,
-                      ta + vp9_block2above[24], tl + vp9_block2left[24],
-                      TX_4X4);
-
-  return cost;
-}
-
-static void macro_block_yrd_4x4(MACROBLOCK *mb,
-                                int *Rate,
-                                int *Distortion,
-                                const VP9_ENCODER_RTCD *rtcd,
-                                int *skippable) {
-  int b;
-  MACROBLOCKD *const xd = &mb->e_mbd;
-  BLOCK   *const mb_y2 = mb->block + 24;
-  BLOCKD *const x_y2  = xd->block + 24;
-  short *Y2DCPtr = mb_y2->src_diff;
-  BLOCK *beptr;
-  int d;
-
-  vp9_subtract_mby(mb->src_diff, *(mb->block[0].base_src), xd->predictor,
-                   mb->block[0].src_stride);
-
-  // Fdct and building the 2nd order block
-  for (beptr = mb->block; beptr < mb->block + 16; beptr += 2) {
-    mb->vp9_short_fdct8x4(beptr->src_diff, beptr->coeff, 32);
-    *Y2DCPtr++ = beptr->coeff[0];
-    *Y2DCPtr++ = beptr->coeff[16];
-  }
-
-  // 2nd order fdct
-  mb->short_walsh4x4(mb_y2->src_diff, mb_y2->coeff, 8);
-
-  // Quantization
-  for (b = 0; b < 16; b++) {
-    mb->quantize_b_4x4(&mb->block[b], &xd->block[b]);
-  }
-
-  // DC prediction and quantization of the 2nd order block
-  mb->quantize_b_4x4(mb_y2, x_y2);
-
-  // Distortion
-  d = vp9_mbblock_error(mb, 1);
-
-  d += vp9_block_error(mb_y2->coeff, x_y2->dqcoeff, 16);
-
-  *Distortion = (d >> 2);
-  // rate
-  *Rate = rdcost_mby_4x4(mb);
-  *skippable = vp9_mby_is_skippable_4x4(&mb->e_mbd, 1);
-}
-
-static int rdcost_mby_8x8(MACROBLOCK *mb, int backup) {
-  int cost = 0;
-  int b;
-  MACROBLOCKD *xd = &mb->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta;
-  ENTROPY_CONTEXT *tl;
-
-  if (backup) {
-    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-    ta = (ENTROPY_CONTEXT *)&t_above;
-    tl = (ENTROPY_CONTEXT *)&t_left;
-  } else {
-    ta = (ENTROPY_CONTEXT *)mb->e_mbd.above_context;
-    tl = (ENTROPY_CONTEXT *)mb->e_mbd.left_context;
-  }
-
-  for (b = 0; b < 16; b += 4)
-    cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_Y_NO_DC,
-                        ta + vp9_block2above_8x8[b], tl + vp9_block2left_8x8[b],
-                        TX_8X8);
-
-  cost += cost_coeffs_2x2(mb, xd->block + 24, PLANE_TYPE_Y2,
-                          ta + vp9_block2above[24], tl + vp9_block2left[24]);
-  return cost;
-}
-
-static void macro_block_yrd_8x8(MACROBLOCK *mb,
-                                int *Rate,
-                                int *Distortion,
-                                const VP9_ENCODER_RTCD *rtcd,
-                                int *skippable) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
-  BLOCK   *const mb_y2 = mb->block + 24;
-  BLOCKD *const x_y2  = xd->block + 24;
-  int d;
-
-  vp9_subtract_mby(mb->src_diff, *(mb->block[0].base_src), xd->predictor,
-                   mb->block[0].src_stride);
-
-  vp9_transform_mby_8x8(mb);
-  vp9_quantize_mby_8x8(mb);
-
-  /* remove 1st order dc to properly combine 1st/2nd order distortion */
-  mb->coeff[0] = 0;
-  mb->coeff[64] = 0;
-  mb->coeff[128] = 0;
-  mb->coeff[192] = 0;
-  xd->dqcoeff[0] = 0;
-  xd->dqcoeff[64] = 0;
-  xd->dqcoeff[128] = 0;
-  xd->dqcoeff[192] = 0;
-
-  d = vp9_mbblock_error(mb, 0);
-  d += vp9_block_error(mb_y2->coeff, x_y2->dqcoeff, 16);
-
-  *Distortion = (d >> 2);
-  // rate
-  *Rate = rdcost_mby_8x8(mb, 1);
-  *skippable = vp9_mby_is_skippable_8x8(&mb->e_mbd, 1);
-}
-
-static int rdcost_mby_16x16(MACROBLOCK *mb) {
-  int cost;
-  MACROBLOCKD *xd = &mb->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta, *tl;
-
-  vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
-
-  cost = cost_coeffs(mb, xd->block, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_16X16);
-  return cost;
-}
-
-static void macro_block_yrd_16x16(MACROBLOCK *mb, int *Rate, int *Distortion,
-                                  const VP9_ENCODER_RTCD *rtcd, int *skippable) {
-  int d;
-  MACROBLOCKD *xd = &mb->e_mbd;
-  BLOCKD *b  = &mb->e_mbd.block[0];
-  BLOCK  *be = &mb->block[0];
-  TX_TYPE tx_type;
-
-  vp9_subtract_mby(mb->src_diff, *(mb->block[0].base_src), mb->e_mbd.predictor,
-                   mb->block[0].src_stride);
-
-  tx_type = get_tx_type_16x16(xd, b);
-  if (tx_type != DCT_DCT) {
-    vp9_fht(be->src_diff, 32, be->coeff, tx_type, 16);
-  } else
-    vp9_transform_mby_16x16(mb);
-
-  vp9_quantize_mby_16x16(mb);
-  // TODO(jingning) is it possible to quickly determine whether to force
-  //                trailing coefficients to be zero, instead of running trellis
-  //                optimization in the rate-distortion optimization loop?
-  if (mb->e_mbd.mode_info_context->mbmi.mode < I8X8_PRED)
-    vp9_optimize_mby_16x16(mb, rtcd);
-
-  d = vp9_mbblock_error(mb, 0);
-
-  *Distortion = (d >> 2);
-  // rate
-  *Rate = rdcost_mby_16x16(mb);
-  *skippable = vp9_mby_is_skippable_16x16(&mb->e_mbd);
-}
-
-static void macro_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
-                            int *distortion, int *skippable,
-                            int64_t txfm_cache[NB_TXFM_MODES]) {
-  VP9_COMMON *cm = &cpi->common;
-  MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi;
-
-  MACROBLOCKD *xd = &x->e_mbd;
-  int can_skip = cm->mb_no_coeff_skip;
-  vp9_prob skip_prob = can_skip ? vp9_get_pred_prob(cm, xd, PRED_MBSKIP) : 128;
-  int s0, s1;
-  int r4x4, r4x4s, r8x8, r8x8s, d4x4, d8x8, s4x4, s8x8;
-  int64_t rd4x4, rd8x8, rd4x4s, rd8x8s;
-  int d16x16, r16x16, r16x16s, s16x16;
-  int64_t rd16x16, rd16x16s;
-
-  // FIXME don't do sub x3
-  if (skip_prob == 0)
-    skip_prob = 1;
-  s0 = vp9_cost_bit(skip_prob, 0);
-  s1 = vp9_cost_bit(skip_prob, 1);
-  macro_block_yrd_16x16(x, &r16x16, &d16x16, IF_RTCD(&cpi->rtcd), &s16x16);
-  if (can_skip) {
-    if (s16x16) {
-      rd16x16 = RDCOST(x->rdmult, x->rddiv, s1, d16x16);
-    } else {
-      rd16x16 = RDCOST(x->rdmult, x->rddiv, r16x16 + s0, d16x16);
-    }
-  } else {
-    rd16x16 = RDCOST(x->rdmult, x->rddiv, r16x16, d16x16);
-  }
-  r16x16s = r16x16 + vp9_cost_one(cm->prob_tx[0]) + vp9_cost_one(cm->prob_tx[1]);
-  if (can_skip) {
-    if (s16x16) {
-      rd16x16s = RDCOST(x->rdmult, x->rddiv, s1, d16x16);
-    } else {
-      rd16x16s = RDCOST(x->rdmult, x->rddiv, r16x16s + s0, d16x16);
-    }
-  } else {
-    rd16x16s = RDCOST(x->rdmult, x->rddiv, r16x16s, d16x16);
-  }
-  macro_block_yrd_8x8(x, &r8x8, &d8x8, IF_RTCD(&cpi->rtcd), &s8x8);
-  if (can_skip) {
-    if (s8x8) {
-      rd8x8 = RDCOST(x->rdmult, x->rddiv, s1, d8x8);
-    } else {
-      rd8x8 = RDCOST(x->rdmult, x->rddiv, r8x8 + s0, d8x8);
-    }
-  } else {
-    rd8x8 = RDCOST(x->rdmult, x->rddiv, r8x8, d8x8);
-  }
-  r8x8s = r8x8 + vp9_cost_one(cm->prob_tx[0]);
-  r8x8s += vp9_cost_zero(cm->prob_tx[1]);
-  if (can_skip) {
-    if (s8x8) {
-      rd8x8s = RDCOST(x->rdmult, x->rddiv, s1, d8x8);
-    } else {
-      rd8x8s = RDCOST(x->rdmult, x->rddiv, r8x8s + s0, d8x8);
-    }
-  } else {
-    rd8x8s = RDCOST(x->rdmult, x->rddiv, r8x8s, d8x8);
-  }
-  macro_block_yrd_4x4(x, &r4x4, &d4x4, IF_RTCD(&cpi->rtcd), &s4x4);
-  if (can_skip) {
-    if (s4x4) {
-      rd4x4 = RDCOST(x->rdmult, x->rddiv, s1, d4x4);
-    } else {
-      rd4x4 = RDCOST(x->rdmult, x->rddiv, r4x4 + s0, d4x4);
-    }
-  } else {
-    rd4x4 = RDCOST(x->rdmult, x->rddiv, r4x4, d4x4);
-  }
-  r4x4s = r4x4 + vp9_cost_zero(cm->prob_tx[0]);
-  if (can_skip) {
-    if (s4x4) {
-      rd4x4s = RDCOST(x->rdmult, x->rddiv, s1, d4x4);
-    } else {
-      rd4x4s = RDCOST(x->rdmult, x->rddiv, r4x4s + s0, d4x4);
-    }
-  } else {
-    rd4x4s = RDCOST(x->rdmult, x->rddiv, r4x4s, d4x4);
-  }
-
-  if (cpi->common.txfm_mode == ALLOW_16X16 ||
-      (cpi->common.txfm_mode == TX_MODE_SELECT &&
-       rd16x16s < rd8x8s && rd16x16s < rd4x4s)) {
-    mbmi->txfm_size = TX_16X16;
-    *skippable = s16x16;
-    *distortion = d16x16;
-    *rate = (cpi->common.txfm_mode == ALLOW_16X16) ? r16x16 : r16x16s;
-  } else if (cpi->common.txfm_mode == ALLOW_8X8 ||
-             (cpi->common.txfm_mode == TX_MODE_SELECT && rd8x8s < rd4x4s)) {
-    mbmi->txfm_size = TX_8X8;
-    *skippable = s8x8;
-    *distortion = d8x8;
-    *rate = (cpi->common.txfm_mode == ALLOW_8X8) ? r8x8 : r8x8s;
-  } else {
-    assert(cpi->common.txfm_mode == ONLY_4X4 ||
-           (cpi->common.txfm_mode == TX_MODE_SELECT && rd4x4s <= rd8x8s));
-    mbmi->txfm_size = TX_4X4;
-    *skippable = s4x4;
-    *distortion = d4x4;
-    *rate = (cpi->common.txfm_mode == ONLY_4X4) ? r4x4 : r4x4s;
-  }
-
-  txfm_cache[ONLY_4X4] = rd4x4;
-  txfm_cache[ALLOW_8X8] = rd8x8;
-  txfm_cache[ALLOW_16X16] = rd16x16;
-  if (rd16x16s < rd8x8s && rd16x16s < rd4x4s)
-    txfm_cache[TX_MODE_SELECT] = rd16x16s;
-  else
-    txfm_cache[TX_MODE_SELECT] = rd4x4s < rd8x8s ? rd4x4s : rd8x8s;
-
-}
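
When txfm_mode == TX_MODE_SELECT, the decision above reduces to taking the minimum of the three with-signalling costs, which is also what the txfm_cache[TX_MODE_SELECT] assignment computes. Condensed:

#include <stdint.h>

static int64_t select_txfm_rd_example(int64_t rd4x4s, int64_t rd8x8s,
                                      int64_t rd16x16s) {
  int64_t best = (rd4x4s < rd8x8s) ? rd4x4s : rd8x8s;
  return (rd16x16s < best) ? rd16x16s : best;
}
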
-
-static void copy_predictor(unsigned char *dst, const unsigned char *predictor) {
-  const unsigned int *p = (const unsigned int *)predictor;
-  unsigned int *d = (unsigned int *)dst;
-  d[0] = p[0];
-  d[4] = p[4];
-  d[8] = p[8];
-  d[12] = p[12];
-}
-
-#if CONFIG_SUPERBLOCKS
-static void super_block_yrd_8x8(MACROBLOCK *x,
-                                int *rate,
-                                int *distortion,
-                                const VP9_ENCODER_RTCD *rtcd, int *skip) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  BLOCK *const by2 = x->block + 24;
-  BLOCKD *const bdy2  = xd->block + 24;
-  int d = 0, r = 0, n;
-  const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer;
-  int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
-  ENTROPY_CONTEXT_PLANES *ta = xd->above_context;
-  ENTROPY_CONTEXT_PLANES *tl = xd->left_context;
-  ENTROPY_CONTEXT_PLANES t_above[2];
-  ENTROPY_CONTEXT_PLANES t_left[2];
-  int skippable = 1;
-
-  vpx_memcpy(t_above, xd->above_context, sizeof(t_above));
-  vpx_memcpy(t_left, xd->left_context, sizeof(t_left));
-
-  for (n = 0; n < 4; n++) {
-    int x_idx = n & 1, y_idx = n >> 1;
-
-    vp9_subtract_mby_s_c(x->src_diff,
-                         src + x_idx * 16 + y_idx * 16 * src_y_stride,
-                         src_y_stride,
-                         dst + x_idx * 16 + y_idx * 16 * dst_y_stride,
-                         dst_y_stride);
-    vp9_transform_mby_8x8(x);
-    vp9_quantize_mby_8x8(x);
-
-    /* remove 1st order dc to properly combine 1st/2nd order distortion */
-    x->coeff[  0] = 0;
-    x->coeff[ 64] = 0;
-    x->coeff[128] = 0;
-    x->coeff[192] = 0;
-    xd->dqcoeff[  0] = 0;
-    xd->dqcoeff[ 64] = 0;
-    xd->dqcoeff[128] = 0;
-    xd->dqcoeff[192] = 0;
-
-    d += vp9_mbblock_error(x, 0);
-    d += vp9_block_error(by2->coeff, bdy2->dqcoeff, 16);
-    xd->above_context = ta + x_idx;
-    xd->left_context = tl + y_idx;
-    r += rdcost_mby_8x8(x, 0);
-    skippable = skippable && vp9_mby_is_skippable_8x8(xd, 1);
-  }
-
-  *distortion = (d >> 2);
-  *rate       = r;
-  if (skip) *skip = skippable;
-  xd->above_context = ta;
-  xd->left_context = tl;
-  vpx_memcpy(xd->above_context, &t_above, sizeof(t_above));
-  vpx_memcpy(xd->left_context, &t_left, sizeof(t_left));
-}
-#endif
-
-static void copy_predictor_8x8(unsigned char *dst, const unsigned char *predictor) {
-  const unsigned int *p = (const unsigned int *)predictor;
-  unsigned int *d = (unsigned int *)dst;
-  d[0] = p[0];
-  d[1] = p[1];
-  d[4] = p[4];
-  d[5] = p[5];
-  d[8] = p[8];
-  d[9] = p[9];
-  d[12] = p[12];
-  d[13] = p[13];
-  d[16] = p[16];
-  d[17] = p[17];
-  d[20] = p[20];
-  d[21] = p[21];
-  d[24] = p[24];
-  d[25] = p[25];
-  d[28] = p[28];
-  d[29] = p[29];
-}
-
-static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *be,
-                                     BLOCKD *b, B_PREDICTION_MODE *best_mode,
-#if CONFIG_COMP_INTRA_PRED
-                                     B_PREDICTION_MODE *best_second_mode,
-                                     int allow_comp,
-#endif
-                                     int *bmode_costs,
-                                     ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
-                                     int *bestrate, int *bestratey,
-                                     int *bestdistortion) {
-  B_PREDICTION_MODE mode;
-  MACROBLOCKD *xd = &x->e_mbd;
-
-#if CONFIG_COMP_INTRA_PRED
-  B_PREDICTION_MODE mode2;
-#endif
-  int64_t best_rd = INT64_MAX;
-  int rate = 0;
-  int distortion;
-
-  ENTROPY_CONTEXT ta = *a, tempa = *a;
-  ENTROPY_CONTEXT tl = *l, templ = *l;
-  TX_TYPE tx_type = DCT_DCT;
-  TX_TYPE best_tx_type = DCT_DCT;
-  /*
-   * The predictor buffer is a 2d buffer with a stride of 16. Create
-   * a temp buffer that meets the stride requirements, but we are only
-   * interested in the left 4x4 block.
-   */
-  DECLARE_ALIGNED_ARRAY(16, unsigned char,  best_predictor, 16 * 4);
-  DECLARE_ALIGNED_ARRAY(16, short, best_dqcoeff, 16);
-
-  for (mode = B_DC_PRED; mode <= B_HU_PRED; mode++) {
-#if CONFIG_COMP_INTRA_PRED
-    for (mode2 = (allow_comp ? 0 : (B_DC_PRED - 1));
-                   mode2 != (allow_comp ? (mode + 1) : 0); mode2++) {
-#endif
-      int64_t this_rd;
-      int ratey;
-
-      b->bmi.as_mode.first = mode;
-      rate = bmode_costs[mode];
-
-#if CONFIG_COMP_INTRA_PRED
-      if (mode2 == (B_PREDICTION_MODE)(B_DC_PRED - 1)) {
-#endif
-        vp9_intra4x4_predict(b, mode, b->predictor);
-#if CONFIG_COMP_INTRA_PRED
-      } else {
-        vp9_comp_intra4x4_predict(b, mode, mode2, b->predictor);
-        rate += bmode_costs[mode2];
-      }
-#endif
-      vp9_subtract_b(be, b, 16);
-
-      b->bmi.as_mode.first = mode;
-      tx_type = get_tx_type_4x4(xd, b);
-      if (tx_type != DCT_DCT) {
-        vp9_fht(be->src_diff, 32, be->coeff, tx_type, 4);
-        vp9_ht_quantize_b_4x4(be, b, tx_type);
-      } else {
-        x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
-        x->quantize_b_4x4(be, b);
-      }
-
-      tempa = ta;
-      templ = tl;
-
-      ratey = cost_coeffs(x, b, PLANE_TYPE_Y_WITH_DC, &tempa, &templ, TX_4X4);
-      rate += ratey;
-      distortion = vp9_block_error(be->coeff, b->dqcoeff, 16) >> 2;
-
-      this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
-
-      if (this_rd < best_rd) {
-        *bestrate = rate;
-        *bestratey = ratey;
-        *bestdistortion = distortion;
-        best_rd = this_rd;
-        *best_mode = mode;
-        best_tx_type = tx_type;
-
-#if CONFIG_COMP_INTRA_PRED
-        *best_second_mode = mode2;
-#endif
-        *a = tempa;
-        *l = templ;
-        copy_predictor(best_predictor, b->predictor);
-        vpx_memcpy(best_dqcoeff, b->dqcoeff, 32);
-      }
-#if CONFIG_COMP_INTRA_PRED
-    }
-#endif
-  }
-  b->bmi.as_mode.first = (B_PREDICTION_MODE)(*best_mode);
-#if CONFIG_COMP_INTRA_PRED
-  b->bmi.as_mode.second = (B_PREDICTION_MODE)(*best_second_mode);
-#endif
-
-  // inverse transform
-  if (best_tx_type != DCT_DCT)
-    vp9_ihtllm_c(best_dqcoeff, b->diff, 32, best_tx_type, 4);
-  else
-    IDCT_INVOKE(IF_RTCD(&cpi->rtcd.common->idct), idct16)(
-        best_dqcoeff, b->diff, 32);
-
-  vp9_recon_b(best_predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-
-  return best_rd;
-}
-
-static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb, int *Rate,
-                                     int *rate_y, int *Distortion, int64_t best_rd,
-#if CONFIG_COMP_INTRA_PRED
-                                     int allow_comp,
-#endif
-                                     int update_contexts) {
-  int i;
-  MACROBLOCKD *const xd = &mb->e_mbd;
-  int cost = mb->mbmode_cost [xd->frame_type] [B_PRED];
-  int distortion = 0;
-  int tot_rate_y = 0;
-  int64_t total_rd = 0;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta, *tl;
-  int *bmode_costs;
-
-  if (update_contexts) {
-    ta = (ENTROPY_CONTEXT *)xd->above_context;
-    tl = (ENTROPY_CONTEXT *)xd->left_context;
-  } else {
-    vpx_memcpy(&t_above, xd->above_context,
-               sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memcpy(&t_left, xd->left_context,
-               sizeof(ENTROPY_CONTEXT_PLANES));
-
-    ta = (ENTROPY_CONTEXT *)&t_above;
-    tl = (ENTROPY_CONTEXT *)&t_left;
-  }
-
-  xd->mode_info_context->mbmi.mode = B_PRED;
-  bmode_costs = mb->inter_bmode_costs;
-
-  for (i = 0; i < 16; i++) {
-    MODE_INFO *const mic = xd->mode_info_context;
-    const int mis = xd->mode_info_stride;
-    B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
-#if CONFIG_COMP_INTRA_PRED
-    B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_second_mode);
-#endif
-    int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry),
-        UNINITIALIZED_IS_SAFE(d);
-
-    if (xd->frame_type == KEY_FRAME) {
-      const B_PREDICTION_MODE A = above_block_mode(mic, i, mis);
-      const B_PREDICTION_MODE L = left_block_mode(mic, i);
-
-      bmode_costs  = mb->bmode_costs[A][L];
-    }
-
-    total_rd += rd_pick_intra4x4block(
-                  cpi, mb, mb->block + i, xd->block + i, &best_mode,
-#if CONFIG_COMP_INTRA_PRED
-                  & best_second_mode, allow_comp,
-#endif
-                  bmode_costs, ta + vp9_block2above[i],
-                  tl + vp9_block2left[i], &r, &ry, &d);
-
-    cost += r;
-    distortion += d;
-    tot_rate_y += ry;
-
-    mic->bmi[i].as_mode.first = best_mode;
-#if CONFIG_COMP_INTRA_PRED
-    mic->bmi[i].as_mode.second = best_second_mode;
-#endif
-
-    if (total_rd >= best_rd)
-      break;
-  }
-
-  if (total_rd >= best_rd)
-    return INT64_MAX;
-
-#if CONFIG_COMP_INTRA_PRED
-  cost += vp9_cost_bit(128, allow_comp);
-#endif
-  *Rate = cost;
-  *rate_y += tot_rate_y;
-  *Distortion = distortion;
-
-  return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
-}
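-
-/* Minimal sketch of the scratch-context idiom above, using a hypothetical
- * context type. Token costing advances the above/left entropy contexts as
- * a side effect, so the RD search either points at the real contexts
- * (updates persist) or at stack copies (updates are thrown away). */
-typedef struct {
-  unsigned char planes[8];  /* hypothetical per-plane context bytes */
-} SKETCH_CTX;
-
-static unsigned char *sketch_ctx_for_search(SKETCH_CTX *real,
-                                            SKETCH_CTX *scratch,
-                                            int update_contexts) {
-  if (update_contexts)
-    return (unsigned char *)real;  /* mutate the real contexts */
-  *scratch = *real;                /* cost against a throwaway copy */
-  return (unsigned char *)scratch;
-}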
-
-#if CONFIG_SUPERBLOCKS
-static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi,
-                                      MACROBLOCK *x,
-                                      int *rate,
-                                      int *rate_tokenonly,
-                                      int *distortion,
-                                      int *skippable) {
-  MB_PREDICTION_MODE mode;
-  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
-  int this_rate, this_rate_tokenonly;
-  int this_distortion, s;
-  int64_t best_rd = INT64_MAX, this_rd;
-
-  /* Y Search for 32x32 intra prediction mode */
-  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
-    x->e_mbd.mode_info_context->mbmi.mode = mode;
-    vp9_build_intra_predictors_sby_s(&x->e_mbd);
-
-    super_block_yrd_8x8(x, &this_rate_tokenonly,
-                        &this_distortion, IF_RTCD(&cpi->rtcd), &s);
-    this_rate = this_rate_tokenonly +
-                x->mbmode_cost[x->e_mbd.frame_type]
-                              [x->e_mbd.mode_info_context->mbmi.mode];
-    this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
-
-    if (this_rd < best_rd) {
-      mode_selected   = mode;
-      best_rd         = this_rd;
-      *rate           = this_rate;
-      *rate_tokenonly = this_rate_tokenonly;
-      *distortion     = this_distortion;
-      *skippable      = s;
-    }
-  }
-
-  x->e_mbd.mode_info_context->mbmi.mode = mode_selected;
-
-  return best_rd;
-}
-#endif
-
-static int64_t rd_pick_intra16x16mby_mode(VP9_COMP *cpi,
-                                          MACROBLOCK *x,
-                                          int *Rate,
-                                          int *rate_y,
-                                          int *Distortion,
-                                          int *skippable,
-                                          int64_t txfm_cache[NB_TXFM_MODES]) {
-  MB_PREDICTION_MODE mode;
-  TX_SIZE txfm_size;
-  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
-#if CONFIG_COMP_INTRA_PRED
-  MB_PREDICTION_MODE mode2;
-  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode2_selected);
-#endif
-  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-  int rate, ratey;
-  int distortion, skip;
-  int64_t best_rd = INT64_MAX;
-  int64_t this_rd;
-  MACROBLOCKD *xd = &x->e_mbd;
-
-  int i;
-  for (i = 0; i < NB_TXFM_MODES; i++)
-    txfm_cache[i] = INT64_MAX;
-
-  // Y Search for 16x16 intra prediction mode
-  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
-    int64_t local_txfm_cache[NB_TXFM_MODES];
-
-    mbmi->mode = mode;
-
-#if CONFIG_COMP_INTRA_PRED
-    for (mode2 = DC_PRED - 1; mode2 != TM_PRED + 1; mode2++) {
-      mbmi->second_mode = mode2;
-      if (mode2 == (MB_PREDICTION_MODE)(DC_PRED - 1)) {
-#endif
-        vp9_build_intra_predictors_mby(&x->e_mbd);
-#if CONFIG_COMP_INTRA_PRED
-      } else {
-        continue; // i.e. disable for now
-        vp9_build_comp_intra_predictors_mby(&x->e_mbd);
-      }
-#endif
-
-      macro_block_yrd(cpi, x, &ratey, &distortion, &skip, local_txfm_cache);
-
-      // FIXME add compoundmode cost
-      // FIXME add rate for mode2
-      rate = ratey + x->mbmode_cost[x->e_mbd.frame_type][mbmi->mode];
-
-      this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
-
-      if (this_rd < best_rd) {
-        mode_selected = mode;
-        txfm_size = mbmi->txfm_size;
-#if CONFIG_COMP_INTRA_PRED
-        mode2_selected = mode2;
-#endif
-        best_rd = this_rd;
-        *Rate = rate;
-        *rate_y = ratey;
-        *Distortion = distortion;
-        *skippable = skip;
-      }
-
-      for (i = 0; i < NB_TXFM_MODES; i++) {
-        int64_t adj_rd = this_rd + local_txfm_cache[i] -
-                          local_txfm_cache[cpi->common.txfm_mode];
-        if (adj_rd < txfm_cache[i]) {
-          txfm_cache[i] = adj_rd;
-        }
-      }
-
-#if CONFIG_COMP_INTRA_PRED
-    }
-#endif
-  }
-
-  mbmi->txfm_size = txfm_size;
-  mbmi->mode = mode_selected;
-
-#if CONFIG_COMP_INTRA_PRED
-  mbmi->second_mode = mode2_selected;
-#endif
-  return best_rd;
-}
-
-
-static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
-                                     B_PREDICTION_MODE *best_mode,
-#if CONFIG_COMP_INTRA_PRED
-                                     B_PREDICTION_MODE *best_second_mode,
-#endif
-                                     int *mode_costs,
-                                     ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
-                                     int *bestrate, int *bestratey,
-                                     int *bestdistortion) {
-  MB_PREDICTION_MODE mode;
-#if CONFIG_COMP_INTRA_PRED
-  MB_PREDICTION_MODE mode2;
-#endif
-  MACROBLOCKD *xd = &x->e_mbd;
-  int64_t best_rd = INT64_MAX;
-  int distortion, rate = 0;
-  BLOCK  *be = x->block + ib;
-  BLOCKD *b = xd->block + ib;
-  ENTROPY_CONTEXT ta0, ta1, besta0 = 0, besta1 = 0;
-  ENTROPY_CONTEXT tl0, tl1, bestl0 = 0, bestl1 = 0;
-
-  /*
-   * The predictor buffer is a 2-D buffer with a stride of 16. Create
-   * a temp buffer that meets the stride requirement, but only the
-   * left 8x8 block is of interest.
-   */
-  DECLARE_ALIGNED_ARRAY(16, unsigned char,  best_predictor, 16 * 8);
-  DECLARE_ALIGNED_ARRAY(16, short, best_dqcoeff, 16 * 4);
-
-  // Perform an 8x8 transform; note the input and output index mapping.
-  int idx = (ib & 0x02) ? (ib + 2) : ib;
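-  // For the 8x8 block anchors ib in {0, 2, 8, 10} this yields idx in
-  // {0, 4, 8, 12}: the 8x8 coefficient block of each sub-block starts
-  // four 4x4 blocks apart.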
-
-  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
-#if CONFIG_COMP_INTRA_PRED
-    for (mode2 = DC_PRED - 1; mode2 != TM_PRED + 1; mode2++) {
-#endif
-      int64_t this_rd;
-      int rate_t;
-
-      // FIXME rate for compound mode and second intrapred mode
-      rate = mode_costs[mode];
-      b->bmi.as_mode.first = mode;
-
-#if CONFIG_COMP_INTRA_PRED
-      if (mode2 == (MB_PREDICTION_MODE)(DC_PRED - 1)) {
-#endif
-        vp9_intra8x8_predict(b, mode, b->predictor);
-#if CONFIG_COMP_INTRA_PRED
-      } else {
-        continue; // i.e. disable for now
-        vp9_comp_intra8x8_predict(b, mode, mode2, b->predictor);
-      }
-#endif
-
-      vp9_subtract_4b_c(be, b, 16);
-
-      if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
-        TX_TYPE tx_type = get_tx_type_8x8(xd, b);
-        if (tx_type != DCT_DCT)
-          vp9_fht(be->src_diff, 32, (x->block + idx)->coeff, tx_type, 8);
-        else
-          x->vp9_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32);
-        x->quantize_b_8x8(x->block + idx, xd->block + idx);
-
-        // compute quantization mse of 8x8 block
-        distortion = vp9_block_error_c((x->block + idx)->coeff,
-                                       (xd->block + idx)->dqcoeff, 64);
-        ta0 = a[vp9_block2above_8x8[idx]];
-        tl0 = l[vp9_block2left_8x8[idx]];
-
-        rate_t = cost_coeffs(x, xd->block + idx, PLANE_TYPE_Y_WITH_DC,
-                             &ta0, &tl0, TX_8X8);
-
-        rate += rate_t;
-        ta1 = ta0;
-        tl1 = tl0;
-      } else {
-        x->vp9_short_fdct8x4(be->src_diff, be->coeff, 32);
-        x->vp9_short_fdct8x4((be + 4)->src_diff, (be + 4)->coeff, 32);
-
-        x->quantize_b_4x4_pair(x->block + ib, x->block + ib + 1,
-                               xd->block + ib, xd->block + ib + 1);
-        x->quantize_b_4x4_pair(x->block + ib + 4, x->block + ib + 5,
-                               xd->block + ib + 4, xd->block + ib + 5);
-
-        distortion = vp9_block_error_c((x->block + ib)->coeff,
-                                       (xd->block + ib)->dqcoeff, 16);
-        distortion += vp9_block_error_c((x->block + ib + 1)->coeff,
-                                        (xd->block + ib + 1)->dqcoeff, 16);
-        distortion += vp9_block_error_c((x->block + ib + 4)->coeff,
-                                        (xd->block + ib + 4)->dqcoeff, 16);
-        distortion += vp9_block_error_c((x->block + ib + 5)->coeff,
-                                        (xd->block + ib + 5)->dqcoeff, 16);
-
-        ta0 = a[vp9_block2above[ib]];
-        ta1 = a[vp9_block2above[ib + 1]];
-        tl0 = l[vp9_block2left[ib]];
-        tl1 = l[vp9_block2left[ib + 4]];
-        rate_t = cost_coeffs(x, xd->block + ib, PLANE_TYPE_Y_WITH_DC,
-                             &ta0, &tl0, TX_4X4);
-        rate_t += cost_coeffs(x, xd->block + ib + 1, PLANE_TYPE_Y_WITH_DC,
-                              &ta1, &tl0, TX_4X4);
-        rate_t += cost_coeffs(x, xd->block + ib + 4, PLANE_TYPE_Y_WITH_DC,
-                              &ta0, &tl1, TX_4X4);
-        rate_t += cost_coeffs(x, xd->block + ib + 5, PLANE_TYPE_Y_WITH_DC,
-                              &ta1, &tl1, TX_4X4);
-        rate += rate_t;
-      }
-
-      distortion >>= 2;
-      this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
-      if (this_rd < best_rd) {
-        *bestrate = rate;
-        *bestratey = rate_t;
-        *bestdistortion = distortion;
-        besta0 = ta0;
-        besta1 = ta1;
-        bestl0 = tl0;
-        bestl1 = tl1;
-        best_rd = this_rd;
-        *best_mode = mode;
-#if CONFIG_COMP_INTRA_PRED
-        *best_second_mode = mode2;
-#endif
-        copy_predictor_8x8(best_predictor, b->predictor);
-        vpx_memcpy(best_dqcoeff, b->dqcoeff, 64);
-        vpx_memcpy(best_dqcoeff + 32, b->dqcoeff + 64, 64);
-#if CONFIG_COMP_INTRA_PRED
-      }
-#endif
-    }
-  }
-  b->bmi.as_mode.first = (*best_mode);
-#if CONFIG_COMP_INTRA_PRED
-  b->bmi.as_mode.second = (*best_second_mode);
-#endif
-  vp9_encode_intra8x8(IF_RTCD(&cpi->rtcd), x, ib);
-
-  if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
-    a[vp9_block2above_8x8[idx]]     = besta0;
-    a[vp9_block2above_8x8[idx] + 1] = besta1;
-    l[vp9_block2left_8x8[idx]]      = bestl0;
-    l[vp9_block2left_8x8[idx] + 1]  = bestl1;
-  } else {
-    a[vp9_block2above[ib]]     = besta0;
-    a[vp9_block2above[ib + 1]] = besta1;
-    l[vp9_block2left[ib]]      = bestl0;
-    l[vp9_block2left[ib + 4]]  = bestl1;
-  }
-
-  return best_rd;
-}
-
-static int64_t rd_pick_intra8x8mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
-                                         int *Rate, int *rate_y,
-                                         int *Distortion, int64_t best_rd) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
-  int i, ib;
-  int cost = mb->mbmode_cost[xd->frame_type][I8X8_PRED];
-  int distortion = 0;
-  int tot_rate_y = 0;
-  int64_t total_rd = 0;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta, *tl;
-  int *i8x8mode_costs;
-
-  vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
-
-  xd->mode_info_context->mbmi.mode = I8X8_PRED;
-  i8x8mode_costs  = mb->i8x8_mode_costs;
-
-  for (i = 0; i < 4; i++) {
-    MODE_INFO *const mic = xd->mode_info_context;
-    B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
-#if CONFIG_COMP_INTRA_PRED
-    B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_second_mode);
-#endif
-    int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry),
-        UNINITIALIZED_IS_SAFE(d);
-
-    ib = vp9_i8x8_block[i];
-    total_rd += rd_pick_intra8x8block(
-                  cpi, mb, ib, &best_mode,
-#if CONFIG_COMP_INTRA_PRED
-                  & best_second_mode,
-#endif
-                  i8x8mode_costs, ta, tl, &r, &ry, &d);
-    cost += r;
-    distortion += d;
-    tot_rate_y += ry;
-    mic->bmi[ib].as_mode.first = best_mode;
-#if CONFIG_COMP_INTRA_PRED
-    mic->bmi[ib].as_mode.second = best_second_mode;
-#endif
-  }
-  *Rate = cost;
-  *rate_y += tot_rate_y;
-  *Distortion = distortion;
-  return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
-}
-
-static int rd_cost_mbuv(MACROBLOCK *mb) {
-  int b;
-  int cost = 0;
-  MACROBLOCKD *xd = &mb->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta, *tl;
-
-  vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
-
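-  // Blocks 16-19 carry the U plane and blocks 20-23 the V plane in the
-  // macroblock's block layout.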
-  for (b = 16; b < 24; b++)
-    cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_UV,
-                        ta + vp9_block2above[b], tl + vp9_block2left[b],
-                        TX_4X4);
-
-  return cost;
-}
-
-
-static int64_t rd_inter16x16_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
-                                int *distortion, int fullpixel, int *skip) {
-  vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
-                    x->e_mbd.predictor, x->src.uv_stride);
-
-  vp9_transform_mbuv_4x4(x);
-  vp9_quantize_mbuv_4x4(x);
-
-  *rate       = rd_cost_mbuv(x);
-  *distortion = vp9_mbuverror(x) / 4;
-  *skip       = vp9_mbuv_is_skippable_4x4(&x->e_mbd);
-
-  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
-}
-
-static int rd_cost_mbuv_8x8(MACROBLOCK *mb, int backup) {
-  int b;
-  int cost = 0;
-  MACROBLOCKD *xd = &mb->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta, *tl;
-
-  if (backup) {
-    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-    ta = (ENTROPY_CONTEXT *)&t_above;
-    tl = (ENTROPY_CONTEXT *)&t_left;
-  } else {
-    ta = (ENTROPY_CONTEXT *)mb->e_mbd.above_context;
-    tl = (ENTROPY_CONTEXT *)mb->e_mbd.left_context;
-  }
-
-  for (b = 16; b < 24; b += 4)
-    cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_UV,
-                        ta + vp9_block2above_8x8[b],
-                        tl + vp9_block2left_8x8[b], TX_8X8);
-
-  return cost;
-}
-
-#if CONFIG_SUPERBLOCKS
-static int64_t rd_inter32x32_uv_8x8(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
-                                int *distortion, int fullpixel, int *skip) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  int n, r = 0, d = 0;
-  const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
-  const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
-  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
-  int skippable = 1;
-  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
-  ENTROPY_CONTEXT_PLANES *ta = xd->above_context;
-  ENTROPY_CONTEXT_PLANES *tl = xd->left_context;
-
-  memcpy(t_above, xd->above_context, sizeof(t_above));
-  memcpy(t_left, xd->left_context, sizeof(t_left));
-
-  for (n = 0; n < 4; n++) {
-    int x_idx = n & 1, y_idx = n >> 1;
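-    /* n = 0..3 walks the 8x8 quadrants (0,0), (1,0), (0,1), (1,1) */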
-
-    vp9_subtract_mbuv_s_c(x->src_diff,
-                          usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                          vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                          src_uv_stride,
-                          udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                          vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                          dst_uv_stride);
-
-    vp9_transform_mbuv_8x8(x);
-    vp9_quantize_mbuv_8x8(x);
-
-    xd->above_context = ta + x_idx;
-    xd->left_context = tl + y_idx;
-    r += rd_cost_mbuv_8x8(x, 0);
-    d += vp9_mbuverror(x) / 4;
-    skippable = skippable && vp9_mbuv_is_skippable_8x8(xd);
-  }
-
-  *rate = r;
-  *distortion = d;
-  if (skip) *skip = skippable;
-  xd->left_context = tl;
-  xd->above_context = ta;
-  memcpy(xd->above_context, t_above, sizeof(t_above));
-  memcpy(xd->left_context, t_left, sizeof(t_left));
-
-  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
-}
-#endif
-
-static int64_t rd_inter16x16_uv_8x8(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
-                                    int *distortion, int fullpixel, int *skip) {
-  vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
-                    x->e_mbd.predictor, x->src.uv_stride);
-
-  vp9_transform_mbuv_8x8(x);
-  vp9_quantize_mbuv_8x8(x);
-
-  *rate       = rd_cost_mbuv_8x8(x, 1);
-  *distortion = vp9_mbuverror(x) / 4;
-  *skip       = vp9_mbuv_is_skippable_8x8(&x->e_mbd);
-
-  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
-}
-
-
-static int64_t rd_inter4x4_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
-                              int *distortion, int *skippable, int fullpixel) {
-  vp9_build_inter4x4_predictors_mbuv(&x->e_mbd);
-  vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
-                    x->e_mbd.predictor, x->src.uv_stride);
-
-  vp9_transform_mbuv_4x4(x);
-  vp9_quantize_mbuv_4x4(x);
-
-  *rate       = rd_cost_mbuv(x);
-  *distortion = vp9_mbuverror(x) / 4;
-  *skippable  = vp9_mbuv_is_skippable_4x4(&x->e_mbd);
-
-  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
-}
-
-static void rd_pick_intra_mbuv_mode(VP9_COMP *cpi,
-                                    MACROBLOCK *x,
-                                    int *rate,
-                                    int *rate_tokenonly,
-                                    int *distortion,
-                                    int *skippable) {
-  MB_PREDICTION_MODE mode;
-  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
-#if CONFIG_COMP_INTRA_PRED
-  MB_PREDICTION_MODE mode2;
-  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode2_selected);
-#endif
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-  int64_t best_rd = INT64_MAX;
-  int UNINITIALIZED_IS_SAFE(d), UNINITIALIZED_IS_SAFE(r);
-  int rate_to, UNINITIALIZED_IS_SAFE(skip);
-
-  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
-#if CONFIG_COMP_INTRA_PRED
-    for (mode2 = DC_PRED - 1; mode2 != TM_PRED + 1; mode2++) {
-#endif
-      int rate;
-      int distortion;
-      int64_t this_rd;
-
-      mbmi->uv_mode = mode;
-#if CONFIG_COMP_INTRA_PRED
-      mbmi->second_uv_mode = mode2;
-      if (mode2 == (MB_PREDICTION_MODE)(DC_PRED - 1)) {
-#endif
-        vp9_build_intra_predictors_mbuv(&x->e_mbd);
-#if CONFIG_COMP_INTRA_PRED
-      } else {
-        continue;
-        vp9_build_comp_intra_predictors_mbuv(&x->e_mbd);
-      }
-#endif
-
-      vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
-                        x->e_mbd.predictor, x->src.uv_stride);
-      vp9_transform_mbuv_4x4(x);
-      vp9_quantize_mbuv_4x4(x);
-
-      rate_to = rd_cost_mbuv(x);
-      rate = rate_to +
-             x->intra_uv_mode_cost[x->e_mbd.frame_type][mbmi->uv_mode];
-
-      distortion = vp9_mbuverror(x) / 4;
-
-      this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
-
-      if (this_rd < best_rd) {
-        skip = vp9_mbuv_is_skippable_4x4(xd);
-        best_rd = this_rd;
-        d = distortion;
-        r = rate;
-        *rate_tokenonly = rate_to;
-        mode_selected = mode;
-#if CONFIG_COMP_INTRA_PRED
-        mode2_selected = mode2;
-      }
-#endif
-    }
-  }
-
-  *rate = r;
-  *distortion = d;
-  *skippable = skip;
-
-  mbmi->uv_mode = mode_selected;
-#if CONFIG_COMP_INTRA_PRED
-  mbmi->second_uv_mode = mode2_selected;
-#endif
-}
-
-static void rd_pick_intra_mbuv_mode_8x8(VP9_COMP *cpi,
-                                        MACROBLOCK *x,
-                                        int *rate,
-                                        int *rate_tokenonly,
-                                        int *distortion,
-                                        int *skippable) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_PREDICTION_MODE mode;
-  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
-  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-  int64_t best_rd = INT64_MAX;
-  int UNINITIALIZED_IS_SAFE(d), UNINITIALIZED_IS_SAFE(r);
-  int rate_to, UNINITIALIZED_IS_SAFE(skip);
-
-  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
-    int rate;
-    int distortion;
-    int64_t this_rd;
-
-    mbmi->uv_mode = mode;
-    vp9_build_intra_predictors_mbuv(&x->e_mbd);
-    vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
-                      x->e_mbd.predictor, x->src.uv_stride);
-    vp9_transform_mbuv_8x8(x);
-
-    vp9_quantize_mbuv_8x8(x);
-
-    rate_to = rd_cost_mbuv_8x8(x, 1);
-    rate = rate_to + x->intra_uv_mode_cost[x->e_mbd.frame_type][mbmi->uv_mode];
-
-    distortion = vp9_mbuverror(x) / 4;
-    this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
-
-    if (this_rd < best_rd) {
-      skip = vp9_mbuv_is_skippable_8x8(xd);
-      best_rd = this_rd;
-      d = distortion;
-      r = rate;
-      *rate_tokenonly = rate_to;
-      mode_selected = mode;
-    }
-  }
-  *rate = r;
-  *distortion = d;
-  *skippable = skip;
-  mbmi->uv_mode = mode_selected;
-}
-
-#if CONFIG_SUPERBLOCKS
-static void super_block_uvrd_8x8(MACROBLOCK *x,
-                                 int *rate,
-                                 int *distortion,
-                                 const VP9_ENCODER_RTCD *rtcd,
-                                 int *skippable) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int d = 0, r = 0, n, s = 1;
-  const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
-  const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
-  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
-  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
-  ENTROPY_CONTEXT_PLANES *ta = xd->above_context;
-  ENTROPY_CONTEXT_PLANES *tl = xd->left_context;
-
-  memcpy(t_above, xd->above_context, sizeof(t_above));
-  memcpy(t_left,  xd->left_context,  sizeof(t_left));
-
-  for (n = 0; n < 4; n++) {
-    int x_idx = n & 1, y_idx = n >> 1;
-
-    vp9_subtract_mbuv_s_c(x->src_diff,
-                          usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                          vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                          src_uv_stride,
-                          udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                          vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                          dst_uv_stride);
-    vp9_transform_mbuv_8x8(x);
-    vp9_quantize_mbuv_8x8(x);
-    s &= vp9_mbuv_is_skippable_8x8(xd);
-
-    d += vp9_mbuverror(x) >> 2;
-    xd->above_context = ta + x_idx;
-    xd->left_context = tl + y_idx;
-    r += rd_cost_mbuv_8x8(x, 0);
-  }
-
-  xd->above_context = ta;
-  xd->left_context = tl;
-  memcpy(xd->above_context, t_above, sizeof(t_above));
-  memcpy(xd->left_context,  t_left,  sizeof(t_left));
-
-  *distortion = d;
-  *rate       = r;
-  *skippable  = s;
-}
-
-static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi,
-                                       MACROBLOCK *x,
-                                       int *rate,
-                                       int *rate_tokenonly,
-                                       int *distortion,
-                                       int *skippable) {
-  MB_PREDICTION_MODE mode;
-  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
-  int64_t best_rd = INT64_MAX, this_rd;
-  int this_rate_tokenonly, this_rate;
-  int this_distortion, s;
-
-  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
-    x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
-    vp9_build_intra_predictors_sbuv_s(&x->e_mbd);
-
-    super_block_uvrd_8x8(x, &this_rate_tokenonly,
-                         &this_distortion, IF_RTCD(&cpi->rtcd), &s);
-    this_rate = this_rate_tokenonly +
-                x->intra_uv_mode_cost[x->e_mbd.frame_type]
-                                     [x->e_mbd.mode_info_context->mbmi.uv_mode];
-    this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
-
-    if (this_rd < best_rd) {
-      mode_selected   = mode;
-      best_rd         = this_rd;
-      *rate           = this_rate;
-      *rate_tokenonly = this_rate_tokenonly;
-      *distortion     = this_distortion;
-      *skippable      = s;
-    }
-  }
-
-  x->e_mbd.mode_info_context->mbmi.uv_mode = mode_selected;
-
-  return best_rd;
-}
-#endif
-
-int vp9_cost_mv_ref(VP9_COMP *cpi,
-                    MB_PREDICTION_MODE m,
-                    const int near_mv_ref_ct[4]) {
-  MACROBLOCKD *xd = &cpi->mb.e_mbd;
-  int segment_id = xd->mode_info_context->mbmi.segment_id;
-
-  // If the mode coding is done entirely at the segment level
-  // we should not account for it at the per mb level in rd code.
-  // Note that if the segment level coding is expanded from single mode
-  // to multiple mode masks as per reference frame coding we will need
-  // to do something different here.
-  if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
-    VP9_COMMON *pc = &cpi->common;
-
-    vp9_prob p[VP9_MVREFS - 1];
-    assert(NEARESTMV <= m && m <= SPLITMV);
-    vp9_mv_ref_probs(pc, p, near_mv_ref_ct);
-    return cost_token(vp9_mv_ref_tree, p,
-                      vp9_mv_ref_encoding_array - NEARESTMV + m);
-  } else
-    return 0;
-}
-
-void vp9_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv) {
-  x->e_mbd.mode_info_context->mbmi.mode = mb;
-  x->e_mbd.mode_info_context->mbmi.mv[0].as_int = mv->as_int;
-}
-
-static int labels2mode(
-  MACROBLOCK *x,
-  int const *labelings, int which_label,
-  B_PREDICTION_MODE this_mode,
-  int_mv *this_mv, int_mv *this_second_mv,
-  int_mv seg_mvs[MAX_REF_FRAMES - 1],
-  int_mv *best_ref_mv,
-  int_mv *second_best_ref_mv,
-  DEC_MVCOSTS) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MODE_INFO *const mic = xd->mode_info_context;
-  MB_MODE_INFO * mbmi = &mic->mbmi;
-  const int mis = xd->mode_info_stride;
-
-  int i, cost = 0, thismvcost = 0;
-
-  /* We have to be careful retrieving previously-encoded motion vectors.
-     Ones from this macroblock have to be pulled from the BLOCKD array
-     as they have not yet made it to the bmi array in our MB_MODE_INFO. */
-  for (i = 0; i < 16; ++i) {
-    BLOCKD *const d = xd->block + i;
-    const int row = i >> 2,  col = i & 3;
-
-    B_PREDICTION_MODE m;
-
-    if (labelings[i] != which_label)
-      continue;
-
-    if (col && labelings[i] == labelings[i - 1])
-      m = LEFT4X4;
-    else if (row && labelings[i] == labelings[i - 4])
-      m = ABOVE4X4;
-    else {
-      // The only time we should do costing for a new motion vector or
-      // mode is when we are on a new label (jbb May 08, 2007).
-      switch (m = this_mode) {
-        case NEW4X4 :
-          if (mbmi->second_ref_frame) {
-            this_mv->as_int = seg_mvs[mbmi->ref_frame - 1].as_int;
-            this_second_mv->as_int =
-              seg_mvs[mbmi->second_ref_frame - 1].as_int;
-          }
-
-          thismvcost  = vp9_mv_bit_cost(this_mv, best_ref_mv, MVCOSTS,
-                                        102, xd->allow_high_precision_mv);
-          if (mbmi->second_ref_frame) {
-            thismvcost += vp9_mv_bit_cost(this_second_mv, second_best_ref_mv,
-                                          MVCOSTS, 102,
-                                          xd->allow_high_precision_mv);
-          }
-          break;
-        case LEFT4X4:
-          this_mv->as_int = col ? d[-1].bmi.as_mv.first.as_int
-                                : left_block_mv(mic, i);
-          if (mbmi->second_ref_frame)
-            this_second_mv->as_int = col ? d[-1].bmi.as_mv.second.as_int
-                                         : left_block_second_mv(mic, i);
-          break;
-        case ABOVE4X4:
-          this_mv->as_int = row ? d[-4].bmi.as_mv.first.as_int
-                                : above_block_mv(mic, i, mis);
-          if (mbmi->second_ref_frame)
-            this_second_mv->as_int = row ? d[-4].bmi.as_mv.second.as_int
-                                         : above_block_second_mv(mic, i, mis);
-          break;
-        case ZERO4X4:
-          this_mv->as_int = 0;
-          if (mbmi->second_ref_frame)
-            this_second_mv->as_int = 0;
-          break;
-        default:
-          break;
-      }
-
-      if (m == ABOVE4X4) { // replace above with left if same
-        int_mv left_mv, left_second_mv;
-
-        left_second_mv.as_int = 0;
-        left_mv.as_int = col ? d[-1].bmi.as_mv.first.as_int :
-                         left_block_mv(mic, i);
-        if (mbmi->second_ref_frame)
-          left_second_mv.as_int = col ? d[-1].bmi.as_mv.second.as_int :
-                                  left_block_second_mv(mic, i);
-
-        if (left_mv.as_int == this_mv->as_int &&
-            (!mbmi->second_ref_frame ||
-             left_second_mv.as_int == this_second_mv->as_int))
-          m = LEFT4X4;
-      }
-
-      cost = x->inter_bmode_costs[m];
-    }
-
-    d->bmi.as_mv.first.as_int = this_mv->as_int;
-    if (mbmi->second_ref_frame)
-      d->bmi.as_mv.second.as_int = this_second_mv->as_int;
-
-    x->partition_info->bmi[i].mode = m;
-    x->partition_info->bmi[i].mv.as_int = this_mv->as_int;
-    if (mbmi->second_ref_frame)
-      x->partition_info->bmi[i].second_mv.as_int = this_second_mv->as_int;
-  }
-
-  cost += thismvcost;
-  return cost;
-}
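-
-/* Illustrative sketch: a labeling for PARTITIONING_16X8, assuming the
- * vp9_mbsplits convention of one label per 4x4 block in raster order.
- * labels2mode() above walks all 16 entries and processes only those
- * whose label matches which_label. */
-static const int sketch_labels_16x8[16] = {
-  0, 0, 0, 0,
-  0, 0, 0, 0,   /* top 16x8 half:    label 0 */
-  1, 1, 1, 1,
-  1, 1, 1, 1,   /* bottom 16x8 half: label 1 */
-};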
-
-static int64_t encode_inter_mb_segment(MACROBLOCK *x,
-                                       int const *labels,
-                                       int which_label,
-                                       int *labelyrate,
-                                       int *distortion,
-                                       ENTROPY_CONTEXT *ta,
-                                       ENTROPY_CONTEXT *tl,
-                                       const VP9_ENCODER_RTCD *rtcd) {
-  int i;
-  MACROBLOCKD *xd = &x->e_mbd;
-
-  *labelyrate = 0;
-  *distortion = 0;
-  for (i = 0; i < 16; i++) {
-    if (labels[i] == which_label) {
-      BLOCKD *bd = &x->e_mbd.block[i];
-      BLOCK *be = &x->block[i];
-      int thisdistortion;
-
-      vp9_build_inter_predictors_b(bd, 16, xd->subpixel_predict);
-      if (xd->mode_info_context->mbmi.second_ref_frame)
-        vp9_build_2nd_inter_predictors_b(bd, 16, xd->subpixel_predict_avg);
-      vp9_subtract_b(be, bd, 16);
-      x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
-      x->quantize_b_4x4(be, bd);
-      thisdistortion = vp9_block_error(be->coeff, bd->dqcoeff, 16);
-      *distortion += thisdistortion;
-      *labelyrate += cost_coeffs(x, bd, PLANE_TYPE_Y_WITH_DC,
-                                 ta + vp9_block2above[i],
-                                 tl + vp9_block2left[i], TX_4X4);
-    }
-  }
-  *distortion >>= 2;
-  return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
-}
-
-static int64_t encode_inter_mb_segment_8x8(MACROBLOCK *x,
-                                           int const *labels,
-                                           int which_label,
-                                           int *labelyrate,
-                                           int *distortion,
-                                           int64_t *otherrd,
-                                           ENTROPY_CONTEXT *ta,
-                                           ENTROPY_CONTEXT *tl,
-                                           const VP9_ENCODER_RTCD *rtcd) {
-  int i, j;
-  MACROBLOCKD *xd = &x->e_mbd;
-  const int iblock[4] = { 0, 1, 4, 5 };
-  int othercost = 0, otherdist = 0;
-  ENTROPY_CONTEXT_PLANES tac, tlc;
-  ENTROPY_CONTEXT *tacp = (ENTROPY_CONTEXT *) &tac,
-                  *tlcp = (ENTROPY_CONTEXT *) &tlc;
-
-  if (otherrd) {
-    memcpy(&tac, ta, sizeof(ENTROPY_CONTEXT_PLANES));
-    memcpy(&tlc, tl, sizeof(ENTROPY_CONTEXT_PLANES));
-  }
-
-  *distortion = 0;
-  *labelyrate = 0;
-  for (i = 0; i < 4; i++) {
-    int ib = vp9_i8x8_block[i];
-
-    if (labels[ib] == which_label) {
-      int idx = (ib & 8) + ((ib & 2) << 1);
-      BLOCKD *bd = &xd->block[ib], *bd2 = &xd->block[idx];
-      BLOCK *be = &x->block[ib], *be2 = &x->block[idx];
-      int thisdistortion;
-
-      vp9_build_inter_predictors4b(xd, bd, 16);
-      if (xd->mode_info_context->mbmi.second_ref_frame)
-        vp9_build_2nd_inter_predictors4b(xd, bd, 16);
-      vp9_subtract_4b_c(be, bd, 16);
-
-      if (xd->mode_info_context->mbmi.txfm_size == TX_4X4) {
-        if (otherrd) {
-          x->vp9_short_fdct8x8(be->src_diff, be2->coeff, 32);
-          x->quantize_b_8x8(be2, bd2);
-          thisdistortion = vp9_block_error_c(be2->coeff, bd2->dqcoeff, 64);
-          otherdist += thisdistortion;
-          othercost += cost_coeffs(x, bd2, PLANE_TYPE_Y_WITH_DC,
-                                     tacp + vp9_block2above_8x8[idx],
-                                     tlcp + vp9_block2left_8x8[idx], TX_8X8);
-        }
-        for (j = 0; j < 4; j += 2) {
-          bd = &xd->block[ib + iblock[j]];
-          be = &x->block[ib + iblock[j]];
-          x->vp9_short_fdct8x4(be->src_diff, be->coeff, 32);
-          x->quantize_b_4x4_pair(be, be + 1, bd, bd + 1);
-          thisdistortion = vp9_block_error_c(be->coeff, bd->dqcoeff, 32);
-          *distortion += thisdistortion;
-          *labelyrate += cost_coeffs(x, bd, PLANE_TYPE_Y_WITH_DC,
-                                     ta + vp9_block2above[ib + iblock[j]],
-                                     tl + vp9_block2left[ib + iblock[j]],
-                                     TX_4X4);
-          *labelyrate += cost_coeffs(x, bd + 1, PLANE_TYPE_Y_WITH_DC,
-                                     ta + vp9_block2above[ib + iblock[j] + 1],
-                                     tl + vp9_block2left[ib + iblock[j]],
-                                     TX_4X4);
-        }
-      } else /* 8x8 */ {
-        if (otherrd) {
-          for (j = 0; j < 4; j += 2) {
-            BLOCKD *bd3 = &xd->block[ib + iblock[j]];
-            BLOCK *be3 = &x->block[ib + iblock[j]];
-            x->vp9_short_fdct8x4(be3->src_diff, be3->coeff, 32);
-            x->quantize_b_4x4_pair(be3, be3 + 1, bd3, bd3 + 1);
-            thisdistortion = vp9_block_error_c(be3->coeff, bd3->dqcoeff, 32);
-            otherdist += thisdistortion;
-            othercost += cost_coeffs(x, bd3, PLANE_TYPE_Y_WITH_DC,
-                                     tacp + vp9_block2above[ib + iblock[j]],
-                                     tlcp + vp9_block2left[ib + iblock[j]],
-                                     TX_4X4);
-            othercost += cost_coeffs(x, bd3 + 1, PLANE_TYPE_Y_WITH_DC,
-                                     tacp + vp9_block2above[ib + iblock[j] + 1],
-                                     tlcp + vp9_block2left[ib + iblock[j]],
-                                     TX_4X4);
-          }
-        }
-        x->vp9_short_fdct8x8(be->src_diff, be2->coeff, 32);
-        x->quantize_b_8x8(be2, bd2);
-        thisdistortion = vp9_block_error_c(be2->coeff, bd2->dqcoeff, 64);
-        *distortion += thisdistortion;
-        *labelyrate += cost_coeffs(x, bd2, PLANE_TYPE_Y_WITH_DC,
-                                   ta + vp9_block2above_8x8[idx],
-                                   tl + vp9_block2left_8x8[idx], TX_8X8);
-      }
-    }
-  }
-  *distortion >>= 2;
-  if (otherrd) {
-    otherdist >>= 2;
-    *otherrd = RDCOST(x->rdmult, x->rddiv, othercost, otherdist);
-  }
-  return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
-}
-
-static const unsigned int segmentation_to_sseshift[4] = {3, 3, 2, 0};
-
-
-typedef struct {
-  int_mv *ref_mv, *second_ref_mv;
-  int_mv mvp;
-
-  int64_t segment_rd;
-  SPLITMV_PARTITIONING_TYPE segment_num;
-  TX_SIZE txfm_size;
-  int r;
-  int d;
-  int segment_yrate;
-  B_PREDICTION_MODE modes[16];
-  int_mv mvs[16], second_mvs[16];
-  int eobs[16];
-
-  int mvthresh;
-  int *mdcounts;
-
-  int_mv sv_mvp[4];  // save 4 mvp from 8x8
-  int sv_istep[2];   // save 2 initial step_param for 16x8/8x16
-
-} BEST_SEG_INFO;
-
-static __inline
-int mv_check_bounds(MACROBLOCK *x, int_mv *mv) {
-  int r = 0;
-  r |= (mv->as_mv.row >> 3) < x->mv_row_min;
-  r |= (mv->as_mv.row >> 3) > x->mv_row_max;
-  r |= (mv->as_mv.col >> 3) < x->mv_col_min;
-  r |= (mv->as_mv.col >> 3) > x->mv_col_max;
-  return r;
-}
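-
-/* MVs here are stored in 1/8-pel units, so the >> 3 in mv_check_bounds()
- * converts to whole pixels before comparing against the search window;
- * for example, as_mv.row = -520 corresponds to pixel row -65. */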
-
-static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
-                                    BEST_SEG_INFO *bsi,
-                                    SPLITMV_PARTITIONING_TYPE segmentation,
-                                    TX_SIZE tx_size, int64_t *otherrds,
-                                    int64_t *rds, int *completed,
-                                    /* 16 = n_blocks */
-                                    int_mv seg_mvs[16 /* n_blocks */]
-                                                  [MAX_REF_FRAMES - 1]) {
-  int i, j;
-  int const *labels;
-  int br = 0, bd = 0;
-  B_PREDICTION_MODE this_mode;
-  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-
-  int label_count;
-  int64_t this_segment_rd = 0, other_segment_rd;
-  int label_mv_thresh;
-  int rate = 0;
-  int sbr = 0, sbd = 0;
-  int segmentyrate = 0;
-  int best_eobs[16] = { 0 };
-
-  vp9_variance_fn_ptr_t *v_fn_ptr;
-
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta, *tl;
-  ENTROPY_CONTEXT_PLANES t_above_b, t_left_b;
-  ENTROPY_CONTEXT *ta_b, *tl_b;
-
-  vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
-  ta_b = (ENTROPY_CONTEXT *)&t_above_b;
-  tl_b = (ENTROPY_CONTEXT *)&t_left_b;
-
-  v_fn_ptr = &cpi->fn_ptr[segmentation];
-  labels = vp9_mbsplits[segmentation];
-  label_count = vp9_mbsplit_count[segmentation];
-
-  // A multiplier of 64 would make this threshold so large that MVs on
-  // segments are almost never checked; the multiplier of 1 used here
-  // makes the segment MV threshold roughly equal to the macroblock one.
-  label_mv_thresh = 1 * bsi->mvthresh / label_count;
-
-  // Segmentation method overheads
-  rate = cost_token(vp9_mbsplit_tree, vp9_mbsplit_probs,
-                    vp9_mbsplit_encodings + segmentation);
-  rate += vp9_cost_mv_ref(cpi, SPLITMV, bsi->mdcounts);
-  this_segment_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
-  br += rate;
-  other_segment_rd = this_segment_rd;
-
-  mbmi->txfm_size = tx_size;
-  for (i = 0; i < label_count && this_segment_rd < bsi->segment_rd; i++) {
-    int_mv mode_mv[B_MODE_COUNT], second_mode_mv[B_MODE_COUNT];
-    int64_t best_label_rd = INT64_MAX, best_other_rd = INT64_MAX;
-    B_PREDICTION_MODE mode_selected = ZERO4X4;
-    int bestlabelyrate = 0;
-
-    // search for the best motion vector on this segment
-    for (this_mode = LEFT4X4; this_mode <= NEW4X4; this_mode++) {
-      int64_t this_rd, other_rd;
-      int distortion;
-      int labelyrate;
-      ENTROPY_CONTEXT_PLANES t_above_s, t_left_s;
-      ENTROPY_CONTEXT *ta_s;
-      ENTROPY_CONTEXT *tl_s;
-
-      vpx_memcpy(&t_above_s, &t_above, sizeof(ENTROPY_CONTEXT_PLANES));
-      vpx_memcpy(&t_left_s, &t_left, sizeof(ENTROPY_CONTEXT_PLANES));
-
-      ta_s = (ENTROPY_CONTEXT *)&t_above_s;
-      tl_s = (ENTROPY_CONTEXT *)&t_left_s;
-
-      // motion search for newmv (single predictor case only)
-      if (!mbmi->second_ref_frame && this_mode == NEW4X4) {
-        int sseshift, n;
-        int step_param = 0;
-        int further_steps;
-        int thissme, bestsme = INT_MAX;
-        BLOCK *c;
-        BLOCKD *e;
-
-        /* Is the best so far sufficiently good that we can't justify
-         * doing a new motion search? */
-        if (best_label_rd < label_mv_thresh)
-          break;
-
-        if (cpi->compressor_speed) {
-          if (segmentation == PARTITIONING_8X16 ||
-              segmentation == PARTITIONING_16X8) {
-            bsi->mvp.as_int = bsi->sv_mvp[i].as_int;
-            if (i == 1 && segmentation == PARTITIONING_16X8)
-              bsi->mvp.as_int = bsi->sv_mvp[2].as_int;
-
-            step_param = bsi->sv_istep[i];
-          }
-
-          // use previous block's result as next block's MV predictor.
-          if (segmentation == PARTITIONING_4X4 && i > 0) {
-            bsi->mvp.as_int = x->e_mbd.block[i - 1].bmi.as_mv.first.as_int;
-            if (i == 4 || i == 8 || i == 12)
-              bsi->mvp.as_int = x->e_mbd.block[i - 4].bmi.as_mv.first.as_int;
-            step_param = 2;
-          }
-        }
-
-        further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
-
-        {
-          int sadpb = x->sadperbit4;
-          int_mv mvp_full;
-
-          mvp_full.as_mv.row = bsi->mvp.as_mv.row >> 3;
-          mvp_full.as_mv.col = bsi->mvp.as_mv.col >> 3;
-
-          // find first label
-          n = vp9_mbsplit_offset[segmentation][i];
-
-          c = &x->block[n];
-          e = &x->e_mbd.block[n];
-
-          bestsme = vp9_full_pixel_diamond(cpi, x, c, e, &mvp_full, step_param,
-                                           sadpb, further_steps, 0, v_fn_ptr,
-                                           bsi->ref_mv, &mode_mv[NEW4X4]);
-
-          sseshift = segmentation_to_sseshift[segmentation];
-
-          // Should we do a full search (best quality only)
-          if ((cpi->compressor_speed == 0) && (bestsme >> sseshift) > 4000) {
-            /* Check if mvp_full is within the range. */
-            clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
-                     x->mv_row_min, x->mv_row_max);
-
-            thissme = cpi->full_search_sad(x, c, e, &mvp_full,
-                                           sadpb, 16, v_fn_ptr,
-                                           XMVCOST, bsi->ref_mv);
-
-            if (thissme < bestsme) {
-              bestsme = thissme;
-              mode_mv[NEW4X4].as_int = e->bmi.as_mv.first.as_int;
-            } else {
-              /* The full search result is actually worse so re-instate the
-               * previous best vector */
-              e->bmi.as_mv.first.as_int = mode_mv[NEW4X4].as_int;
-            }
-          }
-        }
-
-        if (bestsme < INT_MAX) {
-          int distortion;
-          unsigned int sse;
-          cpi->find_fractional_mv_step(x, c, e, &mode_mv[NEW4X4],
-                                       bsi->ref_mv, x->errorperbit, v_fn_ptr,
-                                       XMVCOST, &distortion, &sse);
-
-          // save the motion search result for use in compound prediction
-          seg_mvs[i][mbmi->ref_frame - 1].as_int = mode_mv[NEW4X4].as_int;
-        }
-      } /* NEW4X4 */
-      else if (mbmi->second_ref_frame && this_mode == NEW4X4) {
-        /* motion search not completed? Then skip newmv for this block with
-         * comppred */
-        if (seg_mvs[i][mbmi->second_ref_frame - 1].as_int == INVALID_MV ||
-            seg_mvs[i][mbmi->ref_frame        - 1].as_int == INVALID_MV) {
-          continue;
-        }
-      }
-
-      rate = labels2mode(x, labels, i, this_mode, &mode_mv[this_mode],
-                         &second_mode_mv[this_mode], seg_mvs[i],
-                         bsi->ref_mv, bsi->second_ref_mv, XMVCOST);
-
-      // Trap vectors that reach beyond the UMV borders
-      if (mv_check_bounds(x, &mode_mv[this_mode]))
-        continue;
-      if (mbmi->second_ref_frame &&
-          mv_check_bounds(x, &second_mode_mv[this_mode]))
-        continue;
-
-      if (segmentation == PARTITIONING_4X4) {
-        this_rd = encode_inter_mb_segment(x, labels, i, &labelyrate,
-                                          &distortion,
-                                          ta_s, tl_s, IF_RTCD(&cpi->rtcd));
-        other_rd = this_rd;
-      } else {
-        this_rd = encode_inter_mb_segment_8x8(x, labels, i, &labelyrate,
-                                              &distortion, &other_rd,
-                                              ta_s, tl_s, IF_RTCD(&cpi->rtcd));
-      }
-      this_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
-      rate += labelyrate;
-
-      if (this_rd < best_label_rd) {
-        sbr = rate;
-        sbd = distortion;
-        bestlabelyrate = labelyrate;
-        mode_selected = this_mode;
-        best_label_rd = this_rd;
-        if (x->e_mbd.mode_info_context->mbmi.txfm_size == TX_4X4) {
-          for (j = 0; j < 16; j++)
-            if (labels[j] == i)
-              best_eobs[j] = x->e_mbd.block[j].eob;
-        } else {
-          for (j = 0; j < 4; j++) {
-            int ib = vp9_i8x8_block[j], idx = j * 4;
-
-            if (labels[ib] == i)
-              best_eobs[idx] = x->e_mbd.block[idx].eob;
-          }
-        }
-        if (other_rd < best_other_rd)
-          best_other_rd = other_rd;
-
-        vpx_memcpy(ta_b, ta_s, sizeof(ENTROPY_CONTEXT_PLANES));
-        vpx_memcpy(tl_b, tl_s, sizeof(ENTROPY_CONTEXT_PLANES));
-
-      }
-    } /*for each 4x4 mode*/
-
-    vpx_memcpy(ta, ta_b, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memcpy(tl, tl_b, sizeof(ENTROPY_CONTEXT_PLANES));
-
-    labels2mode(x, labels, i, mode_selected, &mode_mv[mode_selected],
-                &second_mode_mv[mode_selected], seg_mvs[i],
-                bsi->ref_mv, bsi->second_ref_mv, XMVCOST);
-
-    br += sbr;
-    bd += sbd;
-    segmentyrate += bestlabelyrate;
-    this_segment_rd += best_label_rd;
-    other_segment_rd += best_other_rd;
-    if (rds)
-      rds[i] = this_segment_rd;
-    if (otherrds)
-      otherrds[i] = other_segment_rd;
-  } /* for each label */
-
-  if (this_segment_rd < bsi->segment_rd) {
-    bsi->r = br;
-    bsi->d = bd;
-    bsi->segment_yrate = segmentyrate;
-    bsi->segment_rd = this_segment_rd;
-    bsi->segment_num = segmentation;
-    bsi->txfm_size = mbmi->txfm_size;
-
-    // Store everything needed to restore this state later.
-    for (i = 0; i < 16; i++) {
-      bsi->mvs[i].as_mv = x->partition_info->bmi[i].mv.as_mv;
-      if (mbmi->second_ref_frame)
-        bsi->second_mvs[i].as_mv = x->partition_info->bmi[i].second_mv.as_mv;
-      bsi->modes[i] = x->partition_info->bmi[i].mode;
-      bsi->eobs[i] = best_eobs[i];
-    }
-  }
-
-  if (completed) {
-    *completed = i;
-  }
-}
-
-static void rd_check_segment(VP9_COMP *cpi, MACROBLOCK *x,
-                             BEST_SEG_INFO *bsi,
-                             unsigned int segmentation,
-                             /* 16 = n_blocks */
-                             int_mv seg_mvs[16][MAX_REF_FRAMES - 1],
-                             int64_t txfm_cache[NB_TXFM_MODES]) {
-  int i, n, c = vp9_mbsplit_count[segmentation];
-
-  if (segmentation == PARTITIONING_4X4) {
-    int64_t rd[16];
-
-    rd_check_segment_txsize(cpi, x, bsi, segmentation, TX_4X4, NULL,
-                            rd, &n, seg_mvs);
-    if (n == c) {
-      for (i = 0; i < NB_TXFM_MODES; i++) {
-        if (rd[c - 1] < txfm_cache[i])
-          txfm_cache[i] = rd[c - 1];
-      }
-    }
-  } else {
-    int64_t diff, base_rd;
-    int cost4x4 = vp9_cost_bit(cpi->common.prob_tx[0], 0);
-    int cost8x8 = vp9_cost_bit(cpi->common.prob_tx[0], 1);
-
-    if (cpi->common.txfm_mode == TX_MODE_SELECT) {
-      int64_t rd4x4[4], rd8x8[4];
-      int n4x4, n8x8, nmin;
-      BEST_SEG_INFO bsi4x4, bsi8x8;
-
-      /* factor in cost of cost4x4/8x8 in decision */
-      vpx_memcpy(&bsi4x4, bsi, sizeof(*bsi));
-      vpx_memcpy(&bsi8x8, bsi, sizeof(*bsi));
-      rd_check_segment_txsize(cpi, x, &bsi4x4, segmentation,
-                              TX_4X4, NULL, rd4x4, &n4x4, seg_mvs);
-      rd_check_segment_txsize(cpi, x, &bsi8x8, segmentation,
-                              TX_8X8, NULL, rd8x8, &n8x8, seg_mvs);
-      if (bsi4x4.segment_num == segmentation) {
-        bsi4x4.segment_rd += RDCOST(x->rdmult, x->rddiv, cost4x4, 0);
-        if (bsi4x4.segment_rd < bsi->segment_rd)
-          vpx_memcpy(bsi, &bsi4x4, sizeof(*bsi));
-      }
-      if (bsi8x8.segment_num == segmentation) {
-        bsi8x8.segment_rd += RDCOST(x->rdmult, x->rddiv, cost8x8, 0);
-        if (bsi8x8.segment_rd < bsi->segment_rd)
-          vpx_memcpy(bsi, &bsi8x8, sizeof(*bsi));
-      }
-      n = n4x4 > n8x8 ? n4x4 : n8x8;
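-      // Reduce both runs to a common scale: base_rd estimates the full
-      // 4x4 cost, and diff is the 8x8-minus-4x4 gap measured over the
-      // labels that both runs completed.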
-      if (n == c) {
-        nmin = n4x4 < n8x8 ? n4x4 : n8x8;
-        diff = rd8x8[nmin - 1] - rd4x4[nmin - 1];
-        if (n == n4x4) {
-          base_rd = rd4x4[c - 1];
-        } else {
-          base_rd = rd8x8[c - 1] - diff;
-        }
-      }
-    } else {
-      int64_t rd[4], otherrd[4];
-
-      if (cpi->common.txfm_mode == ONLY_4X4) {
-        rd_check_segment_txsize(cpi, x, bsi, segmentation, TX_4X4, otherrd,
-                                rd, &n, seg_mvs);
-        if (n == c) {
-          base_rd = rd[c - 1];
-          diff = otherrd[c - 1] - rd[c - 1];
-        }
-      } else /* use 8x8 transform */ {
-        rd_check_segment_txsize(cpi, x, bsi, segmentation, TX_8X8, otherrd,
-                                rd, &n, seg_mvs);
-        if (n == c) {
-          diff = rd[c - 1] - otherrd[c - 1];
-          base_rd = otherrd[c - 1];
-        }
-      }
-    }
-
-    if (n == c) {
-      if (base_rd < txfm_cache[ONLY_4X4]) {
-        txfm_cache[ONLY_4X4] = base_rd;
-      }
-      if (base_rd + diff < txfm_cache[ALLOW_8X8]) {
-        txfm_cache[ALLOW_8X8] = txfm_cache[ALLOW_16X16] = base_rd + diff;
-      }
-      if (diff < 0) {
-        base_rd += diff + RDCOST(x->rdmult, x->rddiv, cost8x8, 0);
-      } else {
-        base_rd += RDCOST(x->rdmult, x->rddiv, cost4x4, 0);
-      }
-      if (base_rd < txfm_cache[TX_MODE_SELECT]) {
-        txfm_cache[TX_MODE_SELECT] = base_rd;
-      }
-    }
-  }
-}
-
-static __inline void cal_step_param(int sr, int *sp) {
-  int step = 0;
-
-  if (sr > MAX_FIRST_STEP) sr = MAX_FIRST_STEP;
-  else if (sr < 1) sr = 1;
-
-  while (sr >>= 1)
-    step++;
-
-  *sp = MAX_MVSEARCH_STEPS - 1 - step;
-}
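-
-/* Example: assuming MAX_FIRST_STEP >= 8, sr = 8 gives step = 3 in
- * cal_step_param() and *sp = MAX_MVSEARCH_STEPS - 4, so a larger search
- * range starts the diamond search at a coarser step. */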
-
-static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
-                                       int_mv *best_ref_mv,
-                                       int_mv *second_best_ref_mv,
-                                       int64_t best_rd,
-                                       int *mdcounts,
-                                       int *returntotrate,
-                                       int *returnyrate,
-                                       int *returndistortion,
-                                       int *skippable, int mvthresh,
-                                       int_mv seg_mvs[NB_PARTITIONINGS]
-                                                     [16 /* n_blocks */]
-                                                     [MAX_REF_FRAMES - 1],
-                                       int64_t txfm_cache[NB_TXFM_MODES]) {
-  int i;
-  BEST_SEG_INFO bsi;
-  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-
-  vpx_memset(&bsi, 0, sizeof(bsi));
-  for (i = 0; i < NB_TXFM_MODES; i++)
-    txfm_cache[i] = INT64_MAX;
-
-  bsi.segment_rd = best_rd;
-  bsi.ref_mv = best_ref_mv;
-  bsi.second_ref_mv = second_best_ref_mv;
-  bsi.mvp.as_int = best_ref_mv->as_int;
-  bsi.mvthresh = mvthresh;
-  bsi.mdcounts = mdcounts;
-  bsi.txfm_size = TX_4X4;
-
-  for (i = 0; i < 16; i++)
-    bsi.modes[i] = ZERO4X4;
-
-  if (cpi->compressor_speed == 0) {
-    /* for now, we will keep the original segmentation order
-       when in best quality mode */
-    rd_check_segment(cpi, x, &bsi, PARTITIONING_16X8,
-                     seg_mvs[PARTITIONING_16X8], txfm_cache);
-    rd_check_segment(cpi, x, &bsi, PARTITIONING_8X16,
-                     seg_mvs[PARTITIONING_8X16], txfm_cache);
-    rd_check_segment(cpi, x, &bsi, PARTITIONING_8X8,
-                     seg_mvs[PARTITIONING_8X8], txfm_cache);
-    rd_check_segment(cpi, x, &bsi, PARTITIONING_4X4,
-                     seg_mvs[PARTITIONING_4X4], txfm_cache);
-  } else {
-    int sr;
-
-    rd_check_segment(cpi, x, &bsi, PARTITIONING_8X8,
-                     seg_mvs[PARTITIONING_8X8], txfm_cache);
-
-    if (bsi.segment_rd < best_rd) {
-      int tmp_col_min = x->mv_col_min;
-      int tmp_col_max = x->mv_col_max;
-      int tmp_row_min = x->mv_row_min;
-      int tmp_row_max = x->mv_row_max;
-
-      vp9_clamp_mv_min_max(x, best_ref_mv);
-
-      /* Get 8x8 result */
-      bsi.sv_mvp[0].as_int = bsi.mvs[0].as_int;
-      bsi.sv_mvp[1].as_int = bsi.mvs[2].as_int;
-      bsi.sv_mvp[2].as_int = bsi.mvs[8].as_int;
-      bsi.sv_mvp[3].as_int = bsi.mvs[10].as_int;
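-      /* Blocks 0, 2, 8 and 10 are the top-left 4x4 units of the four
-       * 8x8 quadrants, so these four MVs summarize the 8x8 result. */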
-
-      /* Use the 8x8 result as the 16x8/8x16 predictor MV, and adjust the
-       * search range according to how close the two MVs are. */
-      /* block 8X16 */
-      sr = MAXF((abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[2].as_mv.row)) >> 3,
-                (abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[2].as_mv.col)) >> 3);
-      cal_step_param(sr, &bsi.sv_istep[0]);
-
-      sr = MAXF((abs(bsi.sv_mvp[1].as_mv.row - bsi.sv_mvp[3].as_mv.row)) >> 3,
-                (abs(bsi.sv_mvp[1].as_mv.col - bsi.sv_mvp[3].as_mv.col)) >> 3);
-      cal_step_param(sr, &bsi.sv_istep[1]);
-
-      rd_check_segment(cpi, x, &bsi, PARTITIONING_8X16,
-                       seg_mvs[PARTITIONING_8X16], txfm_cache);
-
-      /* block 16X8 */
-      sr = MAXF((abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[1].as_mv.row)) >> 3,
-                (abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[1].as_mv.col)) >> 3);
-      cal_step_param(sr, &bsi.sv_istep[0]);
-
-      sr = MAXF((abs(bsi.sv_mvp[2].as_mv.row - bsi.sv_mvp[3].as_mv.row)) >> 3,
-                (abs(bsi.sv_mvp[2].as_mv.col - bsi.sv_mvp[3].as_mv.col)) >> 3);
-      cal_step_param(sr, &bsi.sv_istep[1]);
-
-      rd_check_segment(cpi, x, &bsi, PARTITIONING_16X8,
-                       seg_mvs[PARTITIONING_16X8], txfm_cache);
-
-      /* If 8x8 is better than 16x8/8x16, then do the 4x4 search. Do not
-       * skip 4x4 when speed=0 (good quality). */
-      if (cpi->sf.no_skip_block4x4_search ||
-          bsi.segment_num == PARTITIONING_8X8) {
-        /* || (sv_segment_rd8x8 - bsi.segment_rd) < sv_segment_rd8x8 >> 5 */
-        bsi.mvp.as_int = bsi.sv_mvp[0].as_int;
-        rd_check_segment(cpi, x, &bsi, PARTITIONING_4X4,
-                         seg_mvs[PARTITIONING_4X4], txfm_cache);
-      }
-
-      /* restore UMV window */
-      x->mv_col_min = tmp_col_min;
-      x->mv_col_max = tmp_col_max;
-      x->mv_row_min = tmp_row_min;
-      x->mv_row_max = tmp_row_max;
-    }
-  }
-
-  /* set it to the best */
-  for (i = 0; i < 16; i++) {
-    BLOCKD *bd = &x->e_mbd.block[i];
-
-    bd->bmi.as_mv.first.as_int = bsi.mvs[i].as_int;
-    if (mbmi->second_ref_frame)
-      bd->bmi.as_mv.second.as_int = bsi.second_mvs[i].as_int;
-    bd->eob = bsi.eobs[i];
-  }
-
-  *returntotrate = bsi.r;
-  *returndistortion = bsi.d;
-  *returnyrate = bsi.segment_yrate;
-  *skippable = bsi.txfm_size == TX_4X4 ?
-                    vp9_mby_is_skippable_4x4(&x->e_mbd, 0) :
-                    vp9_mby_is_skippable_8x8(&x->e_mbd, 0);
-
-  /* save partitions */
-  mbmi->txfm_size = bsi.txfm_size;
-  mbmi->partitioning = bsi.segment_num;
-  x->partition_info->count = vp9_mbsplit_count[bsi.segment_num];
-
-  for (i = 0; i < x->partition_info->count; i++) {
-    int j;
-
-    j = vp9_mbsplit_offset[bsi.segment_num][i];
-
-    x->partition_info->bmi[i].mode = bsi.modes[j];
-    x->partition_info->bmi[i].mv.as_mv = bsi.mvs[j].as_mv;
-    if (mbmi->second_ref_frame)
-      x->partition_info->bmi[i].second_mv.as_mv = bsi.second_mvs[j].as_mv;
-  }
-  /* used to set mbmi->mv.as_int */
-  x->partition_info->bmi[15].mv.as_int = bsi.mvs[15].as_int;
-  if (mbmi->second_ref_frame)
-    x->partition_info->bmi[15].second_mv.as_int = bsi.second_mvs[15].as_int;
-
-  return bsi.segment_rd;
-}
-
-/* Sort arr in increasing order */
-static void insertsortmv(int arr[], int len) {
-  int i, j, k;
-
-  for (i = 1; i <= len - 1; i++) {
-    for (j = 0; j < i; j++) {
-      if (arr[j] > arr[i]) {
-        int temp;
-
-        temp = arr[i];
-
-        for (k = i; k > j; k--)
-          arr[k] = arr[k - 1];
-
-        arr[j] = temp;
-      }
-    }
-  }
-}
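-
-/* Illustrative example (assumed values): insertsortmv({4, 1, 3}, 3)
- * leaves the array as {1, 3, 4}. With at most 8 candidates, the O(len^2)
- * cost of insertion sort is negligible. */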
-
-static void insertsortsad(int arr[], int idx[], int len) {
-  int i, j, k;
-
-  for (i = 1; i <= len - 1; i++) {
-    for (j = 0; j < i; j++) {
-      if (arr[j] > arr[i]) {
-        int temp, tempi;
-
-        temp = arr[i];
-        tempi = idx[i];
-
-        for (k = i; k > j; k--) {
-          arr[k] = arr[k - 1];
-          idx[k] = idx[k - 1];
-        }
-
-        arr[j] = temp;
-        idx[j] = tempi;
-      }
-    }
-  }
-}
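-
-/* Illustrative trace (assumed values): with arr = {7, 3, 5} and
- * idx = {0, 1, 2}, insertsortsad(arr, idx, 3) yields arr = {3, 5, 7} and
- * idx = {1, 2, 0}, i.e. idx[k] records which original slot held the k-th
- * smallest value. */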
-
-// The improved MV prediction
-void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCKD *xd, const MODE_INFO *here,
-                 int_mv *mvp, int refframe, int *ref_frame_sign_bias,
-                 int *sr, int near_sadidx[]) {
-  const MODE_INFO *above = here - xd->mode_info_stride;
-  const MODE_INFO *left = here - 1;
-  const MODE_INFO *aboveleft = above - 1;
-  int_mv           near_mvs[8];
-  int              near_ref[8];
-  int_mv           mv;
-  int              vcnt = 0;
-  int              find = 0;
-  int              mb_offset;
-
-  int              mvx[8];
-  int              mvy[8];
-  int              i;
-
-  mv.as_int = 0;
-
-  if (here->mbmi.ref_frame != INTRA_FRAME) {
-    near_mvs[0].as_int = near_mvs[1].as_int = near_mvs[2].as_int =
-        near_mvs[3].as_int = near_mvs[4].as_int = near_mvs[5].as_int =
-        near_mvs[6].as_int = near_mvs[7].as_int = 0;
-    near_ref[0] = near_ref[1] = near_ref[2] = near_ref[3] = near_ref[4] =
-        near_ref[5] = near_ref[6] = near_ref[7] = 0;
-
-    // Read in the MVs of 3 nearby blocks in the current frame as
-    // prediction candidates.
-    if (above->mbmi.ref_frame != INTRA_FRAME) {
-      near_mvs[vcnt].as_int = above->mbmi.mv[0].as_int;
-      mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame], refframe,
-              &near_mvs[vcnt], ref_frame_sign_bias);
-      near_ref[vcnt] = above->mbmi.ref_frame;
-    }
-    vcnt++;
-    if (left->mbmi.ref_frame != INTRA_FRAME) {
-      near_mvs[vcnt].as_int = left->mbmi.mv[0].as_int;
-      mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame], refframe,
-              &near_mvs[vcnt], ref_frame_sign_bias);
-      near_ref[vcnt] = left->mbmi.ref_frame;
-    }
-    vcnt++;
-    if (aboveleft->mbmi.ref_frame != INTRA_FRAME) {
-      near_mvs[vcnt].as_int = aboveleft->mbmi.mv[0].as_int;
-      mv_bias(ref_frame_sign_bias[aboveleft->mbmi.ref_frame], refframe,
-              &near_mvs[vcnt], ref_frame_sign_bias);
-      near_ref[vcnt] = aboveleft->mbmi.ref_frame;
-    }
-    vcnt++;
-
-    // Read in the MVs of 5 nearby blocks in the last frame.
-    if (cpi->common.last_frame_type != KEY_FRAME) {
-      mb_offset = (-xd->mb_to_top_edge / 128 + 1) * (xd->mode_info_stride + 1)
-                  + (-xd->mb_to_left_edge / 128 + 1);
-
-      // current in last frame
-      if (cpi->lf_ref_frame[mb_offset] != INTRA_FRAME) {
-        near_mvs[vcnt].as_int = cpi->lfmv[mb_offset].as_int;
-        mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset], refframe,
-                &near_mvs[vcnt], ref_frame_sign_bias);
-        near_ref[vcnt] = cpi->lf_ref_frame[mb_offset];
-      }
-      vcnt++;
-
-      // above in last frame
-      if (cpi->lf_ref_frame[mb_offset - xd->mode_info_stride - 1] !=
-          INTRA_FRAME) {
-        near_mvs[vcnt].as_int =
-            cpi->lfmv[mb_offset - xd->mode_info_stride - 1].as_int;
-        mv_bias(
-            cpi->lf_ref_frame_sign_bias[mb_offset - xd->mode_info_stride - 1],
-            refframe, &near_mvs[vcnt], ref_frame_sign_bias);
-        near_ref[vcnt] =
-            cpi->lf_ref_frame[mb_offset - xd->mode_info_stride - 1];
-      }
-      vcnt++;
-
-      // left in last frame
-      if (cpi->lf_ref_frame[mb_offset - 1] != INTRA_FRAME) {
-        near_mvs[vcnt].as_int = cpi->lfmv[mb_offset - 1].as_int;
-        mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset - 1], refframe,
-                &near_mvs[vcnt], ref_frame_sign_bias);
-        near_ref[vcnt] = cpi->lf_ref_frame[mb_offset - 1];
-      }
-      vcnt++;
-
-      // right in last frame
-      if (cpi->lf_ref_frame[mb_offset + 1] != INTRA_FRAME) {
-        near_mvs[vcnt].as_int = cpi->lfmv[mb_offset + 1].as_int;
-        mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset + 1], refframe,
-                &near_mvs[vcnt], ref_frame_sign_bias);
-        near_ref[vcnt] = cpi->lf_ref_frame[mb_offset + 1];
-      }
-      vcnt++;
-
-      // below in last frame
-      if (cpi->lf_ref_frame[mb_offset + xd->mode_info_stride + 1] !=
-          INTRA_FRAME) {
-        near_mvs[vcnt].as_int =
-            cpi->lfmv[mb_offset + xd->mode_info_stride + 1].as_int;
-        mv_bias(
-            cpi->lf_ref_frame_sign_bias[mb_offset + xd->mode_info_stride + 1],
-            refframe, &near_mvs[vcnt], ref_frame_sign_bias);
-        near_ref[vcnt] =
-            cpi->lf_ref_frame[mb_offset + xd->mode_info_stride + 1];
-      }
-      vcnt++;
-    }
-
-    for (i = 0; i < vcnt; i++) {
-      if (near_ref[near_sadidx[i]] != INTRA_FRAME) {
-        if (here->mbmi.ref_frame == near_ref[near_sadidx[i]]) {
-          mv.as_int = near_mvs[near_sadidx[i]].as_int;
-          find = 1;
-          if (i < 3)
-            *sr = 3;
-          else
-            *sr = 2;
-          break;
-        }
-      }
-    }
-
-    if (!find) {
-      for (i = 0; i < vcnt; i++) {
-        mvx[i] = near_mvs[i].as_mv.row;
-        mvy[i] = near_mvs[i].as_mv.col;
-      }
-
-      insertsortmv(mvx, vcnt);
-      insertsortmv(mvy, vcnt);
-      mv.as_mv.row = mvx[vcnt / 2];
-      mv.as_mv.col = mvy[vcnt / 2];
-
-      find = 1;
-      // sr is set to 0 to allow the calling function to decide the
-      // search range.
-      *sr = 0;
-    }
-  }
-
-  /* Set up return values */
-  mvp->as_int = mv.as_int;
-  clamp_mv2(mvp, xd);
-}
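-
-/* Candidate slots used above (filled in the same order as cal_sad()
- * below ranks them): 0-2 are the above/left/above-left neighbors in the
- * current frame; 3-7 are the co-located/above/left/right/below blocks in
- * the last frame. The loop takes the first candidate, in increasing-SAD
- * order, whose reference frame matches; failing that, the component-wise
- * median of all candidates is used and sr is left to the caller. */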
-
-static void cal_sad(VP9_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x,
-                    int recon_yoffset, int near_sadidx[],
-                    enum BlockSize block_size) {
-  /* Neighbor SAD slots: 0-2 = above/left/above-left in the current frame
-   * (cf); 3-7 = co-located/above/left/right/below in the last frame
-   * (lf). */
-  int near_sad[8] = {0};
-  BLOCK *b = &x->block[0];
-  unsigned char *src_y_ptr = *(b->base_src);
-  const unsigned char *dst_y_ptr = xd->dst.y_buffer;
-  const int bs = (block_size == BLOCK_16X16) ? 16 : 32;
-  const int dst_y_str = xd->dst.y_stride;
-
-  // calculate sad for current frame 3 nearby MBs.
-  if (xd->mb_to_top_edge == 0 && xd->mb_to_left_edge == 0) {
-    near_sad[0] = near_sad[1] = near_sad[2] = INT_MAX;
-  } else if (xd->mb_to_top_edge == 0) {
-    // only has left MB for sad calculation.
-    near_sad[0] = near_sad[2] = INT_MAX;
-    near_sad[1] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
-                                              dst_y_ptr - bs,
-                                              dst_y_str, 0x7fffffff);
-  } else if (xd->mb_to_left_edge == 0) {
-    // only has above MB for sad calculation.
-    near_sad[1] = near_sad[2] = INT_MAX;
-    near_sad[0] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
-                                              dst_y_ptr - dst_y_str * bs,
-                                              dst_y_str, 0x7fffffff);
-  } else {
-    near_sad[0] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
-                                              dst_y_ptr - dst_y_str * bs,
-                                              dst_y_str, 0x7fffffff);
-    near_sad[1] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
-                                              dst_y_ptr - bs,
-                                              dst_y_str, 0x7fffffff);
-    near_sad[2] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
-                                              dst_y_ptr - dst_y_str * bs - bs,
-                                              dst_y_str, 0x7fffffff);
-  }
-
-  if (cpi->common.last_frame_type != KEY_FRAME) {
-    // calculate sad for last frame 5 nearby MBs.
-    unsigned char *pre_y_buffer =
-        cpi->common.yv12_fb[cpi->common.lst_fb_idx].y_buffer + recon_yoffset;
-    const int pre_y_str = cpi->common.yv12_fb[cpi->common.lst_fb_idx].y_stride;
-
-    if (xd->mb_to_top_edge == 0) near_sad[4] = INT_MAX;
-    if (xd->mb_to_left_edge == 0) near_sad[5] = INT_MAX;
-    if (xd->mb_to_right_edge == 0) near_sad[6] = INT_MAX;
-    if (xd->mb_to_bottom_edge == 0) near_sad[7] = INT_MAX;
-
-    near_sad[3] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
-                                              pre_y_buffer,
-                                              pre_y_str, 0x7fffffff);
-    if (near_sad[4] != INT_MAX)
-      near_sad[4] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
-                                                pre_y_buffer - pre_y_str * bs,
-                                                pre_y_str, 0x7fffffff);
-    if (near_sad[5] != INT_MAX)
-      near_sad[5] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
-                                                pre_y_buffer - bs,
-                                                pre_y_str, 0x7fffffff);
-    if (near_sad[6] != INT_MAX)
-      near_sad[6] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
-                                                pre_y_buffer + bs,
-                                                pre_y_str, 0x7fffffff);
-    if (near_sad[7] != INT_MAX)
-      near_sad[7] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
-                                                pre_y_buffer + pre_y_str * bs,
-                                                pre_y_str, 0x7fffffff);
-  }
-
-  if (cpi->common.last_frame_type != KEY_FRAME) {
-    insertsortsad(near_sad, near_sadidx, 8);
-  } else {
-    insertsortsad(near_sad, near_sadidx, 3);
-  }
-}
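-
-/* After the sort, near_sadidx[0] names the neighbor whose reconstruction
- * is closest to the current source block; vp9_mv_pred() above walks the
- * candidates in that order. When the last frame is a key frame, only the
- * three current-frame neighbors are ranked. */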
-
-static void set_i8x8_block_modes(MACROBLOCK *x, int modes[2][4]) {
-  int i;
-  MACROBLOCKD *xd = &x->e_mbd;
-  for (i = 0; i < 4; i++) {
-    int ib = vp9_i8x8_block[i];
-    xd->mode_info_context->bmi[ib + 0].as_mode.first = modes[0][i];
-    xd->mode_info_context->bmi[ib + 1].as_mode.first = modes[0][i];
-    xd->mode_info_context->bmi[ib + 4].as_mode.first = modes[0][i];
-    xd->mode_info_context->bmi[ib + 5].as_mode.first = modes[0][i];
-#if CONFIG_COMP_INTRA_PRED
-    xd->mode_info_context->bmi[ib + 0].as_mode.second = modes[1][i];
-    xd->mode_info_context->bmi[ib + 1].as_mode.second = modes[1][i];
-    xd->mode_info_context->bmi[ib + 4].as_mode.second = modes[1][i];
-    xd->mode_info_context->bmi[ib + 5].as_mode.second = modes[1][i];
-#endif
-    // printf("%d,%d,%d,%d %d,%d,%d,%d\n",
-    //       modes[0][0], modes[0][1], modes[0][2], modes[0][3],
-    //       modes[1][0], modes[1][1], modes[1][2], modes[1][3]);
-  }
-
-  for (i = 0; i < 16; i++) {
-    xd->block[i].bmi = xd->mode_info_context->bmi[i];
-  }
-}
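-
-/* The 16 4x4 blocks are numbered in raster order, four per row, so the
- * 8x8 block anchored at ib covers ib, ib + 1, ib + 4 and ib + 5; the
- * anchors in vp9_i8x8_block[] match the bmi[0]/bmi[2]/bmi[8]/bmi[10]
- * indices used by the I8X8_PRED code in vp9_rd_pick_inter_mode(). */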
-
-extern void vp9_calc_ref_probs(int *count, vp9_prob *probs);
-static void estimate_curframe_refprobs(VP9_COMP *cpi,
-                                       vp9_prob mod_refprobs[3],
-                                       int pred_ref) {
-  int norm_cnt[MAX_REF_FRAMES];
-  const int *const rfct = cpi->count_mb_ref_frame_usage;
-  int intra_count = rfct[INTRA_FRAME];
-  int last_count  = rfct[LAST_FRAME];
-  int gf_count    = rfct[GOLDEN_FRAME];
-  int arf_count   = rfct[ALTREF_FRAME];
-
-  // Work out modified reference frame probabilities to use where prediction
-  // of the reference frame fails
-  if (pred_ref == INTRA_FRAME) {
-    norm_cnt[0] = 0;
-    norm_cnt[1] = last_count;
-    norm_cnt[2] = gf_count;
-    norm_cnt[3] = arf_count;
-    vp9_calc_ref_probs(norm_cnt, mod_refprobs);
-    mod_refprobs[0] = 0;    // This branch implicit
-  } else if (pred_ref == LAST_FRAME) {
-    norm_cnt[0] = intra_count;
-    norm_cnt[1] = 0;
-    norm_cnt[2] = gf_count;
-    norm_cnt[3] = arf_count;
-    vp9_calc_ref_probs(norm_cnt, mod_refprobs);
-    mod_refprobs[1] = 0;    // This branch implicit
-  } else if (pred_ref == GOLDEN_FRAME) {
-    norm_cnt[0] = intra_count;
-    norm_cnt[1] = last_count;
-    norm_cnt[2] = 0;
-    norm_cnt[3] = arf_count;
-    vp9_calc_ref_probs(norm_cnt, mod_refprobs);
-    mod_refprobs[2] = 0;  // This branch implicit
-  } else {
-    norm_cnt[0] = intra_count;
-    norm_cnt[1] = last_count;
-    norm_cnt[2] = gf_count;
-    norm_cnt[3] = 0;
-    vp9_calc_ref_probs(norm_cnt, mod_refprobs);
-    mod_refprobs[2] = 0;  // This branch implicit
-  }
-}
-
-static __inline unsigned weighted_cost(vp9_prob *tab0, vp9_prob *tab1,
-                                       int idx, int val, int weight) {
-  unsigned cost0 = tab0[idx] ? vp9_cost_bit(tab0[idx], val) : 0;
-  unsigned cost1 = tab1[idx] ? vp9_cost_bit(tab1[idx], val) : 0;
-  // weight is 16-bit fixed point (0x10000 == 1.0), so this calculates
-  // weight * cost1 + (1.0 - weight) * cost0, rounded to the nearest
-  // integer by the 0.5 (0x8000) offset.
-  return (0x8000 + weight * cost1 + (0x10000 - weight) * cost0) >> 16;
-}
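-
-/* Worked example (assumed values): with weight = 0x4000 (0.25), cost0 =
- * 400 and cost1 = 800:
- *   (0x8000 + 0x4000 * 800 + 0xC000 * 400) >> 16
- *     = (32768 + 13107200 + 19660800) >> 16
- *     = 500
- * which is 0.25 * 800 + 0.75 * 400 rounded to the nearest integer. */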
-
-static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id,
-                                     unsigned int *ref_costs) {
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &cpi->mb.e_mbd;
-  vp9_prob *mod_refprobs;
-
-  unsigned int cost;
-  int pred_ref;
-  int pred_flag;
-  int pred_ctx;
-  int i;
-  int tot_count;
-
-  vp9_prob pred_prob, new_pred_prob;
-  int seg_ref_active;
-  int seg_ref_count = 0;
-  seg_ref_active = vp9_segfeature_active(xd,
-                                         segment_id,
-                                         SEG_LVL_REF_FRAME);
-
-  if (seg_ref_active) {
-    seg_ref_count = vp9_check_segref(xd, segment_id, INTRA_FRAME)  +
-                    vp9_check_segref(xd, segment_id, LAST_FRAME)   +
-                    vp9_check_segref(xd, segment_id, GOLDEN_FRAME) +
-                    vp9_check_segref(xd, segment_id, ALTREF_FRAME);
-  }
-
-  // Get the predicted reference for this mb
-  pred_ref = vp9_get_pred_ref(cm, xd);
-
-  // Get the context probability for the prediction flag (based on last frame)
-  pred_prob = vp9_get_pred_prob(cm, xd, PRED_REF);
-
-  // Predict probability for current frame based on stats so far
-  pred_ctx = vp9_get_pred_context(cm, xd, PRED_REF);
-  tot_count = cpi->ref_pred_count[pred_ctx][0] +
-              cpi->ref_pred_count[pred_ctx][1];
-  if (tot_count) {
-    new_pred_prob =
-      (cpi->ref_pred_count[pred_ctx][0] * 255 + (tot_count >> 1)) / tot_count;
-    new_pred_prob += !new_pred_prob;
-  } else
-    new_pred_prob = 128;
-
-  // Get the set of probabilities to use if prediction fails
-  mod_refprobs = cm->mod_refprobs[pred_ref];
-
-  // For each possible selected reference frame work out a cost.
-  for (i = 0; i < MAX_REF_FRAMES; i++) {
-    if (seg_ref_active && seg_ref_count == 1) {
-      cost = 0;
-    } else {
-      pred_flag = (i == pred_ref);
-
-      // Get the prediction for the current mb
-      cost = weighted_cost(&pred_prob, &new_pred_prob, 0,
-                           pred_flag, cpi->seg0_progress);
-      if (cost > 1024) cost = 768; // i.e. account for 4 bits max.
-
-      // for incorrectly predicted cases
-      if (!pred_flag) {
-        vp9_prob curframe_mod_refprobs[3];
-
-        if (cpi->seg0_progress) {
-          estimate_curframe_refprobs(cpi, curframe_mod_refprobs, pred_ref);
-        } else {
-          vpx_memset(curframe_mod_refprobs, 0, sizeof(curframe_mod_refprobs));
-        }
-
-        cost += weighted_cost(mod_refprobs, curframe_mod_refprobs, 0,
-                              (i != INTRA_FRAME), cpi->seg0_progress);
-        if (i != INTRA_FRAME) {
-          cost += weighted_cost(mod_refprobs, curframe_mod_refprobs, 1,
-                                (i != LAST_FRAME), cpi->seg0_progress);
-          if (i != LAST_FRAME) {
-            cost += weighted_cost(mod_refprobs, curframe_mod_refprobs, 2,
-                                  (i != GOLDEN_FRAME), cpi->seg0_progress);
-          }
-        }
-      }
-    }
-
-    ref_costs[i] = cost;
-  }
-}
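-
-/* The weight passed to weighted_cost() above, cpi->seg0_progress, blends
- * the last-frame probabilities (pred_prob, mod_refprobs) with the
- * probabilities implied by the counts gathered so far in the current
- * frame, so early macroblocks lean mostly on last-frame statistics. */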
-
-static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
-                                 int mode_index,
-                                 PARTITION_INFO *partition,
-                                 int_mv *ref_mv,
-                                 int_mv *second_ref_mv,
-                                 int single_pred_diff,
-                                 int comp_pred_diff,
-                                 int hybrid_pred_diff,
-                                 int64_t txfm_size_diff[NB_TXFM_MODES]) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
-
-  // Take a snapshot of the coding context so it can be
-  // restored if we decide to encode this way
-  ctx->best_mode_index = mode_index;
-  vpx_memcpy(&ctx->mic, xd->mode_info_context,
-             sizeof(MODE_INFO));
-  if (partition)
-    vpx_memcpy(&ctx->partition_info, partition,
-               sizeof(PARTITION_INFO));
-  ctx->best_ref_mv.as_int = ref_mv->as_int;
-  ctx->second_best_ref_mv.as_int = second_ref_mv->as_int;
-
-  // ctx[mb_index].rddiv = x->rddiv;
-  // ctx[mb_index].rdmult = x->rdmult;
-
-  ctx->single_pred_diff = single_pred_diff;
-  ctx->comp_pred_diff   = comp_pred_diff;
-  ctx->hybrid_pred_diff = hybrid_pred_diff;
-
-  if (txfm_size_diff) {
-    memcpy(ctx->txfm_rd_diff, txfm_size_diff, sizeof(ctx->txfm_rd_diff));
-  } else {
-    memset(ctx->txfm_rd_diff, 0, sizeof(ctx->txfm_rd_diff));
-  }
-}
-
-static void inter_mode_cost(VP9_COMP *cpi, MACROBLOCK *x, int this_mode,
-                            int *rate2, int *distortion2, int *rate_y,
-                            int *distortion, int* rate_uv, int *distortion_uv,
-                            int *skippable, int64_t txfm_cache[NB_TXFM_MODES]) {
-  int y_skippable, uv_skippable;
-
-  // Y cost and distortion
-  macro_block_yrd(cpi, x, rate_y, distortion, &y_skippable, txfm_cache);
-
-  *rate2 += *rate_y;
-  *distortion2 += *distortion;
-
-  // UV cost and distortion
-  if (x->e_mbd.mode_info_context->mbmi.txfm_size != TX_4X4)
-    rd_inter16x16_uv_8x8(cpi, x, rate_uv, distortion_uv,
-                         cpi->common.full_pixel, &uv_skippable);
-  else
-    rd_inter16x16_uv(cpi, x, rate_uv, distortion_uv, cpi->common.full_pixel,
-                     &uv_skippable);
-  *rate2 += *rate_uv;
-  *distortion2 += *distortion_uv;
-  *skippable = y_skippable && uv_skippable;
-}
-
-#define MIN(x,y) (((x)<(y))?(x):(y))
-#define MAX(x,y) (((x)>(y))?(x):(y))
-static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
-                               int idx, int frame_type,
-                               int recon_yoffset, int recon_uvoffset,
-                               int_mv frame_nearest_mv[4],
-                               int_mv frame_near_mv[4],
-                               int_mv frame_best_ref_mv[4],
-                               int frame_mdcounts[4][4],
-                               unsigned char *y_buffer[4],
-                               unsigned char *u_buffer[4],
-                               unsigned char *v_buffer[4]) {
-  YV12_BUFFER_CONFIG *yv12 = &cpi->common.yv12_fb[idx];
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
-
-  vp9_find_near_mvs(xd, xd->mode_info_context,
-                    xd->prev_mode_info_context,
-                    &frame_nearest_mv[frame_type], &frame_near_mv[frame_type],
-                    &frame_best_ref_mv[frame_type], frame_mdcounts[frame_type],
-                    frame_type, cpi->common.ref_frame_sign_bias);
-
-  y_buffer[frame_type] = yv12->y_buffer + recon_yoffset;
-  u_buffer[frame_type] = yv12->u_buffer + recon_uvoffset;
-  v_buffer[frame_type] = yv12->v_buffer + recon_uvoffset;
-
-#if CONFIG_NEWBESTREFMV
-  vp9_find_mv_refs(xd, xd->mode_info_context,
-                   xd->prev_mode_info_context,
-                   frame_type,
-                   mbmi->ref_mvs[frame_type],
-                   cpi->common.ref_frame_sign_bias);
-
-  vp9_find_best_ref_mvs(xd, y_buffer[frame_type],
-                        yv12->y_stride,
-                        mbmi->ref_mvs[frame_type],
-                        &frame_best_ref_mv[frame_type],
-                        &frame_nearest_mv[frame_type],
-                        &frame_near_mv[frame_type]);
-#endif
-}
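-
-/* Called once per enabled reference frame (see the VP9_LAST_FLAG /
- * VP9_GOLD_FLAG / VP9_ALT_FLAG checks in vp9_rd_pick_inter_mode()),
- * caching MV candidates and reconstruction pointers per frame type so
- * the mode loop can switch references cheaply. */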
-
-static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                                 enum BlockSize block_size,
-                                 int *saddone, int near_sadidx[],
-                                 int mdcounts[4], int64_t txfm_cache[],
-                                 int *rate2, int *distortion, int *skippable,
-                                 int *compmode_cost,
-                                 int *rate_y, int *distortion_y,
-                                 int *rate_uv, int *distortion_uv,
-                                 int *mode_excluded, int *disable_skip,
-                                 int recon_yoffset, int mode_index,
-                                 int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
-                                 int_mv frame_best_ref_mv[4]) {
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
-  BLOCK *b = &x->block[0];
-  BLOCKD *d = &xd->block[0];
-  const int is_comp_pred = (mbmi->second_ref_frame != 0);
-  const int num_refs = is_comp_pred ? 2 : 1;
-  const int this_mode = mbmi->mode;
-  int i;
-  int refs[2] = { mbmi->ref_frame, mbmi->second_ref_frame };
-  int_mv cur_mv[2];
-  int_mv mvp;
-  int64_t this_rd = 0;
-
-  switch (this_mode) {
-    case NEWMV:
-      if (is_comp_pred) {
-        if (frame_mv[NEWMV][refs[0]].as_int == INVALID_MV ||
-            frame_mv[NEWMV][refs[1]].as_int == INVALID_MV)
-          return INT64_MAX;
-        *rate2 += vp9_mv_bit_cost(&frame_mv[NEWMV][refs[0]],
-                                  &frame_best_ref_mv[refs[0]],
-                                  XMVCOST, 96,
-                                  x->e_mbd.allow_high_precision_mv);
-        *rate2 += vp9_mv_bit_cost(&frame_mv[NEWMV][refs[1]],
-                                  &frame_best_ref_mv[refs[1]],
-                                  XMVCOST, 96,
-                                  x->e_mbd.allow_high_precision_mv);
-      } else {
-        int bestsme = INT_MAX;
-        int further_steps, step_param = cpi->sf.first_step;
-        int sadpb = x->sadperbit16;
-        int_mv mvp_full, tmp_mv;
-        // Search range obtained from mv_pred(), expressed in step_param
-        // levels (0-7).
-        int sr = 0;
-
-        int tmp_col_min = x->mv_col_min;
-        int tmp_col_max = x->mv_col_max;
-        int tmp_row_min = x->mv_row_min;
-        int tmp_row_max = x->mv_row_max;
-
-        vp9_clamp_mv_min_max(x, &frame_best_ref_mv[refs[0]]);
-
-        if (!*saddone) {
-          cal_sad(cpi, xd, x, recon_yoffset, &near_sadidx[0], block_size);
-          *saddone = 1;
-        }
-
-        vp9_mv_pred(cpi, &x->e_mbd, x->e_mbd.mode_info_context, &mvp,
-                    mbmi->ref_frame, cpi->common.ref_frame_sign_bias,
-                    &sr, &near_sadidx[0]);
-
-        mvp_full.as_mv.col = mvp.as_mv.col >> 3;
-        mvp_full.as_mv.row = mvp.as_mv.row >> 3;
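-        // The >> 3 above converts the 1/8-pel MV prediction to full-pel
-        // units for the full-pixel diamond search below.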
-
-        // adjust search range according to sr from mv prediction
-        step_param = MAX(step_param, sr);
-
-        // Further step/diamond searches as necessary
-        further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
-
-        bestsme = vp9_full_pixel_diamond(cpi, x, b, d, &mvp_full, step_param,
-                                         sadpb, further_steps, 1,
-                                         &cpi->fn_ptr[block_size],
-                                         &frame_best_ref_mv[refs[0]], &tmp_mv);
-
-        x->mv_col_min = tmp_col_min;
-        x->mv_col_max = tmp_col_max;
-        x->mv_row_min = tmp_row_min;
-        x->mv_row_max = tmp_row_max;
-
-        if (bestsme < INT_MAX) {
-          int dis; /* TODO: use dis in distortion calculation later. */
-          unsigned int sse;
-          cpi->find_fractional_mv_step(x, b, d, &tmp_mv,
-                                       &frame_best_ref_mv[refs[0]],
-                                       x->errorperbit,
-                                       &cpi->fn_ptr[block_size],
-                                       XMVCOST, &dis, &sse);
-        }
-        d->bmi.as_mv.first.as_int = tmp_mv.as_int;
-        frame_mv[NEWMV][refs[0]].as_int = d->bmi.as_mv.first.as_int;
-
-        // Add the new motion vector cost to our rolling cost variable
-        *rate2 += vp9_mv_bit_cost(&tmp_mv, &frame_best_ref_mv[refs[0]],
-                                  XMVCOST, 96, xd->allow_high_precision_mv);
-      }
-      break;
-    case NEARESTMV:
-    case NEARMV:
-      // Do not bother proceeding if the vector (nearest or near) is
-      // (0,0), as this should then be coded using the zeromv mode.
-      for (i = 0; i < num_refs; ++i)
-        if (frame_mv[this_mode][refs[i]].as_int == 0)
-          return INT64_MAX;
-      /* fall through */
-    case ZEROMV:
-    default:
-      break;
-  }
-  for (i = 0; i < num_refs; ++i) {
-    cur_mv[i] = frame_mv[this_mode][refs[i]];
-    // Clip "next_nearest" so that it does not extend too far out of the
-    // image.
-    clamp_mv2(&cur_mv[i], xd);
-    if (mv_check_bounds(x, &cur_mv[i]))
-      return INT64_MAX;
-    mbmi->mv[i].as_int = cur_mv[i].as_int;
-  }
-
-#if CONFIG_PRED_FILTER
-  // Filtered prediction:
-  mbmi->pred_filter_enabled = vp9_mode_order[mode_index].pred_filter_flag;
-  *rate2 += vp9_cost_bit(cpi->common.prob_pred_filter_off,
-                         mbmi->pred_filter_enabled);
-#endif
-  if (cpi->common.mcomp_filter_type == SWITCHABLE) {
-    const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP);
-    const int m = vp9_switchable_interp_map[mbmi->interp_filter];
-    *rate2 += SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m];
-  }
-
-  /* We don't include the cost of the second reference here, because there
-   * are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
-   * words if you present them in that order, the second one is always known
-   * if the first is known */
-  *compmode_cost = vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_COMP),
-                                is_comp_pred);
-  *rate2 += vp9_cost_mv_ref(cpi, this_mode, mdcounts);
-
-  if (block_size == BLOCK_16X16) {
-    vp9_build_1st_inter16x16_predictors_mby(xd, xd->predictor, 16, 0);
-    if (is_comp_pred)
-      vp9_build_2nd_inter16x16_predictors_mby(xd, xd->predictor, 16);
-  } else {
-#if CONFIG_SUPERBLOCKS
-    vp9_build_inter32x32_predictors_sb(xd,
-                                       xd->dst.y_buffer,
-                                       xd->dst.u_buffer,
-                                       xd->dst.v_buffer,
-                                       xd->dst.y_stride,
-                                       xd->dst.uv_stride);
-#endif
-  }
-
-  if (cpi->active_map_enabled && x->active_ptr[0] == 0)
-    x->skip = 1;
-  else if (x->encode_breakout) {
-    unsigned int sse, var;
-    int threshold = (xd->block[0].dequant[1]
-                     * xd->block[0].dequant[1] >> 4);
-
-    if (threshold < x->encode_breakout)
-      threshold = x->encode_breakout;
-
-    if (block_size == BLOCK_16X16) {
-      var = vp9_variance16x16(*(b->base_src), b->src_stride,
-                              xd->predictor, 16, &sse);
-    } else {
-#if CONFIG_SUPERBLOCKS
-      var = vp9_variance32x32(*(b->base_src), b->src_stride,
-                              xd->dst.y_buffer, xd->dst.y_stride, &sse);
-#endif
-    }
-
-    if (sse < threshold) {
-      unsigned int q2dc = xd->block[24].dequant[0];
-      /* If there is no codeable 2nd order dc
-         or a very small uniform pixel change */
-      if ((sse - var < q2dc * q2dc >> 4) ||
-          (sse / 2 > var && sse - var < 64)) {
-        // Check u and v to make sure skip is ok
-        int sse2;
-
-        if (block_size == BLOCK_16X16) {
-          sse2 = vp9_uvsse(x);
-        } else {
-          unsigned int sse2u, sse2v;
-          var = vp9_variance16x16(x->src.u_buffer, x->src.uv_stride,
-                                  xd->dst.u_buffer, xd->dst.uv_stride, &sse2u);
-          var = vp9_variance16x16(x->src.v_buffer, x->src.uv_stride,
-                                  xd->dst.v_buffer, xd->dst.uv_stride, &sse2v);
-          sse2 = sse2u + sse2v;
-        }
-
-        if (sse2 * 2 < threshold) {
-          x->skip = 1;
-          *distortion = sse + sse2;
-          *rate2 = 500;
-
-          /* for best_yrd calculation */
-          *rate_uv = 0;
-          *distortion_uv = sse2;
-
-          *disable_skip = 1;
-          this_rd = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
-        }
-      }
-    }
-  }
-
-  if (!x->skip) {
-    if (block_size == BLOCK_16X16) {
-      vp9_build_1st_inter16x16_predictors_mbuv(xd, &xd->predictor[256],
-                                               &xd->predictor[320], 8);
-      if (is_comp_pred)
-        vp9_build_2nd_inter16x16_predictors_mbuv(xd, &xd->predictor[256],
-                                                 &xd->predictor[320], 8);
-      inter_mode_cost(cpi, x, this_mode, rate2, distortion,
-                      rate_y, distortion_y, rate_uv, distortion_uv,
-                      skippable, txfm_cache);
-    } else {
-#if CONFIG_SUPERBLOCKS
-      int skippable_y, skippable_uv;
-
-      // Y cost and distortion - FIXME support other transform sizes
-      super_block_yrd_8x8(x, rate_y, distortion_y,
-                          IF_RTCD(&cpi->rtcd), &skippable_y);
-      *rate2 += *rate_y;
-      *distortion += *distortion_y;
-
-      rd_inter32x32_uv_8x8(cpi, x, rate_uv, distortion_uv,
-                           cm->full_pixel, &skippable_uv);
-
-      *rate2 += *rate_uv;
-      *distortion += *distortion_uv;
-      *skippable = skippable_y && skippable_uv;
-#endif
-    }
-  }
-  if (is_comp_pred) {
-    *mode_excluded = (cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY);
-  } else {
-    *mode_excluded = (cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY);
-  }
-
-  return this_rd;  // if 0, this will be re-calculated by caller
-}
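-
-/* Return convention: INT64_MAX rejects the mode outright (invalid or
- * out-of-bounds MVs); 0 defers the final RDCOST computation to the
- * caller; a nonzero value is the breakout RD estimate computed when the
- * encode_breakout test above decided to skip. */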
-
-void vp9_rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                            int recon_yoffset, int recon_uvoffset,
-                            int *returnrate, int *returndistortion,
-                            int64_t *returnintra) {
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  union b_mode_info best_bmodes[16];
-  MB_MODE_INFO best_mbmode;
-  PARTITION_INFO best_partition;
-  int_mv best_ref_mv, second_best_ref_mv;
-  MB_PREDICTION_MODE this_mode;
-  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
-  int i, best_mode_index = 0;
-  int mode8x8[2][4];
-  unsigned char segment_id = mbmi->segment_id;
-
-  int mode_index;
-  int mdcounts[4];
-  int rate, distortion;
-  int rate2, distortion2;
-  int64_t best_txfm_rd[NB_TXFM_MODES];
-  int64_t best_txfm_diff[NB_TXFM_MODES];
-  int64_t best_pred_diff[NB_PREDICTION_TYPES];
-  int64_t best_pred_rd[NB_PREDICTION_TYPES];
-  int64_t best_rd = INT64_MAX, best_intra_rd = INT64_MAX;
-#if CONFIG_PRED_FILTER
-  int64_t best_overall_rd = INT64_MAX;
-#endif
-  int uv_intra_rate, uv_intra_distortion, uv_intra_rate_tokenonly;
-  int uv_intra_skippable = 0;
-  int uv_intra_rate_8x8 = 0, uv_intra_distortion_8x8 = 0;
-  int uv_intra_rate_tokenonly_8x8 = 0;
-  int uv_intra_skippable_8x8 = 0;
-  int rate_y, UNINITIALIZED_IS_SAFE(rate_uv);
-  int distortion_uv = INT_MAX;
-  int64_t best_yrd = INT64_MAX;
-#if CONFIG_PRED_FILTER
-  int best_filter_state = 0;  // avoid a possible uninitialized read below
-#endif
-  int switchable_filter_index = 0;
-
-  MB_PREDICTION_MODE uv_intra_mode;
-  MB_PREDICTION_MODE uv_intra_mode_8x8 = 0;
-
-  int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7};
-  int saddone = 0;
-
-  int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
-  int_mv frame_best_ref_mv[4];
-  int frame_mdcounts[4][4];
-  unsigned char *y_buffer[4], *u_buffer[4], *v_buffer[4];
-
-  unsigned int ref_costs[MAX_REF_FRAMES];
-  int_mv seg_mvs[NB_PARTITIONINGS][16 /* n_blocks */][MAX_REF_FRAMES - 1];
-
-  vpx_memset(mode8x8, 0, sizeof(mode8x8));
-  vpx_memset(&frame_mv, 0, sizeof(frame_mv));
-  vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
-  vpx_memset(&best_bmodes, 0, sizeof(best_bmodes));
-  vpx_memset(&x->mb_context[xd->mb_index], 0, sizeof(PICK_MODE_CONTEXT));
-
-  for (i = 0; i < MAX_REF_FRAMES; i++)
-    frame_mv[NEWMV][i].as_int = INVALID_MV;
-  for (i = 0; i < NB_PREDICTION_TYPES; ++i)
-    best_pred_rd[i] = INT64_MAX;
-  for (i = 0; i < NB_TXFM_MODES; i++)
-    best_txfm_rd[i] = INT64_MAX;
-
-  for (i = 0; i < NB_PARTITIONINGS; i++) {
-    int j, k;
-
-    for (j = 0; j < 16; j++)
-      for (k = 0; k < MAX_REF_FRAMES - 1; k++)
-        seg_mvs[i][j][k].as_int = INVALID_MV;
-  }
-
-  if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
-    setup_buffer_inter(cpi, x, cpi->common.lst_fb_idx, LAST_FRAME,
-                       recon_yoffset, recon_uvoffset, frame_mv[NEARESTMV],
-                       frame_mv[NEARMV], frame_best_ref_mv,
-                       frame_mdcounts, y_buffer, u_buffer, v_buffer);
-  }
-
-  if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
-    setup_buffer_inter(cpi, x, cpi->common.gld_fb_idx, GOLDEN_FRAME,
-                       recon_yoffset, recon_uvoffset, frame_mv[NEARESTMV],
-                       frame_mv[NEARMV], frame_best_ref_mv,
-                       frame_mdcounts, y_buffer, u_buffer, v_buffer);
-  }
-
-  if (cpi->ref_frame_flags & VP9_ALT_FLAG) {
-    setup_buffer_inter(cpi, x, cpi->common.alt_fb_idx, ALTREF_FRAME,
-                       recon_yoffset, recon_uvoffset, frame_mv[NEARESTMV],
-                       frame_mv[NEARMV], frame_best_ref_mv,
-                       frame_mdcounts, y_buffer, u_buffer, v_buffer);
-  }
-
-  *returnintra = INT64_MAX;
-
-  x->skip = 0;
-
-  mbmi->ref_frame = INTRA_FRAME;
-
-  /* Initialize zbin mode boost for uv costing */
-  cpi->zbin_mode_boost = 0;
-  vp9_update_zbin_extra(cpi, x);
-
-  rd_pick_intra_mbuv_mode(cpi, x, &uv_intra_rate,
-                          &uv_intra_rate_tokenonly, &uv_intra_distortion,
-                          &uv_intra_skippable);
-  uv_intra_mode = mbmi->uv_mode;
-
-  /* rough estimate for now */
-  if (cpi->common.txfm_mode != ONLY_4X4) {
-    rd_pick_intra_mbuv_mode_8x8(cpi, x, &uv_intra_rate_8x8,
-                                &uv_intra_rate_tokenonly_8x8,
-                                &uv_intra_distortion_8x8,
-                                &uv_intra_skippable_8x8);
-    uv_intra_mode_8x8 = mbmi->uv_mode;
-  }
-
-  // Get estimates of reference frame costs for each reference frame
-  // that depend on the current prediction etc.
-  estimate_ref_frame_costs(cpi, segment_id, ref_costs);
-
-  for (mode_index = 0; mode_index < MAX_MODES;
-       mode_index += (!switchable_filter_index)) {
-    int64_t this_rd = INT64_MAX;
-    int disable_skip = 0, skippable = 0;
-    int other_cost = 0;
-    int compmode_cost = 0;
-    int mode_excluded = 0;
-    int64_t txfm_cache[NB_TXFM_MODES] = { 0 };
-
-    // These variables hold the rolling total cost and distortion for
-    // this mode.
-    rate2 = 0;
-    distortion2 = 0;
-    rate_y = 0;
-    rate_uv = 0;
-
-    this_mode = vp9_mode_order[mode_index].mode;
-    mbmi->mode = this_mode;
-    mbmi->uv_mode = DC_PRED;
-    mbmi->ref_frame = vp9_mode_order[mode_index].ref_frame;
-    mbmi->second_ref_frame = vp9_mode_order[mode_index].second_ref_frame;
-#if CONFIG_PRED_FILTER
-    mbmi->pred_filter_enabled = 0;
-#endif
-    if (cpi->common.mcomp_filter_type == SWITCHABLE &&
-        this_mode >= NEARESTMV && this_mode <= SPLITMV) {
-      mbmi->interp_filter =
-          vp9_switchable_interp[switchable_filter_index++];
-      if (switchable_filter_index == VP9_SWITCHABLE_FILTERS)
-        switchable_filter_index = 0;
-    } else {
-      mbmi->interp_filter = cpi->common.mcomp_filter_type;
-    }
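-    // With SWITCHABLE filtering, switchable_filter_index cycles through
-    // the candidate filters and mode_index (see the loop header) only
-    // advances once the cycle wraps back to zero, so each inter mode is
-    // evaluated once per switchable filter.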
-    vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
-
-    // Test best rd so far against threshold for trying this mode.
-    if (best_rd <= cpi->rd_threshes[mode_index])
-      continue;
-
-    // current coding mode under rate-distortion optimization test loop
-#if CONFIG_COMP_INTRA_PRED
-    mbmi->second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-    mbmi->second_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
-
-    // If the segment reference frame feature is enabled,
-    // then do nothing if the current ref frame is not allowed.
-    if (vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
-        !vp9_check_segref(xd, segment_id, mbmi->ref_frame)) {
-      continue;
-    // If the segment mode feature is enabled,
-    // then do nothing if the current mode is not allowed.
-    } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE) &&
-               (this_mode !=
-                vp9_get_segdata(xd, segment_id, SEG_LVL_MODE))) {
-      continue;
-    // Disable this drop-out case if either the mode or ref frame
-    // segment-level feature is enabled for this segment. This is to
-    // prevent the possibility that we end up unable to pick any mode.
-    } else if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
-               !vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
-      // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
-      // unless ARNR filtering is enabled in which case we want
-      // an unfiltered alternative
-      if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
-        if (this_mode != ZEROMV ||
-            mbmi->ref_frame != ALTREF_FRAME) {
-          continue;
-        }
-      }
-    }
-
-    /* everything but intra */
-    if (mbmi->ref_frame) {
-      int ref = mbmi->ref_frame;
-
-      xd->pre.y_buffer = y_buffer[ref];
-      xd->pre.u_buffer = u_buffer[ref];
-      xd->pre.v_buffer = v_buffer[ref];
-      best_ref_mv = frame_best_ref_mv[ref];
-      vpx_memcpy(mdcounts, frame_mdcounts[ref], sizeof(mdcounts));
-    }
-
-    if (mbmi->second_ref_frame) {
-      int ref = mbmi->second_ref_frame;
-
-      xd->second_pre.y_buffer = y_buffer[ref];
-      xd->second_pre.u_buffer = u_buffer[ref];
-      xd->second_pre.v_buffer = v_buffer[ref];
-      second_best_ref_mv  = frame_best_ref_mv[ref];
-    }
-
-    // Experimental code. Special case for gf and arf zeromv modes.
-    // Increase zbin size to suppress noise
-    if (cpi->zbin_mode_boost_enabled) {
-      if (vp9_mode_order[mode_index].ref_frame == INTRA_FRAME)
-        cpi->zbin_mode_boost = 0;
-      else {
-        if (vp9_mode_order[mode_index].mode == ZEROMV) {
-          if (vp9_mode_order[mode_index].ref_frame != LAST_FRAME)
-            cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
-          else
-            cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
-        } else if (vp9_mode_order[mode_index].mode == SPLITMV)
-          cpi->zbin_mode_boost = 0;
-        else
-          cpi->zbin_mode_boost = MV_ZBIN_BOOST;
-      }
-
-      vp9_update_zbin_extra(cpi, x);
-    }
-
-    // Intra
-    if (!mbmi->ref_frame) {
-      switch (this_mode) {
-        default:
-        case DC_PRED:
-        case V_PRED:
-        case H_PRED:
-        case TM_PRED:
-        case D45_PRED:
-        case D135_PRED:
-        case D117_PRED:
-        case D153_PRED:
-        case D27_PRED:
-        case D63_PRED:
-          mbmi->ref_frame = INTRA_FRAME;
-          // FIXME compound intra prediction
-          vp9_build_intra_predictors_mby(&x->e_mbd);
-          macro_block_yrd(cpi, x, &rate_y, &distortion, &skippable, txfm_cache);
-          rate2 += rate_y;
-          distortion2 += distortion;
-          rate2 += x->mbmode_cost[xd->frame_type][mbmi->mode];
-          if (mbmi->txfm_size != TX_4X4) {
-            rate2 += uv_intra_rate_8x8;
-            rate_uv = uv_intra_rate_tokenonly_8x8;
-            distortion2 += uv_intra_distortion_8x8;
-            distortion_uv = uv_intra_distortion_8x8;
-            skippable = skippable && uv_intra_skippable_8x8;
-          } else {
-            rate2 += uv_intra_rate;
-            rate_uv = uv_intra_rate_tokenonly;
-            distortion2 += uv_intra_distortion;
-            distortion_uv = uv_intra_distortion;
-            skippable = skippable && uv_intra_skippable;
-          }
-          break;
-        case B_PRED: {
-          int64_t tmp_rd;
-
-          // Note the rate value returned here includes the cost of coding
-          // the BPRED mode : x->mbmode_cost[xd->frame_type][BPRED];
-          mbmi->txfm_size = TX_4X4;
-          tmp_rd = rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y,
-                                             &distortion, best_yrd,
-#if CONFIG_COMP_INTRA_PRED
-                                             0,
-#endif
-                                             0);
-          rate2 += rate;
-          distortion2 += distortion;
-
-          if (tmp_rd < best_yrd) {
-            rate2 += uv_intra_rate;
-            rate_uv = uv_intra_rate_tokenonly;
-            distortion2 += uv_intra_distortion;
-            distortion_uv = uv_intra_distortion;
-          } else {
-            this_rd = INT64_MAX;
-            disable_skip = 1;
-          }
-        }
-        break;
-        case I8X8_PRED: {
-          int cost0 = vp9_cost_bit(cm->prob_tx[0], 0);
-          int cost1 = vp9_cost_bit(cm->prob_tx[0], 1);
-          int64_t tmp_rd_4x4s, tmp_rd_8x8s;
-          int64_t tmp_rd_4x4, tmp_rd_8x8, tmp_rd;
-          int r4x4, tok4x4, d4x4, r8x8, tok8x8, d8x8;
-          mbmi->txfm_size = TX_4X4;
-          tmp_rd_4x4 = rd_pick_intra8x8mby_modes(cpi, x, &r4x4, &tok4x4,
-                                                 &d4x4, best_yrd);
-          mode8x8[0][0] = xd->mode_info_context->bmi[0].as_mode.first;
-          mode8x8[0][1] = xd->mode_info_context->bmi[2].as_mode.first;
-          mode8x8[0][2] = xd->mode_info_context->bmi[8].as_mode.first;
-          mode8x8[0][3] = xd->mode_info_context->bmi[10].as_mode.first;
-#if CONFIG_COMP_INTRA_PRED
-          mode8x8[1][0] = xd->mode_info_context->bmi[0].as_mode.second;
-          mode8x8[1][1] = xd->mode_info_context->bmi[2].as_mode.second;
-          mode8x8[1][2] = xd->mode_info_context->bmi[8].as_mode.second;
-          mode8x8[1][3] = xd->mode_info_context->bmi[10].as_mode.second;
-#endif
-          mbmi->txfm_size = TX_8X8;
-          tmp_rd_8x8 = rd_pick_intra8x8mby_modes(cpi, x, &r8x8, &tok8x8,
-                                                 &d8x8, best_yrd);
-          txfm_cache[ONLY_4X4]  = tmp_rd_4x4;
-          txfm_cache[ALLOW_8X8] = tmp_rd_8x8;
-          txfm_cache[ALLOW_16X16] = tmp_rd_8x8;
-          tmp_rd_4x4s = tmp_rd_4x4 + RDCOST(x->rdmult, x->rddiv, cost0, 0);
-          tmp_rd_8x8s = tmp_rd_8x8 + RDCOST(x->rdmult, x->rddiv, cost1, 0);
-          txfm_cache[TX_MODE_SELECT] =
-              tmp_rd_4x4s < tmp_rd_8x8s ? tmp_rd_4x4s : tmp_rd_8x8s;
-          if (cm->txfm_mode == TX_MODE_SELECT) {
-            if (tmp_rd_4x4s < tmp_rd_8x8s) {
-              rate = r4x4 + cost0;
-              rate_y = tok4x4 + cost0;
-              distortion = d4x4;
-              mbmi->txfm_size = TX_4X4;
-              tmp_rd = tmp_rd_4x4s;
-            } else {
-              rate = r8x8 + cost1;
-              rate_y = tok8x8 + cost1;
-              distortion = d8x8;
-              mbmi->txfm_size = TX_8X8;
-              tmp_rd = tmp_rd_8x8s;
-
-              mode8x8[0][0] = xd->mode_info_context->bmi[0].as_mode.first;
-              mode8x8[0][1] = xd->mode_info_context->bmi[2].as_mode.first;
-              mode8x8[0][2] = xd->mode_info_context->bmi[8].as_mode.first;
-              mode8x8[0][3] = xd->mode_info_context->bmi[10].as_mode.first;
-#if CONFIG_COMP_INTRA_PRED
-              mode8x8[1][0] = xd->mode_info_context->bmi[0].as_mode.second;
-              mode8x8[1][1] = xd->mode_info_context->bmi[2].as_mode.second;
-              mode8x8[1][2] = xd->mode_info_context->bmi[8].as_mode.second;
-              mode8x8[1][3] = xd->mode_info_context->bmi[10].as_mode.second;
-#endif
-            }
-          } else if (cm->txfm_mode == ONLY_4X4) {
-            rate = r4x4;
-            rate_y = tok4x4;
-            distortion = d4x4;
-            mbmi->txfm_size = TX_4X4;
-            tmp_rd = tmp_rd_4x4;
-          } else {
-            rate = r8x8;
-            rate_y = tok8x8;
-            distortion = d8x8;
-            mbmi->txfm_size = TX_8X8;
-            tmp_rd = tmp_rd_8x8;
-
-            mode8x8[0][0] = xd->mode_info_context->bmi[0].as_mode.first;
-            mode8x8[0][1] = xd->mode_info_context->bmi[2].as_mode.first;
-            mode8x8[0][2] = xd->mode_info_context->bmi[8].as_mode.first;
-            mode8x8[0][3] = xd->mode_info_context->bmi[10].as_mode.first;
-#if CONFIG_COMP_INTRA_PRED
-            mode8x8[1][0] = xd->mode_info_context->bmi[0].as_mode.second;
-            mode8x8[1][1] = xd->mode_info_context->bmi[2].as_mode.second;
-            mode8x8[1][2] = xd->mode_info_context->bmi[8].as_mode.second;
-            mode8x8[1][3] = xd->mode_info_context->bmi[10].as_mode.second;
-#endif
-          }
-
-          rate2 += rate;
-          distortion2 += distortion;
-
-          /* TODO: uv rate may be over-estimated here since a UV intra
-                   mode is also coded in I8X8_PRED prediction */
-          if (tmp_rd < best_yrd) {
-            rate2 += uv_intra_rate;
-            rate_uv = uv_intra_rate_tokenonly;
-            distortion2 += uv_intra_distortion;
-            distortion_uv = uv_intra_distortion;
-          } else {
-            this_rd = INT64_MAX;
-            disable_skip = 1;
-          }
-        }
-        break;
-      }
-    }
-    // Split MV. The code is very different from the other inter modes so
-    // special case it.
-    else if (this_mode == SPLITMV) {
-      const int is_comp_pred = mbmi->second_ref_frame != 0;
-      int64_t tmp_rd, this_rd_thresh;
-      int_mv *second_ref = is_comp_pred ? &second_best_ref_mv : NULL;
-
-      this_rd_thresh = (mbmi->ref_frame == LAST_FRAME) ?
-          cpi->rd_threshes[THR_NEWMV] : cpi->rd_threshes[THR_NEWA];
-      this_rd_thresh = (mbmi->ref_frame == GOLDEN_FRAME) ?
-          cpi->rd_threshes[THR_NEWG] : this_rd_thresh;
-
-      tmp_rd = rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv,
-                                           second_ref, best_yrd, mdcounts,
-                                           &rate, &rate_y, &distortion,
-                                           &skippable,
-                                           this_rd_thresh, seg_mvs,
-                                           txfm_cache);
-      rate2 += rate;
-      distortion2 += distortion;
-
-      if (cpi->common.mcomp_filter_type == SWITCHABLE)
-        rate2 += SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs
-            [vp9_get_pred_context(&cpi->common, xd, PRED_SWITCHABLE_INTERP)]
-                [vp9_switchable_interp_map[mbmi->interp_filter]];
-      // If even the 'Y' rd value of the split is higher than the best so
-      // far, then don't bother looking at UV.
-      if (tmp_rd < best_yrd) {
-        int uv_skippable;
-
-        rd_inter4x4_uv(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
-                       cpi->common.full_pixel);
-        rate2 += rate_uv;
-        distortion2 += distortion_uv;
-        skippable = skippable && uv_skippable;
-      } else {
-        this_rd = INT64_MAX;
-        disable_skip = 1;
-      }
-
-      if (is_comp_pred)
-        mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY;
-      else
-        mode_excluded = cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY;
-
-      compmode_cost =
-        vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_COMP), is_comp_pred);
-      mbmi->mode = this_mode;
-    }
-    else {
-      this_rd = handle_inter_mode(cpi, x, BLOCK_16X16,
-                                  &saddone, near_sadidx, mdcounts, txfm_cache,
-                                  &rate2, &distortion2, &skippable,
-                                  &compmode_cost, &rate_y, &distortion,
-                                  &rate_uv, &distortion_uv,
-                                  &mode_excluded, &disable_skip, recon_yoffset,
-                                  mode_index, frame_mv, frame_best_ref_mv);
-      if (this_rd == INT64_MAX)
-        continue;
-    }
-
-    if (cpi->common.comp_pred_mode == HYBRID_PREDICTION)
-      rate2 += compmode_cost;
-
-    // Estimate the reference frame signaling cost and add it
-    // to the rolling cost variable.
-    rate2 += ref_costs[mbmi->ref_frame];
-
-    if (!disable_skip) {
-      // Test for the condition where skip block will be activated
-      // because there are no non-zero coefficients, and make any
-      // necessary adjustment for rate. Ignore if skip is coded at
-      // the segment level as the cost won't have been added in.
-      if (cpi->common.mb_no_coeff_skip) {
-        int mb_skip_allowed;
-
-        // Is MB-level skip allowed for this MB?
-        mb_skip_allowed =
-          !vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
-          vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-
-        if (skippable) {
-          mbmi->mb_skip_coeff = 1;
-
-          // Back out the coefficient coding costs
-          rate2 -= (rate_y + rate_uv);
-          // for best_yrd calculation
-          rate_uv = 0;
-
-          if (mb_skip_allowed) {
-            int prob_skip_cost;
-
-            // Cost the skip mb case
-            vp9_prob skip_prob =
-              vp9_get_pred_prob(cm, &x->e_mbd, PRED_MBSKIP);
-
-            if (skip_prob) {
-              prob_skip_cost = vp9_cost_bit(skip_prob, 1);
-              rate2 += prob_skip_cost;
-              other_cost += prob_skip_cost;
-            }
-          }
-        }
-        // Add in the cost of the no skip flag.
-        else {
-          mbmi->mb_skip_coeff = 0;
-          if (mb_skip_allowed) {
-            int prob_skip_cost = vp9_cost_bit(
-                   vp9_get_pred_prob(cm, &x->e_mbd, PRED_MBSKIP), 0);
-            rate2 += prob_skip_cost;
-            other_cost += prob_skip_cost;
-          }
-        }
-      }
-
-      // Calculate the final RD estimate for this mode.
-      this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
-    }
-
-    // Keep record of best intra distortion
-    if ((mbmi->ref_frame == INTRA_FRAME) &&
-        (this_rd < best_intra_rd)) {
-      best_intra_rd = this_rd;
-      *returnintra = distortion2;
-    }
-
-    if (!disable_skip && mbmi->ref_frame == INTRA_FRAME)
-      for (i = 0; i < NB_PREDICTION_TYPES; ++i)
-        best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
-
-#if CONFIG_PRED_FILTER
-    // Keep track of the best mode irrespective of prediction filter state
-    if (this_rd < best_overall_rd) {
-      best_overall_rd = this_rd;
-      best_filter_state = mbmi->pred_filter_enabled;
-    }
-
-    // Ignore modes where the prediction filter state doesn't
-    // match the state signaled at the frame level
-    if ((cm->pred_filter_mode == 2) ||
-        (cm->pred_filter_mode ==
-         mbmi->pred_filter_enabled)) {
-#endif
-      // Did this mode help, i.e. is it the new best mode?
-      if (this_rd < best_rd || x->skip) {
-        if (!mode_excluded) {
-          // Note index of best mode so far
-          best_mode_index = mode_index;
-
-          if (this_mode <= B_PRED) {
-            if (mbmi->txfm_size != TX_4X4
-                && this_mode != B_PRED
-                && this_mode != I8X8_PRED)
-              mbmi->uv_mode = uv_intra_mode_8x8;
-            else
-              mbmi->uv_mode = uv_intra_mode;
-            /* required for left and above block mv */
-            mbmi->mv[0].as_int = 0;
-          }
-
-          other_cost += ref_costs[mbmi->ref_frame];
-
-          /* Calculate the final y RD estimate for this mode */
-          best_yrd = RDCOST(x->rdmult, x->rddiv, (rate2 - rate_uv - other_cost),
-                            (distortion2 - distortion_uv));
-
-          *returnrate = rate2;
-          *returndistortion = distortion2;
-          best_rd = this_rd;
-          vpx_memcpy(&best_mbmode, mbmi, sizeof(MB_MODE_INFO));
-          vpx_memcpy(&best_partition, x->partition_info, sizeof(PARTITION_INFO));
-
-          if ((this_mode == B_PRED)
-              || (this_mode == I8X8_PRED)
-              || (this_mode == SPLITMV))
-            for (i = 0; i < 16; i++) {
-              best_bmodes[i] = xd->block[i].bmi;
-            }
-        }
-
-        // Testing this mode gave rise to an improvement in best error score.
-        // Lower threshold a bit for next time
-        cpi->rd_thresh_mult[mode_index] =
-            (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ?
-            cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
-        cpi->rd_threshes[mode_index] =
-            (cpi->rd_baseline_thresh[mode_index] >> 7) *
-            cpi->rd_thresh_mult[mode_index];
-      }
-      // If the mode did not help improve the best error case then raise the
-      // threshold for testing that mode next time around.
-      else {
-        cpi->rd_thresh_mult[mode_index] += 4;
-
-        if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
-          cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
-
-        cpi->rd_threshes[mode_index] =
-            (cpi->rd_baseline_thresh[mode_index] >> 7) *
-            cpi->rd_thresh_mult[mode_index];
-      }
-
-      /* keep record of best compound/single-only prediction */
-      if (!disable_skip &&
-          mbmi->ref_frame != INTRA_FRAME) {
-        int64_t single_rd, hybrid_rd;
-        int single_rate, hybrid_rate;
-
-        if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
-          single_rate = rate2 - compmode_cost;
-          hybrid_rate = rate2;
-        } else {
-          single_rate = rate2;
-          hybrid_rate = rate2 + compmode_cost;
-        }
-
-        single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
-        hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
-
-        if (mbmi->second_ref_frame == INTRA_FRAME &&
-            single_rd < best_pred_rd[SINGLE_PREDICTION_ONLY]) {
-          best_pred_rd[SINGLE_PREDICTION_ONLY] = single_rd;
-        } else if (mbmi->second_ref_frame != INTRA_FRAME &&
-                   single_rd < best_pred_rd[COMP_PREDICTION_ONLY]) {
-          best_pred_rd[COMP_PREDICTION_ONLY] = single_rd;
-        }
-        if (hybrid_rd < best_pred_rd[HYBRID_PREDICTION])
-          best_pred_rd[HYBRID_PREDICTION] = hybrid_rd;
-      }
-
-      /* keep record of best txfm size */
-      if (!mode_excluded && this_rd != INT64_MAX) {
-        for (i = 0; i < NB_TXFM_MODES; i++) {
-          int64_t adj_rd;
-          if (this_mode != B_PRED) {
-            adj_rd = this_rd + txfm_cache[i] - txfm_cache[cm->txfm_mode];
-          } else {
-            adj_rd = this_rd;
-          }
-          if (adj_rd < best_txfm_rd[i])
-            best_txfm_rd[i] = adj_rd;
-        }
-      }
-#if CONFIG_PRED_FILTER
-    }
-#endif
-
-    if (x->skip && !mode_excluded)
-      break;
-  }
-
-#if CONFIG_PRED_FILTER
-  // Update counts for prediction filter usage
-  if (best_filter_state != 0)
-    ++cpi->pred_filter_on_count;
-  else
-    ++cpi->pred_filter_off_count;
-#endif
-  if (cpi->common.mcomp_filter_type == SWITCHABLE &&
-      best_mbmode.mode >= NEARESTMV &&
-      best_mbmode.mode <= SPLITMV) {
-    ++cpi->switchable_interp_count
-        [vp9_get_pred_context(&cpi->common, xd, PRED_SWITCHABLE_INTERP)]
-        [vp9_switchable_interp_map[best_mbmode.interp_filter]];
-  }
-
-  // Reduce the activation RD thresholds for the best choice mode
-  if ((cpi->rd_baseline_thresh[best_mode_index] > 0) &&
-      (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2))) {
-    int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 2);
-
-    cpi->rd_thresh_mult[best_mode_index] =
-        (cpi->rd_thresh_mult[best_mode_index] >=
-         (MIN_THRESHMULT + best_adjustment)) ?
-        cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT;
-    cpi->rd_threshes[best_mode_index] =
-        (cpi->rd_baseline_thresh[best_mode_index] >> 7) *
-        cpi->rd_thresh_mult[best_mode_index];
-  }
-
-  // This code forces Altref,0,0 and skip for the frame that overlays an
-  // altref frame, unless the altref is filtered. However, this is unsafe
-  // if segment-level coding of ref frame or mode is enabled for this
-  // segment.
-  if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
-      !vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE) &&
-      cpi->is_src_frame_alt_ref &&
-      (cpi->oxcf.arnr_max_frames == 0) &&
-      (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) {
-    mbmi->mode = ZEROMV;
-    if (cm->txfm_mode != TX_MODE_SELECT)
-      mbmi->txfm_size = cm->txfm_mode;
-    else
-      mbmi->txfm_size = TX_16X16;
-    mbmi->ref_frame = ALTREF_FRAME;
-    mbmi->mv[0].as_int = 0;
-    mbmi->uv_mode = DC_PRED;
-    mbmi->mb_skip_coeff =
-      (cpi->common.mb_no_coeff_skip) ? 1 : 0;
-    mbmi->partitioning = 0;
-
-    vpx_memset(best_pred_diff, 0, sizeof(best_pred_diff));
-    vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
-    goto end;
-  }
-
-  // macroblock modes
-  vpx_memcpy(mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
-  if (best_mbmode.mode == B_PRED) {
-    for (i = 0; i < 16; i++) {
-      xd->mode_info_context->bmi[i].as_mode = best_bmodes[i].as_mode;
-      xd->block[i].bmi.as_mode = xd->mode_info_context->bmi[i].as_mode;
-    }
-  }
-
-  if (best_mbmode.mode == I8X8_PRED)
-    set_i8x8_block_modes(x, mode8x8);
-
-  if (best_mbmode.mode == SPLITMV) {
-    for (i = 0; i < 16; i++)
-      xd->mode_info_context->bmi[i].as_mv.first.as_int =
-          best_bmodes[i].as_mv.first.as_int;
-    if (mbmi->second_ref_frame)
-      for (i = 0; i < 16; i++)
-        xd->mode_info_context->bmi[i].as_mv.second.as_int =
-            best_bmodes[i].as_mv.second.as_int;
-
-    vpx_memcpy(x->partition_info, &best_partition, sizeof(PARTITION_INFO));
-
-    mbmi->mv[0].as_int = x->partition_info->bmi[15].mv.as_int;
-    mbmi->mv[1].as_int = x->partition_info->bmi[15].second_mv.as_int;
-  }
-
-  for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
-    if (best_pred_rd[i] == INT64_MAX)
-      best_pred_diff[i] = INT_MIN;
-    else
-      best_pred_diff[i] = best_rd - best_pred_rd[i];
-  }
-
-  if (!x->skip) {
-    for (i = 0; i < NB_TXFM_MODES; i++) {
-      if (best_txfm_rd[i] == INT64_MAX)
-        best_txfm_diff[i] = INT_MIN;
-      else
-        best_txfm_diff[i] = best_rd - best_txfm_rd[i];
-    }
-  } else {
-    vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
-  }
-
-end:
-  store_coding_context(x, &x->mb_context[xd->mb_index], best_mode_index,
-                       &best_partition,
-                       &frame_best_ref_mv[xd->mode_info_context->mbmi.ref_frame],
-                       &frame_best_ref_mv[xd->mode_info_context->mbmi.second_ref_frame],
-                       best_pred_diff[0], best_pred_diff[1], best_pred_diff[2],
-                       best_txfm_diff);
-}
-
-#if CONFIG_SUPERBLOCKS
-void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
-                               int *returnrate,
-                               int *returndist) {
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  int rate_y, rate_uv;
-  int rate_y_tokenonly, rate_uv_tokenonly;
-  int error_y, error_uv;
-  int dist_y, dist_uv;
-  int y_skip, uv_skip;
-
-  xd->mode_info_context->mbmi.txfm_size = TX_8X8;
-
-  error_uv = rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
-                                     &dist_uv, &uv_skip);
-  error_y = rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
-                                   &dist_y, &y_skip);
-
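-  // When both planes are skippable, back out the token costs so only
-  // the mode bits plus the skip flag are counted.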
-  if (cpi->common.mb_no_coeff_skip && y_skip && uv_skip) {
-    *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
-                  vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1);
-    *returndist = dist_y + (dist_uv >> 2);
-  } else {
-    *returnrate = rate_y + rate_uv;
-    if (cpi->common.mb_no_coeff_skip)
-      *returnrate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
-    *returndist = dist_y + (dist_uv >> 2);
-  }
-}
-#endif
-
-void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                            int *returnrate, int *returndist) {
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi;
-  int64_t error4x4, error16x16;
-#if CONFIG_COMP_INTRA_PRED
-  int64_t error4x4d;
-  int rate4x4d, dist4x4d;
-#endif
-  int rate4x4, rate16x16 = 0, rateuv, rateuv8x8;
-  int dist4x4, dist16x16, distuv, distuv8x8;
-  int rate;
-  int rate4x4_tokenonly = 0;
-  int rate16x16_tokenonly = 0;
-  int rateuv_tokenonly = 0, rateuv8x8_tokenonly = 0;
-  int64_t error8x8;
-  int rate8x8_tokenonly = 0;
-  int rate8x8, dist8x8;
-  int mode16x16;
-  int mode8x8[2][4];
-  int dist;
-  int modeuv, modeuv8x8, uv_intra_skippable, uv_intra_skippable_8x8;
-  int y_intra16x16_skippable;
-  int64_t txfm_cache[NB_TXFM_MODES];
-  TX_SIZE txfm_size_16x16;
-  int i;
-
-  mbmi->ref_frame = INTRA_FRAME;
-  rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv,
-                          &uv_intra_skippable);
-  modeuv = mbmi->uv_mode;
-  if (cpi->common.txfm_mode != ONLY_4X4) {
-    rd_pick_intra_mbuv_mode_8x8(cpi, x, &rateuv8x8, &rateuv8x8_tokenonly,
-                                &distuv8x8, &uv_intra_skippable_8x8);
-    modeuv8x8 = mbmi->uv_mode;
-  } else {
-    uv_intra_skippable_8x8 = uv_intra_skippable;
-    rateuv8x8 = rateuv;
-    distuv8x8 = distuv;
-    rateuv8x8_tokenonly = rateuv_tokenonly;
-    modeuv8x8 = modeuv;
-  }
-
-  // Rate-distortion test of the 16x16 intra modes for the current macroblock
-  error16x16 = rd_pick_intra16x16mby_mode(cpi, x, &rate16x16,
-                                          &rate16x16_tokenonly, &dist16x16,
-                                          &y_intra16x16_skippable, txfm_cache);
-  mode16x16 = mbmi->mode;
-  txfm_size_16x16 = mbmi->txfm_size;
-
-  // FIXME(rbultje) support transform-size selection
-  mbmi->txfm_size = (cm->txfm_mode == ONLY_4X4) ? TX_4X4 : TX_8X8;
-  error8x8 = rd_pick_intra8x8mby_modes(cpi, x, &rate8x8, &rate8x8_tokenonly,
-                                       &dist8x8, error16x16);
-  mode8x8[0][0] = xd->mode_info_context->bmi[0].as_mode.first;
-  mode8x8[0][1] = xd->mode_info_context->bmi[2].as_mode.first;
-  mode8x8[0][2] = xd->mode_info_context->bmi[8].as_mode.first;
-  mode8x8[0][3] = xd->mode_info_context->bmi[10].as_mode.first;
-#if CONFIG_COMP_INTRA_PRED
-  mode8x8[1][0] = xd->mode_info_context->bmi[0].as_mode.second;
-  mode8x8[1][1] = xd->mode_info_context->bmi[2].as_mode.second;
-  mode8x8[1][2] = xd->mode_info_context->bmi[8].as_mode.second;
-  mode8x8[1][3] = xd->mode_info_context->bmi[10].as_mode.second;
-#endif
-
-  error4x4 = rd_pick_intra4x4mby_modes(cpi, x,
-                                       &rate4x4, &rate4x4_tokenonly,
-                                       &dist4x4, error16x16,
-#if CONFIG_COMP_INTRA_PRED
-                                       0,
-#endif
-                                       0);
-#if CONFIG_COMP_INTRA_PRED
-  error4x4d = rd_pick_intra4x4mby_modes(cpi, x,
-                                        &rate4x4d, &rate4x4_tokenonly,
-                                        &dist4x4d, error16x16, 1, 0);
-#endif
-
-  mbmi->mb_skip_coeff = 0;
-  if (cpi->common.mb_no_coeff_skip &&
-      y_intra16x16_skippable && uv_intra_skippable_8x8) {
-    mbmi->mb_skip_coeff = 1;
-    mbmi->mode = mode16x16;
-    mbmi->uv_mode = modeuv;
-    rate = rateuv8x8 + rate16x16 - rateuv8x8_tokenonly - rate16x16_tokenonly +
-           vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1);
-    dist = dist16x16 + (distuv8x8 >> 2);
-    mbmi->txfm_size = txfm_size_16x16;
-    memset(x->mb_context[xd->mb_index].txfm_rd_diff, 0,
-           sizeof(x->mb_context[xd->mb_index].txfm_rd_diff));
-  } else if (error8x8 > error16x16) {
-    if (error4x4 < error16x16) {
-      rate = rateuv;
-#if CONFIG_COMP_INTRA_PRED
-      rate += (error4x4d < error4x4) ? rate4x4d : rate4x4;
-      if (error4x4d >= error4x4) // FIXME save original modes etc.
-        error4x4 = rd_pick_intra4x4mby_modes(cpi, x, &rate4x4,
-                                             &rate4x4_tokenonly,
-                                             &dist4x4, error16x16, 0,
-                                             cpi->update_context);
-#else
-      rate += rate4x4;
-#endif
-      mbmi->mode = B_PRED;
-      mbmi->txfm_size = TX_4X4;
-      dist = dist4x4 + (distuv >> 2);
-      memset(x->mb_context[xd->mb_index].txfm_rd_diff, 0,
-             sizeof(x->mb_context[xd->mb_index].txfm_rd_diff));
-    } else {
-      mbmi->txfm_size = txfm_size_16x16;
-      mbmi->mode = mode16x16;
-      rate = rate16x16 + rateuv8x8;
-      dist = dist16x16 + (distuv8x8 >> 2);
-      for (i = 0; i < NB_TXFM_MODES; i++) {
-        x->mb_context[xd->mb_index].txfm_rd_diff[i] = error16x16 - txfm_cache[i];
-      }
-    }
-    if (cpi->common.mb_no_coeff_skip)
-      rate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
-  } else {
-    if (error4x4 < error8x8) {
-      rate = rateuv;
-#if CONFIG_COMP_INTRA_PRED
-      rate += (error4x4d < error4x4) ? rate4x4d : rate4x4;
-      if (error4x4d >= error4x4) // FIXME save original modes etc.
-        error4x4 = rd_pick_intra4x4mby_modes(cpi, x, &rate4x4,
-                                             &rate4x4_tokenonly,
-                                             &dist4x4, error16x16, 0,
-                                             cpi->update_context);
-#else
-      rate += rate4x4;
-#endif
-      mbmi->mode = B_PRED;
-      mbmi->txfm_size = TX_4X4;
-      dist = dist4x4 + (distuv >> 2);
-      memset(x->mb_context[xd->mb_index].txfm_rd_diff, 0,
-             sizeof(x->mb_context[xd->mb_index].txfm_rd_diff));
-    } else {
-      // FIXME(rbultje) support transform-size selection
-      mbmi->mode = I8X8_PRED;
-      mbmi->txfm_size = (cm->txfm_mode == ONLY_4X4) ? TX_4X4 : TX_8X8;
-      set_i8x8_block_modes(x, mode8x8);
-      rate = rate8x8 + rateuv;
-      dist = dist8x8 + (distuv >> 2);
-      memset(x->mb_context[xd->mb_index].txfm_rd_diff, 0,
-             sizeof(x->mb_context[xd->mb_index].txfm_rd_diff));
-    }
-    if (cpi->common.mb_no_coeff_skip)
-      rate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
-  }
-
-  *returnrate = rate;
-  *returndist = dist;
-}
-
-#if CONFIG_SUPERBLOCKS
-int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
-                                  int recon_yoffset, int recon_uvoffset,
-                                  int *returnrate, int *returndistortion) {
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
-  MB_PREDICTION_MODE this_mode;
-  MV_REFERENCE_FRAME ref_frame;
-  unsigned char segment_id = xd->mode_info_context->mbmi.segment_id;
-  int comp_pred;
-  int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
-  int_mv frame_best_ref_mv[4];
-  int frame_mdcounts[4][4];
-  unsigned char *y_buffer[4];
-  unsigned char *u_buffer[4];
-  unsigned char *v_buffer[4];
-  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
-                                    VP9_ALT_FLAG };
-  int idx_list[4] = { 0, cpi->common.lst_fb_idx, cpi->common.gld_fb_idx,
-                      cpi->common.alt_fb_idx };
-  int mdcounts[4];
-  int near_sadidx[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
-  int saddone = 0;
-  int64_t best_rd = INT64_MAX;
-  int64_t best_comp_rd = INT64_MAX;
-  int64_t best_single_rd = INT64_MAX;
-  int64_t best_hybrid_rd = INT64_MAX;
-  int64_t best_yrd = INT64_MAX;
-  MB_MODE_INFO best_mbmode;
-  int mode_index, best_mode_index;
-  unsigned int ref_costs[MAX_REF_FRAMES];
-
-  x->skip = 0;
-  xd->mode_info_context->mbmi.segment_id = segment_id;
-  estimate_ref_frame_costs(cpi, segment_id, ref_costs);
-  vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
-
-  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
-    if (cpi->ref_frame_flags & flag_list[ref_frame]) {
-      setup_buffer_inter(cpi, x, idx_list[ref_frame], ref_frame,
-                         recon_yoffset, recon_uvoffset, frame_mv[NEARESTMV],
-                         frame_mv[NEARMV], frame_best_ref_mv,
-                         frame_mdcounts, y_buffer, u_buffer, v_buffer);
-    }
-    frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
-    frame_mv[ZEROMV][ref_frame].as_int = 0;
-  }
-
-  for (mode_index = 0; mode_index < MAX_MODES; mode_index++) {
-    int mode_excluded;
-    int64_t this_rd = INT64_MAX;
-    int disable_skip = 0;
-    int other_cost = 0;
-    int compmode_cost = 0;
-    int rate2 = 0, rate_y = 0, rate_uv = 0;
-    int distortion2 = 0, distortion_y = 0, distortion_uv = 0;
-    int skippable;
-    int64_t txfm_cache[NB_TXFM_MODES];
-
-    // Test best rd so far against threshold for trying this mode.
-    if (best_rd <= cpi->rd_threshes[mode_index]) {
-      continue;
-    }
-
-    this_mode = vp9_mode_order[mode_index].mode;
-    ref_frame = vp9_mode_order[mode_index].ref_frame;
-    mbmi->ref_frame = ref_frame;
-    comp_pred = vp9_mode_order[mode_index].second_ref_frame != INTRA_FRAME;
-    mbmi->mode = this_mode;
-    mbmi->uv_mode = DC_PRED;
-#if CONFIG_COMP_INTRA_PRED
-    mbmi->second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-    mbmi->second_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
-
-    if (!(cpi->ref_frame_flags & flag_list[ref_frame]))
-      continue;
-
-    // Intra coding and SPLITMV are not yet supported for superblocks
-    // TODO(rbultje): support intra coding
-    if (ref_frame == INTRA_FRAME || this_mode == SPLITMV)
-      continue;
-
-    if (comp_pred) {
-      int second_ref;
-
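-      // Pair the primary reference with the next one in the reference
-      // list, wrapping ALTREF back around to LAST.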
-      if (ref_frame == ALTREF_FRAME) {
-        second_ref = LAST_FRAME;
-      } else {
-        second_ref = ref_frame + 1;
-      }
-      if (!(cpi->ref_frame_flags & flag_list[second_ref]))
-        continue;
-      mbmi->second_ref_frame = second_ref;
-
-      xd->second_pre.y_buffer = y_buffer[second_ref];
-      xd->second_pre.u_buffer = u_buffer[second_ref];
-      xd->second_pre.v_buffer = v_buffer[second_ref];
-      mode_excluded = cm->comp_pred_mode == SINGLE_PREDICTION_ONLY;
-    } else {
-      mbmi->second_ref_frame = INTRA_FRAME;
-      mode_excluded = cm->comp_pred_mode == COMP_PREDICTION_ONLY;
-    }
-
-    xd->pre.y_buffer = y_buffer[ref_frame];
-    xd->pre.u_buffer = u_buffer[ref_frame];
-    xd->pre.v_buffer = v_buffer[ref_frame];
-    vpx_memcpy(mdcounts, frame_mdcounts[ref_frame], sizeof(mdcounts));
-
-    // If the segment reference frame feature is enabled,
-    // then do nothing if the current ref frame is not allowed.
-    if (vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
-        !vp9_check_segref(xd, segment_id, ref_frame)) {
-      continue;
-    // If the segment mode feature is enabled,
-    // then do nothing if the current mode is not allowed.
-    } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE) &&
-               (this_mode != vp9_get_segdata(xd, segment_id, SEG_LVL_MODE))) {
-      continue;
-    // Disable this drop-out case if either the mode or ref frame
-    // segment level feature is enabled for this segment. This is to
-    // prevent the possibility that we end up unable to pick any mode.
-    } else if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
-               !vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
-      // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
-      // unless ARNR filtering is enabled, in which case we want
-      // an unfiltered alternative.
-      if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
-        if (this_mode != ZEROMV || ref_frame != ALTREF_FRAME) {
-          continue;
-        }
-      }
-    }
-
-    this_rd = handle_inter_mode(cpi, x, BLOCK_32X32,
-                                &saddone, near_sadidx, mdcounts, txfm_cache,
-                                &rate2, &distortion2, &skippable,
-                                &compmode_cost, &rate_y, &distortion_y,
-                                &rate_uv, &distortion_uv,
-                                &mode_excluded, &disable_skip, recon_yoffset,
-                                mode_index, frame_mv, frame_best_ref_mv);
-    if (this_rd == INT64_MAX)
-      continue;
-
-    if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
-      rate2 += compmode_cost;
-    }
-
-    // Estimate the reference frame signaling cost and add it
-    // to the rolling cost variable.
-    rate2 += ref_costs[xd->mode_info_context->mbmi.ref_frame];
-
-    if (!disable_skip) {
-      // Test for the condition where the skip block will be activated
-      // because there are no non-zero coefficients and make any
-      // necessary adjustment for rate. Ignore if skip is coded at
-      // the segment level as the cost won't have been added in.
-      if (cpi->common.mb_no_coeff_skip) {
-        int mb_skip_allowed;
-
-        // Is MB-level skip allowed for this MB?
-        mb_skip_allowed =
-          !vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
-          vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-
-        if (skippable) {
-          // Back out the coefficient coding costs
-          rate2 -= (rate_y + rate_uv);
-          // for best_yrd calculation
-          rate_uv = 0;
-
-          if (mb_skip_allowed) {
-            int prob_skip_cost;
-
-            // Cost the skip mb case
-            vp9_prob skip_prob =
-              vp9_get_pred_prob(cm, xd, PRED_MBSKIP);
-
-            if (skip_prob) {
-              prob_skip_cost = vp9_cost_bit(skip_prob, 1);
-              rate2 += prob_skip_cost;
-              other_cost += prob_skip_cost;
-            }
-          }
-        }
-        // Add in the cost of the no skip flag.
-        else if (mb_skip_allowed) {
-          int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob(cm, xd,
-                                                          PRED_MBSKIP), 0);
-          rate2 += prob_skip_cost;
-          other_cost += prob_skip_cost;
-        }
-      }
-
-      // Calculate the final RD estimate for this mode.
-      this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
-    }
-
-#if 0
-    // Keep record of best intra distortion
-    if ((xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) &&
-        (this_rd < best_intra_rd)) {
-      best_intra_rd = this_rd;
-      *returnintra = distortion2;
-    }
-#endif
-
-    if (!disable_skip && xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
-      if (this_rd < best_comp_rd)
-        best_comp_rd = this_rd;
-      if (this_rd < best_single_rd)
-        best_single_rd = this_rd;
-      if (this_rd < best_hybrid_rd)
-        best_hybrid_rd = this_rd;
-    }
-
-    // Did this mode help, i.e. is it the new best mode?
-    if (this_rd < best_rd || x->skip) {
-      if (!mode_excluded) {
-        // Note index of best mode so far
-        best_mode_index = mode_index;
-
-#if 0
-        if (this_mode <= B_PRED) {
-          xd->mode_info_context->mbmi.uv_mode = uv_intra_mode_8x8;
-          /* required for left and above block mv */
-          xd->mode_info_context->mbmi.mv.as_int = 0;
-        }
-#endif
-
-        other_cost += ref_costs[xd->mode_info_context->mbmi.ref_frame];
-
-        /* Calculate the final y RD estimate for this mode */
-        best_yrd = RDCOST(x->rdmult, x->rddiv, (rate2 - rate_uv - other_cost),
-                          (distortion2 - distortion_uv));
-
-        *returnrate = rate2;
-        *returndistortion = distortion2;
-        best_rd = this_rd;
-        vpx_memcpy(&best_mbmode, mbmi, sizeof(MB_MODE_INFO));
-      }
-#if 0
-      // Testing this mode gave rise to an improvement in best error score. Lower threshold a bit for next time
-      cpi->rd_thresh_mult[mode_index] = (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ? cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
-      cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
-#endif
-    }
-    // If the mode did not help improve the best error case then raise
-    // the threshold for testing that mode next time around.
-    else {
-#if 0
-      cpi->rd_thresh_mult[mode_index] += 4;
-
-      if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
-        cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
-
-      cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
-#endif
-    }
-
-    /* keep record of best compound/single-only prediction */
-    if (!disable_skip && mbmi->ref_frame != INTRA_FRAME) {
-      int single_rd, hybrid_rd, single_rate, hybrid_rate;
-
-      if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
-        single_rate = rate2 - compmode_cost;
-        hybrid_rate = rate2;
-      } else {
-        single_rate = rate2;
-        hybrid_rate = rate2 + compmode_cost;
-      }
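-      // single_rate/hybrid_rate estimate what rate2 would be under
-      // single-only vs. hybrid prediction signalling, so the per-frame
-      // prediction-mode choice can be evaluated afterwards.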
-
-      single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
-      hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
-
-      if (mbmi->second_ref_frame == INTRA_FRAME && single_rd < best_single_rd) {
-        best_single_rd = single_rd;
-      } else if (mbmi->second_ref_frame != INTRA_FRAME &&
-                 single_rd < best_comp_rd) {
-        best_comp_rd = single_rd;
-      }
-      if (hybrid_rd < best_hybrid_rd) {
-        best_hybrid_rd = hybrid_rd;
-      }
-    }
-
-    if (x->skip && !mode_excluded)
-      break;
-  }
-
-  // TODO(rbultje) integrate with RD thresholding
-#if 0
-  // Reduce the activation RD thresholds for the best choice mode
-  if ((cpi->rd_baseline_thresh[best_mode_index] > 0) &&
-      (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2))) {
-    int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 2);
-
-    cpi->rd_thresh_mult[best_mode_index] =
-      (cpi->rd_thresh_mult[best_mode_index] >= (MIN_THRESHMULT + best_adjustment)) ?
-      cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT;
-    cpi->rd_threshes[best_mode_index] =
-      (cpi->rd_baseline_thresh[best_mode_index] >> 7) * cpi->rd_thresh_mult[best_mode_index];
-  }
-#endif
-
-  // This code forces Altref,0,0 and skip for the frame that overlays
-  // an altref unless Altref is filtered. However, this is unsafe if
-  // segment level coding of ref frame or mode is enabled for this
-  // segment.
-  if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
-      !vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE) &&
-      cpi->is_src_frame_alt_ref &&
-      (cpi->oxcf.arnr_max_frames == 0) &&
-      (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) {
-    mbmi->mode = ZEROMV;
-    mbmi->ref_frame = ALTREF_FRAME;
-    mbmi->second_ref_frame = 0;
-    mbmi->mv[0].as_int = 0;
-    mbmi->uv_mode = DC_PRED;
-    mbmi->mb_skip_coeff = (cpi->common.mb_no_coeff_skip) ? 1 : 0;
-    mbmi->partitioning = 0;
-    mbmi->txfm_size = TX_8X8;
-
-    if (best_rd != INT64_MAX)
-      store_coding_context(x, &x->sb_context[0], best_mode_index, NULL,
-                           &frame_best_ref_mv[mbmi->ref_frame],
-                           &frame_best_ref_mv[mbmi->second_ref_frame],
-                           0, 0, 0, NULL);
-    return best_rd;
-  }
-
-  // macroblock modes
-  vpx_memcpy(mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
-  mbmi->txfm_size = TX_8X8;
-
-  if (best_rd != INT64_MAX)
-    store_coding_context(x, &x->sb_context[0], best_mode_index, NULL,
-                         &frame_best_ref_mv[mbmi->ref_frame],
-                         &frame_best_ref_mv[mbmi->second_ref_frame],
-                         (best_single_rd == INT64_MAX) ? INT_MIN :
-                                        (best_rd - best_single_rd),
-                         (best_comp_rd   == INT64_MAX) ? INT_MIN :
-                                        (best_rd - best_comp_rd),
-                         (best_hybrid_rd == INT64_MAX) ? INT_MIN :
-                                        (best_rd - best_hybrid_rd),
-                         NULL);
-
-  return best_rd;
-}
-#endif
-
-void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
-                                    int recon_yoffset,
-                                    int recon_uvoffset,
-                                    int *totalrate, int *totaldist) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi;
-  int rate, distortion;
-  int64_t intra_error = 0;
-  unsigned char *segment_id = &mbmi->segment_id;
-
-  if (xd->segmentation_enabled)
-    x->encode_breakout = cpi->segment_encode_breakout[*segment_id];
-  else
-    x->encode_breakout = cpi->oxcf.encode_breakout;
-
-  // if (cpi->sf.RD)
-  // For now this codebase is limited to a single rd encode path
-  {
-    int zbin_mode_boost_enabled = cpi->zbin_mode_boost_enabled;
-
-    vp9_rd_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate,
-                           &distortion, &intra_error);
-
-    /* restore cpi->zbin_mode_boost_enabled */
-    cpi->zbin_mode_boost_enabled = zbin_mode_boost_enabled;
-  }
-  // else
-  // The non rd encode path has been deleted from this code base
-  // to simplify development
-  //    vp9_pick_inter_mode
-
-  // Store metrics so they can be added into the totals if this mode is picked
-  x->mb_context[xd->mb_index].distortion  = distortion;
-  x->mb_context[xd->mb_index].intra_error = intra_error;
-
-  *totalrate = rate;
-  *totaldist = distortion;
-}
--- a/vp8/encoder/rdopt.h
+++ /dev/null
@@ -1,41 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_RDOPT_H
-#define __INC_RDOPT_H
-
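-// RDCOST folds rate (bits) and distortion into one metric:
-// cost = ((R * RM + 128) >> 8) + DM * D, a Q8 rate multiplier with
-// rounding. For example, RM = 128, DM = 1, R = 100, D = 50 gives
-// ((128 + 100 * 128) >> 8) + 1 * 50 = 50 + 50 = 100.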
-#define RDCOST(RM, DM, R, D) (((128 + ((int64_t)(R)) * (RM)) >> 8) + ((int64_t)(DM)) * (D))
-#define RDCOST_8x8(RM, DM, R, D) (((128 + ((int64_t)(R)) * (RM)) >> 8) + ((int64_t)(DM)) * (D))
-
-extern void vp9_initialize_rd_consts(VP9_COMP *cpi, int Qvalue);
-
-extern void vp9_rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                                   int recon_yoffset, int recon_uvoffset,
-                                   int *returnrate, int *returndistortion,
-                                   int64_t *returnintra);
-
-extern void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                                   int *r, int *d);
-
-extern void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
-                                      int *r, int *d);
-
-extern void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCKD *xd,
-                        const MODE_INFO *here, int_mv *mvp,
-                        int refframe, int *ref_frame_sign_bias,
-                        int *sr, int near_sadidx[]);
-
-extern void vp9_init_me_luts();
-
-extern void vp9_set_mbmode_and_mvs(MACROBLOCK *x,
-                                   MB_PREDICTION_MODE mb, int_mv *mv);
-
-#endif
--- a/vp8/encoder/sad_c.c
+++ /dev/null
@@ -1,480 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <stdlib.h>
-#include "vp8/common/sadmxn.h"
-#include "vpx_ports/config.h"
-#include "vpx/vpx_integer.h"
-
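-/* Note: the max_sad argument is unused in these C reference versions;
-   it matches the prototype of optimized variants, which may stop early
-   once the running SAD exceeds max_sad. */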
-unsigned int vp9_sad32x32_c(const unsigned char *src_ptr,
-                            int  src_stride,
-                            const unsigned char *ref_ptr,
-                            int  ref_stride,
-                            int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 32);
-}
-
-unsigned int vp9_sad16x16_c(const unsigned char *src_ptr,
-                            int  src_stride,
-                            const unsigned char *ref_ptr,
-                            int  ref_stride,
-                            int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16);
-}
-
-unsigned int vp9_sad8x8_c(const unsigned char *src_ptr,
-                          int  src_stride,
-                          const unsigned char *ref_ptr,
-                          int  ref_stride,
-                          int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 8);
-}
-
-
-unsigned int vp9_sad16x8_c(const unsigned char *src_ptr,
-                           int  src_stride,
-                           const unsigned char *ref_ptr,
-                           int  ref_stride,
-                           int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 8);
-}
-
-unsigned int vp9_sad8x16_c(const unsigned char *src_ptr,
-                           int  src_stride,
-                           const unsigned char *ref_ptr,
-                           int  ref_stride,
-                           int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 16);
-}
-
-
-unsigned int vp9_sad4x4_c(const unsigned char *src_ptr,
-                          int  src_stride,
-                          const unsigned char *ref_ptr,
-                          int  ref_stride,
-                          int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 4);
-}
-
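-/* The x3/x8 variants return the SAD at 3 or 8 consecutive horizontal
-   offsets of the reference pointer, letting the motion search score
-   neighbouring candidates in a single call. */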
-void vp9_sad32x32x3_c(const unsigned char *src_ptr,
-                      int  src_stride,
-                      const unsigned char *ref_ptr,
-                      int  ref_stride,
-                      unsigned int *sad_array) {
-  sad_array[0] = vp9_sad32x32_c(src_ptr, src_stride,
-                                ref_ptr, ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad32x32_c(src_ptr, src_stride,
-                                ref_ptr + 1, ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad32x32_c(src_ptr, src_stride,
-                                ref_ptr + 2, ref_stride, 0x7fffffff);
-}
-
-void vp9_sad32x32x8_c(const unsigned char *src_ptr,
-                      int  src_stride,
-                      const unsigned char *ref_ptr,
-                      int  ref_stride,
-                      unsigned short *sad_array) {
-  sad_array[0] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
-                                                ref_ptr, ref_stride,
-                                                0x7fffffff);
-  sad_array[1] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
-                                                ref_ptr + 1, ref_stride,
-                                                0x7fffffff);
-  sad_array[2] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
-                                                ref_ptr + 2, ref_stride,
-                                                0x7fffffff);
-  sad_array[3] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
-                                                ref_ptr + 3, ref_stride,
-                                                0x7fffffff);
-  sad_array[4] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
-                                                ref_ptr + 4, ref_stride,
-                                                0x7fffffff);
-  sad_array[5] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
-                                                ref_ptr + 5, ref_stride,
-                                                0x7fffffff);
-  sad_array[6] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
-                                                ref_ptr + 6, ref_stride,
-                                                0x7fffffff);
-  sad_array[7] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
-                                                ref_ptr + 7, ref_stride,
-                                                0x7fffffff);
-}
-
-void vp9_sad16x16x3_c(const unsigned char *src_ptr,
-                      int  src_stride,
-                      const unsigned char *ref_ptr,
-                      int  ref_stride,
-                      unsigned int *sad_array) {
-  sad_array[0] = vp9_sad16x16_c(src_ptr, src_stride,
-                                ref_ptr, ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad16x16_c(src_ptr, src_stride,
-                                ref_ptr + 1, ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad16x16_c(src_ptr, src_stride,
-                                ref_ptr + 2, ref_stride, 0x7fffffff);
-}
-
-void vp9_sad16x16x8_c(const unsigned char *src_ptr,
-                      int  src_stride,
-                      const unsigned char *ref_ptr,
-                      int  ref_stride,
-                      unsigned short *sad_array) {
-  sad_array[0] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
-                                                ref_ptr, ref_stride,
-                                                0x7fffffff);
-  sad_array[1] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
-                                                ref_ptr + 1, ref_stride,
-                                                0x7fffffff);
-  sad_array[2] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
-                                                ref_ptr + 2, ref_stride,
-                                                0x7fffffff);
-  sad_array[3] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
-                                                ref_ptr + 3, ref_stride,
-                                                0x7fffffff);
-  sad_array[4] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
-                                                ref_ptr + 4, ref_stride,
-                                                0x7fffffff);
-  sad_array[5] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
-                                                ref_ptr + 5, ref_stride,
-                                                0x7fffffff);
-  sad_array[6] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
-                                                ref_ptr + 6, ref_stride,
-                                                0x7fffffff);
-  sad_array[7] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
-                                                ref_ptr + 7, ref_stride,
-                                                0x7fffffff);
-}
-
-void vp9_sad16x8x3_c(const unsigned char *src_ptr,
-                     int  src_stride,
-                     const unsigned char *ref_ptr,
-                     int  ref_stride,
-                     unsigned int *sad_array) {
-  sad_array[0] = vp9_sad16x8_c(src_ptr, src_stride,
-                               ref_ptr, ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad16x8_c(src_ptr, src_stride,
-                               ref_ptr + 1, ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad16x8_c(src_ptr, src_stride,
-                               ref_ptr + 2, ref_stride, 0x7fffffff);
-}
-
-void vp9_sad16x8x8_c(const unsigned char *src_ptr,
-                     int  src_stride,
-                     const unsigned char *ref_ptr,
-                     int  ref_stride,
-                     unsigned short *sad_array) {
-  sad_array[0] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
-                                               ref_ptr, ref_stride,
-                                               0x7fffffff);
-  sad_array[1] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
-                                               ref_ptr + 1, ref_stride,
-                                               0x7fffffff);
-  sad_array[2] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
-                                               ref_ptr + 2, ref_stride,
-                                               0x7fffffff);
-  sad_array[3] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
-                                               ref_ptr + 3, ref_stride,
-                                               0x7fffffff);
-  sad_array[4] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
-                                               ref_ptr + 4, ref_stride,
-                                               0x7fffffff);
-  sad_array[5] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
-                                               ref_ptr + 5, ref_stride,
-                                               0x7fffffff);
-  sad_array[6] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
-                                               ref_ptr + 6, ref_stride,
-                                               0x7fffffff);
-  sad_array[7] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
-                                               ref_ptr + 7, ref_stride,
-                                               0x7fffffff);
-}
-
-void vp9_sad8x8x3_c(const unsigned char *src_ptr,
-                    int  src_stride,
-                    const unsigned char *ref_ptr,
-                    int  ref_stride,
-                    unsigned int *sad_array) {
-  sad_array[0] = vp9_sad8x8_c(src_ptr, src_stride,
-                              ref_ptr, ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad8x8_c(src_ptr, src_stride,
-                              ref_ptr + 1, ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad8x8_c(src_ptr, src_stride,
-                              ref_ptr + 2, ref_stride, 0x7fffffff);
-}
-
-void vp9_sad8x8x8_c(const unsigned char *src_ptr,
-                    int  src_stride,
-                    const unsigned char *ref_ptr,
-                    int  ref_stride,
-                    unsigned short *sad_array) {
-  sad_array[0] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
-                                              ref_ptr, ref_stride,
-                                              0x7fffffff);
-  sad_array[1] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
-                                              ref_ptr + 1, ref_stride,
-                                              0x7fffffff);
-  sad_array[2] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
-                                              ref_ptr + 2, ref_stride,
-                                              0x7fffffff);
-  sad_array[3] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
-                                              ref_ptr + 3, ref_stride,
-                                              0x7fffffff);
-  sad_array[4] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
-                                              ref_ptr + 4, ref_stride,
-                                              0x7fffffff);
-  sad_array[5] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
-                                              ref_ptr + 5, ref_stride,
-                                              0x7fffffff);
-  sad_array[6] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
-                                              ref_ptr + 6, ref_stride,
-                                              0x7fffffff);
-  sad_array[7] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
-                                              ref_ptr + 7, ref_stride,
-                                              0x7fffffff);
-}
-
-void vp9_sad8x16x3_c(const unsigned char *src_ptr,
-                     int  src_stride,
-                     const unsigned char *ref_ptr,
-                     int  ref_stride,
-                     unsigned int *sad_array) {
-  sad_array[0] = vp9_sad8x16_c(src_ptr, src_stride,
-                               ref_ptr, ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad8x16_c(src_ptr, src_stride,
-                               ref_ptr + 1, ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad8x16_c(src_ptr, src_stride,
-                               ref_ptr + 2, ref_stride, 0x7fffffff);
-}
-
-void vp9_sad8x16x8_c(const unsigned char *src_ptr,
-                     int  src_stride,
-                     const unsigned char *ref_ptr,
-                     int  ref_stride,
-                     unsigned short *sad_array) {
-  sad_array[0] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
-                                               ref_ptr, ref_stride,
-                                               0x7fffffff);
-  sad_array[1] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
-                                               ref_ptr + 1, ref_stride,
-                                               0x7fffffff);
-  sad_array[2] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
-                                               ref_ptr + 2, ref_stride,
-                                               0x7fffffff);
-  sad_array[3] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
-                                               ref_ptr + 3, ref_stride,
-                                               0x7fffffff);
-  sad_array[4] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
-                                               ref_ptr + 4, ref_stride,
-                                               0x7fffffff);
-  sad_array[5] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
-                                               ref_ptr + 5, ref_stride,
-                                               0x7fffffff);
-  sad_array[6] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
-                                               ref_ptr + 6, ref_stride,
-                                               0x7fffffff);
-  sad_array[7] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
-                                               ref_ptr + 7, ref_stride,
-                                               0x7fffffff);
-}
-
-void vp9_sad4x4x3_c(const unsigned char *src_ptr,
-                    int  src_stride,
-                    const unsigned char *ref_ptr,
-                    int  ref_stride,
-                    unsigned int *sad_array) {
-  sad_array[0] = vp9_sad4x4_c(src_ptr, src_stride,
-                              ref_ptr, ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad4x4_c(src_ptr, src_stride,
-                              ref_ptr + 1, ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad4x4_c(src_ptr, src_stride,
-                              ref_ptr + 2, ref_stride, 0x7fffffff);
-}
-
-void vp9_sad4x4x8_c(const unsigned char *src_ptr,
-                    int  src_stride,
-                    const unsigned char *ref_ptr,
-                    int  ref_stride,
-                    unsigned short *sad_array) {
-  sad_array[0] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
-                                              ref_ptr, ref_stride,
-                                              0x7fffffff);
-  sad_array[1] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
-                                              ref_ptr + 1, ref_stride,
-                                              0x7fffffff);
-  sad_array[2] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
-                                              ref_ptr + 2, ref_stride,
-                                              0x7fffffff);
-  sad_array[3] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
-                                              ref_ptr + 3, ref_stride,
-                                              0x7fffffff);
-  sad_array[4] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
-                                              ref_ptr + 4, ref_stride,
-                                              0x7fffffff);
-  sad_array[5] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
-                                              ref_ptr + 5, ref_stride,
-                                              0x7fffffff);
-  sad_array[6] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
-                                              ref_ptr + 6, ref_stride,
-                                              0x7fffffff);
-  sad_array[7] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
-                                              ref_ptr + 7, ref_stride,
-                                              0x7fffffff);
-}
-
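-/* The x4d variants take four independent reference pointers instead of
-   consecutive offsets, so four arbitrary candidates can be scored in
-   one call. */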
-void vp9_sad32x32x4d_c(const unsigned char *src_ptr,
-                       int  src_stride,
-                       unsigned char *ref_ptr[],
-                       int  ref_stride,
-                       unsigned int *sad_array) {
-  sad_array[0] = vp9_sad32x32_c(src_ptr, src_stride,
-                                ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad32x32_c(src_ptr, src_stride,
-                                ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad32x32_c(src_ptr, src_stride,
-                                ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad32x32_c(src_ptr, src_stride,
-                                ref_ptr[3], ref_stride, 0x7fffffff);
-}
-
-void vp9_sad16x16x4d_c(const unsigned char *src_ptr,
-                       int  src_stride,
-                       unsigned char *ref_ptr[],
-                       int  ref_stride,
-                       unsigned int *sad_array) {
-  sad_array[0] = vp9_sad16x16_c(src_ptr, src_stride,
-                                ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad16x16_c(src_ptr, src_stride,
-                                ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad16x16_c(src_ptr, src_stride,
-                                ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad16x16_c(src_ptr, src_stride,
-                                ref_ptr[3], ref_stride, 0x7fffffff);
-}
-
-void vp9_sad16x8x4d_c(const unsigned char *src_ptr,
-                      int  src_stride,
-                      unsigned char *ref_ptr[],
-                      int  ref_stride,
-                      unsigned int *sad_array) {
-  sad_array[0] = vp9_sad16x8_c(src_ptr, src_stride,
-                               ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad16x8_c(src_ptr, src_stride,
-                               ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad16x8_c(src_ptr, src_stride,
-                               ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad16x8_c(src_ptr, src_stride,
-                               ref_ptr[3], ref_stride, 0x7fffffff);
-}
-
-void vp9_sad8x8x4d_c(const unsigned char *src_ptr,
-                     int  src_stride,
-                     unsigned char *ref_ptr[],
-                     int  ref_stride,
-                     unsigned int *sad_array) {
-  sad_array[0] = vp9_sad8x8_c(src_ptr, src_stride,
-                              ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad8x8_c(src_ptr, src_stride,
-                              ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad8x8_c(src_ptr, src_stride,
-                              ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad8x8_c(src_ptr, src_stride,
-                              ref_ptr[3], ref_stride, 0x7fffffff);
-}
-
-void vp9_sad8x16x4d_c(const unsigned char *src_ptr,
-                      int  src_stride,
-                      unsigned char *ref_ptr[],
-                      int  ref_stride,
-                      unsigned int *sad_array) {
-  sad_array[0] = vp9_sad8x16_c(src_ptr, src_stride,
-                               ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad8x16_c(src_ptr, src_stride,
-                               ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad8x16_c(src_ptr, src_stride,
-                               ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad8x16_c(src_ptr, src_stride,
-                               ref_ptr[3], ref_stride, 0x7fffffff);
-}
-
-void vp9_sad4x4x4d_c(const unsigned char *src_ptr,
-                     int  src_stride,
-                     unsigned char *ref_ptr[],
-                     int  ref_stride,
-                     unsigned int *sad_array) {
-  sad_array[0] = vp9_sad4x4_c(src_ptr, src_stride,
-                              ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad4x4_c(src_ptr, src_stride,
-                              ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad4x4_c(src_ptr, src_stride,
-                              ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad4x4_c(src_ptr, src_stride,
-                              ref_ptr[3], ref_stride, 0x7fffffff);
-}
-
-/* Copy a 32-wide, n-high block (two macroblocks side by side) to a buffer */
-void vp9_copy32xn_c(unsigned char *src_ptr,
-                    int  src_stride,
-                    unsigned char *dst_ptr,
-                    int  dst_stride,
-                    int height) {
-  int r;
-
-  for (r = 0; r < height; r++) {
-#if !(CONFIG_FAST_UNALIGNED)
-    dst_ptr[0] = src_ptr[0];
-    dst_ptr[1] = src_ptr[1];
-    dst_ptr[2] = src_ptr[2];
-    dst_ptr[3] = src_ptr[3];
-    dst_ptr[4] = src_ptr[4];
-    dst_ptr[5] = src_ptr[5];
-    dst_ptr[6] = src_ptr[6];
-    dst_ptr[7] = src_ptr[7];
-    dst_ptr[8] = src_ptr[8];
-    dst_ptr[9] = src_ptr[9];
-    dst_ptr[10] = src_ptr[10];
-    dst_ptr[11] = src_ptr[11];
-    dst_ptr[12] = src_ptr[12];
-    dst_ptr[13] = src_ptr[13];
-    dst_ptr[14] = src_ptr[14];
-    dst_ptr[15] = src_ptr[15];
-    dst_ptr[16] = src_ptr[16];
-    dst_ptr[17] = src_ptr[17];
-    dst_ptr[18] = src_ptr[18];
-    dst_ptr[19] = src_ptr[19];
-    dst_ptr[20] = src_ptr[20];
-    dst_ptr[21] = src_ptr[21];
-    dst_ptr[22] = src_ptr[22];
-    dst_ptr[23] = src_ptr[23];
-    dst_ptr[24] = src_ptr[24];
-    dst_ptr[25] = src_ptr[25];
-    dst_ptr[26] = src_ptr[26];
-    dst_ptr[27] = src_ptr[27];
-    dst_ptr[28] = src_ptr[28];
-    dst_ptr[29] = src_ptr[29];
-    dst_ptr[30] = src_ptr[30];
-    dst_ptr[31] = src_ptr[31];
-#else
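-    /* With fast unaligned access, copy the 32-byte row as eight
-       32-bit words instead of byte by byte. */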
-    ((uint32_t *)dst_ptr)[0] = ((uint32_t *)src_ptr)[0];
-    ((uint32_t *)dst_ptr)[1] = ((uint32_t *)src_ptr)[1];
-    ((uint32_t *)dst_ptr)[2] = ((uint32_t *)src_ptr)[2];
-    ((uint32_t *)dst_ptr)[3] = ((uint32_t *)src_ptr)[3];
-    ((uint32_t *)dst_ptr)[4] = ((uint32_t *)src_ptr)[4];
-    ((uint32_t *)dst_ptr)[5] = ((uint32_t *)src_ptr)[5];
-    ((uint32_t *)dst_ptr)[6] = ((uint32_t *)src_ptr)[6];
-    ((uint32_t *)dst_ptr)[7] = ((uint32_t *)src_ptr)[7];
-#endif
-    src_ptr += src_stride;
-    dst_ptr += dst_stride;
-  }
-}
--- a/vp8/encoder/satd_c.c
+++ /dev/null
@@ -1,47 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stdlib.h>
-#include "vpx_ports/mem.h"
-#include "./vpx_rtcd.h"
-unsigned int vp9_satd16x16_c(const unsigned char *src_ptr,
-                             int  src_stride,
-                             const unsigned char *ref_ptr,
-                             int  ref_stride,
-                             unsigned int *psatd) {
-  int r, c, i;
-  unsigned int satd = 0;
-  DECLARE_ALIGNED(16, short, diff_in[256]);
-  DECLARE_ALIGNED(16, short, diff_out[16]);
-  short *in;
-
-  for (r = 0; r < 16; r++) {
-    for (c = 0; c < 16; c++) {
-      diff_in[r * 16 + c] = src_ptr[c] - ref_ptr[c];
-    }
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-  }
-
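-  // Walsh-Hadamard transform each 4x4 block of the difference and
-  // accumulate the absolute transform coefficients; each pass of four
-  // rows advances 4 * 16 = 64 entries through diff_in.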
-  in = diff_in;
-  for (r = 0; r < 16; r += 4) {
-    for (c = 0; c < 16; c += 4) {
-      vp9_short_walsh4x4_c(in + c, diff_out, 32);
-      for (i = 0; i < 16; i++)
-        satd += abs(diff_out[i]);
-    }
-    in += 64;
-  }
-
-  if (psatd)
-    *psatd = satd;
-
-  return satd;
-}
--- a/vp8/encoder/segmentation.c
+++ /dev/null
@@ -1,327 +1,0 @@
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "limits.h"
-#include "vpx_mem/vpx_mem.h"
-#include "segmentation.h"
-#include "vp8/common/pred_common.h"
-
-void vp9_update_gf_useage_maps(VP9_COMP *cpi, VP9_COMMON *cm, MACROBLOCK *x) {
-  int mb_row, mb_col;
-
-  MODE_INFO *this_mb_mode_info = cm->mi;
-
-  x->gf_active_ptr = (signed char *)cpi->gf_active_flags;
-
-  if ((cm->frame_type == KEY_FRAME) || (cm->refresh_golden_frame)) {
-    // Reset GF usage monitors
-    vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
-    cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
-  } else {
-    // for each macroblock row in image
-    for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
-      // for each macroblock col in image
-      for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
-
-        // If using golden or altref then set the GF active flag if it
-        // is not already set. If using last-frame 0,0 mode then leave
-        // the flag as it is; otherwise (non-0,0 motion or intra modes)
-        // clear the flag if it is currently set.
-        if ((this_mb_mode_info->mbmi.ref_frame == GOLDEN_FRAME) ||
-            (this_mb_mode_info->mbmi.ref_frame == ALTREF_FRAME)) {
-          if (*(x->gf_active_ptr) == 0) {
-            *(x->gf_active_ptr) = 1;
-            cpi->gf_active_count++;
-          }
-        } else if ((this_mb_mode_info->mbmi.mode != ZEROMV) &&
-                   *(x->gf_active_ptr)) {
-          *(x->gf_active_ptr) = 0;
-          cpi->gf_active_count--;
-        }
-
-        x->gf_active_ptr++;          // Step onto next entry
-        this_mb_mode_info++;         // skip to next mb
-
-      }
-
-      // this is to account for the border
-      this_mb_mode_info++;
-    }
-  }
-}
-
-void vp9_enable_segmentation(VP9_PTR ptr) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
-
-  // Set the appropriate feature bit
-  cpi->mb.e_mbd.segmentation_enabled = 1;
-  cpi->mb.e_mbd.update_mb_segmentation_map = 1;
-  cpi->mb.e_mbd.update_mb_segmentation_data = 1;
-}
-
-void vp9_disable_segmentation(VP9_PTR ptr) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
-
-  // Clear the appropriate feature bit
-  cpi->mb.e_mbd.segmentation_enabled = 0;
-}
-
-void vp9_set_segmentation_map(VP9_PTR ptr,
-                              unsigned char *segmentation_map) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
-
-  // Copy in the new segmentation map
-  vpx_memcpy(cpi->segmentation_map, segmentation_map,
-             (cpi->common.mb_rows * cpi->common.mb_cols));
-
-  // Signal that the map should be updated.
-  cpi->mb.e_mbd.update_mb_segmentation_map = 1;
-  cpi->mb.e_mbd.update_mb_segmentation_data = 1;
-}
-
-void vp9_set_segment_data(VP9_PTR ptr,
-                          signed char *feature_data,
-                          unsigned char abs_delta) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
-
-  cpi->mb.e_mbd.mb_segment_abs_delta = abs_delta;
-
-  vpx_memcpy(cpi->mb.e_mbd.segment_feature_data, feature_data,
-             sizeof(cpi->mb.e_mbd.segment_feature_data));
-
-  // TBD ?? Set the feature mask
-  // vpx_memcpy(cpi->mb.e_mbd.segment_feature_mask, 0,
-  //            sizeof(cpi->mb.e_mbd.segment_feature_mask));
-}
-
-// Based on set of segment counts calculate a probability tree
-static void calc_segtree_probs(MACROBLOCKD *xd,
-                               int *segcounts,
-                               vp9_prob *segment_tree_probs) {
-  int count1, count2;
-  int tot_count;
-  int i;
-
-  // Blank the structure to start with
-  vpx_memset(segment_tree_probs, 0,
-             MB_FEATURE_TREE_PROBS * sizeof(*segment_tree_probs));
-
-  // Total count for all segments
-  count1 = segcounts[0] + segcounts[1];
-  count2 = segcounts[2] + segcounts[3];
-  tot_count = count1 + count2;
-
-  // Work out probabilities of each segment
-  if (tot_count)
-    segment_tree_probs[0] = (count1 * 255) / tot_count;
-  if (count1 > 0)
-    segment_tree_probs[1] = (segcounts[0] * 255) / count1;
-  if (count2 > 0)
-    segment_tree_probs[2] = (segcounts[2] * 255) / count2;
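-  // Illustrative example: counts {10, 30, 40, 20} give count1 = 40 and
-  // count2 = 60, so probs[0] = 40 * 255 / 100 = 102,
-  // probs[1] = 10 * 255 / 40 = 63 and probs[2] = 40 * 255 / 60 = 170.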
-
-  // Clamp probabilities to minimum allowed value
-  for (i = 0; i < MB_FEATURE_TREE_PROBS; i++) {
-    if (segment_tree_probs[i] == 0)
-      segment_tree_probs[i] = 1;
-  }
-}
-
-// Based on set of segment counts and probabilities calculate a cost estimate
-static int cost_segmap(MACROBLOCKD *xd,
-                       int *segcounts,
-                       vp9_prob *probs) {
-  int cost;
-  int count1, count2;
-
-  // Cost the top node of the tree
-  count1 = segcounts[0] + segcounts[1];
-  count2 = segcounts[2] + segcounts[3];
-  cost = count1 * vp9_cost_zero(probs[0]) +
-         count2 * vp9_cost_one(probs[0]);
-
-  // Now add the cost of each individual segment branch
-  if (count1 > 0)
-    cost += segcounts[0] * vp9_cost_zero(probs[1]) +
-            segcounts[1] * vp9_cost_one(probs[1]);
-
-  if (count2 > 0)
-    cost += segcounts[2] * vp9_cost_zero(probs[2]) +
-            segcounts[3] * vp9_cost_one(probs[2]);
-
-  return cost;
-}
-
-void vp9_choose_segmap_coding_method(VP9_COMP *cpi) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-
-  const int mis = cm->mode_info_stride;
-  int i;
-  int tot_count;
-  int no_pred_cost;
-  int t_pred_cost = INT_MAX;
-  int pred_context;
-
-  int mb_row, mb_col;
-  int segmap_index = 0;
-  unsigned char segment_id;
-
-  int temporal_predictor_count[PREDICTION_PROBS][2];
-  int no_pred_segcounts[MAX_MB_SEGMENTS];
-  int t_unpred_seg_counts[MAX_MB_SEGMENTS];
-
-  vp9_prob no_pred_tree[MB_FEATURE_TREE_PROBS];
-  vp9_prob t_pred_tree[MB_FEATURE_TREE_PROBS];
-  vp9_prob t_nopred_prob[PREDICTION_PROBS];
-
-  // Set default state for the segment tree probabilities and the
-  // temporal coding probabilities
-  vpx_memset(xd->mb_segment_tree_probs, 255,
-             sizeof(xd->mb_segment_tree_probs));
-  vpx_memset(cm->segment_pred_probs, 255,
-             sizeof(cm->segment_pred_probs));
-
-  vpx_memset(no_pred_segcounts, 0, sizeof(no_pred_segcounts));
-  vpx_memset(t_unpred_seg_counts, 0, sizeof(t_unpred_seg_counts));
-  vpx_memset(temporal_predictor_count, 0, sizeof(temporal_predictor_count));
-
-  // First of all generate stats regarding how well the last segment map
-  // predicts this one
-
-  // Initialize macroblock decoder mode info context for the first mb
-  // in the frame
-  xd->mode_info_context = cm->mi;
-
-  for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 2) {
-    for (mb_col = 0; mb_col < cm->mb_cols; mb_col += 2) {
-      for (i = 0; i < 4; i++) {
-        static const int dx[4] = { +1, -1, +1, +1 };
-        static const int dy[4] = {  0, +1,  0, -1 };
-        int x_idx = i & 1, y_idx = i >> 1;
-
-        if (mb_col + x_idx >= cm->mb_cols ||
-            mb_row + y_idx >= cm->mb_rows) {
-          goto end;
-        }
-
-        xd->mb_to_top_edge = -((mb_row * 16) << 3);
-        xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
-        xd->mb_to_left_edge = -((mb_col * 16) << 3);
-        xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
-
-        segmap_index = (mb_row + y_idx) * cm->mb_cols + mb_col + x_idx;
-        segment_id = xd->mode_info_context->mbmi.segment_id;
-#if CONFIG_SUPERBLOCKS
-        if (xd->mode_info_context->mbmi.encoded_as_sb) {
-          if (mb_col + 1 < cm->mb_cols)
-            segment_id = segment_id &&
-                         xd->mode_info_context[1].mbmi.segment_id;
-          if (mb_row + 1 < cm->mb_rows) {
-            segment_id = segment_id &&
-                         xd->mode_info_context[mis].mbmi.segment_id;
-            if (mb_col + 1 < cm->mb_cols)
-              segment_id = segment_id &&
-                           xd->mode_info_context[mis + 1].mbmi.segment_id;
-          }
-        }
-#endif
-
-        // Count the number of hits on each segment with no prediction
-        no_pred_segcounts[segment_id]++;
-
-        // Temporal prediction not allowed on key frames
-        if (cm->frame_type != KEY_FRAME) {
-          // Test to see if the segment id matches the predicted value.
-          int seg_predicted =
-            (segment_id == vp9_get_pred_mb_segid(cm, xd, segmap_index));
-
-          // Get the segment id prediction context
-          pred_context =
-            vp9_get_pred_context(cm, xd, PRED_SEG_ID);
-
-          // Store the prediction status for this mb and update counts
-          // as appropriate
-          vp9_set_pred_flag(xd, PRED_SEG_ID, seg_predicted);
-          temporal_predictor_count[pred_context][seg_predicted]++;
-
-          if (!seg_predicted)
-            // Update the "unpredicted" segment count
-            t_unpred_seg_counts[segment_id]++;
-        }
-
-#if CONFIG_SUPERBLOCKS
-        if (xd->mode_info_context->mbmi.encoded_as_sb) {
-          assert(!i);
-          xd->mode_info_context += 2;
-          break;
-        }
-#endif
-      end:
-        xd->mode_info_context += dx[i] + dy[i] * cm->mode_info_stride;
-      }
-    }
-
-    // this is to account for the border in mode_info_context
-    xd->mode_info_context -= mb_col;
-    xd->mode_info_context += cm->mode_info_stride * 2;
-  }
-
-  // Work out probability tree for coding segments without prediction
-  // and the cost.
-  calc_segtree_probs(xd, no_pred_segcounts, no_pred_tree);
-  no_pred_cost = cost_segmap(xd, no_pred_segcounts, no_pred_tree);
-
-  // Key frames cannot use temporal prediction
-  if (cm->frame_type != KEY_FRAME) {
-    // Work out probability tree for coding those segments not
-    // predicted using the temporal method and the cost.
-    calc_segtree_probs(xd, t_unpred_seg_counts, t_pred_tree);
-    t_pred_cost = cost_segmap(xd, t_unpred_seg_counts, t_pred_tree);
-
-    // Add in the cost of the signalling for each prediction context
-    for (i = 0; i < PREDICTION_PROBS; i++) {
-      tot_count = temporal_predictor_count[i][0] +
-                  temporal_predictor_count[i][1];
-
-      // Work out the context probabilities for the segment
-      // prediction flag
-      if (tot_count) {
-        t_nopred_prob[i] = (temporal_predictor_count[i][0] * 255) /
-                           tot_count;
-
-        // Clamp to minimum allowed value
-        if (t_nopred_prob[i] < 1)
-          t_nopred_prob[i] = 1;
-      } else
-        t_nopred_prob[i] = 1;
-
-      // Add in the predictor signaling cost
-      t_pred_cost += (temporal_predictor_count[i][0] *
-                      vp9_cost_zero(t_nopred_prob[i])) +
-                     (temporal_predictor_count[i][1] *
-                      vp9_cost_one(t_nopred_prob[i]));
-    }
-  }
-
-  // Now choose which coding method to use.
-  if (t_pred_cost < no_pred_cost) {
-    cm->temporal_update = 1;
-    vpx_memcpy(xd->mb_segment_tree_probs,
-               t_pred_tree, sizeof(t_pred_tree));
-    vpx_memcpy(&cm->segment_pred_probs,
-               t_nopred_prob, sizeof(t_nopred_prob));
-  } else {
-    cm->temporal_update = 0;
-    vpx_memcpy(xd->mb_segment_tree_probs,
-               no_pred_tree, sizeof(no_pred_tree));
-  }
-}
--- a/vp8/encoder/segmentation.h
+++ /dev/null
@@ -1,46 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "string.h"
-#include "vp8/common/blockd.h"
-#include "onyx_int.h"
-
-#ifndef __INC_SEGMENTATION_H__
-#define __INC_SEGMENTATION_H__ 1
-
-extern void vp9_update_gf_useage_maps(VP9_COMP *cpi, VP9_COMMON *cm,
-                                      MACROBLOCK *x);
-
-extern void vp9_enable_segmentation(VP9_PTR ptr);
-extern void vp9_disable_segmentation(VP9_PTR ptr);
-
-// Valid values for a segment are 0 to 3
-// Segmentation map is arranged as [Rows][Columns]
-extern void vp9_set_segmentation_map(VP9_PTR ptr,
-                                     unsigned char *segmentation_map);
-
-// The values given for each segment can be either deltas (from the default
-// value chosen for the frame) or absolute values.
-//
-// Valid range for abs values is (0-127 for MB_LVL_ALT_Q), (0-63 for
-// SEGMENT_ALT_LF)
-// Valid range for delta values are (+/-127 for MB_LVL_ALT_Q), (+/-63 for
-// SEGMENT_ALT_LF)
-//
-// Set abs_delta to SEGMENT_DELTADATA to use deltas, or to SEGMENT_ABSDATA
-// to use the absolute values given.
-//
-extern void vp9_set_segment_data(VP9_PTR ptr, signed char *feature_data,
-                                 unsigned char abs_delta);
-
-extern void vp9_choose_segmap_coding_method(VP9_COMP *cpi);
-
-#endif /* __INC_SEGMENTATION_H__ */
--- a/vp8/encoder/ssim.c
+++ /dev/null
@@ -1,147 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "onyx_int.h"
-
-void vp9_ssim_parms_16x16_c(unsigned char *s, int sp, unsigned char *r,
-                            int rp, unsigned long *sum_s, unsigned long *sum_r,
-                            unsigned long *sum_sq_s, unsigned long *sum_sq_r,
-                            unsigned long *sum_sxr) {
-  int i, j;
-  for (i = 0; i < 16; i++, s += sp, r += rp) {
-    for (j = 0; j < 16; j++) {
-      *sum_s += s[j];
-      *sum_r += r[j];
-      *sum_sq_s += s[j] * s[j];
-      *sum_sq_r += r[j] * r[j];
-      *sum_sxr += s[j] * r[j];
-    }
-  }
-}
-void vp9_ssim_parms_8x8_c(unsigned char *s, int sp, unsigned char *r, int rp,
-                          unsigned long *sum_s, unsigned long *sum_r,
-                          unsigned long *sum_sq_s, unsigned long *sum_sq_r,
-                          unsigned long *sum_sxr) {
-  int i, j;
-  for (i = 0; i < 8; i++, s += sp, r += rp) {
-    for (j = 0; j < 8; j++) {
-      *sum_s += s[j];
-      *sum_r += r[j];
-      *sum_sq_s += s[j] * s[j];
-      *sum_sq_r += r[j] * r[j];
-      *sum_sxr += s[j] * r[j];
-    }
-  }
-}
-
-static const int64_t cc1 =  26634; // 64^2*(.01*255)^2
-static const int64_t cc2 = 239708; // 64^2*(.03*255)^2
-
-static double similarity(unsigned long sum_s, unsigned long sum_r,
-                         unsigned long sum_sq_s, unsigned long sum_sq_r,
-                         unsigned long sum_sxr, int count) {
-  int64_t ssim_n, ssim_d;
-  int64_t c1, c2;
-
-  // scale the constants by number of pixels
-  c1 = (cc1 * count * count) >> 12;
-  c2 = (cc2 * count * count) >> 12;
-
-  ssim_n = (2 * sum_s * sum_r + c1) * ((int64_t) 2 * count * sum_sxr -
-                                       (int64_t) 2 * sum_s * sum_r + c2);
-
-  ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) *
-           ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s +
-            (int64_t)count * sum_sq_r - (int64_t) sum_r * sum_r + c2);
-
-  return ssim_n * 1.0 / ssim_d;
-}
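The fixed-point expression above is the standard SSIM formula rewritten in raw sums, with the 1/n normalizations folded into the c1/c2 scaling. A floating-point sketch of the assumed-equivalent form over n pixels, for comparison:

#include <stdio.h>

static double ssim_float(double sum_s, double sum_r, double sum_sq_s,
                         double sum_sq_r, double sum_sxr, int n) {
  const double c1 = 6.5025, c2 = 58.5225;  /* (.01*255)^2, (.03*255)^2 */
  double mu_s = sum_s / n, mu_r = sum_r / n;
  double var_s = sum_sq_s / n - mu_s * mu_s;
  double var_r = sum_sq_r / n - mu_r * mu_r;
  double cov   = sum_sxr / n - mu_s * mu_r;
  return ((2 * mu_s * mu_r + c1) * (2 * cov + c2)) /
         ((mu_s * mu_s + mu_r * mu_r + c1) * (var_s + var_r + c2));
}

int main(void) {
  /* identical 4-pixel blocks (90, 100, 110, 100) -> SSIM == 1.0 */
  printf("%f\n", ssim_float(400, 400, 40200, 40200, 40200, 4));
  return 0;
}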
-
-static double ssim_16x16(unsigned char *s, int sp, unsigned char *r, int rp) {
-  unsigned long sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
-  vp9_ssim_parms_16x16(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
-                       &sum_sxr);
-  return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 256);
-}
-static double ssim_8x8(unsigned char *s, int sp, unsigned char *r, int rp) {
-  unsigned long sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
-  vp9_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
-                     &sum_sxr);
-  return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64);
-}
-
-// We use an 8x8 moving window, with each 8x8 window anchored on the 4x4
-// pixel grid. Such an arrangement allows the windows to overlap block
-// boundaries to penalize blocking artifacts.
-double vp9_ssim2(unsigned char *img1, unsigned char *img2, int stride_img1,
-                 int stride_img2, int width, int height) {
-  int i, j;
-  int samples = 0;
-  double ssim_total = 0;
-
-  // sample points start at each 4x4 location
-  for (i = 0; i < height - 8;
-       i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
-    for (j = 0; j < width - 8; j += 4) {
-      double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2);
-      ssim_total += v;
-      samples++;
-    }
-  }
-  ssim_total /= samples;
-  return ssim_total;
-}
-double vp9_calc_ssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
-                     int lumamask, double *weight) {
-  double a, b, c;
-  double ssimv;
-
-  a = vp9_ssim2(source->y_buffer, dest->y_buffer,
-                source->y_stride, dest->y_stride, source->y_width,
-                source->y_height);
-
-  b = vp9_ssim2(source->u_buffer, dest->u_buffer,
-                source->uv_stride, dest->uv_stride, source->uv_width,
-                source->uv_height);
-
-  c = vp9_ssim2(source->v_buffer, dest->v_buffer,
-                source->uv_stride, dest->uv_stride, source->uv_width,
-                source->uv_height);
-
-  ssimv = a * .8 + .1 * (b + c);
-
-  *weight = 1;
-
-  return ssimv;
-}
-
-double vp9_calc_ssimg(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
-                      double *ssim_y, double *ssim_u, double *ssim_v) {
-  double ssim_all = 0;
-  double a, b, c;
-
-  a = vp9_ssim2(source->y_buffer, dest->y_buffer,
-                source->y_stride, dest->y_stride, source->y_width,
-                source->y_height);
-
-  b = vp9_ssim2(source->u_buffer, dest->u_buffer,
-                source->uv_stride, dest->uv_stride, source->uv_width,
-                source->uv_height);
-
-  c = vp9_ssim2(source->v_buffer, dest->v_buffer,
-                source->uv_stride, dest->uv_stride, source->uv_width,
-                source->uv_height);
-  *ssim_y = a;
-  *ssim_u = b;
-  *ssim_v = c;
-  ssim_all = (a * 4 + b + c) / 6;
-
-  return ssim_all;
-}
--- a/vp8/encoder/temporal_filter.c
+++ /dev/null
@@ -1,516 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/common/onyxc_int.h"
-#include "onyx_int.h"
-#include "vp8/common/systemdependent.h"
-#include "quantize.h"
-#include "vp8/common/alloccommon.h"
-#include "mcomp.h"
-#include "firstpass.h"
-#include "psnr.h"
-#include "vpx_scale/vpxscale.h"
-#include "vp8/common/extend.h"
-#include "ratectrl.h"
-#include "vp8/common/quant_common.h"
-#include "segmentation.h"
-#include "vpx_scale/yv12extend.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp8/common/swapyv12buffer.h"
-#include "vpx_ports/vpx_timer.h"
-
-#include <math.h>
-#include <limits.h>
-
-#define ALT_REF_MC_ENABLED 1    // dis/enable MC in AltRef filtering
-#define ALT_REF_SUBPEL_ENABLED 1 // dis/enable subpel in MC AltRef filtering
-
-#if VP9_TEMPORAL_ALT_REF
-
-
-static void temporal_filter_predictors_mb_c
-(
-  MACROBLOCKD *xd,
-  unsigned char *y_mb_ptr,
-  unsigned char *u_mb_ptr,
-  unsigned char *v_mb_ptr,
-  int stride,
-  int mv_row,
-  int mv_col,
-  unsigned char *pred
-) {
-  int offset;
-  unsigned char *yptr, *uptr, *vptr;
-  int omv_row, omv_col;
-
-  // Y
-  yptr = y_mb_ptr + (mv_row >> 3) * stride + (mv_col >> 3);
-
-  if ((mv_row | mv_col) & 7) {
-    xd->subpixel_predict16x16(yptr, stride,
-                             (mv_col & 7) << 1, (mv_row & 7) << 1, &pred[0], 16);
-  } else {
-    vp9_copy_mem16x16(yptr, stride, &pred[0], 16);
-  }
-
-  // U & V
-  omv_row = mv_row;
-  omv_col = mv_col;
-  mv_row >>= 1;
-  mv_col >>= 1;
-  stride = (stride + 1) >> 1;
-  offset = (mv_row >> 3) * stride + (mv_col >> 3);
-  uptr = u_mb_ptr + offset;
-  vptr = v_mb_ptr + offset;
-
-  if ((omv_row | omv_col) & 15) {
-    xd->subpixel_predict8x8(uptr, stride,
-                           (omv_col & 15), (omv_row & 15), &pred[256], 8);
-    xd->subpixel_predict8x8(vptr, stride,
-                           (omv_col & 15), (omv_row & 15), &pred[320], 8);
-  }
-  else {
-    vp9_copy_mem8x8(uptr, stride, &pred[256], 8);
-    vp9_copy_mem8x8(vptr, stride, &pred[320], 8);
-  }
-}
-void vp9_temporal_filter_apply_c
-(
-  unsigned char *frame1,
-  unsigned int stride,
-  unsigned char *frame2,
-  unsigned int block_size,
-  int strength,
-  int filter_weight,
-  unsigned int *accumulator,
-  unsigned short *count
-) {
-  unsigned int i, j, k;
-  int modifier;
-  int byte = 0;
-
-  for (i = 0, k = 0; i < block_size; i++) {
-    for (j = 0; j < block_size; j++, k++) {
-
-      int src_byte = frame1[byte];
-      int pixel_value = *frame2++;
-
-      modifier   = src_byte - pixel_value;
-      // This is an integer approximation of:
-      // float coeff = (3.0 * modifier * modifier) / pow(2, strength);
-      // modifier = (int)roundf(coeff > 16 ? 0 : 16 - coeff);
-      modifier  *= modifier;
-      modifier  *= 3;
-      modifier  += 1 << (strength - 1);
-      modifier >>= strength;
-
-      if (modifier > 16)
-        modifier = 16;
-
-      modifier = 16 - modifier;
-      modifier *= filter_weight;
-
-      count[k] += modifier;
-      accumulator[k] += modifier * pixel_value;
-
-      byte++;
-    }
-
-    byte += stride - block_size;
-  }
-}
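The integer weight curve above can be checked in isolation; a small standalone sketch (the strength and diff values are illustrative, and strength must be >= 1 for the rounding shift to be defined):

#include <stdio.h>

/* Per-pixel filter weight (0..16) from the integer approximation above:
 * small differences keep full weight, large ones are zeroed out. */
static int filter_modifier(int diff, int strength) {
  int m = diff * diff * 3;
  m += 1 << (strength - 1);    /* rounding */
  m >>= strength;
  if (m > 16) m = 16;
  return 16 - m;               /* large diff -> small weight */
}

int main(void) {
  int d;
  for (d = 0; d <= 12; d += 2)
    printf("diff=%2d -> weight=%2d\n", d, filter_modifier(d, 6));
  return 0;
}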
-
-#if ALT_REF_MC_ENABLED
-
-static int temporal_filter_find_matching_mb_c
-(
-  VP9_COMP *cpi,
-  YV12_BUFFER_CONFIG *arf_frame,
-  YV12_BUFFER_CONFIG *frame_ptr,
-  int mb_offset,
-  int error_thresh
-) {
-  MACROBLOCK *x = &cpi->mb;
-  int step_param;
-  int further_steps;
-  int sadpb = x->sadperbit16;
-  int bestsme = INT_MAX;
-
-  BLOCK *b = &x->block[0];
-  BLOCKD *d = &x->e_mbd.block[0];
-  int_mv best_ref_mv1;
-  int_mv best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
-
-  // Save input state
-  unsigned char **base_src = b->base_src;
-  int src = b->src;
-  int src_stride = b->src_stride;
-  unsigned char **base_pre = d->base_pre;
-  int pre = d->pre;
-  int pre_stride = d->pre_stride;
-
-  best_ref_mv1.as_int = 0;
-  best_ref_mv1_full.as_mv.col = best_ref_mv1.as_mv.col >> 3;
-  best_ref_mv1_full.as_mv.row = best_ref_mv1.as_mv.row >> 3;
-
-  // Set up frame pointers
-  b->base_src = &arf_frame->y_buffer;
-  b->src_stride = arf_frame->y_stride;
-  b->src = mb_offset;
-
-  d->base_pre = &frame_ptr->y_buffer;
-  d->pre_stride = frame_ptr->y_stride;
-  d->pre = mb_offset;
-
-  // Further step/diamond searches as necessary
-  if (cpi->Speed < 8) {
-    step_param = cpi->sf.first_step +
-                 ((cpi->Speed > 5) ? 1 : 0);
-    further_steps =
-      (cpi->sf.max_step_search_steps - 1) - step_param;
-  } else {
-    step_param = cpi->sf.first_step + 2;
-    further_steps = 0;
-  }
-
-  /*cpi->sf.search_method == HEX*/
-  // TODO Check that the 16x16 vf & sdf are selected here
-  // Ignore mv costing by sending NULL pointer instead of cost arrays
-  bestsme = vp9_hex_search(x, b, d, &best_ref_mv1_full, &d->bmi.as_mv.first,
-                           step_param, sadpb, &cpi->fn_ptr[BLOCK_16X16],
-                           NULLMVCOST, NULLMVCOST,
-                           &best_ref_mv1);
-
-#if ALT_REF_SUBPEL_ENABLED
-  // Try sub-pixel MC?
-  // if (bestsme > error_thresh && bestsme < INT_MAX)
-  {
-    int distortion;
-    unsigned int sse;
-    // Ignore mv costing by sending NULL pointer instead of cost array
-    bestsme = cpi->find_fractional_mv_step(x, b, d, &d->bmi.as_mv.first,
-                                           &best_ref_mv1,
-                                           x->errorperbit,
-                                           &cpi->fn_ptr[BLOCK_16X16],
-                                           NULLMVCOST,
-                                           &distortion, &sse);
-  }
-#endif
-
-  // Restore input state
-  b->base_src = base_src;
-  b->src = src;
-  b->src_stride = src_stride;
-  d->base_pre = base_pre;
-  d->pre = pre;
-  d->pre_stride = pre_stride;
-
-  return bestsme;
-}
-#endif
-
-static void temporal_filter_iterate_c
-(
-  VP9_COMP *cpi,
-  int frame_count,
-  int alt_ref_index,
-  int strength
-) {
-  int byte;
-  int frame;
-  int mb_col, mb_row;
-  unsigned int filter_weight;
-  int mb_cols = cpi->common.mb_cols;
-  int mb_rows = cpi->common.mb_rows;
-  int mb_y_offset = 0;
-  int mb_uv_offset = 0;
-  DECLARE_ALIGNED_ARRAY(16, unsigned int, accumulator, 16 * 16 + 8 * 8 + 8 * 8);
-  DECLARE_ALIGNED_ARRAY(16, unsigned short, count, 16 * 16 + 8 * 8 + 8 * 8);
-  MACROBLOCKD *mbd = &cpi->mb.e_mbd;
-  YV12_BUFFER_CONFIG *f = cpi->frames[alt_ref_index];
-  unsigned char *dst1, *dst2;
-  DECLARE_ALIGNED_ARRAY(16, unsigned char,  predictor, 16 * 16 + 8 * 8 + 8 * 8);
-
-  // Save input state
-  unsigned char *y_buffer = mbd->pre.y_buffer;
-  unsigned char *u_buffer = mbd->pre.u_buffer;
-  unsigned char *v_buffer = mbd->pre.v_buffer;
-
-  for (mb_row = 0; mb_row < mb_rows; mb_row++) {
-#if ALT_REF_MC_ENABLED
-    // Source frames are extended to 16 pixels. This is different from
-    //  L/A/G reference frames that have a border of 32 (VP8BORDERINPIXELS).
-    // A 6/8-tap filter is used for motion search, which requires 2 pixels
-    //  before and 3 pixels after, so the largest Y mv on a border would
-    //  be 16 - INTERP_EXTEND. The UV blocks are half the size of the Y
-    //  blocks and therefore only extended by 8, so the largest mv a UV
-    //  block can support is 8 - INTERP_EXTEND. A UV mv is half of a Y mv,
-    //  so the largest Y mv implies a UV mv of (16 - INTERP_EXTEND) >> 1,
-    //  which is greater than 8 - INTERP_EXTEND. To keep the mv usable for
-    //  both Y and UV planes, the max on a border is 16 - (2*INTERP_EXTEND+1).
-    cpi->mb.mv_row_min = -((mb_row * 16) + (17 - 2 * INTERP_EXTEND));
-    cpi->mb.mv_row_max = ((cpi->common.mb_rows - 1 - mb_row) * 16)
-                         + (17 - 2 * INTERP_EXTEND);
-#endif
-
-    for (mb_col = 0; mb_col < mb_cols; mb_col++) {
-      int i, j, k;
-      int stride;
-
-      vpx_memset(accumulator, 0, 384 * sizeof(unsigned int));
-      vpx_memset(count, 0, 384 * sizeof(unsigned short));
-
-#if ALT_REF_MC_ENABLED
-      cpi->mb.mv_col_min = -((mb_col * 16) + (17 - 2 * INTERP_EXTEND));
-      cpi->mb.mv_col_max = ((cpi->common.mb_cols - 1 - mb_col) * 16)
-                           + (17 - 2 * INTERP_EXTEND);
-#endif
-
-      for (frame = 0; frame < frame_count; frame++) {
-        if (cpi->frames[frame] == NULL)
-          continue;
-
-        mbd->block[0].bmi.as_mv.first.as_mv.row = 0;
-        mbd->block[0].bmi.as_mv.first.as_mv.col = 0;
-
-        if (frame == alt_ref_index) {
-          filter_weight = 2;
-        } else {
-          int err = 0;
-#if ALT_REF_MC_ENABLED
-#define THRESH_LOW   10000
-#define THRESH_HIGH  20000
-
-          // Find best match in this frame by MC
-          err = temporal_filter_find_matching_mb_c
-                (cpi,
-                 cpi->frames[alt_ref_index],
-                 cpi->frames[frame],
-                 mb_y_offset,
-                 THRESH_LOW);
-#endif
-          // Assign a higher weight to the matching MB if its error
-          // score is lower. When MC is not applied, the default is
-          // to weight all MBs equally.
-          filter_weight = err < THRESH_LOW
-                          ? 2 : err < THRESH_HIGH ? 1 : 0;
-        }
-
-        if (filter_weight != 0) {
-          // Construct the predictors
-          temporal_filter_predictors_mb_c
-          (mbd,
-           cpi->frames[frame]->y_buffer + mb_y_offset,
-           cpi->frames[frame]->u_buffer + mb_uv_offset,
-           cpi->frames[frame]->v_buffer + mb_uv_offset,
-           cpi->frames[frame]->y_stride,
-           mbd->block[0].bmi.as_mv.first.as_mv.row,
-           mbd->block[0].bmi.as_mv.first.as_mv.col,
-           predictor);
-
-          // Apply the filter (YUV)
-          TEMPORAL_INVOKE(&cpi->rtcd.temporal, apply)
-          (f->y_buffer + mb_y_offset,
-           f->y_stride,
-           predictor,
-           16,
-           strength,
-           filter_weight,
-           accumulator,
-           count);
-
-          TEMPORAL_INVOKE(&cpi->rtcd.temporal, apply)
-          (f->u_buffer + mb_uv_offset,
-           f->uv_stride,
-           predictor + 256,
-           8,
-           strength,
-           filter_weight,
-           accumulator + 256,
-           count + 256);
-
-          TEMPORAL_INVOKE(&cpi->rtcd.temporal, apply)
-          (f->v_buffer + mb_uv_offset,
-           f->uv_stride,
-           predictor + 320,
-           8,
-           strength,
-           filter_weight,
-           accumulator + 320,
-           count + 320);
-        }
-      }
-
-      // Normalize filter output to produce AltRef frame
-      dst1 = cpi->alt_ref_buffer.y_buffer;
-      stride = cpi->alt_ref_buffer.y_stride;
-      byte = mb_y_offset;
-      for (i = 0, k = 0; i < 16; i++) {
-        for (j = 0; j < 16; j++, k++) {
-          unsigned int pval = accumulator[k] + (count[k] >> 1);
-          pval *= cpi->fixed_divide[count[k]];
-          pval >>= 19;
-
-          dst1[byte] = (unsigned char)pval;
-
-          // move to next pixel
-          byte++;
-        }
-
-        byte += stride - 16;
-      }
-
-      dst1 = cpi->alt_ref_buffer.u_buffer;
-      dst2 = cpi->alt_ref_buffer.v_buffer;
-      stride = cpi->alt_ref_buffer.uv_stride;
-      byte = mb_uv_offset;
-      for (i = 0, k = 256; i < 8; i++) {
-        for (j = 0; j < 8; j++, k++) {
-          int m = k + 64;
-
-          // U
-          unsigned int pval = accumulator[k] + (count[k] >> 1);
-          pval *= cpi->fixed_divide[count[k]];
-          pval >>= 19;
-          dst1[byte] = (unsigned char)pval;
-
-          // V
-          pval = accumulator[m] + (count[m] >> 1);
-          pval *= cpi->fixed_divide[count[m]];
-          pval >>= 19;
-          dst2[byte] = (unsigned char)pval;
-
-          // move to next pixel
-          byte++;
-        }
-
-        byte += stride - 8;
-      }
-
-      mb_y_offset += 16;
-      mb_uv_offset += 8;
-    }
-
-    mb_y_offset += 16 * (f->y_stride - mb_cols);
-    mb_uv_offset += 8 * (f->uv_stride - mb_cols);
-  }
-
-  // Restore input state
-  mbd->pre.y_buffer = y_buffer;
-  mbd->pre.u_buffer = u_buffer;
-  mbd->pre.v_buffer = v_buffer;
-}
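The normalization loops above avoid a per-pixel division by multiplying with a reciprocal table; a sketch of the idea, assuming fixed_divide[k] holds (1 << 19) / k (only the table's use, not its construction, is visible in this hunk):

#include <stdio.h>

#define Q 19

int main(void) {
  /* Rounded division acc/count via a Q19 reciprocal table, mirroring
   * pval = (acc + count/2) * fixed_divide[count] >> 19 above. */
  unsigned int fixed_divide[65];
  unsigned int k, acc = 1234, count = 7;
  unsigned int pval;

  for (k = 1; k <= 64; k++)
    fixed_divide[k] = (1u << Q) / k;   /* assumed table contents */

  pval = (acc + (count >> 1)) * fixed_divide[count] >> Q;
  printf("%u / %u ~= %u (exact rounded: %u)\n",
         acc, count, pval, (acc + count / 2) / count);
  return 0;
}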
-
-void vp9_temporal_filter_prepare_c
-(
-  VP9_COMP *cpi,
-  int distance
-) {
-  int frame = 0;
-
-  int num_frames_backward = 0;
-  int num_frames_forward = 0;
-  int frames_to_blur_backward = 0;
-  int frames_to_blur_forward = 0;
-  int frames_to_blur = 0;
-  int start_frame = 0;
-
-  int strength = cpi->oxcf.arnr_strength;
-
-  int blur_type = cpi->oxcf.arnr_type;
-
-  int max_frames = cpi->active_arnr_frames;
-
-  num_frames_backward = distance;
-  num_frames_forward = vp9_lookahead_depth(cpi->lookahead)
-                       - (num_frames_backward + 1);
-
-  switch (blur_type) {
-    case 1:
-      /////////////////////////////////////////
-      // Backward Blur
-
-      frames_to_blur_backward = num_frames_backward;
-
-      if (frames_to_blur_backward >= max_frames)
-        frames_to_blur_backward = max_frames - 1;
-
-      frames_to_blur = frames_to_blur_backward + 1;
-      break;
-
-    case 2:
-      /////////////////////////////////////////
-      // Forward Blur
-
-      frames_to_blur_forward = num_frames_forward;
-
-      if (frames_to_blur_forward >= max_frames)
-        frames_to_blur_forward = max_frames - 1;
-
-      frames_to_blur = frames_to_blur_forward + 1;
-      break;
-
-    case 3:
-    default:
-      /////////////////////////////////////////
-      // Center Blur
-      frames_to_blur_forward = num_frames_forward;
-      frames_to_blur_backward = num_frames_backward;
-
-      if (frames_to_blur_forward > frames_to_blur_backward)
-        frames_to_blur_forward = frames_to_blur_backward;
-
-      if (frames_to_blur_backward > frames_to_blur_forward)
-        frames_to_blur_backward = frames_to_blur_forward;
-
-      // When max_frames is even we have 1 more frame backward than forward
-      if (frames_to_blur_forward > (max_frames - 1) / 2)
-        frames_to_blur_forward = ((max_frames - 1) / 2);
-
-      if (frames_to_blur_backward > (max_frames / 2))
-        frames_to_blur_backward = (max_frames / 2);
-
-      frames_to_blur = frames_to_blur_backward + frames_to_blur_forward + 1;
-      break;
-  }
-
-  start_frame = distance + frames_to_blur_forward;
-
-#ifdef DEBUGFWG
-  // DEBUG FWG
-  printf("max:%d FBCK:%d FFWD:%d ftb:%d ftbbck:%d ftbfwd:%d sei:%d lasei:%d start:%d"
-, max_frames
-, num_frames_backward
-, num_frames_forward
-, frames_to_blur
-, frames_to_blur_backward
-, frames_to_blur_forward
-, cpi->source_encode_index
-, cpi->last_alt_ref_sei
-, start_frame);
-#endif
-
-  // Set up frame pointers; NULL indicates a frame not included in the filter
-  vpx_memset(cpi->frames, 0, max_frames * sizeof(YV12_BUFFER_CONFIG *));
-  for (frame = 0; frame < frames_to_blur; frame++) {
-    int which_buffer =  start_frame - frame;
-    struct lookahead_entry *buf = vp9_lookahead_peek(cpi->lookahead,
-                                                     which_buffer);
-    cpi->frames[frames_to_blur - 1 - frame] = &buf->img;
-  }
-
-  temporal_filter_iterate_c(
-    cpi,
-    frames_to_blur,
-    frames_to_blur_backward,
-    strength);
-}
-#endif
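The center-blur clamping in the default case above reduces to a few min operations on the forward and backward frame counts; a standalone sketch with hypothetical lookahead numbers:

#include <stdio.h>

int main(void) {
  /* Center-blur frame selection, mirroring case 3 above; the distance,
   * lookahead depth, and max_frames values are illustrative only. */
  int distance = 5, lookahead_depth = 12, max_frames = 7;
  int back = distance;
  int fwd  = lookahead_depth - (back + 1);

  if (fwd > back) fwd = back;
  if (back > fwd) back = fwd;
  if (fwd > (max_frames - 1) / 2) fwd = (max_frames - 1) / 2;
  if (back > max_frames / 2) back = max_frames / 2;

  printf("blur %d frames: %d back, %d forward, starting at offset %d\n",
         back + fwd + 1, back, fwd, distance + fwd);
  return 0;
}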
--- a/vp8/encoder/temporal_filter.h
+++ /dev/null
@@ -1,47 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_TEMPORAL_FILTER_H
-#define __INC_TEMPORAL_FILTER_H
-
-#define prototype_apply(sym)\
-  void (sym) \
-  ( \
-    unsigned char *frame1, \
-    unsigned int stride, \
-    unsigned char *frame2, \
-    unsigned int block_size, \
-    int strength, \
-    int filter_weight, \
-    unsigned int *accumulator, \
-    unsigned short *count \
-  )
-
-#if ARCH_X86 || ARCH_X86_64
-#include "x86/temporal_filter_x86.h"
-#endif
-
-#ifndef vp9_temporal_filter_apply
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
-#endif
-extern prototype_apply(vp9_temporal_filter_apply);
-
-typedef struct {
-  prototype_apply(*apply);
-} vp9_temporal_rtcd_vtable_t;
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define TEMPORAL_INVOKE(ctx,fn) (ctx)->fn
-#else
-#define TEMPORAL_INVOKE(ctx,fn) vp9_temporal_filter_##fn
-#endif
-
-#endif // __INC_TEMPORAL_FILTER_H
--- a/vp8/encoder/tokenize.c
+++ /dev/null
@@ -1,868 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <math.h>
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-#include "onyx_int.h"
-#include "tokenize.h"
-#include "vpx_mem/vpx_mem.h"
-
-#include "vp8/common/pred_common.h"
-#include "vp8/common/seg_common.h"
-#include "vp8/common/entropy.h"
-
-/* Global event counters used for accumulating statistics across several
-   compressions, then generating context.c = initial stats. */
-
-#ifdef ENTROPY_STATS
-INT64 context_counters[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-INT64 hybrid_context_counters[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-
-INT64 context_counters_8x8[BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-INT64 hybrid_context_counters_8x8[BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-
-INT64 context_counters_16x16[BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-INT64 hybrid_context_counters_16x16[BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-
-extern unsigned int tree_update_hist[BLOCK_TYPES][COEF_BANDS]
-                    [PREV_COEF_CONTEXTS][ENTROPY_NODES][2];
-extern unsigned int hybrid_tree_update_hist[BLOCK_TYPES][COEF_BANDS]
-                    [PREV_COEF_CONTEXTS][ENTROPY_NODES][2];
-extern unsigned int tree_update_hist_8x8[BLOCK_TYPES_8X8][COEF_BANDS]
-                    [PREV_COEF_CONTEXTS][ENTROPY_NODES] [2];
-extern unsigned int hybrid_tree_update_hist_8x8[BLOCK_TYPES_8X8][COEF_BANDS]
-                    [PREV_COEF_CONTEXTS][ENTROPY_NODES] [2];
-extern unsigned int tree_update_hist_16x16[BLOCK_TYPES_16X16][COEF_BANDS]
-                    [PREV_COEF_CONTEXTS][ENTROPY_NODES] [2];
-extern unsigned int hybrid_tree_update_hist_16x16[BLOCK_TYPES_16X16][COEF_BANDS]
-                    [PREV_COEF_CONTEXTS][ENTROPY_NODES] [2];
-#endif  /* ENTROPY_STATS */
-
-void vp9_stuff_mb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run);
-void vp9_fix_contexts(MACROBLOCKD *xd);
-
-static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE * 2];
-const TOKENVALUE *vp9_dct_value_tokens_ptr;
-static int dct_value_cost[DCT_MAX_VALUE * 2];
-const int *vp9_dct_value_cost_ptr;
-
-static void fill_value_tokens() {
-
-  TOKENVALUE *const t = dct_value_tokens + DCT_MAX_VALUE;
-  vp9_extra_bit_struct *const e = vp9_extra_bits;
-
-  int i = -DCT_MAX_VALUE;
-  int sign = 1;
-
-  do {
-    if (!i)
-      sign = 0;
-
-    {
-      const int a = sign ? -i : i;
-      int eb = sign;
-
-      if (a > 4) {
-        int j = 4;
-
-        while (++j < 11  &&  e[j].base_val <= a) {}
-
-        t[i].Token = --j;
-        eb |= (a - e[j].base_val) << 1;
-      } else
-        t[i].Token = a;
-
-      t[i].Extra = eb;
-    }
-
-    // initialize the cost for extra bits for all possible coefficient values.
-    {
-      int cost = 0;
-      vp9_extra_bit_struct *p = vp9_extra_bits + t[i].Token;
-
-      if (p->base_val) {
-        const int extra = t[i].Extra;
-        const int Length = p->Len;
-
-        if (Length)
-          cost += treed_cost(p->tree, p->prob, extra >> 1, Length);
-
-        cost += vp9_cost_bit(vp9_prob_half, extra & 1); /* sign */
-        dct_value_cost[i + DCT_MAX_VALUE] = cost;
-      }
-
-    }
-
-  } while (++i < DCT_MAX_VALUE);
-
-  vp9_dct_value_tokens_ptr = dct_value_tokens + DCT_MAX_VALUE;
-  vp9_dct_value_cost_ptr   = dct_value_cost + DCT_MAX_VALUE;
-}
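fill_value_tokens() above builds, for every possible coefficient value, a token plus sign-and-remainder extra bits. A sketch of the same decomposition for a single value, using the usual VP8/VP9-style category base values (the base-value list here is an assumption recalled from the entropy tables, not shown in this hunk):

#include <stdio.h>
#include <stdlib.h>

/* Tokens 0..4 encode the magnitude literally; tokens 5..10 are categories
 * covering [base_val[j], base_val[j+1]), with the remainder in extra bits. */
static const int base_val[11] = { 0, 1, 2, 3, 4, 5, 7, 11, 19, 35, 67 };

int main(void) {
  int v = -23;                       /* example coefficient */
  int sign = v < 0, a = abs(v);
  int token, extra;

  if (a > 4) {
    int j = 4;
    while (j + 1 < 11 && base_val[j + 1] <= a)
      j++;                           /* last category whose base <= a */
    token = j;
    extra = sign | ((a - base_val[j]) << 1);   /* sign in bit 0 */
  } else {
    token = a;
    extra = sign;
  }
  printf("value %d -> token %d, extra 0x%x\n", v, token, extra);
  return 0;
}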
-
-static void tokenize_b(VP9_COMP *cpi,
-                       MACROBLOCKD *xd,
-                       const BLOCKD * const b,
-                       TOKENEXTRA **tp,
-                       PLANE_TYPE type,
-                       ENTROPY_CONTEXT *a,
-                       ENTROPY_CONTEXT *l,
-                       TX_SIZE tx_size,
-                       int dry_run) {
-  int pt; /* near block/prev token context index */
-  int c = (type == PLANE_TYPE_Y_NO_DC) ? 1 : 0;
-  const int eob = b->eob;     /* one beyond last nonzero coeff */
-  TOKENEXTRA *t = *tp;        /* store tokens starting here */
-  const short *qcoeff_ptr = b->qcoeff;
-  int seg_eob;
-  int segment_id = xd->mode_info_context->mbmi.segment_id;
-  const int *bands, *scan;
-  unsigned int (*counts)[COEF_BANDS][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
-  vp9_prob (*probs)[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
-  const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-                          get_tx_type(xd, b) : DCT_DCT;
-
-  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
-  switch (tx_size) {
-    default:
-    case TX_4X4:
-      seg_eob = 16;
-      bands = vp9_coef_bands;
-      scan = vp9_default_zig_zag1d;
-      if (tx_type != DCT_DCT) {
-        counts = cpi->hybrid_coef_counts;
-        probs = cpi->common.fc.hybrid_coef_probs;
-        if (tx_type == ADST_DCT) {
-          scan = vp9_row_scan;
-        } else if (tx_type == DCT_ADST) {
-          scan = vp9_col_scan;
-        }
-      } else {
-        counts = cpi->coef_counts;
-        probs = cpi->common.fc.coef_probs;
-      }
-      break;
-    case TX_8X8:
-      if (type == PLANE_TYPE_Y2) {
-        seg_eob = 4;
-        bands = vp9_coef_bands;
-        scan = vp9_default_zig_zag1d;
-      } else {
-        seg_eob = 64;
-        bands = vp9_coef_bands_8x8;
-        scan = vp9_default_zig_zag1d_8x8;
-      }
-      if (tx_type != DCT_DCT) {
-        counts = cpi->hybrid_coef_counts_8x8;
-        probs = cpi->common.fc.hybrid_coef_probs_8x8;
-      } else {
-        counts = cpi->coef_counts_8x8;
-        probs = cpi->common.fc.coef_probs_8x8;
-      }
-      break;
-    case TX_16X16:
-      seg_eob = 256;
-      bands = vp9_coef_bands_16x16;
-      scan = vp9_default_zig_zag1d_16x16;
-      if (tx_type != DCT_DCT) {
-        counts = cpi->hybrid_coef_counts_16x16;
-        probs = cpi->common.fc.hybrid_coef_probs_16x16;
-      } else {
-        counts = cpi->coef_counts_16x16;
-        probs = cpi->common.fc.coef_probs_16x16;
-      }
-      break;
-  }
-
-  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB))
-    seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-
-  do {
-    const int band = bands[c];
-    int token;
-
-    if (c < eob) {
-      const int rc = scan[c];
-      const int v = qcoeff_ptr[rc];
-
-      assert(-DCT_MAX_VALUE <= v  &&  v < DCT_MAX_VALUE);
-
-      t->Extra = vp9_dct_value_tokens_ptr[v].Extra;
-      token    = vp9_dct_value_tokens_ptr[v].Token;
-    } else {
-      token = DCT_EOB_TOKEN;
-    }
-
-    t->Token = token;
-    t->context_tree = probs[type][band][pt];
-    t->skip_eob_node = (pt == 0) && ((band > 0 && type != PLANE_TYPE_Y_NO_DC) ||
-                                     (band > 1 && type == PLANE_TYPE_Y_NO_DC));
-    assert(vp9_coef_encodings[t->Token].Len - t->skip_eob_node > 0);
-    if (!dry_run) {
-      ++counts[type][band][pt][token];
-    }
-    pt = vp9_prev_token_class[token];
-    ++t;
-  } while (c < eob && ++c < seg_eob);
-
-  *tp = t;
-  *a = *l = (c != !type); /* 0 <-> all coeff data is zero */
-}
-
-int vp9_mby_is_skippable_4x4(MACROBLOCKD *xd, int has_y2_block) {
-  int skip = 1;
-  int i = 0;
-
-  if (has_y2_block) {
-    for (i = 0; i < 16; i++)
-      skip &= (xd->block[i].eob < 2);
-    skip &= (!xd->block[24].eob);
-  } else {
-    for (i = 0; i < 16; i++)
-      skip &= (!xd->block[i].eob);
-  }
-  return skip;
-}
-
-int vp9_mbuv_is_skippable_4x4(MACROBLOCKD *xd) {
-  int skip = 1;
-  int i;
-
-  for (i = 16; i < 24; i++)
-    skip &= (!xd->block[i].eob);
-  return skip;
-}
-
-static int mb_is_skippable_4x4(MACROBLOCKD *xd, int has_y2_block) {
-  return (vp9_mby_is_skippable_4x4(xd, has_y2_block) &
-          vp9_mbuv_is_skippable_4x4(xd));
-}
-
-int vp9_mby_is_skippable_8x8(MACROBLOCKD *xd, int has_y2_block) {
-  int skip = 1;
-  int i = 0;
-
-  if (has_y2_block) {
-    for (i = 0; i < 16; i += 4)
-      skip &= (xd->block[i].eob < 2);
-    skip &= (!xd->block[24].eob);
-  } else {
-    for (i = 0; i < 16; i += 4)
-      skip &= (!xd->block[i].eob);
-  }
-  return skip;
-}
-
-int vp9_mbuv_is_skippable_8x8(MACROBLOCKD *xd) {
-  return (!xd->block[16].eob) & (!xd->block[20].eob);
-}
-
-static int mb_is_skippable_8x8(MACROBLOCKD *xd, int has_y2_block) {
-  return (vp9_mby_is_skippable_8x8(xd, has_y2_block) &
-          vp9_mbuv_is_skippable_8x8(xd));
-}
-
-static int mb_is_skippable_8x8_4x4uv(MACROBLOCKD *xd, int has_y2_block) {
-  return (vp9_mby_is_skippable_8x8(xd, has_y2_block) &
-          vp9_mbuv_is_skippable_4x4(xd));
-}
-
-int vp9_mby_is_skippable_16x16(MACROBLOCKD *xd) {
-  int skip = 1;
-  skip &= !xd->block[0].eob;
-  return skip;
-}
-
-static int mb_is_skippable_16x16(MACROBLOCKD *xd) {
-  return (vp9_mby_is_skippable_16x16(xd) & vp9_mbuv_is_skippable_8x8(xd));
-}
-
-void vp9_tokenize_mb(VP9_COMP *cpi,
-                     MACROBLOCKD *xd,
-                     TOKENEXTRA **t,
-                     int dry_run) {
-  PLANE_TYPE plane_type;
-  int has_y2_block;
-  int b;
-  int tx_size = xd->mode_info_context->mbmi.txfm_size;
-  int mb_skip_context = vp9_get_pred_context(&cpi->common, xd, PRED_MBSKIP);
-  TOKENEXTRA *t_backup = *t;
-  ENTROPY_CONTEXT * A = (ENTROPY_CONTEXT *) xd->above_context;
-  ENTROPY_CONTEXT * L = (ENTROPY_CONTEXT *) xd->left_context;
-
-  // If the MB is going to be skipped because of a segment level flag,
-  // exclude this from the skip count stats used to calculate the
-  // transmitted skip probability.
-  int skip_inc;
-  int segment_id = xd->mode_info_context->mbmi.segment_id;
-
-  if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
-      (vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) != 0)) {
-    skip_inc = 1;
-  } else
-    skip_inc = 0;
-
-  has_y2_block = (tx_size != TX_16X16
-                  && xd->mode_info_context->mbmi.mode != B_PRED
-                  && xd->mode_info_context->mbmi.mode != I8X8_PRED
-                  && xd->mode_info_context->mbmi.mode != SPLITMV);
-
-  switch (tx_size) {
-    case TX_16X16:
-      xd->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable_16x16(xd);
-      break;
-    case TX_8X8:
-      if (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
-          xd->mode_info_context->mbmi.mode == SPLITMV)
-        xd->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable_8x8_4x4uv(xd, 0);
-      else
-        xd->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable_8x8(xd, has_y2_block);
-      break;
-
-    default:
-      xd->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable_4x4(xd, has_y2_block);
-      break;
-  }
-
-  if (xd->mode_info_context->mbmi.mb_skip_coeff) {
-    if (!dry_run)
-      cpi->skip_true_count[mb_skip_context] += skip_inc;
-    if (!cpi->common.mb_no_coeff_skip) {
-      vp9_stuff_mb(cpi, xd, t, dry_run);
-    } else {
-      vp9_fix_contexts(xd);
-    }
-    if (dry_run)
-      *t = t_backup;
-    return;
-  }
-
-  if (!dry_run)
-    cpi->skip_false_count[mb_skip_context] += skip_inc;
-
-  if (has_y2_block) {
-    if (tx_size == TX_8X8) {
-      tokenize_b(cpi, xd, xd->block + 24, t, PLANE_TYPE_Y2,
-                 A + vp9_block2above_8x8[24], L + vp9_block2left_8x8[24],
-                 TX_8X8, dry_run);
-    } else {
-      tokenize_b(cpi, xd, xd->block + 24, t, PLANE_TYPE_Y2,
-                 A + vp9_block2above[24], L + vp9_block2left[24],
-                 TX_4X4, dry_run);
-    }
-
-    plane_type = PLANE_TYPE_Y_NO_DC;
-  } else
-    plane_type = PLANE_TYPE_Y_WITH_DC;
-
-  if (tx_size == TX_16X16) {
-    tokenize_b(cpi, xd, xd->block, t, PLANE_TYPE_Y_WITH_DC,
-               A, L, TX_16X16, dry_run);
-    A[1] = A[2] = A[3] = A[0];
-    L[1] = L[2] = L[3] = L[0];
-
-    for (b = 16; b < 24; b += 4) {
-      tokenize_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
-                 A + vp9_block2above_8x8[b], L + vp9_block2left_8x8[b],
-                 TX_8X8, dry_run);
-      A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
-      L[vp9_block2left_8x8[b] + 1]  = L[vp9_block2left_8x8[b]];
-    }
-    vpx_memset(&A[8], 0, sizeof(A[8]));
-    vpx_memset(&L[8], 0, sizeof(L[8]));
-  } else if (tx_size == TX_8X8) {
-    for (b = 0; b < 16; b += 4) {
-      tokenize_b(cpi, xd, xd->block + b, t, plane_type,
-                 A + vp9_block2above_8x8[b], L + vp9_block2left_8x8[b],
-                 TX_8X8, dry_run);
-      A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
-      L[vp9_block2left_8x8[b] + 1]  = L[vp9_block2left_8x8[b]];
-    }
-    if (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
-        xd->mode_info_context->mbmi.mode == SPLITMV) {
-      for (b = 16; b < 24; b++) {
-        tokenize_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
-                   A + vp9_block2above[b], L + vp9_block2left[b],
-                   TX_4X4, dry_run);
-      }
-    } else {
-      for (b = 16; b < 24; b += 4) {
-        tokenize_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
-                   A + vp9_block2above_8x8[b], L + vp9_block2left_8x8[b],
-                   TX_8X8, dry_run);
-        A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
-        L[vp9_block2left_8x8[b] + 1]  = L[vp9_block2left_8x8[b]];
-      }
-    }
-  } else {
-    for (b = 0; b < 16; b++) {
-      tokenize_b(cpi, xd, xd->block + b, t, plane_type,
-                 A + vp9_block2above[b], L + vp9_block2left[b],
-                 TX_4X4, dry_run);
-    }
-
-    for (b = 16; b < 24; b++) {
-      tokenize_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
-                 A + vp9_block2above[b], L + vp9_block2left[b],
-                 TX_4X4, dry_run);
-    }
-  }
-  if (dry_run)
-    *t = t_backup;
-}
-
-
-#ifdef ENTROPY_STATS
-void init_context_counters(void) {
-  FILE *f = fopen("context.bin", "rb");
-  if (!f) {
-    vpx_memset(context_counters, 0, sizeof(context_counters));
-    vpx_memset(context_counters_8x8, 0, sizeof(context_counters_8x8));
-    vpx_memset(context_counters_16x16, 0, sizeof(context_counters_16x16));
-  } else {
-    fread(context_counters, sizeof(context_counters), 1, f);
-    fread(context_counters_8x8, sizeof(context_counters_8x8), 1, f);
-    fread(context_counters_16x16, sizeof(context_counters_16x16), 1, f);
-    fclose(f);
-  }
-
-  f = fopen("treeupdate.bin", "rb");
-  if (!f) {
-    vpx_memset(tree_update_hist, 0, sizeof(tree_update_hist));
-    vpx_memset(tree_update_hist_8x8, 0, sizeof(tree_update_hist_8x8));
-    vpx_memset(tree_update_hist_16x16, 0, sizeof(tree_update_hist_16x16));
-  } else {
-    fread(tree_update_hist, sizeof(tree_update_hist), 1, f);
-    fread(tree_update_hist_8x8, sizeof(tree_update_hist_8x8), 1, f);
-    fread(tree_update_hist_16x16, sizeof(tree_update_hist_16x16), 1, f);
-    fclose(f);
-  }
-}
-
-void print_context_counters() {
-  int type, band, pt, t;
-  FILE *f = fopen("context.c", "w");
-
-  fprintf(f, "#include \"entropy.h\"\n");
-  fprintf(f, "\n/* *** GENERATED FILE: DO NOT EDIT *** */\n\n");
-  fprintf(f, "static const unsigned int\n"
-          "vp9_default_coef_counts[BLOCK_TYPES]\n"
-          "                      [COEF_BANDS]\n"
-          "                      [PREV_COEF_CONTEXTS]\n"
-          "                      [MAX_ENTROPY_TOKENS]={\n");
-
-#define Comma(X) ((X) ? "," : "")
-  type = 0;
-  do {
-    fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);
-    band = 0;
-    do {
-      fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);
-      pt = 0;
-      do {
-        fprintf(f, "%s\n      {", Comma(pt));
-
-        t = 0;
-        do {
-          const INT64 x = context_counters [type] [band] [pt] [t];
-          const int y = (int) x;
-          assert(x == (INT64) y);  /* no overflow handling yet */
-          fprintf(f, "%s %d", Comma(t), y);
-        } while (++t < MAX_ENTROPY_TOKENS);
-        fprintf(f, "}");
-      } while (++pt < PREV_COEF_CONTEXTS);
-      fprintf(f, "\n    }");
-    } while (++band < COEF_BANDS);
-    fprintf(f, "\n  }");
-  } while (++type < BLOCK_TYPES);
-  fprintf(f, "\n};\n");
-
-  fprintf(f, "static const unsigned int\nvp9_default_coef_counts_8x8"
-          "[BLOCK_TYPES_8X8] [COEF_BANDS]"
-          "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {");
-  type = 0;
-  do {
-    fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);
-    band = 0;
-    do {
-      fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);
-      pt = 0;
-      do {
-        fprintf(f, "%s\n      {", Comma(pt));
-        t = 0;
-        do {
-          const INT64 x = context_counters_8x8 [type] [band] [pt] [t];
-          const int y = (int) x;
-
-          assert(x == (INT64) y);  /* no overflow handling yet */
-          fprintf(f, "%s %d", Comma(t), y);
-
-        } while (++t < MAX_ENTROPY_TOKENS);
-
-        fprintf(f, "}");
-      } while (++pt < PREV_COEF_CONTEXTS);
-
-      fprintf(f, "\n    }");
-
-    } while (++band < COEF_BANDS);
-
-    fprintf(f, "\n  }");
-  } while (++type < BLOCK_TYPES_8X8);
-  fprintf(f, "\n};\n");
-
-  fprintf(f, "static const unsigned int\nvp9_default_coef_counts_16x16"
-          "[BLOCK_TYPES_16X16] [COEF_BANDS]"
-          "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {");
-  type = 0;
-  do {
-    fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);
-    band = 0;
-    do {
-      fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);
-      pt = 0;
-      do {
-        fprintf(f, "%s\n      {", Comma(pt));
-        t = 0;
-        do {
-          const INT64 x = context_counters_16x16 [type] [band] [pt] [t];
-          const int y = (int) x;
-
-          assert(x == (INT64) y);  /* no overflow handling yet */
-          fprintf(f, "%s %d", Comma(t), y);
-
-        } while (++t < MAX_ENTROPY_TOKENS);
-
-        fprintf(f, "}");
-      } while (++pt < PREV_COEF_CONTEXTS);
-
-      fprintf(f, "\n    }");
-
-    } while (++band < COEF_BANDS);
-
-    fprintf(f, "\n  }");
-  } while (++type < BLOCK_TYPES_16X16);
-  fprintf(f, "\n};\n");
-
-  fprintf(f, "static const vp9_prob\n"
-          "vp9_default_coef_probs[BLOCK_TYPES] [COEF_BANDS] \n"
-          "[PREV_COEF_CONTEXTS] [ENTROPY_NODES] = {");
-  type = 0;
-  do {
-    fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);
-    band = 0;
-    do {
-      fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);
-      pt = 0;
-      do {
-        unsigned int branch_ct [ENTROPY_NODES] [2];
-        unsigned int coef_counts[MAX_ENTROPY_TOKENS];
-        vp9_prob coef_probs[ENTROPY_NODES];
-        for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-          coef_counts[t] = context_counters [type] [band] [pt] [t];
-        vp9_tree_probs_from_distribution(
-          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-          coef_probs, branch_ct, coef_counts, 256, 1);
-        fprintf(f, "%s\n      {", Comma(pt));
-
-        t = 0;
-        do {
-          fprintf(f, "%s %d", Comma(t), coef_probs[t]);
-
-        } while (++t < ENTROPY_NODES);
-
-        fprintf(f, "}");
-      } while (++pt < PREV_COEF_CONTEXTS);
-      fprintf(f, "\n    }");
-    } while (++band < COEF_BANDS);
-    fprintf(f, "\n  }");
-  } while (++type < BLOCK_TYPES);
-  fprintf(f, "\n};\n");
-
-  fprintf(f, "static const vp9_prob\n"
-          "vp9_default_coef_probs_8x8[BLOCK_TYPES_8X8] [COEF_BANDS]\n"
-          "[PREV_COEF_CONTEXTS] [ENTROPY_NODES] = {");
-  type = 0;
-  do {
-    fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);
-    band = 0;
-    do {
-      fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);
-      pt = 0;
-      do {
-        unsigned int branch_ct [ENTROPY_NODES] [2];
-        unsigned int coef_counts[MAX_ENTROPY_TOKENS];
-        vp9_prob coef_probs[ENTROPY_NODES];
-        for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-          coef_counts[t] = context_counters_8x8[type] [band] [pt] [t];
-        vp9_tree_probs_from_distribution(
-          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-          coef_probs, branch_ct, coef_counts, 256, 1);
-        fprintf(f, "%s\n      {", Comma(pt));
-
-        t = 0;
-        do {
-          fprintf(f, "%s %d", Comma(t), coef_probs[t]);
-        } while (++t < ENTROPY_NODES);
-        fprintf(f, "}");
-      } while (++pt < PREV_COEF_CONTEXTS);
-      fprintf(f, "\n    }");
-    } while (++band < COEF_BANDS);
-    fprintf(f, "\n  }");
-  } while (++type < BLOCK_TYPES_8X8);
-  fprintf(f, "\n};\n");
-
-  fprintf(f, "static const vp9_prob\n"
-          "vp9_default_coef_probs_16x16[BLOCK_TYPES_16X16] [COEF_BANDS]\n"
-          "[PREV_COEF_CONTEXTS] [ENTROPY_NODES] = {");
-  type = 0;
-  do {
-    fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);
-    band = 0;
-    do {
-      fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);
-      pt = 0;
-      do {
-        unsigned int branch_ct [ENTROPY_NODES] [2];
-        unsigned int coef_counts[MAX_ENTROPY_TOKENS];
-        vp9_prob coef_probs[ENTROPY_NODES];
-        for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-          coef_counts[t] = context_counters_16x16[type] [band] [pt] [t];
-        vp9_tree_probs_from_distribution(
-          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-          coef_probs, branch_ct, coef_counts, 256, 1);
-        fprintf(f, "%s\n      {", Comma(pt));
-
-        t = 0;
-        do {
-          fprintf(f, "%s %d", Comma(t), coef_probs[t]);
-        } while (++t < ENTROPY_NODES);
-        fprintf(f, "}");
-      } while (++pt < PREV_COEF_CONTEXTS);
-      fprintf(f, "\n    }");
-    } while (++band < COEF_BANDS);
-    fprintf(f, "\n  }");
-  } while (++type < BLOCK_TYPES_16X16);
-  fprintf(f, "\n};\n");
-
-  fclose(f);
-
-  f = fopen("context.bin", "wb");
-  fwrite(context_counters, sizeof(context_counters), 1, f);
-  fwrite(context_counters_8x8, sizeof(context_counters_8x8), 1, f);
-  fwrite(context_counters_16x16, sizeof(context_counters_16x16), 1, f);
-  fclose(f);
-}
-#endif
-
-void vp9_tokenize_initialize() {
-  fill_value_tokens();
-}
-
-static __inline void stuff_b(VP9_COMP *cpi,
-                             MACROBLOCKD *xd,
-                             const BLOCKD * const b,
-                             TOKENEXTRA **tp,
-                             PLANE_TYPE type,
-                             ENTROPY_CONTEXT *a,
-                             ENTROPY_CONTEXT *l,
-                             TX_SIZE tx_size,
-                             int dry_run) {
-  const int *bands;
-  unsigned int (*counts)[COEF_BANDS][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
-  vp9_prob (*probs)[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
-  int pt, band;
-  TOKENEXTRA *t = *tp;
-  const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-                          get_tx_type(xd, b) : DCT_DCT;
-  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
-
-  switch (tx_size) {
-    default:
-    case TX_4X4:
-      bands = vp9_coef_bands;
-      if (tx_type != DCT_DCT) {
-        counts = cpi->hybrid_coef_counts;
-        probs = cpi->common.fc.hybrid_coef_probs;
-      } else {
-        counts = cpi->coef_counts;
-        probs = cpi->common.fc.coef_probs;
-      }
-      break;
-    case TX_8X8:
-      bands = vp9_coef_bands_8x8;
-      if (tx_type != DCT_DCT) {
-        counts = cpi->hybrid_coef_counts_8x8;
-        probs = cpi->common.fc.hybrid_coef_probs_8x8;
-      } else {
-        counts = cpi->coef_counts_8x8;
-        probs = cpi->common.fc.coef_probs_8x8;
-      }
-      break;
-    case TX_16X16:
-      bands = vp9_coef_bands_16x16;
-      if (tx_type != DCT_DCT) {
-        counts = cpi->hybrid_coef_counts_16x16;
-        probs = cpi->common.fc.hybrid_coef_probs_16x16;
-      } else {
-        counts = cpi->coef_counts_16x16;
-        probs = cpi->common.fc.coef_probs_16x16;
-      }
-      break;
-  }
-  band = bands[(type == PLANE_TYPE_Y_NO_DC) ? 1 : 0];
-  t->Token = DCT_EOB_TOKEN;
-  t->context_tree = probs[type][band][pt];
-  t->skip_eob_node = 0;
-  ++t;
-  *tp = t;
-  *a = *l = 0;
-  if (!dry_run) {
-    ++counts[type][band][pt][DCT_EOB_TOKEN];
-  }
-}
-
-static void stuff_mb_8x8(VP9_COMP *cpi, MACROBLOCKD *xd,
-                         TOKENEXTRA **t, int dry_run) {
-  ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)xd->above_context;
-  ENTROPY_CONTEXT *L = (ENTROPY_CONTEXT *)xd->left_context;
-  PLANE_TYPE plane_type;
-  int b;
-  const int has_y2_block = (xd->mode_info_context->mbmi.mode != B_PRED &&
-                            xd->mode_info_context->mbmi.mode != I8X8_PRED &&
-                            xd->mode_info_context->mbmi.mode != SPLITMV);
-
-  if (has_y2_block) {
-    stuff_b(cpi, xd, xd->block + 24, t, PLANE_TYPE_Y2,
-            A + vp9_block2above_8x8[24], L + vp9_block2left_8x8[24],
-            TX_8X8, dry_run);
-    plane_type = PLANE_TYPE_Y_NO_DC;
-  } else {
-    plane_type = PLANE_TYPE_Y_WITH_DC;
-  }
-
-  for (b = 0; b < 16; b += 4) {
-    stuff_b(cpi, xd, xd->block + b, t, plane_type, A + vp9_block2above_8x8[b],
-            L + vp9_block2left_8x8[b], TX_8X8, dry_run);
-    A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
-    L[vp9_block2left_8x8[b] + 1]  = L[vp9_block2left_8x8[b]];
-  }
-
-  for (b = 16; b < 24; b += 4) {
-    stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
-            A + vp9_block2above_8x8[b], L + vp9_block2left_8x8[b],
-            TX_8X8, dry_run);
-    A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
-    L[vp9_block2left_8x8[b] + 1]  = L[vp9_block2left_8x8[b]];
-  }
-}
-
-static void stuff_mb_16x16(VP9_COMP *cpi, MACROBLOCKD *xd,
-                           TOKENEXTRA **t, int dry_run) {
-  ENTROPY_CONTEXT * A = (ENTROPY_CONTEXT *)xd->above_context;
-  ENTROPY_CONTEXT * L = (ENTROPY_CONTEXT *)xd->left_context;
-  int b;
-
-  stuff_b(cpi, xd, xd->block, t, PLANE_TYPE_Y_WITH_DC, A, L, TX_16X16, dry_run);
-  A[1] = A[2] = A[3] = A[0];
-  L[1] = L[2] = L[3] = L[0];
-  for (b = 16; b < 24; b += 4) {
-    stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
-            A + vp9_block2above_8x8[b], L + vp9_block2left_8x8[b],
-            TX_8X8, dry_run);
-    A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
-    L[vp9_block2left_8x8[b] + 1]  = L[vp9_block2left_8x8[b]];
-  }
-  vpx_memset(&A[8], 0, sizeof(A[8]));
-  vpx_memset(&L[8], 0, sizeof(L[8]));
-}
-
-static void stuff_mb_4x4(VP9_COMP *cpi, MACROBLOCKD *xd,
-                         TOKENEXTRA **t, int dry_run) {
-  ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)xd->above_context;
-  ENTROPY_CONTEXT *L = (ENTROPY_CONTEXT *)xd->left_context;
-  int b;
-  PLANE_TYPE plane_type;
-  const int has_y2_block = (xd->mode_info_context->mbmi.mode != B_PRED &&
-                            xd->mode_info_context->mbmi.mode != I8X8_PRED &&
-                            xd->mode_info_context->mbmi.mode != SPLITMV);
-
-  if (has_y2_block) {
-    stuff_b(cpi, xd, xd->block + 24, t, PLANE_TYPE_Y2, A + vp9_block2above[24],
-            L + vp9_block2left[24], TX_4X4, dry_run);
-    plane_type = PLANE_TYPE_Y_NO_DC;
-  } else {
-    plane_type = PLANE_TYPE_Y_WITH_DC;
-  }
-
-  for (b = 0; b < 16; b++)
-    stuff_b(cpi, xd, xd->block + b, t, plane_type, A + vp9_block2above[b],
-            L + vp9_block2left[b], TX_4X4, dry_run);
-
-  for (b = 16; b < 24; b++)
-    stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV, A + vp9_block2above[b],
-            L + vp9_block2left[b], TX_4X4, dry_run);
-}
-
-static void stuff_mb_8x8_4x4uv(VP9_COMP *cpi, MACROBLOCKD *xd,
-                               TOKENEXTRA **t, int dry_run) {
-  ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)xd->above_context;
-  ENTROPY_CONTEXT *L = (ENTROPY_CONTEXT *)xd->left_context;
-  int b;
-
-  for (b = 0; b < 16; b += 4) {
-    stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_Y_WITH_DC,
-            A + vp9_block2above_8x8[b], L + vp9_block2left_8x8[b],
-            TX_8X8, dry_run);
-    A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
-    L[vp9_block2left_8x8[b] + 1]  = L[vp9_block2left_8x8[b]];
-  }
-
-  for (b = 16; b < 24; b++)
-    stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV, A + vp9_block2above[b],
-            L + vp9_block2left[b], TX_4X4, dry_run);
-}
-
-void vp9_stuff_mb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run) {
-  TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
-  TOKENEXTRA * const t_backup = *t;
-
-  if (tx_size == TX_16X16) {
-    stuff_mb_16x16(cpi, xd, t, dry_run);
-  } else if (tx_size == TX_8X8) {
-    if (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
-        xd->mode_info_context->mbmi.mode == SPLITMV) {
-      stuff_mb_8x8_4x4uv(cpi, xd, t, dry_run);
-    } else {
-      stuff_mb_8x8(cpi, xd, t, dry_run);
-    }
-  } else {
-    stuff_mb_4x4(cpi, xd, t, dry_run);
-  }
-
-  if (dry_run) {
-    *t = t_backup;
-  }
-}
-
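The dry_run path above is worth making explicit: vp9_stuff_mb() writes EOB tokens through *t and then rewinds the pointer to t_backup, and stuff_b() skips the count update, so a dry run has no lasting side effects. A minimal calling sketch in C (tok_buffer is a hypothetical TOKENEXTRA array, not a name from this patch):

    TOKENEXTRA *t = tok_buffer;    /* hypothetical token buffer */
    vp9_stuff_mb(cpi, xd, &t, 1);  /* dry run: t is rewound on return,
                                      coefficient counts untouched     */
    vp9_stuff_mb(cpi, xd, &t, 0);  /* real run: t advances past the
                                      tokens and counts are updated    */
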
-void vp9_fix_contexts(MACROBLOCKD *xd) {
-  /* Clear entropy contexts for Y2 blocks */
-  if ((xd->mode_info_context->mbmi.mode != B_PRED
-      && xd->mode_info_context->mbmi.mode != I8X8_PRED
-      && xd->mode_info_context->mbmi.mode != SPLITMV)
-      || xd->mode_info_context->mbmi.txfm_size == TX_16X16
-      ) {
-    vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
-  } else {
-    vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) - 1);
-    vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) - 1);
-  }
-}
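The sizeof(ENTROPY_CONTEXT_PLANES) - 1 in the else-branch makes sense given the layout this code assumes: the Y2 context is the final byte of the plane struct, so subtracting one clears the Y1/U/V contexts while leaving the Y2 context intact for the next macroblock that actually has a second-order block. A sketch of that assumed layout (field names as in the VP8-era entropy.h; treat as illustrative):

    typedef char ENTROPY_CONTEXT;
    typedef struct {
      ENTROPY_CONTEXT y1[4];    /* four 4x4 luma columns            */
      ENTROPY_CONTEXT u[2];     /* two columns per chroma plane     */
      ENTROPY_CONTEXT v[2];
      ENTROPY_CONTEXT y2[1];    /* second-order block: the last byte */
    } ENTROPY_CONTEXT_PLANES;   /* sizeof == 9; sizeof - 1 spares y2 */
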
--- a/vp8/encoder/tokenize.h
+++ /dev/null
@@ -1,59 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef tokenize_h
-#define tokenize_h
-
-#include "vp8/common/entropy.h"
-#include "block.h"
-
-void vp9_tokenize_initialize();
-
-typedef struct {
-  short Token;
-  short Extra;
-} TOKENVALUE;
-
-typedef struct {
-  const vp9_prob *context_tree;
-  short           Extra;
-  unsigned char   Token;
-  unsigned char   skip_eob_node;
-} TOKENEXTRA;
-
-int rd_cost_mby(MACROBLOCKD *);
-
-extern int vp9_mby_is_skippable_4x4(MACROBLOCKD *xd, int has_y2_block);
-extern int vp9_mbuv_is_skippable_4x4(MACROBLOCKD *xd);
-extern int vp9_mby_is_skippable_8x8(MACROBLOCKD *xd, int has_y2_block);
-extern int vp9_mbuv_is_skippable_8x8(MACROBLOCKD *xd);
-extern int vp9_mby_is_skippable_16x16(MACROBLOCKD *xd);
-
-#ifdef ENTROPY_STATS
-void init_context_counters();
-void print_context_counters();
-
-extern INT64 context_counters[BLOCK_TYPES][COEF_BANDS]
-                             [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
-extern INT64 context_counters_8x8[BLOCK_TYPES_8X8][COEF_BANDS]
-                                 [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
-extern INT64 context_counters_16x16[BLOCK_TYPES_16X16][COEF_BANDS]
-                                   [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
-#endif
-
-extern const int *vp9_dct_value_cost_ptr;
-/* TODO: The Token field should be broken out into a separate char array to
- *  improve cache locality, since it's needed for costing when the rest of the
- *  fields are not.
- */
-extern const TOKENVALUE *vp9_dct_value_tokens_ptr;
-
-#endif  /* tokenize_h */
--- a/vp8/encoder/treewriter.c
+++ /dev/null
@@ -1,39 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "treewriter.h"
-
-static void cost(
-  int *const C,
-  vp9_tree T,
-  const vp9_prob *const P,
-  int i,
-  int c
-) {
-  const vp9_prob p = P [i >> 1];
-
-  do {
-    const vp9_tree_index j = T[i];
-    const int d = c + vp9_cost_bit(p, i & 1);
-
-    if (j <= 0)
-      C[-j] = d;
-    else
-      cost(C, T, P, j, d);
-  } while (++i & 1);
-}
-void vp9_cost_tokens(int *c, const vp9_prob *p, vp9_tree t) {
-  cost(c, t, p, 0, 0);
-}
-
-void vp9_cost_tokens_skip(int *c, const vp9_prob *p, vp9_tree t) {
-  cost(c, t, p, 2, 0);
-}
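cost() leans on the vp9_tree convention (from treecoder.h, not shown in this hunk): non-positive entries are negated token indices, i.e. leaves, and positive entries give the index of a child pair, with P[i >> 1] supplying the probability for each pair. A self-contained toy under that assumption, using vp9_cost_bit as defined in treewriter.h below:

    /* Three symbols: the root bit picks symbol 0 (leaf -0 == 0) or
       descends to the inner node at indices 2..3, whose bit picks
       symbol 1 or symbol 2. */
    static const vp9_tree_index toy[4] = { 0, 2, -1, -2 };
    vp9_prob probs[2] = { 200, 128 };   /* P[0]: root, P[1]: inner node */
    int costs[3];

    vp9_cost_tokens(costs, probs, toy);
    /* costs[0] == vp9_cost_bit(200, 0)
       costs[1] == vp9_cost_bit(200, 1) + vp9_cost_bit(128, 0)
       costs[2] == vp9_cost_bit(200, 1) + vp9_cost_bit(128, 1) */
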
--- a/vp8/encoder/treewriter.h
+++ /dev/null
@@ -1,108 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_TREEWRITER_H
-#define __INC_TREEWRITER_H
-
-/* Trees map alphabets into Huffman-like codes suitable for an arithmetic
-   bit coder.  Timothy S Murphy  11 October 2004 */
-
-#include "vp8/common/treecoder.h"
-
-#include "boolhuff.h"       /* for now */
-
-typedef BOOL_CODER vp9_writer;
-
-#define vp9_write encode_bool
-#define vp9_write_literal vp9_encode_value
-#define vp9_write_bit(W, V) vp9_write(W, V, vp9_prob_half)
-
-/* Approximate length of an encoded bool in 256ths of a bit at given prob */
-
-#define vp9_cost_zero(x) (vp9_prob_cost[x])
-#define vp9_cost_one(x) vp9_cost_zero(vp9_complement(x))
-
-#define vp9_cost_bit(x, b) vp9_cost_zero((b) ? vp9_complement(x) : (x))
-
-/* VP8BC version is scaled by 2^20 rather than 2^8; see bool_coder.h */
-
-
-/* Both of these return bits, not scaled bits. */
-
-static __inline unsigned int cost_branch(const unsigned int ct[2],
-                                         vp9_prob p) {
-  /* Imitate existing calculation */
-  return ((ct[0] * vp9_cost_zero(p))
-          + (ct[1] * vp9_cost_one(p))) >> 8;
-}
-
-static __inline unsigned int cost_branch256(const unsigned int ct[2],
-                                            vp9_prob p) {
-  /* Imitate existing calculation */
-  return ((ct[0] * vp9_cost_zero(p))
-          + (ct[1] * vp9_cost_one(p)));
-}
-
-/* Small functions to write explicit values and tokens, as well as
-   estimate their lengths. */
-
-static __inline void treed_write(vp9_writer *const w,
-                                 vp9_tree t,
-                                 const vp9_prob *const p,
-                                 int v,
-                                 /* number of bits in v, assumed nonzero */
-                                 int n) {
-  vp9_tree_index i = 0;
-
-  do {
-    const int b = (v >> --n) & 1;
-    vp9_write(w, b, p[i >> 1]);
-    i = t[i + b];
-  } while (n);
-}
-
-static __inline void write_token(vp9_writer *const w,
-                                 vp9_tree t,
-                                 const vp9_prob *const p,
-                                 vp9_token *const x) {
-  treed_write(w, t, p, x->value, x->Len);
-}
-
-static __inline int treed_cost(vp9_tree t,
-                               const vp9_prob *const p,
-                               int v,
-                               /* number of bits in v, assumed nonzero */
-                               int n) {
-  int c = 0;
-  vp9_tree_index i = 0;
-
-  do {
-    const int b = (v >> --n) & 1;
-    c += vp9_cost_bit(p[i >> 1], b);
-    i = t[i + b];
-  } while (n);
-
-  return c;
-}
-
-static __inline int cost_token(vp9_tree t,
-                               const vp9_prob *const p,
-                               vp9_token *const x) {
-  return treed_cost(t, p, x->value, x->Len);
-}
-
-/* Fill array of costs for all possible token values. */
-
-void vp9_cost_tokens(int *Costs, const vp9_prob *, vp9_tree);
-
-void vp9_cost_tokens_skip(int *c, const vp9_prob *p, vp9_tree t);
-
-#endif
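Since vp9_cost_zero() is expressed in 256ths of a bit, cost_branch() scales a branch-count pair back down to whole bits while cost_branch256() keeps the fine-grained units. Assuming vp9_prob_cost[128] == 256 (probability one-half costs exactly one bit), a quick sanity check:

    unsigned int ct[2] = { 100, 100 };  /* 200 bools observed at p == 128 */
    /* cost_branch(ct, 128)    == (100 * 256 + 100 * 256) >> 8 == 200 bits
       cost_branch256(ct, 128) == 51200, the same total in 1/256-bit units */
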
--- a/vp8/encoder/variance.h
+++ /dev/null
@@ -1,84 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VARIANCE_H
-#define VARIANCE_H
-
-typedef unsigned int (*vp9_sad_fn_t)(const unsigned char *src_ptr,
-                                    int source_stride,
-                                    const unsigned char *ref_ptr,
-                                    int ref_stride,
-                                    unsigned int max_sad);
-
-typedef void (*vp9_copy32xn_fn_t)(const unsigned char *src_ptr,
-                                  int source_stride,
-                                  const unsigned char *ref_ptr,
-                                  int ref_stride,
-                                  int n);
-
-typedef void (*vp9_sad_multi_fn_t)(const unsigned char *src_ptr,
-                                   int source_stride,
-                                   const unsigned char *ref_ptr,
-                                   int  ref_stride,
-                                   unsigned int *sad_array);
-
-typedef void (*vp9_sad_multi1_fn_t)(const unsigned char *src_ptr,
-                                    int source_stride,
-                                    const unsigned char *ref_ptr,
-                                    int  ref_stride,
-                                    unsigned short *sad_array);
-
-typedef void (*vp9_sad_multi_d_fn_t)(const unsigned char *src_ptr,
-                                     int source_stride,
-                                     const unsigned char * const ref_ptr[],
-                                     int  ref_stride, unsigned int *sad_array);
-
-typedef unsigned int (*vp9_variance_fn_t)(const unsigned char *src_ptr,
-                                          int source_stride,
-                                          const unsigned char *ref_ptr,
-                                          int ref_stride,
-                                          unsigned int *sse);
-
-typedef unsigned int (*vp9_subpixvariance_fn_t)(const unsigned char  *src_ptr,
-                                                int source_stride,
-                                                int xoffset,
-                                                int yoffset,
-                                                const unsigned char *ref_ptr,
-                                                int ref_stride,
-                                                unsigned int *sse);
-
-typedef void (*vp9_ssimpf_fn_t)(unsigned char *s, int sp, unsigned char *r,
-                                int rp, unsigned long *sum_s,
-                                unsigned long *sum_r, unsigned long *sum_sq_s,
-                                unsigned long *sum_sq_r,
-                                unsigned long *sum_sxr);
-
-typedef unsigned int (*vp9_getmbss_fn_t)(const short *);
-
-typedef unsigned int (*vp9_get16x16prederror_fn_t)(const unsigned char *src_ptr,
-                                                   int source_stride,
-                                                   const unsigned char *ref_ptr,
-                                                   int  ref_stride);
-
-typedef struct variance_vtable {
-    vp9_sad_fn_t            sdf;
-    vp9_variance_fn_t       vf;
-    vp9_subpixvariance_fn_t svf;
-    vp9_variance_fn_t       svf_halfpix_h;
-    vp9_variance_fn_t       svf_halfpix_v;
-    vp9_variance_fn_t       svf_halfpix_hv;
-    vp9_sad_multi_fn_t      sdx3f;
-    vp9_sad_multi1_fn_t     sdx8f;
-    vp9_sad_multi_d_fn_t    sdx4df;
-    vp9_copy32xn_fn_t       copymem;
-} vp9_variance_fn_ptr_t;
-
-#endif
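For orientation, a sketch of wiring this vtable to the plain-C 16x16 routines that appear in variance_c.c later in this patch. The SAD and multi-SAD slots are left null here because no such functions are visible in this hunk:

    vp9_variance_fn_ptr_t fn16x16 = { 0 };  /* sdf/sdx3f/... stay NULL */
    fn16x16.vf             = vp9_variance16x16_c;
    fn16x16.svf            = vp9_sub_pixel_variance16x16_c;
    fn16x16.svf_halfpix_h  = vp9_variance_halfpixvar16x16_h_c;
    fn16x16.svf_halfpix_v  = vp9_variance_halfpixvar16x16_v_c;
    fn16x16.svf_halfpix_hv = vp9_variance_halfpixvar16x16_hv_c;
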
--- a/vp8/encoder/variance_c.c
+++ /dev/null
@@ -1,540 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "variance.h"
-#include "vp8/common/filter.h"
-
-
-unsigned int vp9_get_mb_ss_c(const short *src_ptr) {
-  unsigned int i, sum = 0;
-
-  for (i = 0; i < 256; i++) {
-    sum += (src_ptr[i] * src_ptr[i]);
-  }
-
-  return sum;
-}
-
-
-static void variance(const unsigned char *src_ptr,
-                     int  source_stride,
-                     const unsigned char *ref_ptr,
-                     int  recon_stride,
-                     int  w,
-                     int  h,
-                     unsigned int *sse,
-                     int *sum) {
-  int i, j;
-  int diff;
-
-  *sum = 0;
-  *sse = 0;
-
-  for (i = 0; i < h; i++) {
-    for (j = 0; j < w; j++) {
-      diff = src_ptr[j] - ref_ptr[j];
-      *sum += diff;
-      *sse += diff * diff;
-    }
-
-    src_ptr += source_stride;
-    ref_ptr += recon_stride;
-  }
-}
-
-#if CONFIG_SUPERBLOCKS
-unsigned int vp9_variance32x32_c(const unsigned char *src_ptr,
-                                 int  source_stride,
-                                 const unsigned char *ref_ptr,
-                                 int  recon_stride,
-                                 unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32, &var, &avg);
-  *sse = var;
-  return (var - ((avg * avg) >> 10));
-}
-#endif
-
-unsigned int vp9_variance16x16_c(const unsigned char *src_ptr,
-                                 int  source_stride,
-                                 const unsigned char *ref_ptr,
-                                 int  recon_stride,
-                                 unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
-  *sse = var;
-  return (var - ((avg * avg) >> 8));
-}
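A note on the pattern these wrappers share: despite its name, avg receives the raw sum of pixel differences from variance(), so each function returns sse - sum^2 / N with the shift encoding the pixel count N (>> 8 for 16x16's 256 pixels, >> 7 for the 128-pixel 8x16 and 16x8 cases, >> 10 for 32x32's 1024, and so on below). For the 32x32 case the sum can reach 255 * 1024 = 261120, whose square exceeds a 32-bit int, so a wider intermediate would be needed there for strict correctness.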
-
-unsigned int vp9_variance8x16_c(const unsigned char *src_ptr,
-                                int  source_stride,
-                                const unsigned char *ref_ptr,
-                                int  recon_stride,
-                                unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg);
-  *sse = var;
-  return (var - ((avg * avg) >> 7));
-}
-
-unsigned int vp9_variance16x8_c(const unsigned char *src_ptr,
-                                int  source_stride,
-                                const unsigned char *ref_ptr,
-                                int  recon_stride,
-                                unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg);
-  *sse = var;
-  return (var - ((avg * avg) >> 7));
-}
-
-
-unsigned int vp9_variance8x8_c(const unsigned char *src_ptr,
-                               int  source_stride,
-                               const unsigned char *ref_ptr,
-                               int  recon_stride,
-                               unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg);
-  *sse = var;
-  return (var - ((avg * avg) >> 6));
-}
-
-unsigned int vp9_variance4x4_c(const unsigned char *src_ptr,
-                               int  source_stride,
-                               const unsigned char *ref_ptr,
-                               int  recon_stride,
-                               unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, &var, &avg);
-  *sse = var;
-  return (var - ((avg * avg) >> 4));
-}
-
-
-unsigned int vp9_mse16x16_c(const unsigned char *src_ptr,
-                            int  source_stride,
-                            const unsigned char *ref_ptr,
-                            int  recon_stride,
-                            unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
-  *sse = var;
-  return var;
-}
-
-
-/****************************************************************************
- *
- *  ROUTINE       : filter_block2d_bil_first_pass
- *
- *  INPUTS        : UINT8  *src_ptr          : Pointer to source block.
- *                  UINT32 src_pixels_per_line : Stride of input block.
- *                  UINT32 pixel_step        : Offset between filter input samples (see notes).
- *                  UINT32 output_height     : Input block height.
- *                  UINT32 output_width      : Input block width.
- *                  INT16  *vp9_filter          : Array of 2 bi-linear filter taps.
- *
- *  OUTPUTS       : UINT16 *output_ptr       : Pointer to filtered block.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in
- *                  either horizontal or vertical direction to produce the
- *                  filtered output block. Used to implement first-pass
- *                  of 2-D separable filter.
- *
- *  SPECIAL NOTES : Produces UINT16 output to retain precision for next pass.
- *                  Two filter taps should sum to VP9_FILTER_WEIGHT.
- *                  pixel_step defines whether the filter is applied
- *                  horizontally (pixel_step=1) or vertically (pixel_step=stride).
- *                  It defines the offset required to move from one input
- *                  to the next.
- *
- ****************************************************************************/
-static void var_filter_block2d_bil_first_pass(const unsigned char *src_ptr,
-                                              unsigned short *output_ptr,
-                                              unsigned int src_pixels_per_line,
-                                              int pixel_step,
-                                              unsigned int output_height,
-                                              unsigned int output_width,
-                                              const short *vp9_filter) {
-  unsigned int i, j;
-
-  for (i = 0; i < output_height; i++) {
-    for (j = 0; j < output_width; j++) {
-      // Apply bilinear filter
-      output_ptr[j] = (((int)src_ptr[0]          * vp9_filter[0]) +
-                       ((int)src_ptr[pixel_step] * vp9_filter[1]) +
-                       (VP9_FILTER_WEIGHT / 2)) >> VP9_FILTER_SHIFT;
-      src_ptr++;
-    }
-
-    // Next row...
-    src_ptr    += src_pixels_per_line - output_width;
-    output_ptr += output_width;
-  }
-}
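A worked sample of the tap arithmetic above: the two taps must sum to VP9_FILTER_WEIGHT (128 in this codebase, with VP9_FILTER_SHIFT == 7), and {96, 32} is one plausible pair, used here purely for illustration:

    int s0 = 100, s1 = 108;                        /* adjacent input pixels */
    int out = (s0 * 96 + s1 * 32 + 128 / 2) >> 7;  /* 13120 >> 7 == 102,
                                                      i.e. round(102.5) down */
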
-
-/****************************************************************************
- *
- *  ROUTINE       : filter_block2d_bil_second_pass
- *
- *  INPUTS        : UINT16 *src_ptr          : Pointer to source block.
- *                  UINT32 src_pixels_per_line : Stride of input block.
- *                  UINT32 pixel_step        : Offset between filter input samples (see notes).
- *                  UINT32 output_height     : Input block height.
- *                  UINT32 output_width      : Input block width.
- *                  INT16  *vp9_filter          : Array of 2 bi-linear filter taps.
- *
- *  OUTPUTS       : UINT8  *output_ptr       : Pointer to filtered block.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in
- *                  either horizontal or vertical direction to produce the
- *                  filtered output block. Used to implement second-pass
- *                  of 2-D separable filter.
- *
- *  SPECIAL NOTES : Requires 16-bit input as produced by filter_block2d_bil_first_pass.
- *                  Two filter taps should sum to VP9_FILTER_WEIGHT.
- *                  pixel_step defines whether the filter is applied
- *                  horizontally (pixel_step=1) or vertically (pixel_step=stride).
- *                  It defines the offset required to move from one input
- *                  to the next.
- *
- ****************************************************************************/
-static void var_filter_block2d_bil_second_pass(const unsigned short *src_ptr,
-                                               unsigned char *output_ptr,
-                                               unsigned int src_pixels_per_line,
-                                               unsigned int pixel_step,
-                                               unsigned int output_height,
-                                               unsigned int output_width,
-                                               const short *vp9_filter) {
-  unsigned int  i, j;
-  int  Temp;
-
-  for (i = 0; i < output_height; i++) {
-    for (j = 0; j < output_width; j++) {
-      // Apply filter
-      Temp = ((int)src_ptr[0]          * vp9_filter[0]) +
-             ((int)src_ptr[pixel_step] * vp9_filter[1]) +
-             (VP9_FILTER_WEIGHT / 2);
-      output_ptr[j] = (unsigned char)(Temp >> VP9_FILTER_SHIFT);
-      src_ptr++;
-    }
-
-    // Next row...
-    src_ptr    += src_pixels_per_line - output_width;
-    output_ptr += output_width;
-  }
-}
-
-
-unsigned int vp9_sub_pixel_variance4x4_c(const unsigned char  *src_ptr,
-                                         int  src_pixels_per_line,
-                                         int  xoffset,
-                                         int  yoffset,
-                                         const unsigned char *dst_ptr,
-                                         int dst_pixels_per_line,
-                                         unsigned int *sse) {
-  unsigned char  temp2[20 * 16];
-  const short *HFilter, *VFilter;
-  unsigned short FData3[5 * 4]; // Temp data buffer used in filtering
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  // First filter 1-D horizontally
-  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 5, 4, HFilter);
-
-  // Now filter vertically
-  var_filter_block2d_bil_second_pass(FData3, temp2, 4,  4,  4,  4, VFilter);
-
-  return vp9_variance4x4_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse);
-}
-
-
-unsigned int vp9_sub_pixel_variance8x8_c(const unsigned char  *src_ptr,
-                                         int  src_pixels_per_line,
-                                         int  xoffset,
-                                         int  yoffset,
-                                         const unsigned char *dst_ptr,
-                                         int dst_pixels_per_line,
-                                         unsigned int *sse) {
-  unsigned short FData3[9 * 8]; // Temp data buffer used in filtering
-  unsigned char  temp2[20 * 16];
-  const short *HFilter, *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter);
-  var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter);
-
-  return vp9_variance8x8_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
-}
-
-unsigned int vp9_sub_pixel_variance16x16_c(const unsigned char  *src_ptr,
-                                           int  src_pixels_per_line,
-                                           int  xoffset,
-                                           int  yoffset,
-                                           const unsigned char *dst_ptr,
-                                           int dst_pixels_per_line,
-                                           unsigned int *sse) {
-  unsigned short FData3[17 * 16]; // Temp data buffer used in filtering
-  unsigned char  temp2[20 * 16];
-  const short *HFilter, *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter);
-  var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter);
-
-  return vp9_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
-}
-
-#if CONFIG_SUPERBLOCKS
-unsigned int vp9_sub_pixel_variance32x32_c(const unsigned char  *src_ptr,
-                                           int  src_pixels_per_line,
-                                           int  xoffset,
-                                           int  yoffset,
-                                           const unsigned char *dst_ptr,
-                                           int dst_pixels_per_line,
-                                           unsigned int *sse) {
-  unsigned short FData3[33 * 32]; // Temp data buffer used in filtering
-  unsigned char  temp2[36 * 32];
-  const short *HFilter, *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 33, 32, HFilter);
-  var_filter_block2d_bil_second_pass(FData3, temp2, 32, 32, 32, 32, VFilter);
-
-  return vp9_variance32x32_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
-}
-#endif
-
-unsigned int vp9_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr,
-                                              int  source_stride,
-                                              const unsigned char *ref_ptr,
-                                              int  recon_stride,
-                                              unsigned int *sse) {
-  return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, 8, 0,
-                                       ref_ptr, recon_stride, sse);
-}
-
-#if CONFIG_SUPERBLOCKS
-unsigned int vp9_variance_halfpixvar32x32_h_c(const unsigned char *src_ptr,
-                                              int  source_stride,
-                                              const unsigned char *ref_ptr,
-                                              int  recon_stride,
-                                              unsigned int *sse) {
-  return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 8, 0,
-                                       ref_ptr, recon_stride, sse);
-}
-#endif
-
-
-unsigned int vp9_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr,
-                                              int  source_stride,
-                                              const unsigned char *ref_ptr,
-                                              int  recon_stride,
-                                              unsigned int *sse) {
-  return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, 0, 8,
-                                       ref_ptr, recon_stride, sse);
-}
-
-#if CONFIG_SUPERBLOCKS
-unsigned int vp9_variance_halfpixvar32x32_v_c(const unsigned char *src_ptr,
-                                              int  source_stride,
-                                              const unsigned char *ref_ptr,
-                                              int  recon_stride,
-                                              unsigned int *sse) {
-  return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 0, 8,
-                                       ref_ptr, recon_stride, sse);
-}
-#endif
-
-unsigned int vp9_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr,
-                                               int  source_stride,
-                                               const unsigned char *ref_ptr,
-                                               int  recon_stride,
-                                               unsigned int *sse) {
-  return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, 8, 8,
-                                       ref_ptr, recon_stride, sse);
-}
-
-#if CONFIG_SUPERBLOCKS
-unsigned int vp9_variance_halfpixvar32x32_hv_c(const unsigned char *src_ptr,
-                                               int  source_stride,
-                                               const unsigned char *ref_ptr,
-                                               int  recon_stride,
-                                               unsigned int *sse) {
-  return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 8, 8,
-                                       ref_ptr, recon_stride, sse);
-}
-#endif
-
-unsigned int vp9_sub_pixel_mse16x16_c(const unsigned char  *src_ptr,
-                                      int  src_pixels_per_line,
-                                      int  xoffset,
-                                      int  yoffset,
-                                      const unsigned char *dst_ptr,
-                                      int dst_pixels_per_line,
-                                      unsigned int *sse) {
-  vp9_sub_pixel_variance16x16_c(src_ptr, src_pixels_per_line,
-                                xoffset, yoffset, dst_ptr,
-                                dst_pixels_per_line, sse);
-  return *sse;
-}
-
-#if CONFIG_SUPERBLOCKS
-unsigned int vp9_sub_pixel_mse32x32_c(const unsigned char  *src_ptr,
-                                      int  src_pixels_per_line,
-                                      int  xoffset,
-                                      int  yoffset,
-                                      const unsigned char *dst_ptr,
-                                      int dst_pixels_per_line,
-                                      unsigned int *sse) {
-  vp9_sub_pixel_variance32x32_c(src_ptr, src_pixels_per_line,
-                                xoffset, yoffset, dst_ptr,
-                                dst_pixels_per_line, sse);
-  return *sse;
-}
-#endif
-
-unsigned int vp9_sub_pixel_variance16x8_c(const unsigned char  *src_ptr,
-                                          int  src_pixels_per_line,
-                                          int  xoffset,
-                                          int  yoffset,
-                                          const unsigned char *dst_ptr,
-                                          int dst_pixels_per_line,
-                                          unsigned int *sse) {
-  unsigned short FData3[16 * 9];  // Temp data buffer used in filtering
-  unsigned char  temp2[20 * 16];
-  const short *HFilter, *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter);
-  var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter);
-
-  return vp9_variance16x8_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
-}
-
-unsigned int vp9_sub_pixel_variance8x16_c(const unsigned char  *src_ptr,
-                                          int  src_pixels_per_line,
-                                          int  xoffset,
-                                          int  yoffset,
-                                          const unsigned char *dst_ptr,
-                                          int dst_pixels_per_line,
-                                          unsigned int *sse) {
-  unsigned short FData3[9 * 16];  // Temp data buffer used in filtering
-  unsigned char  temp2[20 * 16];
-  const short *HFilter, *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line,
-                                    1, 17, 8, HFilter);
-  var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 16, 8, VFilter);
-
-  return vp9_variance8x16_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
-}
-
-#if CONFIG_NEWBESTREFMV
-unsigned int vp9_variance2x16_c(const unsigned char *src_ptr,
-                                const int  source_stride,
-                                const unsigned char *ref_ptr,
-                                const int  recon_stride,
-                                unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 2, 16, &var, &avg);
-  *sse = var;
-  return (var - ((avg * avg) >> 5));
-}
-
-unsigned int vp9_variance16x2_c(const unsigned char *src_ptr,
-                                const int  source_stride,
-                                const unsigned char *ref_ptr,
-                                const int  recon_stride,
-                                unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 2, &var, &avg);
-  *sse = var;
-  return (var - ((avg * avg) >> 5));
-}
-
-unsigned int vp9_sub_pixel_variance16x2_c(const unsigned char  *src_ptr,
-                                          const int  src_pixels_per_line,
-                                          const int  xoffset,
-                                          const int  yoffset,
-                                          const unsigned char *dst_ptr,
-                                          const int dst_pixels_per_line,
-                                          unsigned int *sse) {
-  unsigned short FData3[16 * 3];  // Temp data buffer used in filtering
-  unsigned char  temp2[20 * 16];
-  const short *HFilter, *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  var_filter_block2d_bil_first_pass(src_ptr, FData3,
-                                    src_pixels_per_line, 1, 3, 16, HFilter);
-  var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 2, 16, VFilter);
-
-  return vp9_variance16x2_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
-}
-
-unsigned int vp9_sub_pixel_variance2x16_c(const unsigned char  *src_ptr,
-                                          const int  src_pixels_per_line,
-                                          const int  xoffset,
-                                          const int  yoffset,
-                                          const unsigned char *dst_ptr,
-                                          const int dst_pixels_per_line,
-                                          unsigned int *sse) {
-  unsigned short FData3[2 * 17];  // Temp data buffer used in filtering
-  unsigned char  temp2[2 * 16];
-  const short *HFilter, *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  var_filter_block2d_bil_first_pass(src_ptr, FData3,
-                                    src_pixels_per_line, 1, 17, 2, HFilter);
-  var_filter_block2d_bil_second_pass(FData3, temp2, 2, 2, 16, 2, VFilter);
-
-  return vp9_variance2x16_c(temp2, 2, dst_ptr, dst_pixels_per_line, sse);
-}
-#endif
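Throughout the sub-pixel functions above, FData3 is sized (h + 1) rows by w columns: the horizontal first pass emits one extra row so that the vertical second pass, whose src_ptr[pixel_step] tap reads a full row ahead, stays in bounds on its last output row (17 x 16 for 16x16, 9 x 8 for 8x8, 3 x 16 for 16x2, 17 x 2 for 2x16).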
--- a/vp8/encoder/x86/dct_mmx.asm
+++ /dev/null
@@ -1,241 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_short_fdct4x4_mmx(short *input, short *output, int pitch)
-global sym(vp9_short_fdct4x4_mmx)
-sym(vp9_short_fdct4x4_mmx):
-    push        rbp
-    mov         rbp,        rsp
-    SHADOW_ARGS_TO_STACK 3
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi,        arg(0)      ; input
-        mov         rdi,        arg(1)      ; output
-
-        movsxd      rax,        dword ptr arg(2) ;pitch
-
-        lea         rcx,        [rsi + rax*2]
-        ; read the input data
-        movq        mm0,        [rsi]
-        movq        mm1,        [rsi + rax]
-
-        movq        mm2,        [rcx]
-        movq        mm4,        [rcx + rax]
-
-        ; transpose for the first stage
-        movq        mm3,        mm0         ; 00 01 02 03
-        movq        mm5,        mm2         ; 20 21 22 23
-
-        punpcklwd   mm0,        mm1         ; 00 10 01 11
-        punpckhwd   mm3,        mm1         ; 02 12 03 13
-
-        punpcklwd   mm2,        mm4         ; 20 30 21 31
-        punpckhwd   mm5,        mm4         ; 22 32 23 33
-
-        movq        mm1,        mm0         ; 00 10 01 11
-        punpckldq   mm0,        mm2         ; 00 10 20 30
-
-        punpckhdq   mm1,        mm2         ; 01 11 21 31
-
-        movq        mm2,        mm3         ; 02 12 03 13
-        punpckldq   mm2,        mm5         ; 02 12 22 32
-
-        punpckhdq   mm3,        mm5         ; 03 13 23 33
-
-        ; mm0 0
-        ; mm1 1
-        ; mm2 2
-        ; mm3 3
-
-        ; first stage
-        movq        mm5,        mm0
-        movq        mm4,        mm1
-
-        paddw       mm0,        mm3         ; a1 = 0 + 3
-        paddw       mm1,        mm2         ; b1 = 1 + 2
-
-        psubw       mm4,        mm2         ; c1 = 1 - 2
-        psubw       mm5,        mm3         ; d1 = 0 - 3
-
-        psllw       mm5,        3
-        psllw       mm4,        3
-
-        psllw       mm0,        3
-        psllw       mm1,        3
-
-        ; output 0 and 2
-        movq        mm2,        mm0         ; a1
-
-        paddw       mm0,        mm1         ; op[0] = a1 + b1
-        psubw       mm2,        mm1         ; op[2] = a1 - b1
-
-        ; output 1 and 3
-        ; interleave c1, d1
-        movq        mm1,        mm5         ; d1
-        punpcklwd   mm1,        mm4         ; c1 d1
-        punpckhwd   mm5,        mm4         ; c1 d1
-
-        movq        mm3,        mm1
-        movq        mm4,        mm5
-
-        pmaddwd     mm1,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
-        pmaddwd     mm4,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
-
-        pmaddwd     mm3,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
-        pmaddwd     mm5,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
-
-        paddd       mm1,        MMWORD PTR[GLOBAL(_14500)]
-        paddd       mm4,        MMWORD PTR[GLOBAL(_14500)]
-        paddd       mm3,        MMWORD PTR[GLOBAL(_7500)]
-        paddd       mm5,        MMWORD PTR[GLOBAL(_7500)]
-
-        psrad       mm1,        12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
-        psrad       mm4,        12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
-        psrad       mm3,        12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
-        psrad       mm5,        12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
-
-        packssdw    mm1,        mm4         ; op[1]
-        packssdw    mm3,        mm5         ; op[3]
-
-        ; done with vertical
-        ; transpose for the second stage
-        movq        mm4,        mm0         ; 00 10 20 30
-        movq        mm5,        mm2         ; 02 12 22 32
-
-        punpcklwd   mm0,        mm1         ; 00 01 10 11
-        punpckhwd   mm4,        mm1         ; 20 21 30 31
-
-        punpcklwd   mm2,        mm3         ; 02 03 12 13
-        punpckhwd   mm5,        mm3         ; 22 23 32 33
-
-        movq        mm1,        mm0         ; 00 01 10 11
-        punpckldq   mm0,        mm2         ; 00 01 02 03
-
-        punpckhdq   mm1,        mm2         ; 10 11 12 13
-
-        movq        mm2,        mm4         ; 20 21 30 31
-        punpckldq   mm2,        mm5         ; 20 21 22 23
-
-        punpckhdq   mm4,        mm5         ; 30 31 32 33
-
-        ; mm0 0
-        ; mm1 1
-        ; mm2 2
-        ; mm4 3
-
-        movq        mm5,        mm0
-        movq        mm3,        mm1
-
-        paddw       mm0,        mm4         ; a1 = 0 + 3
-        paddw       mm1,        mm2         ; b1 = 1 + 2
-
-        psubw       mm3,        mm2         ; c1 = 1 - 2
-        psubw       mm5,        mm4         ; d1 = 0 - 3
-
-        pxor        mm6,        mm6         ; zero out for compare
-
-        pcmpeqw     mm6,        mm5         ; d1 != 0
-
-        pandn       mm6,        MMWORD PTR[GLOBAL(_cmp_mask)]   ; clear upper,
-                                                                ; and keep bit 0 of lower
-
-        ; output 0 and 2
-        movq        mm2,        mm0         ; a1
-
-        paddw       mm0,        mm1         ; a1 + b1
-        psubw       mm2,        mm1         ; a1 - b1
-
-        paddw       mm0,        MMWORD PTR[GLOBAL(_7w)]
-        paddw       mm2,        MMWORD PTR[GLOBAL(_7w)]
-
-        psraw       mm0,        4           ; op[0] = (a1 + b1 + 7)>>4
-        psraw       mm2,        4           ; op[8] = (a1 - b1 + 7)>>4
-
-        movq        MMWORD PTR[rdi + 0 ],  mm0
-        movq        MMWORD PTR[rdi + 16],  mm2
-
-        ; output 1 and 3
-        ; interleave c1, d1
-        movq        mm1,        mm5         ; d1
-        punpcklwd   mm1,        mm3         ; c1 d1
-        punpckhwd   mm5,        mm3         ; c1 d1
-
-        movq        mm3,        mm1
-        movq        mm4,        mm5
-
-        pmaddwd     mm1,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
-        pmaddwd     mm4,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
-
-        pmaddwd     mm3,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
-        pmaddwd     mm5,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
-
-        paddd       mm1,        MMWORD PTR[GLOBAL(_12000)]
-        paddd       mm4,        MMWORD PTR[GLOBAL(_12000)]
-        paddd       mm3,        MMWORD PTR[GLOBAL(_51000)]
-        paddd       mm5,        MMWORD PTR[GLOBAL(_51000)]
-
-        psrad       mm1,        16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
-        psrad       mm4,        16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
-        psrad       mm3,        16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
-        psrad       mm5,        16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
-
-        packssdw    mm1,        mm4         ; op[4]
-        packssdw    mm3,        mm5         ; op[12]
-
-        paddw       mm1,        mm6         ; op[4] += (d1!=0)
-
-        movq        MMWORD PTR[rdi + 8 ],  mm1
-        movq        MMWORD PTR[rdi + 24],  mm3
-
-     ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 8
-_5352_2217:
-    dw 5352
-    dw 2217
-    dw 5352
-    dw 2217
-align 8
-_2217_neg5352:
-    dw 2217
-    dw -5352
-    dw 2217
-    dw -5352
-align 8
-_cmp_mask:
-    times 4 dw 1
-align 8
-_7w:
-    times 4 dw 7
-align 8
-_14500:
-    times 2 dd 14500
-align 8
-_7500:
-    times 2 dd 7500
-align 8
-_12000:
-    times 2 dd 12000
-align 8
-_51000:
-    times 2 dd 51000
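For readers cross-checking the annotated arithmetic, here is a scalar C sketch of the same 4x4 forward DCT: a row pass using the 2217/5352 rotation rounded at 14500/7500 and shifted by 12, then a column pass rounded at 12000/51000 and shifted by 16, with the (d1 != 0) correction on op[4] that the pcmpeqw/pandn/_cmp_mask sequence implements. It follows the classic VP8 C reference; treat it as an illustration of the math rather than part of this patch:

    /* pitch is in bytes, hence the pitch / 2 step over 16-bit samples. */
    static void short_fdct4x4_ref(short *input, short *output, int pitch) {
      int i;
      short *ip = input, *op = output;

      for (i = 0; i < 4; i++) {            /* row (horizontal) pass */
        int a1 = (ip[0] + ip[3]) * 8;      /* the psllw ..., 3 scaling */
        int b1 = (ip[1] + ip[2]) * 8;
        int c1 = (ip[1] - ip[2]) * 8;
        int d1 = (ip[0] - ip[3]) * 8;

        op[0] = a1 + b1;
        op[2] = a1 - b1;
        op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12;
        op[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12;

        ip += pitch / 2;
        op += 4;
      }

      ip = op = output;
      for (i = 0; i < 4; i++) {            /* column (vertical) pass */
        int a1 = ip[0] + ip[12];
        int b1 = ip[4] + ip[8];
        int c1 = ip[4] - ip[8];
        int d1 = ip[0] - ip[12];

        op[0] = (a1 + b1 + 7) >> 4;
        op[8] = (a1 - b1 + 7) >> 4;
        op[4] = ((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0);
        op[12] = (d1 * 2217 - c1 * 5352 + 51000) >> 16;

        ip++;
        op++;
      }
    }
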
--- a/vp8/encoder/x86/dct_sse2.asm
+++ /dev/null
@@ -1,432 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro STACK_FRAME_CREATE 0
-%if ABI_IS_32BIT
-  %define       input       rsi
-  %define       output      rdi
-  %define       pitch       rax
-    push        rbp
-    mov         rbp, rsp
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov         rsi, arg(0)
-    mov         rdi, arg(1)
-
-    movsxd      rax, dword ptr arg(2)
-    lea         rcx, [rsi + rax*2]
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    %define     input       rcx
-    %define     output      rdx
-    %define     pitch       r8
-    SAVE_XMM 7, u
-  %else
-    %define     input       rdi
-    %define     output      rsi
-    %define     pitch       rdx
-  %endif
-%endif
-%endmacro
-
-%macro STACK_FRAME_DESTROY 0
-  %define     input
-  %define     output
-  %define     pitch
-
-%if ABI_IS_32BIT
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    pop         rbp
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    RESTORE_XMM
-  %endif
-%endif
-    ret
-%endmacro
-
-;void vp9_short_fdct4x4_sse2(short *input, short *output, int pitch)
-global sym(vp9_short_fdct4x4_sse2)
-sym(vp9_short_fdct4x4_sse2):
-
-    STACK_FRAME_CREATE
-
-    movq        xmm0, MMWORD PTR[input        ] ;03 02 01 00
-    movq        xmm2, MMWORD PTR[input+  pitch] ;13 12 11 10
-    lea         input,          [input+2*pitch]
-    movq        xmm1, MMWORD PTR[input        ] ;23 22 21 20
-    movq        xmm3, MMWORD PTR[input+  pitch] ;33 32 31 30
-
-    punpcklqdq  xmm0, xmm2                      ;13 12 11 10 03 02 01 00
-    punpcklqdq  xmm1, xmm3                      ;33 32 31 30 23 22 21 20
-
-    movdqa      xmm2, xmm0
-    punpckldq   xmm0, xmm1                      ;23 22 03 02 21 20 01 00
-    punpckhdq   xmm2, xmm1                      ;33 32 13 12 31 30 11 10
-    movdqa      xmm1, xmm0
-    punpckldq   xmm0, xmm2                      ;31 21 30 20 11 10 01 00
-    pshufhw     xmm1, xmm1, 0b1h                ;22 23 02 03 xx xx xx xx
-    pshufhw     xmm2, xmm2, 0b1h                ;32 33 12 13 xx xx xx xx
-
-    punpckhdq   xmm1, xmm2                      ;32 33 22 23 12 13 02 03
-    movdqa      xmm3, xmm0
-    paddw       xmm0, xmm1                      ;b1 a1 b1 a1 b1 a1 b1 a1
-    psubw       xmm3, xmm1                      ;c1 d1 c1 d1 c1 d1 c1 d1
-    psllw       xmm0, 3                         ;b1 <<= 3 a1 <<= 3
-    psllw       xmm3, 3                         ;c1 <<= 3 d1 <<= 3
-
-    movdqa      xmm1, xmm0
-    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)]    ;a1 + b1
-    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)]    ;a1 - b1
-    movdqa      xmm4, xmm3
-    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]   ;c1*2217 + d1*5352
-    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)];d1*2217 - c1*5352
-
-    paddd       xmm3, XMMWORD PTR[GLOBAL(_14500)]
-    paddd       xmm4, XMMWORD PTR[GLOBAL(_7500)]
-    psrad       xmm3, 12            ;(c1 * 2217 + d1 * 5352 +  14500)>>12
-    psrad       xmm4, 12            ;(d1 * 2217 - c1 * 5352 +   7500)>>12
-
-    packssdw    xmm0, xmm1                      ;op[2] op[0]
-    packssdw    xmm3, xmm4                      ;op[3] op[1]
-    ; 23 22 21 20 03 02 01 00
-    ;
-    ; 33 32 31 30 13 12 11 10
-    ;
-    movdqa      xmm2, xmm0
-    punpcklqdq  xmm0, xmm3                      ;13 12 11 10 03 02 01 00
-    punpckhqdq  xmm2, xmm3                      ;23 22 21 20 33 32 31 30
-
-    movdqa      xmm3, xmm0
-    punpcklwd   xmm0, xmm2                      ;32 30 22 20 12 10 02 00
-    punpckhwd   xmm3, xmm2                      ;33 31 23 21 13 11 03 01
-    movdqa      xmm2, xmm0
-    punpcklwd   xmm0, xmm3                      ;13 12 11 10 03 02 01 00
-    punpckhwd   xmm2, xmm3                      ;33 32 31 30 23 22 21 20
-
-    movdqa      xmm5, XMMWORD PTR[GLOBAL(_7)]
-    pshufd      xmm2, xmm2, 04eh
-    movdqa      xmm3, xmm0
-    paddw       xmm0, xmm2                      ;b1 b1 b1 b1 a1 a1 a1 a1
-    psubw       xmm3, xmm2                      ;c1 c1 c1 c1 d1 d1 d1 d1
-
-    pshufd      xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 b1 a1 a1
-    movdqa      xmm2, xmm3                      ;save d1 for compare
-    pshufd      xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 c1 d1 d1
-    pshuflw     xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 a1 b1 a1
-    pshuflw     xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 d1 c1 d1
-    pshufhw     xmm0, xmm0, 0d8h                ;b1 a1 b1 a1 b1 a1 b1 a1
-    pshufhw     xmm3, xmm3, 0d8h                ;c1 d1 c1 d1 c1 d1 c1 d1
-    movdqa      xmm1, xmm0
-    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1
-    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1
-
-    pxor        xmm4, xmm4                      ;zero out for compare
-    paddd       xmm0, xmm5
-    paddd       xmm1, xmm5
-    pcmpeqw     xmm2, xmm4
-    psrad       xmm0, 4                         ;(a1 + b1 + 7)>>4
-    psrad       xmm1, 4                         ;(a1 - b1 + 7)>>4
-    pandn       xmm2, XMMWORD PTR[GLOBAL(_cmp_mask)] ;clear upper,
-                                                     ;and keep bit 0 of lower
-
-    movdqa      xmm4, xmm3
-    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]    ;c1*2217 + d1*5352
-    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352
-    paddd       xmm3, XMMWORD PTR[GLOBAL(_12000)]
-    paddd       xmm4, XMMWORD PTR[GLOBAL(_51000)]
-    packssdw    xmm0, xmm1                      ;op[8] op[0]
-    psrad       xmm3, 16                ;(c1 * 2217 + d1 * 5352 +  12000)>>16
-    psrad       xmm4, 16                ;(d1 * 2217 - c1 * 5352 +  51000)>>16
-
-    packssdw    xmm3, xmm4                      ;op[12] op[4]
-    movdqa      xmm1, xmm0
-    paddw       xmm3, xmm2                      ;op[4] += (d1!=0)
-    punpcklqdq  xmm0, xmm3                      ;op[4] op[0]
-    punpckhqdq  xmm1, xmm3                      ;op[12] op[8]
-
-    movdqa      XMMWORD PTR[output +  0], xmm0
-    movdqa      XMMWORD PTR[output + 16], xmm1
-
-    STACK_FRAME_DESTROY
-
-;void vp9_short_fdct8x4_sse2(short *input, short *output, int pitch)
-global sym(vp9_short_fdct8x4_sse2)
-sym(vp9_short_fdct8x4_sse2):
-
-    STACK_FRAME_CREATE
-
-        ; read the input data
-        movdqa      xmm0,       [input        ]
-        movdqa      xmm2,       [input+  pitch]
-        lea         input,      [input+2*pitch]
-        movdqa      xmm4,       [input        ]
-        movdqa      xmm3,       [input+  pitch]
-
-        ; transpose for the first stage
-        movdqa      xmm1,       xmm0        ; 00 01 02 03 04 05 06 07
-        movdqa      xmm5,       xmm4        ; 20 21 22 23 24 25 26 27
-
-        punpcklwd   xmm0,       xmm2        ; 00 10 01 11 02 12 03 13
-        punpckhwd   xmm1,       xmm2        ; 04 14 05 15 06 16 07 17
-
-        punpcklwd   xmm4,       xmm3        ; 20 30 21 31 22 32 23 33
-        punpckhwd   xmm5,       xmm3        ; 24 34 25 35 26 36 27 37
-
-        movdqa      xmm2,       xmm0        ; 00 10 01 11 02 12 03 13
-        punpckldq   xmm0,       xmm4        ; 00 10 20 30 01 11 21 31
-
-        punpckhdq   xmm2,       xmm4        ; 02 12 22 32 03 13 23 33
-
-        movdqa      xmm4,       xmm1        ; 04 14 05 15 06 16 07 17
-        punpckldq   xmm4,       xmm5        ; 04 14 24 34 05 15 25 35
-
-        punpckhdq   xmm1,       xmm5        ; 06 16 26 36 07 17 27 37
-        movdqa      xmm3,       xmm2        ; 02 12 22 32 03 13 23 33
-
-        punpckhqdq  xmm3,       xmm1        ; 03 13 23 33 07 17 27 37
-        punpcklqdq  xmm2,       xmm1        ; 02 12 22 32 06 16 26 36
-
-        movdqa      xmm1,       xmm0        ; 00 10 20 30 01 11 21 31
-        punpcklqdq  xmm0,       xmm4        ; 00 10 20 30 04 14 24 34
-
-        punpckhqdq  xmm1,       xmm4        ; 01 11 21 31 05 15 25 35
-
-        ; xmm0 0
-        ; xmm1 1
-        ; xmm2 2
-        ; xmm3 3
-
-        ; first stage
-        movdqa      xmm5,       xmm0
-        movdqa      xmm4,       xmm1
-
-        paddw       xmm0,       xmm3        ; a1 = 0 + 3
-        paddw       xmm1,       xmm2        ; b1 = 1 + 2
-
-        psubw       xmm4,       xmm2        ; c1 = 1 - 2
-        psubw       xmm5,       xmm3        ; d1 = 0 - 3
-
-        psllw       xmm5,        3
-        psllw       xmm4,        3
-
-        psllw       xmm0,        3
-        psllw       xmm1,        3
-
-        ; output 0 and 2
-        movdqa      xmm2,       xmm0        ; a1
-
-        paddw       xmm0,       xmm1        ; op[0] = a1 + b1
-        psubw       xmm2,       xmm1        ; op[2] = a1 - b1
-
-        ; output 1 and 3
-        ; interleave c1, d1
-        movdqa      xmm1,       xmm5        ; d1
-        punpcklwd   xmm1,       xmm4        ; c1 d1
-        punpckhwd   xmm5,       xmm4        ; c1 d1
-
-        movdqa      xmm3,       xmm1
-        movdqa      xmm4,       xmm5
-
-        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
-        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
-
-        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
-        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
-
-        paddd       xmm1,       XMMWORD PTR[GLOBAL(_14500)]
-        paddd       xmm4,       XMMWORD PTR[GLOBAL(_14500)]
-        paddd       xmm3,       XMMWORD PTR[GLOBAL(_7500)]
-        paddd       xmm5,       XMMWORD PTR[GLOBAL(_7500)]
-
-        psrad       xmm1,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
-        psrad       xmm4,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
-        psrad       xmm3,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
-        psrad       xmm5,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
-
-        packssdw    xmm1,       xmm4        ; op[1]
-        packssdw    xmm3,       xmm5        ; op[3]
-
-        ; done with vertical
-        ; transpose for the second stage
-        movdqa      xmm4,       xmm0         ; 00 10 20 30 04 14 24 34
-        movdqa      xmm5,       xmm2         ; 02 12 22 32 06 16 26 36
-
-        punpcklwd   xmm0,       xmm1         ; 00 01 10 11 20 21 30 31
-        punpckhwd   xmm4,       xmm1         ; 04 05 14 15 24 25 34 35
-
-        punpcklwd   xmm2,       xmm3         ; 02 03 12 13 22 23 32 33
-        punpckhwd   xmm5,       xmm3         ; 06 07 16 17 26 27 36 37
-
-        movdqa      xmm1,       xmm0         ; 00 01 10 11 20 21 30 31
-        punpckldq   xmm0,       xmm2         ; 00 01 02 03 10 11 12 13
-
-        punpckhdq   xmm1,       xmm2         ; 20 21 22 23 30 31 32 33
-
-        movdqa      xmm2,       xmm4         ; 04 05 14 15 24 25 34 35
-        punpckldq   xmm2,       xmm5         ; 04 05 06 07 14 15 16 17
-
-        punpckhdq   xmm4,       xmm5         ; 24 25 26 27 34 35 36 37
-        movdqa      xmm3,       xmm1         ; 20 21 22 23 30 31 32 33
-
-        punpckhqdq  xmm3,       xmm4         ; 30 31 32 33 34 35 36 37
-        punpcklqdq  xmm1,       xmm4         ; 20 21 22 23 24 25 26 27
-
-        movdqa      xmm4,       xmm0         ; 00 01 02 03 10 11 12 13
-        punpcklqdq  xmm0,       xmm2         ; 00 01 02 03 04 05 06 07
-
-        punpckhqdq  xmm4,       xmm2         ; 10 11 12 13 14 15 16 17
-
-        ; xmm0 0
-        ; xmm4 1
-        ; xmm1 2
-        ; xmm3 3
-
-        movdqa      xmm5,       xmm0
-        movdqa      xmm2,       xmm1
-
-        paddw       xmm0,       xmm3        ; a1 = 0 + 3
-        paddw       xmm1,       xmm4        ; b1 = 1 + 2
-
-        psubw       xmm4,       xmm2        ; c1 = 1 - 2
-        psubw       xmm5,       xmm3        ; d1 = 0 - 3
-
-        pxor        xmm6,       xmm6        ; zero out for compare
-
-        pcmpeqw     xmm6,       xmm5        ; d1 != 0
-
-        pandn       xmm6,       XMMWORD PTR[GLOBAL(_cmp_mask8x4)]   ; clear upper,
-                                                                    ; and keep bit 0 of lower
-
-        ; output 0 and 2
-        movdqa      xmm2,       xmm0        ; a1
-
-        paddw       xmm0,       xmm1        ; a1 + b1
-        psubw       xmm2,       xmm1        ; a1 - b1
-
-        paddw       xmm0,       XMMWORD PTR[GLOBAL(_7w)]
-        paddw       xmm2,       XMMWORD PTR[GLOBAL(_7w)]
-
-        psraw       xmm0,       4           ; op[0] = (a1 + b1 + 7)>>4
-        psraw       xmm2,       4           ; op[8] = (a1 - b1 + 7)>>4
-
-        ; output 1 and 3
-        ; interleave c1, d1
-        movdqa      xmm1,       xmm5        ; d1
-        punpcklwd   xmm1,       xmm4        ; c1 d1
-        punpckhwd   xmm5,       xmm4        ; c1 d1
-
-        movdqa      xmm3,       xmm1
-        movdqa      xmm4,       xmm5
-
-        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL(_5352_2217)]     ; c1*2217 + d1*5352
-        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL(_5352_2217)]     ; c1*2217 + d1*5352
-
-        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
-        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
-
-        paddd       xmm1,       XMMWORD PTR[GLOBAL(_12000)]
-        paddd       xmm4,       XMMWORD PTR[GLOBAL(_12000)]
-        paddd       xmm3,       XMMWORD PTR[GLOBAL(_51000)]
-        paddd       xmm5,       XMMWORD PTR[GLOBAL(_51000)]
-
-        psrad       xmm1,       16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
-        psrad       xmm4,       16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
-        psrad       xmm3,       16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
-        psrad       xmm5,       16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
-
-        packssdw    xmm1,       xmm4        ; op[4]
-        packssdw    xmm3,       xmm5        ; op[12]
-
-        paddw       xmm1,       xmm6        ; op[4] += (d1!=0)
-
-        movdqa      xmm4,       xmm0
-        movdqa      xmm5,       xmm2
-
-        punpcklqdq  xmm0,       xmm1
-        punpckhqdq  xmm4,       xmm1
-
-        punpcklqdq  xmm2,       xmm3
-        punpckhqdq  xmm5,       xmm3
-
-        movdqa      XMMWORD PTR[output + 0 ],  xmm0
-        movdqa      XMMWORD PTR[output + 16],  xmm2
-        movdqa      XMMWORD PTR[output + 32],  xmm4
-        movdqa      XMMWORD PTR[output + 48],  xmm5
-
-    STACK_FRAME_DESTROY
-
-SECTION_RODATA
-align 16
-_5352_2217:
-    dw 5352
-    dw 2217
-    dw 5352
-    dw 2217
-    dw 5352
-    dw 2217
-    dw 5352
-    dw 2217
-align 16
-_2217_neg5352:
-    dw 2217
-    dw -5352
-    dw 2217
-    dw -5352
-    dw 2217
-    dw -5352
-    dw 2217
-    dw -5352
-align 16
-_mult_add:
-    times 8 dw 1
-align 16
-_cmp_mask:
-    times 4 dw 1
-    times 4 dw 0
-align 16
-_cmp_mask8x4:
-    times 8 dw 1
-align 16
-_mult_sub:
-    dw 1
-    dw -1
-    dw 1
-    dw -1
-    dw 1
-    dw -1
-    dw 1
-    dw -1
-align 16
-_7:
-    times 4 dd 7
-align 16
-_7w:
-    times 8 dw 7
-align 16
-_14500:
-    times 4 dd 14500
-align 16
-_7500:
-    times 4 dd 7500
-align 16
-_12000:
-    times 4 dd 12000
-align 16
-_51000:
-    times 4 dd 51000
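
The constants above pin down the scalar transform these kernels vectorize: 5352/2217 are the fixed-point cosine factors, 14500/7500 the first-pass rounding terms (shifted by 12), and 12000/51000 the second-pass rounding terms (shifted by 16). A scalar sketch reconstructed from those constants and the shift comments follows; the function name and the pitch-in-bytes convention are illustrative, not the literal C source.

    /* Scalar sketch of the two-pass 4x4 fDCT the SSE2 code above
     * vectorizes (the 8x4 variant runs two blocks side by side). */
    static void short_fdct4x4_sketch(short *input, short *output, int pitch)
    {
        int i;
        short *ip = input;
        short *op = output;

        for (i = 0; i < 4; i++) {                  /* vertical pass */
            int a1 = (ip[0] + ip[3]) << 3;
            int b1 = (ip[1] + ip[2]) << 3;
            int c1 = (ip[1] - ip[2]) << 3;
            int d1 = (ip[0] - ip[3]) << 3;

            op[0] = (short)(a1 + b1);
            op[2] = (short)(a1 - b1);
            op[1] = (short)((c1 * 2217 + d1 * 5352 + 14500) >> 12);
            op[3] = (short)((d1 * 2217 - c1 * 5352 +  7500) >> 12);

            ip += pitch / 2;
            op += 4;
        }

        ip = output;
        op = output;
        for (i = 0; i < 4; i++) {                  /* horizontal pass */
            int a1 = ip[0] + ip[12];
            int b1 = ip[4] + ip[8];
            int c1 = ip[4] - ip[8];
            int d1 = ip[0] - ip[12];

            op[0]  = (short)((a1 + b1 + 7) >> 4);
            op[8]  = (short)((a1 - b1 + 7) >> 4);
            op[4]  = (short)(((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0));
            op[12] = (short)((d1 * 2217 - c1 * 5352 + 51000) >> 16);

            ip++;
            op++;
        }
    }

The +(d1 != 0) term is what the pcmpeqw/pandn mask and the final paddw against xmm6 implement in the vector code.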
--- a/vp8/encoder/x86/encodeopt.asm
+++ /dev/null
@@ -1,386 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;int vp9_block_error_xmm(short *coeff_ptr,  short *dcoef_ptr)
-global sym(vp9_block_error_xmm)
-sym(vp9_block_error_xmm):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 2
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov         rsi,        arg(0) ;coeff_ptr
-        mov         rdi,        arg(1) ;dcoef_ptr
-
-        movdqa      xmm0,       [rsi]
-        movdqa      xmm1,       [rdi]
-
-        movdqa      xmm2,       [rsi+16]
-        movdqa      xmm3,       [rdi+16]
-
-        psubw       xmm0,       xmm1
-        psubw       xmm2,       xmm3
-
-        pmaddwd     xmm0,       xmm0
-        pmaddwd     xmm2,       xmm2
-
-        paddd       xmm0,       xmm2
-
-        pxor        xmm5,       xmm5
-        movdqa      xmm1,       xmm0
-
-        punpckldq   xmm0,       xmm5
-        punpckhdq   xmm1,       xmm5
-
-        paddd       xmm0,       xmm1
-        movdqa      xmm1,       xmm0
-
-        psrldq      xmm0,       8
-        paddd       xmm0,       xmm1
-
-        movq        rax,        xmm0
-
-    pop rdi
-    pop rsi
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
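
In scalar terms this kernel reduces to a sum of squared differences over one block's 16 coefficients; the MMX variant below computes the same thing eight coefficients at a time. A minimal sketch, with a hypothetical name:

    /* Sum of squared coefficient error for one 4x4 block; what the
     * block-error kernels compute via SIMD widening and folding. */
    static int block_error_sketch(const short *coeff, const short *dcoef)
    {
        int i, err = 0;
        for (i = 0; i < 16; i++) {
            int d = coeff[i] - dcoef[i];
            err += d * d;
        }
        return err;
    }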
-
-;int vp9_block_error_mmx(short *coeff_ptr,  short *dcoef_ptr)
-global sym(vp9_block_error_mmx)
-sym(vp9_block_error_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 2
-    push rsi
-    push rdi
-    ; end prolog
-
-
-        mov         rsi,        arg(0) ;coeff_ptr
-        pxor        mm7,        mm7
-
-        mov         rdi,        arg(1) ;dcoef_ptr
-        movq        mm3,        [rsi]
-
-        movq        mm4,        [rdi]
-        movq        mm5,        [rsi+8]
-
-        movq        mm6,        [rdi+8]
-        pxor        mm1,        mm1         ; replaces "movd mm1, dc"; dc is 0 here
-
-        movq        mm2,        mm7
-        psubw       mm5,        mm6
-
-        por         mm1,        mm2
-        pmaddwd     mm5,        mm5
-
-        pcmpeqw     mm1,        mm7
-        psubw       mm3,        mm4
-
-        pand        mm1,        mm3
-        pmaddwd     mm1,        mm1
-
-        paddd       mm1,        mm5
-        movq        mm3,        [rsi+16]
-
-        movq        mm4,        [rdi+16]
-        movq        mm5,        [rsi+24]
-
-        movq        mm6,        [rdi+24]
-        psubw       mm5,        mm6
-
-        pmaddwd     mm5,        mm5
-        psubw       mm3,        mm4
-
-        pmaddwd     mm3,        mm3
-        paddd       mm3,        mm5
-
-        paddd       mm1,        mm3
-        movq        mm0,        mm1
-
-        psrlq       mm1,        32
-        paddd       mm0,        mm1
-
-        movq        rax,        mm0
-
-    pop rdi
-    pop rsi
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;int vp9_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
-global sym(vp9_mbblock_error_mmx_impl)
-sym(vp9_mbblock_error_mmx_impl):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 3
-    push rsi
-    push rdi
-    ; end prolog
-
-
-        mov         rsi,        arg(0) ;coeff_ptr
-        pxor        mm7,        mm7
-
-        mov         rdi,        arg(1) ;dcoef_ptr
-        pxor        mm2,        mm2
-
-        movd        mm1,        dword ptr arg(2) ;dc
-        por         mm1,        mm2
-
-        pcmpeqw     mm1,        mm7
-        mov         rcx,        16
-
-.mberror_loop_mmx:
-        movq        mm3,       [rsi]
-        movq        mm4,       [rdi]
-
-        movq        mm5,       [rsi+8]
-        movq        mm6,       [rdi+8]
-
-
-        psubw       mm5,        mm6
-        pmaddwd     mm5,        mm5
-
-        psubw       mm3,        mm4
-        pand        mm3,        mm1
-
-        pmaddwd     mm3,        mm3
-        paddd       mm2,        mm5
-
-        paddd       mm2,        mm3
-        movq        mm3,       [rsi+16]
-
-        movq        mm4,       [rdi+16]
-        movq        mm5,       [rsi+24]
-
-        movq        mm6,       [rdi+24]
-        psubw       mm5,        mm6
-
-        pmaddwd     mm5,        mm5
-        psubw       mm3,        mm4
-
-        pmaddwd     mm3,        mm3
-        paddd       mm2,        mm5
-
-        paddd       mm2,        mm3
-        add         rsi,        32
-
-        add         rdi,        32
-        sub         rcx,        1
-
-        jnz         .mberror_loop_mmx
-
-        movq        mm0,        mm2
-        psrlq       mm2,        32
-
-        paddd       mm0,        mm2
-        movq        rax,        mm0
-
-    pop rdi
-    pop rsi
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;int vp9_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
-global sym(vp9_mbblock_error_xmm_impl)
-sym(vp9_mbblock_error_xmm_impl):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 3
-    SAVE_XMM 6
-    push rsi
-    push rdi
-    ; end prolog
-
-
-        mov         rsi,        arg(0) ;coeff_ptr
-        pxor        xmm6,       xmm6
-
-        mov         rdi,        arg(1) ;dcoef_ptr
-        pxor        xmm4,       xmm4
-
-        movd        xmm5,       dword ptr arg(2) ;dc
-        por         xmm5,       xmm4
-
-        pcmpeqw     xmm5,       xmm6
-        mov         rcx,        16
-
-.mberror_loop:
-        movdqa      xmm0,       [rsi]
-        movdqa      xmm1,       [rdi]
-
-        movdqa      xmm2,       [rsi+16]
-        movdqa      xmm3,       [rdi+16]
-
-
-        psubw       xmm2,       xmm3
-        pmaddwd     xmm2,       xmm2
-
-        psubw       xmm0,       xmm1
-        pand        xmm0,       xmm5
-
-        pmaddwd     xmm0,       xmm0
-        add         rsi,        32
-
-        add         rdi,        32
-
-        sub         rcx,        1
-        paddd       xmm4,       xmm2
-
-        paddd       xmm4,       xmm0
-        jnz         .mberror_loop
-
-        movdqa      xmm0,       xmm4
-        punpckldq   xmm0,       xmm6
-
-        punpckhdq   xmm4,       xmm6
-        paddd       xmm0,       xmm4
-
-        movdqa      xmm1,       xmm0
-        psrldq      xmm0,       8
-
-        paddd       xmm0,       xmm1
-        movq        rax,        xmm0
-
-    pop rdi
-    pop rsi
-    ; begin epilog
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
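
Both mbblock kernels extend the per-block error to a whole macroblock: 16 luma blocks, with the pcmpeqw/pand mask on the first word dropping each block's DC term when the dc argument is nonzero. A scalar sketch, assuming callers pass dc as 0 or 1 (names illustrative):

    /* Macroblock coefficient error; dc == 1 skips each block's DC
     * coefficient (the masked first word in the assembly above). */
    static int mbblock_error_sketch(const short *coeff, const short *dcoef, int dc)
    {
        int i, j, err = 0;
        for (i = 0; i < 16; i++) {              /* 16 4x4 luma blocks */
            for (j = dc ? 1 : 0; j < 16; j++) {
                int d = coeff[i * 16 + j] - dcoef[i * 16 + j];
                err += d * d;
            }
        }
        return err;
    }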
-
-
-;int vp9_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
-global sym(vp9_mbuverror_mmx_impl)
-sym(vp9_mbuverror_mmx_impl):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 2
-    push rsi
-    push rdi
-    ; end prolog
-
-
-        mov             rsi,        arg(0) ;s_ptr
-        mov             rdi,        arg(1) ;d_ptr
-
-        mov             rcx,        16
-        pxor            mm7,        mm7
-
-.mbuverror_loop_mmx:
-
-        movq            mm1,        [rsi]
-        movq            mm2,        [rdi]
-
-        psubw           mm1,        mm2
-        pmaddwd         mm1,        mm1
-
-
-        movq            mm3,        [rsi+8]
-        movq            mm4,        [rdi+8]
-
-        psubw           mm3,        mm4
-        pmaddwd         mm3,        mm3
-
-
-        paddd           mm7,        mm1
-        paddd           mm7,        mm3
-
-
-        add             rsi,        16
-        add             rdi,        16
-
-        dec             rcx
-        jnz             .mbuverror_loop_mmx
-
-        movq            mm0,        mm7
-        psrlq           mm7,        32
-
-        paddd           mm0,        mm7
-        movq            rax,        mm0
-
-    pop rdi
-    pop rsi
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;int vp9_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
-global sym(vp9_mbuverror_xmm_impl)
-sym(vp9_mbuverror_xmm_impl):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 2
-    push rsi
-    push rdi
-    ; end prolog
-
-
-        mov             rsi,        arg(0) ;s_ptr
-        mov             rdi,        arg(1) ;d_ptr
-
-        mov             rcx,        16
-        pxor            xmm3,       xmm3
-
-.mbuverror_loop:
-
-        movdqa          xmm1,       [rsi]
-        movdqa          xmm2,       [rdi]
-
-        psubw           xmm1,       xmm2
-        pmaddwd         xmm1,       xmm1
-
-        paddd           xmm3,       xmm1
-
-        add             rsi,        16
-        add             rdi,        16
-
-        dec             rcx
-        jnz             .mbuverror_loop
-
-        pxor        xmm0,           xmm0
-        movdqa      xmm1,           xmm3
-
-        movdqa      xmm2,           xmm1
-        punpckldq   xmm1,           xmm0
-
-        punpckhdq   xmm2,           xmm0
-        paddd       xmm1,           xmm2
-
-        movdqa      xmm2,           xmm1
-
-        psrldq      xmm1,           8
-        paddd       xmm1,           xmm2
-
-        movq            rax,            xmm1
-
-    pop rdi
-    pop rsi
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
--- a/vp8/encoder/x86/fwalsh_sse2.asm
+++ /dev/null
@@ -1,164 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_short_walsh4x4_sse2(short *input, short *output, int pitch)
-global sym(vp9_short_walsh4x4_sse2)
-sym(vp9_short_walsh4x4_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 3
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov     rsi, arg(0)           ; input
-    mov     rdi, arg(1)           ; output
-    movsxd  rdx, dword ptr arg(2) ; pitch
-
-    ; first for loop
-    movq    xmm0, MMWORD PTR [rsi]           ; load input
-    movq    xmm1, MMWORD PTR [rsi + rdx]
-    lea     rsi,  [rsi + rdx*2]
-    movq    xmm2, MMWORD PTR [rsi]
-    movq    xmm3, MMWORD PTR [rsi + rdx]
-
-    punpcklwd xmm0,  xmm1
-    punpcklwd xmm2,  xmm3
-
-    movdqa    xmm1, xmm0
-    punpckldq xmm0, xmm2           ; ip[1] ip[0]
-    punpckhdq xmm1, xmm2           ; ip[3] ip[2]
-
-    movdqa    xmm2, xmm0
-    paddw     xmm0, xmm1
-    psubw     xmm2, xmm1
-
-    psllw     xmm0, 2              ; d1  a1
-    psllw     xmm2, 2              ; c1  b1
-
-    movdqa    xmm1, xmm0
-    punpcklqdq xmm0, xmm2          ; b1  a1
-    punpckhqdq xmm1, xmm2          ; c1  d1
-
-    pxor      xmm6, xmm6
-    movq      xmm6, xmm0
-    pxor      xmm7, xmm7
-    pcmpeqw   xmm7, xmm6
-    paddw     xmm7, [GLOBAL(c1)]
-
-    movdqa    xmm2, xmm0
-    paddw     xmm0, xmm1           ; b1+c1  a1+d1
-    psubw     xmm2, xmm1           ; b1-c1  a1-d1
-    paddw     xmm0, xmm7           ; b1+c1  a1+d1+(a1!=0)
-
-    ; second for loop
-    ; input: 13  9  5  1 12  8  4  0 (xmm0)
-    ;        14 10  6  2 15 11  7  3 (xmm2)
-    ; after shuffle:
-    ;        13  5  9  1 12  4  8  0 (xmm0)
-    ;        14  6 10  2 15  7 11  3 (xmm1)
-    pshuflw   xmm3, xmm0, 0xd8
-    pshufhw   xmm0, xmm3, 0xd8
-    pshuflw   xmm3, xmm2, 0xd8
-    pshufhw   xmm1, xmm3, 0xd8
-
-    movdqa    xmm2, xmm0
-    pmaddwd   xmm0, [GLOBAL(c1)]    ; d11 a11 d10 a10
-    pmaddwd   xmm2, [GLOBAL(cn1)]   ; c11 b11 c10 b10
-    movdqa    xmm3, xmm1
-    pmaddwd   xmm1, [GLOBAL(c1)]    ; d12 a12 d13 a13
-    pmaddwd   xmm3, [GLOBAL(cn1)]   ; c12 b12 c13 b13
-
-    pshufd    xmm4, xmm0, 0xd8      ; d11 d10 a11 a10
-    pshufd    xmm5, xmm2, 0xd8      ; c11 c10 b11 b10
-    pshufd    xmm6, xmm1, 0x72      ; d13 d12 a13 a12
-    pshufd    xmm7, xmm3, 0x72      ; c13 c12 b13 b12
-
-    movdqa    xmm0, xmm4
-    punpcklqdq xmm0, xmm5           ; b11 b10 a11 a10
-    punpckhqdq xmm4, xmm5           ; c11 c10 d11 d10
-    movdqa    xmm1, xmm6
-    punpcklqdq xmm1, xmm7           ; b13 b12 a13 a12
-    punpckhqdq xmm6, xmm7           ; c13 c12 d13 d12
-
-    movdqa    xmm2, xmm0
-    paddd     xmm0, xmm4            ; b21 b20 a21 a20
-    psubd     xmm2, xmm4            ; c21 c20 d21 d20
-    movdqa    xmm3, xmm1
-    paddd     xmm1, xmm6            ; b23 b22 a23 a22
-    psubd     xmm3, xmm6            ; c23 c22 d23 d22
-
-    pxor      xmm4, xmm4
-    movdqa    xmm5, xmm4
-    pcmpgtd   xmm4, xmm0
-    pcmpgtd   xmm5, xmm2
-    pand      xmm4, [GLOBAL(cd1)]
-    pand      xmm5, [GLOBAL(cd1)]
-
-    pxor      xmm6, xmm6
-    movdqa    xmm7, xmm6
-    pcmpgtd   xmm6, xmm1
-    pcmpgtd   xmm7, xmm3
-    pand      xmm6, [GLOBAL(cd1)]
-    pand      xmm7, [GLOBAL(cd1)]
-
-    paddd     xmm0, xmm4
-    paddd     xmm2, xmm5
-    paddd     xmm0, [GLOBAL(cd3)]
-    paddd     xmm2, [GLOBAL(cd3)]
-    paddd     xmm1, xmm6
-    paddd     xmm3, xmm7
-    paddd     xmm1, [GLOBAL(cd3)]
-    paddd     xmm3, [GLOBAL(cd3)]
-
-    psrad     xmm0, 3
-    psrad     xmm1, 3
-    psrad     xmm2, 3
-    psrad     xmm3, 3
-    movdqa    xmm4, xmm0
-    punpcklqdq xmm0, xmm1           ; a23 a22 a21 a20
-    punpckhqdq xmm4, xmm1           ; b23 b22 b21 b20
-    movdqa    xmm5, xmm2
-    punpckhqdq xmm2, xmm3           ; c23 c22 c21 c20
-    punpcklqdq xmm5, xmm3           ; d23 d22 d21 d20
-
-    packssdw  xmm0, xmm4            ; b23 b22 b21 b20 a23 a22 a21 a20
-    packssdw  xmm2, xmm5            ; d23 d22 d21 d20 c23 c22 c21 c20
-
-    movdqa  XMMWORD PTR [rdi], xmm0
-    movdqa  XMMWORD PTR [rdi + 16], xmm2
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-c1:
-    dw 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001
-align 16
-cn1:
-    dw 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff
-align 16
-cd1:
-    dd 0x00000001, 0x00000001, 0x00000001, 0x00000001
-align 16
-cd3:
-    dd 0x00000003, 0x00000003, 0x00000003, 0x00000003
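
As with the fDCT above, the constant tables fix the scalar Walsh-Hadamard transform being vectorized: the <<2 pre-scale, the +(a1 != 0) correction built from c1, the negative-value correction from cd1, and the (x + 3) >> 3 rounding from cd3. A scalar sketch reconstructed from those constants; the name and pitch convention are illustrative.

    static void short_walsh4x4_sketch(short *input, short *output, int pitch)
    {
        int i;
        short *ip = input;
        short *op = output;

        for (i = 0; i < 4; i++) {                /* vertical pass */
            int a1 = (ip[0] + ip[2]) << 2;
            int d1 = (ip[1] + ip[3]) << 2;
            int c1 = (ip[1] - ip[3]) << 2;
            int b1 = (ip[0] - ip[2]) << 2;

            op[0] = (short)(a1 + d1 + (a1 != 0));
            op[1] = (short)(b1 + c1);
            op[2] = (short)(b1 - c1);
            op[3] = (short)(a1 - d1);

            ip += pitch / 2;
            op += 4;
        }

        ip = output;
        op = output;
        for (i = 0; i < 4; i++) {                /* horizontal pass */
            int a1 = ip[0] + ip[8];
            int d1 = ip[4] + ip[12];
            int c1 = ip[4] - ip[12];
            int b1 = ip[0] - ip[8];

            int a2 = a1 + d1;
            int b2 = b1 + c1;
            int c2 = b1 - c1;
            int d2 = a1 - d1;

            a2 += (a2 < 0);                      /* pcmpgtd + pand cd1 */
            b2 += (b2 < 0);
            c2 += (c2 < 0);
            d2 += (d2 < 0);

            op[0]  = (short)((a2 + 3) >> 3);     /* paddd cd3, psrad 3 */
            op[4]  = (short)((b2 + 3) >> 3);
            op[8]  = (short)((c2 + 3) >> 3);
            op[12] = (short)((d2 + 3) >> 3);

            ip++;
            op++;
        }
    }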
--- a/vp8/encoder/x86/mcomp_x86.h
+++ /dev/null
@@ -1,40 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef MCOMP_X86_H
-#define MCOMP_X86_H
-
-#if HAVE_SSE3
-#if !CONFIG_RUNTIME_CPU_DETECT
-
-#undef  vp9_search_full_search
-#define vp9_search_full_search vp9_full_search_sadx3
-
-#undef  vp9_search_refining_search
-#define vp9_search_refining_search vp9_refining_search_sadx4
-
-#undef  vp9_search_diamond_search
-#define vp9_search_diamond_search vp9_diamond_search_sadx4
-
-#endif
-#endif
-
-#if HAVE_SSE4_1
-#if !CONFIG_RUNTIME_CPU_DETECT
-
-#undef  vp9_search_full_search
-#define vp9_search_full_search vp9_full_search_sadx8
-
-#endif
-#endif
-
-#endif
-
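
These remaps are purely textual: with runtime CPU detection compiled out, a call site written against the generic search name resolves straight to the SIMD implementation at preprocessing time, so no function-pointer indirection remains. Schematically, with the argument lists elided since they are not shown here:

    /* built with HAVE_SSE3 and without CONFIG_RUNTIME_CPU_DETECT: */
    n = vp9_search_full_search(/* ... */);
    /* expands to */
    n = vp9_full_search_sadx3(/* ... */);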
--- a/vp8/encoder/x86/quantize_mmx.asm
+++ /dev/null
@@ -1,286 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;int vp9_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
-;                           short *qcoeff_ptr,short *dequant_ptr,
-;                           short *scan_mask, short *round_ptr,
-;                           short *quant_ptr, short *dqcoeff_ptr);
-global sym(vp9_fast_quantize_b_impl_mmx)
-sym(vp9_fast_quantize_b_impl_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    push rsi
-    push rdi
-    ; end prolog
-
-
-        mov             rsi,        arg(0) ;coeff_ptr
-        movq            mm0,        [rsi]
-
-        mov             rax,        arg(1) ;zbin_ptr
-        movq            mm1,        [rax]
-
-        movq            mm3,        mm0
-        psraw           mm0,        15
-
-        pxor            mm3,        mm0
-        psubw           mm3,        mm0         ; abs
-
-        movq            mm2,        mm3
-        pcmpgtw         mm1,        mm2
-
-        pandn           mm1,        mm2
-        movq            mm3,        mm1
-
-        mov             rdx,        arg(6) ;quant_ptr
-        movq            mm1,        [rdx]
-
-        mov             rcx,        arg(5) ;round_ptr
-        movq            mm2,        [rcx]
-
-        paddw           mm3,        mm2
-        pmulhuw         mm3,        mm1
-
-        pxor            mm3,        mm0
-        psubw           mm3,        mm0         ; restore the sign
-
-        mov             rdi,        arg(2) ;qcoeff_ptr
-        movq            mm0,        mm3
-
-        movq            [rdi],      mm3
-
-        mov             rax,        arg(3) ;dequant_ptr
-        movq            mm2,        [rax]
-
-        pmullw          mm3,        mm2
-        mov             rax,        arg(7) ;dqcoeff_ptr
-
-        movq            [rax],      mm3
-
-        ; next 8
-        movq            mm4,        [rsi+8]
-
-        mov             rax,        arg(1) ;zbin_ptr
-        movq            mm5,        [rax+8]
-
-        movq            mm7,        mm4
-        psraw           mm4,        15
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4         ; abs
-
-        movq            mm6,        mm7
-        pcmpgtw         mm5,        mm6
-
-        pandn           mm5,        mm6
-        movq            mm7,        mm5
-
-        movq            mm5,        [rdx+8]
-        movq            mm6,        [rcx+8]
-
-        paddw           mm7,        mm6
-        pmulhuw         mm7,        mm5
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4         ; restore the sign
-
-        mov             rdi,        arg(2) ;qcoeff_ptr
-
-        movq            mm1,        mm7
-        movq            [rdi+8],    mm7
-
-        mov             rax,        arg(3) ;dequant_ptr
-        movq            mm6,        [rax+8]
-
-        pmullw          mm7,        mm6
-        mov             rax,        arg(7) ;dqcoeff_ptr
-
-        movq            [rax+8],    mm7
-
-
-                ; next 8
-        movq            mm4,        [rsi+16]
-
-        mov             rax,        arg(1) ;zbin_ptr
-        movq            mm5,        [rax+16]
-
-        movq            mm7,        mm4
-        psraw           mm4,        15
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4         ; abs
-
-        movq            mm6,        mm7
-        pcmpgtw         mm5,        mm6
-
-        pandn           mm5,        mm6
-        movq            mm7,        mm5
-
-        movq            mm5,        [rdx+16]
-        movq            mm6,        [rcx+16]
-
-        paddw           mm7,        mm6
-        pmulhuw         mm7,        mm5
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4         ; restore the sign
-
-        mov             rdi,        arg(2) ;qcoeff_ptr
-
-        movq            mm1,        mm7
-        movq            [rdi+16],   mm7
-
-        mov             rax,        arg(3) ;dequant_ptr
-        movq            mm6,        [rax+16]
-
-        pmullw          mm7,        mm6
-        mov             rax,        arg(7) ;dqcoeff_ptr
-
-        movq            [rax+16],   mm7
-
-
-                ; next 8
-        movq            mm4,        [rsi+24]
-
-        mov             rax,        arg(1) ;zbin_ptr
-        movq            mm5,        [rax+24]
-
-        movq            mm7,        mm4
-        psraw           mm4,        15
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4         ; abs
-
-        movq            mm6,        mm7
-        pcmpgtw         mm5,        mm6
-
-        pandn           mm5,        mm6
-        movq            mm7,        mm5
-
-        movq            mm5,        [rdx+24]
-        movq            mm6,        [rcx+24]
-
-        paddw           mm7,        mm6
-        pmulhuw         mm7,        mm5
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4         ; restore the sign
-
-        mov             rdi,        arg(2) ;qcoeff_ptr
-
-        movq            mm1,        mm7
-        movq            [rdi+24],   mm7
-
-        mov             rax,        arg(3) ;dequant_ptr
-        movq            mm6,        [rax+24]
-
-        pmullw          mm7,        mm6
-        mov             rax,        arg(7) ;dqcoeff_ptr
-
-        movq            [rax+24],   mm7
-
-
-
-        mov             rdi,        arg(4) ;scan_mask
-        mov             rsi,        arg(2) ;qcoeff_ptr
-
-        pxor            mm5,        mm5
-        pxor            mm7,        mm7
-
-        movq            mm0,        [rsi]
-        movq            mm1,        [rsi+8]
-
-        movq            mm2,        [rdi]
-        movq            mm3,        [rdi+8]
-
-        pcmpeqw         mm0,        mm7
-        pcmpeqw         mm1,        mm7
-
-        pcmpeqw         mm6,        mm6
-        pxor            mm0,        mm6
-
-        pxor            mm1,        mm6
-        psrlw           mm0,        15
-
-        psrlw           mm1,        15
-        pmaddwd         mm0,        mm2
-
-        pmaddwd         mm1,        mm3
-        movq            mm5,        mm0
-
-        paddd           mm5,        mm1
-
-        movq            mm0,        [rsi+16]
-        movq            mm1,        [rsi+24]
-
-        movq            mm2,        [rdi+16]
-        movq            mm3,        [rdi+24]
-
-        pcmpeqw         mm0,        mm7
-        pcmpeqw         mm1,        mm7
-
-        pcmpeqw         mm6,        mm6
-        pxor            mm0,        mm6
-
-        pxor            mm1,        mm6
-        psrlw           mm0,        15
-
-        psrlw           mm1,        15
-        pmaddwd         mm0,        mm2
-
-        pmaddwd         mm1,        mm3
-        paddd           mm5,        mm0
-
-        paddd           mm5,        mm1
-        movq            mm0,        mm5
-
-        psrlq           mm5,        32
-        paddd           mm0,        mm5
-
-        ; eob adjustment begins here
-        movq            rcx,        mm0
-        and             rcx,        0xffff
-
-        xor             rdx,        rdx
-        sub             rdx,        rcx ; rdx=-rcx
-
-        bsr             rax,        rcx
-        inc             rax
-
-        sar             rdx,        31
-        and             rax,        rdx
-        ; The branchless sse-style sequence above replaces the old mixed
-        ; mmx assembly/C version; the original is kept below for reference.
-        ;    movq            rcx,        mm0
-        ;    bsr             rax,        rcx
-        ;
-        ;    mov             eob,        rax
-        ;    mov             eee,        rcx
-        ;
-        ;if(eee==0)
-        ;{
-        ;    eob=-1;
-        ;}
-        ;else if(eee<0)
-        ;{
-        ;    eob=15;
-        ;}
-        ;d->eob = eob+1;
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
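
The fast quantizer above is the SIMD form of a simple per-coefficient recipe: take |z|, zero it inside the dead zone, scale by the fixed-point quantizer with pmulhuw (an unsigned high multiply), then restore the sign and dequantize. A scalar sketch with illustrative names; the EOB computation, done above via the scan mask and bsr, is omitted here.

    static void fast_quantize_b_sketch(const short *coeff, const short *zbin,
                                       short *qcoeff, const short *dequant,
                                       const short *round, const short *quant,
                                       short *dqcoeff)
    {
        int i;
        for (i = 0; i < 16; i++) {
            int z  = coeff[i];
            int sz = z >> 15;                   /* 0 or -1 (psraw 15) */
            int x  = (z ^ sz) - sz;             /* abs(z) */
            if (x < zbin[i])                    /* dead zone: pcmpgtw/pandn */
                x = 0;
            x = (int)(((unsigned)(x + round[i]) *
                       (unsigned short)quant[i]) >> 16);    /* pmulhuw */
            x = (x ^ sz) - sz;                  /* restore the sign */
            qcoeff[i]  = (short)x;
            dqcoeff[i] = (short)(x * dequant[i]);
        }
    }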
--- a/vp8/encoder/x86/quantize_sse2.asm
+++ /dev/null
@@ -1,380 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-%include "asm_enc_offsets.asm"
-
-
-; void vp9_regular_quantize_b_sse2 | arg
-;  (BLOCK  *b,                     |  0
-;   BLOCKD *d)                     |  1
-
-global sym(vp9_regular_quantize_b_sse2)
-sym(vp9_regular_quantize_b_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SAVE_XMM 7
-    GET_GOT     rbx
-
-%if ABI_IS_32BIT
-    push        rdi
-    push        rsi
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    push        rdi
-    push        rsi
-  %endif
-%endif
-
-    ALIGN_STACK 16, rax
-    %define zrun_zbin_boost   0  ;  8
-    %define abs_minus_zbin    8  ; 32
-    %define temp_qcoeff       40 ; 32
-    %define qcoeff            72 ; 32
-    %define stack_size        104
-    sub         rsp, stack_size
-    ; end prolog
-
-%if ABI_IS_32BIT
-    mov         rdi, arg(0)                 ; BLOCK *b
-    mov         rsi, arg(1)                 ; BLOCKD *d
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    mov         rdi, rcx                    ; BLOCK *b
-    mov         rsi, rdx                    ; BLOCKD *d
-  %else
-    ;mov         rdi, rdi                    ; BLOCK *b
-    ;mov         rsi, rsi                    ; BLOCKD *d
-  %endif
-%endif
-
-    mov         rdx, [rdi + vp9_block_coeff] ; coeff_ptr
-    mov         rcx, [rdi + vp9_block_zbin] ; zbin_ptr
-    movd        xmm7, [rdi + vp9_block_zbin_extra] ; zbin_oq_value
-
-    ; z
-    movdqa      xmm0, [rdx]
-    movdqa      xmm4, [rdx + 16]
-    mov         rdx, [rdi + vp9_block_round] ; round_ptr
-
-    pshuflw     xmm7, xmm7, 0
-    punpcklwd   xmm7, xmm7                  ; duplicated zbin_oq_value
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm5, xmm4
-
-    ; sz
-    psraw       xmm0, 15
-    psraw       xmm4, 15
-
-    ; (z ^ sz)
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-
-    ; x = abs(z)
-    psubw       xmm1, xmm0
-    psubw       xmm5, xmm4
-
-    movdqa      xmm2, [rcx]
-    movdqa      xmm3, [rcx + 16]
-    mov         rcx, [rdi + vp9_block_quant] ; quant_ptr
-
-    ; *zbin_ptr + zbin_oq_value
-    paddw       xmm2, xmm7
-    paddw       xmm3, xmm7
-
-    ; x - (*zbin_ptr + zbin_oq_value)
-    psubw       xmm1, xmm2
-    psubw       xmm5, xmm3
-    movdqa      [rsp + abs_minus_zbin], xmm1
-    movdqa      [rsp + abs_minus_zbin + 16], xmm5
-
-    ; add (zbin_ptr + zbin_oq_value) back
-    paddw       xmm1, xmm2
-    paddw       xmm5, xmm3
-
-    movdqa      xmm2, [rdx]
-    movdqa      xmm6, [rdx + 16]
-
-    movdqa      xmm3, [rcx]
-    movdqa      xmm7, [rcx + 16]
-
-    ; x + round
-    paddw       xmm1, xmm2
-    paddw       xmm5, xmm6
-
-    ; y = x * quant_ptr >> 16
-    pmulhw      xmm3, xmm1
-    pmulhw      xmm7, xmm5
-
-    ; y += x
-    paddw       xmm1, xmm3
-    paddw       xmm5, xmm7
-
-    movdqa      [rsp + temp_qcoeff], xmm1
-    movdqa      [rsp + temp_qcoeff + 16], xmm5
-
-    pxor        xmm6, xmm6
-    ; zero qcoeff
-    movdqa      [rsp + qcoeff], xmm6
-    movdqa      [rsp + qcoeff + 16], xmm6
-
-    mov         rdx, [rdi + vp9_block_zrun_zbin_boost] ; zbin_boost_ptr
-    mov         rax, [rdi + vp9_block_quant_shift] ; quant_shift_ptr
-    mov         [rsp + zrun_zbin_boost], rdx
-
-%macro ZIGZAG_LOOP 1
-    ; x
-    movsx       ecx, WORD PTR[rsp + abs_minus_zbin + %1 * 2]
-
-    ; if (x >= zbin)
-    sub         cx, WORD PTR[rdx]           ; x - zbin
-    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++
-    jl          .rq_zigzag_loop_%1           ; x < zbin
-
-    movsx       edi, WORD PTR[rsp + temp_qcoeff + %1 * 2]
-
-    ; downshift by quant_shift[rc]
-    movsx       cx, BYTE PTR[rax + %1]      ; quant_shift_ptr[rc]
-    sar         edi, cl                     ; also sets Z bit
-    je          .rq_zigzag_loop_%1           ; !y
-    mov         WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc]
-    mov         rdx, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost
-.rq_zigzag_loop_%1:
-%endmacro
-; in vp9_default_zig_zag1d order: see vp8/common/entropy.c
-ZIGZAG_LOOP  0
-ZIGZAG_LOOP  1
-ZIGZAG_LOOP  4
-ZIGZAG_LOOP  8
-ZIGZAG_LOOP  5
-ZIGZAG_LOOP  2
-ZIGZAG_LOOP  3
-ZIGZAG_LOOP  6
-ZIGZAG_LOOP  9
-ZIGZAG_LOOP 12
-ZIGZAG_LOOP 13
-ZIGZAG_LOOP 10
-ZIGZAG_LOOP  7
-ZIGZAG_LOOP 11
-ZIGZAG_LOOP 14
-ZIGZAG_LOOP 15
-
-    movdqa      xmm2, [rsp + qcoeff]
-    movdqa      xmm3, [rsp + qcoeff + 16]
-
-    mov         rcx, [rsi + vp9_blockd_dequant] ; dequant_ptr
-    mov         rdi, [rsi + vp9_blockd_dqcoeff] ; dqcoeff_ptr
-
-    ; y ^ sz
-    pxor        xmm2, xmm0
-    pxor        xmm3, xmm4
-    ; x = (y ^ sz) - sz
-    psubw       xmm2, xmm0
-    psubw       xmm3, xmm4
-
-    ; dequant
-    movdqa      xmm0, [rcx]
-    movdqa      xmm1, [rcx + 16]
-
-    mov         rcx, [rsi + vp9_blockd_qcoeff] ; qcoeff_ptr
-
-    pmullw      xmm0, xmm2
-    pmullw      xmm1, xmm3
-
-    movdqa      [rcx], xmm2        ; store qcoeff
-    movdqa      [rcx + 16], xmm3
-    movdqa      [rdi], xmm0        ; store dqcoeff
-    movdqa      [rdi + 16], xmm1
-
-    ; select the last value (in zig_zag order) for EOB
-    pcmpeqw     xmm2, xmm6
-    pcmpeqw     xmm3, xmm6
-    ; !
-    pcmpeqw     xmm6, xmm6
-    pxor        xmm2, xmm6
-    pxor        xmm3, xmm6
-    ; mask inv_zig_zag
-    pand        xmm2, [GLOBAL(inv_zig_zag)]
-    pand        xmm3, [GLOBAL(inv_zig_zag + 16)]
-    ; select the max value
-    pmaxsw      xmm2, xmm3
-    pshufd      xmm3, xmm2, 00001110b
-    pmaxsw      xmm2, xmm3
-    pshuflw     xmm3, xmm2, 00001110b
-    pmaxsw      xmm2, xmm3
-    pshuflw     xmm3, xmm2, 00000001b
-    pmaxsw      xmm2, xmm3
-    movd        eax, xmm2
-    and         eax, 0xff
-    mov         [rsi + vp9_blockd_eob], eax
-
-    ; begin epilog
-    add         rsp, stack_size
-    pop         rsp
-%if ABI_IS_32BIT
-    pop         rsi
-    pop         rdi
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    pop         rsi
-    pop         rdi
-  %endif
-%endif
-    RESTORE_GOT
-    RESTORE_XMM
-    pop         rbp
-    ret
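
The regular quantizer adds a zero-run-adaptive dead zone on top of the fast path: the threshold grows with the current run of zeros via zbin_boost, resets whenever a coefficient survives, and the quantized magnitude gets an extra per-coefficient downshift. A scalar sketch of what the setup code plus ZIGZAG_LOOP computes; pointer parameters stand in for the BLOCK/BLOCKD fields the assembly loads through the asm_enc_offsets values, so treat the names as illustrative.

    static void regular_quantize_b_sketch(
        const short *coeff, const short *zbin, const short *round,
        const short *quant, const unsigned char *quant_shift,
        const short *zbin_boost, short zbin_oq, const short *dequant,
        short *qcoeff, short *dqcoeff, int *eob_out)
    {
        static const int zig_zag[16] = { 0, 1,  4,  8,  5, 2,  3,  6,
                                         9, 12, 13, 10, 7, 11, 14, 15 };
        const short *boost = zbin_boost;
        int i, eob = -1;

        for (i = 0; i < 16; i++) {
            const int rc = zig_zag[i];
            int z  = coeff[rc];
            int sz = z >> 15;
            int x  = (z ^ sz) - sz;              /* abs(z) */

            qcoeff[rc] = dqcoeff[rc] = 0;
            if (x >= zbin[rc] + *boost++ + zbin_oq) {
                int y;
                x += round[rc];
                y  = (((x * quant[rc]) >> 16) + x) >> quant_shift[rc];
                x  = (y ^ sz) - sz;              /* re-apply the sign */
                qcoeff[rc]  = (short)x;
                dqcoeff[rc] = (short)(x * dequant[rc]);
                if (y) {
                    eob = i;
                    boost = zbin_boost;          /* reset zero-run boost */
                }
            }
        }
        *eob_out = eob + 1;
    }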
-
-; void vp9_fast_quantize_b_sse2 | arg
-;  (BLOCK  *b,                  |  0
-;   BLOCKD *d)                  |  1
-
-global sym(vp9_fast_quantize_b_sse2)
-sym(vp9_fast_quantize_b_sse2):
-    push        rbp
-    mov         rbp, rsp
-    GET_GOT     rbx
-
-%if ABI_IS_32BIT
-    push        rdi
-    push        rsi
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    push        rdi
-    push        rsi
-  %else
-    ; these registers are used for passing arguments
-  %endif
-%endif
-
-    ; end prolog
-
-%if ABI_IS_32BIT
-    mov         rdi, arg(0)                 ; BLOCK *b
-    mov         rsi, arg(1)                 ; BLOCKD *d
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    mov         rdi, rcx                    ; BLOCK *b
-    mov         rsi, rdx                    ; BLOCKD *d
-  %else
-    ;mov         rdi, rdi                    ; BLOCK *b
-    ;mov         rsi, rsi                    ; BLOCKD *d
-  %endif
-%endif
-
-    mov         rax, [rdi + vp9_block_coeff]
-    mov         rcx, [rdi + vp9_block_round]
-    mov         rdx, [rdi + vp9_block_quant_fast]
-
-    ; z = coeff
-    movdqa      xmm0, [rax]
-    movdqa      xmm4, [rax + 16]
-
-    ; dup z so we can save sz
-    movdqa      xmm1, xmm0
-    movdqa      xmm5, xmm4
-
-    ; sz = z >> 15
-    psraw       xmm0, 15
-    psraw       xmm4, 15
-
-    ; x = abs(z) = (z ^ sz) - sz
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-    psubw       xmm1, xmm0
-    psubw       xmm5, xmm4
-
-    ; x += round
-    paddw       xmm1, [rcx]
-    paddw       xmm5, [rcx + 16]
-
-    mov         rax, [rsi + vp9_blockd_qcoeff]
-    mov         rcx, [rsi + vp9_blockd_dequant]
-    mov         rdi, [rsi + vp9_blockd_dqcoeff]
-
-    ; y = x * quant >> 16
-    pmulhw      xmm1, [rdx]
-    pmulhw      xmm5, [rdx + 16]
-
-    ; x = (y ^ sz) - sz
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-    psubw       xmm1, xmm0
-    psubw       xmm5, xmm4
-
-    ; qcoeff = x
-    movdqa      [rax], xmm1
-    movdqa      [rax + 16], xmm5
-
-    ; x * dequant
-    movdqa      xmm2, xmm1
-    movdqa      xmm3, xmm5
-    pmullw      xmm2, [rcx]
-    pmullw      xmm3, [rcx + 16]
-
-    ; dqcoeff = x * dequant
-    movdqa      [rdi], xmm2
-    movdqa      [rdi + 16], xmm3
-
-    pxor        xmm4, xmm4                  ;clear all bits
-    pcmpeqw     xmm1, xmm4
-    pcmpeqw     xmm5, xmm4
-
-    pcmpeqw     xmm4, xmm4                  ;set all bits
-    pxor        xmm1, xmm4
-    pxor        xmm5, xmm4
-
-    pand        xmm1, [GLOBAL(inv_zig_zag)]
-    pand        xmm5, [GLOBAL(inv_zig_zag + 16)]
-
-    pmaxsw      xmm1, xmm5
-
-    ; now down to 8
-    pshufd      xmm5, xmm1, 00001110b
-
-    pmaxsw      xmm1, xmm5
-
-    ; only 4 left
-    pshuflw     xmm5, xmm1, 00001110b
-
-    pmaxsw      xmm1, xmm5
-
-    ; okay, just 2!
-    pshuflw     xmm5, xmm1, 00000001b
-
-    pmaxsw      xmm1, xmm5
-
-    movd        eax, xmm1
-    and         eax, 0xff
-    mov         [rsi + vp9_blockd_eob], eax
-
-    ; begin epilog
-%if ABI_IS_32BIT
-    pop         rsi
-    pop         rdi
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    pop         rsi
-    pop         rdi
-  %endif
-%endif
-
-    RESTORE_GOT
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-inv_zig_zag:
-  dw 0x0001, 0x0002, 0x0006, 0x0007
-  dw 0x0003, 0x0005, 0x0008, 0x000d
-  dw 0x0004, 0x0009, 0x000c, 0x000e
-  dw 0x000a, 0x000b, 0x000f, 0x0010
--- a/vp8/encoder/x86/quantize_sse4.asm
+++ /dev/null
@@ -1,254 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-%include "asm_enc_offsets.asm"
-
-
-; void vp9_regular_quantize_b_sse4 | arg
-;  (BLOCK  *b,                     |  0
-;   BLOCKD *d)                     |  1
-
-global sym(vp9_regular_quantize_b_sse4)
-sym(vp9_regular_quantize_b_sse4):
-
-%if ABI_IS_32BIT
-    push        rbp
-    mov         rbp, rsp
-    GET_GOT     rbx
-    push        rdi
-    push        rsi
-
-    ALIGN_STACK 16, rax
-    %define qcoeff      0 ; 32
-    %define stack_size 32
-    sub         rsp, stack_size
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    SAVE_XMM 8, u
-    push        rdi
-    push        rsi
-  %endif
-%endif
-    ; end prolog
-
-%if ABI_IS_32BIT
-    mov         rdi, arg(0)                 ; BLOCK *b
-    mov         rsi, arg(1)                 ; BLOCKD *d
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    mov         rdi, rcx                    ; BLOCK *b
-    mov         rsi, rdx                    ; BLOCKD *d
-  %else
-    ;mov         rdi, rdi                    ; BLOCK *b
-    ;mov         rsi, rsi                    ; BLOCKD *d
-  %endif
-%endif
-
-    mov         rax, [rdi + vp9_block_coeff]
-    mov         rcx, [rdi + vp9_block_zbin]
-    mov         rdx, [rdi + vp9_block_round]
-    movd        xmm7, [rdi + vp9_block_zbin_extra]
-
-    ; z
-    movdqa      xmm0, [rax]
-    movdqa      xmm1, [rax + 16]
-
-    ; duplicate zbin_oq_value
-    pshuflw     xmm7, xmm7, 0
-    punpcklwd   xmm7, xmm7
-
-    movdqa      xmm2, xmm0
-    movdqa      xmm3, xmm1
-
-    ; sz
-    psraw       xmm0, 15
-    psraw       xmm1, 15
-
-    ; (z ^ sz)
-    pxor        xmm2, xmm0
-    pxor        xmm3, xmm1
-
-    ; x = abs(z)
-    psubw       xmm2, xmm0
-    psubw       xmm3, xmm1
-
-    ; zbin
-    movdqa      xmm4, [rcx]
-    movdqa      xmm5, [rcx + 16]
-
-    ; *zbin_ptr + zbin_oq_value
-    paddw       xmm4, xmm7
-    paddw       xmm5, xmm7
-
-    movdqa      xmm6, xmm2
-    movdqa      xmm7, xmm3
-
-    ; x - (*zbin_ptr + zbin_oq_value)
-    psubw       xmm6, xmm4
-    psubw       xmm7, xmm5
-
-    ; round
-    movdqa      xmm4, [rdx]
-    movdqa      xmm5, [rdx + 16]
-
-    mov         rax, [rdi + vp9_block_quant_shift]
-    mov         rcx, [rdi + vp9_block_quant]
-    mov         rdx, [rdi + vp9_block_zrun_zbin_boost]
-
-    ; x + round
-    paddw       xmm2, xmm4
-    paddw       xmm3, xmm5
-
-    ; quant
-    movdqa      xmm4, [rcx]
-    movdqa      xmm5, [rcx + 16]
-
-    ; y = x * quant_ptr >> 16
-    pmulhw      xmm4, xmm2
-    pmulhw      xmm5, xmm3
-
-    ; y += x
-    paddw       xmm2, xmm4
-    paddw       xmm3, xmm5
-
-    pxor        xmm4, xmm4
-%if ABI_IS_32BIT
-    movdqa      [rsp + qcoeff], xmm4
-    movdqa      [rsp + qcoeff + 16], xmm4
-%else
-    pxor        xmm8, xmm8
-%endif
-
-    ; quant_shift
-    movdqa      xmm5, [rax]
-
-    ; zrun_zbin_boost
-    mov         rax, rdx
-
-%macro ZIGZAG_LOOP 5
-    ; x
-    pextrw      ecx, %4, %2
-
-    ; if (x >= zbin)
-    sub         cx, WORD PTR[rdx]           ; x - zbin
-    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++
-    jl          .rq_zigzag_loop_%1          ; x < zbin
-
-    pextrw      edi, %3, %2                 ; y
-
-    ; downshift by quant_shift[rc]
-    pextrb      ecx, xmm5, %1               ; quant_shift[rc]
-    sar         edi, cl                     ; also sets Z bit
-    je          .rq_zigzag_loop_%1          ; !y
-%if ABI_IS_32BIT
-    mov         WORD PTR[rsp + qcoeff + %1 *2], di
-%else
-    pinsrw      %5, edi, %2                 ; qcoeff[rc]
-%endif
-    mov         rdx, rax                    ; reset to b->zrun_zbin_boost
-.rq_zigzag_loop_%1:
-%endmacro
-; in vp9_default_zig_zag1d order: see vp8/common/entropy.c
-ZIGZAG_LOOP  0, 0, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  1, 1, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  4, 4, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  8, 0, xmm3, xmm7, xmm8
-ZIGZAG_LOOP  5, 5, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  2, 2, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  3, 3, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  6, 6, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  9, 1, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 12, 4, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 13, 5, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 10, 2, xmm3, xmm7, xmm8
-ZIGZAG_LOOP  7, 7, xmm2, xmm6, xmm4
-ZIGZAG_LOOP 11, 3, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 14, 6, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 15, 7, xmm3, xmm7, xmm8
-
-    mov         rcx, [rsi + vp9_blockd_dequant]
-    mov         rdi, [rsi + vp9_blockd_dqcoeff]
-
-%if ABI_IS_32BIT
-    movdqa      xmm4, [rsp + qcoeff]
-    movdqa      xmm5, [rsp + qcoeff + 16]
-%else
-    %define     xmm5 xmm8
-%endif
-
-    ; y ^ sz
-    pxor        xmm4, xmm0
-    pxor        xmm5, xmm1
-    ; x = (y ^ sz) - sz
-    psubw       xmm4, xmm0
-    psubw       xmm5, xmm1
-
-    ; dequant
-    movdqa      xmm0, [rcx]
-    movdqa      xmm1, [rcx + 16]
-
-    mov         rcx, [rsi + vp9_blockd_qcoeff]
-
-    pmullw      xmm0, xmm4
-    pmullw      xmm1, xmm5
-
-    ; store qcoeff
-    movdqa      [rcx], xmm4
-    movdqa      [rcx + 16], xmm5
-
-    ; store dqcoeff
-    movdqa      [rdi], xmm0
-    movdqa      [rdi + 16], xmm1
-
-    ; select the last value (in zig_zag order) for EOB
-    pxor        xmm6, xmm6
-    pcmpeqw     xmm4, xmm6
-    pcmpeqw     xmm5, xmm6
-
-    packsswb    xmm4, xmm5
-    pshufb      xmm4, [GLOBAL(zig_zag1d)]
-    pmovmskb    edx, xmm4
-    xor         rdi, rdi
-    mov         eax, -1
-    xor         dx, ax
-    bsr         eax, edx
-    sub         edi, edx
-    sar         edi, 31
-    add         eax, 1
-    and         eax, edi
-
-    mov         [rsi + vp9_blockd_eob], eax
-
-    ; begin epilog
-%if ABI_IS_32BIT
-    add         rsp, stack_size
-    pop         rsp
-
-    pop         rsi
-    pop         rdi
-    RESTORE_GOT
-    pop         rbp
-%else
-  %undef xmm5
-  %ifidn __OUTPUT_FORMAT__,x64
-    pop         rsi
-    pop         rdi
-    RESTORE_XMM
-  %endif
-%endif
-
-    ret
-
-SECTION_RODATA
-align 16
-; vp8/common/entropy.c: vp9_default_zig_zag1d
-zig_zag1d:
-    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
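
The tail of this quantizer computes EOB without branches: pcmpeqw builds a zero mask per coefficient, packsswb narrows it to bytes, pshufb reorders it into zig-zag scan order, pmovmskb collapses it to a 16-bit mask, the xor inverts it so set bits mark nonzero coefficients, and bsr finds the highest one; the sub/sar/and pair forces eob to 0 for an all-zero mask, where bsr's result is undefined. A scalar statement of the same result, with an illustrative name:

    /* eob = 1 + index of the last nonzero coefficient in scan order,
     * or 0 if none; bit i of the mask marks zig-zag coefficient i. */
    static int eob_from_mask_sketch(unsigned int nonzero_mask)
    {
        int i, eob = 0;
        for (i = 0; i < 16; i++)
            if (nonzero_mask & (1u << i))
                eob = i + 1;
        return eob;
    }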
--- a/vp8/encoder/x86/quantize_ssse3.asm
+++ /dev/null
@@ -1,138 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-%include "asm_enc_offsets.asm"
-
-
-; void vp9_fast_quantize_b_ssse3 | arg
-;  (BLOCK  *b,                   |  0
-;   BLOCKD *d)                   |  1
-;
-
-global sym(vp9_fast_quantize_b_ssse3)
-sym(vp9_fast_quantize_b_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    GET_GOT     rbx
-
-%if ABI_IS_32BIT
-    push        rdi
-    push        rsi
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    push        rdi
-    push        rsi
-  %endif
-%endif
-    ; end prolog
-
-%if ABI_IS_32BIT
-    mov         rdi, arg(0)                 ; BLOCK *b
-    mov         rsi, arg(1)                 ; BLOCKD *d
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    mov         rdi, rcx                    ; BLOCK *b
-    mov         rsi, rdx                    ; BLOCKD *d
-  %else
-    ;mov         rdi, rdi                    ; BLOCK *b
-    ;mov         rsi, rsi                    ; BLOCKD *d
-  %endif
-%endif
-
-    mov         rax, [rdi + vp9_block_coeff]
-    mov         rcx, [rdi + vp9_block_round]
-    mov         rdx, [rdi + vp9_block_quant_fast]
-
-    ; coeff
-    movdqa      xmm0, [rax]
-    movdqa      xmm4, [rax + 16]
-
-    ; round
-    movdqa      xmm2, [rcx]
-    movdqa      xmm3, [rcx + 16]
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm5, xmm4
-
-    ; sz = z >> 15
-    psraw       xmm0, 15
-    psraw       xmm4, 15
-
-    pabsw       xmm1, xmm1
-    pabsw       xmm5, xmm5
-
-    paddw       xmm1, xmm2
-    paddw       xmm5, xmm3
-
-    ; quant_fast
-    pmulhw      xmm1, [rdx]
-    pmulhw      xmm5, [rdx + 16]
-
-    mov         rax, [rsi + vp9_blockd_qcoeff]
-    mov         rdi, [rsi + vp9_blockd_dequant]
-    mov         rcx, [rsi + vp9_blockd_dqcoeff]
-
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-    psubw       xmm1, xmm0
-    psubw       xmm5, xmm4
-
-    movdqa      [rax], xmm1
-    movdqa      [rax + 16], xmm5
-
-    movdqa      xmm2, [rdi]
-    movdqa      xmm3, [rdi + 16]
-
-    pxor        xmm4, xmm4
-    pmullw      xmm2, xmm1
-    pmullw      xmm3, xmm5
-
-    pcmpeqw     xmm1, xmm4                  ;non zero mask
-    pcmpeqw     xmm5, xmm4                  ;non zero mask
-    packsswb    xmm1, xmm5
-    pshufb      xmm1, [GLOBAL(zz_shuf)]
-
-    pmovmskb    edx, xmm1
-
-    xor         rdi, rdi
-    mov         eax, -1
-    xor         dx, ax                      ;flip the bits for bsr
-    bsr         eax, edx
-
-    movdqa      [rcx], xmm2                 ;store dqcoeff
-    movdqa      [rcx + 16], xmm3            ;store dqcoeff
-
-    sub         edi, edx                    ;check for all zeros in bit mask
-    sar         edi, 31                     ;0 or -1
-    add         eax, 1
-    and         eax, edi                    ;if the bit mask was all zero,
-                                            ;then eob = 0
-    mov         [rsi + vp9_blockd_eob], eax
-
-    ; begin epilog
-%if ABI_IS_32BIT
-    pop         rsi
-    pop         rdi
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    pop         rsi
-    pop         rdi
-  %endif
-%endif
-
-    RESTORE_GOT
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-zz_shuf:
-    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
--- a/vp8/encoder/x86/quantize_x86.h
+++ /dev/null
@@ -1,48 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
- */
-
-#ifndef QUANTIZE_X86_H
-#define QUANTIZE_X86_H
-
-
-/* Note:
- *
- * This platform is commonly built for runtime CPU detection. If you modify
- * any of the function mappings present in this file, be sure to also update
- * them in the function pointer initialization code
- */
-#if HAVE_MMX
-
-#endif /* HAVE_MMX */
-
-
-#if HAVE_SSE2
-extern prototype_quantize_block(vp9_regular_quantize_b_sse2);
-#if !CONFIG_RUNTIME_CPU_DETECT
-
-#undef vp9_quantize_quantb
-#define vp9_quantize_quantb vp9_regular_quantize_b_sse2
-#endif /* !CONFIG_RUNTIME_CPU_DETECT */
-
-#endif /* HAVE_SSE2 */
-
-
-#if HAVE_SSE4_1
-extern prototype_quantize_block(vp9_regular_quantize_b_sse4);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-
-#undef vp9_quantize_quantb
-#define vp9_quantize_quantb vp9_regular_quantize_b_sse4
-
-#endif /* !CONFIG_RUNTIME_CPU_DETECT */
-
-#endif /* HAVE_SSE4_1 */
-
-#endif /* QUANTIZE_X86_H */
--- a/vp8/encoder/x86/sad_mmx.asm
+++ /dev/null
@@ -1,427 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-global sym(vp9_sad16x16_mmx)
-global sym(vp9_sad8x16_mmx)
-global sym(vp9_sad8x8_mmx)
-global sym(vp9_sad4x4_mmx)
-global sym(vp9_sad16x8_mmx)
-
-;unsigned int vp9_sad16x16_mmx(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-sym(vp9_sad16x16_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rax*8]
-
-        lea             rcx,        [rcx+rax*8]
-        pxor            mm7,        mm7
-
-        pxor            mm6,        mm6
-
-.x16x16sad_mmx_loop:
-
-        movq            mm0,        QWORD PTR [rsi]
-        movq            mm2,        QWORD PTR [rsi+8]
-
-        movq            mm1,        QWORD PTR [rdi]
-        movq            mm3,        QWORD PTR [rdi+8]
-
-        movq            mm4,        mm0
-        movq            mm5,        mm2
-
-        psubusb         mm0,        mm1
-        psubusb         mm1,        mm4
-
-        psubusb         mm2,        mm3
-        psubusb         mm3,        mm5
-
-        por             mm0,        mm1
-        por             mm2,        mm3
-
-        movq            mm1,        mm0
-        movq            mm3,        mm2
-
-        punpcklbw       mm0,        mm6
-        punpcklbw       mm2,        mm6
-
-        punpckhbw       mm1,        mm6
-        punpckhbw       mm3,        mm6
-
-        paddw           mm0,        mm2
-        paddw           mm1,        mm3
-
-
-        lea             rsi,        [rsi+rax]
-        add             rdi,        rdx
-
-        paddw           mm7,        mm0
-        paddw           mm7,        mm1
-
-        cmp             rsi,        rcx
-        jne             .x16x16sad_mmx_loop
-
-
-        movq            mm0,        mm7
-
-        punpcklwd       mm0,        mm6
-        punpckhwd       mm7,        mm6
-
-        paddw           mm0,        mm7
-        movq            mm7,        mm0
-
-
-        psrlq           mm0,        32
-        paddw           mm7,        mm0
-
-        movq            rax,        mm7
-
-    pop rdi
-    pop rsi
-    mov rsp, rbp
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
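
Each kernel in this file is the SIMD form of a plain sum of absolute differences; psubusb in both directions followed by por yields |a - b| for unsigned bytes without a widening subtract. A scalar reference covering all the block sizes here, with an illustrative name:

    static unsigned int sad_wxh_sketch(const unsigned char *src, int src_stride,
                                       const unsigned char *ref, int ref_stride,
                                       int w, int h)
    {
        unsigned int sad = 0;
        int r, c;
        for (r = 0; r < h; r++) {
            for (c = 0; c < w; c++) {
                int d = src[c] - ref[c];
                sad += (d < 0) ? -d : d;        /* psubusb both ways + por */
            }
            src += src_stride;
            ref += ref_stride;
        }
        return sad;
    }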
-
-
-;unsigned int vp9_sad8x16_mmx(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-sym(vp9_sad8x16_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rax*8]
-
-        lea             rcx,        [rcx+rax*8]
-        pxor            mm7,        mm7
-
-        pxor            mm6,        mm6
-
-.x8x16sad_mmx_loop:
-
-        movq            mm0,        QWORD PTR [rsi]
-        movq            mm1,        QWORD PTR [rdi]
-
-        movq            mm2,        mm0
-        psubusb         mm0,        mm1
-
-        psubusb         mm1,        mm2
-        por             mm0,        mm1
-
-        movq            mm2,        mm0
-        punpcklbw       mm0,        mm6
-
-        punpckhbw       mm2,        mm6
-        lea             rsi,        [rsi+rax]
-
-        add             rdi,        rdx
-        paddw           mm7,        mm0
-
-        paddw           mm7,        mm2
-        cmp             rsi,        rcx
-
-        jne             .x8x16sad_mmx_loop
-
-        movq            mm0,        mm7
-        punpcklwd       mm0,        mm6
-
-        punpckhwd       mm7,        mm6
-        paddw           mm0,        mm7
-
-        movq            mm7,        mm0
-        psrlq           mm0,        32
-
-        paddw           mm7,        mm0
-        movq            rax,        mm7
-
-    pop rdi
-    pop rsi
-    mov rsp, rbp
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp9_sad8x8_mmx(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-sym(vp9_sad8x8_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rax*8]
-        pxor            mm7,        mm7
-
-        pxor            mm6,        mm6
-
-.x8x8sad_mmx_loop:
-
-        movq            mm0,        QWORD PTR [rsi]
-        movq            mm1,        QWORD PTR [rdi]
-
-        movq            mm2,        mm0
-        psubusb         mm0,        mm1
-
-        psubusb         mm1,        mm2
-        por             mm0,        mm1
-
-        movq            mm2,        mm0
-        punpcklbw       mm0,        mm6
-
-        punpckhbw       mm2,        mm6
-        paddw           mm0,        mm2
-
-        lea             rsi,       [rsi+rax]
-        add             rdi,        rdx
-
-        paddw           mm7,       mm0
-        cmp             rsi,        rcx
-
-        jne             .x8x8sad_mmx_loop
-
-        movq            mm0,        mm7
-        punpcklwd       mm0,        mm6
-
-        punpckhwd       mm7,        mm6
-        paddw           mm0,        mm7
-
-        movq            mm7,        mm0
-        psrlq           mm0,        32
-
-        paddw           mm7,        mm0
-        movq            rax,        mm7
-
-    pop rdi
-    pop rsi
-    mov rsp, rbp
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp9_sad4x4_mmx(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-sym(vp9_sad4x4_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        movd            mm0,        DWORD PTR [rsi]
-        movd            mm1,        DWORD PTR [rdi]
-
-        movd            mm2,        DWORD PTR [rsi+rax]
-        movd            mm3,        DWORD PTR [rdi+rdx]
-
-        punpcklbw       mm0,        mm2
-        punpcklbw       mm1,        mm3
-
-        movq            mm2,        mm0
-        psubusb         mm0,        mm1
-
-        psubusb         mm1,        mm2
-        por             mm0,        mm1
-
-        movq            mm2,        mm0
-        pxor            mm3,        mm3
-
-        punpcklbw       mm0,        mm3
-        punpckhbw       mm2,        mm3
-
-        paddw           mm0,        mm2
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        movd            mm4,        DWORD PTR [rsi]
-        movd            mm5,        DWORD PTR [rdi]
-
-        movd            mm6,        DWORD PTR [rsi+rax]
-        movd            mm7,        DWORD PTR [rdi+rdx]
-
-        punpcklbw       mm4,        mm6
-        punpcklbw       mm5,        mm7
-
-        movq            mm6,        mm4
-        psubusb         mm4,        mm5
-
-        psubusb         mm5,        mm6
-        por             mm4,        mm5
-
-        movq            mm5,        mm4
-        punpcklbw       mm4,        mm3
-
-        punpckhbw       mm5,        mm3
-        paddw           mm4,        mm5
-
-        paddw           mm0,        mm4
-        movq            mm1,        mm0
-
-        punpcklwd       mm0,        mm3
-        punpckhwd       mm1,        mm3
-
-        paddw           mm0,        mm1
-        movq            mm1,        mm0
-
-        psrlq           mm0,        32
-        paddw           mm0,        mm1
-
-        movq            rax,        mm0
-
-    pop rdi
-    pop rsi
-    mov rsp, rbp
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp9_sad16x8_mmx(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-sym(vp9_sad16x8_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rax*8]
-        pxor            mm7,        mm7
-
-        pxor            mm6,        mm6
-
-.x16x8sad_mmx_loop:
-
-        movq            mm0,       [rsi]
-        movq            mm1,       [rdi]
-
-        movq            mm2,        [rsi+8]
-        movq            mm3,        [rdi+8]
-
-        movq            mm4,        mm0
-        movq            mm5,        mm2
-
-        psubusb         mm0,        mm1
-        psubusb         mm1,        mm4
-
-        psubusb         mm2,        mm3
-        psubusb         mm3,        mm5
-
-        por             mm0,        mm1
-        por             mm2,        mm3
-
-        movq            mm1,        mm0
-        movq            mm3,        mm2
-
-        punpcklbw       mm0,        mm6
-        punpckhbw       mm1,        mm6
-
-        punpcklbw       mm2,        mm6
-        punpckhbw       mm3,        mm6
-
-
-        paddw           mm0,        mm2
-        paddw           mm1,        mm3
-
-        paddw           mm0,        mm1
-        lea             rsi,        [rsi+rax]
-
-        add             rdi,        rdx
-        paddw           mm7,        mm0
-
-        cmp             rsi,        rcx
-        jne             .x16x8sad_mmx_loop
-
-        movq            mm0,        mm7
-        punpcklwd       mm0,        mm6
-
-        punpckhwd       mm7,        mm6
-        paddw           mm0,        mm7
-
-        movq            mm7,        mm0
-        psrlq           mm0,        32
-
-        paddw           mm7,        mm0
-        movq            rax,        mm7
-
-    pop rdi
-    pop rsi
-    mov rsp, rbp
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
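Each MMX loop above accumulates four 16-bit partial sums in mm7 before folding them into the return value; a sketch of that tail reduction, assuming the lane layout described in the comments (illustration only):

static unsigned int fold_word_lanes(const unsigned short acc[4]) {
    /* punpcklwd/punpckhwd widen the four words to dwords, paddw pairs them,
       then psrlq 32 + paddw folds the two dwords into one total */
    return ((unsigned int)acc[0] + acc[2]) + ((unsigned int)acc[1] + acc[3]);
}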
--- a/vp8/encoder/x86/sad_sse2.asm
+++ /dev/null
@@ -1,410 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;unsigned int vp9_sad16x16_wmt(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-global sym(vp9_sad16x16_wmt)
-sym(vp9_sad16x16_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    SAVE_XMM 6
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rax*8]
-
-        lea             rcx,        [rcx+rax*8]
-        pxor            xmm6,       xmm6
-
-.x16x16sad_wmt_loop:
-
-        movq            xmm0,       QWORD PTR [rsi]
-        movq            xmm2,       QWORD PTR [rsi+8]
-
-        movq            xmm1,       QWORD PTR [rdi]
-        movq            xmm3,       QWORD PTR [rdi+8]
-
-        movq            xmm4,       QWORD PTR [rsi+rax]
-        movq            xmm5,       QWORD PTR [rdi+rdx]
-
-
-        punpcklbw       xmm0,       xmm2
-        punpcklbw       xmm1,       xmm3
-
-        psadbw          xmm0,       xmm1
-        movq            xmm2,       QWORD PTR [rsi+rax+8]
-
-        movq            xmm3,       QWORD PTR [rdi+rdx+8]
-        lea             rsi,        [rsi+rax*2]
-
-        lea             rdi,        [rdi+rdx*2]
-        punpcklbw       xmm4,       xmm2
-
-        punpcklbw       xmm5,       xmm3
-        psadbw          xmm4,       xmm5
-
-        paddw           xmm6,       xmm0
-        paddw           xmm6,       xmm4
-
-        cmp             rsi,        rcx
-        jne             .x16x16sad_wmt_loop
-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movq            rax,        xmm0
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
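The _wmt routines replace the MMX unpack-and-add sequence with psadbw, which sums eight absolute byte differences into a single word per 8-byte lane; a byte-level model of one lane (illustrative, not the deleted file's API):

#include <stdlib.h>

static unsigned int psadbw_lane(const unsigned char a[8], const unsigned char b[8]) {
    unsigned int s = 0;
    for (int i = 0; i < 8; ++i)
        s += (unsigned int)abs(a[i] - b[i]);
    return s;  /* one psadbw produces this per 64-bit lane */
}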
-;unsigned int vp9_sad8x16_wmt(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  max_err)
-global sym(vp9_sad8x16_wmt)
-sym(vp9_sad8x16_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rbx,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rbx*8]
-
-        lea             rcx,        [rcx+rbx*8]
-        pxor            mm7,        mm7
-
-.x8x16sad_wmt_loop:
-
-        movq            rax,        mm7
-        cmp             eax,        arg(4)
-        jg              .x8x16sad_wmt_early_exit
-
-        movq            mm0,        QWORD PTR [rsi]
-        movq            mm1,        QWORD PTR [rdi]
-
-        movq            mm2,        QWORD PTR [rsi+rbx]
-        movq            mm3,        QWORD PTR [rdi+rdx]
-
-        psadbw          mm0,        mm1
-        psadbw          mm2,        mm3
-
-        lea             rsi,        [rsi+rbx*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        paddw           mm7,        mm0
-        paddw           mm7,        mm2
-
-        cmp             rsi,        rcx
-        jne             .x8x16sad_wmt_loop
-
-        movq            rax,        mm7
-
-.x8x16sad_wmt_early_exit:
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    pop         rbx
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
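vp9_sad8x16_wmt, vp9_sad8x8_wmt and vp9_sad16x8_wmt compare the running total against arg(4) at the top of each loop iteration and bail out once it exceeds max_err; a sketch of that early-exit pattern (names illustrative):

#include <stdlib.h>

static unsigned int sad8xh_max_err(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride,
                                   int height, int max_err) {
    unsigned int sad = 0;
    for (int r = 0; r < height; ++r) {
        if ((int)sad > max_err)   /* movq rax, mm7; cmp eax, arg(4); jg */
            return sad;           /* partial SAD; caller only needs "worse than best" */
        for (int c = 0; c < 8; ++c)
            sad += (unsigned int)abs(src[c] - ref[c]);
        src += src_stride;
        ref += ref_stride;
    }
    return sad;
}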
-
-;unsigned int vp9_sad8x8_wmt(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  max_err)
-global sym(vp9_sad8x8_wmt)
-sym(vp9_sad8x8_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rbx,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rbx*8]
-        pxor            mm7,        mm7
-
-.x8x8sad_wmt_loop:
-
-        movq            rax,        mm7
-        cmp             eax,        arg(4)
-        jg              .x8x8sad_wmt_early_exit
-
-        movq            mm0,        QWORD PTR [rsi]
-        movq            mm1,        QWORD PTR [rdi]
-
-        psadbw          mm0,        mm1
-        lea             rsi,        [rsi+rbx]
-
-        add             rdi,        rdx
-        paddw           mm7,        mm0
-
-        cmp             rsi,        rcx
-        jne             .x8x8sad_wmt_loop
-
-        movq            rax,        mm7
-.x8x8sad_wmt_early_exit:
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    pop         rbx
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;unsigned int vp9_sad4x4_wmt(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-global sym(vp9_sad4x4_wmt)
-sym(vp9_sad4x4_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        movd            mm0,        DWORD PTR [rsi]
-        movd            mm1,        DWORD PTR [rdi]
-
-        movd            mm2,        DWORD PTR [rsi+rax]
-        movd            mm3,        DWORD PTR [rdi+rdx]
-
-        punpcklbw       mm0,        mm2
-        punpcklbw       mm1,        mm3
-
-        psadbw          mm0,        mm1
-        lea             rsi,        [rsi+rax*2]
-
-        lea             rdi,        [rdi+rdx*2]
-        movd            mm4,        DWORD PTR [rsi]
-
-        movd            mm5,        DWORD PTR [rdi]
-        movd            mm6,        DWORD PTR [rsi+rax]
-
-        movd            mm7,        DWORD PTR [rdi+rdx]
-        punpcklbw       mm4,        mm6
-
-        punpcklbw       mm5,        mm7
-        psadbw          mm4,        mm5
-
-        paddw           mm0,        mm4
-        movq            rax,        mm0
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp9_sad16x8_wmt(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  max_err)
-global sym(vp9_sad16x8_wmt)
-sym(vp9_sad16x8_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rbx,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rbx*8]
-        pxor            mm7,        mm7
-
-.x16x8sad_wmt_loop:
-
-        movq            rax,        mm7
-        cmp             eax,        arg(4)
-        jg              .x16x8sad_wmt_early_exit
-
-        movq            mm0,        QWORD PTR [rsi]
-        movq            mm2,        QWORD PTR [rsi+8]
-
-        movq            mm1,        QWORD PTR [rdi]
-        movq            mm3,        QWORD PTR [rdi+8]
-
-        movq            mm4,        QWORD PTR [rsi+rbx]
-        movq            mm5,        QWORD PTR [rdi+rdx]
-
-        psadbw          mm0,        mm1
-        psadbw          mm2,        mm3
-
-        movq            mm1,        QWORD PTR [rsi+rbx+8]
-        movq            mm3,        QWORD PTR [rdi+rdx+8]
-
-        psadbw          mm4,        mm5
-        psadbw          mm1,        mm3
-
-        lea             rsi,        [rsi+rbx*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        paddw           mm0,        mm2
-        paddw           mm4,        mm1
-
-        paddw           mm7,        mm0
-        paddw           mm7,        mm4
-
-        cmp             rsi,        rcx
-        jne             .x16x8sad_wmt_loop
-
-        movq            rax,        mm7
-
-.x16x8sad_wmt_early_exit:
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    pop         rbx
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_copy32xn_sse2(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *dst_ptr,
-;    int  dst_stride,
-;    int height);
-global sym(vp9_copy32xn_sse2)
-sym(vp9_copy32xn_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;dst_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;dst_stride
-        movsxd          rcx,        dword ptr arg(4) ;height
-
-.block_copy_sse2_loopx4:
-        movdqu          xmm0,       XMMWORD PTR [rsi]
-        movdqu          xmm1,       XMMWORD PTR [rsi + 16]
-        movdqu          xmm2,       XMMWORD PTR [rsi + rax]
-        movdqu          xmm3,       XMMWORD PTR [rsi + rax + 16]
-
-        lea             rsi,        [rsi+rax*2]
-
-        movdqu          xmm4,       XMMWORD PTR [rsi]
-        movdqu          xmm5,       XMMWORD PTR [rsi + 16]
-        movdqu          xmm6,       XMMWORD PTR [rsi + rax]
-        movdqu          xmm7,       XMMWORD PTR [rsi + rax + 16]
-
-        lea             rsi,    [rsi+rax*2]
-
-        movdqa          XMMWORD PTR [rdi], xmm0
-        movdqa          XMMWORD PTR [rdi + 16], xmm1
-        movdqa          XMMWORD PTR [rdi + rdx], xmm2
-        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm3
-
-        lea             rdi,    [rdi+rdx*2]
-
-        movdqa          XMMWORD PTR [rdi], xmm4
-        movdqa          XMMWORD PTR [rdi + 16], xmm5
-        movdqa          XMMWORD PTR [rdi + rdx], xmm6
-        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm7
-
-        lea             rdi,    [rdi+rdx*2]
-
-        sub             rcx,     4
-        cmp             rcx,     4
-        jge             .block_copy_sse2_loopx4
-
-        cmp             rcx, 0
-        je              .copy_is_done
-
-.block_copy_sse2_loop:
-        movdqu          xmm0,       XMMWORD PTR [rsi]
-        movdqu          xmm1,       XMMWORD PTR [rsi + 16]
-        lea             rsi,    [rsi+rax]
-
-        movdqa          XMMWORD PTR [rdi], xmm0
-        movdqa          XMMWORD PTR [rdi + 16], xmm1
-        lea             rdi,    [rdi+rdx]
-
-        sub             rcx,     1
-        jne             .block_copy_sse2_loop
-
-.copy_is_done:
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
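vp9_copy32xn_sse2 streams four 32-byte rows per iteration while at least four rows remain, then finishes one row at a time; a memcpy-based sketch of the same control flow (the asm assumes at least four rows on entry):

#include <string.h>

static void copy32xn(const unsigned char *src, int src_stride,
                     unsigned char *dst, int dst_stride, int height) {
    while (height >= 4) {                 /* .block_copy_sse2_loopx4 */
        for (int r = 0; r < 4; ++r) {
            memcpy(dst, src, 32);         /* two movdqu loads, two movdqa stores */
            src += src_stride;
            dst += dst_stride;
        }
        height -= 4;
    }
    while (height-- > 0) {                /* .block_copy_sse2_loop tail */
        memcpy(dst, src, 32);
        src += src_stride;
        dst += dst_stride;
    }
}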
--- a/vp8/encoder/x86/sad_sse3.asm
+++ /dev/null
@@ -1,960 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro STACK_FRAME_CREATE_X3 0
-%if ABI_IS_32BIT
-  %define     src_ptr       rsi
-  %define     src_stride    rax
-  %define     ref_ptr       rdi
-  %define     ref_stride    rdx
-  %define     end_ptr       rcx
-  %define     ret_var       rbx
-  %define     result_ptr    arg(4)
-  %define     max_err       arg(4)
-  %define     height        dword ptr arg(4)
-    push        rbp
-    mov         rbp,        rsp
-    push        rsi
-    push        rdi
-    push        rbx
-
-    mov         rsi,        arg(0)              ; src_ptr
-    mov         rdi,        arg(2)              ; ref_ptr
-
-    movsxd      rax,        dword ptr arg(1)    ; src_stride
-    movsxd      rdx,        dword ptr arg(3)    ; ref_stride
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    SAVE_XMM 7, u
-    %define     src_ptr     rcx
-    %define     src_stride  rdx
-    %define     ref_ptr     r8
-    %define     ref_stride  r9
-    %define     end_ptr     r10
-    %define     ret_var     r11
-    %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
-    %define     max_err     [rsp+xmm_stack_space+8+4*8]
-    %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]
-  %else
-    %define     src_ptr     rdi
-    %define     src_stride  rsi
-    %define     ref_ptr     rdx
-    %define     ref_stride  rcx
-    %define     end_ptr     r9
-    %define     ret_var     r10
-    %define     result_ptr  r8
-    %define     max_err     r8
-    %define     height      r8
-  %endif
-%endif
-
-%endmacro
-
-%macro STACK_FRAME_DESTROY_X3 0
-  %define     src_ptr
-  %define     src_stride
-  %define     ref_ptr
-  %define     ref_stride
-  %define     end_ptr
-  %define     ret_var
-  %define     result_ptr
-  %define     max_err
-  %define     height
-
-%if ABI_IS_32BIT
-    pop         rbx
-    pop         rdi
-    pop         rsi
-    pop         rbp
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    RESTORE_XMM
-  %endif
-%endif
-    ret
-%endmacro
-
-%macro STACK_FRAME_CREATE_X4 0
-%if ABI_IS_32BIT
-  %define     src_ptr       rsi
-  %define     src_stride    rax
-  %define     r0_ptr        rcx
-  %define     r1_ptr        rdx
-  %define     r2_ptr        rbx
-  %define     r3_ptr        rdi
-  %define     ref_stride    rbp
-  %define     result_ptr    arg(4)
-    push        rbp
-    mov         rbp,        rsp
-    push        rsi
-    push        rdi
-    push        rbx
-
-    push        rbp
-    mov         rdi,        arg(2)              ; ref_ptr_base
-
-    LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
-
-    mov         rsi,        arg(0)              ; src_ptr
-
-    movsxd      rbx,        dword ptr arg(1)    ; src_stride
-    movsxd      rbp,        dword ptr arg(3)    ; ref_stride
-
-    xchg        rbx,        rax
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    SAVE_XMM 7, u
-    %define     src_ptr     rcx
-    %define     src_stride  rdx
-    %define     r0_ptr      rsi
-    %define     r1_ptr      r10
-    %define     r2_ptr      r11
-    %define     r3_ptr      r8
-    %define     ref_stride  r9
-    %define     result_ptr  [rsp+xmm_stack_space+16+4*8]
-    push        rsi
-
-    LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr
-  %else
-    %define     src_ptr     rdi
-    %define     src_stride  rsi
-    %define     r0_ptr      r9
-    %define     r1_ptr      r10
-    %define     r2_ptr      r11
-    %define     r3_ptr      rdx
-    %define     ref_stride  rcx
-    %define     result_ptr  r8
-
-    LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr
-
-  %endif
-%endif
-%endmacro
-
-%macro STACK_FRAME_DESTROY_X4 0
-  %define     src_ptr
-  %define     src_stride
-  %define     r0_ptr
-  %define     r1_ptr
-  %define     r2_ptr
-  %define     r3_ptr
-  %define     ref_stride
-  %define     result_ptr
-
-%if ABI_IS_32BIT
-    pop         rbx
-    pop         rdi
-    pop         rsi
-    pop         rbp
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    pop         rsi
-    RESTORE_XMM
-  %endif
-%endif
-    ret
-%endmacro
-
-%macro PROCESS_16X2X3 5
-%if %1==0
-        movdqa          xmm0,       XMMWORD PTR [%2]
-        lddqu           xmm5,       XMMWORD PTR [%3]
-        lddqu           xmm6,       XMMWORD PTR [%3+1]
-        lddqu           xmm7,       XMMWORD PTR [%3+2]
-
-        psadbw          xmm5,       xmm0
-        psadbw          xmm6,       xmm0
-        psadbw          xmm7,       xmm0
-%else
-        movdqa          xmm0,       XMMWORD PTR [%2]
-        lddqu           xmm1,       XMMWORD PTR [%3]
-        lddqu           xmm2,       XMMWORD PTR [%3+1]
-        lddqu           xmm3,       XMMWORD PTR [%3+2]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endif
-        movdqa          xmm0,       XMMWORD PTR [%2+%4]
-        lddqu           xmm1,       XMMWORD PTR [%3+%5]
-        lddqu           xmm2,       XMMWORD PTR [%3+%5+1]
-        lddqu           xmm3,       XMMWORD PTR [%3+%5+2]
-
-%if %1==0 || %1==1
-        lea             %2,         [%2+%4*2]
-        lea             %3,         [%3+%5*2]
-%endif
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endmacro
-
-%macro PROCESS_8X2X3 5
-%if %1==0
-        movq            mm0,       QWORD PTR [%2]
-        movq            mm5,       QWORD PTR [%3]
-        movq            mm6,       QWORD PTR [%3+1]
-        movq            mm7,       QWORD PTR [%3+2]
-
-        psadbw          mm5,       mm0
-        psadbw          mm6,       mm0
-        psadbw          mm7,       mm0
-%else
-        movq            mm0,       QWORD PTR [%2]
-        movq            mm1,       QWORD PTR [%3]
-        movq            mm2,       QWORD PTR [%3+1]
-        movq            mm3,       QWORD PTR [%3+2]
-
-        psadbw          mm1,       mm0
-        psadbw          mm2,       mm0
-        psadbw          mm3,       mm0
-
-        paddw           mm5,       mm1
-        paddw           mm6,       mm2
-        paddw           mm7,       mm3
-%endif
-        movq            mm0,       QWORD PTR [%2+%4]
-        movq            mm1,       QWORD PTR [%3+%5]
-        movq            mm2,       QWORD PTR [%3+%5+1]
-        movq            mm3,       QWORD PTR [%3+%5+2]
-
-%if %1==0 || %1==1
-        lea             %2,        [%2+%4*2]
-        lea             %3,        [%3+%5*2]
-%endif
-
-        psadbw          mm1,       mm0
-        psadbw          mm2,       mm0
-        psadbw          mm3,       mm0
-
-        paddw           mm5,       mm1
-        paddw           mm6,       mm2
-        paddw           mm7,       mm3
-%endmacro
-
-%macro LOAD_X4_ADDRESSES 5
-        mov             %2,         [%1+REG_SZ_BYTES*0]
-        mov             %3,         [%1+REG_SZ_BYTES*1]
-
-        mov             %4,         [%1+REG_SZ_BYTES*2]
-        mov             %5,         [%1+REG_SZ_BYTES*3]
-%endmacro
-
-%macro PROCESS_16X2X4 8
-%if %1==0
-        movdqa          xmm0,       XMMWORD PTR [%2]
-        lddqu           xmm4,       XMMWORD PTR [%3]
-        lddqu           xmm5,       XMMWORD PTR [%4]
-        lddqu           xmm6,       XMMWORD PTR [%5]
-        lddqu           xmm7,       XMMWORD PTR [%6]
-
-        psadbw          xmm4,       xmm0
-        psadbw          xmm5,       xmm0
-        psadbw          xmm6,       xmm0
-        psadbw          xmm7,       xmm0
-%else
-        movdqa          xmm0,       XMMWORD PTR [%2]
-        lddqu           xmm1,       XMMWORD PTR [%3]
-        lddqu           xmm2,       XMMWORD PTR [%4]
-        lddqu           xmm3,       XMMWORD PTR [%5]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm4,       xmm1
-        lddqu           xmm1,       XMMWORD PTR [%6]
-        paddw           xmm5,       xmm2
-        paddw           xmm6,       xmm3
-
-        psadbw          xmm1,       xmm0
-        paddw           xmm7,       xmm1
-%endif
-        movdqa          xmm0,       XMMWORD PTR [%2+%7]
-        lddqu           xmm1,       XMMWORD PTR [%3+%8]
-        lddqu           xmm2,       XMMWORD PTR [%4+%8]
-        lddqu           xmm3,       XMMWORD PTR [%5+%8]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm4,       xmm1
-        lddqu           xmm1,       XMMWORD PTR [%6+%8]
-        paddw           xmm5,       xmm2
-        paddw           xmm6,       xmm3
-
-%if %1==0 || %1==1
-        lea             %2,         [%2+%7*2]
-        lea             %3,         [%3+%8*2]
-
-        lea             %4,         [%4+%8*2]
-        lea             %5,         [%5+%8*2]
-
-        lea             %6,         [%6+%8*2]
-%endif
-        psadbw          xmm1,       xmm0
-        paddw           xmm7,       xmm1
-
-%endmacro
-
-%macro PROCESS_8X2X4 8
-%if %1==0
-        movq            mm0,        QWORD PTR [%2]
-        movq            mm4,        QWORD PTR [%3]
-        movq            mm5,        QWORD PTR [%4]
-        movq            mm6,        QWORD PTR [%5]
-        movq            mm7,        QWORD PTR [%6]
-
-        psadbw          mm4,        mm0
-        psadbw          mm5,        mm0
-        psadbw          mm6,        mm0
-        psadbw          mm7,        mm0
-%else
-        movq            mm0,        QWORD PTR [%2]
-        movq            mm1,        QWORD PTR [%3]
-        movq            mm2,        QWORD PTR [%4]
-        movq            mm3,        QWORD PTR [%5]
-
-        psadbw          mm1,        mm0
-        psadbw          mm2,        mm0
-        psadbw          mm3,        mm0
-
-        paddw           mm4,        mm1
-        movq            mm1,        QWORD PTR [%6]
-        paddw           mm5,        mm2
-        paddw           mm6,        mm3
-
-        psadbw          mm1,        mm0
-        paddw           mm7,        mm1
-%endif
-        movq            mm0,        QWORD PTR [%2+%7]
-        movq            mm1,        QWORD PTR [%3+%8]
-        movq            mm2,        QWORD PTR [%4+%8]
-        movq            mm3,        QWORD PTR [%5+%8]
-
-        psadbw          mm1,        mm0
-        psadbw          mm2,        mm0
-        psadbw          mm3,        mm0
-
-        paddw           mm4,        mm1
-        movq            mm1,        QWORD PTR [%6+%8]
-        paddw           mm5,        mm2
-        paddw           mm6,        mm3
-
-%if %1==0 || %1==1
-        lea             %2,         [%2+%7*2]
-        lea             %3,         [%3+%8*2]
-
-        lea             %4,         [%4+%8*2]
-        lea             %5,         [%5+%8*2]
-
-        lea             %6,         [%6+%8*2]
-%endif
-        psadbw          mm1,        mm0
-        paddw           mm7,        mm1
-
-%endmacro
-
-;void vp9_sad16x16x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(vp9_sad16x16x3_sse3)
-sym(vp9_sad16x16x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
-        mov             rcx,        result_ptr
-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rcx],      xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rcx+4],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rcx+8],    xmm0
-
-    STACK_FRAME_DESTROY_X3
-
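The x3 entry points score the source block against the reference at horizontal offsets +0, +1 and +2 in a single call, which is why PROCESS_16X2X3 issues lddqu at [ref], [ref+1] and [ref+2]; a reference loop under that reading (hypothetical helper):

#include <stdlib.h>

static void sad16xhx3(const unsigned char *src, int src_stride,
                      const unsigned char *ref, int ref_stride,
                      int height, int results[3]) {
    for (int off = 0; off < 3; ++off) {
        unsigned int s = 0;
        for (int r = 0; r < height; ++r)
            for (int c = 0; c < 16; ++c)
                s += (unsigned int)abs(src[r * src_stride + c] -
                                       ref[r * ref_stride + c + off]);
        results[off] = (int)s;
    }
}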
-;void vp9_sad16x8x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(vp9_sad16x8x3_sse3)
-sym(vp9_sad16x8x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
-        mov             rcx,        result_ptr
-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rcx],      xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rcx+4],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rcx+8],    xmm0
-
-    STACK_FRAME_DESTROY_X3
-
-;void vp9_sad8x16x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(vp9_sad8x16x3_sse3)
-sym(vp9_sad8x16x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
-        mov             rcx,        result_ptr
-
-        punpckldq       mm5,        mm6
-
-        movq            [rcx],      mm5
-        movd            [rcx+8],    mm7
-
-    STACK_FRAME_DESTROY_X3
-
-;void vp9_sad8x8x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(vp9_sad8x8x3_sse3)
-sym(vp9_sad8x8x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
-        mov             rcx,        result_ptr
-
-        punpckldq       mm5,        mm6
-
-        movq            [rcx],      mm5
-        movd            [rcx+8],    mm7
-
-    STACK_FRAME_DESTROY_X3
-
-;void vp9_sad4x4x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(vp9_sad4x4x3_sse3)
-sym(vp9_sad4x4x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        movd            mm0,        DWORD PTR [src_ptr]
-        movd            mm1,        DWORD PTR [ref_ptr]
-
-        movd            mm2,        DWORD PTR [src_ptr+src_stride]
-        movd            mm3,        DWORD PTR [ref_ptr+ref_stride]
-
-        punpcklbw       mm0,        mm2
-        punpcklbw       mm1,        mm3
-
-        movd            mm4,        DWORD PTR [ref_ptr+1]
-        movd            mm5,        DWORD PTR [ref_ptr+2]
-
-        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
-        movd            mm3,        DWORD PTR [ref_ptr+ref_stride+2]
-
-        psadbw          mm1,        mm0
-
-        punpcklbw       mm4,        mm2
-        punpcklbw       mm5,        mm3
-
-        psadbw          mm4,        mm0
-        psadbw          mm5,        mm0
-
-        lea             src_ptr,    [src_ptr+src_stride*2]
-        lea             ref_ptr,    [ref_ptr+ref_stride*2]
-
-        movd            mm0,        DWORD PTR [src_ptr]
-        movd            mm2,        DWORD PTR [ref_ptr]
-
-        movd            mm3,        DWORD PTR [src_ptr+src_stride]
-        movd            mm6,        DWORD PTR [ref_ptr+ref_stride]
-
-        punpcklbw       mm0,        mm3
-        punpcklbw       mm2,        mm6
-
-        movd            mm3,        DWORD PTR [ref_ptr+1]
-        movd            mm7,        DWORD PTR [ref_ptr+2]
-
-        psadbw          mm2,        mm0
-
-        paddw           mm1,        mm2
-
-        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
-        movd            mm6,        DWORD PTR [ref_ptr+ref_stride+2]
-
-        punpcklbw       mm3,        mm2
-        punpcklbw       mm7,        mm6
-
-        psadbw          mm3,        mm0
-        psadbw          mm7,        mm0
-
-        paddw           mm3,        mm4
-        paddw           mm7,        mm5
-
-        mov             rcx,        result_ptr
-
-        punpckldq       mm1,        mm3
-
-        movq            [rcx],      mm1
-        movd            [rcx+8],    mm7
-
-    STACK_FRAME_DESTROY_X3
-
-;unsigned int vp9_sad16x16_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  max_err)
-;%define lddqu movdqu
-global sym(vp9_sad16x16_sse3)
-sym(vp9_sad16x16_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        mov             end_ptr,    4
-        pxor            xmm7,        xmm7
-
-.vp9_sad16x16_sse3_loop:
-        movdqa          xmm0,       XMMWORD PTR [src_ptr]
-        movdqu          xmm1,       XMMWORD PTR [ref_ptr]
-        movdqa          xmm2,       XMMWORD PTR [src_ptr+src_stride]
-        movdqu          xmm3,       XMMWORD PTR [ref_ptr+ref_stride]
-
-        lea             src_ptr,    [src_ptr+src_stride*2]
-        lea             ref_ptr,    [ref_ptr+ref_stride*2]
-
-        movdqa          xmm4,       XMMWORD PTR [src_ptr]
-        movdqu          xmm5,       XMMWORD PTR [ref_ptr]
-        movdqa          xmm6,       XMMWORD PTR [src_ptr+src_stride]
-
-        psadbw          xmm0,       xmm1
-
-        movdqu          xmm1,       XMMWORD PTR [ref_ptr+ref_stride]
-
-        psadbw          xmm2,       xmm3
-        psadbw          xmm4,       xmm5
-        psadbw          xmm6,       xmm1
-
-        lea             src_ptr,    [src_ptr+src_stride*2]
-        lea             ref_ptr,    [ref_ptr+ref_stride*2]
-
-        paddw           xmm7,        xmm0
-        paddw           xmm7,        xmm2
-        paddw           xmm7,        xmm4
-        paddw           xmm7,        xmm6
-
-        sub             end_ptr,     1
-        jne             .vp9_sad16x16_sse3_loop
-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-        paddw           xmm0,       xmm7
-        movq            rax,        xmm0
-
-    STACK_FRAME_DESTROY_X3
-
-;void vp9_copy32xn_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *dst_ptr,
-;    int  dst_stride,
-;    int height);
-global sym(vp9_copy32xn_sse3)
-sym(vp9_copy32xn_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-.block_copy_sse3_loopx4:
-        lea             end_ptr,    [src_ptr+src_stride*2]
-
-        movdqu          xmm0,       XMMWORD PTR [src_ptr]
-        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
-        movdqu          xmm2,       XMMWORD PTR [src_ptr + src_stride]
-        movdqu          xmm3,       XMMWORD PTR [src_ptr + src_stride + 16]
-        movdqu          xmm4,       XMMWORD PTR [end_ptr]
-        movdqu          xmm5,       XMMWORD PTR [end_ptr + 16]
-        movdqu          xmm6,       XMMWORD PTR [end_ptr + src_stride]
-        movdqu          xmm7,       XMMWORD PTR [end_ptr + src_stride + 16]
-
-        lea             src_ptr,    [src_ptr+src_stride*4]
-
-        lea             end_ptr,    [ref_ptr+ref_stride*2]
-
-        movdqa          XMMWORD PTR [ref_ptr], xmm0
-        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
-        movdqa          XMMWORD PTR [ref_ptr + ref_stride], xmm2
-        movdqa          XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
-        movdqa          XMMWORD PTR [end_ptr], xmm4
-        movdqa          XMMWORD PTR [end_ptr + 16], xmm5
-        movdqa          XMMWORD PTR [end_ptr + ref_stride], xmm6
-        movdqa          XMMWORD PTR [end_ptr + ref_stride + 16], xmm7
-
-        lea             ref_ptr,    [ref_ptr+ref_stride*4]
-
-        sub             height,     4
-        cmp             height,     4
-        jge             .block_copy_sse3_loopx4
-
-        ;Check to see if there are more rows that need to be copied.
-        cmp             height, 0
-        je              .copy_is_done
-
-.block_copy_sse3_loop:
-        movdqu          xmm0,       XMMWORD PTR [src_ptr]
-        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
-        lea             src_ptr,    [src_ptr+src_stride]
-
-        movdqa          XMMWORD PTR [ref_ptr], xmm0
-        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
-        lea             ref_ptr,    [ref_ptr+ref_stride]
-
-        sub             height,     1
-        jne             .block_copy_sse3_loop
-
-.copy_is_done:
-    STACK_FRAME_DESTROY_X3
-
-;void vp9_sad16x16x4d_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr_base,
-;    int  ref_stride,
-;    int  *results)
-global sym(vp9_sad16x16x4d_sse3)
-sym(vp9_sad16x16x4d_sse3):
-
-    STACK_FRAME_CREATE_X4
-
-        PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-
-%if ABI_IS_32BIT
-        pop             rbp
-%endif
-        mov             rcx,        result_ptr
-
-        movq            xmm0,       xmm4
-        psrldq          xmm4,       8
-
-        paddw           xmm0,       xmm4
-        movd            [rcx],      xmm0
-;-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rcx+4],    xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rcx+8],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rcx+12],   xmm0
-
-    STACK_FRAME_DESTROY_X4
-
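The x4d variants differ from x3: the third argument is a base pointer to an array of four independent reference pointers (see LOAD_X4_ADDRESSES), and one SAD is returned per pointer; a reference loop under that reading (hypothetical helper):

#include <stdlib.h>

static void sad16xhx4d(const unsigned char *src, int src_stride,
                       const unsigned char *const ref[4], int ref_stride,
                       int height, unsigned int results[4]) {
    for (int i = 0; i < 4; ++i) {
        unsigned int s = 0;
        for (int r = 0; r < height; ++r)
            for (int c = 0; c < 16; ++c)
                s += (unsigned int)abs(src[r * src_stride + c] -
                                       ref[i][r * ref_stride + c]);
        results[i] = s;
    }
}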
-;void vp9_sad16x8x4d_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr_base,
-;    int  ref_stride,
-;    int  *results)
-global sym(vp9_sad16x8x4d_sse3)
-sym(vp9_sad16x8x4d_sse3):
-
-    STACK_FRAME_CREATE_X4
-
-        PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-
-%if ABI_IS_32BIT
-        pop             rbp
-%endif
-        mov             rcx,        result_ptr
-
-        movq            xmm0,       xmm4
-        psrldq          xmm4,       8
-
-        paddw           xmm0,       xmm4
-        movd            [rcx],      xmm0
-;-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rcx+4],    xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rcx+8],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rcx+12],   xmm0
-
-    STACK_FRAME_DESTROY_X4
-
-;void vp9_sad8x16x4d_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(vp9_sad8x16x4d_sse3)
-sym(vp9_sad8x16x4d_sse3):
-
-    STACK_FRAME_CREATE_X4
-
-        PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-
-%if ABI_IS_32BIT
-        pop             rbp
-%endif
-        mov             rcx,        result_ptr
-
-        punpckldq       mm4,        mm5
-        punpckldq       mm6,        mm7
-
-        movq            [rcx],      mm4
-        movq            [rcx+8],    mm6
-
-    STACK_FRAME_DESTROY_X4
-
-;void vp9_sad8x8x4d_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(vp9_sad8x8x4d_sse3)
-sym(vp9_sad8x8x4d_sse3):
-
-    STACK_FRAME_CREATE_X4
-
-        PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-
-%if ABI_IS_32BIT
-        pop             rbp
-%endif
-        mov             rcx,        result_ptr
-
-        punpckldq       mm4,        mm5
-        punpckldq       mm6,        mm7
-
-        movq            [rcx],      mm4
-        movq            [rcx+8],    mm6
-
-    STACK_FRAME_DESTROY_X4
-
-;void vp9_sad4x4x4d_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(vp9_sad4x4x4d_sse3)
-sym(vp9_sad4x4x4d_sse3):
-
-    STACK_FRAME_CREATE_X4
-
-        movd            mm0,        DWORD PTR [src_ptr]
-        movd            mm1,        DWORD PTR [r0_ptr]
-
-        movd            mm2,        DWORD PTR [src_ptr+src_stride]
-        movd            mm3,        DWORD PTR [r0_ptr+ref_stride]
-
-        punpcklbw       mm0,        mm2
-        punpcklbw       mm1,        mm3
-
-        movd            mm4,        DWORD PTR [r1_ptr]
-        movd            mm5,        DWORD PTR [r2_ptr]
-
-        movd            mm6,        DWORD PTR [r3_ptr]
-        movd            mm2,        DWORD PTR [r1_ptr+ref_stride]
-
-        movd            mm3,        DWORD PTR [r2_ptr+ref_stride]
-        movd            mm7,        DWORD PTR [r3_ptr+ref_stride]
-
-        psadbw          mm1,        mm0
-
-        punpcklbw       mm4,        mm2
-        punpcklbw       mm5,        mm3
-
-        punpcklbw       mm6,        mm7
-        psadbw          mm4,        mm0
-
-        psadbw          mm5,        mm0
-        psadbw          mm6,        mm0
-
-
-
-        lea             src_ptr,    [src_ptr+src_stride*2]
-        lea             r0_ptr,     [r0_ptr+ref_stride*2]
-
-        lea             r1_ptr,     [r1_ptr+ref_stride*2]
-        lea             r2_ptr,     [r2_ptr+ref_stride*2]
-
-        lea             r3_ptr,     [r3_ptr+ref_stride*2]
-
-        movd            mm0,        DWORD PTR [src_ptr]
-        movd            mm2,        DWORD PTR [r0_ptr]
-
-        movd            mm3,        DWORD PTR [src_ptr+src_stride]
-        movd            mm7,        DWORD PTR [r0_ptr+ref_stride]
-
-        punpcklbw       mm0,        mm3
-        punpcklbw       mm2,        mm7
-
-        movd            mm3,        DWORD PTR [r1_ptr]
-        movd            mm7,        DWORD PTR [r2_ptr]
-
-        psadbw          mm2,        mm0
-%if ABI_IS_32BIT
-        mov             rax,        rbp
-
-        pop             rbp
-%define     ref_stride    rax
-%endif
-        mov             rsi,        result_ptr
-
-        paddw           mm1,        mm2
-        movd            [rsi],      mm1
-
-        movd            mm2,        DWORD PTR [r1_ptr+ref_stride]
-        movd            mm1,        DWORD PTR [r2_ptr+ref_stride]
-
-        punpcklbw       mm3,        mm2
-        punpcklbw       mm7,        mm1
-
-        psadbw          mm3,        mm0
-        psadbw          mm7,        mm0
-
-        movd            mm2,        DWORD PTR [r3_ptr]
-        movd            mm1,        DWORD PTR [r3_ptr+ref_stride]
-
-        paddw           mm3,        mm4
-        paddw           mm7,        mm5
-
-        movd            [rsi+4],    mm3
-        punpcklbw       mm2,        mm1
-
-        movd            [rsi+8],    mm7
-        psadbw          mm2,        mm0
-
-        paddw           mm2,        mm6
-        movd            [rsi+12],   mm2
-
-
-    STACK_FRAME_DESTROY_X4
-
--- a/vp8/encoder/x86/sad_sse4.asm
+++ /dev/null
@@ -1,353 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro PROCESS_16X2X8 1
-%if %1
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        movq            xmm1,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        movq            xmm2,       MMWORD PTR [rdi+16]
-        punpcklqdq      xmm1,       xmm3
-        punpcklqdq      xmm3,       xmm2
-
-        movdqa          xmm2,       xmm1
-        mpsadbw         xmm1,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-
-        psrldq          xmm0,       8
-
-        movdqa          xmm4,       xmm3
-        mpsadbw         xmm3,       xmm0,  0x0
-        mpsadbw         xmm4,       xmm0,  0x5
-
-        paddw           xmm1,       xmm2
-        paddw           xmm1,       xmm3
-        paddw           xmm1,       xmm4
-%else
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        movq            xmm5,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        movq            xmm2,       MMWORD PTR [rdi+16]
-        punpcklqdq      xmm5,       xmm3
-        punpcklqdq      xmm3,       xmm2
-
-        movdqa          xmm2,       xmm5
-        mpsadbw         xmm5,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-
-        psrldq          xmm0,       8
-
-        movdqa          xmm4,       xmm3
-        mpsadbw         xmm3,       xmm0,  0x0
-        mpsadbw         xmm4,       xmm0,  0x5
-
-        paddw           xmm5,       xmm2
-        paddw           xmm5,       xmm3
-        paddw           xmm5,       xmm4
-
-        paddw           xmm1,       xmm5
-%endif
-        movdqa          xmm0,       XMMWORD PTR [rsi + rax]
-        movq            xmm5,       MMWORD PTR [rdi+ rdx]
-        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
-        movq            xmm2,       MMWORD PTR [rdi+ rdx+16]
-        punpcklqdq      xmm5,       xmm3
-        punpcklqdq      xmm3,       xmm2
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        movdqa          xmm2,       xmm5
-        mpsadbw         xmm5,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-
-        psrldq          xmm0,       8
-        movdqa          xmm4,       xmm3
-        mpsadbw         xmm3,       xmm0,  0x0
-        mpsadbw         xmm4,       xmm0,  0x5
-
-        paddw           xmm5,       xmm2
-        paddw           xmm5,       xmm3
-        paddw           xmm5,       xmm4
-
-        paddw           xmm1,       xmm5
-%endmacro
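PROCESS_16X2X8 leans on SSE4.1 mpsadbw, which scores a fixed 4-byte block against eight overlapping 4-byte windows in a single instruction; the macro issues it with immediates 0x0 and 0x5 so the paired results cover a full 16-byte row at eight offsets. A byte-level model of one mpsadbw (illustration only):

#include <stdlib.h>

static void mpsadbw_model(const unsigned char win[11], const unsigned char blk[4],
                          unsigned short out[8]) {
    for (int i = 0; i < 8; ++i) {     /* eight sliding windows */
        unsigned short s = 0;
        for (int j = 0; j < 4; ++j)
            s += (unsigned short)abs(win[i + j] - blk[j]);
        out[i] = s;
    }
}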
-
-%macro PROCESS_8X2X8 1
-%if %1
-        movq            xmm0,       MMWORD PTR [rsi]
-        movq            xmm1,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        punpcklqdq      xmm1,       xmm3
-
-        movdqa          xmm2,       xmm1
-        mpsadbw         xmm1,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-        paddw           xmm1,       xmm2
-%else
-        movq            xmm0,       MMWORD PTR [rsi]
-        movq            xmm5,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        punpcklqdq      xmm5,       xmm3
-
-        movdqa          xmm2,       xmm5
-        mpsadbw         xmm5,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-        paddw           xmm5,       xmm2
-
-        paddw           xmm1,       xmm5
-%endif
-        movq            xmm0,       MMWORD PTR [rsi + rax]
-        movq            xmm5,       MMWORD PTR [rdi+ rdx]
-        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
-        punpcklqdq      xmm5,       xmm3
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        movdqa          xmm2,       xmm5
-        mpsadbw         xmm5,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-        paddw           xmm5,       xmm2
-
-        paddw           xmm1,       xmm5
-%endmacro
-
-%macro PROCESS_4X2X8 1
-%if %1
-        movd            xmm0,       [rsi]
-        movq            xmm1,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        punpcklqdq      xmm1,       xmm3
-
-        mpsadbw         xmm1,       xmm0,  0x0
-%else
-        movd            xmm0,       [rsi]
-        movq            xmm5,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        punpcklqdq      xmm5,       xmm3
-
-        mpsadbw         xmm5,       xmm0,  0x0
-
-        paddw           xmm1,       xmm5
-%endif
-        movd            xmm0,       [rsi + rax]
-        movq            xmm5,       MMWORD PTR [rdi+ rdx]
-        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
-        punpcklqdq      xmm5,       xmm3
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        mpsadbw         xmm5,       xmm0,  0x0
-
-        paddw           xmm1,       xmm5
-%endmacro
-
-
-;void vp9_sad16x16x8_sse4(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array);
-global sym(vp9_sad16x16x8_sse4)
-sym(vp9_sad16x16x8_sse4):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0)           ;src_ptr
-        mov             rdi,        arg(2)           ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        PROCESS_16X2X8 1
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-
-        mov             rdi,        arg(4)           ;Results
-        movdqa          XMMWORD PTR [rdi],    xmm1
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
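At the API level the x8 functions return eight SADs, one per consecutive horizontal offset, packed as 16-bit values in sad_array; a reference loop under that reading (hypothetical helper):

#include <stdlib.h>

static void sad16xhx8(const unsigned char *src, int src_stride,
                      const unsigned char *ref, int ref_stride,
                      int height, unsigned short sad_array[8]) {
    for (int off = 0; off < 8; ++off) {
        unsigned int s = 0;
        for (int r = 0; r < height; ++r)
            for (int c = 0; c < 16; ++c)
                s += (unsigned int)abs(src[r * src_stride + c] -
                                       ref[r * ref_stride + c + off]);
        sad_array[off] = (unsigned short)s;
    }
}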
-
-;void vp9_sad16x8x8_sse4(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array
-;);
-global sym(vp9_sad16x8x8_sse4)
-sym(vp9_sad16x8x8_sse4):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0)           ;src_ptr
-        mov             rdi,        arg(2)           ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        PROCESS_16X2X8 1
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-
-        mov             rdi,        arg(4)           ;Results
-        movdqa          XMMWORD PTR [rdi],    xmm1
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_sad8x8x8_sse4(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array
-;);
-global sym(vp9_sad8x8x8_sse4)
-sym(vp9_sad8x8x8_sse4):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0)           ;src_ptr
-        mov             rdi,        arg(2)           ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        PROCESS_8X2X8 1
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-
-        mov             rdi,        arg(4)           ;Results
-        movdqa          XMMWORD PTR [rdi],    xmm1
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_sad8x16x8_sse4(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array
-;);
-global sym(vp9_sad8x16x8_sse4)
-sym(vp9_sad8x16x8_sse4):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0)           ;src_ptr
-        mov             rdi,        arg(2)           ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        PROCESS_8X2X8 1
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        mov             rdi,        arg(4)           ;Results
-        movdqa          XMMWORD PTR [rdi],    xmm1
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_sad4x4x8_sse4(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array
-;);
-global sym(vp9_sad4x4x8_sse4)
-sym(vp9_sad4x4x8_sse4):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0)           ;src_ptr
-        mov             rdi,        arg(2)           ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        PROCESS_4X2X8 1
-        PROCESS_4X2X8 0
-
-        mov             rdi,        arg(4)           ;Results
-        movdqa          XMMWORD PTR [rdi],    xmm1
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
-
--- a/vp8/encoder/x86/sad_ssse3.asm
+++ /dev/null
@@ -1,370 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro PROCESS_16X2X3 1
-%if %1
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        lddqu           xmm5,       XMMWORD PTR [rdi]
-        lddqu           xmm6,       XMMWORD PTR [rdi+1]
-        lddqu           xmm7,       XMMWORD PTR [rdi+2]
-
-        psadbw          xmm5,       xmm0
-        psadbw          xmm6,       xmm0
-        psadbw          xmm7,       xmm0
-%else
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        lddqu           xmm1,       XMMWORD PTR [rdi]
-        lddqu           xmm2,       XMMWORD PTR [rdi+1]
-        lddqu           xmm3,       XMMWORD PTR [rdi+2]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endif
-        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
-        lddqu           xmm1,       XMMWORD PTR [rdi+rdx]
-        lddqu           xmm2,       XMMWORD PTR [rdi+rdx+1]
-        lddqu           xmm3,       XMMWORD PTR [rdi+rdx+2]
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endmacro
-
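-; PROCESS_16X2X3_OFFSET: same as PROCESS_16X2X3, but for a reference pointer
-; rounded down to 16-byte alignment. Two aligned loads plus palignr by %2,
-; %2+1 and %2+2 reconstruct the three unaligned reference candidates.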
-%macro PROCESS_16X2X3_OFFSET 2
-%if %1
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        movdqa          xmm4,       XMMWORD PTR [rdi]
-        movdqa          xmm7,       XMMWORD PTR [rdi+16]
-
-        movdqa          xmm5,       xmm7
-        palignr         xmm5,       xmm4,       %2
-
-        movdqa          xmm6,       xmm7
-        palignr         xmm6,       xmm4,       (%2+1)
-
-        palignr         xmm7,       xmm4,       (%2+2)
-
-        psadbw          xmm5,       xmm0
-        psadbw          xmm6,       xmm0
-        psadbw          xmm7,       xmm0
-%else
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        movdqa          xmm4,       XMMWORD PTR [rdi]
-        movdqa          xmm3,       XMMWORD PTR [rdi+16]
-
-        movdqa          xmm1,       xmm3
-        palignr         xmm1,       xmm4,       %2
-
-        movdqa          xmm2,       xmm3
-        palignr         xmm2,       xmm4,       (%2+1)
-
-        palignr         xmm3,       xmm4,       (%2+2)
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endif
-        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
-        movdqa          xmm4,       XMMWORD PTR [rdi+rdx]
-        movdqa          xmm3,       XMMWORD PTR [rdi+rdx+16]
-
-        movdqa          xmm1,       xmm3
-        palignr         xmm1,       xmm4,       %2
-
-        movdqa          xmm2,       xmm3
-        palignr         xmm2,       xmm4,       (%2+1)
-
-        palignr         xmm3,       xmm4,       (%2+2)
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endmacro
-
-%macro PROCESS_16X16X3_OFFSET 2
-%2_aligned_by_%1:
-
-        sub             rdi,        %1
-
-        PROCESS_16X2X3_OFFSET 1, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-
-        jmp             %2_store_off
-
-%endmacro
-
-%macro PROCESS_16X8X3_OFFSET 2
-%2_aligned_by_%1:
-
-        sub             rdi,        %1
-
-        PROCESS_16X2X3_OFFSET 1, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-
-        jmp             %2_store_off
-
-%endmacro
-
-;void vp9_sad16x16x3_ssse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results);
-global sym(vp9_sad16x16x3_ssse3)
-sym(vp9_sad16x16x3_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rcx
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        mov             rdx,        0xf
-        and             rdx,        rdi
-
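-        ; rdx = ref_ptr & 15. The position-independent jump table below
-        ; dispatches to a code path specialized for that alignment: offsets
-        ; 0-14 use aligned loads plus palignr (PROCESS_16X2X3_OFFSET), while
-        ; alignment 15 falls back to the lddqu variant (PROCESS_16X2X3).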
-        jmp .vp9_sad16x16x3_ssse3_skiptable
-.vp9_sad16x16x3_ssse3_jumptable:
-        dd .vp9_sad16x16x3_ssse3_aligned_by_0  - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_1  - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_2  - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_3  - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_4  - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_5  - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_6  - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_7  - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_8  - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_9  - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_10 - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_11 - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_12 - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_13 - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_14 - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_15 - .vp9_sad16x16x3_ssse3_do_jump
-.vp9_sad16x16x3_ssse3_skiptable:
-
-        call .vp9_sad16x16x3_ssse3_do_jump
-.vp9_sad16x16x3_ssse3_do_jump:
-        pop             rcx                         ; get the address of do_jump
-        mov             rax,  .vp9_sad16x16x3_ssse3_jumptable - .vp9_sad16x16x3_ssse3_do_jump
-        add             rax,  rcx  ; get the absolute address of vp9_sad16x16x3_ssse3_jumptable
-
-        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
-        add             rcx,        rax
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        jmp             rcx
-
-        PROCESS_16X16X3_OFFSET 0,  .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 1,  .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 2,  .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 3,  .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 4,  .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 5,  .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 6,  .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 7,  .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 8,  .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 9,  .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 10, .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 11, .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 12, .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 13, .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 14, .vp9_sad16x16x3_ssse3
-
-.vp9_sad16x16x3_ssse3_aligned_by_15:
-        PROCESS_16X2X3 1
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-
-.vp9_sad16x16x3_ssse3_store_off:
-        mov             rdi,        arg(4) ;Results
-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rdi],      xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rdi+4],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rdi+8],    xmm0
-
-    ; begin epilog
-    pop         rcx
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_sad16x8x3_ssse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results);
-global sym(vp9_sad16x8x3_ssse3)
-sym(vp9_sad16x8x3_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rcx
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        mov             rdx,        0xf
-        and             rdx,        rdi
-
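-        ; same alignment-based dispatch as vp9_sad16x16x3_ssse3 above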
-        jmp .vp9_sad16x8x3_ssse3_skiptable
-.vp9_sad16x8x3_ssse3_jumptable:
-        dd .vp9_sad16x8x3_ssse3_aligned_by_0  - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_1  - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_2  - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_3  - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_4  - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_5  - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_6  - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_7  - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_8  - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_9  - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_10 - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_11 - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_12 - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_13 - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_14 - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_15 - .vp9_sad16x8x3_ssse3_do_jump
-.vp9_sad16x8x3_ssse3_skiptable:
-
-        call .vp9_sad16x8x3_ssse3_do_jump
-.vp9_sad16x8x3_ssse3_do_jump:
-        pop             rcx                         ; get the address of do_jump
-        mov             rax,  .vp9_sad16x8x3_ssse3_jumptable - .vp9_sad16x8x3_ssse3_do_jump
-        add             rax,  rcx  ; get the absolute address of vp9_sad16x8x3_ssse3_jumptable
-
-        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
-        add             rcx,        rax
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        jmp             rcx
-
-        PROCESS_16X8X3_OFFSET 0,  .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 1,  .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 2,  .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 3,  .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 4,  .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 5,  .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 6,  .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 7,  .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 8,  .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 9,  .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 10, .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 11, .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 12, .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 13, .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 14, .vp9_sad16x8x3_ssse3
-
-.vp9_sad16x8x3_ssse3_aligned_by_15:
-
-        PROCESS_16X2X3 1
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-
-.vp9_sad16x8x3_ssse3_store_off:
-        mov             rdi,        arg(4) ;Results
-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rdi],      xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rdi+4],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rdi+8],    xmm0
-
-    ; begin epilog
-    pop         rcx
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
--- a/vp8/encoder/x86/ssim_opt.asm
+++ /dev/null
@@ -1,216 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "vpx_ports/x86_abi_support.asm"
-
-; TABULATE_SSIM - sums sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr
-%macro TABULATE_SSIM 0
-        paddusw         xmm15, xmm3  ; sum_s
-        paddusw         xmm14, xmm4  ; sum_r
-        movdqa          xmm1, xmm3
-        pmaddwd         xmm1, xmm1
-        paddd           xmm13, xmm1 ; sum_sq_s
-        movdqa          xmm2, xmm4
-        pmaddwd         xmm2, xmm2
-        paddd           xmm12, xmm2 ; sum_sq_r
-        pmaddwd         xmm3, xmm4
-        paddd           xmm11, xmm3  ; sum_sxr
-%endmacro
-
-; Sum across the register %1, reducing four double words to one q word (xmm0 must be zero)
-%macro SUM_ACROSS_Q 1
-        movdqa          xmm2,%1
-        punpckldq       %1,xmm0
-        punpckhdq       xmm2,xmm0
-        paddq           %1,xmm2
-        movdqa          xmm2,%1
-        punpcklqdq      %1,xmm0
-        punpckhqdq      xmm2,xmm0
-        paddq           %1,xmm2
-%endmacro
-
-; Sum across the register %1 starting with words (widen to double words, then SUM_ACROSS_Q)
-%macro SUM_ACROSS_W 1
-        movdqa          xmm1, %1
-        punpcklwd       %1,xmm0
-        punpckhwd       xmm1,xmm0
-        paddd           %1, xmm1
-        SUM_ACROSS_Q    %1
-%endmacro
-;void vp9_ssim_parms_16x16_sse2(
-;    unsigned char *s,
-;    int sp,
-;    unsigned char *r,
-;    int rp,
-;    unsigned long *sum_s,
-;    unsigned long *sum_r,
-;    unsigned long *sum_sq_s,
-;    unsigned long *sum_sq_r,
-;    unsigned long *sum_sxr);
-;
-; TODO: Use parm passing through structure; probably don't need the pxors
-; (calling app will initialize to 0). Could easily fit everything in sse2
-; without too much hassle, and can probably do better estimates with psadbw
-; or pavgb. At this point this is just meant to be a first pass for calculating
-; all the parms needed for 16x16 ssim so we can play with dssim as distortion
-; in mode selection code.
-global sym(vp9_ssim_parms_16x16_sse2)
-sym(vp9_ssim_parms_16x16_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 9
-    SAVE_XMM 15
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0) ;s
-    mov             rcx,        arg(1) ;sp
-    mov             rdi,        arg(2) ;r
-    mov             rax,        arg(3) ;rp
-
-    pxor            xmm0, xmm0
-    pxor            xmm15,xmm15  ;sum_s
-    pxor            xmm14,xmm14  ;sum_r
-    pxor            xmm13,xmm13  ;sum_sq_s
-    pxor            xmm12,xmm12  ;sum_sq_r
-    pxor            xmm11,xmm11  ;sum_sxr
-
-    mov             rdx, 16      ;row counter
-.NextRow:
-
-    ;grab source and reference pixels
-    movdqu          xmm5, [rsi]
-    movdqu          xmm6, [rdi]
-    movdqa          xmm3, xmm5
-    movdqa          xmm4, xmm6
-    punpckhbw       xmm3, xmm0 ; high_s
-    punpckhbw       xmm4, xmm0 ; high_r
-
-    TABULATE_SSIM
-
-    movdqa          xmm3, xmm5
-    movdqa          xmm4, xmm6
-    punpcklbw       xmm3, xmm0 ; low_s
-    punpcklbw       xmm4, xmm0 ; low_r
-
-    TABULATE_SSIM
-
-    add             rsi, rcx   ; next s row
-    add             rdi, rax   ; next r row
-
-    dec             rdx        ; counter
-    jnz .NextRow
-
-    SUM_ACROSS_W    xmm15
-    SUM_ACROSS_W    xmm14
-    SUM_ACROSS_Q    xmm13
-    SUM_ACROSS_Q    xmm12
-    SUM_ACROSS_Q    xmm11
-
-    mov             rdi,arg(4)
-    movd            [rdi], xmm15;
-    mov             rdi,arg(5)
-    movd            [rdi], xmm14;
-    mov             rdi,arg(6)
-    movd            [rdi], xmm13;
-    mov             rdi,arg(7)
-    movd            [rdi], xmm12;
-    mov             rdi,arg(8)
-    movd            [rdi], xmm11;
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_ssim_parms_8x8_sse2(
-;    unsigned char *s,
-;    int sp,
-;    unsigned char *r,
-;    int rp,
-;    unsigned long *sum_s,
-;    unsigned long *sum_r,
-;    unsigned long *sum_sq_s,
-;    unsigned long *sum_sq_r,
-;    unsigned long *sum_sxr);
-;
-; TODO: Use parm passing through structure; probably don't need the pxors
-; (calling app will initialize to 0). Could easily fit everything in sse2
-; without too much hassle, and can probably do better estimates with psadbw
-; or pavgb. At this point this is just meant to be a first pass for calculating
-; all the parms needed for 16x16 ssim so we can play with dssim as distortion
-; in mode selection code.
-global sym(vp9_ssim_parms_8x8_sse2)
-sym(vp9_ssim_parms_8x8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 9
-    SAVE_XMM 15
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0) ;s
-    mov             rcx,        arg(1) ;sp
-    mov             rdi,        arg(2) ;r
-    mov             rax,        arg(3) ;rp
-
-    pxor            xmm0, xmm0
-    pxor            xmm15,xmm15  ;sum_s
-    pxor            xmm14,xmm14  ;sum_r
-    pxor            xmm13,xmm13  ;sum_sq_s
-    pxor            xmm12,xmm12  ;sum_sq_r
-    pxor            xmm11,xmm11  ;sum_sxr
-
-    mov             rdx, 8      ;row counter
-.NextRow:
-
-    ;grab source and reference pixels
-    movq            xmm3, [rsi]
-    movq            xmm4, [rdi]
-    punpcklbw       xmm3, xmm0 ; low_s
-    punpcklbw       xmm4, xmm0 ; low_r
-
-    TABULATE_SSIM
-
-    add             rsi, rcx   ; next s row
-    add             rdi, rax   ; next r row
-
-    dec             rdx        ; counter
-    jnz .NextRow
-
-    SUM_ACROSS_W    xmm15
-    SUM_ACROSS_W    xmm14
-    SUM_ACROSS_Q    xmm13
-    SUM_ACROSS_Q    xmm12
-    SUM_ACROSS_Q    xmm11
-
-    mov             rdi,arg(4)
-    movd            [rdi], xmm15;
-    mov             rdi,arg(5)
-    movd            [rdi], xmm14;
-    mov             rdi,arg(6)
-    movd            [rdi], xmm13;
-    mov             rdi,arg(7)
-    movd            [rdi], xmm12;
-    mov             rdi,arg(8)
-    movd            [rdi], xmm11;
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
--- a/vp8/encoder/x86/subtract_mmx.asm
+++ /dev/null
@@ -1,432 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_subtract_b_mmx_impl(unsigned char *z,  int src_stride,
-;                            short *diff, unsigned char *Predictor,
-;                            int pitch);
-global sym(vp9_subtract_b_mmx_impl)
-sym(vp9_subtract_b_mmx_impl):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push rsi
-    push rdi
-    ; end prolog
-
-
-        mov     rdi,        arg(2) ;diff
-        mov     rax,        arg(3) ;Predictor
-        mov     rsi,        arg(0) ;z
-        movsxd  rdx,        dword ptr arg(1);src_stride;
-        movsxd  rcx,        dword ptr arg(4);pitch
-        pxor    mm7,        mm7
-
-        movd    mm0,        [rsi]
-        movd    mm1,        [rax]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    [rdi],      mm0
-
-
-        movd    mm0,        [rsi+rdx]
-        movd    mm1,        [rax+rcx]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    [rdi+rcx*2],mm0
-
-
-        movd    mm0,        [rsi+rdx*2]
-        movd    mm1,        [rax+rcx*2]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    [rdi+rcx*4],        mm0
-
-        lea     rsi,        [rsi+rdx*2]
-        lea     rcx,        [rcx+rcx*2]
-
-
-
-        movd    mm0,        [rsi+rdx]
-        movd    mm1,        [rax+rcx]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    [rdi+rcx*2],        mm0
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_subtract_mby_mmx(short *diff, unsigned char *src, unsigned char *pred, int stride)
-global sym(vp9_subtract_mby_mmx)
-sym(vp9_subtract_mby_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    ; end prolog
-
-
-            mov         rsi,            arg(1) ;src
-            mov         rdi,            arg(0) ;diff
-
-            mov         rax,            arg(2) ;pred
-            movsxd      rdx,            dword ptr arg(3) ;stride
-
-            mov         rcx,            16
-            pxor        mm0,            mm0
-
-.submby_loop:
-
-            movq        mm1,            [rsi]
-            movq        mm3,            [rax]
-
-            movq        mm2,            mm1
-            movq        mm4,            mm3
-
-            punpcklbw   mm1,            mm0
-            punpcklbw   mm3,            mm0
-
-            punpckhbw   mm2,            mm0
-            punpckhbw   mm4,            mm0
-
-            psubw       mm1,            mm3
-            psubw       mm2,            mm4
-
-            movq        [rdi],          mm1
-            movq        [rdi+8],        mm2
-
-
-            movq        mm1,            [rsi+8]
-            movq        mm3,            [rax+8]
-
-            movq        mm2,            mm1
-            movq        mm4,            mm3
-
-            punpcklbw   mm1,            mm0
-            punpcklbw   mm3,            mm0
-
-            punpckhbw   mm2,            mm0
-            punpckhbw   mm4,            mm0
-
-            psubw       mm1,            mm3
-            psubw       mm2,            mm4
-
-            movq        [rdi+16],       mm1
-            movq        [rdi+24],       mm2
-
-
-            add         rdi,            32
-            add         rax,            16
-
-            lea         rsi,            [rsi+rdx]
-
-            sub         rcx,            1
-            jnz         .submby_loop
-
-    pop rdi
-    pop rsi
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
-global sym(vp9_subtract_mbuv_mmx)
-sym(vp9_subtract_mbuv_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push rsi
-    push rdi
-    ; end prolog
-
-    ;short *udiff = diff + 256;
-    ;short *vdiff = diff + 320;
-    ;unsigned char *upred = pred + 256;
-    ;unsigned char *vpred = pred + 320;
-
-        ;unsigned char  *z    = usrc;
-        ;unsigned short *diff = udiff;
-        ;unsigned char  *Predictor= upred;
-
-            mov     rdi,        arg(0) ;diff
-            mov     rax,        arg(3) ;pred
-            mov     rsi,        arg(1) ;z = usrc
-            add     rdi,        256*2  ;diff = diff + 256 (shorts)
-            add     rax,        256    ;Predictor = pred + 256
-            movsxd  rdx,        dword ptr arg(4) ;stride;
-            pxor    mm7,        mm7
-
-            movq    mm0,        [rsi]
-            movq    mm1,        [rax]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi],      mm0
-            movq    [rdi+8],    mm3
-
-
-            movq    mm0,        [rsi+rdx]
-            movq    mm1,        [rax+8]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi+16],   mm0
-            movq    [rdi+24],   mm3
-
-            movq    mm0,        [rsi+rdx*2]
-            movq    mm1,        [rax+16]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi+32],   mm0
-            movq    [rdi+40],   mm3
-            lea     rsi,        [rsi+rdx*2]
-
-
-            movq    mm0,        [rsi+rdx]
-            movq    mm1,        [rax+24]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-
-            movq    [rdi+48],   mm0
-            movq    [rdi+56],   mm3
-
-
-            add     rdi,        64
-            add     rax,        32
-            lea     rsi,        [rsi+rdx*2]
-
-
-            movq    mm0,        [rsi]
-            movq    mm1,        [rax]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi],      mm0
-            movq    [rdi+8],    mm3
-
-
-            movq    mm0,        [rsi+rdx]
-            movq    mm1,        [rax+8]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi+16],   mm0
-            movq    [rdi+24],   mm3
-
-            movq    mm0,        [rsi+rdx*2]
-            movq    mm1,        [rax+16]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi+32],   mm0
-            movq    [rdi+40],   mm3
-            lea     rsi,        [rsi+rdx*2]
-
-
-            movq    mm0,        [rsi+rdx]
-            movq    mm1,        [rax+24]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-
-            movq    [rdi+48],   mm0
-            movq    [rdi+56],   mm3
-
-        ;unsigned char  *z    = vsrc;
-        ;unsigned short *diff = vdiff;
-        ;unsigned char  *Predictor= vpred;
-
-            mov     rdi,        arg(0) ;diff
-            mov     rax,        arg(3) ;pred
-            mov     rsi,        arg(2) ;z = vsrc
-            add     rdi,        320*2  ;diff = diff + 320 (shorts)
-            add     rax,        320    ;Predictor = pred + 320
-            movsxd  rdx,        dword ptr arg(4) ;stride;
-            pxor    mm7,        mm7
-
-            movq    mm0,        [rsi]
-            movq    mm1,        [rax]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi],      mm0
-            movq    [rdi+8],    mm3
-
-
-            movq    mm0,        [rsi+rdx]
-            movq    mm1,        [rax+8]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi+16],   mm0
-            movq    [rdi+24],   mm3
-
-            movq    mm0,        [rsi+rdx*2]
-            movq    mm1,        [rax+16]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi+32],   mm0
-            movq    [rdi+40],   mm3
-            lea     rsi,        [rsi+rdx*2]
-
-
-            movq    mm0,        [rsi+rdx]
-            movq    mm1,        [rax+24]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-
-            movq    [rdi+48],   mm0
-            movq    [rdi+56],   mm3
-
-
-            add     rdi,        64
-            add     rax,        32
-            lea     rsi,        [rsi+rdx*2]
-
-
-            movq    mm0,        [rsi]
-            movq    mm1,        [rax]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi],      mm0
-            movq    [rdi+8],    mm3
-
-
-            movq    mm0,        [rsi+rdx]
-            movq    mm1,        [rax+8]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi+16],   mm0
-            movq    [rdi+24],   mm3
-
-            movq    mm0,        [rsi+rdx*2]
-            movq    mm1,        [rax+16]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi+32],   mm0
-            movq    [rdi+40],   mm3
-            lea     rsi,        [rsi+rdx*2]
-
-
-            movq    mm0,        [rsi+rdx]
-            movq    mm1,        [rax+24]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-
-            movq    [rdi+48],   mm0
-            movq    [rdi+56],   mm3
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
--- a/vp8/encoder/x86/subtract_sse2.asm
+++ /dev/null
@@ -1,356 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_subtract_b_sse2_impl(unsigned char *z,  int src_stride,
-;                            short *diff, unsigned char *Predictor,
-;                            int pitch);
-global sym(vp9_subtract_b_sse2_impl)
-sym(vp9_subtract_b_sse2_impl):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov     rdi,        arg(2) ;diff
-        mov     rax,        arg(3) ;Predictor
-        mov     rsi,        arg(0) ;z
-        movsxd  rdx,        dword ptr arg(1);src_stride;
-        movsxd  rcx,        dword ptr arg(4);pitch
-        pxor    mm7,        mm7
-
-        movd    mm0,        [rsi]
-        movd    mm1,        [rax]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    MMWORD PTR [rdi],      mm0
-
-        movd    mm0,        [rsi+rdx]
-        movd    mm1,        [rax+rcx]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    MMWORD PTR [rdi+rcx*2], mm0
-
-        movd    mm0,        [rsi+rdx*2]
-        movd    mm1,        [rax+rcx*2]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    MMWORD PTR [rdi+rcx*4], mm0
-
-        lea     rsi,        [rsi+rdx*2]
-        lea     rcx,        [rcx+rcx*2]
-
-        movd    mm0,        [rsi+rdx]
-        movd    mm1,        [rax+rcx]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    MMWORD PTR [rdi+rcx*2], mm0
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_subtract_mby_sse2(short *diff, unsigned char *src, unsigned char *pred, int stride)
-global sym(vp9_subtract_mby_sse2)
-sym(vp9_subtract_mby_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-            mov         rsi,            arg(1) ;src
-            mov         rdi,            arg(0) ;diff
-
-            mov         rax,            arg(2) ;pred
-            movsxd      rdx,            dword ptr arg(3) ;stride
-
-            mov         rcx,            8      ; do two lines at one time
-
-.submby_loop:
-            movdqa      xmm0,           XMMWORD PTR [rsi]   ; src
-            movdqa      xmm1,           XMMWORD PTR [rax]   ; pred
-
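-            ; psubb wraps mod 256; recover signed 16-bit differences by
-            ; biasing both operands with 0x80, comparing to get a sign mask,
-            ; and interleaving that mask in as the high byte of each result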
-            movdqa      xmm2,           xmm0
-            psubb       xmm0,           xmm1
-
-            pxor        xmm1,           [GLOBAL(t80)]   ;convert to signed values
-            pxor        xmm2,           [GLOBAL(t80)]
-            pcmpgtb     xmm1,           xmm2            ; obtain sign information
-
-            movdqa      xmm2,    xmm0
-            movdqa      xmm3,    xmm1
-            punpcklbw   xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw   xmm2,    xmm3            ; put sign back to subtraction
-
-            movdqa      XMMWORD PTR [rdi],   xmm0
-            movdqa      XMMWORD PTR [rdi +16], xmm2
-
-            movdqa      xmm4,           XMMWORD PTR [rsi + rdx]
-            movdqa      xmm5,           XMMWORD PTR [rax + 16]
-
-            movdqa      xmm6,           xmm4
-            psubb       xmm4,           xmm5
-
-            pxor        xmm5,           [GLOBAL(t80)]   ;convert to signed values
-            pxor        xmm6,           [GLOBAL(t80)]
-            pcmpgtb     xmm5,           xmm6            ; obtain sign information
-
-            movdqa      xmm6,    xmm4
-            movdqa      xmm7,    xmm5
-            punpcklbw   xmm4,    xmm5            ; put sign back to subtraction
-            punpckhbw   xmm6,    xmm7            ; put sign back to subtraction
-
-            movdqa      XMMWORD PTR [rdi +32], xmm4
-            movdqa      XMMWORD PTR [rdi +48], xmm6
-
-            add         rdi,            64
-            add         rax,            32
-            lea         rsi,            [rsi+rdx*2]
-
-            sub         rcx,            1
-            jnz         .submby_loop
-
-    pop rdi
-    pop rsi
-    ; begin epilog
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
-global sym(vp9_subtract_mbuv_sse2)
-sym(vp9_subtract_mbuv_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
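-            ; macroblock diff layout: Y occupies diff[0..255], U diff[256..319],
-            ; V diff[320..383] (shorts); pred uses the same element offsets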
-            mov     rdi,        arg(0) ;diff
-            mov     rax,        arg(3) ;pred
-            mov     rsi,        arg(1) ;z = usrc
-            add     rdi,        256*2  ;diff = diff + 256 (shorts)
-            add     rax,        256    ;Predictor = pred + 256
-            movsxd  rdx,        dword ptr arg(4) ;stride;
-            lea     rcx,        [rdx + rdx*2]
-
-            ;u
-            ;line 0 1
-            movq       xmm0,    MMWORD PTR [rsi]  ; src
-            movq       xmm2,    MMWORD PTR [rsi+rdx]
-            movdqa     xmm1,    XMMWORD PTR [rax]  ; pred
-            punpcklqdq xmm0,    xmm2
-
-            movdqa     xmm2,    xmm0
-            psubb      xmm0,    xmm1            ; subtraction with sign missed
-
-            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
-            pxor       xmm2,    [GLOBAL(t80)]
-            pcmpgtb    xmm1,    xmm2            ; obtain sign information
-
-            movdqa     xmm2,    xmm0
-            movdqa     xmm3,    xmm1
-            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
-
-            movdqa     XMMWORD PTR [rdi],   xmm0
-            movdqa     XMMWORD PTR [rdi +16],   xmm2
-
-            ;line 2 3
-            movq       xmm0,    MMWORD PTR [rsi+rdx*2]  ; src
-            movq       xmm2,    MMWORD PTR [rsi+rcx]
-            movdqa     xmm1,    XMMWORD PTR [rax+16]  ; pred
-            punpcklqdq xmm0,    xmm2
-
-            movdqa     xmm2,    xmm0
-            psubb      xmm0,    xmm1            ; subtraction with sign missed
-
-            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
-            pxor       xmm2,    [GLOBAL(t80)]
-            pcmpgtb    xmm1,    xmm2            ; obtain sign information
-
-            movdqa     xmm2,    xmm0
-            movdqa     xmm3,    xmm1
-            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
-
-            movdqa     XMMWORD PTR [rdi + 32],   xmm0
-            movdqa     XMMWORD PTR [rdi + 48],   xmm2
-
-            ;line 4 5
-            lea        rsi,     [rsi + rdx*4]
-
-            movq       xmm0,    MMWORD PTR [rsi]  ; src
-            movq       xmm2,    MMWORD PTR [rsi+rdx]
-            movdqa     xmm1,    XMMWORD PTR [rax + 32]  ; pred
-            punpcklqdq xmm0,    xmm2
-
-            movdqa     xmm2,    xmm0
-            psubb      xmm0,    xmm1            ; subtraction with sign missed
-
-            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
-            pxor       xmm2,    [GLOBAL(t80)]
-            pcmpgtb    xmm1,    xmm2            ; obtain sign information
-
-            movdqa     xmm2,    xmm0
-            movdqa     xmm3,    xmm1
-            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
-
-            movdqa     XMMWORD PTR [rdi + 64],   xmm0
-            movdqa     XMMWORD PTR [rdi + 80],   xmm2
-
-            ;line 6 7
-            movq       xmm0,    MMWORD PTR [rsi+rdx*2]  ; src
-            movq       xmm2,    MMWORD PTR [rsi+rcx]
-            movdqa     xmm1,    XMMWORD PTR [rax+ 48]  ; pred
-            punpcklqdq xmm0,    xmm2
-
-            movdqa     xmm2,    xmm0
-            psubb      xmm0,    xmm1            ; subtraction with sign missed
-
-            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
-            pxor       xmm2,    [GLOBAL(t80)]
-            pcmpgtb    xmm1,    xmm2            ; obtain sign information
-
-            movdqa     xmm2,    xmm0
-            movdqa     xmm3,    xmm1
-            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
-
-            movdqa     XMMWORD PTR [rdi + 96],   xmm0
-            movdqa     XMMWORD PTR [rdi + 112],  xmm2
-
-            ;v
-            mov     rsi,        arg(2) ;z = vsrc
-            add     rdi,        64*2  ;diff = diff + 320 (shorts)
-            add     rax,        64    ;Predictor = pred + 320
-
-            ;line 0 1
-            movq       xmm0,    MMWORD PTR [rsi]  ; src
-            movq       xmm2,    MMWORD PTR [rsi+rdx]
-            movdqa     xmm1,    XMMWORD PTR [rax]  ; pred
-            punpcklqdq xmm0,    xmm2
-
-            movdqa     xmm2,    xmm0
-            psubb      xmm0,    xmm1            ; subtraction with sign missed
-
-            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
-            pxor       xmm2,    [GLOBAL(t80)]
-            pcmpgtb    xmm1,    xmm2            ; obtain sign information
-
-            movdqa     xmm2,    xmm0
-            movdqa     xmm3,    xmm1
-            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
-
-            movdqa     XMMWORD PTR [rdi],   xmm0
-            movdqa     XMMWORD PTR [rdi +16],   xmm2
-
-            ;line 2 3
-            movq       xmm0,    MMWORD PTR [rsi+rdx*2]  ; src
-            movq       xmm2,    MMWORD PTR [rsi+rcx]
-            movdqa     xmm1,    XMMWORD PTR [rax+16]  ; pred
-            punpcklqdq xmm0,    xmm2
-
-            movdqa     xmm2,    xmm0
-            psubb      xmm0,    xmm1            ; subtraction with sign missed
-
-            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
-            pxor       xmm2,    [GLOBAL(t80)]
-            pcmpgtb    xmm1,    xmm2            ; obtain sign information
-
-            movdqa     xmm2,    xmm0
-            movdqa     xmm3,    xmm1
-            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
-
-            movdqa     XMMWORD PTR [rdi + 32],   xmm0
-            movdqa     XMMWORD PTR [rdi + 48],   xmm2
-
-            ;line 4 5
-            lea        rsi,     [rsi + rdx*4]
-
-            movq       xmm0,    MMWORD PTR [rsi]  ; src
-            movq       xmm2,    MMWORD PTR [rsi+rdx]
-            movdqa     xmm1,    XMMWORD PTR [rax + 32]  ; pred
-            punpcklqdq xmm0,    xmm2
-
-            movdqa     xmm2,    xmm0
-            psubb      xmm0,    xmm1            ; subtraction with sign missed
-
-            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
-            pxor       xmm2,    [GLOBAL(t80)]
-            pcmpgtb    xmm1,    xmm2            ; obtain sign information
-
-            movdqa     xmm2,    xmm0
-            movdqa     xmm3,    xmm1
-            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
-
-            movdqa     XMMWORD PTR [rdi + 64],   xmm0
-            movdqa     XMMWORD PTR [rdi + 80],   xmm2
-
-            ;line 6 7
-            movq       xmm0,    MMWORD PTR [rsi+rdx*2]  ; src
-            movq       xmm2,    MMWORD PTR [rsi+rcx]
-            movdqa     xmm1,    XMMWORD PTR [rax+ 48]  ; pred
-            punpcklqdq xmm0,    xmm2
-
-            movdqa     xmm2,    xmm0
-            psubb      xmm0,    xmm1            ; subtraction with sign missed
-
-            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
-            pxor       xmm2,    [GLOBAL(t80)]
-            pcmpgtb    xmm1,    xmm2            ; obtain sign information
-
-            movdqa     xmm2,    xmm0
-            movdqa     xmm3,    xmm1
-            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
-
-            movdqa     XMMWORD PTR [rdi + 96],   xmm0
-            movdqa     XMMWORD PTR [rdi + 112],  xmm2
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-t80:
-    times 16 db 0x80
--- a/vp8/encoder/x86/temporal_filter_apply_sse2.asm
+++ /dev/null
@@ -1,207 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-; void vp9_temporal_filter_apply_sse2 | arg
-;  (unsigned char  *frame1,           |  0
-;   unsigned int    stride,           |  1
-;   unsigned char  *frame2,           |  2
-;   unsigned int    block_size,       |  3
-;   int             strength,         |  4
-;   int             filter_weight,    |  5
-;   unsigned int   *accumulator,      |  6
-;   unsigned short *count)            |  7
-global sym(vp9_temporal_filter_apply_sse2)
-sym(vp9_temporal_filter_apply_sse2):
-
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ALIGN_STACK 16, rax
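-    ; byte offsets of locals in the 16-byte-aligned scratch area below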
-    %define block_size    0
-    %define strength      16
-    %define filter_weight 32
-    %define rounding_bit  48
-    %define rbp_backup    64
-    %define stack_size    80
-    sub         rsp,           stack_size
-    mov         [rsp + rbp_backup], rbp
-    ; end prolog
-
-        mov         rdx,            arg(3)
-        mov         [rsp + block_size], rdx
-        movd        xmm6,            arg(4)
-        movdqa      [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
-
-        ; calculate the rounding bit outside the loop
-        ; 0x8000 >> (16 - strength)
-        mov         rdx,            16
-        sub         rdx,            arg(4) ; 16 - strength
-        movd        xmm4,           rdx    ; can't use rdx w/ shift
-        movdqa      xmm5,           [GLOBAL(_const_top_bit)]
-        psrlw       xmm5,           xmm4
-        movdqa      [rsp + rounding_bit], xmm5
-
-        mov         rsi,            arg(0) ; src/frame1
-        mov         rdx,            arg(2) ; predictor frame
-        mov         rdi,            arg(6) ; accumulator
-        mov         rax,            arg(7) ; count
-
-        ; dup the filter weight and store for later
-        movd        xmm0,           arg(5) ; filter_weight
-        pshuflw     xmm0,           xmm0, 0
-        punpcklwd   xmm0,           xmm0
-        movdqa      [rsp + filter_weight], xmm0
-
-        mov         rbp,            arg(1) ; stride
-        pxor        xmm7,           xmm7   ; zero for extraction
-
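-        ; rcx = end of the predictor block: 16*16 bytes for 16x16 blocks,
-        ; 8*8 bytes for 8x8; the loop runs until rdx (pred cursor) reaches it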
-        lea         rcx,            [rdx + 16*16*1]
-        cmp         dword ptr [rsp + block_size], 8
-        jne         .temporal_filter_apply_load_16
-        lea         rcx,            [rdx + 8*8*1]
-
-.temporal_filter_apply_load_8:
-        movq        xmm0,           [rsi]  ; first row
-        lea         rsi,            [rsi + rbp] ; += stride
-        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
-        movq        xmm1,           [rsi]  ; second row
-        lea         rsi,            [rsi + rbp] ; += stride
-        punpcklbw   xmm1,           xmm7   ; src[ 8-15]
-        jmp         .temporal_filter_apply_load_finished
-
-.temporal_filter_apply_load_16:
-        movdqa      xmm0,           [rsi]  ; src (frame1)
-        lea         rsi,            [rsi + rbp] ; += stride
-        movdqa      xmm1,           xmm0
-        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
-        punpckhbw   xmm1,           xmm7   ; src[ 8-15]
-
-.temporal_filter_apply_load_finished:
-        movdqa      xmm2,           [rdx]  ; predictor (frame2)
-        movdqa      xmm3,           xmm2
-        punpcklbw   xmm2,           xmm7   ; pred[ 0- 7]
-        punpckhbw   xmm3,           xmm7   ; pred[ 8-15]
-
-        ; modifier = src_byte - pixel_value
-        psubw       xmm0,           xmm2   ; src - pred[ 0- 7]
-        psubw       xmm1,           xmm3   ; src - pred[ 8-15]
-
-        ; modifier *= modifier
-        pmullw      xmm0,           xmm0   ; modifier[ 0- 7]^2
-        pmullw      xmm1,           xmm1   ; modifier[ 8-15]^2
-
-        ; modifier *= 3
-        pmullw      xmm0,           [GLOBAL(_const_3w)]
-        pmullw      xmm1,           [GLOBAL(_const_3w)]
-
-        ; modifier += 0x8000 >> (16 - strength)
-        paddw       xmm0,           [rsp + rounding_bit]
-        paddw       xmm1,           [rsp + rounding_bit]
-
-        ; modifier >>= strength
-        psrlw       xmm0,           [rsp + strength]
-        psrlw       xmm1,           [rsp + strength]
-
-        ; modifier = 16 - modifier
-        ; saturation takes care of modifier > 16
-        movdqa      xmm3,           [GLOBAL(_const_16w)]
-        movdqa      xmm2,           [GLOBAL(_const_16w)]
-        psubusw     xmm3,           xmm1
-        psubusw     xmm2,           xmm0
-
-        ; modifier *= filter_weight
-        pmullw      xmm2,           [rsp + filter_weight]
-        pmullw      xmm3,           [rsp + filter_weight]
-
-        ; count
-        movdqa      xmm4,           [rax]
-        movdqa      xmm5,           [rax+16]
-        ; += modifier
-        paddw       xmm4,           xmm2
-        paddw       xmm5,           xmm3
-        ; write back
-        movdqa      [rax],          xmm4
-        movdqa      [rax+16],       xmm5
-        lea         rax,            [rax + 16*2] ; count += 16*(sizeof(short))
-
-        ; load and extract the predictor up to shorts
-        pxor        xmm7,           xmm7
-        movdqa      xmm0,           [rdx]
-        lea         rdx,            [rdx + 16*1] ; pred += 16*(sizeof(char))
-        movdqa      xmm1,           xmm0
-        punpcklbw   xmm0,           xmm7   ; pred[ 0- 7]
-        punpckhbw   xmm1,           xmm7   ; pred[ 8-15]
-
-        ; modifier *= pixel_value
-        pmullw      xmm0,           xmm2
-        pmullw      xmm1,           xmm3
-
-        ; expand to double words
-        movdqa      xmm2,           xmm0
-        punpcklwd   xmm0,           xmm7   ; [ 0- 3]
-        punpckhwd   xmm2,           xmm7   ; [ 4- 7]
-        movdqa      xmm3,           xmm1
-        punpcklwd   xmm1,           xmm7   ; [ 8-11]
-        punpckhwd   xmm3,           xmm7   ; [12-15]
-
-        ; accumulator
-        movdqa      xmm4,           [rdi]
-        movdqa      xmm5,           [rdi+16]
-        movdqa      xmm6,           [rdi+32]
-        movdqa      xmm7,           [rdi+48]
-        ; += modifier
-        paddd       xmm4,           xmm0
-        paddd       xmm5,           xmm2
-        paddd       xmm6,           xmm1
-        paddd       xmm7,           xmm3
-        ; write back
-        movdqa      [rdi],          xmm4
-        movdqa      [rdi+16],       xmm5
-        movdqa      [rdi+32],       xmm6
-        movdqa      [rdi+48],       xmm7
-        lea         rdi,            [rdi + 16*4] ; accumulator += 16*(sizeof(int))
-
-        cmp         rdx,            rcx
-        je          .temporal_filter_apply_epilog
-        pxor        xmm7,           xmm7   ; zero for extraction
-        cmp         dword ptr [rsp + block_size], 16
-        je          .temporal_filter_apply_load_16
-        jmp         .temporal_filter_apply_load_8
-
-.temporal_filter_apply_epilog:
-    ; begin epilog
-    mov         rbp,            [rsp + rbp_backup]
-    add         rsp,            stack_size
-    pop         rsp
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-_const_3w:
-    times 8 dw 3
-align 16
-_const_top_bit:
-    times 8 dw 1<<15
-align 16
-_const_16w:
-    times 8 dw 16
--- a/vp8/encoder/x86/temporal_filter_x86.h
+++ /dev/null
@@ -1,27 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_TEMPORAL_FILTER_X86_H
-#define __INC_TEMPORAL_FILTER_X86_H
-
-#if HAVE_SSE2
-extern prototype_apply(vp9_temporal_filter_apply_sse2);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-
-#undef  vp9_temporal_filter_apply
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_sse2
-
-#endif
-
-#endif
-
-#endif // __INC_TEMPORAL_FILTER_X86_H
--- a/vp8/encoder/x86/variance_impl_mmx.asm
+++ /dev/null
@@ -1,851 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;unsigned int vp9_get_mb_ss_mmx( short *src_ptr )
-global sym(vp9_get_mb_ss_mmx)
-sym(vp9_get_mb_ss_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    sub         rsp, 8
-    ; end prolog
-
-        mov         rax, arg(0) ;src_ptr
-        mov         rcx, 16
-        pxor        mm4, mm4
-
-.NEXTROW:
-        movq        mm0, [rax]
-        movq        mm1, [rax+8]
-        movq        mm2, [rax+16]
-        movq        mm3, [rax+24]
-        pmaddwd     mm0, mm0
-        pmaddwd     mm1, mm1
-        pmaddwd     mm2, mm2
-        pmaddwd     mm3, mm3
-
-        paddd       mm4, mm0
-        paddd       mm4, mm1
-        paddd       mm4, mm2
-        paddd       mm4, mm3
-
-        add         rax, 32
-        dec         rcx
-        ja          .NEXTROW
-        movq        QWORD PTR [rsp], mm4
-
-        ;return sum[0]+sum[1];
-        movsxd      rax, dword ptr [rsp]
-        movsxd      rcx, dword ptr [rsp+4]
-        add         rax, rcx
-
-
-    ; begin epilog
-    add rsp, 8
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp9_get8x8var_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  source_stride,
-;    unsigned char *ref_ptr,
-;    int  recon_stride,
-;    unsigned int *SSE,
-;    int *Sum
-;)
-global sym(vp9_get8x8var_mmx)
-sym(vp9_get8x8var_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push rsi
-    push rdi
-    push rbx
-    sub         rsp, 16
-    ; end prolog
-
-
-        pxor        mm5, mm5                    ; Blank mm5 (difference accumulator)
-        pxor        mm6, mm6                    ; Blank mm6 (zero for unpacking)
-        pxor        mm7, mm7                    ; Blank mm7 (squared-difference accumulator)
-
-        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
-        mov         rbx, arg(2) ;[ref_ptr]
-        movsxd      rcx, dword ptr arg(1) ;[source_stride]
-        movsxd      rdx, dword ptr arg(3) ;[recon_stride]
-
-        ; Row 1
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher precision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-
-        ; Row 2
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher precision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 3
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher precision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 4
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher precision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 5
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher precision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 6
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher precision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 7
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher precision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 8
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher precision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Now accumulate the final results.
-        movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory
-        movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory
-        movsx       rdx, WORD PTR [rsp+8]
-        movsx       rcx, WORD PTR [rsp+10]
-        movsx       rbx, WORD PTR [rsp+12]
-        movsx       rax, WORD PTR [rsp+14]
-        add         rdx, rcx
-        add         rbx, rax
-        add         rdx, rbx    ;XSum
-        movsxd      rax, DWORD PTR [rsp]
-        movsxd      rcx, DWORD PTR [rsp+4]
-        add         rax, rcx    ;XXSum
-        mov         rsi, arg(4) ;SSE
-        mov         rdi, arg(5) ;Sum
-        mov         dword ptr [rsi], eax
-        mov         dword ptr [rdi], edx
-        xor         rax, rax    ; return 0
-
-
-    ; begin epilog
-    add rsp, 16
-    pop rbx
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
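-
-; Illustrative C sketch of the 8x8 variance helper above, assuming the
-; argument names from the prototype comment; the sum of differences and
-; the sum of squared differences are written through Sum and SSE, and
-; the return value is always 0:
-;
-;    for (i = 0; i < 8; i++) {
-;        for (j = 0; j < 8; j++) {
-;            int diff = src_ptr[j] - ref_ptr[j];
-;            *Sum += diff;
-;            *SSE += diff * diff;
-;        }
-;        src_ptr += source_stride;
-;        ref_ptr += recon_stride;
-;    }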
-
-
-
-;unsigned int
-;vp9_get4x4var_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  source_stride,
-;    unsigned char *ref_ptr,
-;    int  recon_stride,
-;    unsigned int *SSE,
-;    int *Sum
-;)
-global sym(vp9_get4x4var_mmx)
-sym(vp9_get4x4var_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push rsi
-    push rdi
-    push rbx
-    sub         rsp, 16
-    ; end prolog
-
-
-        pxor        mm5, mm5                    ; Blank mm5
-        pxor        mm6, mm6                    ; Blank mm6
-        pxor        mm7, mm7                    ; Blank mm7
-
-        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
-        mov         rbx, arg(2) ;[ref_ptr]
-        movsxd      rcx, dword ptr arg(1) ;[source_stride]
-        movsxd      rdx, dword ptr arg(3) ;[recon_stride]
-
-        ; Row 1
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-
-        ; Row 2
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        ; Row 3
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        ; Row 4
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-
-        ; Now accumulate the final results.
-        movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory
-        movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory
-        movsx       rdx, WORD PTR [rsp+8]
-        movsx       rcx, WORD PTR [rsp+10]
-        movsx       rbx, WORD PTR [rsp+12]
-        movsx       rax, WORD PTR [rsp+14]
-        add         rdx, rcx
-        add         rbx, rax
-        add         rdx, rbx    ;XSum
-        movsxd      rax, DWORD PTR [rsp]
-        movsxd      rcx, DWORD PTR [rsp+4]
-        add         rax, rcx    ;XXSum
-        mov         rsi, arg(4) ;SSE
-        mov         rdi, arg(5) ;Sum
-        mov         dword ptr [rsi], eax
-        mov         dword ptr [rdi], edx
-        xor         rax, rax    ; return 0
-
-
-    ; begin epilog
-    add rsp, 16
-    pop rbx
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
-;unsigned int
-;vp9_get4x4sse_cs_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  source_stride,
-;    unsigned char *ref_ptr,
-;    int  recon_stride
-;)
-global sym(vp9_get4x4sse_cs_mmx)
-sym(vp9_get4x4sse_cs_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    push rbx
-    ; end prolog
-
-
-        pxor        mm6, mm6                    ; Blank mm6
-        pxor        mm7, mm7                    ; Blank mm7
-
-        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
-        mov         rbx, arg(2) ;[ref_ptr]
-        movsxd      rcx, dword ptr arg(1) ;[source_stride]
-        movsxd      rdx, dword ptr arg(3) ;[recon_stride]
-        ; Row 1
-        movd        mm0, [rax]                  ; Copy four bytes to mm0
-        movd        mm1, [rbx]                  ; Copy four bytes to mm1
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movd        mm1, [rbx]                  ; Copy four bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        ; Row 2
-        movd        mm0, [rax]                  ; Copy four bytes to mm0
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movd        mm1, [rbx]                  ; Copy four bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        ; Row 3
-        movd        mm0, [rax]                  ; Copy four bytes to mm0
-        punpcklbw   mm1, mm6
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movd        mm1, [rbx]                  ; Copy four bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        ; Row 4
-        movd        mm0, [rax]                  ; Copy four bytes to mm0
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        movq        mm0,    mm7                 ;
-        psrlq       mm7,    32
-
-        paddd       mm0,    mm7
-        movq        rax,    mm0
-
-
-    ; begin epilog
-    pop rbx
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
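-
-; Unlike the helpers above, this routine returns the 4x4 sum of squared
-; differences directly in rax rather than writing through a pointer.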
-
-%define mmx_filter_shift            7
-
-;void vp9_filter_block2d_bil4x4_var_mmx
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned short *HFilter,
-;    unsigned short *VFilter,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp9_filter_block2d_bil4x4_var_mmx)
-sym(vp9_filter_block2d_bil4x4_var_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    sub         rsp, 16
-    ; end prolog
-
-
-        pxor            mm6,            mm6                 ;
-        pxor            mm7,            mm7                 ;
-
-        mov             rax,            arg(4) ;HFilter             ;
-        mov             rdx,            arg(5) ;VFilter             ;
-
-        mov             rsi,            arg(0) ;ref_ptr              ;
-        mov             rdi,            arg(2) ;src_ptr              ;
-
-        mov             rcx,            4                   ;
-        pxor            mm0,            mm0                 ;
-
-        movd            mm1,            [rsi]               ;
-        movd            mm3,            [rsi+1]             ;
-
-        punpcklbw       mm1,            mm0                 ;
-        pmullw          mm1,            [rax]               ;
-
-        punpcklbw       mm3,            mm0                 ;
-        pmullw          mm3,            [rax+8]             ;
-
-        paddw           mm1,            mm3                 ;
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-
-        psraw           mm1,            mmx_filter_shift    ;
-        movq            mm5,            mm1
-
-%if ABI_IS_32BIT
-        add             rsi, dword ptr  arg(1) ;ref_pixels_per_line    ;
-%else
-        movsxd          r8, dword ptr  arg(1) ;ref_pixels_per_line    ;
-        add             rsi, r8
-%endif
-
-.filter_block2d_bil4x4_var_mmx_loop:
-
-        movd            mm1,            [rsi]               ;
-        movd            mm3,            [rsi+1]             ;
-
-        punpcklbw       mm1,            mm0                 ;
-        pmullw          mm1,            [rax]               ;
-
-        punpcklbw       mm3,            mm0                 ;
-        pmullw          mm3,            [rax+8]             ;
-
-        paddw           mm1,            mm3                 ;
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-
-        psraw           mm1,            mmx_filter_shift    ;
-        movq            mm3,            mm5                 ;
-
-        movq            mm5,            mm1                 ;
-        pmullw          mm3,            [rdx]               ;
-
-        pmullw          mm1,            [rdx+8]             ;
-        paddw           mm1,            mm3                 ;
-
-
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-        psraw           mm1,            mmx_filter_shift    ;
-
-        movd            mm3,            [rdi]               ;
-        punpcklbw       mm3,            mm0                 ;
-
-        psubw           mm1,            mm3                 ;
-        paddw           mm6,            mm1                 ;
-
-        pmaddwd         mm1,            mm1                 ;
-        paddd           mm7,            mm1                 ;
-
-%if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
-        add             rdi,            dword ptr arg(3) ;src_pixels_per_line    ;
-%else
-        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
-        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
-        add             rsi,            r8
-        add             rdi,            r9
-%endif
-        sub             rcx,            1                   ;
-        jnz             .filter_block2d_bil4x4_var_mmx_loop       ;
-
-
-        pxor            mm3,            mm3                 ;
-        pxor            mm2,            mm2                 ;
-
-        punpcklwd       mm2,            mm6                 ;
-        punpckhwd       mm3,            mm6                 ;
-
-        paddd           mm2,            mm3                 ;
-        movq            mm6,            mm2                 ;
-
-        psrlq           mm6,            32                  ;
-        paddd           mm2,            mm6                 ;
-
-        psrad           mm2,            16                  ;
-        movq            mm4,            mm7                 ;
-
-        psrlq           mm4,            32                  ;
-        paddd           mm4,            mm7                 ;
-
-        mov             rdi,            arg(6) ;sum
-        mov             rsi,            arg(7) ;sumsquared
-
-        movd            dword ptr [rdi],          mm2                 ;
-        movd            dword ptr [rsi],          mm4                 ;
-
-
-
-    ; begin epilog
-    add rsp, 16
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
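-
-; Per output pixel, the two-tap bilinear filtering above follows this
-; pattern (illustrative; mmx_bi_rd supplies the rounding constant 64
-; and mmx_filter_shift is 7, as defined in this file):
-;
-;    filtered = (a * Filter[0] + b * Filter[1] + 64) >> 7;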
-
-
-
-
-;void vp9_filter_block2d_bil_var_mmx
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    unsigned short *HFilter,
-;    unsigned short *VFilter,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp9_filter_block2d_bil_var_mmx)
-sym(vp9_filter_block2d_bil_var_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 9
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    sub         rsp, 16
-    ; end prolog
-
-        pxor            mm6,            mm6                 ;
-        pxor            mm7,            mm7                 ;
-        mov             rax,            arg(5) ;HFilter             ;
-
-        mov             rdx,            arg(6) ;VFilter             ;
-        mov             rsi,            arg(0) ;ref_ptr              ;
-
-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
-
-        pxor            mm0,            mm0                 ;
-        movq            mm1,            [rsi]               ;
-
-        movq            mm3,            [rsi+1]             ;
-        movq            mm2,            mm1                 ;
-
-        movq            mm4,            mm3                 ;
-        punpcklbw       mm1,            mm0                 ;
-
-        punpckhbw       mm2,            mm0                 ;
-        pmullw          mm1,            [rax]               ;
-
-        pmullw          mm2,            [rax]               ;
-        punpcklbw       mm3,            mm0                 ;
-
-        punpckhbw       mm4,            mm0                 ;
-        pmullw          mm3,            [rax+8]             ;
-
-        pmullw          mm4,            [rax+8]             ;
-        paddw           mm1,            mm3                 ;
-
-        paddw           mm2,            mm4                 ;
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-
-        psraw           mm1,            mmx_filter_shift    ;
-        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
-
-        psraw           mm2,            mmx_filter_shift    ;
-        movq            mm5,            mm1
-
-        packuswb        mm5,            mm2                 ;
-%if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
-%else
-        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
-        add             rsi,            r8
-%endif
-
-.filter_block2d_bil_var_mmx_loop:
-
-        movq            mm1,            [rsi]               ;
-        movq            mm3,            [rsi+1]             ;
-
-        movq            mm2,            mm1                 ;
-        movq            mm4,            mm3                 ;
-
-        punpcklbw       mm1,            mm0                 ;
-        punpckhbw       mm2,            mm0                 ;
-
-        pmullw          mm1,            [rax]               ;
-        pmullw          mm2,            [rax]               ;
-
-        punpcklbw       mm3,            mm0                 ;
-        punpckhbw       mm4,            mm0                 ;
-
-        pmullw          mm3,            [rax+8]             ;
-        pmullw          mm4,            [rax+8]             ;
-
-        paddw           mm1,            mm3                 ;
-        paddw           mm2,            mm4                 ;
-
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-        psraw           mm1,            mmx_filter_shift    ;
-
-        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
-        psraw           mm2,            mmx_filter_shift    ;
-
-        movq            mm3,            mm5                 ;
-        movq            mm4,            mm5                 ;
-
-        punpcklbw       mm3,            mm0                 ;
-        punpckhbw       mm4,            mm0                 ;
-
-        movq            mm5,            mm1                 ;
-        packuswb        mm5,            mm2                 ;
-
-        pmullw          mm3,            [rdx]               ;
-        pmullw          mm4,            [rdx]               ;
-
-        pmullw          mm1,            [rdx+8]             ;
-        pmullw          mm2,            [rdx+8]             ;
-
-        paddw           mm1,            mm3                 ;
-        paddw           mm2,            mm4                 ;
-
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
-
-        psraw           mm1,            mmx_filter_shift    ;
-        psraw           mm2,            mmx_filter_shift    ;
-
-        movq            mm3,            [rdi]               ;
-        movq            mm4,            mm3                 ;
-
-        punpcklbw       mm3,            mm0                 ;
-        punpckhbw       mm4,            mm0                 ;
-
-        psubw           mm1,            mm3                 ;
-        psubw           mm2,            mm4                 ;
-
-        paddw           mm6,            mm1                 ;
-        pmaddwd         mm1,            mm1                 ;
-
-        paddw           mm6,            mm2                 ;
-        pmaddwd         mm2,            mm2                 ;
-
-        paddd           mm7,            mm1                 ;
-        paddd           mm7,            mm2                 ;
-
-%if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
-        add             rdi,            dword ptr arg(3) ;src_pixels_per_line    ;
-%else
-        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line    ;
-        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line    ;
-        add             rsi,            r8
-        add             rdi,            r9
-%endif
-        sub             rcx,            1                   ;
-        jnz             .filter_block2d_bil_var_mmx_loop       ;
-
-
-        pxor            mm3,            mm3                 ;
-        pxor            mm2,            mm2                 ;
-
-        punpcklwd       mm2,            mm6                 ;
-        punpckhwd       mm3,            mm6                 ;
-
-        paddd           mm2,            mm3                 ;
-        movq            mm6,            mm2                 ;
-
-        psrlq           mm6,            32                  ;
-        paddd           mm2,            mm6                 ;
-
-        psrad           mm2,            16                  ;
-        movq            mm4,            mm7                 ;
-
-        psrlq           mm4,            32                  ;
-        paddd           mm4,            mm7                 ;
-
-        mov             rdi,            arg(7) ;sum
-        mov             rsi,            arg(8) ;sumsquared
-
-        movd            dword ptr [rdi],          mm2                 ;
-        movd            dword ptr [rsi],          mm4                 ;
-
-    ; begin epilog
-    add rsp, 16
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-SECTION_RODATA
-;short mmx_bi_rd[4] = { 64, 64, 64, 64};
-align 16
-mmx_bi_rd:
-    times 4 dw 64
--- a/vp8/encoder/x86/variance_impl_sse2.asm
+++ /dev/null
@@ -1,1367 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define xmm_filter_shift            7
-
-;unsigned int vp9_get_mb_ss_sse2
-;(
-;    short *src_ptr
-;)
-global sym(vp9_get_mb_ss_sse2)
-sym(vp9_get_mb_ss_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 1
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    sub         rsp, 16
-    ; end prolog
-
-
-        mov         rax, arg(0) ;[src_ptr]
-        mov         rcx, 8
-        pxor        xmm4, xmm4
-
-.NEXTROW:
-        movdqa      xmm0, [rax]
-        movdqa      xmm1, [rax+16]
-        movdqa      xmm2, [rax+32]
-        movdqa      xmm3, [rax+48]
-        pmaddwd     xmm0, xmm0
-        pmaddwd     xmm1, xmm1
-        pmaddwd     xmm2, xmm2
-        pmaddwd     xmm3, xmm3
-
-        paddd       xmm0, xmm1
-        paddd       xmm2, xmm3
-        paddd       xmm4, xmm0
-        paddd       xmm4, xmm2
-
-        add         rax, 0x40
-        dec         rcx
-        ja          .NEXTROW
-
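-        ; fold the four 32-bit partial sums in xmm4 down to one scalar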
-        movdqa      xmm3,xmm4
-        psrldq      xmm4,8
-        paddd       xmm4,xmm3
-        movdqa      xmm3,xmm4
-        psrldq      xmm4,4
-        paddd       xmm4,xmm3
-        movq        rax,xmm4
-
-
-    ; begin epilog
-    add rsp, 16
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp9_get16x16var_sse2
-;(
-;    unsigned char   *  src_ptr,
-;    int             source_stride,
-;    unsigned char   *  ref_ptr,
-;    int             recon_stride,
-;    unsigned int    *  SSE,
-;    int             *  Sum
-;)
-global sym(vp9_get16x16var_sse2)
-sym(vp9_get16x16var_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov         rsi,            arg(0) ;[src_ptr]
-        mov         rdi,            arg(2) ;[ref_ptr]
-
-        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
-        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
-
-        ; Prefetch data
-        lea             rcx,    [rax+rax*2]
-        prefetcht0      [rsi]
-        prefetcht0      [rsi+rax]
-        prefetcht0      [rsi+rax*2]
-        prefetcht0      [rsi+rcx]
-        lea             rbx,    [rsi+rax*4]
-        prefetcht0      [rbx]
-        prefetcht0      [rbx+rax]
-        prefetcht0      [rbx+rax*2]
-        prefetcht0      [rbx+rcx]
-
-        lea             rcx,    [rdx+rdx*2]
-        prefetcht0      [rdi]
-        prefetcht0      [rdi+rdx]
-        prefetcht0      [rdi+rdx*2]
-        prefetcht0      [rdi+rcx]
-        lea             rbx,    [rdi+rdx*4]
-        prefetcht0      [rbx]
-        prefetcht0      [rbx+rdx]
-        prefetcht0      [rbx+rdx*2]
-        prefetcht0      [rbx+rcx]
-
-        pxor        xmm0,           xmm0                        ; clear xmm0 for unpack
-        pxor        xmm7,           xmm7                        ; clear xmm7 for accumulating diffs
-
-        pxor        xmm6,           xmm6                        ; clear xmm6 for accumulating sse
-        mov         rcx,            16
-
-.var16loop:
-        movdqu      xmm1,           XMMWORD PTR [rsi]
-        movdqu      xmm2,           XMMWORD PTR [rdi]
-
-        prefetcht0      [rsi+rax*8]
-        prefetcht0      [rdi+rdx*8]
-
-        movdqa      xmm3,           xmm1
-        movdqa      xmm4,           xmm2
-
-
-        punpcklbw   xmm1,           xmm0
-        punpckhbw   xmm3,           xmm0
-
-        punpcklbw   xmm2,           xmm0
-        punpckhbw   xmm4,           xmm0
-
-
-        psubw       xmm1,           xmm2
-        psubw       xmm3,           xmm4
-
-        paddw       xmm7,           xmm1
-        pmaddwd     xmm1,           xmm1
-
-        paddw       xmm7,           xmm3
-        pmaddwd     xmm3,           xmm3
-
-        paddd       xmm6,           xmm1
-        paddd       xmm6,           xmm3
-
-        add         rsi,            rax
-        add         rdi,            rdx
-
-        sub         rcx,            1
-        jnz         .var16loop
-
-
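-        ; horizontally reduce the accumulators: xmm7 (16-bit diff sums)
-        ; feeds Sum, xmm6 (32-bit squared-diff sums) feeds SSE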
-        movdqa      xmm1,           xmm6
-        pxor        xmm6,           xmm6
-
-        pxor        xmm5,           xmm5
-        punpcklwd   xmm6,           xmm7
-
-        punpckhwd   xmm5,           xmm7
-        psrad       xmm5,           16
-
-        psrad       xmm6,           16
-        paddd       xmm6,           xmm5
-
-        movdqa      xmm2,           xmm1
-        punpckldq   xmm1,           xmm0
-
-        punpckhdq   xmm2,           xmm0
-        movdqa      xmm7,           xmm6
-
-        paddd       xmm1,           xmm2
-        punpckldq   xmm6,           xmm0
-
-        punpckhdq   xmm7,           xmm0
-        paddd       xmm6,           xmm7
-
-        movdqa      xmm2,           xmm1
-        movdqa      xmm7,           xmm6
-
-        psrldq      xmm1,           8
-        psrldq      xmm6,           8
-
-        paddd       xmm7,           xmm6
-        paddd       xmm1,           xmm2
-
-        mov         rax,            arg(5) ;[Sum]
-        mov         rdi,            arg(4) ;[SSE]
-
-        movd DWORD PTR [rax],       xmm7
-        movd DWORD PTR [rdi],       xmm1
-
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    pop rbx
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
-
-;unsigned int vp9_get8x8var_sse2
-;(
-;    unsigned char   *  src_ptr,
-;    int             source_stride,
-;    unsigned char   *  ref_ptr,
-;    int             recon_stride,
-;    unsigned int    *  SSE,
-;    int             *  Sum
-;)
-global sym(vp9_get8x8var_sse2)
-sym(vp9_get8x8var_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    sub         rsp, 16
-    ; end prolog
-
-        mov         rsi,            arg(0) ;[src_ptr]
-        mov         rdi,            arg(2) ;[ref_ptr]
-
-        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
-        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
-
-        pxor        xmm0,           xmm0                        ; clear xmm0 for unpack
-        pxor        xmm7,           xmm7                        ; clear xmm7 for accumulating diffs
-
-        movq        xmm1,           QWORD PTR [rsi]
-        movq        xmm2,           QWORD PTR [rdi]
-
-        punpcklbw   xmm1,           xmm0
-        punpcklbw   xmm2,           xmm0
-
-        psubsw      xmm1,           xmm2
-        paddw       xmm7,           xmm1
-
-        pmaddwd     xmm1,           xmm1
-
-        movq        xmm2,           QWORD PTR[rsi + rax]
-        movq        xmm3,           QWORD PTR[rdi + rdx]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-
-        movq        xmm2,           QWORD PTR[rsi + rax * 2]
-        movq        xmm3,           QWORD PTR[rdi + rdx * 2]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-
-        lea         rsi,            [rsi + rax * 2]
-        lea         rdi,            [rdi + rdx * 2]
-        movq        xmm2,           QWORD PTR[rsi + rax]
-        movq        xmm3,           QWORD PTR[rdi + rdx]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-        movq        xmm2,           QWORD PTR[rsi + rax *2]
-        movq        xmm3,           QWORD PTR[rdi + rdx *2]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-
-        lea         rsi,            [rsi + rax * 2]
-        lea         rdi,            [rdi + rdx * 2]
-
-
-        movq        xmm2,           QWORD PTR[rsi + rax]
-        movq        xmm3,           QWORD PTR[rdi + rdx]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-        movq        xmm2,           QWORD PTR[rsi + rax *2]
-        movq        xmm3,           QWORD PTR[rdi + rdx *2]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-
-        lea         rsi,            [rsi + rax * 2]
-        lea         rdi,            [rdi + rdx * 2]
-
-        movq        xmm2,           QWORD PTR[rsi + rax]
-        movq        xmm3,           QWORD PTR[rdi + rdx]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-
-        movdqa      xmm6,           xmm7
-        punpcklwd   xmm6,           xmm0
-
-        punpckhwd   xmm7,           xmm0
-        movdqa      xmm2,           xmm1
-
-        paddw       xmm6,           xmm7
-        punpckldq   xmm1,           xmm0
-
-        punpckhdq   xmm2,           xmm0
-        movdqa      xmm7,           xmm6
-
-        paddd       xmm1,           xmm2
-        punpckldq   xmm6,           xmm0
-
-        punpckhdq   xmm7,           xmm0
-        paddw       xmm6,           xmm7
-
-        movdqa      xmm2,           xmm1
-        movdqa      xmm7,           xmm6
-
-        psrldq      xmm1,           8
-        psrldq      xmm6,           8
-
-        paddw       xmm7,           xmm6
-        paddd       xmm1,           xmm2
-
-        mov         rax,            arg(5) ;[Sum]
-        mov         rdi,            arg(4) ;[SSE]
-
-        movq        rdx,            xmm7
-        movsx       rcx,            dx
-
-        mov  dword ptr [rax],       ecx
-        movd DWORD PTR [rdi],       xmm1
-
-    ; begin epilog
-    add rsp, 16
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_filter_block2d_bil_var_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int  xoffset,
-;    int  yoffset,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
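-;
-; The routine dispatches on the offsets: both nonzero takes the
-; two-pass path; xoffset=0 with yoffset!=0 takes the second-pass-only
-; path; xoffset!=0 with yoffset=0 takes the first-pass-only path; both
-; zero reduces to a plain full-pixel variance.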
-global sym(vp9_filter_block2d_bil_var_sse2)
-sym(vp9_filter_block2d_bil_var_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 9
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    push rbx
-    ; end prolog
-
-        pxor            xmm6,           xmm6                 ;
-        pxor            xmm7,           xmm7                 ;
-
-        lea             rsi,            [GLOBAL(xmm_bi_rd)]  ; rounding
-        movdqa          xmm4,           XMMWORD PTR [rsi]
-
-        lea             rcx,            [GLOBAL(bilinear_filters_sse2)]
-        movsxd          rax,            dword ptr arg(5)     ; xoffset
-
-        cmp             rax,            0                    ; skip first_pass filter if xoffset=0
-        je              filter_block2d_bil_var_sse2_sp_only
-
-        shl             rax,            5                    ; point to filter coeff with xoffset
-        lea             rax,            [rax + rcx]          ; HFilter
-
-        movsxd          rdx,            dword ptr arg(6)     ; yoffset
-
-        cmp             rdx,            0                    ; skip second_pass filter if yoffset=0
-        je              filter_block2d_bil_var_sse2_fp_only
-
-        shl             rdx,            5
-        lea             rdx,            [rdx + rcx]          ; VFilter
-
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-
-        pxor            xmm0,           xmm0                 ;
-        movq            xmm1,           QWORD PTR [rsi]      ;
-        movq            xmm3,           QWORD PTR [rsi+1]    ;
-
-        punpcklbw       xmm1,           xmm0                 ;
-        pmullw          xmm1,           [rax]                ;
-        punpcklbw       xmm3,           xmm0
-        pmullw          xmm3,           [rax+16]             ;
-
-        paddw           xmm1,           xmm3                 ;
-        paddw           xmm1,           xmm4                 ;
-        psraw           xmm1,           xmm_filter_shift     ;
-        movdqa          xmm5,           xmm1
-
-        movsxd          rbx,            dword ptr arg(1) ;ref_pixels_per_line
-        lea             rsi,            [rsi + rbx]
-%if ABI_IS_32BIT=0
-        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-filter_block2d_bil_var_sse2_loop:
-        movq            xmm1,           QWORD PTR [rsi]               ;
-        movq            xmm3,           QWORD PTR [rsi+1]             ;
-
-        punpcklbw       xmm1,           xmm0                 ;
-        pmullw          xmm1,           [rax]               ;
-        punpcklbw       xmm3,           xmm0                 ;
-        pmullw          xmm3,           [rax+16]             ;
-
-        paddw           xmm1,           xmm3                 ;
-        paddw           xmm1,           xmm4               ;
-        psraw           xmm1,           xmm_filter_shift    ;
-
-        movdqa          xmm3,           xmm5                 ;
-        movdqa          xmm5,           xmm1                 ;
-
-        pmullw          xmm3,           [rdx]               ;
-        pmullw          xmm1,           [rdx+16]             ;
-        paddw           xmm1,           xmm3                 ;
-        paddw           xmm1,           xmm4                 ;
-        psraw           xmm1,           xmm_filter_shift    ;
-
-        movq            xmm3,           QWORD PTR [rdi]               ;
-        punpcklbw       xmm3,           xmm0                 ;
-
-        psubw           xmm1,           xmm3                 ;
-        paddw           xmm6,           xmm1                 ;
-
-        pmaddwd         xmm1,           xmm1                 ;
-        paddd           xmm7,           xmm1                 ;
-
-        lea             rsi,            [rsi + rbx]          ;ref_pixels_per_line
-%if ABI_IS_32BIT
-        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
-%else
-        lea             rdi,            [rdi + r9]
-%endif
-
-        sub             rcx,            1                   ;
-        jnz             filter_block2d_bil_var_sse2_loop       ;
-
-        jmp             filter_block2d_bil_variance
-
-filter_block2d_bil_var_sse2_sp_only:
-        movsxd          rdx,            dword ptr arg(6)     ; yoffset
-
-        cmp             rdx,            0                    ; skip all if both xoffset=0 and yoffset=0
-        je              filter_block2d_bil_var_sse2_full_pixel
-
-        shl             rdx,            5
-        lea             rdx,            [rdx + rcx]          ; VFilter
-
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
-
-        pxor            xmm0,           xmm0                 ;
-        movq            xmm1,           QWORD PTR [rsi]      ;
-        punpcklbw       xmm1,           xmm0                 ;
-
-        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
-        lea             rsi,            [rsi + rax]
-
-filter_block2d_bil_sp_only_loop:
-        movq            xmm3,           QWORD PTR [rsi]             ;
-        punpcklbw       xmm3,           xmm0                 ;
-        movdqa          xmm5,           xmm3
-
-        pmullw          xmm1,           [rdx]               ;
-        pmullw          xmm3,           [rdx+16]             ;
-        paddw           xmm1,           xmm3                 ;
-        paddw           xmm1,           xmm4                 ;
-        psraw           xmm1,           xmm_filter_shift    ;
-
-        movq            xmm3,           QWORD PTR [rdi]               ;
-        punpcklbw       xmm3,           xmm0                 ;
-
-        psubw           xmm1,           xmm3                 ;
-        paddw           xmm6,           xmm1                 ;
-
-        pmaddwd         xmm1,           xmm1                 ;
-        paddd           xmm7,           xmm1                 ;
-
-        movdqa          xmm1,           xmm5                 ;
-        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
-        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
-
-        sub             rcx,            1                   ;
-        jnz             filter_block2d_bil_sp_only_loop       ;
-
-        jmp             filter_block2d_bil_variance
-
-filter_block2d_bil_var_sse2_full_pixel:
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
-        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
-        pxor            xmm0,           xmm0                 ;
-
-filter_block2d_bil_full_pixel_loop:
-        movq            xmm1,           QWORD PTR [rsi]               ;
-        punpcklbw       xmm1,           xmm0                 ;
-
-        movq            xmm2,           QWORD PTR [rdi]               ;
-        punpcklbw       xmm2,           xmm0                 ;
-
-        psubw           xmm1,           xmm2                 ;
-        paddw           xmm6,           xmm1                 ;
-
-        pmaddwd         xmm1,           xmm1                 ;
-        paddd           xmm7,           xmm1                 ;
-
-        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
-        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
-
-        sub             rcx,            1                   ;
-        jnz             filter_block2d_bil_full_pixel_loop       ;
-
-        jmp             filter_block2d_bil_variance
-
-filter_block2d_bil_var_sse2_fp_only:
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-        movsxd          rdx,            dword ptr arg(1)     ;ref_pixels_per_line
-
-        pxor            xmm0,           xmm0                 ;
-        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
-
-filter_block2d_bil_fp_only_loop:
-        movq            xmm1,           QWORD PTR [rsi]       ;
-        movq            xmm3,           QWORD PTR [rsi+1]     ;
-
-        punpcklbw       xmm1,           xmm0                 ;
-        pmullw          xmm1,           [rax]               ;
-        punpcklbw       xmm3,           xmm0                 ;
-        pmullw          xmm3,           [rax+16]             ;
-
-        paddw           xmm1,           xmm3                 ;
-        paddw           xmm1,           xmm4  ;
-        psraw           xmm1,           xmm_filter_shift    ;
-
-        movq            xmm3,           QWORD PTR [rdi]     ;
-        punpcklbw       xmm3,           xmm0                 ;
-
-        psubw           xmm1,           xmm3                 ;
-        paddw           xmm6,           xmm1                 ;
-
-        pmaddwd         xmm1,           xmm1                 ;
-        paddd           xmm7,           xmm1                 ;
-        lea             rsi,            [rsi + rdx]
-        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
-
-        sub             rcx,            1                   ;
-        jnz             filter_block2d_bil_fp_only_loop       ;
-
-        jmp             filter_block2d_bil_variance
-
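-; shared tail: every path above jumps here to fold the diff and
-; squared-diff accumulators into the sum / sumsquared outputs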
-filter_block2d_bil_variance:
-        movdq2q         mm6,            xmm6                ;
-        movdq2q         mm7,            xmm7                ;
-
-        psrldq          xmm6,           8
-        psrldq          xmm7,           8
-
-        movdq2q         mm2,            xmm6
-        movdq2q         mm3,            xmm7
-
-        paddw           mm6,            mm2
-        paddd           mm7,            mm3
-
-        pxor            mm3,            mm3                 ;
-        pxor            mm2,            mm2                 ;
-
-        punpcklwd       mm2,            mm6                 ;
-        punpckhwd       mm3,            mm6                 ;
-
-        paddd           mm2,            mm3                 ;
-        movq            mm6,            mm2                 ;
-
-        psrlq           mm6,            32                  ;
-        paddd           mm2,            mm6                 ;
-
-        psrad           mm2,            16                  ;
-        movq            mm4,            mm7                 ;
-
-        psrlq           mm4,            32                  ;
-        paddd           mm4,            mm7                 ;
-
-        mov             rsi,            arg(7) ; sum
-        mov             rdi,            arg(8) ; sumsquared
-
-        movd            [rsi],          mm2    ; xsum
-        movd            [rdi],          mm4    ; xxsum
-
-    ; begin epilog
-    pop rbx
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_half_horiz_vert_variance8x_h_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp9_half_horiz_vert_variance8x_h_sse2)
-sym(vp9_half_horiz_vert_variance8x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-%if ABI_IS_32BIT=0
-    movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
-    movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse accumulator
-        mov             rsi,            arg(0) ;ref_ptr              ;
-
-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
-        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
-
-        pxor            xmm0,           xmm0                ;
-
-        movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s8
-        movq            xmm3,           QWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s9
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3) horizontal line 1
-
-%if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
-%else
-        add             rsi, r8
-%endif
-
-.half_horiz_vert_variance8x_h_1:
-
-        movq            xmm1,           QWORD PTR [rsi]     ;
-        movq            xmm2,           QWORD PTR [rsi+1]   ;
-        pavgb           xmm1,           xmm2                ;  xmm1 = avg(xmm1,xmm2) horizontal line i+1
-
-        pavgb           xmm5,           xmm1                ;  xmm5 = vertical average of the above
-        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
-
-        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
-        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
-
-        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-
-        movdqa          xmm5,           xmm1                ;  save xmm1 for use on the next row
-
-%if ABI_IS_32BIT
-        add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
-        add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next destination
-%else
-        add             rsi, r8
-        add             rdi, r9
-%endif
-
-        sub             rcx,            1                   ;
-        jnz             .half_horiz_vert_variance8x_h_1     ;
-
-        movdq2q         mm6,            xmm6                ;
-        movdq2q         mm7,            xmm7                ;
-
-        psrldq          xmm6,           8
-        psrldq          xmm7,           8
-
-        movdq2q         mm2,            xmm6
-        movdq2q         mm3,            xmm7
-
-        paddw           mm6,            mm2
-        paddd           mm7,            mm3
-
-        pxor            mm3,            mm3                 ;
-        pxor            mm2,            mm2                 ;
-
-        punpcklwd       mm2,            mm6                 ;
-        punpckhwd       mm3,            mm6                 ;
-
-        paddd           mm2,            mm3                 ;
-        movq            mm6,            mm2                 ;
-
-        psrlq           mm6,            32                  ;
-        paddd           mm2,            mm6                 ;
-
-        psrad           mm2,            16                  ;
-        movq            mm4,            mm7                 ;
-
-        psrlq           mm4,            32                  ;
-        paddd           mm4,            mm7                 ;
-
-        mov             rsi,            arg(5) ; sum
-        mov             rdi,            arg(6) ; sumsquared
-
-        movd            [rsi],          mm2                 ;
-        movd            [rdi],          mm4                 ;
-
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
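All of the half-pel kernels in this file are built on pavgb, which computes
(a + b + 1) >> 1. A scalar sketch of the (1/2, 1/2) case handled above, with
hypothetical names; note that the cascaded pavgb rounding differs slightly
from an exact bilinear filter:

#include <stdint.h>

static uint8_t avg_rnd(uint8_t a, uint8_t b) {
  return (uint8_t)((a + b + 1) >> 1);   /* pavgb rounding */
}

static void half_horiz_vert_variance_c(const uint8_t *ref, int ref_stride,
                                       const uint8_t *src, int src_stride,
                                       int width, int height,
                                       int *sum, unsigned int *sse) {
  int s = 0;
  unsigned int q = 0;
  for (int i = 0; i < height; i++) {
    for (int j = 0; j < width; j++) {
      uint8_t top = avg_rnd(ref[j], ref[j + 1]);                /* row i   */
      uint8_t bot = avg_rnd(ref[ref_stride + j],
                            ref[ref_stride + j + 1]);           /* row i+1 */
      int diff = avg_rnd(top, bot) - src[j];
      s += diff;
      q += (unsigned int)(diff * diff);
    }
    ref += ref_stride;
    src += src_stride;
  }
  *sum = s;
  *sse = q;
}
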
-;void vp9_half_horiz_vert_variance16x_h_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp9_half_horiz_vert_variance16x_h_sse2)
-sym(vp9_half_horiz_vert_variance16x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse accumulator
-        mov             rsi,            arg(0) ;ref_ptr              ;
-
-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
-        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
-        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
-
-        pxor            xmm0,           xmm0                ;
-
-        movdqu          xmm5,           XMMWORD PTR [rsi]
-        movdqu          xmm3,           XMMWORD PTR [rsi+1]
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3) horizontal line 1
-
-        lea             rsi,            [rsi + rax]
-
-.half_horiz_vert_variance16x_h_1:
-        movdqu          xmm1,           XMMWORD PTR [rsi]     ;
-        movdqu          xmm2,           XMMWORD PTR [rsi+1]   ;
-        pavgb           xmm1,           xmm2                ;  xmm1 = avg(xmm1,xmm2) horizontal line i+1
-
-        pavgb           xmm5,           xmm1                ;  xmm5 = vertical average of the above
-
-        movdqa          xmm4,           xmm5
-        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
-        punpckhbw       xmm4,           xmm0
-
-        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
-        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
-        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
-
-        movq            xmm3,           QWORD PTR [rdi+8]
-        punpcklbw       xmm3,           xmm0
-        psubw           xmm4,           xmm3
-
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        paddw           xmm6,           xmm4
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        pmaddwd         xmm4,           xmm4
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-        paddd           xmm7,           xmm4
-
-        movdqa          xmm5,           xmm1                ;  save xmm1 for use on the next row
-
-        lea             rsi,            [rsi + rax]
-        lea             rdi,            [rdi + rdx]
-
-        sub             rcx,            1                   ;
-        jnz             .half_horiz_vert_variance16x_h_1    ;
-
-        pxor        xmm1,           xmm1
-        pxor        xmm5,           xmm5
-
-        punpcklwd   xmm0,           xmm6
-        punpckhwd   xmm1,           xmm6
-        psrad       xmm0,           16
-        psrad       xmm1,           16
-        paddd       xmm0,           xmm1
-        movdqa      xmm1,           xmm0
-
-        movdqa      xmm6,           xmm7
-        punpckldq   xmm6,           xmm5
-        punpckhdq   xmm7,           xmm5
-        paddd       xmm6,           xmm7
-
-        punpckldq   xmm0,           xmm5
-        punpckhdq   xmm1,           xmm5
-        paddd       xmm0,           xmm1
-
-        movdqa      xmm7,           xmm6
-        movdqa      xmm1,           xmm0
-
-        psrldq      xmm7,           8
-        psrldq      xmm1,           8
-
-        paddd       xmm6,           xmm7
-        paddd       xmm0,           xmm1
-
-        mov         rsi,            arg(5) ;[Sum]
-        mov         rdi,            arg(6) ;[SSE]
-
-        movd        [rsi],       xmm0
-        movd        [rdi],       xmm6
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_half_vert_variance8x_h_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp9_half_vert_variance8x_h_sse2)
-sym(vp9_half_vert_variance8x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-%if ABI_IS_32BIT=0
-    movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
-    movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse accumulator
-        mov             rsi,            arg(0) ;ref_ptr              ;
-
-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
-        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
-
-        pxor            xmm0,           xmm0                ;
-.half_vert_variance8x_h_1:
-        movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s7 of row i
-        movq            xmm3,           QWORD PTR [rsi+rax] ;  xmm3 = s0,s1,s2..s7 of row i+1
-
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3)
-        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
-
-        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
-        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
-
-        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-
-%if ABI_IS_32BIT
-        add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
-        add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next destination
-%else
-        add             rsi, r8
-        add             rdi, r9
-%endif
-
-        sub             rcx,            1                   ;
-        jnz             .half_vert_variance8x_h_1          ;
-
-        movdq2q         mm6,            xmm6                ;
-        movdq2q         mm7,            xmm7                ;
-
-        psrldq          xmm6,           8
-        psrldq          xmm7,           8
-
-        movdq2q         mm2,            xmm6
-        movdq2q         mm3,            xmm7
-
-        paddw           mm6,            mm2
-        paddd           mm7,            mm3
-
-        pxor            mm3,            mm3                 ;
-        pxor            mm2,            mm2                 ;
-
-        punpcklwd       mm2,            mm6                 ;
-        punpckhwd       mm3,            mm6                 ;
-
-        paddd           mm2,            mm3                 ;
-        movq            mm6,            mm2                 ;
-
-        psrlq           mm6,            32                  ;
-        paddd           mm2,            mm6                 ;
-
-        psrad           mm2,            16                  ;
-        movq            mm4,            mm7                 ;
-
-        psrlq           mm4,            32                  ;
-        paddd           mm4,            mm7                 ;
-
-        mov             rsi,            arg(5) ; sum
-        mov             rdi,            arg(6) ; sumsquared
-
-        movd            [rsi],          mm2                 ;
-        movd            [rdi],          mm4                 ;
-
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_half_vert_variance16x_h_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp9_half_vert_variance16x_h_sse2)
-sym(vp9_half_vert_variance16x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse accumulator
-        mov             rsi,            arg(0)              ;ref_ptr
-
-        mov             rdi,            arg(2)              ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)    ;Height
-        movsxd          rax,            dword ptr arg(1)    ;ref_pixels_per_line
-        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
-
-        movdqu          xmm5,           XMMWORD PTR [rsi]
-        lea             rsi,            [rsi + rax]
-        pxor            xmm0,           xmm0
-
-.half_vert_variance16x_h_1:
-        movdqu          xmm3,           XMMWORD PTR [rsi]
-
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3)
-        movdqa          xmm4,           xmm5
-        punpcklbw       xmm5,           xmm0
-        punpckhbw       xmm4,           xmm0
-
-        movq            xmm2,           QWORD PTR [rdi]
-        punpcklbw       xmm2,           xmm0
-        psubw           xmm5,           xmm2
-        movq            xmm2,           QWORD PTR [rdi+8]
-        punpcklbw       xmm2,           xmm0
-        psubw           xmm4,           xmm2
-
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        paddw           xmm6,           xmm4
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        pmaddwd         xmm4,           xmm4
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-        paddd           xmm7,           xmm4
-
-        movdqa          xmm5,           xmm3
-
-        lea             rsi,            [rsi + rax]
-        lea             rdi,            [rdi + rdx]
-
-        sub             rcx,            1
-        jnz             .half_vert_variance16x_h_1
-
-        pxor        xmm1,           xmm1
-        pxor        xmm5,           xmm5
-
-        punpcklwd   xmm0,           xmm6
-        punpckhwd   xmm1,           xmm6
-        psrad       xmm0,           16
-        psrad       xmm1,           16
-        paddd       xmm0,           xmm1
-        movdqa      xmm1,           xmm0
-
-        movdqa      xmm6,           xmm7
-        punpckldq   xmm6,           xmm5
-        punpckhdq   xmm7,           xmm5
-        paddd       xmm6,           xmm7
-
-        punpckldq   xmm0,           xmm5
-        punpckhdq   xmm1,           xmm5
-        paddd       xmm0,           xmm1
-
-        movdqa      xmm7,           xmm6
-        movdqa      xmm1,           xmm0
-
-        psrldq      xmm7,           8
-        psrldq      xmm1,           8
-
-        paddd       xmm6,           xmm7
-        paddd       xmm0,           xmm1
-
-        mov         rsi,            arg(5) ;[Sum]
-        mov         rdi,            arg(6) ;[SSE]
-
-        movd        [rsi],       xmm0
-        movd        [rdi],       xmm6
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_half_horiz_variance8x_h_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp9_half_horiz_variance8x_h_sse2)
-sym(vp9_half_horiz_variance8x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-%if ABI_IS_32BIT=0
-    movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
-    movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse accumulator
-        mov             rsi,            arg(0) ;ref_ptr              ;
-
-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
-
-        pxor            xmm0,           xmm0                ;
-.half_horiz_variance8x_h_1:
-        movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s7
-        movq            xmm3,           QWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s8
-
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3)
-        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
-
-        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
-        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
-
-        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-
-%if ABI_IS_32BIT
-        add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
-        add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next destination
-%else
-        add             rsi, r8
-        add             rdi, r9
-%endif
-        sub             rcx,            1                   ;
-        jnz             .half_horiz_variance8x_h_1          ;
-
-        movdq2q         mm6,            xmm6                ;
-        movdq2q         mm7,            xmm7                ;
-
-        psrldq          xmm6,           8
-        psrldq          xmm7,           8
-
-        movdq2q         mm2,            xmm6
-        movdq2q         mm3,            xmm7
-
-        paddw           mm6,            mm2
-        paddd           mm7,            mm3
-
-        pxor            mm3,            mm3                 ;
-        pxor            mm2,            mm2                 ;
-
-        punpcklwd       mm2,            mm6                 ;
-        punpckhwd       mm3,            mm6                 ;
-
-        paddd           mm2,            mm3                 ;
-        movq            mm6,            mm2                 ;
-
-        psrlq           mm6,            32                  ;
-        paddd           mm2,            mm6                 ;
-
-        psrad           mm2,            16                  ;
-        movq            mm4,            mm7                 ;
-
-        psrlq           mm4,            32                  ;
-        paddd           mm4,            mm7                 ;
-
-        mov             rsi,            arg(5) ; sum
-        mov             rdi,            arg(6) ; sumsquared
-
-        movd            [rsi],          mm2                 ;
-        movd            [rdi],          mm4                 ;
-
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_half_horiz_variance16x_h_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp9_half_horiz_variance16x_h_sse2)
-sym(vp9_half_horiz_variance16x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse accumulator
-        mov             rsi,            arg(0) ;ref_ptr              ;
-
-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
-        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
-        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
-
-        pxor            xmm0,           xmm0                ;
-
-.half_horiz_variance16x_h_1:
-        movdqu          xmm5,           XMMWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s15
-        movdqu          xmm3,           XMMWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s16
-
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3)
-        movdqa          xmm1,           xmm5
-        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
-        punpckhbw       xmm1,           xmm0
-
-        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
-        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
-        movq            xmm2,           QWORD PTR [rdi+8]
-        punpcklbw       xmm2,           xmm0
-
-        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
-        psubw           xmm1,           xmm2
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        paddw           xmm6,           xmm1
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        pmaddwd         xmm1,           xmm1
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-        paddd           xmm7,           xmm1
-
-        lea             rsi,            [rsi + rax]
-        lea             rdi,            [rdi + rdx]
-
-        sub             rcx,            1                   ;
-        jnz             .half_horiz_variance16x_h_1         ;
-
-        pxor        xmm1,           xmm1
-        pxor        xmm5,           xmm5
-
-        punpcklwd   xmm0,           xmm6
-        punpckhwd   xmm1,           xmm6
-        psrad       xmm0,           16
-        psrad       xmm1,           16
-        paddd       xmm0,           xmm1
-        movdqa      xmm1,           xmm0
-
-        movdqa      xmm6,           xmm7
-        punpckldq   xmm6,           xmm5
-        punpckhdq   xmm7,           xmm5
-        paddd       xmm6,           xmm7
-
-        punpckldq   xmm0,           xmm5
-        punpckhdq   xmm1,           xmm5
-        paddd       xmm0,           xmm1
-
-        movdqa      xmm7,           xmm6
-        movdqa      xmm1,           xmm0
-
-        psrldq      xmm7,           8
-        psrldq      xmm1,           8
-
-        paddd       xmm6,           xmm7
-        paddd       xmm0,           xmm1
-
-        mov         rsi,            arg(5) ;[Sum]
-        mov         rdi,            arg(6) ;[SSE]
-
-        movd        [rsi],       xmm0
-        movd        [rdi],       xmm6
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-;    short xmm_bi_rd[8] = { 64, 64, 64, 64, 64, 64, 64, 64 };
-align 16
-xmm_bi_rd:
-    times 8 dw 64
-align 16
-bilinear_filters_sse2:
-    dw 128, 128, 128, 128, 128, 128, 128, 128,  0,  0,  0,  0,  0,  0,  0,  0
-    dw 120, 120, 120, 120, 120, 120, 120, 120,  8,  8,  8,  8,  8,  8,  8,  8
-    dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
-    dw 104, 104, 104, 104, 104, 104, 104, 104, 24, 24, 24, 24, 24, 24, 24, 24
-    dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
-    dw 88, 88, 88, 88, 88, 88, 88, 88, 40, 40, 40, 40, 40, 40, 40, 40
-    dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
-    dw 72, 72, 72, 72, 72, 72, 72, 72, 56, 56, 56, 56, 56, 56, 56, 56
-    dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
-    dw 56, 56, 56, 56, 56, 56, 56, 56, 72, 72, 72, 72, 72, 72, 72, 72
-    dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
-    dw 40, 40, 40, 40, 40, 40, 40, 40, 88, 88, 88, 88, 88, 88, 88, 88
-    dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
-    dw 24, 24, 24, 24, 24, 24, 24, 24, 104, 104, 104, 104, 104, 104, 104, 104
-    dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
-    dw 8, 8, 8, 8, 8, 8, 8, 8, 120, 120, 120, 120, 120, 120, 120, 120
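Each row of bilinear_filters_sse2 is one tap pair (128 - 8*i, 8*i) with every
tap replicated eight times, so one multiply-add pass covers eight pixels;
xmm_bi_rd is the rounding constant for the >> 7 normalization. A scalar sketch
of a single tap, with illustrative names:

#include <stdint.h>

#define BIL_SHIFT 7     /* matches xmm_filter_shift */
#define BIL_ROUND 64    /* matches xmm_bi_rd: 1 << (BIL_SHIFT - 1) */

static uint8_t bilinear_tap(uint8_t a, uint8_t b, int offset /* 0..15 */) {
  int w1 = 8 * offset;          /* second tap                           */
  int w0 = 128 - w1;            /* first tap; pair sums to 128 = 1 << 7 */
  return (uint8_t)((a * w0 + b * w1 + BIL_ROUND) >> BIL_SHIFT);
}
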
--- a/vp8/encoder/x86/variance_impl_ssse3.asm
+++ /dev/null
@@ -1,372 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define xmm_filter_shift            7
-
-
-;void vp9_filter_block2d_bil_var_ssse3
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int  xoffset,
-;    int  yoffset,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-;Note: The filter coefficient at offset=0 is 128. Since pmaddubsw treats its
-;second operand as signed bytes, the zero offset must be handled separately.
-global sym(vp9_filter_block2d_bil_var_ssse3)
-sym(vp9_filter_block2d_bil_var_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 9
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        pxor            xmm6,           xmm6
-        pxor            xmm7,           xmm7
-
-        lea             rcx,            [GLOBAL(bilinear_filters_ssse3)]
-        movsxd          rax,            dword ptr arg(5)     ; xoffset
-
-        cmp             rax,            0                    ; skip first_pass filter if xoffset=0
-        je              .filter_block2d_bil_var_ssse3_sp_only
-
-        shl             rax,            4                    ; point to filter coeff with xoffset
-        lea             rax,            [rax + rcx]          ; HFilter
-
-        movsxd          rdx,            dword ptr arg(6)     ; yoffset
-
-        cmp             rdx,            0                    ; skip second_pass filter if yoffset=0
-        je              .filter_block2d_bil_var_ssse3_fp_only
-
-        shl             rdx,            4
-        lea             rdx,            [rdx + rcx]          ; VFilter
-
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-
-        movdqu          xmm0,           XMMWORD PTR [rsi]
-        movdqu          xmm1,           XMMWORD PTR [rsi+1]
-        movdqa          xmm2,           xmm0
-
-        punpcklbw       xmm0,           xmm1
-        punpckhbw       xmm2,           xmm1
-        pmaddubsw       xmm0,           [rax]
-        pmaddubsw       xmm2,           [rax]
-
-        paddw           xmm0,           [GLOBAL(xmm_bi_rd)]
-        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
-        psraw           xmm0,           xmm_filter_shift
-        psraw           xmm2,           xmm_filter_shift
-
-        packuswb        xmm0,           xmm2
-
-%if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
-%else
-        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
-        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
-        lea             rsi,            [rsi + r8]
-%endif
-
-.filter_block2d_bil_var_ssse3_loop:
-        movdqu          xmm1,           XMMWORD PTR [rsi]
-        movdqu          xmm2,           XMMWORD PTR [rsi+1]
-        movdqa          xmm3,           xmm1
-
-        punpcklbw       xmm1,           xmm2
-        punpckhbw       xmm3,           xmm2
-        pmaddubsw       xmm1,           [rax]
-        pmaddubsw       xmm3,           [rax]
-
-        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
-        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
-        psraw           xmm1,           xmm_filter_shift
-        psraw           xmm3,           xmm_filter_shift
-        packuswb        xmm1,           xmm3
-
-        movdqa          xmm2,           xmm0
-        movdqa          xmm0,           xmm1
-        movdqa          xmm3,           xmm2
-
-        punpcklbw       xmm2,           xmm1
-        punpckhbw       xmm3,           xmm1
-        pmaddubsw       xmm2,           [rdx]
-        pmaddubsw       xmm3,           [rdx]
-
-        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
-        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
-        psraw           xmm2,           xmm_filter_shift
-        psraw           xmm3,           xmm_filter_shift
-
-        movq            xmm1,           QWORD PTR [rdi]
-        pxor            xmm4,           xmm4
-        punpcklbw       xmm1,           xmm4
-        movq            xmm5,           QWORD PTR [rdi+8]
-        punpcklbw       xmm5,           xmm4
-
-        psubw           xmm2,           xmm1
-        psubw           xmm3,           xmm5
-        paddw           xmm6,           xmm2
-        paddw           xmm6,           xmm3
-        pmaddwd         xmm2,           xmm2
-        pmaddwd         xmm3,           xmm3
-        paddd           xmm7,           xmm2
-        paddd           xmm7,           xmm3
-
-%if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1)     ;ref_pixels_per_line
-        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
-%else
-        lea             rsi,            [rsi + r8]
-        lea             rdi,            [rdi + r9]
-%endif
-
-        sub             rcx,            1
-        jnz             .filter_block2d_bil_var_ssse3_loop
-
-        jmp             .filter_block2d_bil_variance
-
-.filter_block2d_bil_var_ssse3_sp_only:
-        movsxd          rdx,            dword ptr arg(6)     ; yoffset
-
-        cmp             rdx,            0                    ; Both xoffset =0 and yoffset=0
-        je              .filter_block2d_bil_var_ssse3_full_pixel
-
-        shl             rdx,            4
-        lea             rdx,            [rdx + rcx]          ; VFilter
-
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
-
-        movdqu          xmm1,           XMMWORD PTR [rsi]
-        movdqa          xmm0,           xmm1
-
-%if ABI_IS_32BIT=0
-        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-        lea             rsi,            [rsi + rax]
-
-.filter_block2d_bil_sp_only_loop:
-        movdqu          xmm3,           XMMWORD PTR [rsi]
-        movdqa          xmm2,           xmm1
-        movdqa          xmm0,           xmm3
-
-        punpcklbw       xmm1,           xmm3
-        punpckhbw       xmm2,           xmm3
-        pmaddubsw       xmm1,           [rdx]
-        pmaddubsw       xmm2,           [rdx]
-
-        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
-        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
-        psraw           xmm1,           xmm_filter_shift
-        psraw           xmm2,           xmm_filter_shift
-
-        movq            xmm3,           QWORD PTR [rdi]
-        pxor            xmm4,           xmm4
-        punpcklbw       xmm3,           xmm4
-        movq            xmm5,           QWORD PTR [rdi+8]
-        punpcklbw       xmm5,           xmm4
-
-        psubw           xmm1,           xmm3
-        psubw           xmm2,           xmm5
-        paddw           xmm6,           xmm1
-        paddw           xmm6,           xmm2
-        pmaddwd         xmm1,           xmm1
-        pmaddwd         xmm2,           xmm2
-        paddd           xmm7,           xmm1
-        paddd           xmm7,           xmm2
-
-        movdqa          xmm1,           xmm0
-        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
-
-%if ABI_IS_32BIT
-        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
-%else
-        lea             rdi,            [rdi + r9]
-%endif
-
-        sub             rcx,            1
-        jnz             .filter_block2d_bil_sp_only_loop
-
-        jmp             .filter_block2d_bil_variance
-
-.filter_block2d_bil_var_ssse3_full_pixel:
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
-        movsxd          rdx,            dword ptr arg(3)     ;src_pixels_per_line
-        pxor            xmm0,           xmm0
-
-.filter_block2d_bil_full_pixel_loop:
-        movq            xmm1,           QWORD PTR [rsi]
-        punpcklbw       xmm1,           xmm0
-        movq            xmm2,           QWORD PTR [rsi+8]
-        punpcklbw       xmm2,           xmm0
-
-        movq            xmm3,           QWORD PTR [rdi]
-        punpcklbw       xmm3,           xmm0
-        movq            xmm4,           QWORD PTR [rdi+8]
-        punpcklbw       xmm4,           xmm0
-
-        psubw           xmm1,           xmm3
-        psubw           xmm2,           xmm4
-        paddw           xmm6,           xmm1
-        paddw           xmm6,           xmm2
-        pmaddwd         xmm1,           xmm1
-        pmaddwd         xmm2,           xmm2
-        paddd           xmm7,           xmm1
-        paddd           xmm7,           xmm2
-
-        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
-        lea             rdi,            [rdi + rdx]          ;src_pixels_per_line
-        sub             rcx,            1
-        jnz             .filter_block2d_bil_full_pixel_loop
-
-        jmp             .filter_block2d_bil_variance
-
-.filter_block2d_bil_var_ssse3_fp_only:
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-        movsxd          rdx,            dword ptr arg(1)     ;ref_pixels_per_line
-
-        pxor            xmm0,           xmm0
-
-%if ABI_IS_32BIT=0
-        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-.filter_block2d_bil_fp_only_loop:
-        movdqu          xmm1,           XMMWORD PTR [rsi]
-        movdqu          xmm2,           XMMWORD PTR [rsi+1]
-        movdqa          xmm3,           xmm1
-
-        punpcklbw       xmm1,           xmm2
-        punpckhbw       xmm3,           xmm2
-        pmaddubsw       xmm1,           [rax]
-        pmaddubsw       xmm3,           [rax]
-
-        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
-        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
-        psraw           xmm1,           xmm_filter_shift
-        psraw           xmm3,           xmm_filter_shift
-
-        movq            xmm2,           QWORD PTR [rdi]
-        pxor            xmm4,           xmm4
-        punpcklbw       xmm2,           xmm4
-        movq            xmm5,           QWORD PTR [rdi+8]
-        punpcklbw       xmm5,           xmm4
-
-        psubw           xmm1,           xmm2
-        psubw           xmm3,           xmm5
-        paddw           xmm6,           xmm1
-        paddw           xmm6,           xmm3
-        pmaddwd         xmm1,           xmm1
-        pmaddwd         xmm3,           xmm3
-        paddd           xmm7,           xmm1
-        paddd           xmm7,           xmm3
-
-        lea             rsi,            [rsi + rdx]
-%if ABI_IS_32BIT
-        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
-%else
-        lea             rdi,            [rdi + r9]
-%endif
-
-        sub             rcx,            1
-        jnz             .filter_block2d_bil_fp_only_loop
-
-        jmp             .filter_block2d_bil_variance
-
-.filter_block2d_bil_variance:
-        pxor        xmm0,           xmm0
-        pxor        xmm1,           xmm1
-        pxor        xmm5,           xmm5
-
-        punpcklwd   xmm0,           xmm6
-        punpckhwd   xmm1,           xmm6
-        psrad       xmm0,           16
-        psrad       xmm1,           16
-        paddd       xmm0,           xmm1
-        movdqa      xmm1,           xmm0
-
-        movdqa      xmm6,           xmm7
-        punpckldq   xmm6,           xmm5
-        punpckhdq   xmm7,           xmm5
-        paddd       xmm6,           xmm7
-
-        punpckldq   xmm0,           xmm5
-        punpckhdq   xmm1,           xmm5
-        paddd       xmm0,           xmm1
-
-        movdqa      xmm7,           xmm6
-        movdqa      xmm1,           xmm0
-
-        psrldq      xmm7,           8
-        psrldq      xmm1,           8
-
-        paddd       xmm6,           xmm7
-        paddd       xmm0,           xmm1
-
-        mov         rsi,            arg(7) ;[Sum]
-        mov         rdi,            arg(8) ;[SSE]
-
-        movd        [rsi],       xmm0
-        movd        [rdi],       xmm6
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-SECTION_RODATA
-align 16
-xmm_bi_rd:
-    times 8 dw 64
-align 16
-bilinear_filters_ssse3:
-    times 8 db 128, 0
-    times 8 db 120, 8
-    times 8 db 112, 16
-    times 8 db 104, 24
-    times 8 db  96, 32
-    times 8 db  88, 40
-    times 8 db  80, 48
-    times 8 db  72, 56
-    times 8 db  64, 64
-    times 8 db  56, 72
-    times 8 db  48, 80
-    times 8 db  40, 88
-    times 8 db  32, 96
-    times 8 db  24, 104
-    times 8 db  16, 112
-    times 8 db   8, 120
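The SSSE3 table interleaves each tap pair as bytes because pmaddubsw multiplies
an unsigned-byte operand against a signed-byte operand. That is the constraint
the header note refers to: the offset-0 pair (128, 0) cannot be encoded, since
128 exceeds the signed-byte maximum of 127, so that case takes the full-pixel
path instead. A minimal illustration with a hypothetical helper:

/* 128 > 127, so offset 0 fails this check and the code branches to
   .filter_block2d_bil_var_ssse3_full_pixel above. */
static int fits_pmaddubsw_tap(int w) {
  return w >= -128 && w <= 127;
}
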
--- a/vp8/encoder/x86/variance_mmx.c
+++ /dev/null
@@ -1,406 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "vp8/encoder/variance.h"
-#include "vp8/common/pragmas.h"
-#include "vpx_ports/mem.h"
-
-extern void filter_block1d_h6_mmx
-(
-  const unsigned char *src_ptr,
-  unsigned short *output_ptr,
-  unsigned int src_pixels_per_line,
-  unsigned int pixel_step,
-  unsigned int output_height,
-  unsigned int output_width,
-  short *vp7_filter
-);
-extern void filter_block1d_v6_mmx
-(
-  const short *src_ptr,
-  unsigned char *output_ptr,
-  unsigned int pixels_per_line,
-  unsigned int pixel_step,
-  unsigned int output_height,
-  unsigned int output_width,
-  short *vp7_filter
-);
-
-extern unsigned int vp9_get_mb_ss_mmx(const short *src_ptr);
-extern unsigned int vp9_get8x8var_mmx
-(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *SSE,
-  int *Sum
-);
-extern unsigned int vp9_get4x4var_mmx
-(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *SSE,
-  int *Sum
-);
-extern void vp9_filter_block2d_bil4x4_var_mmx
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  const short *HFilter,
-  const short *VFilter,
-  int *sum,
-  unsigned int *sumsquared
-);
-extern void vp9_filter_block2d_bil_var_mmx
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  const short *HFilter,
-  const short *VFilter,
-  int *sum,
-  unsigned int *sumsquared
-);
-
-
-unsigned int vp9_variance4x4_mmx(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  vp9_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
-  *sse = var;
-  return (var - ((avg * avg) >> 4));
-
-}
-
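The return expression here, and in the wrappers that follow, is the one-pass
variance identity Var = SSE - Sum*Sum/N, with N the pixel count of the block,
so the shift is log2(N): 4 for 4x4, 6 for 8x8, 7 for 16x8 and 8x16, 8 for
16x16. A sketch of the shared helper this could be, with a hypothetical name:

/* sum((x - mean)^2) == sum(x*x) - sum(x)*sum(x) / N, N a power of two. */
static unsigned int variance_from_sums(unsigned int sse, int sum,
                                       int log2_count) {
  return sse - (unsigned int)((sum * sum) >> log2_count);
}
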
-unsigned int vp9_variance8x8_mmx(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
-  *sse = var;
-
-  return (var - ((avg * avg) >> 6));
-
-}
-
-unsigned int vp9_mse16x16_mmx(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  unsigned int sse0, sse1, sse2, sse3, var;
-  int sum0, sum1, sum2, sum3;
-
-
-  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
-  vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
-  vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
-  vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
-
-  var = sse0 + sse1 + sse2 + sse3;
-  *sse = var;
-  return var;
-}
-
-
-unsigned int vp9_variance16x16_mmx(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  unsigned int sse0, sse1, sse2, sse3, var;
-  int sum0, sum1, sum2, sum3, avg;
-
-
-  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
-  vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
-  vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
-  vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
-
-  var = sse0 + sse1 + sse2 + sse3;
-  avg = sum0 + sum1 + sum2 + sum3;
-  *sse = var;
-  return (var - ((avg * avg) >> 8));
-}
-
-unsigned int vp9_variance16x8_mmx(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  unsigned int sse0, sse1, var;
-  int sum0, sum1, avg;
-
-  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
-  vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
-
-  var = sse0 + sse1;
-  avg = sum0 + sum1;
-  *sse = var;
-  return (var - ((avg * avg) >> 7));
-
-}
-
-
-unsigned int vp9_variance8x16_mmx(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  unsigned int sse0, sse1, var;
-  int sum0, sum1, avg;
-
-  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
-  vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1);
-
-  var = sse0 + sse1;
-  avg = sum0 + sum1;
-  *sse = var;
-
-  return (var - ((avg * avg) >> 7));
-
-}
-
-
-
-
-///////////////////////////////////////////////////////////////////////////
-// the mmx function that does the bilinear filtering and var calculation //
-// in one pass                                                           //
-///////////////////////////////////////////////////////////////////////////
-DECLARE_ALIGNED(16, const short, vp9_bilinear_filters_mmx[16][8]) = {
-  { 128, 128, 128, 128,  0,  0,  0,  0 },
-  { 120, 120, 120, 120,  8,  8,  8,  8 },
-  { 112, 112, 112, 112, 16, 16, 16, 16 },
-  { 104, 104, 104, 104, 24, 24, 24, 24 },
-  {  96, 96, 96, 96, 32, 32, 32, 32 },
-  {  88, 88, 88, 88, 40, 40, 40, 40 },
-  {  80, 80, 80, 80, 48, 48, 48, 48 },
-  {  72, 72, 72, 72, 56, 56, 56, 56 },
-  {  64, 64, 64, 64, 64, 64, 64, 64 },
-  {  56, 56, 56, 56, 72, 72, 72, 72 },
-  {  48, 48, 48, 48, 80, 80, 80, 80 },
-  {  40, 40, 40, 40, 88, 88, 88, 88 },
-  {  32, 32, 32, 32, 96, 96, 96, 96 },
-  {  24, 24, 24, 24, 104, 104, 104, 104 },
-  {  16, 16, 16, 16, 112, 112, 112, 112 },
-  {   8,  8,  8,  8, 120, 120, 120, 120 }
-};
-
-unsigned int vp9_sub_pixel_variance4x4_mmx
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-) {
-  int xsum;
-  unsigned int xxsum;
-  vp9_filter_block2d_bil4x4_var_mmx(
-    src_ptr, src_pixels_per_line,
-    dst_ptr, dst_pixels_per_line,
-    vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
-    &xsum, &xxsum
-  );
-  *sse = xxsum;
-  return (xxsum - ((xsum * xsum) >> 4));
-}
-
-
-unsigned int vp9_sub_pixel_variance8x8_mmx
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-) {
-
-  int xsum;
-  unsigned int xxsum;
-  vp9_filter_block2d_bil_var_mmx(
-    src_ptr, src_pixels_per_line,
-    dst_ptr, dst_pixels_per_line, 8,
-    vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
-    &xsum, &xxsum
-  );
-  *sse = xxsum;
-  return (xxsum - ((xsum * xsum) >> 6));
-}
-
-unsigned int vp9_sub_pixel_variance16x16_mmx
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-) {
-
-  int xsum0, xsum1;
-  unsigned int xxsum0, xxsum1;
-
-  vp9_filter_block2d_bil_var_mmx(
-    src_ptr, src_pixels_per_line,
-    dst_ptr, dst_pixels_per_line, 16,
-    vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
-    &xsum0, &xxsum0
-  );
-
-  vp9_filter_block2d_bil_var_mmx(
-    src_ptr + 8, src_pixels_per_line,
-    dst_ptr + 8, dst_pixels_per_line, 16,
-    vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
-    &xsum1, &xxsum1
-  );
-
-  xsum0 += xsum1;
-  xxsum0 += xxsum1;
-
-  *sse = xxsum0;
-  return (xxsum0 - ((xsum0 * xsum0) >> 8));
-
-
-}
-
-unsigned int vp9_sub_pixel_mse16x16_mmx(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-) {
-  vp9_sub_pixel_variance16x16_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
-  return *sse;
-}
-
-unsigned int vp9_sub_pixel_variance16x8_mmx
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-) {
-  int xsum0, xsum1;
-  unsigned int xxsum0, xxsum1;
-
-
-  vp9_filter_block2d_bil_var_mmx(
-    src_ptr, src_pixels_per_line,
-    dst_ptr, dst_pixels_per_line, 8,
-    vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
-    &xsum0, &xxsum0
-  );
-
-
-  vp9_filter_block2d_bil_var_mmx(
-    src_ptr + 8, src_pixels_per_line,
-    dst_ptr + 8, dst_pixels_per_line, 8,
-    vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
-    &xsum1, &xxsum1
-  );
-
-  xsum0 += xsum1;
-  xxsum0 += xxsum1;
-
-  *sse = xxsum0;
-  return (xxsum0 - ((xsum0 * xsum0) >> 7));
-}
-
-unsigned int vp9_sub_pixel_variance8x16_mmx
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-) {
-  int xsum;
-  unsigned int xxsum;
-  vp9_filter_block2d_bil_var_mmx(
-    src_ptr, src_pixels_per_line,
-    dst_ptr, dst_pixels_per_line, 16,
-    vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
-    &xsum, &xxsum
-  );
-  *sse = xxsum;
-  return (xxsum - ((xsum * xsum) >> 7));
-}
-
-
-unsigned int vp9_variance_halfpixvar16x16_h_mmx(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 8, 0,
-                                         ref_ptr, recon_stride, sse);
-}
-
-
-unsigned int vp9_variance_halfpixvar16x16_v_mmx(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 0, 8,
-                                         ref_ptr, recon_stride, sse);
-}
-
-
-unsigned int vp9_variance_halfpixvar16x16_hv_mmx(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 8, 8,
-                                         ref_ptr, recon_stride, sse);
-}
--- a/vp8/encoder/x86/variance_sse2.c
+++ /dev/null
@@ -1,517 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "vp8/encoder/variance.h"
-#include "vp8/common/pragmas.h"
-#include "vpx_ports/mem.h"
-
-#define HALFNDX 8
-
-extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
-extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
-extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
-extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
-
-extern void vp9_filter_block2d_bil4x4_var_mmx
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  const short *HFilter,
-  const short *VFilter,
-  int *sum,
-  unsigned int *sumsquared
-);
-
-extern unsigned int vp9_get4x4var_mmx
-(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *SSE,
-  int *Sum
-);
-
-unsigned int vp9_get_mb_ss_sse2
-(
-  const short *src_ptr
-);
-unsigned int vp9_get16x16var_sse2
-(
-  const unsigned char *src_ptr,
-  int source_stride,
-  const unsigned char *ref_ptr,
-  int recon_stride,
-  unsigned int *SSE,
-  int *Sum
-);
-unsigned int vp9_get8x8var_sse2
-(
-  const unsigned char *src_ptr,
-  int source_stride,
-  const unsigned char *ref_ptr,
-  int recon_stride,
-  unsigned int *SSE,
-  int *Sum
-);
-void vp9_filter_block2d_bil_var_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int  xoffset,
-  int  yoffset,
-  int *sum,
-  unsigned int *sumsquared
-);
-void vp9_half_horiz_vert_variance8x_h_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int *sum,
-  unsigned int *sumsquared
-);
-void vp9_half_horiz_vert_variance16x_h_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int *sum,
-  unsigned int *sumsquared
-);
-void vp9_half_horiz_variance8x_h_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int *sum,
-  unsigned int *sumsquared
-);
-void vp9_half_horiz_variance16x_h_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int *sum,
-  unsigned int *sumsquared
-);
-void vp9_half_vert_variance8x_h_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int *sum,
-  unsigned int *sumsquared
-);
-void vp9_half_vert_variance16x_h_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int *sum,
-  unsigned int *sumsquared
-);
-
-DECLARE_ALIGNED(16, extern short, vp9_bilinear_filters_mmx[16][8]);
-
-unsigned int vp9_variance4x4_wmt(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  vp9_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
-  *sse = var;
-  return (var - ((avg * avg) >> 4));
-
-}
-
-unsigned int vp9_variance8x8_wmt
-(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  vp9_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
-  *sse = var;
-  return (var - ((avg * avg) >> 6));
-
-}
-
-
-unsigned int vp9_variance16x16_wmt
-(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  unsigned int sse0;
-  int sum0;
-
-
-  vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
-  *sse = sse0;
-  return (sse0 - ((sum0 * sum0) >> 8));
-}
-unsigned int vp9_mse16x16_wmt(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-
-  unsigned int sse0;
-  int sum0;
-  vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
-  *sse = sse0;
-  return sse0;
-
-}
-
-
-unsigned int vp9_variance16x8_wmt
-(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  unsigned int sse0, sse1, var;
-  int sum0, sum1, avg;
-
-  vp9_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
-  vp9_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
-
-  var = sse0 + sse1;
-  avg = sum0 + sum1;
-  *sse = var;
-  return (var - ((avg * avg) >> 7));
-
-}
-
-unsigned int vp9_variance8x16_wmt
-(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  unsigned int sse0, sse1, var;
-  int sum0, sum1, avg;
-
-  vp9_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
-  vp9_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1);
-
-  var = sse0 + sse1;
-  avg = sum0 + sum1;
-  *sse = var;
-  return (var - ((avg * avg) >> 7));
-
-}
-
-unsigned int vp9_sub_pixel_variance4x4_wmt
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-) {
-  int xsum;
-  unsigned int xxsum;
-  vp9_filter_block2d_bil4x4_var_mmx(
-    src_ptr, src_pixels_per_line,
-    dst_ptr, dst_pixels_per_line,
-    vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
-    &xsum, &xxsum
-  );
-  *sse = xxsum;
-  return (xxsum - ((xsum * xsum) >> 4));
-}
-
-
-unsigned int vp9_sub_pixel_variance8x8_wmt
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-) {
-  int xsum;
-  unsigned int xxsum;
-
-  if (xoffset == HALFNDX && yoffset == 0) {
-    vp9_half_horiz_variance8x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 8,
-      &xsum, &xxsum);
-  } else if (xoffset == 0 && yoffset == HALFNDX) {
-    vp9_half_vert_variance8x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 8,
-      &xsum, &xxsum);
-  } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
-    vp9_half_horiz_vert_variance8x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 8,
-      &xsum, &xxsum);
-  } else {
-    vp9_filter_block2d_bil_var_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 8,
-      xoffset, yoffset,
-      &xsum, &xxsum);
-  }
-
-  *sse = xxsum;
-  return (xxsum - ((xsum * xsum) >> 6));
-}
-
-unsigned int vp9_sub_pixel_variance16x16_wmt
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-) {
-  int xsum0, xsum1;
-  unsigned int xxsum0, xxsum1;
-
-
-  // Note: we could avoid these if statements entirely if the calling
-  // function invoked the appropriate special-case function directly.
-  if (xoffset == HALFNDX && yoffset == 0) {
-    vp9_half_horiz_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 16,
-      &xsum0, &xxsum0);
-  } else if (xoffset == 0 && yoffset == HALFNDX) {
-    vp9_half_vert_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 16,
-      &xsum0, &xxsum0);
-  } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
-    vp9_half_horiz_vert_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 16,
-      &xsum0, &xxsum0);
-  } else {
-    vp9_filter_block2d_bil_var_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 16,
-      xoffset, yoffset,
-      &xsum0, &xxsum0
-    );
-
-    vp9_filter_block2d_bil_var_sse2(
-      src_ptr + 8, src_pixels_per_line,
-      dst_ptr + 8, dst_pixels_per_line, 16,
-      xoffset, yoffset,
-      &xsum1, &xxsum1
-    );
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
-  }
-
-  *sse = xxsum0;
-  return (xxsum0 - ((xsum0 * xsum0) >> 8));
-}
-
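In the sub-pixel variants, xoffset and yoffset index one of 16 bilinear filter phases (vp9_bilinear_filters_mmx[16][8]), with HALFNDX (8) marking the half-pel phase that gets the cheaper dedicated kernels. A plausible scalar sketch of one horizontal phase, assuming (16 - off, off) tap weights with round-to-nearest; the exact coefficient pairs live in the filter table:

/* Scalar sketch of a first-order (bilinear) horizontal interpolation
 * phase; the SIMD paths read the real coefficients from
 * vp9_bilinear_filters_mmx rather than deriving them from off.
 */
static void bilinear_h_c(const unsigned char *src, int src_stride,
                         unsigned char *dst, int dst_stride,
                         int w, int h, int off) {
  int r, c;

  for (r = 0; r < h; ++r)
    for (c = 0; c < w; ++c) {
      const int a = src[r * src_stride + c];
      const int b = src[r * src_stride + c + 1];
      dst[r * dst_stride + c] =
          (unsigned char)((a * (16 - off) + b * off + 8) >> 4);
    }
}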
-unsigned int vp9_sub_pixel_mse16x16_wmt(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-) {
-  vp9_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
-  return *sse;
-}
-
-unsigned int vp9_sub_pixel_variance16x8_wmt
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-
-) {
-  int xsum0, xsum1;
-  unsigned int xxsum0, xxsum1;
-
-  if (xoffset == HALFNDX && yoffset == 0) {
-    vp9_half_horiz_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 8,
-      &xsum0, &xxsum0);
-  } else if (xoffset == 0 && yoffset == HALFNDX) {
-    vp9_half_vert_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 8,
-      &xsum0, &xxsum0);
-  } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
-    vp9_half_horiz_vert_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 8,
-      &xsum0, &xxsum0);
-  } else {
-    vp9_filter_block2d_bil_var_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 8,
-      xoffset, yoffset,
-      &xsum0, &xxsum0);
-
-    vp9_filter_block2d_bil_var_sse2(
-      src_ptr + 8, src_pixels_per_line,
-      dst_ptr + 8, dst_pixels_per_line, 8,
-      xoffset, yoffset,
-      &xsum1, &xxsum1);
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
-  }
-
-  *sse = xxsum0;
-  return (xxsum0 - ((xsum0 * xsum0) >> 7));
-}
-
-unsigned int vp9_sub_pixel_variance8x16_wmt
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-) {
-  int xsum;
-  unsigned int xxsum;
-
-  if (xoffset == HALFNDX && yoffset == 0) {
-    vp9_half_horiz_variance8x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 16,
-      &xsum, &xxsum);
-  } else if (xoffset == 0 && yoffset == HALFNDX) {
-    vp9_half_vert_variance8x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 16,
-      &xsum, &xxsum);
-  } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
-    vp9_half_horiz_vert_variance8x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 16,
-      &xsum, &xxsum);
-  } else {
-    vp9_filter_block2d_bil_var_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 16,
-      xoffset, yoffset,
-      &xsum, &xxsum);
-  }
-
-  *sse = xxsum;
-  return (xxsum - ((xsum * xsum) >> 7));
-}
-
-
-unsigned int vp9_variance_halfpixvar16x16_h_wmt(
-  const unsigned char *src_ptr,
-  int  src_pixels_per_line,
-  const unsigned char *dst_ptr,
-  int  dst_pixels_per_line,
-  unsigned int *sse) {
-  int xsum0;
-  unsigned int xxsum0;
-
-  vp9_half_horiz_variance16x_h_sse2(
-    src_ptr, src_pixels_per_line,
-    dst_ptr, dst_pixels_per_line, 16,
-    &xsum0, &xxsum0);
-
-  *sse = xxsum0;
-  return (xxsum0 - ((xsum0 * xsum0) >> 8));
-}
-
-
-unsigned int vp9_variance_halfpixvar16x16_v_wmt(
-  const unsigned char *src_ptr,
-  int  src_pixels_per_line,
-  const unsigned char *dst_ptr,
-  int  dst_pixels_per_line,
-  unsigned int *sse) {
-  int xsum0;
-  unsigned int xxsum0;
-  vp9_half_vert_variance16x_h_sse2(
-    src_ptr, src_pixels_per_line,
-    dst_ptr, dst_pixels_per_line, 16,
-    &xsum0, &xxsum0);
-
-  *sse = xxsum0;
-  return (xxsum0 - ((xsum0 * xsum0) >> 8));
-}
-
-
-unsigned int vp9_variance_halfpixvar16x16_hv_wmt(
-  const unsigned char *src_ptr,
-  int  src_pixels_per_line,
-  const unsigned char *dst_ptr,
-  int  dst_pixels_per_line,
-  unsigned int *sse) {
-  int xsum0;
-  unsigned int xxsum0;
-
-  vp9_half_horiz_vert_variance16x_h_sse2(
-    src_ptr, src_pixels_per_line,
-    dst_ptr, dst_pixels_per_line, 16,
-    &xsum0, &xxsum0);
-
-  *sse = xxsum0;
-  return (xxsum0 - ((xsum0 * xsum0) >> 8));
-}
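The three halfpixvar entry points above exist because at the half-pel phase the bilinear filter collapses to a rounded average of two neighbors, which vectorizes very cheaply (SSE2's pavgb computes exactly this):

/* At the half-pel phase, (a * 8 + b * 8 + 8) >> 4 == (a + b + 1) >> 1:
 * a plain rounded average, which the dedicated SSE2 kernels exploit.
 */
static unsigned char half_pel_avg(unsigned char a, unsigned char b) {
  return (unsigned char)((a + b + 1) >> 1);
}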
--- a/vp8/encoder/x86/variance_ssse3.c
+++ /dev/null
@@ -1,151 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "vp8/encoder/variance.h"
-#include "vp8/common/pragmas.h"
-#include "vpx_ports/mem.h"
-
-#define HALFNDX 8
-
-extern unsigned int vp9_get16x16var_sse2
-(
-  const unsigned char *src_ptr,
-  int source_stride,
-  const unsigned char *ref_ptr,
-  int recon_stride,
-  unsigned int *SSE,
-  int *Sum
-);
-extern void vp9_half_horiz_vert_variance16x_h_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int *sum,
-  unsigned int *sumsquared
-);
-extern void vp9_half_horiz_variance16x_h_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int *sum,
-  unsigned int *sumsquared
-);
-extern void vp9_half_vert_variance16x_h_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int *sum,
-  unsigned int *sumsquared
-);
-extern void vp9_filter_block2d_bil_var_ssse3
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int  xoffset,
-  int  yoffset,
-  int *sum,
-  unsigned int *sumsquared
-);
-
-unsigned int vp9_sub_pixel_variance16x16_ssse3
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-) {
-  int xsum0;
-  unsigned int xxsum0;
-
-  // Note: we could avoid these if statements entirely if the calling
-  // function invoked the appropriate special-case function directly.
-  if (xoffset == HALFNDX && yoffset == 0) {
-    vp9_half_horiz_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 16,
-      &xsum0, &xxsum0);
-  } else if (xoffset == 0 && yoffset == HALFNDX) {
-    vp9_half_vert_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 16,
-      &xsum0, &xxsum0);
-  } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
-    vp9_half_horiz_vert_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 16,
-      &xsum0, &xxsum0);
-  } else {
-    vp9_filter_block2d_bil_var_ssse3(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 16,
-      xoffset, yoffset,
-      &xsum0, &xxsum0);
-  }
-
-  *sse = xxsum0;
-  return (xxsum0 - ((xsum0 * xsum0) >> 8));
-}
-
-unsigned int vp9_sub_pixel_variance16x8_ssse3
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-
-) {
-  int xsum0;
-  unsigned int xxsum0;
-
-  if (xoffset == HALFNDX && yoffset == 0) {
-    vp9_half_horiz_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 8,
-      &xsum0, &xxsum0);
-  } else if (xoffset == 0 && yoffset == HALFNDX) {
-    vp9_half_vert_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 8,
-      &xsum0, &xxsum0);
-  } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
-    vp9_half_horiz_vert_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 8,
-      &xsum0, &xxsum0);
-  } else {
-    vp9_filter_block2d_bil_var_ssse3(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 8,
-      xoffset, yoffset,
-      &xsum0, &xxsum0);
-  }
-
-  *sse = xxsum0;
-  return (xxsum0 - ((xsum0 * xsum0) >> 7));
-}
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ /dev/null
@@ -1,114 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "vpx_ports/x86.h"
-#include "vp8/encoder/variance.h"
-#include "vp8/encoder/onyx_int.h"
-
-
-#if HAVE_MMX
-void vp9_short_fdct8x4_mmx(short *input, short *output, int pitch) {
-  vp9_short_fdct4x4_mmx(input,   output,    pitch);
-  vp9_short_fdct4x4_mmx(input + 4, output + 16, pitch);
-}
-
-int vp9_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
-int vp9_mbblock_error_mmx(MACROBLOCK *mb, int dc) {
-  short *coeff_ptr =  mb->block[0].coeff;
-  short *dcoef_ptr =  mb->e_mbd.block[0].dqcoeff;
-  return vp9_mbblock_error_mmx_impl(coeff_ptr, dcoef_ptr, dc);
-}
-
-int vp9_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
-int vp9_mbuverror_mmx(MACROBLOCK *mb) {
-  short *s_ptr = &mb->coeff[256];
-  short *d_ptr = &mb->e_mbd.dqcoeff[256];
-  return vp9_mbuverror_mmx_impl(s_ptr, d_ptr);
-}
-
-void vp9_subtract_b_mmx_impl(unsigned char *z,  int src_stride,
-                             short *diff, unsigned char *predictor,
-                             int pitch);
-void vp9_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch) {
-  unsigned char *z = *(be->base_src) + be->src;
-  unsigned int  src_stride = be->src_stride;
-  short *diff = &be->src_diff[0];
-  unsigned char *predictor = &bd->predictor[0];
-  vp9_subtract_b_mmx_impl(z, src_stride, diff, predictor, pitch);
-}
-
-#endif
-
-#if HAVE_SSE2
-int vp9_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
-int vp9_mbblock_error_xmm(MACROBLOCK *mb, int dc) {
-  short *coeff_ptr =  mb->block[0].coeff;
-  short *dcoef_ptr =  mb->e_mbd.block[0].dqcoeff;
-  return vp9_mbblock_error_xmm_impl(coeff_ptr, dcoef_ptr, dc);
-}
-
-int vp9_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
-int vp9_mbuverror_xmm(MACROBLOCK *mb) {
-  short *s_ptr = &mb->coeff[256];
-  short *d_ptr = &mb->e_mbd.dqcoeff[256];
-  return vp9_mbuverror_xmm_impl(s_ptr, d_ptr);
-}
-
-void vp9_subtract_b_sse2_impl(unsigned char *z,  int src_stride,
-                              short *diff, unsigned char *predictor,
-                              int pitch);
-void vp9_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch) {
-  unsigned char *z = *(be->base_src) + be->src;
-  unsigned int  src_stride = be->src_stride;
-  short *diff = &be->src_diff[0];
-  unsigned char *predictor = &bd->predictor[0];
-  vp9_subtract_b_sse2_impl(z, src_stride, diff, predictor, pitch);
-}
-
-#endif
-
-void vp9_arch_x86_encoder_init(VP9_COMP *cpi) {
-#if CONFIG_RUNTIME_CPU_DETECT
-  int flags = x86_simd_caps();
-
-  /* Note:
-   *
-   * This platform can be built without runtime CPU detection as well. If
-   * you modify any of the function mappings present in this file, be sure
-   * to also update them in the static mappings (<arch>/filename_<arch>.h).
-   */
-
-  /* Override default functions with fastest ones for this CPU. */
-#if HAVE_SSE2
-  if (flags & HAS_SSE2) {
-    cpi->rtcd.temporal.apply                 = vp9_temporal_filter_apply_sse2;
-
-  }
-#endif
-
-#if HAVE_SSE3
-  if (flags & HAS_SSE3) {
-    cpi->rtcd.search.full_search             = vp9_full_search_sadx3;
-    cpi->rtcd.search.diamond_search          = vp9_diamond_search_sadx4;
-    cpi->rtcd.search.refining_search         = vp9_refining_search_sadx4;
-  }
-#endif
-
-
-#if HAVE_SSE4_1
-  if (flags & HAS_SSE4_1) {
-    cpi->rtcd.search.full_search             = vp9_full_search_sadx8;
-  }
-#endif
-
-#endif
-}
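This file is the runtime-CPU-detection (RTCD) half of the dispatch scheme: x86_simd_caps() probes the processor once, and the returned flag mask decides which SIMD implementations replace the portable defaults in the cpi->rtcd function-pointer table; the static mappings mentioned in the comment serve builds that skip detection. A minimal sketch of the pattern, with hypothetical names standing in for the table entries:

/* Minimal sketch of the RTCD pattern: a function-pointer table defaults
 * to portable C code, and init overwrites entries gated on a capability
 * bitmask. All names here are hypothetical stand-ins.
 */
typedef unsigned int (*row_sad_fn)(const unsigned char *a,
                                   const unsigned char *b, int n);

static unsigned int row_sad_c(const unsigned char *a,
                              const unsigned char *b, int n) {
  unsigned int sad = 0;
  int i;
  for (i = 0; i < n; ++i)
    sad += (a[i] > b[i]) ? (unsigned int)(a[i] - b[i])
                         : (unsigned int)(b[i] - a[i]);
  return sad;
}

/* Stand-in for a hand-written SIMD version. */
static unsigned int row_sad_fast(const unsigned char *a,
                                 const unsigned char *b, int n) {
  return row_sad_c(a, b, n);
}

#define MY_HAS_SSE2 0x01  /* stand-in for the real HAS_SSE2 flag */

struct my_rtcd { row_sad_fn sad; };

static void my_rtcd_init(struct my_rtcd *t, int caps) {
  t->sad = row_sad_c;        /* safe portable default */
  if (caps & MY_HAS_SSE2)
    t->sad = row_sad_fast;   /* override with the fastest available */
}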
--- a/vp8/exports_dec
+++ /dev/null
@@ -1,2 +1,0 @@
-data vpx_codec_vp8_dx_algo
-text vpx_codec_vp8_dx
--- a/vp8/exports_enc
+++ /dev/null
@@ -1,4 +1,0 @@
-data vpx_codec_vp8_cx_algo
-text vpx_codec_vp8_cx
-data vpx_codec_vp8x_cx_algo
-text vpx_codec_vp8x_cx
--- a/vp8/vp8_common.mk
+++ /dev/null
@@ -1,179 +1,0 @@
-##
-##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-##
-##  Use of this source code is governed by a BSD-style license
-##  that can be found in the LICENSE file in the root of the source
-##  tree. An additional intellectual property rights grant can be found
-##  in the file PATENTS.  All contributing project authors may
-##  be found in the AUTHORS file in the root of the source tree.
-##
-
-VP8_COMMON_SRCS-yes += vp8_common.mk
-VP8_COMMON_SRCS-yes += common/type_aliases.h
-VP8_COMMON_SRCS-yes += common/pragmas.h
-VP8_COMMON_SRCS-yes += common/ppflags.h
-VP8_COMMON_SRCS-yes += common/onyx.h
-VP8_COMMON_SRCS-yes += common/onyxd.h
-VP8_COMMON_SRCS-yes += common/alloccommon.c
-VP8_COMMON_SRCS-yes += common/asm_com_offsets.c
-VP8_COMMON_SRCS-yes += common/blockd.c
-VP8_COMMON_SRCS-yes += common/coefupdateprobs.h
-VP8_COMMON_SRCS-yes += common/debugmodes.c
-VP8_COMMON_SRCS-yes += common/entropy.c
-VP8_COMMON_SRCS-yes += common/entropymode.c
-VP8_COMMON_SRCS-yes += common/entropymv.c
-VP8_COMMON_SRCS-yes += common/extend.c
-VP8_COMMON_SRCS-yes += common/filter.c
-VP8_COMMON_SRCS-yes += common/filter.h
-VP8_COMMON_SRCS-yes += common/findnearmv.c
-VP8_COMMON_SRCS-yes += common/generic/systemdependent.c
-VP8_COMMON_SRCS-yes += common/idctllm.c
-VP8_COMMON_SRCS-yes += common/alloccommon.h
-VP8_COMMON_SRCS-yes += common/blockd.h
-VP8_COMMON_SRCS-yes += common/common.h
-VP8_COMMON_SRCS-yes += common/common_types.h
-VP8_COMMON_SRCS-yes += common/entropy.h
-VP8_COMMON_SRCS-yes += common/entropymode.h
-VP8_COMMON_SRCS-yes += common/entropymv.h
-VP8_COMMON_SRCS-yes += common/extend.h
-VP8_COMMON_SRCS-yes += common/findnearmv.h
-VP8_COMMON_SRCS-yes += common/header.h
-VP8_COMMON_SRCS-yes += common/idct.h
-VP8_COMMON_SRCS-yes += common/invtrans.h
-VP8_COMMON_SRCS-yes += common/loopfilter.h
-VP8_COMMON_SRCS-yes += common/modecont.h
-VP8_COMMON_SRCS-yes += common/mv.h
-VP8_COMMON_SRCS-yes += common/onyxc_int.h
-VP8_COMMON_SRCS-yes += common/pred_common.h
-VP8_COMMON_SRCS-yes += common/pred_common.c
-VP8_COMMON_SRCS-yes += common/quant_common.h
-VP8_COMMON_SRCS-yes += common/reconinter.h
-VP8_COMMON_SRCS-yes += common/reconintra.h
-VP8_COMMON_SRCS-yes += common/reconintra4x4.h
-VP8_COMMON_SRCS-yes += common/rtcd.c
-VP8_COMMON_SRCS-yes += common/rtcd_defs.sh
-VP8_COMMON_SRCS-yes += common/sadmxn.h
-VP8_COMMON_SRCS-yes += common/seg_common.h
-VP8_COMMON_SRCS-yes += common/seg_common.c
-VP8_COMMON_SRCS-yes += common/setupintrarecon.h
-VP8_COMMON_SRCS-yes += common/subpixel.h
-VP8_COMMON_SRCS-yes += common/swapyv12buffer.h
-VP8_COMMON_SRCS-yes += common/systemdependent.h
-VP8_COMMON_SRCS-yes += common/treecoder.h
-VP8_COMMON_SRCS-yes += common/invtrans.c
-VP8_COMMON_SRCS-yes += common/loopfilter.c
-VP8_COMMON_SRCS-yes += common/loopfilter_filters.c
-VP8_COMMON_SRCS-yes += common/mbpitch.c
-VP8_COMMON_SRCS-yes += common/modecont.c
-VP8_COMMON_SRCS-yes += common/modecontext.c
-VP8_COMMON_SRCS-yes += common/mvref_common.c
-VP8_COMMON_SRCS-yes += common/mvref_common.h
-VP8_COMMON_SRCS-yes += common/quant_common.c
-VP8_COMMON_SRCS-yes += common/recon.c
-VP8_COMMON_SRCS-yes += common/reconinter.c
-VP8_COMMON_SRCS-yes += common/reconintra.c
-VP8_COMMON_SRCS-yes += common/reconintra4x4.c
-VP8_COMMON_SRCS-yes += common/setupintrarecon.c
-VP8_COMMON_SRCS-yes += common/swapyv12buffer.c
-VP8_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/textblit.c
-VP8_COMMON_SRCS-yes += common/treecoder.c
-VP8_COMMON_SRCS-$(CONFIG_IMPLICIT_SEGMENTATION) += common/implicit_segmentation.c
-
-VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/idct_x86.h
-VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/subpixel_x86.h
-VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/loopfilter_x86.h
-VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/postproc_x86.h
-VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/x86_systemdependent.c
-VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp8_asm_stubs.c
-VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/loopfilter_x86.c
-VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.h
-VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.c
-VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idctllm_mmx.asm
-VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/iwalsh_mmx.asm
-VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm
-VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm
-VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/loopfilter_mmx.asm
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_sse2.asm
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_wrapper_sse2.c
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/iwalsh_sse2.asm
-VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_8t_ssse3.asm
-VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_ssse3.asm
-ifeq ($(CONFIG_POSTPROC),yes)
-VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/postproc_mmx.asm
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/postproc_sse2.asm
-endif
-
-# common (c)
-ifeq ($(CONFIG_CSM),yes)
-VP8_COMMON_SRCS-yes += common/maskingmv.c
-VP8_COMMON_SRCS-$(HAVE_SSE3) += common/x86/mask_sse3.asm
-endif
-
-VP8_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/filter_sse4.c
-ifeq ($(HAVE_SSE4_1),yes)
-vp8/common/x86/filter_sse4.c.o: CFLAGS += -msse4
-endif
-
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/filter_sse2.c
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/sadmxn_x86.c
-ifeq ($(HAVE_SSE2),yes)
-vp8/common/x86/filter_sse2.c.o: CFLAGS += -msse2
-vp8/common/x86/loopfilter_x86.c.o: CFLAGS += -msse2
-vp8/common/x86/sadmxn_x86.c.o: CFLAGS += -msse2
-endif
-
-VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/arm_systemdependent.c
-VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/bilinearfilter_arm.c
-VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/bilinearfilter_arm.h
-VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/filter_arm.c
-VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/idct_arm.h
-VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/loopfilter_arm.c
-VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/loopfilter_arm.h
-VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/recon_arm.h
-VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/reconintra_arm.c
-VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/subpixel_arm.h
-
-# common (armv6)
-VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/bilinearfilter_v6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/copymem8x4_v6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/copymem8x8_v6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/copymem16x16_v6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/dc_only_idct_add_v6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/iwalsh_v6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/filter_v6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/idct_v6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/loopfilter_v6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/recon_v6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/simpleloopfilter_v6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/sixtappredict8x4_v6$(ASM)
-
-# common (neon)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/bilinearpredict4x4_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/bilinearpredict8x4_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/bilinearpredict8x8_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/bilinearpredict16x16_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/copymem8x4_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/copymem8x8_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/copymem16x16_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/dc_only_idct_add_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/iwalsh_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/loopfilter_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/loopfiltersimplehorizontaledge_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/loopfiltersimpleverticaledge_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/mbloopfilter_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/recon2b_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/recon4b_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/reconb_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/shortidct4x4llm_1_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/shortidct4x4llm_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/sixtappredict4x4_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/sixtappredict8x4_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/sixtappredict8x8_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/sixtappredict16x16_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/recon16x16mb_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/buildintrapredictorsmby_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/save_neon_reg$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/recon_neon.c
--- a/vp8/vp8_cx_iface.c
+++ /dev/null
@@ -1,1169 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx/vpx_codec.h"
-#include "vpx/internal/vpx_codec_internal.h"
-#include "vpx_version.h"
-#include "vp8/encoder/onyx_int.h"
-#include "vpx/vp8e.h"
-#include "vp8/encoder/firstpass.h"
-#include "vp8/common/onyx.h"
-#include <stdlib.h>
-#include <string.h>
-
-/* This value is a sentinel for determining whether the user has set a mode
- * directly through the deprecated VP8E_SET_ENCODING_MODE control.
- */
-#define NO_MODE_SET 255
-
-struct vp8_extracfg {
-  struct vpx_codec_pkt_list *pkt_list;
-  vp8e_encoding_mode      encoding_mode;               /** best, good, realtime            */
-  int                         cpu_used;                    /** available cpu percentage, in 1/16 units */
-  unsigned int                enable_auto_alt_ref;           /** whether the encoder may use an alternate reference frame */
-  unsigned int                noise_sensitivity;
-  unsigned int                Sharpness;
-  unsigned int                static_thresh;
-  unsigned int                token_partitions;
-  unsigned int                arnr_max_frames;    /* alt_ref Noise Reduction Max Frame Count */
-  unsigned int                arnr_strength;    /* alt_ref Noise Reduction Strength */
-  unsigned int                arnr_type;        /* alt_ref filter type */
-  unsigned int                experimental;
-  vp8e_tuning                 tuning;
-  unsigned int                cq_level;         /* constrained quality level */
-  unsigned int                rc_max_intra_bitrate_pct;
-
-};
-
-struct extraconfig_map {
-  int                 usage;
-  struct vp8_extracfg cfg;
-};
-
-static const struct extraconfig_map extracfg_map[] = {
-  {
-    0,
-    {
-      NULL,
-      VP8_BEST_QUALITY_ENCODING,  /* Encoding Mode */
-      0,                          /* cpu_used      */
-      0,                          /* enable_auto_alt_ref */
-      0,                          /* noise_sensitivity */
-      0,                          /* Sharpness */
-      0,                          /* static_thresh */
-      VP8_ONE_TOKENPARTITION,     /* token_partitions */
-      0,                          /* arnr_max_frames */
-      3,                          /* arnr_strength */
-      3,                          /* arnr_type*/
-      0,                          /* experimental mode */
-      0,                          /* tuning*/
-      10,                         /* cq_level */
-      0,                          /* rc_max_intra_bitrate_pct */
-    }
-  }
-};
-
-struct vpx_codec_alg_priv {
-  vpx_codec_priv_t        base;
-  vpx_codec_enc_cfg_t     cfg;
-  struct vp8_extracfg     vp8_cfg;
-  VP9_CONFIG              oxcf;
-  VP9_PTR             cpi;
-  unsigned char          *cx_data;
-  unsigned int            cx_data_sz;
-  vpx_image_t             preview_img;
-  unsigned int            next_frame_flag;
-  vp8_postproc_cfg_t      preview_ppcfg;
-  vpx_codec_pkt_list_decl(64) pkt_list;              // sized to accommodate the maximum number of lagged frames allowed
-  int                         deprecated_mode;
-  unsigned int                fixed_kf_cntr;
-};
-
-
-static vpx_codec_err_t
-update_error_state(vpx_codec_alg_priv_t                 *ctx,
-                   const struct vpx_internal_error_info *error) {
-  vpx_codec_err_t res;
-
-  if ((res = error->error_code))
-    ctx->base.err_detail = error->has_detail
-                           ? error->detail
-                           : NULL;
-
-  return res;
-}
-
-
-#undef ERROR
-#define ERROR(str) do {\
-    ctx->base.err_detail = str;\
-    return VPX_CODEC_INVALID_PARAM;\
-  } while(0)
-
-#define RANGE_CHECK(p,memb,lo,hi) do {\
-    if(!(((p)->memb == lo || (p)->memb > (lo)) && (p)->memb <= hi)) \
-      ERROR(#memb " out of range ["#lo".."#hi"]");\
-  } while(0)
-
-#define RANGE_CHECK_HI(p,memb,hi) do {\
-    if(!((p)->memb <= (hi))) \
-      ERROR(#memb " out of range [.."#hi"]");\
-  } while(0)
-
-#define RANGE_CHECK_LO(p,memb,lo) do {\
-    if(!((p)->memb >= (lo))) \
-      ERROR(#memb " out of range ["#lo"..]");\
-  } while(0)
-
-#define RANGE_CHECK_BOOL(p,memb) do {\
-    if(!!((p)->memb) != (p)->memb) ERROR(#memb " expected boolean");\
-  } while(0)
-
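Each RANGE_CHECK* macro validates a single config member and, via ERROR, stores a generated message and returns VPX_CODEC_INVALID_PARAM. The `(p)->memb == lo || (p)->memb > (lo)` spelling of `>= lo` presumably sidesteps "comparison is always true" warnings when the member is unsigned and lo is 0. Expanded by hand (string literals shown already concatenated), RANGE_CHECK_HI(cfg, rc_max_quantizer, 63) becomes:

do {
  if (!((cfg)->rc_max_quantizer <= (63)))
    do {
      ctx->base.err_detail = "rc_max_quantizer out of range [..63]";
      return VPX_CODEC_INVALID_PARAM;
    } while (0);
} while (0);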
-static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t      *ctx,
-                                       const vpx_codec_enc_cfg_t *cfg,
-                                       const struct vp8_extracfg *vp8_cfg) {
-  RANGE_CHECK(cfg, g_w,                   1, 16383); /* 14 bits available */
-  RANGE_CHECK(cfg, g_h,                   1, 16383); /* 14 bits available */
-  RANGE_CHECK(cfg, g_timebase.den,        1, 1000000000);
-  RANGE_CHECK(cfg, g_timebase.num,        1, cfg->g_timebase.den);
-  RANGE_CHECK_HI(cfg, g_profile,          3);
-  RANGE_CHECK_HI(cfg, rc_max_quantizer,   63);
-  RANGE_CHECK_HI(cfg, rc_min_quantizer,   cfg->rc_max_quantizer);
-  RANGE_CHECK_HI(cfg, g_threads,          64);
-  RANGE_CHECK_HI(cfg, g_lag_in_frames,    MAX_LAG_BUFFERS);
-  RANGE_CHECK(cfg, rc_end_usage,          VPX_VBR, VPX_CQ);
-  RANGE_CHECK_HI(cfg, rc_undershoot_pct,  1000);
-  RANGE_CHECK_HI(cfg, rc_overshoot_pct,   1000);
-  RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100);
-  RANGE_CHECK(cfg, kf_mode,               VPX_KF_DISABLED, VPX_KF_AUTO);
-  // RANGE_CHECK_BOOL(cfg,                 g_delete_firstpassfile);
-  RANGE_CHECK_BOOL(cfg,                   rc_resize_allowed);
-  RANGE_CHECK_HI(cfg, rc_dropframe_thresh,   100);
-  RANGE_CHECK_HI(cfg, rc_resize_up_thresh,   100);
-  RANGE_CHECK_HI(cfg, rc_resize_down_thresh, 100);
-  RANGE_CHECK(cfg,        g_pass,         VPX_RC_ONE_PASS, VPX_RC_LAST_PASS);
-
-  /* VP8 does not support a lower bound on the keyframe interval in
-   * automatic keyframe placement mode.
-   */
-  if (cfg->kf_mode != VPX_KF_DISABLED && cfg->kf_min_dist != cfg->kf_max_dist
-      && cfg->kf_min_dist > 0)
-    ERROR("kf_min_dist not supported in auto mode, use 0 "
-          "or kf_max_dist instead.");
-
-  RANGE_CHECK_BOOL(vp8_cfg,               enable_auto_alt_ref);
-  RANGE_CHECK(vp8_cfg, cpu_used,           -16, 16);
-
-  RANGE_CHECK(vp8_cfg, encoding_mode,      VP8_BEST_QUALITY_ENCODING, VP8_REAL_TIME_ENCODING);
-  RANGE_CHECK_HI(vp8_cfg, noise_sensitivity,  6);
-
-  RANGE_CHECK(vp8_cfg, token_partitions,   VP8_ONE_TOKENPARTITION, VP8_EIGHT_TOKENPARTITION);
-  RANGE_CHECK_HI(vp8_cfg, Sharpness,       7);
-  RANGE_CHECK(vp8_cfg, arnr_max_frames, 0, 15);
-  RANGE_CHECK_HI(vp8_cfg, arnr_strength,   6);
-  RANGE_CHECK(vp8_cfg, arnr_type,       1, 3);
-  RANGE_CHECK(vp8_cfg, cq_level, 0, 63);
-
-  if (cfg->g_pass == VPX_RC_LAST_PASS) {
-    size_t           packet_sz = sizeof(FIRSTPASS_STATS);
-    int              n_packets = cfg->rc_twopass_stats_in.sz / packet_sz;
-    FIRSTPASS_STATS *stats;
-
-    if (!cfg->rc_twopass_stats_in.buf)
-      ERROR("rc_twopass_stats_in.buf not set.");
-
-    if (cfg->rc_twopass_stats_in.sz % packet_sz)
-      ERROR("rc_twopass_stats_in.sz indicates truncated packet.");
-
-    if (cfg->rc_twopass_stats_in.sz < 2 * packet_sz)
-      ERROR("rc_twopass_stats_in requires at least two packets.");
-
-    stats = (void *)((char *)cfg->rc_twopass_stats_in.buf
-                     + (n_packets - 1) * packet_sz);
-
-    if ((int)(stats->count + 0.5) != n_packets - 1)
-      ERROR("rc_twopass_stats_in missing EOS stats packet");
-  }
-
-  return VPX_CODEC_OK;
-}
-
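The two-pass branch above assumes the stats buffer is a packed array of FIRSTPASS_STATS records whose final element is an EOS/summary packet carrying, in its count field, the number of frame packets that precede it. A standalone sketch of the same checks, with last_count passed in so the sketch does not depend on the struct layout:

/* Sketch of the two-pass stats-buffer validation: the buffer must split
 * evenly into packets, hold at least one frame packet plus the EOS
 * packet, and the EOS count must match the number of frame packets.
 */
static int stats_buf_ok(size_t sz, size_t packet_sz, double last_count) {
  const size_t n_packets = sz / packet_sz;

  if (sz % packet_sz) return 0;   /* truncated packet */
  if (n_packets < 2) return 0;    /* need a frame packet and the EOS */
  return (int)(last_count + 0.5) == (int)n_packets - 1;
}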
-
-static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx,
-                                    const vpx_image_t    *img) {
-  switch (img->fmt) {
-    case VPX_IMG_FMT_YV12:
-    case VPX_IMG_FMT_I420:
-    case VPX_IMG_FMT_VPXI420:
-    case VPX_IMG_FMT_VPXYV12:
-      break;
-    default:
-      ERROR("Invalid image format. Only YV12 and I420 images are supported");
-  }
-
-  if ((img->d_w != ctx->cfg.g_w) || (img->d_h != ctx->cfg.g_h))
-    ERROR("Image size must match encoder init configuration size");
-
-  return VPX_CODEC_OK;
-}
-
-
-static vpx_codec_err_t set_vp8e_config(VP9_CONFIG *oxcf,
-                                       vpx_codec_enc_cfg_t cfg,
-                                       struct vp8_extracfg vp8_cfg) {
-  oxcf->Version               = cfg.g_profile;
-  oxcf->Version              |= vp8_cfg.experimental ? 0x4 : 0;
-
-  oxcf->Width                 = cfg.g_w;
-  oxcf->Height                = cfg.g_h;
-  /* Derive the frame rate from the timebase; if the result is implausible, fall back to 30 below. */
-  oxcf->frame_rate             = (double)(cfg.g_timebase.den) / (double)(cfg.g_timebase.num);
-
-  if (oxcf->frame_rate > 180) {
-    oxcf->frame_rate = 30;
-  }
-
-  switch (cfg.g_pass) {
-    case VPX_RC_ONE_PASS:
-      oxcf->Mode = MODE_BESTQUALITY;
-      break;
-    case VPX_RC_FIRST_PASS:
-      oxcf->Mode = MODE_FIRSTPASS;
-      break;
-    case VPX_RC_LAST_PASS:
-      oxcf->Mode = MODE_SECONDPASS_BEST;
-      break;
-  }
-
-  if (cfg.g_pass == VPX_RC_FIRST_PASS) {
-    oxcf->allow_lag              = 0;
-    oxcf->lag_in_frames           = 0;
-  } else {
-    oxcf->allow_lag              = (cfg.g_lag_in_frames) > 0;
-    oxcf->lag_in_frames           = cfg.g_lag_in_frames;
-  }
-
-  // Only VBR is supported for now.
-  // The CBR code has been deprecated for the experimental phase.
-  // CQ mode is not yet tested.
-  oxcf->end_usage          = USAGE_LOCAL_FILE_PLAYBACK;
-  /*if (cfg.rc_end_usage == VPX_CQ)
-      oxcf->end_usage      = USAGE_CONSTRAINED_QUALITY;
-  else
-      oxcf->end_usage      = USAGE_LOCAL_FILE_PLAYBACK;*/
-
-  oxcf->target_bandwidth       = cfg.rc_target_bitrate;
-  oxcf->rc_max_intra_bitrate_pct = vp8_cfg.rc_max_intra_bitrate_pct;
-
-  oxcf->best_allowed_q          = cfg.rc_min_quantizer;
-  oxcf->worst_allowed_q         = cfg.rc_max_quantizer;
-  oxcf->cq_level                = vp8_cfg.cq_level;
-  oxcf->fixed_q = -1;
-
-  oxcf->under_shoot_pct         = cfg.rc_undershoot_pct;
-  oxcf->over_shoot_pct          = cfg.rc_overshoot_pct;
-
-  oxcf->maximum_buffer_size     = cfg.rc_buf_sz;
-  oxcf->starting_buffer_level   = cfg.rc_buf_initial_sz;
-  oxcf->optimal_buffer_level    = cfg.rc_buf_optimal_sz;
-
-  oxcf->two_pass_vbrbias        = cfg.rc_2pass_vbr_bias_pct;
-  oxcf->two_pass_vbrmin_section  = cfg.rc_2pass_vbr_minsection_pct;
-  oxcf->two_pass_vbrmax_section  = cfg.rc_2pass_vbr_maxsection_pct;
-
-  oxcf->auto_key               = cfg.kf_mode == VPX_KF_AUTO
-                                 && cfg.kf_min_dist != cfg.kf_max_dist;
-  // oxcf->kf_min_dist         = cfg.kf_min_dis;
-  oxcf->key_freq               = cfg.kf_max_dist;
-
-  // oxcf->delete_first_pass_file = cfg.g_delete_firstpassfile;
-  // strcpy(oxcf->first_pass_file, cfg.g_firstpass_file);
-
-  oxcf->cpu_used               =  vp8_cfg.cpu_used;
-  oxcf->encode_breakout        =  vp8_cfg.static_thresh;
-  oxcf->play_alternate         =  vp8_cfg.enable_auto_alt_ref;
-  oxcf->noise_sensitivity      =  vp8_cfg.noise_sensitivity;
-  oxcf->Sharpness             =  vp8_cfg.Sharpness;
-
-  oxcf->two_pass_stats_in        =  cfg.rc_twopass_stats_in;
-  oxcf->output_pkt_list         =  vp8_cfg.pkt_list;
-
-  oxcf->arnr_max_frames = vp8_cfg.arnr_max_frames;
-  oxcf->arnr_strength =  vp8_cfg.arnr_strength;
-  oxcf->arnr_type =      vp8_cfg.arnr_type;
-
-  oxcf->tuning = vp8_cfg.tuning;
-
-#if CONFIG_LOSSLESS
-  oxcf->lossless = cfg.lossless;
-#endif
-
-  /*
-      printf("Current VP8 Settings: \n");
-      printf("target_bandwidth: %d\n", oxcf->target_bandwidth);
-      printf("noise_sensitivity: %d\n", oxcf->noise_sensitivity);
-      printf("Sharpness: %d\n",    oxcf->Sharpness);
-      printf("cpu_used: %d\n",  oxcf->cpu_used);
-      printf("Mode: %d\n",     oxcf->Mode);
-      printf("delete_first_pass_file: %d\n",  oxcf->delete_first_pass_file);
-      printf("auto_key: %d\n",  oxcf->auto_key);
-      printf("key_freq: %d\n", oxcf->key_freq);
-      printf("end_usage: %d\n", oxcf->end_usage);
-      printf("under_shoot_pct: %d\n", oxcf->under_shoot_pct);
-      printf("over_shoot_pct: %d\n", oxcf->over_shoot_pct);
-      printf("starting_buffer_level: %d\n", oxcf->starting_buffer_level);
-      printf("optimal_buffer_level: %d\n",  oxcf->optimal_buffer_level);
-      printf("maximum_buffer_size: %d\n", oxcf->maximum_buffer_size);
-      printf("fixed_q: %d\n",  oxcf->fixed_q);
-      printf("worst_allowed_q: %d\n", oxcf->worst_allowed_q);
-      printf("best_allowed_q: %d\n", oxcf->best_allowed_q);
-      printf("two_pass_vbrbias: %d\n",  oxcf->two_pass_vbrbias);
-      printf("two_pass_vbrmin_section: %d\n", oxcf->two_pass_vbrmin_section);
-      printf("two_pass_vbrmax_section: %d\n", oxcf->two_pass_vbrmax_section);
-      printf("allow_lag: %d\n", oxcf->allow_lag);
-      printf("lag_in_frames: %d\n", oxcf->lag_in_frames);
-      printf("play_alternate: %d\n", oxcf->play_alternate);
-      printf("Version: %d\n", oxcf->Version);
-      printf("encode_breakout: %d\n", oxcf->encode_breakout);
-  */
-  return VPX_CODEC_OK;
-}
-
-static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t       *ctx,
-                                       const vpx_codec_enc_cfg_t  *cfg) {
-  vpx_codec_err_t res;
-
-  if ((cfg->g_w != ctx->cfg.g_w) || (cfg->g_h != ctx->cfg.g_h))
-    ERROR("Cannot change width or height after initialization");
-
-  /* Prevent increasing lag_in_frames. This check is stricter than it needs
-   * to be -- the limit is not increasing past the first lag_in_frames
-   * value, but we don't track the initial config, only the last successful
-   * config.
-   */
-  if ((cfg->g_lag_in_frames > ctx->cfg.g_lag_in_frames))
-    ERROR("Cannot increase lag_in_frames");
-
-  res = validate_config(ctx, cfg, &ctx->vp8_cfg);
-
-  if (!res) {
-    ctx->cfg = *cfg;
-    set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg);
-    vp9_change_config(ctx->cpi, &ctx->oxcf);
-  }
-
-  return res;
-}
-
-
-int vp9_reverse_trans(int q);
-
-
-static vpx_codec_err_t get_param(vpx_codec_alg_priv_t *ctx,
-                                 int                   ctrl_id,
-                                 va_list               args) {
-  void *arg = va_arg(args, void *);
-
-#define MAP(id, var) case id: *(RECAST(id, arg)) = var; break
-
-  if (!arg)
-    return VPX_CODEC_INVALID_PARAM;
-
-  switch (ctrl_id) {
-      MAP(VP8E_GET_LAST_QUANTIZER, vp9_get_quantizer(ctx->cpi));
-      MAP(VP8E_GET_LAST_QUANTIZER_64,
-          vp9_reverse_trans(vp9_get_quantizer(ctx->cpi)));
-  }
-
-  return VPX_CODEC_OK;
-#undef MAP
-}
-
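The local MAP macro turns each control ID into a switch case that moves the va_arg value in the appropriate direction, with RECAST/CAST (declared alongside the control IDs) recovering the per-ID argument type. Expanded by hand, the first mapping above becomes:

/* MAP(VP8E_GET_LAST_QUANTIZER, vp9_get_quantizer(ctx->cpi)), expanded: */
case VP8E_GET_LAST_QUANTIZER:
  *(RECAST(VP8E_GET_LAST_QUANTIZER, arg)) = vp9_get_quantizer(ctx->cpi);
  break;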
-
-static vpx_codec_err_t set_param(vpx_codec_alg_priv_t *ctx,
-                                 int                   ctrl_id,
-                                 va_list               args) {
-  vpx_codec_err_t     res  = VPX_CODEC_OK;
-  struct vp8_extracfg xcfg = ctx->vp8_cfg;
-
-#define MAP(id, var) case id: var = CAST(id, args); break;
-
-  switch (ctrl_id) {
-      MAP(VP8E_SET_ENCODING_MODE,         ctx->deprecated_mode);
-      MAP(VP8E_SET_CPUUSED,               xcfg.cpu_used);
-      MAP(VP8E_SET_ENABLEAUTOALTREF,      xcfg.enable_auto_alt_ref);
-      MAP(VP8E_SET_NOISE_SENSITIVITY,     xcfg.noise_sensitivity);
-      MAP(VP8E_SET_SHARPNESS,             xcfg.Sharpness);
-      MAP(VP8E_SET_STATIC_THRESHOLD,      xcfg.static_thresh);
-      MAP(VP8E_SET_TOKEN_PARTITIONS,      xcfg.token_partitions);
-
-      MAP(VP8E_SET_ARNR_MAXFRAMES,        xcfg.arnr_max_frames);
-      MAP(VP8E_SET_ARNR_STRENGTH,         xcfg.arnr_strength);
-      MAP(VP8E_SET_ARNR_TYPE,             xcfg.arnr_type);
-      MAP(VP8E_SET_TUNING,                xcfg.tuning);
-      MAP(VP8E_SET_CQ_LEVEL,              xcfg.cq_level);
-      MAP(VP8E_SET_MAX_INTRA_BITRATE_PCT, xcfg.rc_max_intra_bitrate_pct);
-
-  }
-
-  res = validate_config(ctx, &ctx->cfg, &xcfg);
-
-  if (!res) {
-    ctx->vp8_cfg = xcfg;
-    set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg);
-    vp9_change_config(ctx->cpi, &ctx->oxcf);
-  }
-
-  return res;
-#undef MAP
-}
-
-
-static vpx_codec_err_t vp8e_common_init(vpx_codec_ctx_t *ctx,
-                                        int              experimental) {
-  vpx_codec_err_t        res = VPX_DEC_OK;
-  struct vpx_codec_alg_priv *priv;
-  vpx_codec_enc_cfg_t       *cfg;
-  unsigned int               i;
-
-  VP9_PTR optr;
-
-  if (!ctx->priv) {
-    priv = calloc(1, sizeof(struct vpx_codec_alg_priv));
-
-    if (!priv) {
-      return VPX_CODEC_MEM_ERROR;
-    }
-
-    ctx->priv = &priv->base;
-    ctx->priv->sz = sizeof(*ctx->priv);
-    ctx->priv->iface = ctx->iface;
-    ctx->priv->alg_priv = priv;
-    ctx->priv->init_flags = ctx->init_flags;
-
-    if (ctx->config.enc) {
-      /* Update the reference to the config structure to an
-       * internal copy.
-       */
-      ctx->priv->alg_priv->cfg = *ctx->config.enc;
-      ctx->config.enc = &ctx->priv->alg_priv->cfg;
-    }
-
-    cfg =  &ctx->priv->alg_priv->cfg;
-
-    /* Select the extra vp8 configuration table based on the current
-     * usage value. If the current usage value isn't found, use the
-     * values for usage case 0.
-     */
-    for (i = 0;
-         extracfg_map[i].usage && extracfg_map[i].usage != cfg->g_usage;
-         i++);
-
-    priv->vp8_cfg = extracfg_map[i].cfg;
-    priv->vp8_cfg.pkt_list = &priv->pkt_list.head;
-    priv->vp8_cfg.experimental = experimental;
-
-    priv->cx_data_sz = priv->cfg.g_w * priv->cfg.g_h * 3 / 2 * 2;
-
-    if (priv->cx_data_sz < 4096) priv->cx_data_sz = 4096;
-
-    priv->cx_data = malloc(priv->cx_data_sz);
-
-    if (!priv->cx_data) {
-      return VPX_CODEC_MEM_ERROR;
-    }
-
-    priv->deprecated_mode = NO_MODE_SET;
-
-    vp9_initialize_enc();
-
-    res = validate_config(priv, &priv->cfg, &priv->vp8_cfg);
-
-    if (!res) {
-      set_vp8e_config(&ctx->priv->alg_priv->oxcf,
-                      ctx->priv->alg_priv->cfg,
-                      ctx->priv->alg_priv->vp8_cfg);
-      optr = vp9_create_compressor(&ctx->priv->alg_priv->oxcf);
-
-      if (!optr)
-        res = VPX_CODEC_MEM_ERROR;
-      else
-        ctx->priv->alg_priv->cpi = optr;
-    }
-  }
-
-  return res;
-}
-
-
-static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx) {
-  return vp8e_common_init(ctx, 0);
-}
-
-
-#if CONFIG_EXPERIMENTAL
-static vpx_codec_err_t vp8e_exp_init(vpx_codec_ctx_t *ctx) {
-  return vp8e_common_init(ctx, 1);
-}
-#endif
-
-
-static vpx_codec_err_t vp8e_destroy(vpx_codec_alg_priv_t *ctx) {
-
-  free(ctx->cx_data);
-  vp9_remove_compressor(&ctx->cpi);
-  free(ctx);
-  return VPX_CODEC_OK;
-}
-
-static vpx_codec_err_t image2yuvconfig(const vpx_image_t   *img,
-                                       YV12_BUFFER_CONFIG  *yv12) {
-  vpx_codec_err_t        res = VPX_CODEC_OK;
-  yv12->y_buffer = img->planes[VPX_PLANE_Y];
-  yv12->u_buffer = img->planes[VPX_PLANE_U];
-  yv12->v_buffer = img->planes[VPX_PLANE_V];
-
-  yv12->y_width  = img->d_w;
-  yv12->y_height = img->d_h;
-  yv12->uv_width = (1 + yv12->y_width) / 2;
-  yv12->uv_height = (1 + yv12->y_height) / 2;
-
-  yv12->y_stride = img->stride[VPX_PLANE_Y];
-  yv12->uv_stride = img->stride[VPX_PLANE_U];
-
-  yv12->border  = (img->stride[VPX_PLANE_Y] - img->w) / 2;
-  yv12->clrtype = (img->fmt == VPX_IMG_FMT_VPXI420 || img->fmt == VPX_IMG_FMT_VPXYV12); // REG_YUV = 0
-  return res;
-}
-
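image2yuvconfig wraps a 4:2:0 vpx_image_t in the codec's YV12_BUFFER_CONFIG without copying: the chroma planes are half the luma dimensions rounded up, and the border is inferred from the slack between the luma stride and the image width. Worked example: for a 65x65 image (img->w == 65) with a 128-byte luma stride, uv_width = uv_height = (1 + 65) / 2 = 33 and border = (128 - 65) / 2 = 31 (integer division).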
-static void pick_quickcompress_mode(vpx_codec_alg_priv_t  *ctx,
-                                    unsigned long          duration,
-                                    unsigned long          deadline) {
-  unsigned int new_qc;
-
-  /* Use best quality mode if no deadline is given. */
-  if (deadline)
-    new_qc = MODE_GOODQUALITY;
-  else
-    new_qc = MODE_BESTQUALITY;
-
-  if (ctx->cfg.g_pass == VPX_RC_FIRST_PASS)
-    new_qc = MODE_FIRSTPASS;
-  else if (ctx->cfg.g_pass == VPX_RC_LAST_PASS)
-    new_qc = (new_qc == MODE_BESTQUALITY)
-             ? MODE_SECONDPASS_BEST
-             : MODE_SECONDPASS;
-
-  if (ctx->oxcf.Mode != new_qc) {
-    ctx->oxcf.Mode = new_qc;
-    vp9_change_config(ctx->cpi, &ctx->oxcf);
-  }
-}
-
-
-static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t  *ctx,
-                                   const vpx_image_t     *img,
-                                   vpx_codec_pts_t        pts,
-                                   unsigned long          duration,
-                                   vpx_enc_frame_flags_t  flags,
-                                   unsigned long          deadline) {
-  vpx_codec_err_t res = VPX_CODEC_OK;
-
-  if (img)
-    res = validate_img(ctx, img);
-
-  pick_quickcompress_mode(ctx, duration, deadline);
-  vpx_codec_pkt_list_init(&ctx->pkt_list);
-
-  /* Handle Flags */
-  if (((flags & VP8_EFLAG_NO_UPD_GF) && (flags & VP8_EFLAG_FORCE_GF))
-      || ((flags & VP8_EFLAG_NO_UPD_ARF) && (flags & VP8_EFLAG_FORCE_ARF))) {
-    ctx->base.err_detail = "Conflicting flags.";
-    return VPX_CODEC_INVALID_PARAM;
-  }
-
-  if (flags & (VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_REF_GF
-               | VP8_EFLAG_NO_REF_ARF)) {
-    int ref = 7;
-
-    if (flags & VP8_EFLAG_NO_REF_LAST)
-      ref ^= VP9_LAST_FLAG;
-
-    if (flags & VP8_EFLAG_NO_REF_GF)
-      ref ^= VP9_GOLD_FLAG;
-
-    if (flags & VP8_EFLAG_NO_REF_ARF)
-      ref ^= VP9_ALT_FLAG;
-
-    vp9_use_as_reference(ctx->cpi, ref);
-  }
-
-  if (flags & (VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF
-               | VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_FORCE_GF
-               | VP8_EFLAG_FORCE_ARF)) {
-    int upd = 7;
-
-    if (flags & VP8_EFLAG_NO_UPD_LAST)
-      upd ^= VP9_LAST_FLAG;
-
-    if (flags & VP8_EFLAG_NO_UPD_GF)
-      upd ^= VP9_GOLD_FLAG;
-
-    if (flags & VP8_EFLAG_NO_UPD_ARF)
-      upd ^= VP9_ALT_FLAG;
-
-    vp9_update_reference(ctx->cpi, upd);
-  }
-
-  if (flags & VP8_EFLAG_NO_UPD_ENTROPY) {
-    vp9_update_entropy(ctx->cpi, 0);
-  }
-
-  /* Handle fixed keyframe intervals */
-  if (ctx->cfg.kf_mode == VPX_KF_AUTO
-      && ctx->cfg.kf_min_dist == ctx->cfg.kf_max_dist) {
-    if (++ctx->fixed_kf_cntr > ctx->cfg.kf_min_dist) {
-      flags |= VPX_EFLAG_FORCE_KF;
-      ctx->fixed_kf_cntr = 1;
-    }
-  }
-
-  /* Initialize the encoder instance on the first frame */
-  if (!res && ctx->cpi) {
-    unsigned int lib_flags;
-    YV12_BUFFER_CONFIG sd;
-    int64_t dst_time_stamp, dst_end_time_stamp;
-    unsigned long size, cx_data_sz;
-    unsigned char *cx_data;
-
-    /* Set up internal flags */
-    if (ctx->base.init_flags & VPX_CODEC_USE_PSNR)
-      ((VP9_COMP *)ctx->cpi)->b_calculate_psnr = 1;
-
-    // if (ctx->base.init_flags & VPX_CODEC_USE_OUTPUT_PARTITION)
-    //    ((VP9_COMP *)ctx->cpi)->output_partition = 1;
-
-    /* Convert API flags to internal codec lib flags */
-    lib_flags = (flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
-
-    /* vp8 uses 10,000,000 ticks/second as its time stamp unit */
-    dst_time_stamp    = pts * 10000000 * ctx->cfg.g_timebase.num / ctx->cfg.g_timebase.den;
-    dst_end_time_stamp = (pts + duration) * 10000000 * ctx->cfg.g_timebase.num / ctx->cfg.g_timebase.den;
-
-    if (img != NULL) {
-      res = image2yuvconfig(img, &sd);
-
-      if (vp9_receive_raw_frame(ctx->cpi, ctx->next_frame_flag | lib_flags,
-                                &sd, dst_time_stamp, dst_end_time_stamp)) {
-        VP9_COMP *cpi = (VP9_COMP *)ctx->cpi;
-        res = update_error_state(ctx, &cpi->common.error);
-      }
-
-      /* reset for next frame */
-      ctx->next_frame_flag = 0;
-    }
-
-    cx_data = ctx->cx_data;
-    cx_data_sz = ctx->cx_data_sz;
-    lib_flags = 0;
-
-    while (cx_data_sz >= ctx->cx_data_sz / 2 &&
-           -1 != vp9_get_compressed_data(ctx->cpi, &lib_flags, &size,
-                                         cx_data, &dst_time_stamp,
-                                         &dst_end_time_stamp, !img)) {
-      if (size) {
-        vpx_codec_pts_t    round, delta;
-        vpx_codec_cx_pkt_t pkt;
-        VP9_COMP *cpi = (VP9_COMP *)ctx->cpi;
-
-        /* Add the frame packet to the list of returned packets. */
-        round = 1000000 * ctx->cfg.g_timebase.num / 2 - 1;
-        delta = (dst_end_time_stamp - dst_time_stamp);
-        pkt.kind = VPX_CODEC_CX_FRAME_PKT;
-        pkt.data.frame.pts =
-          (dst_time_stamp * ctx->cfg.g_timebase.den + round)
-          / ctx->cfg.g_timebase.num / 10000000;
-        pkt.data.frame.duration =
-          (delta * ctx->cfg.g_timebase.den + round)
-          / ctx->cfg.g_timebase.num / 10000000;
-        pkt.data.frame.flags = lib_flags << 16;
-
-        if (lib_flags & FRAMEFLAGS_KEY)
-          pkt.data.frame.flags |= VPX_FRAME_IS_KEY;
-
-        if (!cpi->common.show_frame) {
-          pkt.data.frame.flags |= VPX_FRAME_IS_INVISIBLE;
-
-          // This timestamp should be as close as possible to the prior
-          // PTS so that a decoder scheduling by PTS will process this
-          // frame right after the previous frame was decoded.
-          // Invisible frames have no duration.
-          pkt.data.frame.pts = ((cpi->last_time_stamp_seen
-                                 * ctx->cfg.g_timebase.den + round)
-                                / ctx->cfg.g_timebase.num / 10000000) + 1;
-          pkt.data.frame.duration = 0;
-        }
-
-        if (cpi->droppable)
-          pkt.data.frame.flags |= VPX_FRAME_IS_DROPPABLE;
-
-        /*if (cpi->output_partition)
-        {
-            int i;
-            const int num_partitions = 1;
-
-            pkt.data.frame.flags |= VPX_FRAME_IS_FRAGMENT;
-
-            for (i = 0; i < num_partitions; ++i)
-            {
-                pkt.data.frame.buf = cx_data;
-                pkt.data.frame.sz = cpi->partition_sz[i];
-                pkt.data.frame.partition_id = i;
-                // don't set the fragment bit for the last partition
-                if (i == (num_partitions - 1))
-                    pkt.data.frame.flags &= ~VPX_FRAME_IS_FRAGMENT;
-                vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
-                cx_data += cpi->partition_sz[i];
-                cx_data_sz -= cpi->partition_sz[i];
-            }
-        }
-        else*/
-        {
-          pkt.data.frame.buf = cx_data;
-          pkt.data.frame.sz  = size;
-          pkt.data.frame.partition_id = -1;
-          vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
-          cx_data += size;
-          cx_data_sz -= size;
-        }
-
-        // printf("timestamp: %lld, duration: %d\n", pkt->data.frame.pts, pkt->data.frame.duration);
-      }
-    }
-  }
-
-  return res;
-}
-
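Internally the encoder runs on a fixed 10 MHz clock, so timestamps are rescaled from the caller's timebase on the way in and rescaled back, with the precomputed round term, when packets are emitted. Worked example with a 1/30 timebase: a frame at pts == 30 enters as dst_time_stamp = 30 * 10000000 * 1 / 30 = 10,000,000 ticks, i.e. exactly one second; on output, round = 1000000 * 1 / 2 - 1 = 499999, so pkt.data.frame.pts = (10000000 * 30 + 499999) / 1 / 10000000 = 30, recovering the original value.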
-
-static const vpx_codec_cx_pkt_t *vp8e_get_cxdata(vpx_codec_alg_priv_t  *ctx,
-                                                 vpx_codec_iter_t      *iter) {
-  return vpx_codec_pkt_list_get(&ctx->pkt_list.head, iter);
-}
-
-static vpx_codec_err_t vp8e_set_reference(vpx_codec_alg_priv_t *ctx,
-                                          int ctr_id,
-                                          va_list args) {
-  vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
-
-  if (data) {
-    vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
-    YV12_BUFFER_CONFIG sd;
-
-    image2yuvconfig(&frame->img, &sd);
-    vp9_set_reference_enc(ctx->cpi, frame->frame_type, &sd);
-    return VPX_CODEC_OK;
-  } else
-    return VPX_CODEC_INVALID_PARAM;
-
-}
-
-static vpx_codec_err_t vp8e_get_reference(vpx_codec_alg_priv_t *ctx,
-                                          int ctr_id,
-                                          va_list args) {
-
-  vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
-
-  if (data) {
-    vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
-    YV12_BUFFER_CONFIG sd;
-
-    image2yuvconfig(&frame->img, &sd);
-    vp9_get_reference_enc(ctx->cpi, frame->frame_type, &sd);
-    return VPX_CODEC_OK;
-  } else
-    return VPX_CODEC_INVALID_PARAM;
-}
-
-static vpx_codec_err_t vp8e_set_previewpp(vpx_codec_alg_priv_t *ctx,
-                                          int ctr_id,
-                                          va_list args) {
-#if CONFIG_POSTPROC
-  vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *);
-  (void)ctr_id;
-
-  if (data) {
-    ctx->preview_ppcfg = *((vp8_postproc_cfg_t *)data);
-    return VPX_CODEC_OK;
-  } else
-    return VPX_CODEC_INVALID_PARAM;
-#else
-  (void)ctx;
-  (void)ctr_id;
-  (void)args;
-  return VPX_CODEC_INCAPABLE;
-#endif
-}
-
-
-static vpx_image_t *vp8e_get_preview(vpx_codec_alg_priv_t *ctx) {
-
-  YV12_BUFFER_CONFIG sd;
-  vp9_ppflags_t flags = {0};
-
-  if (ctx->preview_ppcfg.post_proc_flag) {
-    flags.post_proc_flag        = ctx->preview_ppcfg.post_proc_flag;
-    flags.deblocking_level      = ctx->preview_ppcfg.deblocking_level;
-    flags.noise_level           = ctx->preview_ppcfg.noise_level;
-  }
-
-  if (0 == vp9_get_preview_raw_frame(ctx->cpi, &sd, &flags)) {
-
-    /*
-    vpx_img_wrap(&ctx->preview_img, VPX_IMG_FMT_YV12,
-        sd.y_width + 2*VP8BORDERINPIXELS,
-        sd.y_height + 2*VP8BORDERINPIXELS,
-        1,
-        sd.buffer_alloc);
-    vpx_img_set_rect(&ctx->preview_img,
-        VP8BORDERINPIXELS, VP8BORDERINPIXELS,
-        sd.y_width, sd.y_height);
-        */
-
-    ctx->preview_img.bps = 12;
-    ctx->preview_img.planes[VPX_PLANE_Y] = sd.y_buffer;
-    ctx->preview_img.planes[VPX_PLANE_U] = sd.u_buffer;
-    ctx->preview_img.planes[VPX_PLANE_V] = sd.v_buffer;
-
-    if (sd.clrtype == REG_YUV)
-      ctx->preview_img.fmt = VPX_IMG_FMT_I420;
-    else
-      ctx->preview_img.fmt = VPX_IMG_FMT_VPXI420;
-
-    ctx->preview_img.x_chroma_shift = 1;
-    ctx->preview_img.y_chroma_shift = 1;
-
-    ctx->preview_img.d_w = sd.y_width;
-    ctx->preview_img.d_h = sd.y_height;
-    ctx->preview_img.stride[VPX_PLANE_Y] = sd.y_stride;
-    ctx->preview_img.stride[VPX_PLANE_U] = sd.uv_stride;
-    ctx->preview_img.stride[VPX_PLANE_V] = sd.uv_stride;
-    ctx->preview_img.w   = sd.y_width;
-    ctx->preview_img.h   = sd.y_height;
-
-    return &ctx->preview_img;
-  } else
-    return NULL;
-}
-
-static vpx_codec_err_t vp8e_update_entropy(vpx_codec_alg_priv_t *ctx,
-                                           int ctr_id,
-                                           va_list args) {
-  int update = va_arg(args, int);
-  vp9_update_entropy(ctx->cpi, update);
-  return VPX_CODEC_OK;
-
-}
-
-static vpx_codec_err_t vp8e_update_reference(vpx_codec_alg_priv_t *ctx,
-                                             int ctr_id,
-                                             va_list args) {
-  int update = va_arg(args, int);
-  vp9_update_reference(ctx->cpi, update);
-  return VPX_CODEC_OK;
-}
-
-static vpx_codec_err_t vp8e_use_reference(vpx_codec_alg_priv_t *ctx,
-                                          int ctr_id,
-                                          va_list args) {
-  int reference_flag = va_arg(args, int);
-  vp9_use_as_reference(ctx->cpi, reference_flag);
-  return VPX_CODEC_OK;
-}
-
-static vpx_codec_err_t vp8e_set_roi_map(vpx_codec_alg_priv_t *ctx,
-                                        int ctr_id,
-                                        va_list args) {
-  vpx_roi_map_t *data = va_arg(args, vpx_roi_map_t *);
-
-  if (data) {
-    vpx_roi_map_t *roi = (vpx_roi_map_t *)data;
-
-    if (!vp9_set_roimap(ctx->cpi, roi->roi_map, roi->rows, roi->cols,
-                        roi->delta_q, roi->delta_lf, roi->static_threshold))
-      return VPX_CODEC_OK;
-    else
-      return VPX_CODEC_INVALID_PARAM;
-  } else
-    return VPX_CODEC_INVALID_PARAM;
-}
-
-
-static vpx_codec_err_t vp8e_set_activemap(vpx_codec_alg_priv_t *ctx,
-                                          int ctr_id,
-                                          va_list args) {
-  vpx_active_map_t *data = va_arg(args, vpx_active_map_t *);
-
-  if (data) {
-
-    vpx_active_map_t *map = (vpx_active_map_t *)data;
-
-    if (!vp9_set_active_map(ctx->cpi, map->active_map, map->rows, map->cols))
-      return VPX_CODEC_OK;
-    else
-      return VPX_CODEC_INVALID_PARAM;
-  } else
-    return VPX_CODEC_INVALID_PARAM;
-}
-
-static vpx_codec_err_t vp8e_set_scalemode(vpx_codec_alg_priv_t *ctx,
-                                          int ctr_id,
-                                          va_list args) {
-
-  vpx_scaling_mode_t *data =  va_arg(args, vpx_scaling_mode_t *);
-
-  if (data) {
-    int res;
-    vpx_scaling_mode_t scalemode = *(vpx_scaling_mode_t *)data;
-    res = vp9_set_internal_size(ctx->cpi, scalemode.h_scaling_mode,
-                                scalemode.v_scaling_mode);
-
-    if (!res) {
-      /* Force the next frame to be a key frame so the new scaling mode takes effect. */
-      ctx->next_frame_flag |= FRAMEFLAGS_KEY;
-      return VPX_CODEC_OK;
-    } else
-      return VPX_CODEC_INVALID_PARAM;
-  } else
-    return VPX_CODEC_INVALID_PARAM;
-}
-
-
-static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] = {
-  {VP8_SET_REFERENCE,                 vp8e_set_reference},
-  {VP8_COPY_REFERENCE,                vp8e_get_reference},
-  {VP8_SET_POSTPROC,                  vp8e_set_previewpp},
-  {VP8E_UPD_ENTROPY,                  vp8e_update_entropy},
-  {VP8E_UPD_REFERENCE,                vp8e_update_reference},
-  {VP8E_USE_REFERENCE,                vp8e_use_reference},
-  {VP8E_SET_ROI_MAP,                  vp8e_set_roi_map},
-  {VP8E_SET_ACTIVEMAP,                vp8e_set_activemap},
-  {VP8E_SET_SCALEMODE,                vp8e_set_scalemode},
-  {VP8E_SET_ENCODING_MODE,            set_param},
-  {VP8E_SET_CPUUSED,                  set_param},
-  {VP8E_SET_NOISE_SENSITIVITY,        set_param},
-  {VP8E_SET_ENABLEAUTOALTREF,         set_param},
-  {VP8E_SET_SHARPNESS,                set_param},
-  {VP8E_SET_STATIC_THRESHOLD,         set_param},
-  {VP8E_SET_TOKEN_PARTITIONS,         set_param},
-  {VP8E_GET_LAST_QUANTIZER,           get_param},
-  {VP8E_GET_LAST_QUANTIZER_64,        get_param},
-  {VP8E_SET_ARNR_MAXFRAMES,           set_param},
-  {VP8E_SET_ARNR_STRENGTH,            set_param},
-  {VP8E_SET_ARNR_TYPE,                set_param},
-  {VP8E_SET_TUNING,                   set_param},
-  {VP8E_SET_CQ_LEVEL,                 set_param},
-  {VP8E_SET_MAX_INTRA_BITRATE_PCT,    set_param},
-  { -1, NULL},
-};
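
The library core searches this map linearly: an entry whose id is 0 acts as a wildcard (api1_ctrl_maps below relies on that) and the { -1, NULL} sentinel stops the walk. A simplified sketch of the dispatch, modelled loosely on vpx_codec_control_() and assuming the internal types from vpx/internal/vpx_codec_internal.h are in scope:

static vpx_codec_err_t dispatch_control(vpx_codec_ctrl_fn_map_t *maps,
                                        vpx_codec_alg_priv_t *priv,
                                        int ctrl_id, va_list args) {
  vpx_codec_ctrl_fn_map_t *entry;

  for (entry = maps; entry->fn; entry++) {
    if (!entry->ctrl_id || entry->ctrl_id == ctrl_id)
      return entry->fn(priv, ctrl_id, args);
  }
  return VPX_CODEC_ERROR;  /* no handler registered for this id */
}
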
-
-static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = {
-  {
-    0,
-    {
-      0,                  /* g_usage */
-      0,                  /* g_threads */
-      0,                  /* g_profile */
-
-      320,                /* g_width */
-      240,                /* g_height */
-      {1, 30},            /* g_timebase */
-
-      0,                  /* g_error_resilient */
-
-      VPX_RC_ONE_PASS,    /* g_pass */
-
-      0,                  /* g_lag_in_frames */
-
-      0,                  /* rc_dropframe_thresh */
-      0,                  /* rc_resize_allowed */
-      60,                 /* rc_resize_down_thresh */
-      30,                 /* rc_resize_up_thresh */
-
-      VPX_VBR,            /* rc_end_usage */
-#if VPX_ENCODER_ABI_VERSION > (1 + VPX_CODEC_ABI_VERSION)
-      {0},                /* rc_twopass_stats_in */
-#endif
-      256,                /* rc_target_bandwidth */
-      4,                  /* rc_min_quantizer */
-      63,                 /* rc_max_quantizer */
-      100,                /* rc_undershoot_pct */
-      100,                /* rc_overshoot_pct */
-
-      6000,               /* rc_max_buffer_size */
-      4000,               /* rc_buffer_initial_size */
-      5000,               /* rc_buffer_optimal_size */
-
-      50,                 /* rc_two_pass_vbrbias  */
-      0,                  /* rc_two_pass_vbrmin_section */
-      400,                /* rc_two_pass_vbrmax_section */
-
-      /* keyframing settings (kf) */
-      VPX_KF_AUTO,        /* g_kfmode */
-      0,                  /* kf_min_dist */
-      9999,               /* kf_max_dist */
-
-#if VPX_ENCODER_ABI_VERSION == (1 + VPX_CODEC_ABI_VERSION)
-      1,                  /* g_delete_first_pass_file */
-      "vp8.fpf"           /* first pass filename */
-#endif
-    }
-  },
-  { -1, {NOT_IMPLEMENTED}}
-};
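
This table is what vpx_codec_enc_config_default() copies from: the leading 0 is the usage selector and the struct carries the defaults. A hedged caller-side sketch; the public field names in vpx/vpx_encoder.h are g_w, g_h and rc_target_bitrate, which the older comments above refer to as g_width, g_height and rc_target_bandwidth:

#include "vpx/vpx_encoder.h"
#include "vpx/vp8cx.h"

static vpx_codec_err_t default_cfg_640x480(vpx_codec_enc_cfg_t *cfg) {
  vpx_codec_err_t res =
      vpx_codec_enc_config_default(vpx_codec_vp8_cx(), cfg, 0 /* usage */);

  if (res == VPX_CODEC_OK) {
    cfg->g_w = 640;                /* override the 320x240 defaults */
    cfg->g_h = 480;
    cfg->rc_target_bitrate = 512;  /* kbit/s; the table defaults to 256 */
  }
  return res;
}
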
-
-
-#ifndef VERSION_STRING
-#define VERSION_STRING
-#endif
-CODEC_INTERFACE(vpx_codec_vp8_cx) = {
-  "WebM Project VP8 Encoder" VERSION_STRING,
-  VPX_CODEC_INTERNAL_ABI_VERSION,
-  VPX_CODEC_CAP_ENCODER | VPX_CODEC_CAP_PSNR |
-  VPX_CODEC_CAP_OUTPUT_PARTITION,
-  /* vpx_codec_caps_t          caps; */
-  vp8e_init,          /* vpx_codec_init_fn_t       init; */
-  vp8e_destroy,       /* vpx_codec_destroy_fn_t    destroy; */
-  vp8e_ctf_maps,      /* vpx_codec_ctrl_fn_map_t  *ctrl_maps; */
-  NOT_IMPLEMENTED,    /* vpx_codec_get_mmap_fn_t   get_mmap; */
-  NOT_IMPLEMENTED,    /* vpx_codec_set_mmap_fn_t   set_mmap; */
-  {
-    NOT_IMPLEMENTED,    /* vpx_codec_peek_si_fn_t    peek_si; */
-    NOT_IMPLEMENTED,    /* vpx_codec_get_si_fn_t     get_si; */
-    NOT_IMPLEMENTED,    /* vpx_codec_decode_fn_t     decode; */
-    NOT_IMPLEMENTED,    /* vpx_codec_get_frame_fn_t  get_frame; */
-  },
-  {
-    vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t           cfg_maps; */
-    vp8e_encode,        /* vpx_codec_encode_fn_t             encode; */
-    vp8e_get_cxdata,    /* vpx_codec_get_cx_data_fn_t        get_cx_data; */
-    vp8e_set_config,    /* vpx_codec_enc_config_set_fn_t     cfg_set; */
-    NOT_IMPLEMENTED,    /* vpx_codec_get_global_headers_fn_t get_glob_hdrs; */
-    vp8e_get_preview,   /* vpx_codec_get_preview_frame_fn_t  get_preview; */
-  } /* encoder functions */
-};
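
Bringing the interface above to life goes through the generic encoder entry points. A hedged sketch of one encode/drain cycle; error handling is trimmed, and the context is assumed to have been opened with vpx_codec_enc_init(encoder, vpx_codec_vp8_cx(), &cfg, 0):

#include <stdio.h>
#include "vpx/vpx_encoder.h"
#include "vpx/vp8cx.h"

static int encode_one(vpx_codec_ctx_t *encoder, const vpx_image_t *img,
                      vpx_codec_pts_t pts, FILE *out) {
  vpx_codec_iter_t iter = NULL;
  const vpx_codec_cx_pkt_t *pkt;

  if (vpx_codec_encode(encoder, img, pts, 1, 0, VPX_DL_GOOD_QUALITY))
    return -1;

  /* Drain all packets produced for this frame. */
  while ((pkt = vpx_codec_get_cx_data(encoder, &iter)) != NULL) {
    if (pkt->kind == VPX_CODEC_CX_FRAME_PKT)
      fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, out);
  }
  return 0;
}
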
-
-
-#if CONFIG_EXPERIMENTAL
-
-CODEC_INTERFACE(vpx_codec_vp8x_cx) = {
-  "VP8 Experimental Encoder" VERSION_STRING,
-  VPX_CODEC_INTERNAL_ABI_VERSION,
-  VPX_CODEC_CAP_ENCODER | VPX_CODEC_CAP_PSNR,
-  /* vpx_codec_caps_t          caps; */
-  vp8e_exp_init,      /* vpx_codec_init_fn_t       init; */
-  vp8e_destroy,       /* vpx_codec_destroy_fn_t    destroy; */
-  vp8e_ctf_maps,      /* vpx_codec_ctrl_fn_map_t  *ctrl_maps; */
-  NOT_IMPLEMENTED,    /* vpx_codec_get_mmap_fn_t   get_mmap; */
-  NOT_IMPLEMENTED,    /* vpx_codec_set_mmap_fn_t   set_mmap; */
-  {
-    NOT_IMPLEMENTED,    /* vpx_codec_peek_si_fn_t    peek_si; */
-    NOT_IMPLEMENTED,    /* vpx_codec_get_si_fn_t     get_si; */
-    NOT_IMPLEMENTED,    /* vpx_codec_decode_fn_t     decode; */
-    NOT_IMPLEMENTED,    /* vpx_codec_get_frame_fn_t  get_frame; */
-  },
-  {
-    vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t           cfg_maps; */
-    vp8e_encode,        /* vpx_codec_encode_fn_t             encode; */
-    vp8e_get_cxdata,    /* vpx_codec_get_cx_data_fn_t        get_cx_data; */
-    vp8e_set_config,    /* vpx_codec_enc_config_set_fn_t     cfg_set; */
-    NOT_IMPLEMENTED,    /* vpx_codec_get_global_headers_fn_t get_glob_hdrs; */
-    vp8e_get_preview,   /* vpx_codec_get_preview_frame_fn_t  get_preview; */
-  } /* encoder functions */
-};
-#endif
-
-
-/*
- * BEGIN BACKWARDS COMPATIBILITY SHIM.
- */
-#define FORCE_KEY   2
-static vpx_codec_err_t api1_control(vpx_codec_alg_priv_t *ctx,
-                                    int                   ctrl_id,
-                                    va_list               args) {
-  vpx_codec_ctrl_fn_map_t *entry;
-
-  switch (ctrl_id) {
-    case VP8E_SET_FLUSHFLAG:
-      /* VP8 sample code did VP8E_SET_FLUSHFLAG followed by
-       * vpx_codec_get_cx_data() rather than vpx_codec_encode().
-       */
-      return vp8e_encode(ctx, NULL, 0, 0, 0, 0);
-    case VP8E_SET_FRAMETYPE:
-      ctx->base.enc.tbd |= FORCE_KEY;
-      return VPX_CODEC_OK;
-  }
-
-  for (entry = vp8e_ctf_maps; entry && entry->fn; entry++) {
-    if (!entry->ctrl_id || entry->ctrl_id == ctrl_id) {
-      return entry->fn(ctx, ctrl_id, args);
-    }
-  }
-
-  return VPX_CODEC_ERROR;
-}
-
-
-static vpx_codec_ctrl_fn_map_t api1_ctrl_maps[] = {
-  {0, api1_control},
-  { -1, NULL}
-};
-
-
-static vpx_codec_err_t api1_encode(vpx_codec_alg_priv_t  *ctx,
-                                   const vpx_image_t     *img,
-                                   vpx_codec_pts_t        pts,
-                                   unsigned long          duration,
-                                   vpx_enc_frame_flags_t  flags,
-                                   unsigned long          deadline) {
-  int force = ctx->base.enc.tbd;
-
-  ctx->base.enc.tbd = 0;
-  return vp8e_encode
-         (ctx,
-          img,
-          pts,
-          duration,
-          flags | ((force & FORCE_KEY) ? VPX_EFLAG_FORCE_KF : 0),
-          deadline);
-}
-
-
-vpx_codec_iface_t vpx_enc_vp8_algo = {
-  "WebM Project VP8 Encoder (Deprecated API)" VERSION_STRING,
-  VPX_CODEC_INTERNAL_ABI_VERSION,
-  VPX_CODEC_CAP_ENCODER,
-  /* vpx_codec_caps_t          caps; */
-  vp8e_init,          /* vpx_codec_init_fn_t       init; */
-  vp8e_destroy,       /* vpx_codec_destroy_fn_t    destroy; */
-  api1_ctrl_maps,     /* vpx_codec_ctrl_fn_map_t  *ctrl_maps; */
-  NOT_IMPLEMENTED,    /* vpx_codec_get_mmap_fn_t   get_mmap; */
-  NOT_IMPLEMENTED,    /* vpx_codec_set_mmap_fn_t   set_mmap; */
-  {NOT_IMPLEMENTED},  /* decoder functions */
-  {
-    vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t           cfg_maps; */
-    api1_encode,        /* vpx_codec_encode_fn_t             encode; */
-    vp8e_get_cxdata,    /* vpx_codec_get_cx_data_fn_t        get_cx_data; */
-    vp8e_set_config,    /* vpx_codec_enc_config_set_fn_t     cfg_set; */
-    NOT_IMPLEMENTED,    /* vpx_codec_get_global_headers_fn_t get_glob_hdrs; */
-    vp8e_get_preview,   /* vpx_codec_get_preview_frame_fn_t  get_preview; */
-  } /* encoder functions */
-};
--- a/vp8/vp8_dx_iface.c
+++ /dev/null
@@ -1,717 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <stdlib.h>
-#include <string.h>
-#include "vpx/vpx_decoder.h"
-#include "vpx/vp8dx.h"
-#include "vpx/internal/vpx_codec_internal.h"
-#include "vpx_version.h"
-#include "common/onyxd.h"
-#include "decoder/onyxd_int.h"
-
-#define VP8_CAP_POSTPROC (CONFIG_POSTPROC ? VPX_CODEC_CAP_POSTPROC : 0)
-typedef vpx_codec_stream_info_t  vp8_stream_info_t;
-
-/* Structures for handling memory allocations */
-typedef enum {
-  VP8_SEG_ALG_PRIV     = 256,
-  VP8_SEG_MAX
-} mem_seg_id_t;
-#define NELEMENTS(x) ((int)(sizeof(x)/sizeof(x[0])))
-
-static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, vpx_codec_flags_t);
-
-typedef struct {
-  unsigned int   id;
-  unsigned long  sz;
-  unsigned int   align;
-  unsigned int   flags;
-  unsigned long(*calc_sz)(const vpx_codec_dec_cfg_t *, vpx_codec_flags_t);
-} mem_req_t;
-
-static const mem_req_t vp8_mem_req_segs[] = {
-  {VP8_SEG_ALG_PRIV,    0, 8, VPX_CODEC_MEM_ZERO, vp8_priv_sz},
-  {VP8_SEG_MAX, 0, 0, 0, NULL}
-};
-
-struct vpx_codec_alg_priv {
-  vpx_codec_priv_t        base;
-  vpx_codec_mmap_t        mmaps[NELEMENTS(vp8_mem_req_segs) - 1];
-  vpx_codec_dec_cfg_t     cfg;
-  vp8_stream_info_t       si;
-  int                     defer_alloc;
-  int                     decoder_init;
-  VP9D_PTR                pbi;
-  int                     postproc_cfg_set;
-  vp8_postproc_cfg_t      postproc_cfg;
-#if CONFIG_POSTPROC_VISUALIZER
-  unsigned int            dbg_postproc_flag;
-  int                     dbg_color_ref_frame_flag;
-  int                     dbg_color_mb_modes_flag;
-  int                     dbg_color_b_modes_flag;
-  int                     dbg_display_mv_flag;
-#endif
-  vpx_image_t             img;
-  int                     img_setup;
-  int                     img_avail;
-};
-
-static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si,
-                                 vpx_codec_flags_t flags) {
-  /* Although this size is constant, we can't place it directly in the
-   * requested-segments list, because that list must be defined before the
-   * private type is (so that the number of memory maps is known).
-   */
-  (void)si;
-  return sizeof(vpx_codec_alg_priv_t);
-}
-
-
-static void vp8_mmap_dtor(vpx_codec_mmap_t *mmap) {
-  free(mmap->priv);
-}
-
-static vpx_codec_err_t vp8_mmap_alloc(vpx_codec_mmap_t *mmap) {
-  vpx_codec_err_t  res;
-  unsigned int   align;
-
-  align = mmap->align ? mmap->align - 1 : 0;
-
-  if (mmap->flags & VPX_CODEC_MEM_ZERO)
-    mmap->priv = calloc(1, mmap->sz + align);
-  else
-    mmap->priv = malloc(mmap->sz + align);
-
-  res = (mmap->priv) ? VPX_CODEC_OK : VPX_CODEC_MEM_ERROR;
-  mmap->base = (void *)((((uintptr_t)mmap->priv) + align) & ~(uintptr_t)align);
-  mmap->dtor = vp8_mmap_dtor;
-  return res;
-}
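
The arithmetic above over-allocates by align - 1 bytes and rounds the raw pointer up to the next power-of-two boundary. A self-contained illustration of the same mask trick:

#include <stdint.h>

/* Round p up to the next multiple of alignment (a power of two). */
static void *align_up(void *p, unsigned int alignment) {
  uintptr_t mask = (uintptr_t)alignment - 1;
  return (void *)(((uintptr_t)p + mask) & ~mask);
}

/* e.g. align_up((void *)0x1001, 8) == (void *)0x1008, while a pointer
 * that is already aligned comes back unchanged. */
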
-
-static vpx_codec_err_t vp8_validate_mmaps(const vp8_stream_info_t *si,
-                                          const vpx_codec_mmap_t *mmaps,
-                                          vpx_codec_flags_t init_flags) {
-  int i;
-  vpx_codec_err_t res = VPX_CODEC_OK;
-
-  for (i = 0; i < NELEMENTS(vp8_mem_req_segs) - 1; i++) {
-    /* Ensure the segment has been allocated */
-    if (!mmaps[i].base) {
-      res = VPX_CODEC_MEM_ERROR;
-      break;
-    }
-
-    /* Verify variable size segment is big enough for the current si. */
-    if (vp8_mem_req_segs[i].calc_sz) {
-      vpx_codec_dec_cfg_t cfg;
-
-      cfg.w = si->w;
-      cfg.h = si->h;
-
-      if (mmaps[i].sz < vp8_mem_req_segs[i].calc_sz(&cfg, init_flags)) {
-        res = VPX_CODEC_MEM_ERROR;
-        break;
-      }
-    }
-  }
-
-  return res;
-}
-
-static void vp8_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap) {
-  int i;
-
-  ctx->priv = mmap->base;
-  ctx->priv->sz = sizeof(*ctx->priv);
-  ctx->priv->iface = ctx->iface;
-  ctx->priv->alg_priv = mmap->base;
-
-  for (i = 0; i < NELEMENTS(ctx->priv->alg_priv->mmaps); i++)
-    ctx->priv->alg_priv->mmaps[i].id = vp8_mem_req_segs[i].id;
-
-  ctx->priv->alg_priv->mmaps[0] = *mmap;
-  ctx->priv->alg_priv->si.sz = sizeof(ctx->priv->alg_priv->si);
-  ctx->priv->init_flags = ctx->init_flags;
-
-  if (ctx->config.dec) {
-    /* Update the reference to the config structure to an internal copy. */
-    ctx->priv->alg_priv->cfg = *ctx->config.dec;
-    ctx->config.dec = &ctx->priv->alg_priv->cfg;
-  }
-}
-
-static void *mmap_lkup(vpx_codec_alg_priv_t *ctx, unsigned int id) {
-  int i;
-
-  for (i = 0; i < NELEMENTS(ctx->mmaps); i++)
-    if (ctx->mmaps[i].id == id)
-      return ctx->mmaps[i].base;
-
-  return NULL;
-}
-static void vp8_finalize_mmaps(vpx_codec_alg_priv_t *ctx) {
-  /* nothing to clean up */
-}
-
-static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx) {
-  vpx_codec_err_t        res = VPX_CODEC_OK;
-
-  /* This function only allocates space for the vpx_codec_alg_priv_t
-   * structure. More memory may be required at the time the stream
-   * information becomes known.
-   */
-  if (!ctx->priv) {
-    vpx_codec_mmap_t mmap;
-
-    mmap.id = vp8_mem_req_segs[0].id;
-    mmap.sz = sizeof(vpx_codec_alg_priv_t);
-    mmap.align = vp8_mem_req_segs[0].align;
-    mmap.flags = vp8_mem_req_segs[0].flags;
-
-    res = vp8_mmap_alloc(&mmap);
-
-    if (!res) {
-      vp8_init_ctx(ctx, &mmap);
-
-      ctx->priv->alg_priv->defer_alloc = 1;
-      /* post-processing level initialized to do nothing */
-    }
-  }
-
-  return res;
-}
-
-static vpx_codec_err_t vp8_destroy(vpx_codec_alg_priv_t *ctx) {
-  int i;
-
-  vp9_remove_decompressor(ctx->pbi);
-
-  for (i = NELEMENTS(ctx->mmaps) - 1; i >= 0; i--) {
-    if (ctx->mmaps[i].dtor)
-      ctx->mmaps[i].dtor(&ctx->mmaps[i]);
-  }
-
-  return VPX_CODEC_OK;
-}
-
-static vpx_codec_err_t vp8_peek_si(const uint8_t         *data,
-                                   unsigned int           data_sz,
-                                   vpx_codec_stream_info_t *si) {
-  vpx_codec_err_t res = VPX_CODEC_OK;
-
-  if (data + data_sz <= data)
-    res = VPX_CODEC_INVALID_PARAM;
-  else {
-    /* Parse the uncompressed part of the key frame header.
-     * 3 bytes:  version, frame type and first-partition offset
-     * 3 bytes:  sync code (0x9d, 0x01, 0x2a)
-     * 4 bytes:  image width and height in the lowest 14 bits
-     *           of each 2-byte value.
-     */
-    si->is_kf = 0;
-
-    if (data_sz >= 10 && !(data[0] & 0x01)) { /* I-Frame */
-      const uint8_t *c = data + 3;
-      si->is_kf = 1;
-
-      /* vet via sync code */
-      if (c[0] != 0x9d || c[1] != 0x01 || c[2] != 0x2a)
-        res = VPX_CODEC_UNSUP_BITSTREAM;
-
-      si->w = (c[3] | (c[4] << 8)) & 0x3fff;
-      si->h = (c[5] | (c[6] << 8)) & 0x3fff;
-
-      /*printf("w=%d, h=%d\n", si->w, si->h);*/
-      if (!(si->h | si->w))
-        res = VPX_CODEC_UNSUP_BITSTREAM;
-    } else
-      res = VPX_CODEC_UNSUP_BITSTREAM;
-  }
-
-  return res;
-}
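
Applications reach this parser through the generic vpx_codec_peek_stream_info() entry point. A hedged sketch; buf/buf_sz are assumed to hold one compressed frame, and si.sz must be set before the call:

#include <stdio.h>
#include "vpx/vpx_decoder.h"
#include "vpx/vp8dx.h"

static void probe_stream(const uint8_t *buf, unsigned int buf_sz) {
  vpx_codec_stream_info_t si;

  si.sz = sizeof(si);
  if (vpx_codec_peek_stream_info(vpx_codec_vp8_dx(), buf, buf_sz, &si) ==
      VPX_CODEC_OK)
    printf("key frame: %u, %ux%u\n", si.is_kf, si.w, si.h);
}
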
-
-static vpx_codec_err_t vp8_get_si(vpx_codec_alg_priv_t    *ctx,
-                                  vpx_codec_stream_info_t *si) {
-  unsigned int sz;
-
-  if (si->sz >= sizeof(vp8_stream_info_t))
-    sz = sizeof(vp8_stream_info_t);
-  else
-    sz = sizeof(vpx_codec_stream_info_t);
-
-  memcpy(si, &ctx->si, sz);
-  si->sz = sz;
-
-  return VPX_CODEC_OK;
-}
-
-
-static vpx_codec_err_t
-update_error_state(vpx_codec_alg_priv_t                 *ctx,
-                   const struct vpx_internal_error_info *error) {
-  vpx_codec_err_t res;
-
-  if ((res = error->error_code))
-    ctx->base.err_detail = error->has_detail
-                           ? error->detail
-                           : NULL;
-
-  return res;
-}
-
-static void yuvconfig2image(vpx_image_t               *img,
-                            const YV12_BUFFER_CONFIG  *yv12,
-                            void                      *user_priv) {
-  /* vpx_img_wrap() doesn't allow specifying independent strides for
-   * the Y, U, and V planes, nor other alignment adjustments that
-   * might be representable by a YV12_BUFFER_CONFIG, so we just
-   * initialize all the fields.
-   */
-  img->fmt = yv12->clrtype == REG_YUV ?
-             VPX_IMG_FMT_I420 : VPX_IMG_FMT_VPXI420;
-  img->w = yv12->y_stride;
-  img->h = (yv12->y_height + 2 * VP8BORDERINPIXELS + 15) & ~15;
-  img->d_w = yv12->y_width;
-  img->d_h = yv12->y_height;
-  img->x_chroma_shift = 1;
-  img->y_chroma_shift = 1;
-  img->planes[VPX_PLANE_Y] = yv12->y_buffer;
-  img->planes[VPX_PLANE_U] = yv12->u_buffer;
-  img->planes[VPX_PLANE_V] = yv12->v_buffer;
-  img->planes[VPX_PLANE_ALPHA] = NULL;
-  img->stride[VPX_PLANE_Y] = yv12->y_stride;
-  img->stride[VPX_PLANE_U] = yv12->uv_stride;
-  img->stride[VPX_PLANE_V] = yv12->uv_stride;
-  img->stride[VPX_PLANE_ALPHA] = yv12->y_stride;
-  img->bps = 12;
-  img->user_priv = user_priv;
-  img->img_data = yv12->buffer_alloc;
-  img->img_data_owner = 0;
-  img->self_allocd = 0;
-}
-
-static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t  *ctx,
-                                  const uint8_t         *data,
-                                  unsigned int            data_sz,
-                                  void                    *user_priv,
-                                  long                    deadline) {
-  vpx_codec_err_t res = VPX_CODEC_OK;
-
-  ctx->img_avail = 0;
-
-  /* Determine the stream parameters. Note that we rely on peek_si to
-   * validate that we have a buffer that does not wrap around the top
-   * of the heap.
-   */
-  if (!ctx->si.h)
-    res = ctx->base.iface->dec.peek_si(data, data_sz, &ctx->si);
-
-  /* Perform deferred allocations, if required */
-  if (!res && ctx->defer_alloc) {
-    int i;
-
-    for (i = 1; !res && i < NELEMENTS(ctx->mmaps); i++) {
-      vpx_codec_dec_cfg_t cfg;
-
-      cfg.w = ctx->si.w;
-      cfg.h = ctx->si.h;
-      ctx->mmaps[i].id = vp8_mem_req_segs[i].id;
-      ctx->mmaps[i].sz = vp8_mem_req_segs[i].sz;
-      ctx->mmaps[i].align = vp8_mem_req_segs[i].align;
-      ctx->mmaps[i].flags = vp8_mem_req_segs[i].flags;
-
-      if (!ctx->mmaps[i].sz)
-        ctx->mmaps[i].sz = vp8_mem_req_segs[i].calc_sz(&cfg,
-                                                       ctx->base.init_flags);
-
-      res = vp8_mmap_alloc(&ctx->mmaps[i]);
-    }
-
-    if (!res)
-      vp8_finalize_mmaps(ctx);
-
-    ctx->defer_alloc = 0;
-  }
-
-  /* Initialize the decoder instance on the first frame*/
-  if (!res && !ctx->decoder_init) {
-    res = vp8_validate_mmaps(&ctx->si, ctx->mmaps, ctx->base.init_flags);
-
-    if (!res) {
-      VP9D_CONFIG oxcf;
-      VP9D_PTR optr;
-
-      vp9_initialize_dec();
-
-      oxcf.Width = ctx->si.w;
-      oxcf.Height = ctx->si.h;
-      oxcf.Version = 9;
-      oxcf.postprocess = 0;
-      oxcf.max_threads = ctx->cfg.threads;
-      optr = vp9_create_decompressor(&oxcf);
-
-      /* If postprocessing was enabled by the application and a
-       * configuration has not been provided, default it.
-       */
-      if (!ctx->postproc_cfg_set
-          && (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)) {
-        ctx->postproc_cfg.post_proc_flag =
-          VP8_DEBLOCK | VP8_DEMACROBLOCK;
-        ctx->postproc_cfg.deblocking_level = 4;
-        ctx->postproc_cfg.noise_level = 0;
-      }
-
-      if (!optr)
-        res = VPX_CODEC_ERROR;
-      else
-        ctx->pbi = optr;
-    }
-
-    ctx->decoder_init = 1;
-  }
-
-  if (!res && ctx->pbi) {
-    YV12_BUFFER_CONFIG sd;
-    int64_t time_stamp = 0, time_end_stamp = 0;
-    vp9_ppflags_t flags = {0};
-
-    if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC) {
-      flags.post_proc_flag = ctx->postproc_cfg.post_proc_flag
-#if CONFIG_POSTPROC_VISUALIZER
-
-                             | ((ctx->dbg_color_ref_frame_flag != 0) ? VP9D_DEBUG_CLR_FRM_REF_BLKS : 0)
-                             | ((ctx->dbg_color_mb_modes_flag != 0) ? VP9D_DEBUG_CLR_BLK_MODES : 0)
-                             | ((ctx->dbg_color_b_modes_flag != 0) ? VP9D_DEBUG_CLR_BLK_MODES : 0)
-                             | ((ctx->dbg_display_mv_flag != 0) ? VP9D_DEBUG_DRAW_MV : 0)
-#endif
-;
-      flags.deblocking_level      = ctx->postproc_cfg.deblocking_level;
-      flags.noise_level           = ctx->postproc_cfg.noise_level;
-#if CONFIG_POSTPROC_VISUALIZER
-      flags.display_ref_frame_flag = ctx->dbg_color_ref_frame_flag;
-      flags.display_mb_modes_flag = ctx->dbg_color_mb_modes_flag;
-      flags.display_b_modes_flag  = ctx->dbg_color_b_modes_flag;
-      flags.display_mv_flag       = ctx->dbg_display_mv_flag;
-#endif
-    }
-
-    if (vp9_receive_compressed_data(ctx->pbi, data_sz, data, deadline)) {
-      VP9D_COMP *pbi = (VP9D_COMP *)ctx->pbi;
-      res = update_error_state(ctx, &pbi->common.error);
-    }
-
-    if (!res && 0 == vp9_get_raw_frame(ctx->pbi, &sd, &time_stamp,
-                                       &time_end_stamp, &flags)) {
-      yuvconfig2image(&ctx->img, &sd, user_priv);
-      ctx->img_avail = 1;
-    }
-  }
-
-  return res;
-}
-
-static vpx_image_t *vp8_get_frame(vpx_codec_alg_priv_t  *ctx,
-                                  vpx_codec_iter_t      *iter) {
-  vpx_image_t *img = NULL;
-
-  if (ctx->img_avail) {
-    /* iter acts as a flip-flop, so an image is only returned on the
-     * first call to get_frame.
-     */
-    if (!(*iter)) {
-      img = &ctx->img;
-      *iter = img;
-    }
-  }
-
-  return img;
-}
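
The flip-flop iterator is why callers always loop, even though this decoder yields at most one image per decode call. A hedged sketch against the public API; handle_image is a hypothetical consumer supplied by the application:

static int decode_and_drain(vpx_codec_ctx_t *decoder,
                            const uint8_t *frame_buf, unsigned int frame_sz) {
  vpx_codec_iter_t iter = NULL;
  vpx_image_t *img;

  if (vpx_codec_decode(decoder, frame_buf, frame_sz, NULL, 0))
    return -1;

  while ((img = vpx_codec_get_frame(decoder, &iter)) != NULL)
    handle_image(img);  /* hypothetical consumer of the I420 planes */
  return 0;
}
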
-
-
-static
-vpx_codec_err_t vp8_xma_get_mmap(const vpx_codec_ctx_t      *ctx,
-                                 vpx_codec_mmap_t           *mmap,
-                                 vpx_codec_iter_t           *iter) {
-  vpx_codec_err_t     res;
-  const mem_req_t  *seg_iter = *iter;
-
-  /* Get address of next segment request */
-  do {
-    if (!seg_iter)
-      seg_iter = vp8_mem_req_segs;
-    else if (seg_iter->id != VP8_SEG_MAX)
-      seg_iter++;
-
-    *iter = (vpx_codec_iter_t)seg_iter;
-
-    if (seg_iter->id != VP8_SEG_MAX) {
-      mmap->id = seg_iter->id;
-      mmap->sz = seg_iter->sz;
-      mmap->align = seg_iter->align;
-      mmap->flags = seg_iter->flags;
-
-      if (!seg_iter->sz)
-        mmap->sz = seg_iter->calc_sz(ctx->config.dec, ctx->init_flags);
-
-      res = VPX_CODEC_OK;
-    } else
-      res = VPX_CODEC_LIST_END;
-  } while (!mmap->sz && res != VPX_CODEC_LIST_END);
-
-  return res;
-}
-
-static vpx_codec_err_t vp8_xma_set_mmap(vpx_codec_ctx_t         *ctx,
-                                        const vpx_codec_mmap_t  *mmap) {
-  vpx_codec_err_t res = VPX_CODEC_MEM_ERROR;
-  int i, done;
-
-  if (!ctx->priv) {
-    if (mmap->id == VP8_SEG_ALG_PRIV) {
-      if (!ctx->priv) {
-        vp8_init_ctx(ctx, mmap);
-        res = VPX_CODEC_OK;
-      }
-    }
-  }
-
-  done = 1;
-
-  if (!res && ctx->priv->alg_priv) {
-    for (i = 0; i < NELEMENTS(ctx->priv->alg_priv->mmaps); i++) {
-      if (ctx->priv->alg_priv->mmaps[i].id == mmap->id)
-        if (!ctx->priv->alg_priv->mmaps[i].base) {
-          ctx->priv->alg_priv->mmaps[i] = *mmap;
-          res = VPX_CODEC_OK;
-        }
-
-      done &= (ctx->priv->alg_priv->mmaps[i].base != NULL);
-    }
-  }
-
-  if (done && !res) {
-    vp8_finalize_mmaps(ctx->priv->alg_priv);
-    res = ctx->iface->init(ctx);
-  }
-
-  return res;
-}
-
-static vpx_codec_err_t image2yuvconfig(const vpx_image_t   *img,
-                                       YV12_BUFFER_CONFIG  *yv12) {
-  vpx_codec_err_t        res = VPX_CODEC_OK;
-  yv12->y_buffer = img->planes[VPX_PLANE_Y];
-  yv12->u_buffer = img->planes[VPX_PLANE_U];
-  yv12->v_buffer = img->planes[VPX_PLANE_V];
-
-  yv12->y_width  = img->d_w;
-  yv12->y_height = img->d_h;
-  yv12->uv_width = yv12->y_width / 2;
-  yv12->uv_height = yv12->y_height / 2;
-
-  yv12->y_stride = img->stride[VPX_PLANE_Y];
-  yv12->uv_stride = img->stride[VPX_PLANE_U];
-
-  yv12->border  = (img->stride[VPX_PLANE_Y] - img->d_w) / 2;
-  yv12->clrtype = (img->fmt == VPX_IMG_FMT_VPXI420 ||
-                   img->fmt == VPX_IMG_FMT_VPXYV12);
-
-  return res;
-}
-
-
-static vpx_codec_err_t vp9_set_reference(vpx_codec_alg_priv_t *ctx,
-                                         int ctr_id,
-                                         va_list args) {
-  vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
-
-  if (data) {
-    vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
-    YV12_BUFFER_CONFIG sd;
-
-    image2yuvconfig(&frame->img, &sd);
-
-    return vp9_set_reference_dec(ctx->pbi, frame->frame_type, &sd);
-  } else
-    return VPX_CODEC_INVALID_PARAM;
-}
-
-static vpx_codec_err_t vp9_get_reference(vpx_codec_alg_priv_t *ctx,
-                                         int ctr_id,
-                                         va_list args) {
-  vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
-
-  if (data) {
-    vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
-    YV12_BUFFER_CONFIG sd;
-
-    image2yuvconfig(&frame->img, &sd);
-
-    return vp9_get_reference_dec(ctx->pbi, frame->frame_type, &sd);
-  } else
-    return VPX_CODEC_INVALID_PARAM;
-}
-
-static vpx_codec_err_t vp8_set_postproc(vpx_codec_alg_priv_t *ctx,
-                                        int ctr_id,
-                                        va_list args) {
-#if CONFIG_POSTPROC
-  vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *);
-
-  if (data) {
-    ctx->postproc_cfg_set = 1;
-    ctx->postproc_cfg = *((vp8_postproc_cfg_t *)data);
-    return VPX_CODEC_OK;
-  } else
-    return VPX_CODEC_INVALID_PARAM;
-#else
-  return VPX_CODEC_INCAPABLE;
-#endif
-}
-
-static vpx_codec_err_t vp8_set_dbg_options(vpx_codec_alg_priv_t *ctx,
-                                           int ctrl_id,
-                                           va_list args) {
-#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC
-  int data = va_arg(args, int);
-
-#define MAP(id, var) case id: var = data; break;
-
-  switch (ctrl_id) {
-      MAP(VP8_SET_DBG_COLOR_REF_FRAME,   ctx->dbg_color_ref_frame_flag);
-      MAP(VP8_SET_DBG_COLOR_MB_MODES,    ctx->dbg_color_mb_modes_flag);
-      MAP(VP8_SET_DBG_COLOR_B_MODES,     ctx->dbg_color_b_modes_flag);
-      MAP(VP8_SET_DBG_DISPLAY_MV,        ctx->dbg_display_mv_flag);
-  }
-
-  return VPX_CODEC_OK;
-#else
-  return VPX_CODEC_INCAPABLE;
-#endif
-}
-
-static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
-                                                int ctrl_id,
-                                                va_list args) {
-  int *update_info = va_arg(args, int *);
-  VP9D_COMP *pbi = (VP9D_COMP *)ctx->pbi;
-
-  if (update_info) {
-    *update_info = pbi->common.refresh_alt_ref_frame * (int) VP8_ALTR_FRAME
-                   + pbi->common.refresh_golden_frame * (int) VP8_GOLD_FRAME
-                   + pbi->common.refresh_last_frame * (int) VP8_LAST_FRAME;
-
-    return VPX_CODEC_OK;
-  } else
-    return VPX_CODEC_INVALID_PARAM;
-}
-
-
-static vpx_codec_err_t vp8_get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
-                                               int ctrl_id,
-                                               va_list args) {
-  int *corrupted = va_arg(args, int *);
-
-  if (corrupted) {
-    VP9D_COMP *pbi = (VP9D_COMP *)ctx->pbi;
-    *corrupted = pbi->common.frame_to_show->corrupted;
-
-    return VPX_CODEC_OK;
-  } else
-    return VPX_CODEC_INVALID_PARAM;
-}
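
Both query controls above return their result through a pointer out-argument. A hedged caller-side sketch, with the reference-frame flags from vpx/vp8.h:

#include <stdio.h>

static void report_decoder_state(vpx_codec_ctx_t *decoder) {
  int ref_updates = 0, corrupted = 0;

  vpx_codec_control(decoder, VP8D_GET_LAST_REF_UPDATES, &ref_updates);
  vpx_codec_control(decoder, VP8D_GET_FRAME_CORRUPTED, &corrupted);

  if (ref_updates & VP8_GOLD_FRAME)
    printf("last frame refreshed the golden buffer\n");
  if (corrupted)
    printf("shown frame had decode errors\n");
}
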
-
-static vpx_codec_ctrl_fn_map_t ctf_maps[] = {
-  {VP8_SET_REFERENCE,             vp9_set_reference},
-  {VP8_COPY_REFERENCE,            vp9_get_reference},
-  {VP8_SET_POSTPROC,              vp8_set_postproc},
-  {VP8_SET_DBG_COLOR_REF_FRAME,   vp8_set_dbg_options},
-  {VP8_SET_DBG_COLOR_MB_MODES,    vp8_set_dbg_options},
-  {VP8_SET_DBG_COLOR_B_MODES,     vp8_set_dbg_options},
-  {VP8_SET_DBG_DISPLAY_MV,        vp8_set_dbg_options},
-  {VP8D_GET_LAST_REF_UPDATES,     vp8_get_last_ref_updates},
-  {VP8D_GET_FRAME_CORRUPTED,      vp8_get_frame_corrupted},
-  { -1, NULL},
-};
-
-
-#ifndef VERSION_STRING
-#define VERSION_STRING
-#endif
-CODEC_INTERFACE(vpx_codec_vp8_dx) = {
-  "WebM Project VP8 Decoder" VERSION_STRING,
-  VPX_CODEC_INTERNAL_ABI_VERSION,
-  VPX_CODEC_CAP_DECODER | VP8_CAP_POSTPROC |
-  VPX_CODEC_CAP_INPUT_PARTITION,
-  /* vpx_codec_caps_t          caps; */
-  vp8_init,         /* vpx_codec_init_fn_t       init; */
-  vp8_destroy,      /* vpx_codec_destroy_fn_t    destroy; */
-  ctf_maps,         /* vpx_codec_ctrl_fn_map_t  *ctrl_maps; */
-  vp8_xma_get_mmap, /* vpx_codec_get_mmap_fn_t   get_mmap; */
-  vp8_xma_set_mmap, /* vpx_codec_set_mmap_fn_t   set_mmap; */
-  {
-    vp8_peek_si,      /* vpx_codec_peek_si_fn_t    peek_si; */
-    vp8_get_si,       /* vpx_codec_get_si_fn_t     get_si; */
-    vp8_decode,       /* vpx_codec_decode_fn_t     decode; */
-    vp8_get_frame,    /* vpx_codec_get_frame_fn_t  get_frame; */
-  },
-  {
-    /* encoder functions */
-    NOT_IMPLEMENTED,
-    NOT_IMPLEMENTED,
-    NOT_IMPLEMENTED,
-    NOT_IMPLEMENTED,
-    NOT_IMPLEMENTED,
-    NOT_IMPLEMENTED
-  }
-};
-
-/*
- * BEGIN BACKWARDS COMPATIBILITY SHIM.
- */
-vpx_codec_iface_t vpx_codec_vp8_algo = {
-  "WebM Project VP8 Decoder (Deprecated API)" VERSION_STRING,
-  VPX_CODEC_INTERNAL_ABI_VERSION,
-  VPX_CODEC_CAP_DECODER | VP8_CAP_POSTPROC,
-  /* vpx_codec_caps_t          caps; */
-  vp8_init,         /* vpx_codec_init_fn_t       init; */
-  vp8_destroy,      /* vpx_codec_destroy_fn_t    destroy; */
-  ctf_maps,         /* vpx_codec_ctrl_fn_map_t  *ctrl_maps; */
-  vp8_xma_get_mmap, /* vpx_codec_get_mmap_fn_t   get_mmap; */
-  vp8_xma_set_mmap, /* vpx_codec_set_mmap_fn_t   set_mmap; */
-  {
-    vp8_peek_si,      /* vpx_codec_peek_si_fn_t    peek_si; */
-    vp8_get_si,       /* vpx_codec_get_si_fn_t     get_si; */
-    vp8_decode,       /* vpx_codec_decode_fn_t     decode; */
-    vp8_get_frame,    /* vpx_codec_get_frame_fn_t  get_frame; */
-  },
-  {
-    /* encoder functions */
-    NOT_IMPLEMENTED,
-    NOT_IMPLEMENTED,
-    NOT_IMPLEMENTED,
-    NOT_IMPLEMENTED,
-    NOT_IMPLEMENTED,
-    NOT_IMPLEMENTED
-  }
-};
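
For completeness, a hedged sketch of opening the non-deprecated interface above through the generic decoder entry points:

static vpx_codec_err_t open_decoder(vpx_codec_ctx_t *decoder) {
  vpx_codec_dec_cfg_t cfg = { 0 };  /* threads/w/h: let the stream decide */

  return vpx_codec_dec_init(decoder, vpx_codec_vp8_dx(), &cfg, 0);
}
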
--- a/vp8/vp8cx.mk
+++ /dev/null
@@ -1,120 +1,0 @@
-##
-##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-##
-##  Use of this source code is governed by a BSD-style license
-##  that can be found in the LICENSE file in the root of the source
-##  tree. An additional intellectual property rights grant can be found
-##  in the file PATENTS.  All contributing project authors may
-##  be found in the AUTHORS file in the root of the source tree.
-##
-
-
-include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8_common.mk
-
-VP8_CX_EXPORTS += exports_enc
-
-VP8_CX_SRCS-yes += $(VP8_COMMON_SRCS-yes)
-VP8_CX_SRCS-no  += $(VP8_COMMON_SRCS-no)
-VP8_CX_SRCS_REMOVE-yes += $(VP8_COMMON_SRCS_REMOVE-yes)
-VP8_CX_SRCS_REMOVE-no  += $(VP8_COMMON_SRCS_REMOVE-no)
-
-ifeq ($(ARCH_ARM),yes)
-  include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8cx_arm.mk
-endif
-
-VP8_CX_SRCS-yes += vp8_cx_iface.c
-
-# encoder
-#INCLUDES += algo/vpx_common/vpx_mem/include
-#INCLUDES += common
-#INCLUDES += common
-#INCLUDES += common
-#INCLUDES += algo/vpx_ref/cpu_id/include
-#INCLUDES += common
-#INCLUDES += encoder
-
-VP8_CX_SRCS-yes += encoder/asm_enc_offsets.c
-VP8_CX_SRCS-yes += encoder/bitstream.c
-VP8_CX_SRCS-yes += encoder/boolhuff.c
-VP8_CX_SRCS-yes += encoder/dct.c
-VP8_CX_SRCS-yes += encoder/encodeframe.c
-VP8_CX_SRCS-yes += encoder/encodeintra.c
-VP8_CX_SRCS-yes += encoder/encodemb.c
-VP8_CX_SRCS-yes += encoder/encodemv.c
-VP8_CX_SRCS-yes += encoder/firstpass.c
-VP8_CX_SRCS-yes += encoder/generic/csystemdependent.c
-VP8_CX_SRCS-yes += encoder/block.h
-VP8_CX_SRCS-yes += encoder/boolhuff.h
-VP8_CX_SRCS-yes += encoder/bitstream.h
-VP8_CX_SRCS-yes += encoder/encodeintra.h
-VP8_CX_SRCS-yes += encoder/encodemb.h
-VP8_CX_SRCS-yes += encoder/encodemv.h
-VP8_CX_SRCS-yes += encoder/firstpass.h
-VP8_CX_SRCS-yes += encoder/lookahead.c
-VP8_CX_SRCS-yes += encoder/lookahead.h
-VP8_CX_SRCS-yes += encoder/mcomp.h
-VP8_CX_SRCS-yes += encoder/modecosts.h
-VP8_CX_SRCS-yes += encoder/onyx_int.h
-VP8_CX_SRCS-yes += encoder/psnr.h
-VP8_CX_SRCS-yes += encoder/quantize.h
-VP8_CX_SRCS-yes += encoder/ratectrl.h
-VP8_CX_SRCS-yes += encoder/rdopt.h
-VP8_CX_SRCS-yes += encoder/tokenize.h
-VP8_CX_SRCS-yes += encoder/treewriter.h
-VP8_CX_SRCS-yes += encoder/variance.h
-VP8_CX_SRCS-yes += encoder/mcomp.c
-VP8_CX_SRCS-yes += encoder/modecosts.c
-VP8_CX_SRCS-yes += encoder/onyx_if.c
-VP8_CX_SRCS-yes += encoder/picklpf.c
-VP8_CX_SRCS-yes += encoder/psnr.c
-VP8_CX_SRCS-yes += encoder/quantize.c
-VP8_CX_SRCS-yes += encoder/ratectrl.c
-VP8_CX_SRCS-yes += encoder/rdopt.c
-VP8_CX_SRCS-yes += encoder/sad_c.c
-VP8_CX_SRCS-yes += encoder/satd_c.c
-VP8_CX_SRCS-yes += encoder/segmentation.c
-VP8_CX_SRCS-yes += encoder/segmentation.h
-VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/ssim.c
-VP8_CX_SRCS-yes += encoder/tokenize.c
-VP8_CX_SRCS-yes += encoder/treewriter.c
-VP8_CX_SRCS-yes += encoder/variance_c.c
-ifeq ($(CONFIG_POSTPROC),yes)
-VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.h
-VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.c
-endif
-VP8_CX_SRCS-yes += encoder/temporal_filter.c
-VP8_CX_SRCS-yes += encoder/temporal_filter.h
-VP8_CX_SRCS-yes += encoder/mbgraph.c
-VP8_CX_SRCS-yes += encoder/mbgraph.h
-
-
-VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/mcomp_x86.h
-VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_x86.h
-VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/temporal_filter_x86.h
-VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/x86_csystemdependent.c
-VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/variance_mmx.c
-VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/variance_impl_mmx.asm
-VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/sad_mmx.asm
-VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/dct_mmx.asm
-VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/subtract_mmx.asm
-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/variance_sse2.c
-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/variance_impl_sse2.asm
-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/sad_sse2.asm
-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm
-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
-VP8_CX_SRCS-$(HAVE_SSE3) += encoder/x86/sad_sse3.asm
-VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/sad_ssse3.asm
-VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/variance_ssse3.c
-VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/variance_impl_ssse3.asm
-VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm
-VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/sad_sse4.asm
-VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.asm
-VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
-VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm
-VP8_CX_SRCS-$(ARCH_X86_64) += encoder/x86/ssim_opt.asm
-
-
-VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes))
--- a/vp8/vp8cx_arm.mk
+++ /dev/null
@@ -1,63 +1,0 @@
-##
-##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-##
-##  Use of this source code is governed by a BSD-style license
-##  that can be found in the LICENSE file in the root of the source
-##  tree. An additional intellectual property rights grant can be found
-##  in the file PATENTS.  All contributing project authors may
-##  be found in the AUTHORS file in the root of the source tree.
-##
-
-
-#VP8_CX_SRCS list is modified according to different platforms.
-
-#File list for arm
-# encoder
-VP8_CX_SRCS-$(ARCH_ARM)  += encoder/arm/arm_csystemdependent.c
-
-VP8_CX_SRCS-$(ARCH_ARM)  += encoder/arm/dct_arm.c
-VP8_CX_SRCS-$(ARCH_ARM)  += encoder/arm/dct_arm.h
-VP8_CX_SRCS-$(ARCH_ARM)  += encoder/arm/encodemb_arm.h
-VP8_CX_SRCS-$(ARCH_ARM)  += encoder/arm/quantize_arm.c
-VP8_CX_SRCS-$(ARCH_ARM)  += encoder/arm/quantize_arm.h
-VP8_CX_SRCS-$(ARCH_ARM)  += encoder/arm/variance_arm.c
-VP8_CX_SRCS-$(ARCH_ARM)  += encoder/arm/variance_arm.h
-
-#File list for armv5te
-# encoder
-VP8_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/boolhuff_arm.c
-VP8_CX_SRCS_REMOVE-$(HAVE_ARMV5TE)  += encoder/boolhuff.c
-VP8_CX_SRCS-$(HAVE_ARMV5TE)  += encoder/arm/armv5te/boolhuff_armv5te$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV5TE)  += encoder/arm/armv5te/vp8_packtokens_armv5$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV5TE)  += encoder/arm/armv5te/vp8_packtokens_mbrow_armv5$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV5TE)  += encoder/arm/armv5te/vp8_packtokens_partitions_armv5$(ASM)
-
-#File list for armv6
-# encoder
-VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_subtract_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_short_fdct4x4_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_fast_quantize_b_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_sad16x16_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_variance16x16_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_mse16x16_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_variance8x8_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/walsh_v6$(ASM)
-
-#File list for neon
-# encoder
-VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/fastquantizeb_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/picklpf_arm.c
-VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/sad8_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/sad16_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/shortfdct_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/subtract_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/variance_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/vp8_mse16x16_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/vp8_subpixelvariance8x8_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/vp8_subpixelvariance16x16_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/vp8_subpixelvariance16x16s_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/vp8_memcpy_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/vp8_shortwalsh4x4_neon$(ASM)
--- a/vp8/vp8dx.mk
+++ /dev/null
@@ -1,71 +1,0 @@
-##
-##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-##
-##  Use of this source code is governed by a BSD-style license
-##  that can be found in the LICENSE file in the root of the source
-##  tree. An additional intellectual property rights grant can be found
-##  in the file PATENTS.  All contributing project authors may
-##  be found in the AUTHORS file in the root of the source tree.
-##
-
-
-include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8_common.mk
-
-VP8_DX_EXPORTS += exports_dec
-
-VP8_DX_SRCS-yes += $(VP8_COMMON_SRCS-yes)
-VP8_DX_SRCS-no  += $(VP8_COMMON_SRCS-no)
-VP8_DX_SRCS_REMOVE-yes += $(VP8_COMMON_SRCS_REMOVE-yes)
-VP8_DX_SRCS_REMOVE-no  += $(VP8_COMMON_SRCS_REMOVE-no)
-
-ifeq ($(ARCH_ARM),yes)
-  include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8dx_arm.mk
-endif
-
-VP8_DX_SRCS-yes += vp8_dx_iface.c
-
-# common
-#define ARM
-#define DISABLE_THREAD
-
-#INCLUDES += algo/vpx_common/vpx_mem/include
-#INCLUDES += common
-#INCLUDES += common
-#INCLUDES += common
-#INCLUDES += common
-#INCLUDES += decoder
-
-
-
-# decoder
-#define ARM
-#define DISABLE_THREAD
-
-#INCLUDES += algo/vpx_common/vpx_mem/include
-#INCLUDES += common
-#INCLUDES += common
-#INCLUDES += common
-#INCLUDES += common
-#INCLUDES += decoder
-
-VP8_DX_SRCS-yes += decoder/asm_dec_offsets.c
-VP8_DX_SRCS-yes += decoder/dboolhuff.c
-VP8_DX_SRCS-yes += decoder/decodemv.c
-VP8_DX_SRCS-yes += decoder/decodframe.c
-VP8_DX_SRCS-yes += decoder/dequantize.c
-VP8_DX_SRCS-yes += decoder/detokenize.c
-VP8_DX_SRCS-yes += decoder/dboolhuff.h
-VP8_DX_SRCS-yes += decoder/decodemv.h
-VP8_DX_SRCS-yes += decoder/dequantize.h
-VP8_DX_SRCS-yes += decoder/detokenize.h
-VP8_DX_SRCS-yes += decoder/onyxd_int.h
-VP8_DX_SRCS-yes += decoder/treereader.h
-VP8_DX_SRCS-yes += decoder/onyxd_if.c
-VP8_DX_SRCS-yes += decoder/idct_blk.c
-
-VP8_DX_SRCS-yes := $(filter-out $(VP8_DX_SRCS_REMOVE-yes),$(VP8_DX_SRCS-yes))
-
-VP8_DX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += decoder/x86/x86_dsystemdependent.c
-VP8_DX_SRCS-$(HAVE_MMX) += decoder/x86/dequantize_mmx.asm
-VP8_DX_SRCS-$(HAVE_MMX) += decoder/x86/idct_blk_mmx.c
-VP8_DX_SRCS-$(HAVE_SSE2) += decoder/x86/idct_blk_sse2.c
--- a/vp8/vp8dx_arm.mk
+++ /dev/null
@@ -1,29 +1,0 @@
-##
-##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-##
-##  Use of this source code is governed by a BSD-style license
-##  that can be found in the LICENSE file in the root of the source
-##  tree. An additional intellectual property rights grant can be found
-##  in the file PATENTS.  All contributing project authors may
-##  be found in the AUTHORS file in the root of the source tree.
-##
-
-
-#VP8_DX_SRCS list is modified according to different platforms.
-
-VP8_DX_SRCS-$(ARCH_ARM)  += decoder/arm/dequantize_arm.c
-
-#File list for armv6
-VP8_DX_SRCS-$(HAVE_ARMV6)  += decoder/arm/armv6/dequant_dc_idct_v6$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV6)  += decoder/arm/armv6/dequant_idct_v6$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV6)  += decoder/arm/armv6/dequantize_v6$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV6)  += decoder/arm/armv6/idct_blk_v6.c
-
-#File list for neon
-VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_dequant_dc_full_2x_neon$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_dequant_dc_0_2x_neon$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/dequant_idct_neon$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_dequant_full_2x_neon$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_dequant_0_2x_neon$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/dequantizeb_neon$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_blk_neon.c
--- /dev/null
+++ b/vp9/common/alloccommon.c
@@ -1,0 +1,220 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "blockd.h"
+#include "vpx_mem/vpx_mem.h"
+#include "onyxc_int.h"
+#include "findnearmv.h"
+#include "entropymode.h"
+#include "entropymv.h"
+#include "systemdependent.h"
+
+
+void vp9_update_mode_info_border(VP9_COMMON *cpi, MODE_INFO *mi_base) {
+  int stride = cpi->mode_info_stride;
+  int i;
+
+  // Clear down top border row
+  vpx_memset(mi_base, 0, sizeof(MODE_INFO) * cpi->mode_info_stride);
+
+  // Clear left border column
+  for (i = 1; i < cpi->mb_rows + 1; i++) {
+    vpx_memset(&mi_base[i * stride], 0, sizeof(MODE_INFO));
+  }
+}
+
+void vp9_update_mode_info_in_image(VP9_COMMON *cpi, MODE_INFO *mi) {
+  int i, j;
+
+  // For each in-image mode_info element, set the in-image flag to 1.
+  for (i = 0; i < cpi->mb_rows; i++) {
+    for (j = 0; j < cpi->mb_cols; j++) {
+      mi->mbmi.mb_in_image = 1;
+      mi++;   // Next element in the row
+    }
+
+    mi++;       // Step over border element at start of next row
+  }
+}
+
+void vp9_de_alloc_frame_buffers(VP9_COMMON *oci) {
+  int i;
+
+  for (i = 0; i < NUM_YV12_BUFFERS; i++)
+    vp8_yv12_de_alloc_frame_buffer(&oci->yv12_fb[i]);
+
+  vp8_yv12_de_alloc_frame_buffer(&oci->temp_scale_frame);
+  vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer);
+
+  vpx_free(oci->above_context);
+  vpx_free(oci->mip);
+  vpx_free(oci->prev_mip);
+
+  oci->above_context = 0;
+  oci->mip = 0;
+  oci->prev_mip = 0;
+}
+
+int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) {
+  int i;
+
+  vp9_de_alloc_frame_buffers(oci);
+
+  /* our internal buffers are always multiples of 16 */
+  if ((width & 0xf) != 0)
+    width += 16 - (width & 0xf);
+
+  if ((height & 0xf) != 0)
+    height += 16 - (height & 0xf);
+
+  for (i = 0; i < NUM_YV12_BUFFERS; i++) {
+    oci->fb_idx_ref_cnt[i] = 0;
+    oci->yv12_fb[i].flags = 0;
+    if (vp8_yv12_alloc_frame_buffer(&oci->yv12_fb[i], width, height, VP8BORDERINPIXELS) < 0) {
+      vp9_de_alloc_frame_buffers(oci);
+      return 1;
+    }
+  }
+
+  oci->new_fb_idx = 0;
+  oci->lst_fb_idx = 1;
+  oci->gld_fb_idx = 2;
+  oci->alt_fb_idx = 3;
+
+  oci->fb_idx_ref_cnt[0] = 1;
+  oci->fb_idx_ref_cnt[1] = 1;
+  oci->fb_idx_ref_cnt[2] = 1;
+  oci->fb_idx_ref_cnt[3] = 1;
+
+  if (vp8_yv12_alloc_frame_buffer(&oci->temp_scale_frame,   width, 16, VP8BORDERINPIXELS) < 0) {
+    vp9_de_alloc_frame_buffers(oci);
+    return 1;
+  }
+
+  if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer, width, height, VP8BORDERINPIXELS) < 0) {
+    vp9_de_alloc_frame_buffers(oci);
+    return 1;
+  }
+
+  oci->mb_rows = height >> 4;
+  oci->mb_cols = width >> 4;
+  oci->MBs = oci->mb_rows * oci->mb_cols;
+  oci->mode_info_stride = oci->mb_cols + 1;
+  oci->mip = vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO));
+
+  if (!oci->mip) {
+    vp9_de_alloc_frame_buffers(oci);
+    return 1;
+  }
+
+  oci->mi = oci->mip + oci->mode_info_stride + 1;
+
+  /* allocate memory for last frame MODE_INFO array */
+
+  oci->prev_mip = vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO));
+
+  if (!oci->prev_mip) {
+    vp9_de_alloc_frame_buffers(oci);
+    return 1;
+  }
+
+  oci->prev_mi = oci->prev_mip + oci->mode_info_stride + 1;
+
+  oci->above_context = vpx_calloc(sizeof(ENTROPY_CONTEXT_PLANES) * oci->mb_cols, 1);
+
+  if (!oci->above_context) {
+    vp9_de_alloc_frame_buffers(oci);
+    return 1;
+  }
+
+  vp9_update_mode_info_border(oci, oci->mip);
+  vp9_update_mode_info_in_image(oci, oci->mi);
+
+  return 0;
+}
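
The width/height adjustment at the top of vp9_alloc_frame_buffers() is the usual round-up to a multiple of 16; an equivalent branch-free form, shown only as illustration:

static int align_to_16(int v) {
  return (v + 15) & ~15;  /* align_to_16(17) == 32, align_to_16(32) == 32 */
}

/* mb_rows and mb_cols then fall out of the aligned dimensions shifted
 * right by 4, exactly as the height >> 4 above computes. */
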
+void vp9_setup_version(VP9_COMMON *cm) {
+  if (cm->version & 0x4) {
+    if (!CONFIG_EXPERIMENTAL)
+      vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                         "Bitstream was created by an experimental "
+                         "encoder");
+    cm->experimental = 1;
+  }
+
+  switch (cm->version & 0x3) {
+    case 0:
+      cm->no_lpf = 0;
+      cm->filter_type = NORMAL_LOOPFILTER;
+      cm->use_bilinear_mc_filter = 0;
+      cm->full_pixel = 0;
+      break;
+    case 1:
+      cm->no_lpf = 0;
+      cm->filter_type = SIMPLE_LOOPFILTER;
+      cm->use_bilinear_mc_filter = 1;
+      cm->full_pixel = 0;
+      break;
+    case 2:
+    case 3:
+      cm->no_lpf = 1;
+      cm->filter_type = NORMAL_LOOPFILTER;
+      cm->use_bilinear_mc_filter = 1;
+      cm->full_pixel = 0;
+      break;
+      // Full pel only code deprecated in experimental code base
+      // case 3:
+      //    cm->no_lpf = 1;
+      //    cm->filter_type = SIMPLE_LOOPFILTER;
+      //    cm->use_bilinear_mc_filter = 1;
+      //    cm->full_pixel = 1;
+      //    break;
+  }
+}
+void vp9_create_common(VP9_COMMON *oci) {
+  vp9_machine_specific_config(oci);
+
+  vp9_init_mbmode_probs(oci);
+
+  vp9_default_bmode_probs(oci->fc.bmode_prob);
+
+  oci->txfm_mode = ONLY_4X4;
+  oci->mb_no_coeff_skip = 1;
+  oci->comp_pred_mode = HYBRID_PREDICTION;
+  oci->no_lpf = 0;
+  oci->filter_type = NORMAL_LOOPFILTER;
+  oci->use_bilinear_mc_filter = 0;
+  oci->full_pixel = 0;
+  oci->clr_type = REG_YUV;
+  oci->clamp_type = RECON_CLAMP_REQUIRED;
+
+  /* Initialise reference frame sign bias structure to defaults */
+  vpx_memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias));
+
+  /* Default disable buffer to buffer copying */
+  oci->copy_buffer_to_gf = 0;
+  oci->copy_buffer_to_arf = 0;
+  oci->kf_ymode_probs_update = 0;
+}
+
+void vp9_remove_common(VP9_COMMON *oci) {
+  vp9_de_alloc_frame_buffers(oci);
+}
+
+void vp9_initialize_common() {
+  vp9_coef_tree_initialize();
+
+  vp9_entropy_mode_init();
+
+  vp9_entropy_mv_init();
+}
--- /dev/null
+++ b/vp9/common/alloccommon.h
@@ -1,0 +1,26 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ALLOCCOMMON_H
+#define __INC_ALLOCCOMMON_H
+
+#include "onyxc_int.h"
+
+void vp9_create_common(VP9_COMMON *oci);
+void vp9_remove_common(VP9_COMMON *oci);
+void vp9_de_alloc_frame_buffers(VP9_COMMON *oci);
+int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height);
+void vp9_setup_version(VP9_COMMON *oci);
+
+void vp9_update_mode_info_border(VP9_COMMON *cpi, MODE_INFO *mi_base);
+void vp9_update_mode_info_in_image(VP9_COMMON *cpi, MODE_INFO *mi);
+
+#endif
--- /dev/null
+++ b/vp9/common/arm/arm_systemdependent.c
@@ -1,0 +1,92 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_ports/arm.h"
+#include "vp9/common/pragmas.h"
+#include "vp9/common/subpixel.h"
+#include "vp9/common/loopfilter.h"
+#include "vp9/common/recon.h"
+#include "vp9/common/idct.h"
+#include "vp9/common/onyxc_int.h"
+
+void vp9_arch_arm_common_init(VP9_COMMON *ctx) {
+#if CONFIG_RUNTIME_CPU_DETECT
+  VP9_COMMON_RTCD *rtcd = &ctx->rtcd;
+  int flags = arm_cpu_caps();
+  rtcd->flags = flags;
+
+  /* Override default functions with fastest ones for this CPU. */
+#if HAVE_ARMV5TE
+  if (flags & HAS_EDSP) {
+  }
+#endif
+
+// The commented functions need to be re-written for vpx.
+#if HAVE_ARMV6
+  if (flags & HAS_MEDIA) {
+    rtcd->subpix.sixtap16x16   = vp9_sixtap_predict16x16_armv6;
+    rtcd->subpix.sixtap8x8     = vp9_sixtap_predict8x8_armv6;
+    rtcd->subpix.sixtap8x4     = vp9_sixtap_predict8x4_armv6;
+    rtcd->subpix.sixtap4x4     = vp9_sixtap_predict_armv6;
+
+    rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_armv6;
+    rtcd->subpix.bilinear8x8   = vp9_bilinear_predict8x8_armv6;
+    rtcd->subpix.bilinear8x4   = vp9_bilinear_predict8x4_armv6;
+    rtcd->subpix.bilinear4x4   = vp9_bilinear_predict4x4_armv6;
+
+    // rtcd->idct.idct1        = vp9_short_idct4x4llm_1_v6;
+    // rtcd->idct.idct16       = vp9_short_idct4x4llm_v6_dual;
+    // rtcd->idct.iwalsh1      = vp9_short_inv_walsh4x4_1_v6;
+    // rtcd->idct.iwalsh16     = vp9_short_inv_walsh4x4_v6;
+
+    rtcd->recon.copy16x16   = vp9_copy_mem16x16_v6;
+    rtcd->recon.copy8x8     = vp9_copy_mem8x8_v6;
+    rtcd->recon.copy8x4     = vp9_copy_mem8x4_v6;
+    rtcd->recon.recon       = vp9_recon_b_armv6;
+    rtcd->recon.recon2      = vp9_recon2b_armv6;
+    rtcd->recon.recon4      = vp9_recon4b_armv6;
+  }
+#endif
+
+#if HAVE_ARMV7
+  if (flags & HAS_NEON) {
+    rtcd->subpix.sixtap16x16   = vp9_sixtap_predict16x16_neon;
+    rtcd->subpix.sixtap8x8     = vp9_sixtap_predict8x8_neon;
+    rtcd->subpix.sixtap8x4     = vp9_sixtap_predict8x4_neon;
+    rtcd->subpix.sixtap4x4     = vp9_sixtap_predict_neon;
+
+    rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_neon;
+    rtcd->subpix.bilinear8x8   = vp9_bilinear_predict8x8_neon;
+    rtcd->subpix.bilinear8x4   = vp9_bilinear_predict8x4_neon;
+    rtcd->subpix.bilinear4x4   = vp9_bilinear_predict4x4_neon;
+
+    // rtcd->idct.idct1        = vp9_short_idct4x4llm_1_neon;
+    // rtcd->idct.idct16       = vp9_short_idct4x4llm_neon;
+    // rtcd->idct.iwalsh1      = vp9_short_inv_walsh4x4_1_neon;
+    // rtcd->idct.iwalsh16     = vp9_short_inv_walsh4x4_neon;
+
+    rtcd->recon.copy16x16   = vp9_copy_mem16x16_neon;
+    rtcd->recon.copy8x8     = vp9_copy_mem8x8_neon;
+    rtcd->recon.copy8x4     = vp9_copy_mem8x4_neon;
+    rtcd->recon.recon       = vp9_recon_b_neon;
+    rtcd->recon.recon2      = vp9_recon2b_neon;
+    rtcd->recon.recon4      = vp9_recon4b_neon;
+    rtcd->recon.recon_mb    = vp9_recon_mb_neon;
+    rtcd->recon.build_intra_predictors_mby =
+      vp9_build_intra_predictors_mby_neon;
+    rtcd->recon.build_intra_predictors_mby_s =
+      vp9_build_intra_predictors_mby_s_neon;
+  }
+#endif
+
+#endif
+}
--- /dev/null
+++ b/vp9/common/arm/armv6/bilinearfilter_v6.asm
@@ -1,0 +1,237 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_filter_block2d_bil_first_pass_armv6|
+    EXPORT  |vp9_filter_block2d_bil_second_pass_armv6|
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+
+;-------------------------------------
+; r0    unsigned char  *src_ptr,
+; r1    unsigned short *dst_ptr,
+; r2    unsigned int    src_pitch,
+; r3    unsigned int    height,
+; stack unsigned int    width,
+; stack const short    *vp9_filter
+;-------------------------------------
+; The output is stored transposed in the output array to make the second-pass filtering easy.
+|vp9_filter_block2d_bil_first_pass_armv6| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r11, [sp, #40]                  ; vp9_filter address
+    ldr     r4, [sp, #36]                   ; width
+
+    mov     r12, r3                         ; outer-loop counter
+
+    add     r7, r2, r4                      ; preload next row
+    pld     [r0, r7]
+
+    sub     r2, r2, r4                      ; src increment for height loop
+
+    ldr     r5, [r11]                       ; load up filter coefficients
+
+    mov     r3, r3, lsl #1                  ; height*2
+    add     r3, r3, #2                      ; plus 2 to make the output stride 4-byte aligned, since height is actually (height+1)
+
+    mov     r11, r1                         ; save dst_ptr for each row
+
+    cmp     r5, #128                        ; if filter coef = 128, then skip the filter
+    beq     bil_null_1st_filter
+
+|bil_height_loop_1st_v6|
+    ldrb    r6, [r0]                        ; load source data
+    ldrb    r7, [r0, #1]
+    ldrb    r8, [r0, #2]
+    mov     lr, r4, lsr #2                  ; 4-in-parallel loop counter
+
+|bil_width_loop_1st_v6|
+    ldrb    r9, [r0, #3]
+    ldrb    r10, [r0, #4]
+
+    pkhbt   r6, r6, r7, lsl #16             ; src[1] | src[0]
+    pkhbt   r7, r7, r8, lsl #16             ; src[2] | src[1]
+
+    smuad   r6, r6, r5                      ; apply the filter
+    pkhbt   r8, r8, r9, lsl #16             ; src[3] | src[2]
+    smuad   r7, r7, r5
+    pkhbt   r9, r9, r10, lsl #16            ; src[4] | src[3]
+
+    smuad   r8, r8, r5
+    smuad   r9, r9, r5
+
+    add     r0, r0, #4
+    subs    lr, lr, #1
+
+    add     r6, r6, #0x40                   ; round_shift_and_clamp
+    add     r7, r7, #0x40
+    usat    r6, #16, r6, asr #7
+    usat    r7, #16, r7, asr #7
+
+    strh    r6, [r1], r3                    ; result is transposed and stored
+
+    add     r8, r8, #0x40                   ; round_shift_and_clamp
+    strh    r7, [r1], r3
+    add     r9, r9, #0x40
+    usat    r8, #16, r8, asr #7
+    usat    r9, #16, r9, asr #7
+
+    strh    r8, [r1], r3                    ; result is transposed and stored
+
+    ldrneb  r6, [r0]                        ; load source data
+    strh    r9, [r1], r3
+
+    ldrneb  r7, [r0, #1]
+    ldrneb  r8, [r0, #2]
+
+    bne     bil_width_loop_1st_v6
+
+    add     r0, r0, r2                      ; move to next input row
+    subs    r12, r12, #1
+
+    add     r9, r2, r4, lsl #1              ; adding back block width
+    pld     [r0, r9]                        ; preload next row
+
+    add     r11, r11, #2                    ; move over to next column
+    mov     r1, r11
+
+    bne     bil_height_loop_1st_v6
+
+    ldmia   sp!, {r4 - r11, pc}
+
+|bil_null_1st_filter|
+|bil_height_loop_null_1st|
+    mov     lr, r4, lsr #2                  ; loop counter
+
+|bil_width_loop_null_1st|
+    ldrb    r6, [r0]                        ; load data
+    ldrb    r7, [r0, #1]
+    ldrb    r8, [r0, #2]
+    ldrb    r9, [r0, #3]
+
+    strh    r6, [r1], r3                    ; store it to immediate buffer
+    add     r0, r0, #4
+    strh    r7, [r1], r3
+    subs    lr, lr, #1
+    strh    r8, [r1], r3
+    strh    r9, [r1], r3
+
+    bne     bil_width_loop_null_1st
+
+    subs    r12, r12, #1
+    add     r0, r0, r2                      ; move to next input line
+    add     r11, r11, #2                    ; move over to next column
+    mov     r1, r11
+
+    bne     bil_height_loop_null_1st
+
+    ldmia   sp!, {r4 - r11, pc}
+
+    ENDP  ; |vp9_filter_block2d_bil_first_pass_armv6|
+
+
+;---------------------------------
+; r0    unsigned short *src_ptr,
+; r1    unsigned char  *dst_ptr,
+; r2    int             dst_pitch,
+; r3    unsigned int    height,
+; stack unsigned int    width,
+; stack const short    *vp9_filter
+;---------------------------------
+|vp9_filter_block2d_bil_second_pass_armv6| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r11, [sp, #40]                  ; vp9_filter address
+    ldr     r4, [sp, #36]                   ; width
+
+    ldr     r5, [r11]                       ; load up filter coefficients
+    mov     r12, r4                         ; outer-loop counter = width, since we work on a transposed data matrix
+    mov     r11, r1
+
+    cmp     r5, #128                        ; if filter coef = 128, then skip the filter
+    beq     bil_null_2nd_filter
+
+|bil_height_loop_2nd|
+    ldr     r6, [r0]                        ; load the data
+    ldr     r8, [r0, #4]
+    ldrh    r10, [r0, #8]
+    mov     lr, r3, lsr #2                  ; loop counter
+
+|bil_width_loop_2nd|
+    pkhtb   r7, r6, r8                      ; src[1] | src[2]
+    pkhtb   r9, r8, r10                     ; src[3] | src[4]
+
+    smuad   r6, r6, r5                      ; apply filter
+    smuad   r8, r8, r5                      ; apply filter
+
+    subs    lr, lr, #1
+
+    smuadx  r7, r7, r5                      ; apply filter
+    smuadx  r9, r9, r5                      ; apply filter
+
+    add     r0, r0, #8
+
+    add     r6, r6, #0x40                   ; round_shift_and_clamp
+    add     r7, r7, #0x40
+    usat    r6, #8, r6, asr #7
+    usat    r7, #8, r7, asr #7
+    strb    r6, [r1], r2                    ; the result is transposed back and stored
+
+    add     r8, r8, #0x40                   ; round_shift_and_clamp
+    strb    r7, [r1], r2
+    add     r9, r9, #0x40
+    usat    r8, #8, r8, asr #7
+    usat    r9, #8, r9, asr #7
+    strb    r8, [r1], r2                    ; the result is transposed back and stored
+
+    ldrne   r6, [r0]                        ; load data
+    strb    r9, [r1], r2
+    ldrne   r8, [r0, #4]
+    ldrneh  r10, [r0, #8]
+
+    bne     bil_width_loop_2nd
+
+    subs    r12, r12, #1
+    add     r0, r0, #4                      ; update src for next row
+    add     r11, r11, #1
+    mov     r1, r11
+
+    bne     bil_height_loop_2nd
+    ldmia   sp!, {r4 - r11, pc}
+
+|bil_null_2nd_filter|
+|bil_height_loop_null_2nd|
+    mov     lr, r3, lsr #2
+
+|bil_width_loop_null_2nd|
+    ldr     r6, [r0], #4                    ; load data
+    subs    lr, lr, #1
+    ldr     r8, [r0], #4
+
+    strb    r6, [r1], r2                    ; store data
+    mov     r7, r6, lsr #16
+    strb    r7, [r1], r2
+    mov     r9, r8, lsr #16
+    strb    r8, [r1], r2
+    strb    r9, [r1], r2
+
+    bne     bil_width_loop_null_2nd
+
+    subs    r12, r12, #1
+    add     r0, r0, #4
+    add     r11, r11, #1
+    mov     r1, r11
+
+    bne     bil_height_loop_null_2nd
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP  ; |vp9_filter_block2d_bil_second_pass_armv6|
+
+    END
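
For orientation, the two routines above implement separable bilinear interpolation
as two passes: the first pass filters horizontally into a 16-bit intermediate
buffer stored transposed, and the second pass runs the same inner loop over that
buffer (so it effectively filters vertically) and saturates to 8 bits, which
transposes the block back. A minimal C sketch of the arithmetic, assuming 7-bit
taps that sum to 128 (the coef==128 test above short-circuits the identity
filter); function and parameter names here are illustrative, not from the source:

    #include <stdint.h>

    /* First pass: filters (out_height + 1) rows so the vertical pass has
       one extra sample per column; results are stored transposed. */
    static void bil_first_pass_c(const uint8_t *src, uint16_t *dst,
                                 unsigned src_pitch, unsigned out_height,
                                 unsigned width, const int16_t *filter) {
        unsigned rows = out_height + 1;
        for (unsigned r = 0; r < rows; r++) {
            for (unsigned c = 0; c < width; c++) {
                int sum = src[c] * filter[0] + src[c + 1] * filter[1];
                dst[c * rows + r] = (uint16_t)((sum + 0x40) >> 7); /* round, >>7 */
            }
            src += src_pitch;
        }
    }

    /* Second pass: walking the transposed rows walks output columns;
       taps are non-negative, so only the upper clamp is needed. */
    static void bil_second_pass_c(const uint16_t *src, uint8_t *dst,
                                  int dst_pitch, unsigned out_height,
                                  unsigned width, const int16_t *filter) {
        unsigned rows = out_height + 1;
        for (unsigned c = 0; c < width; c++) {
            for (unsigned r = 0; r < out_height; r++) {
                int v = (src[r] * filter[0] + src[r + 1] * filter[1] + 0x40) >> 7;
                dst[r * dst_pitch + c] = (uint8_t)(v > 255 ? 255 : v);
            }
            src += rows;    /* next transposed column */
        }
    }
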
--- /dev/null
+++ b/vp9/common/arm/armv6/copymem16x16_v6.asm
@@ -1,0 +1,186 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_copy_mem16x16_v6|
+    ; ARM
+    ; REQUIRE8
+    ; PRESERVE8
+
+    AREA    Block, CODE, READONLY ; name this block of code
+;void vp9_copy_mem16x16_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp9_copy_mem16x16_v6| PROC
+    stmdb       sp!, {r4 - r7}
+    ;push   {r4-r7}
+
+    ;preload
+    pld     [r0, #31]                ; preload for next 16x16 block
+
+    ands    r4, r0, #15
+    beq     copy_mem16x16_fast
+
+    ands    r4, r0, #7
+    beq     copy_mem16x16_8
+
+    ands    r4, r0, #3
+    beq     copy_mem16x16_4
+
+    ;copy one byte each time
+    ldrb    r4, [r0]
+    ldrb    r5, [r0, #1]
+    ldrb    r6, [r0, #2]
+    ldrb    r7, [r0, #3]
+
+    mov     r12, #16
+
+copy_mem16x16_1_loop
+    strb    r4, [r2]
+    strb    r5, [r2, #1]
+    strb    r6, [r2, #2]
+    strb    r7, [r2, #3]
+
+    ldrb    r4, [r0, #4]
+    ldrb    r5, [r0, #5]
+    ldrb    r6, [r0, #6]
+    ldrb    r7, [r0, #7]
+
+    subs    r12, r12, #1
+
+    strb    r4, [r2, #4]
+    strb    r5, [r2, #5]
+    strb    r6, [r2, #6]
+    strb    r7, [r2, #7]
+
+    ldrb    r4, [r0, #8]
+    ldrb    r5, [r0, #9]
+    ldrb    r6, [r0, #10]
+    ldrb    r7, [r0, #11]
+
+    strb    r4, [r2, #8]
+    strb    r5, [r2, #9]
+    strb    r6, [r2, #10]
+    strb    r7, [r2, #11]
+
+    ldrb    r4, [r0, #12]
+    ldrb    r5, [r0, #13]
+    ldrb    r6, [r0, #14]
+    ldrb    r7, [r0, #15]
+
+    add     r0, r0, r1
+
+    strb    r4, [r2, #12]
+    strb    r5, [r2, #13]
+    strb    r6, [r2, #14]
+    strb    r7, [r2, #15]
+
+    add     r2, r2, r3
+
+    ldrneb  r4, [r0]
+    ldrneb  r5, [r0, #1]
+    ldrneb  r6, [r0, #2]
+    ldrneb  r7, [r0, #3]
+
+    pld     [r0, #31]               ; preload for next 16x16 block
+
+    bne     copy_mem16x16_1_loop
+
+    ldmia       sp!, {r4 - r7}
+    ;pop        {r4-r7}
+    mov     pc, lr
+
+;copy 4 bytes each time
+copy_mem16x16_4
+    ldr     r4, [r0]
+    ldr     r5, [r0, #4]
+    ldr     r6, [r0, #8]
+    ldr     r7, [r0, #12]
+
+    mov     r12, #16
+
+copy_mem16x16_4_loop
+    subs    r12, r12, #1
+    add     r0, r0, r1
+
+    str     r4, [r2]
+    str     r5, [r2, #4]
+    str     r6, [r2, #8]
+    str     r7, [r2, #12]
+
+    add     r2, r2, r3
+
+    ldrne   r4, [r0]
+    ldrne   r5, [r0, #4]
+    ldrne   r6, [r0, #8]
+    ldrne   r7, [r0, #12]
+
+    pld     [r0, #31]               ; preload for next 16x16 block
+
+    bne     copy_mem16x16_4_loop
+
+    ldmia       sp!, {r4 - r7}
+    ;pop        {r4-r7}
+    mov     pc, lr
+
+;copy 8 bytes each time
+copy_mem16x16_8
+    sub     r1, r1, #16
+    sub     r3, r3, #16
+
+    mov     r12, #16
+
+copy_mem16x16_8_loop
+    ldmia   r0!, {r4-r5}
+    ;ldm        r0, {r4-r5}
+    ldmia   r0!, {r6-r7}
+
+    add     r0, r0, r1
+
+    stmia   r2!, {r4-r5}
+    subs    r12, r12, #1
+    ;stm        r2, {r4-r5}
+    stmia   r2!, {r6-r7}
+
+    add     r2, r2, r3
+
+    pld     [r0, #31]               ; preload for next 16x16 block
+    bne     copy_mem16x16_8_loop
+
+    ldmia       sp!, {r4 - r7}
+    ;pop        {r4-r7}
+    mov     pc, lr
+
+;copy 16 bytes each time
+copy_mem16x16_fast
+    ;sub        r1, r1, #16
+    ;sub        r3, r3, #16
+
+    mov     r12, #16
+
+copy_mem16x16_fast_loop
+    ldmia   r0, {r4-r7}
+    ;ldm        r0, {r4-r7}
+    add     r0, r0, r1
+
+    subs    r12, r12, #1
+    stmia   r2, {r4-r7}
+    ;stm        r2, {r4-r7}
+    add     r2, r2, r3
+
+    pld     [r0, #31]               ; preload for next 16x16 block
+    bne     copy_mem16x16_fast_loop
+
+    ldmia       sp!, {r4 - r7}
+    ;pop        {r4-r7}
+    mov     pc, lr
+
+    ENDP  ; |vp9_copy_mem16x16_v6|
+
+    END
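
Functionally this (like the 8x4 and 8x8 variants that follow) is just a strided
block copy; the alignment tests at entry exist because ARMv6 word and
multi-register loads want naturally aligned addresses, so the code picks the
widest transfer the source alignment allows (16, 8, 4 or 1 byte per access) and
appears to assume the destination shares that alignment. A hedged C equivalent
with an illustrative name:

    #include <stdint.h>
    #include <string.h>

    /* What vp9_copy_mem16x16_v6 computes; the asm unrolls the memcpy into
       byte/word/multi-word transfers chosen from (src & 15/7/3). */
    static void copy_mem16x16_c(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride) {
        for (int r = 0; r < 16; r++) {
            memcpy(dst, src, 16);
            src += src_stride;
            dst += dst_stride;
        }
    }
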
--- /dev/null
+++ b/vp9/common/arm/armv6/copymem8x4_v6.asm
@@ -1,0 +1,128 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_copy_mem8x4_v6|
+    ; ARM
+    ; REQUIRE8
+    ; PRESERVE8
+
+    AREA    Block, CODE, READONLY ; name this block of code
+;void vp9_copy_mem8x4_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp9_copy_mem8x4_v6| PROC
+    ;push   {r4-r5}
+    stmdb  sp!, {r4-r5}
+
+    ;preload
+    pld     [r0]
+    pld     [r0, r1]
+    pld     [r0, r1, lsl #1]
+
+    ands    r4, r0, #7
+    beq     copy_mem8x4_fast
+
+    ands    r4, r0, #3
+    beq     copy_mem8x4_4
+
+    ;copy 1 byte each time
+    ldrb    r4, [r0]
+    ldrb    r5, [r0, #1]
+
+    mov     r12, #4
+
+copy_mem8x4_1_loop
+    strb    r4, [r2]
+    strb    r5, [r2, #1]
+
+    ldrb    r4, [r0, #2]
+    ldrb    r5, [r0, #3]
+
+    subs    r12, r12, #1
+
+    strb    r4, [r2, #2]
+    strb    r5, [r2, #3]
+
+    ldrb    r4, [r0, #4]
+    ldrb    r5, [r0, #5]
+
+    strb    r4, [r2, #4]
+    strb    r5, [r2, #5]
+
+    ldrb    r4, [r0, #6]
+    ldrb    r5, [r0, #7]
+
+    add     r0, r0, r1
+
+    strb    r4, [r2, #6]
+    strb    r5, [r2, #7]
+
+    add     r2, r2, r3
+
+    ldrneb  r4, [r0]
+    ldrneb  r5, [r0, #1]
+
+    bne     copy_mem8x4_1_loop
+
+    ldmia       sp!, {r4 - r5}
+    ;pop        {r4-r5}
+    mov     pc, lr
+
+;copy 4 bytes each time
+copy_mem8x4_4
+    ldr     r4, [r0]
+    ldr     r5, [r0, #4]
+
+    mov     r12, #4
+
+copy_mem8x4_4_loop
+    subs    r12, r12, #1
+    add     r0, r0, r1
+
+    str     r4, [r2]
+    str     r5, [r2, #4]
+
+    add     r2, r2, r3
+
+    ldrne   r4, [r0]
+    ldrne   r5, [r0, #4]
+
+    bne     copy_mem8x4_4_loop
+
+    ldmia  sp!, {r4-r5}
+    ;pop        {r4-r5}
+    mov     pc, lr
+
+;copy 8 bytes each time
+copy_mem8x4_fast
+    ;sub        r1, r1, #8
+    ;sub        r3, r3, #8
+
+    mov     r12, #4
+
+copy_mem8x4_fast_loop
+    ldmia   r0, {r4-r5}
+    ;ldm        r0, {r4-r5}
+    add     r0, r0, r1
+
+    subs    r12, r12, #1
+    stmia   r2, {r4-r5}
+    ;stm        r2, {r4-r5}
+    add     r2, r2, r3
+
+    bne     copy_mem8x4_fast_loop
+
+    ldmia  sp!, {r4-r5}
+    ;pop        {r4-r5}
+    mov     pc, lr
+
+    ENDP  ; |vp9_copy_mem8x4_v6|
+
+    END
--- /dev/null
+++ b/vp9/common/arm/armv6/copymem8x8_v6.asm
@@ -1,0 +1,128 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_copy_mem8x8_v6|
+    ; ARM
+    ; REQUIRE8
+    ; PRESERVE8
+
+    AREA    Block, CODE, READONLY ; name this block of code
+;void vp9_copy_mem8x8_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp9_copy_mem8x8_v6| PROC
+    ;push   {r4-r5}
+    stmdb  sp!, {r4-r5}
+
+    ;preload
+    pld     [r0]
+    pld     [r0, r1]
+    pld     [r0, r1, lsl #1]
+
+    ands    r4, r0, #7
+    beq     copy_mem8x8_fast
+
+    ands    r4, r0, #3
+    beq     copy_mem8x8_4
+
+    ;copy 1 byte each time
+    ldrb    r4, [r0]
+    ldrb    r5, [r0, #1]
+
+    mov     r12, #8
+
+copy_mem8x8_1_loop
+    strb    r4, [r2]
+    strb    r5, [r2, #1]
+
+    ldrb    r4, [r0, #2]
+    ldrb    r5, [r0, #3]
+
+    subs    r12, r12, #1
+
+    strb    r4, [r2, #2]
+    strb    r5, [r2, #3]
+
+    ldrb    r4, [r0, #4]
+    ldrb    r5, [r0, #5]
+
+    strb    r4, [r2, #4]
+    strb    r5, [r2, #5]
+
+    ldrb    r4, [r0, #6]
+    ldrb    r5, [r0, #7]
+
+    add     r0, r0, r1
+
+    strb    r4, [r2, #6]
+    strb    r5, [r2, #7]
+
+    add     r2, r2, r3
+
+    ldrneb  r4, [r0]
+    ldrneb  r5, [r0, #1]
+
+    bne     copy_mem8x8_1_loop
+
+    ldmia       sp!, {r4 - r5}
+    ;pop        {r4-r5}
+    mov     pc, lr
+
+;copy 4 bytes each time
+copy_mem8x8_4
+    ldr     r4, [r0]
+    ldr     r5, [r0, #4]
+
+    mov     r12, #8
+
+copy_mem8x8_4_loop
+    subs    r12, r12, #1
+    add     r0, r0, r1
+
+    str     r4, [r2]
+    str     r5, [r2, #4]
+
+    add     r2, r2, r3
+
+    ldrne   r4, [r0]
+    ldrne   r5, [r0, #4]
+
+    bne     copy_mem8x8_4_loop
+
+    ldmia       sp!, {r4 - r5}
+    ;pop        {r4-r5}
+    mov     pc, lr
+
+;copy 8 bytes each time
+copy_mem8x8_fast
+    ;sub        r1, r1, #8
+    ;sub        r3, r3, #8
+
+    mov     r12, #8
+
+copy_mem8x8_fast_loop
+    ldmia   r0, {r4-r5}
+    ;ldm        r0, {r4-r5}
+    add     r0, r0, r1
+
+    subs    r12, r12, #1
+    stmia   r2, {r4-r5}
+    ;stm        r2, {r4-r5}
+    add     r2, r2, r3
+
+    bne     copy_mem8x8_fast_loop
+
+    ldmia  sp!, {r4-r5}
+    ;pop        {r4-r5}
+    mov     pc, lr
+
+    ENDP  ; |vp9_copy_mem8x8_v6|
+
+    END
--- /dev/null
+++ b/vp9/common/arm/armv6/dc_only_idct_add_v6.asm
@@ -1,0 +1,67 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+    EXPORT  |vp8_dc_only_idct_add_v6|
+
+    AREA    |.text|, CODE, READONLY
+
+;void vp8_dc_only_idct_add_v6(short input_dc, unsigned char *pred_ptr,
+;                             unsigned char *dst_ptr, int pitch, int stride)
+; r0  input_dc
+; r1  pred_ptr
+; r2  dst_ptr
+; r3  pitch
+; sp  stride
+
+|vp8_dc_only_idct_add_v6| PROC
+    stmdb       sp!, {r4 - r7, lr}
+
+    add         r0, r0, #4                ; input_dc += 4
+    ldr         r12, c0x0000FFFF
+    ldr         r4, [r1], r3
+    ldr         r6, [r1], r3
+    and         r0, r12, r0, asr #3       ; a1 = ((input_dc + 4) >> 3) & 0xFFFF
+    ldr         lr, [sp, #20]
+    orr         r0, r0, r0, lsl #16       ; a1 | a1
+
+    uxtab16     r5, r0, r4                ; a1+2 | a1+0
+    uxtab16     r4, r0, r4, ror #8        ; a1+3 | a1+1
+    uxtab16     r7, r0, r6
+    uxtab16     r6, r0, r6, ror #8
+    usat16      r5, #8, r5
+    usat16      r4, #8, r4
+    usat16      r7, #8, r7
+    usat16      r6, #8, r6
+    orr         r5, r5, r4, lsl #8
+    orr         r7, r7, r6, lsl #8
+    ldr         r4, [r1], r3
+    ldr         r6, [r1]
+    str         r5, [r2], lr
+    str         r7, [r2], lr
+
+    uxtab16     r5, r0, r4
+    uxtab16     r4, r0, r4, ror #8
+    uxtab16     r7, r0, r6
+    uxtab16     r6, r0, r6, ror #8
+    usat16      r5, #8, r5
+    usat16      r4, #8, r4
+    usat16      r7, #8, r7
+    usat16      r6, #8, r6
+    orr         r5, r5, r4, lsl #8
+    orr         r7, r7, r6, lsl #8
+    str         r5, [r2], lr
+    str         r7, [r2]
+
+    ldmia       sp!, {r4 - r7, pc}
+
+    ENDP  ; |vp8_dc_only_idct_add_v6|
+
+; Constant Pool
+c0x0000FFFF DCD 0x0000FFFF
+    END
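
In scalar terms the routine above computes a1 = (input_dc + 4) >> 3 once and
adds it, with 8-bit saturation (the uxtab16/usat16 pairs, two pixels at a time),
to every predictor pixel of the 4x4 block. A C sketch of that arithmetic:

    #include <stdint.h>

    /* Scalar equivalent of vp8_dc_only_idct_add_v6: one rounded, shifted
       DC term added to each 4x4 predictor pixel, saturated to 8 bits. */
    static void dc_only_idct_add_c(short input_dc, const uint8_t *pred_ptr,
                                   uint8_t *dst_ptr, int pitch, int stride) {
        int a1 = (input_dc + 4) >> 3;
        for (int r = 0; r < 4; r++) {
            for (int c = 0; c < 4; c++) {
                int v = pred_ptr[c] + a1;
                dst_ptr[c] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
            }
            pred_ptr += pitch;
            dst_ptr += stride;
        }
    }
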
--- /dev/null
+++ b/vp9/common/arm/armv6/filter_v6.asm
@@ -1,0 +1,624 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_filter_block2d_first_pass_armv6|
+    EXPORT  |vp9_filter_block2d_first_pass_16x16_armv6|
+    EXPORT  |vp9_filter_block2d_first_pass_8x8_armv6|
+    EXPORT  |vp9_filter_block2d_second_pass_armv6|
+    EXPORT  |vp9_filter4_block2d_second_pass_armv6|
+    EXPORT  |vp9_filter_block2d_first_pass_only_armv6|
+    EXPORT  |vp9_filter_block2d_second_pass_only_armv6|
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+;-------------------------------------
+; r0    unsigned char *src_ptr
+; r1    short         *output_ptr
+; r2    unsigned int src_pixels_per_line
+; r3    unsigned int output_width
+; stack unsigned int output_height
+; stack const short *vp9_filter
+;-------------------------------------
+; Filter the input and put the result in the output array. Apply the 6-tap FIR filter
+; with the output being a 2-byte value and the input being a 1-byte value.
+|vp9_filter_block2d_first_pass_armv6| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r11, [sp, #40]                  ; vp9_filter address
+    ldr     r7, [sp, #36]                   ; output height
+
+    sub     r2, r2, r3                      ; inside loop increments input array,
+                                            ; so the height loop only needs to add
+                                            ; r2 - width to the input pointer
+
+    mov     r3, r3, lsl #1                  ; multiply width by 2 because using shorts
+    add     r12, r3, #16                    ; square off the output
+    sub     sp, sp, #4
+
+    ldr     r4, [r11]                       ; load up packed filter coefficients
+    ldr     r5, [r11, #4]
+    ldr     r6, [r11, #8]
+
+    str     r1, [sp]                        ; push destination to stack
+    mov     r7, r7, lsl #16                 ; height is top part of counter
+
+; six tap filter
+|height_loop_1st_6|
+    ldrb    r8, [r0, #-2]                   ; load source data
+    ldrb    r9, [r0, #-1]
+    ldrb    r10, [r0], #2
+    orr     r7, r7, r3, lsr #2              ; construct loop counter
+
+|width_loop_1st_6|
+    ldrb    r11, [r0, #-1]
+
+    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
+    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
+
+    ldrb    r9, [r0]
+
+    smuad   lr, lr, r4                      ; apply the filter
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+    smuad   r8, r8, r4
+    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
+
+    smlad   lr, r10, r5, lr
+    ldrb    r10, [r0, #1]
+    smlad   r8, r11, r5, r8
+    ldrb    r11, [r0, #2]
+
+    sub     r7, r7, #1
+
+    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+
+    smlad   lr, r9, r6, lr
+    smlad   r11, r10, r6, r8
+
+    ands    r10, r7, #0xff                  ; test loop counter
+
+    add     lr, lr, #0x40                   ; round_shift_and_clamp
+    ldrneb  r8, [r0, #-2]                   ; load data for next loop
+    usat    lr, #8, lr, asr #7
+    add     r11, r11, #0x40
+    ldrneb  r9, [r0, #-1]
+    usat    r11, #8, r11, asr #7
+
+    strh    lr, [r1], r12                   ; result is transposed and stored, which
+                                            ; will make second pass filtering easier.
+    ldrneb  r10, [r0], #2
+    strh    r11, [r1], r12
+
+    bne     width_loop_1st_6
+
+    ldr     r1, [sp]                        ; load and update dst address
+    subs    r7, r7, #0x10000
+    add     r0, r0, r2                      ; move to next input line
+
+    add     r1, r1, #2                      ; move over to next column
+    str     r1, [sp]
+
+    bne     height_loop_1st_6
+
+    add     sp, sp, #4
+    ldmia   sp!, {r4 - r11, pc}
+
+    ENDP
+
+; --------------------------
+; 16x16 version
+; -----------------------------
+|vp9_filter_block2d_first_pass_16x16_armv6| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r11, [sp, #40]                  ; vp9_filter address
+    ldr     r7, [sp, #36]                   ; output height
+
+    add     r4, r2, #18                     ; preload next row
+    pld     [r0, r4]
+
+    sub     r2, r2, r3                      ; inside loop increments input array,
+                                            ; so the height loop only needs to add
+                                            ; r2 - width to the input pointer
+
+    mov     r3, r3, lsl #1                  ; multiply width by 2 because using shorts
+    add     r12, r3, #16                    ; square off the output
+    sub     sp, sp, #4
+
+    ldr     r4, [r11]                       ; load up packed filter coefficients
+    ldr     r5, [r11, #4]
+    ldr     r6, [r11, #8]
+
+    str     r1, [sp]                        ; push destination to stack
+    mov     r7, r7, lsl #16                 ; height is top part of counter
+
+; six tap filter
+|height_loop_1st_16_6|
+    ldrb    r8, [r0, #-2]                   ; load source data
+    ldrb    r9, [r0, #-1]
+    ldrb    r10, [r0], #2
+    orr     r7, r7, r3, lsr #2              ; construct loop counter
+
+|width_loop_1st_16_6|
+    ldrb    r11, [r0, #-1]
+
+    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
+    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
+
+    ldrb    r9, [r0]
+
+    smuad   lr, lr, r4                      ; apply the filter
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+    smuad   r8, r8, r4
+    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
+
+    smlad   lr, r10, r5, lr
+    ldrb    r10, [r0, #1]
+    smlad   r8, r11, r5, r8
+    ldrb    r11, [r0, #2]
+
+    sub     r7, r7, #1
+
+    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+
+    smlad   lr, r9, r6, lr
+    smlad   r11, r10, r6, r8
+
+    ands    r10, r7, #0xff                  ; test loop counter
+
+    add     lr, lr, #0x40                   ; round_shift_and_clamp
+    ldrneb  r8, [r0, #-2]                   ; load data for next loop
+    usat    lr, #8, lr, asr #7
+    add     r11, r11, #0x40
+    ldrneb  r9, [r0, #-1]
+    usat    r11, #8, r11, asr #7
+
+    strh    lr, [r1], r12                   ; result is transposed and stored, which
+                                            ; will make second pass filtering easier.
+    ldrneb  r10, [r0], #2
+    strh    r11, [r1], r12
+
+    bne     width_loop_1st_16_6
+
+    ldr     r1, [sp]                        ; load and update dst address
+    subs    r7, r7, #0x10000
+    add     r0, r0, r2                      ; move to next input line
+
+    add     r11, r2, #34                    ; adding back block width (=16)
+    pld     [r0, r11]                       ; preload next row
+
+    add     r1, r1, #2                      ; move over to next column
+    str     r1, [sp]
+
+    bne     height_loop_1st_16_6
+
+    add     sp, sp, #4
+    ldmia   sp!, {r4 - r11, pc}
+
+    ENDP
+
+; --------------------------
+; 8x8 version
+; -----------------------------
+|vp9_filter_block2d_first_pass_8x8_armv6| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r11, [sp, #40]                  ; vp9_filter address
+    ldr     r7, [sp, #36]                   ; output height
+
+    add     r4, r2, #10                     ; preload next row
+    pld     [r0, r4]
+
+    sub     r2, r2, r3                      ; inside loop increments input array,
+                                            ; so the height loop only needs to add
+                                            ; r2 - width to the input pointer
+
+    mov     r3, r3, lsl #1                  ; multiply width by 2 because using shorts
+    add     r12, r3, #16                    ; square off the output
+    sub     sp, sp, #4
+
+    ldr     r4, [r11]                       ; load up packed filter coefficients
+    ldr     r5, [r11, #4]
+    ldr     r6, [r11, #8]
+
+    str     r1, [sp]                        ; push destination to stack
+    mov     r7, r7, lsl #16                 ; height is top part of counter
+
+; six tap filter
+|height_loop_1st_8_6|
+    ldrb    r8, [r0, #-2]                   ; load source data
+    ldrb    r9, [r0, #-1]
+    ldrb    r10, [r0], #2
+    orr     r7, r7, r3, lsr #2              ; construct loop counter
+
+|width_loop_1st_8_6|
+    ldrb    r11, [r0, #-1]
+
+    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
+    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
+
+    ldrb    r9, [r0]
+
+    smuad   lr, lr, r4                      ; apply the filter
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+    smuad   r8, r8, r4
+    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
+
+    smlad   lr, r10, r5, lr
+    ldrb    r10, [r0, #1]
+    smlad   r8, r11, r5, r8
+    ldrb    r11, [r0, #2]
+
+    sub     r7, r7, #1
+
+    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+
+    smlad   lr, r9, r6, lr
+    smlad   r11, r10, r6, r8
+
+    ands    r10, r7, #0xff                  ; test loop counter
+
+    add     lr, lr, #0x40                   ; round_shift_and_clamp
+    ldrneb  r8, [r0, #-2]                   ; load data for next loop
+    usat    lr, #8, lr, asr #7
+    add     r11, r11, #0x40
+    ldrneb  r9, [r0, #-1]
+    usat    r11, #8, r11, asr #7
+
+    strh    lr, [r1], r12                   ; result is transposed and stored, which
+                                            ; will make second pass filtering easier.
+    ldrneb  r10, [r0], #2
+    strh    r11, [r1], r12
+
+    bne     width_loop_1st_8_6
+
+    ldr     r1, [sp]                        ; load and update dst address
+    subs    r7, r7, #0x10000
+    add     r0, r0, r2                      ; move to next input line
+
+    add     r11, r2, #18                    ; adding back block width (=8)
+    pld     [r0, r11]                       ; preload next row
+
+    add     r1, r1, #2                      ; move over to next column
+    str     r1, [sp]
+
+    bne     height_loop_1st_8_6
+
+    add     sp, sp, #4
+    ldmia   sp!, {r4 - r11, pc}
+
+    ENDP
+
+;---------------------------------
+; r0    short         *src_ptr,
+; r1    unsigned char *output_ptr,
+; r2    unsigned int output_pitch,
+; r3    unsigned int cnt,
+; stack const short *vp9_filter
+;---------------------------------
+|vp9_filter_block2d_second_pass_armv6| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r11, [sp, #36]                  ; vp9_filter address
+    sub     sp, sp, #4
+    mov     r7, r3, lsl #16                 ; height is top part of counter
+    str     r1, [sp]                        ; push destination to stack
+
+    ldr     r4, [r11]                       ; load up packed filter coefficients
+    ldr     r5, [r11, #4]
+    ldr     r6, [r11, #8]
+
+    pkhbt   r12, r5, r4                     ; pack the filter differently
+    pkhbt   r11, r6, r5
+
+    sub     r0, r0, #4                      ; offset input buffer
+
+|height_loop_2nd|
+    ldr     r8, [r0]                        ; load the data
+    ldr     r9, [r0, #4]
+    orr     r7, r7, r3, lsr #1              ; loop counter
+
+|width_loop_2nd|
+    smuad   lr, r4, r8                      ; apply filter
+    sub     r7, r7, #1
+    smulbt  r8, r4, r8
+
+    ldr     r10, [r0, #8]
+
+    smlad   lr, r5, r9, lr
+    smladx  r8, r12, r9, r8
+
+    ldrh    r9, [r0, #12]
+
+    smlad   lr, r6, r10, lr
+    smladx  r8, r11, r10, r8
+
+    add     r0, r0, #4
+    smlatb  r10, r6, r9, r8
+
+    add     lr, lr, #0x40                   ; round_shift_and_clamp
+    ands    r8, r7, #0xff
+    usat    lr, #8, lr, asr #7
+    add     r10, r10, #0x40
+    strb    lr, [r1], r2                    ; the result is transposed back and stored
+    usat    r10, #8, r10, asr #7
+
+    ldrne   r8, [r0]                        ; load data for next loop
+    ldrne   r9, [r0, #4]
+    strb    r10, [r1], r2
+
+    bne     width_loop_2nd
+
+    ldr     r1, [sp]                        ; update dst for next loop
+    subs    r7, r7, #0x10000
+    add     r0, r0, #16                     ; update src for next loop
+    add     r1, r1, #1
+    str     r1, [sp]
+
+    bne     height_loop_2nd
+
+    add     sp, sp, #4
+    ldmia   sp!, {r4 - r11, pc}
+
+    ENDP
+
+;---------------------------------
+; r0    short         *src_ptr,
+; r1    unsigned char *output_ptr,
+; r2    unsigned int output_pitch,
+; r3    unsigned int cnt,
+; stack const short *vp9_filter
+;---------------------------------
+|vp9_filter4_block2d_second_pass_armv6| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r11, [sp, #36]                  ; vp9_filter address
+    mov     r7, r3, lsl #16                 ; height is top part of counter
+
+    ldr     r4, [r11]                       ; load up packed filter coefficients
+    add     lr, r1, r3                      ; save final destination pointer
+    ldr     r5, [r11, #4]
+    ldr     r6, [r11, #8]
+
+    pkhbt   r12, r5, r4                     ; pack the filter differently
+    pkhbt   r11, r6, r5
+    mov     r4, #0x40                       ; rounding factor (for smlad{x})
+
+|height_loop_2nd_4|
+    ldrd    r8, [r0, #-4]                   ; load the data
+    orr     r7, r7, r3, lsr #1              ; loop counter
+
+|width_loop_2nd_4|
+    ldr     r10, [r0, #4]!
+    smladx  r6, r9, r12, r4                 ; apply filter
+    pkhbt   r8, r9, r8
+    smlad   r5, r8, r12, r4
+    pkhbt   r8, r10, r9
+    smladx  r6, r10, r11, r6
+    sub     r7, r7, #1
+    smlad   r5, r8, r11, r5
+
+    mov     r8, r9                          ; shift the data for the next loop
+    mov     r9, r10
+
+    usat    r6, #8, r6, asr #7              ; shift and clamp
+    usat    r5, #8, r5, asr #7
+
+    strb    r5, [r1], r2                    ; the result is transposed back and stored
+    tst     r7, #0xff
+    strb    r6, [r1], r2
+
+    bne     width_loop_2nd_4
+
+    subs    r7, r7, #0x10000
+    add     r0, r0, #16                     ; update src for next loop
+    sub     r1, lr, r7, lsr #16             ; update dst for next loop
+
+    bne     height_loop_2nd_4
+
+    ldmia   sp!, {r4 - r11, pc}
+
+    ENDP
+
+;------------------------------------
+; r0    unsigned char *src_ptr
+; r1    unsigned char *output_ptr,
+; r2    unsigned int src_pixels_per_line
+; r3    unsigned int cnt,
+; stack unsigned int output_pitch,
+; stack const short *vp9_filter
+;------------------------------------
+|vp9_filter_block2d_first_pass_only_armv6| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    add     r7, r2, r3                      ; preload next row
+    add     r7, r7, #2
+    pld     [r0, r7]
+
+    ldr     r4, [sp, #36]                   ; output pitch
+    ldr     r11, [sp, #40]                  ; HFilter address
+    sub     sp, sp, #8
+
+    mov     r7, r3
+    sub     r2, r2, r3                      ; inside loop increments input array,
+                                            ; so the height loop only needs to add
+                                            ; r2 - width to the input pointer
+
+    sub     r4, r4, r3
+    str     r4, [sp]                        ; save modified output pitch
+    str     r2, [sp, #4]
+
+    mov     r2, #0x40
+
+    ldr     r4, [r11]                       ; load up packed filter coefficients
+    ldr     r5, [r11, #4]
+    ldr     r6, [r11, #8]
+
+; six tap filter
+|height_loop_1st_only_6|
+    ldrb    r8, [r0, #-2]                   ; load data
+    ldrb    r9, [r0, #-1]
+    ldrb    r10, [r0], #2
+
+    mov     r12, r3, lsr #1                 ; loop counter
+
+|width_loop_1st_only_6|
+    ldrb    r11, [r0, #-1]
+
+    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
+    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
+
+    ldrb    r9, [r0]
+
+;;  smuad   lr, lr, r4
+    smlad   lr, lr, r4, r2
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+;;  smuad   r8, r8, r4
+    smlad   r8, r8, r4, r2
+    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
+
+    smlad   lr, r10, r5, lr
+    ldrb    r10, [r0, #1]
+    smlad   r8, r11, r5, r8
+    ldrb    r11, [r0, #2]
+
+    subs    r12, r12, #1
+
+    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+
+    smlad   lr, r9, r6, lr
+    smlad   r10, r10, r6, r8
+
+;;  add     lr, lr, #0x40                   ; round_shift_and_clamp
+    ldrneb  r8, [r0, #-2]                   ; load data for next loop
+    usat    lr, #8, lr, asr #7
+;;  add     r10, r10, #0x40
+    strb    lr, [r1], #1                    ; store the result
+    usat    r10, #8, r10, asr #7
+
+    ldrneb  r9, [r0, #-1]
+    strb    r10, [r1], #1
+    ldrneb  r10, [r0], #2
+
+    bne     width_loop_1st_only_6
+
+    ldr     lr, [sp]                        ; load back output pitch
+    ldr     r12, [sp, #4]                   ; load back src increment (src_pixels_per_line - width)
+    subs    r7, r7, #1
+    add     r0, r0, r12                     ; update src for next loop
+
+    add     r11, r12, r3                    ; preload next row
+    add     r11, r11, #2
+    pld     [r0, r11]
+
+    add     r1, r1, lr                      ; update dst for next loop
+
+    bne     height_loop_1st_only_6
+
+    add     sp, sp, #8
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP  ; |vp9_filter_block2d_first_pass_only_armv6|
+
+
+;------------------------------------
+; r0    unsigned char *src_ptr,
+; r1    unsigned char *output_ptr,
+; r2    unsigned int src_pixels_per_line
+; r3    unsigned int cnt,
+; stack unsigned int output_pitch,
+; stack const short *vp9_filter
+;------------------------------------
+|vp9_filter_block2d_second_pass_only_armv6| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r11, [sp, #40]                  ; VFilter address
+    ldr     r12, [sp, #36]                  ; output pitch
+
+    mov     r7, r3, lsl #16                 ; height is top part of counter
+    sub     r0, r0, r2, lsl #1              ; need 6 elements for filtering, 2 before, 3 after
+
+    sub     sp, sp, #8
+
+    ldr     r4, [r11]                       ; load up packed filter coefficients
+    ldr     r5, [r11, #4]
+    ldr     r6, [r11, #8]
+
+    str     r0, [sp]                        ; save r0 to stack
+    str     r1, [sp, #4]                    ; save dst to stack
+
+; six tap filter
+|width_loop_2nd_only_6|
+    ldrb    r8, [r0], r2                    ; load data
+    orr     r7, r7, r3                      ; loop counter
+    ldrb    r9, [r0], r2
+    ldrb    r10, [r0], r2
+
+|height_loop_2nd_only_6|
+    ; filter the first column in this inner loop, then move to the next column.
+    ldrb    r11, [r0], r2
+
+    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
+    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
+
+    ldrb    r9, [r0], r2
+
+    smuad   lr, lr, r4
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+    smuad   r8, r8, r4
+    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
+
+    smlad   lr, r10, r5, lr
+    ldrb    r10, [r0], r2
+    smlad   r8, r11, r5, r8
+    ldrb    r11, [r0]
+
+    sub     r7, r7, #2
+    sub     r0, r0, r2, lsl #2
+
+    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+
+    smlad   lr, r9, r6, lr
+    smlad   r10, r10, r6, r8
+
+    ands    r9, r7, #0xff
+
+    add     lr, lr, #0x40                   ; round_shift_and_clamp
+    ldrneb  r8, [r0], r2                    ; load data for next loop
+    usat    lr, #8, lr, asr #7
+    add     r10, r10, #0x40
+    strb    lr, [r1], r12                   ; store the result for the column
+    usat    r10, #8, r10, asr #7
+
+    ldrneb  r9, [r0], r2
+    strb    r10, [r1], r12
+    ldrneb  r10, [r0], r2
+
+    bne     height_loop_2nd_only_6
+
+    ldr     r0, [sp]
+    ldr     r1, [sp, #4]
+    subs    r7, r7, #0x10000
+    add     r0, r0, #1                      ; move to filter next column
+    str     r0, [sp]
+    add     r1, r1, #1
+    str     r1, [sp, #4]
+
+    bne     width_loop_2nd_only_6
+
+    add     sp, sp, #8
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP  ; |vp9_filter_block2d_second_pass_only_armv6|
+
+    END
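
All the first-pass variants in this file share one inner computation: a 6-tap
FIR over source bytes, rounded by 64, shifted by 7, saturated to 8 bits, and
stored as shorts transposed (with a padded stride to "square off" the buffer)
so the second pass can filter the former columns with the same sequential loop.
A C sketch of the first pass under those assumptions, ignoring the stride
padding and the dual accumulators the asm keeps in flight; names are
illustrative:

    #include <stdint.h>

    static uint8_t clamp8(int v) {
        return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }

    /* 6-tap horizontal FIR with transposed 16-bit output; src points at
       least 2 pixels into each row, as in the asm (ldrb [r0, #-2]). */
    static void first_pass_6tap_c(const uint8_t *src, int16_t *out,
                                  unsigned src_pixels_per_line,
                                  unsigned width, unsigned height,
                                  const int16_t *filter) {
        for (unsigned r = 0; r < height; r++) {
            for (unsigned c = 0; c < width; c++) {
                int sum = 0;
                for (int k = -2; k <= 3; k++)            /* taps span [-2, 3] */
                    sum += src[(int)c + k] * filter[k + 2];
                out[c * height + r] = clamp8((sum + 0x40) >> 7); /* transposed */
            }
            src += src_pixels_per_line;
        }
    }

The second pass repeats the same dot product over the transposed int16 buffer
and writes bytes at output_pitch, which transposes the block back.
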
--- /dev/null
+++ b/vp9/common/arm/armv6/idct_v6.asm
@@ -1,0 +1,345 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+;                   r0  r1  r2  r3  r4  r5  r6  r7  r8  r9  r10 r11 r12     r14
+    EXPORT  |vp8_short_idct4x4llm_1_v6|
+    EXPORT  |vp8_short_idct4x4llm_v6|
+    EXPORT  |vp8_short_idct4x4llm_v6_scott|
+    EXPORT  |vp8_short_idct4x4llm_v6_dual|
+
+    AREA    |.text|, CODE, READONLY
+
+;********************************************************************************
+;*  void short_idct4x4llm_1_v6(INT16 * input, INT16 * output, INT32 pitch)
+;*      r0  INT16 * input
+;*      r1  INT16 * output
+;*      r2  INT32 pitch
+;*  bench:  3/5
+;********************************************************************************
+
+|vp8_short_idct4x4llm_1_v6| PROC         ;   cycles  in  out pit
+            ;
+    ldrsh   r0, [r0]    ; load input[0] 1, r0 un 2
+    add r0, r0, #4  ;   1   +4
+    stmdb   sp!, {r4, r5, lr}   ; make room for wide writes 1                   backup
+    mov r0, r0, asr #3  ; (input[0] + 4) >> 3   1, r0 req`d ^1  >> 3
+    pkhbt   r4, r0, r0, lsl #16 ; pack r0 into r4   1, r0 req`d ^1                  pack
+    mov r5, r4  ; expand                        expand
+
+    strd    r4, [r1], r2    ; *output = r0, post inc    1
+    strd    r4, [r1], r2    ;   1
+    strd    r4, [r1], r2    ;   1
+    strd    r4, [r1]    ;   1
+            ;
+    ldmia   sp!, {r4, r5, pc}   ; replace vars, return                      restore
+    ENDP        ; |vp8_short_idct4x4llm_1_v6|
+;********************************************************************************
+;********************************************************************************
+;********************************************************************************
+
+;********************************************************************************
+;*  void short_idct4x4llm_v6(INT16 * input, INT16 * output, INT32 pitch)
+;*      r0  INT16 * input
+;*      r1  INT16 * output
+;*      r2  INT32 pitch
+;*  bench:
+;********************************************************************************
+
+|vp8_short_idct4x4llm_v6| PROC           ;   cycles  in  out pit
+            ;
+    stmdb   sp!, {r4-r11, lr}   ; backup registers  1                   backup
+            ;
+    mov r4, #0x00004E00 ;   1                   cst
+    orr r4, r4, #0x0000007B ; cospi8sqrt2minus1
+    mov r5, #0x00008A00 ;   1                       cst
+    orr r5, r5, #0x0000008C ; sinpi8sqrt2
+            ;
+    mov r6, #4  ; i=4   1                           i
+loop1           ;
+    ldrsh   r12, [r0, #8]   ; input[4]  1, r12 unavail 2                                                    [4]
+    ldrsh   r3, [r0, #24]   ; input[12] 1, r3 unavail 2             [12]
+    ldrsh   r8, [r0, #16]   ; input[8]  1, r8 unavail 2                                 [8]
+    ldrsh   r7, [r0], #0x2  ; input[0]  1, r7 unavail 2 ++                          [0]
+    smulwb  r10, r5, r12    ; ([4] * sinpi8sqrt2) >> 16 1, r10 un 2, r12/r5 ^1                                          t1
+    smulwb  r11, r4, r3 ; ([12] * cospi8sqrt2minus1) >> 16  1, r11 un 2, r3/r4 ^1                                               t2
+    add r9, r7, r8  ; a1 = [0] + [8]    1                                       a1
+    sub r7, r7, r8  ; b1 = [0] - [8]    1                               b1
+    add r11, r3, r11    ; temp2 1
+    rsb r11, r11, r10   ; c1 = temp1 - temp2    1                                               c1
+    smulwb  r3, r5, r3  ; ([12] * sinpi8sqrt2) >> 16    1, r3 un 2, r3/r5 ^ 1               t2
+    smulwb  r10, r4, r12    ; ([4] * cospi8sqrt2minus1) >> 16   1, r10 un 2, r12/r4 ^1                                          t1
+    add r8, r7, r11 ; b1 + c1   1                                   b+c
+    strh    r8, [r1, r2]    ; out[pitch] = b1+c1    1
+    sub r7, r7, r11 ; b1 - c1   1                               b-c
+    add r10, r12, r10   ; temp1 1
+    add r3, r10, r3 ; d1 = temp1 + temp2    1               d1
+    add r10, r9, r3 ; a1 + d1   1                                           a+d
+    sub r3, r9, r3  ; a1 - d1   1               a-d
+    add r8, r2, r2  ; pitch * 2 1                                   p*2
+    strh    r7, [r1, r8]    ; out[pitch*2] = b1-c1  1
+    add r7, r2, r2, lsl #1  ; pitch * 3 1                               p*3
+    strh    r3, [r1, r7]    ; out[pitch*3] = a1-d1  1
+    subs    r6, r6, #1  ; i--   1                           --
+    strh    r10, [r1], #0x2 ; out[0] = a1+d1    1       ++
+    bne loop1   ; if i>0, continue
+            ;
+    sub r1, r1, #8  ; set up out for next loop  1       -4
+            ; for this iteration, input=prev output
+    mov r6, #4  ; i=4   1                           i
+;   b   returnfull
+loop2           ;
+    ldrsh   r11, [r1, #2]   ; input[1]  1, r11 un 2                                             [1]
+    ldrsh   r8, [r1, #6]    ; input[3]  1, r8 un 2                                  [3]
+    ldrsh   r3, [r1, #4]    ; input[2]  1, r3 un 2              [2]
+    ldrsh   r0, [r1]    ; input[0]  1, r0 un 2  [0]
+    smulwb  r9, r5, r11 ; ([1] * sinpi8sqrt2) >> 16 1, r9 un 2, r5/r11 ^1                                       t1
+    smulwb  r10, r4, r8 ; ([3] * cospi8sqrt2minus1) >> 16   1, r10 un 2, r4/r8 ^1                                           t2
+    add r7, r0, r3  ; a1 = [0] + [2]    1                               a1
+    sub r0, r0, r3  ; b1 = [0] - [2]    1   b1
+    add r10, r8, r10    ; temp2 1
+    rsb r9, r10, r9 ; c1 = temp1 - temp2    1                                       c1
+    smulwb  r8, r5, r8  ; ([3] * sinpi8sqrt2) >> 16 1, r8 un 2, r5/r8 ^1                                    t2
+    smulwb  r10, r4, r11    ; ([1] * cospi8sqrt2minus1) >> 16   1, r10 un 2, r4/r11 ^1                                          t1
+    add r3, r0, r9  ; b1+c1 1               b+c
+    add r3, r3, #4  ; b1+c1+4   1               +4
+    add r10, r11, r10   ; temp1 1
+    mov r3, r3, asr #3  ; b1+c1+4 >> 3  1, r3 ^1                >>3
+    strh    r3, [r1, #2]    ; out[1] = b1+c1    1
+    add r10, r10, r8    ; d1 = temp1 + temp2    1                                           d1
+    add r3, r7, r10 ; a1+d1 1               a+d
+    add r3, r3, #4  ; a1+d1+4   1               +4
+    sub r7, r7, r10 ; a1-d1 1                               a-d
+    add r7, r7, #4  ; a1-d1+4   1                               +4
+    mov r3, r3, asr #3  ; a1+d1+4 >> 3  1, r3 ^1                >>3
+    mov r7, r7, asr #3  ; a1-d1+4 >> 3  1, r7 ^1                                >>3
+    strh    r7, [r1, #6]    ; out[3] = a1-d1    1
+    sub r0, r0, r9  ; b1-c1 1   b-c
+    add r0, r0, #4  ; b1-c1+4   1   +4
+    subs    r6, r6, #1  ; i--   1                           --
+    mov r0, r0, asr #3  ; b1-c1+4 >> 3  1, r0 ^1    >>3
+    strh    r0, [r1, #4]    ; out[2] = b1-c1    1
+    strh    r3, [r1], r2    ; out[0] = a1+d1    1
+;   add r1, r1, r2  ; out += pitch  1       ++
+    bne loop2   ; if i>0, continue
+returnfull          ;
+    ldmia   sp!, {r4 - r11, pc} ; replace vars, return                      restore
+    ENDP
+
+;********************************************************************************
+;********************************************************************************
+;********************************************************************************
+
+;********************************************************************************
+;*  void short_idct4x4llm_v6_scott(INT16 * input, INT16 * output, INT32 pitch)
+;*      r0  INT16 * input
+;*      r1  INT16 * output
+;*      r2  INT32 pitch
+;*  bench:
+;********************************************************************************
+
+|vp8_short_idct4x4llm_v6_scott| PROC         ;   cycles  in  out pit
+;   mov r0, #0  ;
+;   ldr r0, [r0]    ;
+    stmdb   sp!, {r4 - r11, lr} ; backup registers  1                   backup
+            ;
+    mov r3, #0x00004E00 ;                   cos
+    orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
+    mov r4, #0x00008A00 ;                       sin
+    orr r4, r4, #0x0000008C ; sinpi8sqrt2
+            ;
+    mov r5, #0x2    ; i                         i
+            ;
+short_idct4x4llm_v6_scott_loop1          ;
+    ldr r10, [r0, #(4*2)]   ; i5 | i4                                               5,4
+    ldr r11, [r0, #(12*2)]  ; i13 | i12                                                 13,12
+            ;
+    smulwb  r6, r4, r10 ; ((ip[4] * sinpi8sqrt2) >> 16)                             lt1
+    smulwb  r7, r3, r11 ; ((ip[12] * cospi8sqrt2minus1) >> 16)                                  lt2
+            ;
+    smulwb  r12, r3, r10    ; ((ip[4] * cospi8sqrt2minus1) >> 16)                                                        l2t2
+    smulwb  r14, r4, r11    ; ((ip[12] * sinpi8sqrt2) >> 16)                                                                l2t1
+            ;
+    add r6, r6, r7  ; partial c1                                lt1-lt2
+    add r12, r12, r14   ; partial d1                                                        l2t2+l2t1
+            ;
+    smulwt  r14, r4, r10    ; ((ip[5] * sinpi8sqrt2) >> 16)                                                             ht1
+    smulwt  r7, r3, r11 ; ((ip[13] * cospi8sqrt2minus1) >> 16)                                  ht2
+            ;
+    smulwt  r8, r3, r10 ; ((ip[5] * cospi8sqrt2minus1) >> 16)                                       h2t1
+    smulwt  r9, r4, r11 ; ((ip[13] * sinpi8sqrt2) >> 16)                                            h2t2
+            ;
+    add r7, r14, r7 ; partial c1_2                                  ht1+ht2
+    sub r8, r8, r9  ; partial d1_2                                      h2t1-h2t2
+            ;
+    pkhbt   r6, r6, r7, lsl #16 ; partial c1_2 | partial c1_1                               pack
+    pkhbt   r12, r12, r8, lsl #16   ; partial d1_2 | partial d1_1                                                       pack
+            ;
+    usub16  r6, r6, r10 ; c1_2 | c1_1                               c
+    uadd16  r12, r12, r11   ; d1_2 | d1_1                                                       d
+            ;
+    ldr r10, [r0, #0]   ; i1 | i0                                               1,0
+    ldr r11, [r0, #(8*2)]   ; i9 | i8                                                   9,8
+            ;
+;;;;;;  add r0, r0, #0x4    ;       +4
+;;;;;;  add r1, r1, #0x4    ;           +4
+            ;
+    uadd16  r8, r10, r11    ; i1 + i9 | i0 + i8 aka a1                                      a
+    usub16  r9, r10, r11    ; i1 - i9 | i0 - i8 aka b1                                          b
+            ;
+    uadd16  r7, r8, r12 ; a1 + d1 pair                                  a+d
+    usub16  r14, r8, r12    ; a1 - d1 pair                                                              a-d
+            ;
+    str r7, [r1]    ; op[0] = a1 + d1
+    str r14, [r1, r2]   ; op[pitch*3] = a1 - d1
+            ;
+    add r0, r0, #0x4    ; op[pitch] = b1 + c1       ++
+    add r1, r1, #0x4    ; op[pitch*2] = b1 - c1         ++
+            ;
+    subs    r5, r5, #0x1    ;                           --
+    bne short_idct4x4llm_v6_scott_loop1  ;
+            ;
+    sub r1, r1, #16 ; reset output ptr
+    mov r5, #0x4    ;
+    mov r0, r1  ; input = output
+            ;
+short_idct4x4llm_v6_scott_loop2          ;
+            ;
+    subs    r5, r5, #0x1    ;
+    bne short_idct4x4llm_v6_scott_loop2  ;
+            ;
+    ldmia   sp!, {r4 - r11, pc} ;
+    ENDP        ;
+            ;
+;********************************************************************************
+;********************************************************************************
+;********************************************************************************
+
+;********************************************************************************
+;*  void short_idct4x4llm_v6_dual(INT16 * input, INT16 * output, INT32 pitch)
+;*      r0  INT16 * input
+;*      r1  INT16 * output
+;*      r2  INT32 pitch
+;*  bench:
+;********************************************************************************
+
+|vp8_short_idct4x4llm_v6_dual| PROC          ;   cycles  in  out pit
+            ;
+    stmdb   sp!, {r4-r11, lr}   ; backup registers  1                   backup
+    mov r3, #0x00004E00 ;                   cos
+    orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
+    mov r4, #0x00008A00 ;                       sin
+    orr r4, r4, #0x0000008C ; sinpi8sqrt2
+    mov r5, #0x2    ; i=2                           i
+loop1_dual
+    ldr r6, [r0, #(4*2)]    ; i5 | i4                               5|4
+    ldr r12, [r0, #(12*2)]  ; i13 | i12                                                     13|12
+    ldr r14, [r0, #(8*2)]   ; i9 | i8                                                               9|8
+
+    smulwt  r9, r3, r6  ; (ip[5] * cospi8sqrt2minus1) >> 16                                         5c
+    smulwb  r7, r3, r6  ; (ip[4] * cospi8sqrt2minus1) >> 16                                 4c
+    smulwt  r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16                                               5s
+    smulwb  r8, r4, r6  ; (ip[4] * sinpi8sqrt2) >> 16                                       4s
+    pkhbt   r7, r7, r9, lsl #16 ; 5c | 4c
+    smulwt  r11, r3, r12    ; (ip[13] * cospi8sqrt2minus1) >> 16                                                    13c
+    pkhbt   r8, r8, r10, lsl #16    ; 5s | 4s
+    uadd16  r6, r6, r7  ; 5c+5 | 4c+4
+    smulwt  r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16                                  13s
+    smulwb  r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16                                            12c
+    smulwb  r10, r4, r12    ; (ip[12] * sinpi8sqrt2) >> 16                                              12s
+    subs    r5, r5, #0x1    ; i--                           --
+    pkhbt   r9, r9, r11, lsl #16    ; 13c | 12c
+    ldr r11, [r0], #0x4 ; i1 | i0       ++                                          1|0
+    pkhbt   r10, r10, r7, lsl #16   ; 13s | 12s
+    uadd16  r7, r12, r9 ; 13c+13 | 12c+12
+    usub16  r7, r8, r7  ; c                                 c
+    uadd16  r6, r6, r10 ; d                             d
+    uadd16  r10, r11, r14   ; a                                             a
+    usub16  r8, r11, r14    ; b                                     b
+    uadd16  r9, r10, r6 ; a+d                                           a+d
+    usub16  r10, r10, r6    ; a-d                                               a-d
+    uadd16  r6, r8, r7  ; b+c                               b+c
+    usub16  r7, r8, r7  ; b-c                                   b-c
+    str r6, [r1, r2]    ; o5 | o4
+    add r6, r2, r2  ; pitch * 2                             p2
+    str r7, [r1, r6]    ; o9 | o8
+    add r6,  r6, r2 ; pitch * 3                             p3
+    str r10, [r1, r6]   ; o13 | o12
+    str r9, [r1], #0x4  ; o1 | o0           ++
+    bne loop1_dual  ;
+    mov r5, #0x2    ; i=2                           i
+    sub r0, r1, #8  ; reset input/output        i/o
+loop2_dual
+    ldr r6, [r0, r2]    ; i5 | i4                               5|4
+    ldr r1, [r0]    ; i1 | i0           1|0
+    ldr r12, [r0, #0x4] ; i3 | i2                                                       3|2
+    add r14, r2, #0x4   ; pitch + 2                                                             p+2
+    ldr r14, [r0, r14]  ; i7 | i6                                                               7|6
+    smulwt  r9, r3, r6  ; (ip[5] * cospi8sqrt2minus1) >> 16                                         5c
+    smulwt  r7, r3, r1  ; (ip[1] * cospi8sqrt2minus1) >> 16                                 1c
+    smulwt  r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16                                               5s
+    smulwt  r8, r4, r1  ; (ip[1] * sinpi8sqrt2) >> 16                                       1s
+    pkhbt   r11, r6, r1, lsl #16    ; i0 | i4                                                   0|4
+    pkhbt   r7, r9, r7, lsl #16 ; 1c | 5c
+    pkhbt   r8, r10, r8, lsl #16    ; 1s | 5s = temp1                                       tc1
+    pkhtb   r1, r1, r6, asr #16 ; i1 | i5           1|5
+    uadd16  r1, r7, r1  ; 1c+1 | 5c+5 = temp2 (d)           td2
+    pkhbt   r9, r14, r12, lsl #16   ; i2 | i6                                           2|6
+    uadd16  r10, r11, r9    ; a                                             a
+    usub16  r9, r11, r9 ; b                                         b
+    pkhtb   r6, r12, r14, asr #16   ; i3 | i7                               3|7
+    subs    r5, r5, #0x1    ; i--                           --
+    smulwt  r7, r3, r6  ; (ip[3] * cospi8sqrt2minus1) >> 16                                 3c
+    smulwt  r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16                                                   3s
+    smulwb  r12, r3, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16                                                     7c
+    smulwb  r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16                                                               7s
+
+    pkhbt   r7, r12, r7, lsl #16    ; 3c | 7c
+    pkhbt   r11, r14, r11, lsl #16  ; 3s | 7s = temp1 (d)                                                   td1
+    uadd16  r6, r7, r6  ; 3c+3 | 7c+7 = temp2  (c)                              tc2
+    usub16  r12, r8, r6 ; c (o1 | o5)                                                       c
+    uadd16  r6, r11, r1 ; d (o3 | o7)                               d
+    uadd16  r7, r10, r6 ; a+d                                   a+d
+    mov r8, #0x4    ; set up 4's                                        4
+    orr r8, r8, #0x40000    ;                                       4|4
+    usub16  r6, r10, r6 ; a-d                               a-d
+    uadd16  r6, r6, r8  ; a-d+4                             3|7
+    uadd16  r7, r7, r8  ; a+d+4                                 0|4
+    uadd16  r10, r9, r12    ; b+c                                               b+c
+    usub16  r1, r9, r12 ; b-c           b-c
+    uadd16  r10, r10, r8    ; b+c+4                                             1|5
+    uadd16  r1, r1, r8  ; b-c+4         2|6
+    mov r8, r10, asr #19    ; o1 >> 3
+    strh    r8, [r0, #2]    ; o1
+    mov r8, r1, asr #19 ; o2 >> 3
+    strh    r8, [r0, #4]    ; o2
+    mov r8, r6, asr #19 ; o3 >> 3
+    strh    r8, [r0, #6]    ; o3
+    mov r8, r7, asr #19 ; o0 >> 3
+    strh    r8, [r0], r2    ; o0        +p
+    sxth    r10, r10    ;
+    mov r8, r10, asr #3 ; o5 >> 3
+    strh    r8, [r0, #2]    ; o5
+    sxth    r1, r1  ;
+    mov r8, r1, asr #3  ; o6 >> 3
+    strh    r8, [r0, #4]    ; o6
+    sxth    r6, r6  ;
+    mov r8, r6, asr #3  ; o7 >> 3
+    strh    r8, [r0, #6]    ; o7
+    sxth    r7, r7  ;
+    mov r8, r7, asr #3  ; o4 >> 3
+    strh    r8, [r0], r2    ; o4        +p
+;;;;;   subs    r5, r5, #0x1    ; i--                           --
+    bne loop2_dual  ;
+            ;
+    ldmia   sp!, {r4 - r11, pc} ; replace vars, return                      restore
+    ENDP
+
+    END
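
All four variants above implement the LLM factorization of the 4x4 inverse DCT.
The constants built at the top of each PROC are cospi8sqrt2minus1 = 0x4E7B
(20091) and sinpi8sqrt2 = 0x8A8C (35468), and smulw{b,t} multiplies the 32-bit
operand by the signed bottom/top 16-bit half of the other and keeps the top 32
bits, i.e. (a * b) >> 16. The arithmetic mirrors the classic scalar reference
loop:

    /* Scalar reference for the 4x4 LLM IDCT: a column pass into the output
       buffer, then a row pass with +4 rounding and >>3, matching the
       loop1/loop2 split in the asm above. */
    static const int cospi8sqrt2minus1 = 20091;  /* 0x4E7B */
    static const int sinpi8sqrt2 = 35468;        /* 0x8A8C */

    static void short_idct4x4llm_c(short *input, short *output, int pitch) {
        short *ip = input, *op = output;
        int shortpitch = pitch >> 1;
        int i, a1, b1, c1, d1, temp1, temp2;

        for (i = 0; i < 4; i++) {                  /* column (vertical) pass */
            a1 = ip[0] + ip[8];
            b1 = ip[0] - ip[8];
            temp1 = (ip[4] * sinpi8sqrt2) >> 16;
            temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
            c1 = temp1 - temp2;
            temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16);
            temp2 = (ip[12] * sinpi8sqrt2) >> 16;
            d1 = temp1 + temp2;
            op[shortpitch * 0] = (short)(a1 + d1);
            op[shortpitch * 3] = (short)(a1 - d1);
            op[shortpitch * 1] = (short)(b1 + c1);
            op[shortpitch * 2] = (short)(b1 - c1);
            ip++;
            op++;
        }

        ip = output;
        op = output;
        for (i = 0; i < 4; i++) {                  /* row pass, rounded */
            a1 = ip[0] + ip[2];
            b1 = ip[0] - ip[2];
            temp1 = (ip[1] * sinpi8sqrt2) >> 16;
            temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16);
            c1 = temp1 - temp2;
            temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16);
            temp2 = (ip[3] * sinpi8sqrt2) >> 16;
            d1 = temp1 + temp2;
            op[0] = (short)((a1 + d1 + 4) >> 3);
            op[3] = (short)((a1 - d1 + 4) >> 3);
            op[1] = (short)((b1 + c1 + 4) >> 3);
            op[2] = (short)((b1 - c1 + 4) >> 3);
            ip += shortpitch;
            op += shortpitch;
        }
    }
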
--- /dev/null
+++ b/vp9/common/arm/armv6/iwalsh_v6.asm
@@ -1,0 +1,152 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT |vp8_short_inv_walsh4x4_v6|
+    EXPORT |vp8_short_inv_walsh4x4_1_v6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+
+;short vp8_short_inv_walsh4x4_v6(short *input, short *output)
+|vp8_short_inv_walsh4x4_v6| PROC
+
+    stmdb       sp!, {r4 - r11, lr}
+
+    ldr         r2, [r0], #4         ; [1  |  0]
+    ldr         r3, [r0], #4         ; [3  |  2]
+    ldr         r4, [r0], #4         ; [5  |  4]
+    ldr         r5, [r0], #4         ; [7  |  6]
+    ldr         r6, [r0], #4         ; [9  |  8]
+    ldr         r7, [r0], #4         ; [11 | 10]
+    ldr         r8, [r0], #4         ; [13 | 12]
+    ldr         r9, [r0]             ; [15 | 14]
+
+    qadd16      r10, r2, r8          ; a1 [1+13  |  0+12]
+    qadd16      r11, r4, r6          ; b1 [5+9   |  4+8]
+    qsub16      r12, r4, r6          ; c1 [5-9   |  4-8]
+    qsub16      lr, r2, r8           ; d1 [1-13  |  0-12]
+
+    qadd16      r2, r10, r11         ; a1 + b1 [1  |  0]
+    qadd16      r4, r12, lr          ; c1 + d1 [5  |  4]
+    qsub16      r6, r10, r11         ; a1 - b1 [9  |  8]
+    qsub16      r8, lr, r12          ; d1 - c1 [13 | 12]
+
+    qadd16      r10, r3, r9          ; a1 [3+15  |  2+14]
+    qadd16      r11, r5, r7          ; b1 [7+11  |  6+10]
+    qsub16      r12, r5, r7          ; c1 [7-11  |  6-10]
+    qsub16      lr, r3, r9           ; d1 [3-15  |  2-14]
+
+    qadd16      r3, r10, r11         ; a1 + b1 [3  |  2]
+    qadd16      r5, r12, lr          ; c1 + d1 [7  |  6]
+    qsub16      r7, r10, r11         ; a1 - b1 [11 | 10]
+    qsub16      r9, lr, r12          ; d1 - c1 [15 | 14]
+
+    ; first transform complete
+
+    qsubaddx    r10, r2, r3          ; [c1|a1] [1-2   |   0+3]
+    qaddsubx    r11, r2, r3          ; [b1|d1] [1+2   |   0-3]
+    qsubaddx    r12, r4, r5          ; [c1|a1] [5-6   |   4+7]
+    qaddsubx    lr, r4, r5           ; [b1|d1] [5+6   |   4-7]
+
+    qaddsubx    r2, r10, r11         ; [b2|c2] [c1+d1 | a1-b1]
+    qaddsubx    r3, r11, r10         ; [a2|d2] [b1+a1 | d1-c1]
+    ldr         r10, c0x00030003
+    qaddsubx    r4, r12, lr          ; [b2|c2] [c1+d1 | a1-b1]
+    qaddsubx    r5, lr, r12          ; [a2|d2] [b1+a1 | d1-c1]
+
+    qadd16      r2, r2, r10          ; [b2+3|c2+3]
+    qadd16      r3, r3, r10          ; [a2+3|d2+3]
+    qadd16      r4, r4, r10          ; [b2+3|c2+3]
+    qadd16      r5, r5, r10          ; [a2+3|d2+3]
+
+    asr         r12, r2, #3          ; [1  |  x]
+    pkhtb       r12, r12, r3, asr #19; [1  |  0]
+    lsl         lr, r3, #16          ; [~3 |  x]
+    lsl         r2, r2, #16          ; [~2 |  x]
+    asr         lr, lr, #3           ; [3  |  x]
+    pkhtb       lr, lr, r2, asr #19  ; [3  |  2]
+
+    asr         r2, r4, #3           ; [5  |  x]
+    pkhtb       r2, r2, r5, asr #19  ; [5  |  4]
+    lsl         r3, r5, #16          ; [~7 |  x]
+    lsl         r4, r4, #16          ; [~6 |  x]
+    asr         r3, r3, #3           ; [7  |  x]
+    pkhtb       r3, r3, r4, asr #19  ; [7  |  6]
+
+    str         r12, [r1], #4
+    str         lr, [r1], #4
+    str         r2, [r1], #4
+    str         r3, [r1], #4
+
+    qsubaddx    r2, r6, r7           ; [c1|a1] [9-10  |  8+11]
+    qaddsubx    r3, r6, r7           ; [b1|d1] [9+10  |  8-11]
+    qsubaddx    r4, r8, r9           ; [c1|a1] [13-14 | 12+15]
+    qaddsubx    r5, r8, r9           ; [b1|d1] [13+14 | 12-15]
+
+    qaddsubx    r6, r2, r3           ; [b2|c2] [c1+d1 | a1-b1]
+    qaddsubx    r7, r3, r2           ; [a2|d2] [b1+a1 | d1-c1]
+    qaddsubx    r8, r4, r5           ; [b2|c2] [c1+d1 | a1-b1]
+    qaddsubx    r9, r5, r4           ; [a2|d2] [b1+a1 | d1-c1]
+
+    qadd16      r6, r6, r10          ; [b2+3|c2+3]
+    qadd16      r7, r7, r10          ; [a2+3|d2+3]
+    qadd16      r8, r8, r10          ; [b2+3|c2+3]
+    qadd16      r9, r9, r10          ; [a2+3|d2+3]
+
+    asr         r2, r6, #3           ; [9  |  x]
+    pkhtb       r2, r2, r7, asr #19  ; [9  |  8]
+    lsl         r3, r7, #16          ; [~11|  x]
+    lsl         r4, r6, #16          ; [~10|  x]
+    asr         r3, r3, #3           ; [11 |  x]
+    pkhtb       r3, r3, r4, asr #19  ; [11 | 10]
+
+    asr         r4, r8, #3           ; [13 |  x]
+    pkhtb       r4, r4, r9, asr #19  ; [13 | 12]
+    lsl         r5, r9, #16          ; [~15|  x]
+    lsl         r6, r8, #16          ; [~14|  x]
+    asr         r5, r5, #3           ; [15 |  x]
+    pkhtb       r5, r5, r6, asr #19  ; [15 | 14]
+
+    str         r2, [r1], #4
+    str         r3, [r1], #4
+    str         r4, [r1], #4
+    str         r5, [r1]
+
+    ldmia       sp!, {r4 - r11, pc}
+    ENDP        ; |vp8_short_inv_walsh4x4_v6|
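The qadd16/qsub16 pairs above implement the standard 4x4 inverse Walsh-Hadamard transform: a column pass of sum/difference butterflies followed by a row pass with a +3 bias and >> 3. A C sketch equivalent to what the SIMD code computes (it mirrors the codec's generic C path; treat it as illustrative, not a copy of the source):

    void short_inv_walsh4x4_sketch(const short *input, short *output) {
        int i, a1, b1, c1, d1;
        const short *ip = input;
        short *op = output;

        for (i = 0; i < 4; i++) {          /* column butterflies */
            a1 = ip[0] + ip[12];
            b1 = ip[4] + ip[8];
            c1 = ip[4] - ip[8];
            d1 = ip[0] - ip[12];
            op[0]  = (short)(a1 + b1);
            op[4]  = (short)(c1 + d1);
            op[8]  = (short)(a1 - b1);
            op[12] = (short)(d1 - c1);
            ip++; op++;
        }
        op = output;
        for (i = 0; i < 4; i++) {          /* row butterflies + rounding */
            a1 = op[0] + op[3];
            b1 = op[1] + op[2];
            c1 = op[1] - op[2];
            d1 = op[0] - op[3];
            op[0] = (short)((a1 + b1 + 3) >> 3);
            op[1] = (short)((c1 + d1 + 3) >> 3);
            op[2] = (short)((a1 - b1 + 3) >> 3);
            op[3] = (short)((d1 - c1 + 3) >> 3);
            op += 4;
        }
    }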
+
+
+;short vp8_short_inv_walsh4x4_1_v6(short *input, short *output)
+|vp8_short_inv_walsh4x4_1_v6| PROC
+
+    ldrsh       r2, [r0]             ; [0]
+    add         r2, r2, #3           ; [0] + 3
+    asr         r2, r2, #3           ; a1 ([0]+3) >> 3
+    lsl         r2, r2, #16          ; [a1 |  x]
+    orr         r2, r2, r2, lsr #16  ; [a1 | a1]
+
+    str         r2, [r1], #4
+    str         r2, [r1], #4
+    str         r2, [r1], #4
+    str         r2, [r1], #4
+    str         r2, [r1], #4
+    str         r2, [r1], #4
+    str         r2, [r1], #4
+    str         r2, [r1]
+
+    bx          lr
+    ENDP        ; |vp8_short_inv_walsh4x4_1_v6|
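The DC-only variant above rounds the single input coefficient and broadcasts it; the assembly builds the 32-bit [a1 | a1] pair once and issues eight word stores. A sketch:

    /* All 16 outputs get ((input[0] + 3) >> 3). */
    void short_inv_walsh4x4_1_sketch(const short *input, short *output) {
        short a1 = (short)((input[0] + 3) >> 3);
        for (int i = 0; i < 16; i++)
            output[i] = a1;
    }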
+
+; Constant Pool
+c0x00030003 DCD 0x00030003
+    END
--- /dev/null
+++ b/vp9/common/arm/armv6/loopfilter_v6.asm
@@ -1,0 +1,1282 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT |vp9_loop_filter_horizontal_edge_armv6|
+    EXPORT |vp9_mbloop_filter_horizontal_edge_armv6|
+    EXPORT |vp9_loop_filter_vertical_edge_armv6|
+    EXPORT |vp9_mbloop_filter_vertical_edge_armv6|
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+
+    MACRO
+    TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
+    ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3
+    ; a0: 03 02 01 00
+    ; a1: 13 12 11 10
+    ; a2: 23 22 21 20
+    ; a3: 33 32 31 30
+    ;     b3 b2 b1 b0
+
+    uxtb16      $b1, $a1                    ; xx 12 xx 10
+    uxtb16      $b0, $a0                    ; xx 02 xx 00
+    uxtb16      $b3, $a3                    ; xx 32 xx 30
+    uxtb16      $b2, $a2                    ; xx 22 xx 20
+    orr         $b1, $b0, $b1, lsl #8       ; 12 02 10 00
+    orr         $b3, $b2, $b3, lsl #8       ; 32 22 30 20
+
+    uxtb16      $a1, $a1, ror #8            ; xx 13 xx 11
+    uxtb16      $a3, $a3, ror #8            ; xx 33 xx 31
+    uxtb16      $a0, $a0, ror #8            ; xx 03 xx 01
+    uxtb16      $a2, $a2, ror #8            ; xx 23 xx 21
+    orr         $a0, $a0, $a1, lsl #8       ; 13 03 11 01
+    orr         $a2, $a2, $a3, lsl #8       ; 33 23 31 21
+
+    pkhtb       $b2, $b3, $b1, asr #16      ; 32 22 12 02   -- p1
+    pkhbt       $b0, $b1, $b3, lsl #16      ; 30 20 10 00   -- p3
+
+    pkhtb       $b3, $a2, $a0, asr #16      ; 33 23 13 03   -- p0
+    pkhbt       $b1, $a0, $a2, lsl #16      ; 31 21 11 01   -- p2
+    MEND
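In scalar terms the macro is a 4x4 byte-matrix transpose: each input word is a row of four pixels and each output word is a column. A C sketch (array names are illustrative):

    /* Equivalent of TRANSPOSE_MATRIX on a 4x4 block of bytes:
     * output word c collects byte c of every input row. */
    static void transpose4x4(const unsigned char a[4][4],
                             unsigned char b[4][4]) {
        for (int r = 0; r < 4; r++)
            for (int c = 0; c < 4; c++)
                b[c][r] = a[r][c];
    }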
+
+
+src         RN  r0
+pstep       RN  r1
+count       RN  r5
+
+;r0     unsigned char *src_ptr,
+;r1     int src_pixel_step,
+;r2     const char *blimit,
+;r3     const char *limit,
+;stack  const char *thresh,
+;stack  int  count
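In C terms, the register/stack map above corresponds to the following prototype (a sketch inferred from these comments, not a declaration copied from a header):

    void vp9_loop_filter_horizontal_edge_armv6(unsigned char *src_ptr,
                                               int src_pixel_step,
                                               const char *blimit,
                                               const char *limit,
                                               const char *thresh,
                                               int count);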
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp9_loop_filter_horizontal_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+    stmdb       sp!, {r4 - r11, lr}
+
+    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
+    ldr         count, [sp, #40]            ; count for 8-in-parallel
+    ldr         r6, [sp, #36]               ; load thresh address
+    sub         sp, sp, #16                 ; create temp buffer
+
+    ldr         r9, [src], pstep            ; p3
+    ldrb        r4, [r2]                    ; blimit
+    ldr         r10, [src], pstep           ; p2
+    ldrb        r2, [r3]                    ; limit
+    ldr         r11, [src], pstep           ; p1
+    orr         r4, r4, r4, lsl #8
+    ldrb        r3, [r6]                    ; thresh
+    orr         r2, r2, r2, lsl #8
+    mov         count, count, lsl #1        ; 4-in-parallel
+    orr         r4, r4, r4, lsl #16
+    orr         r3, r3, r3, lsl #8
+    orr         r2, r2, r2, lsl #16
+    orr         r3, r3, r3, lsl #16
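The orr ladder above splats each single-byte threshold (blimit, limit, thresh) into all four byte lanes of a word, so the uqsub8 comparisons below can test four pixels at once. Equivalently (hypothetical helper):

    /* b | b<<8 | b<<16 | b<<24 */
    static unsigned int splat4(unsigned char b) { return b * 0x01010101u; }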
+
+|Hnext8|
+    ; vp9_filter_mask() function
+    ; calculate breakout conditions
+    ldr         r12, [src], pstep           ; p0
+
+    uqsub8      r6, r9, r10                 ; p3 - p2
+    uqsub8      r7, r10, r9                 ; p2 - p3
+    uqsub8      r8, r10, r11                ; p2 - p1
+    uqsub8      r10, r11, r10               ; p1 - p2
+
+    orr         r6, r6, r7                  ; abs (p3-p2)
+    orr         r8, r8, r10                 ; abs (p2-p1)
+    uqsub8      lr, r6, r2                  ; compare to limit. lr: vp9_filter_mask
+    uqsub8      r8, r8, r2                  ; compare to limit
+    uqsub8      r6, r11, r12                ; p1 - p0
+    orr         lr, lr, r8
+    uqsub8      r7, r12, r11                ; p0 - p1
+    ldr         r9, [src], pstep            ; q0
+    ldr         r10, [src], pstep           ; q1
+    orr         r6, r6, r7                  ; abs (p1-p0)
+    uqsub8      r7, r6, r2                  ; compare to limit
+    uqsub8      r8, r6, r3                  ; compare to thresh  -- save r8 for later
+    orr         lr, lr, r7
+
+    uqsub8      r6, r11, r10                ; p1 - q1
+    uqsub8      r7, r10, r11                ; q1 - p1
+    uqsub8      r11, r12, r9                ; p0 - q0
+    uqsub8      r12, r9, r12                ; q0 - p0
+    orr         r6, r6, r7                  ; abs (p1-q1)
+    ldr         r7, c0x7F7F7F7F
+    orr         r12, r11, r12               ; abs (p0-q0)
+    ldr         r11, [src], pstep           ; q2
+    uqadd8      r12, r12, r12               ; abs (p0-q0) * 2
+    and         r6, r7, r6, lsr #1          ; abs (p1-q1) / 2
+    uqsub8      r7, r9, r10                 ; q0 - q1
+    uqadd8      r12, r12, r6                ; abs (p0-q0)*2 + abs (p1-q1)/2
+    uqsub8      r6, r10, r9                 ; q1 - q0
+    uqsub8      r12, r12, r4                ; compare to flimit
+    uqsub8      r9, r11, r10                ; q2 - q1
+
+    orr         lr, lr, r12
+
+    ldr         r12, [src], pstep           ; q3
+    uqsub8      r10, r10, r11               ; q1 - q2
+    orr         r6, r7, r6                  ; abs (q1-q0)
+    orr         r10, r9, r10                ; abs (q2-q1)
+    uqsub8      r7, r6, r2                  ; compare to limit
+    uqsub8      r10, r10, r2                ; compare to limit
+    uqsub8      r6, r6, r3                  ; compare to thresh -- save r6 for later
+    orr         lr, lr, r7
+    orr         lr, lr, r10
+
+    uqsub8      r10, r12, r11               ; q3 - q2
+    uqsub8      r9, r11, r12                ; q2 - q3
+
+    mvn         r11, #0                     ; r11 == -1
+
+    orr         r10, r10, r9                ; abs (q3-q2)
+    uqsub8      r10, r10, r2                ; compare to limit
+
+    mov         r12, #0
+    orr         lr, lr, r10
+    sub         src, src, pstep, lsl #2
+
+    usub8       lr, r12, lr                 ; use usub8 instead of ssub8
+    sel         lr, r11, r12                ; filter mask: lr
+
+    cmp         lr, #0
+    beq         hskip_filter                 ; skip filtering
+
+    sub         src, src, pstep, lsl #1     ; move src pointer down by 6 lines
+
+    ;vp8_hevmask() function
+    ;calculate high edge variance
+    orr         r10, r6, r8                 ; calculate vp8_hevmask
+
+    ldr         r7, [src], pstep            ; p1
+
+    usub8       r10, r12, r10               ; use usub8 instead of ssub8
+    sel         r6, r12, r11                ; obtain vp8_hevmask: r6
+
+    ;vp9_filter() function
+    ldr         r8, [src], pstep            ; p0
+    ldr         r12, c0x80808080
+    ldr         r9, [src], pstep            ; q0
+    ldr         r10, [src], pstep           ; q1
+
+    eor         r7, r7, r12                 ; p1 offset to convert to a signed value
+    eor         r8, r8, r12                 ; p0 offset to convert to a signed value
+    eor         r9, r9, r12                 ; q0 offset to convert to a signed value
+    eor         r10, r10, r12               ; q1 offset to convert to a signed value
+
+    str         r9, [sp]                    ; store qs0 temporarily
+    str         r8, [sp, #4]                ; store ps0 temporarily
+    str         r10, [sp, #8]               ; store qs1 temporarily
+    str         r7, [sp, #12]               ; store ps1 temporarily
+
+    qsub8       r7, r7, r10                 ; vp9_signed_char_clamp(ps1-qs1)
+    qsub8       r8, r9, r8                  ; vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0))
+
+    and         r7, r7, r6                  ; vp9_filter (r7) &= hev
+
+    qadd8       r7, r7, r8
+    ldr         r9, c0x03030303             ; r9 = 3 --modified for vp8
+
+    qadd8       r7, r7, r8
+    ldr         r10, c0x04040404
+
+    qadd8       r7, r7, r8
+    and         r7, r7, lr                  ; vp9_filter &= mask;
+
+    ;modify code for vp8 -- Filter1 = vp9_filter (r7)
+    qadd8       r8 , r7 , r9                ; Filter2 (r8) = vp9_signed_char_clamp(vp9_filter+3)
+    qadd8       r7 , r7 , r10               ; vp9_filter = vp9_signed_char_clamp(vp9_filter+4)
+
+    mov         r9, #0
+    shadd8      r8 , r8 , r9                ; Filter2 >>= 3
+    shadd8      r7 , r7 , r9                ; vp9_filter >>= 3
+    shadd8      r8 , r8 , r9
+    shadd8      r7 , r7 , r9
+    shadd8      lr , r8 , r9                ; lr: Filter2
+    shadd8      r7 , r7 , r9                ; r7: filter
+
+    ;usub8      lr, r8, r10                 ; s = (s==4)*-1
+    ;sel        lr, r11, r9
+    ;usub8      r8, r10, r8
+    ;sel        r8, r11, r9
+    ;and        r8, r8, lr                  ; -1 for each element that equals 4
+
+    ;calculate output
+    ;qadd8      lr, r8, r7                  ; u = vp9_signed_char_clamp(s + vp9_filter)
+
+    ldr         r8, [sp]                    ; load qs0
+    ldr         r9, [sp, #4]                ; load ps0
+
+    ldr         r10, c0x01010101
+
+    qsub8       r8 ,r8, r7                  ; u = vp9_signed_char_clamp(qs0 - vp9_filter)
+    qadd8       r9, r9, lr                  ; u = vp9_signed_char_clamp(ps0 + Filter2)
+
+    ;end of modification for vp8
+
+    mov         lr, #0
+    sadd8       r7, r7 , r10                ; vp9_filter += 1
+    shadd8      r7, r7, lr                  ; vp9_filter >>= 1
+
+    ldr         r11, [sp, #12]              ; load ps1
+    ldr         r10, [sp, #8]               ; load qs1
+
+    bic         r7, r7, r6                  ; vp9_filter &= ~hev
+    sub         src, src, pstep, lsl #2
+
+    qadd8       r11, r11, r7                ; u = vp9_signed_char_clamp(ps1 + vp9_filter)
+    qsub8       r10, r10,r7                 ; u = vp9_signed_char_clamp(qs1 - vp9_filter)
+
+    eor         r11, r11, r12               ; *op1 = u^0x80
+    str         r11, [src], pstep           ; store op1
+    eor         r9, r9, r12                 ; *op0 = u^0x80
+    str         r9, [src], pstep            ; store op0 result
+    eor         r8, r8, r12                 ; *oq0 = u^0x80
+    str         r8, [src], pstep            ; store oq0 result
+    eor         r10, r10, r12               ; *oq1 = u^0x80
+    str         r10, [src], pstep           ; store oq1
+
+    sub         src, src, pstep, lsl #1
+
+|hskip_filter|
+    add         src, src, #4
+    sub         src, src, pstep, lsl #2
+
+    subs        count, count, #1
+
+    ldrne       r9, [src], pstep            ; p3
+    ldrne       r10, [src], pstep           ; p2
+    ldrne       r11, [src], pstep           ; p1
+
+    bne         Hnext8
+
+    add         sp, sp, #16
+    ldmia       sp!, {r4 - r11, pc}
+    ENDP        ; |vp9_loop_filter_horizontal_edge_armv6|
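Summarizing the two stages above in scalar form: the mask requires every neighbour delta (|p3-p2|, |p2-p1|, |p1-p0|, |q1-q0|, |q2-q1|, |q3-q2|) to stay within limit and |p0-q0|*2 + |p1-q1|/2 to stay within blimit; hev is set where |p1-p0| or |q1-q0| exceeds thresh. The filter update is then, per pixel, roughly the following (a hedged C sketch: the SIMD code saturates at every qadd8 step, collapsed here into single clamps, and sclamp is a hypothetical helper):

    static int sclamp(int v) { return v < -128 ? -128 : v > 127 ? 127 : v; }

    /* ps1, ps0, qs0, qs1 are pixels xor'ed with 0x80 (signed form);
     * mask and hev are 0 or all-ones per pixel, as computed above. */
    static void loop_filter_pixel(signed char *ps1, signed char *ps0,
                                  signed char *qs0, signed char *qs1,
                                  int mask, int hev) {
        int f  = sclamp((sclamp(*ps1 - *qs1) & hev)
                        + 3 * (*qs0 - *ps0)) & mask;
        int f1 = sclamp(f + 4) >> 3;            /* applied to q0 */
        int f2 = sclamp(f + 3) >> 3;            /* applied to p0 */
        *qs0 = (signed char)sclamp(*qs0 - f1);
        *ps0 = (signed char)sclamp(*ps0 + f2);
        f = ((f1 + 1) >> 1) & ~hev;             /* outer taps skip hev pixels */
        *ps1 = (signed char)sclamp(*ps1 + f);
        *qs1 = (signed char)sclamp(*qs1 - f);
    }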
+
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp9_mbloop_filter_horizontal_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+    stmdb       sp!, {r4 - r11, lr}
+
+    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
+    ldr         count, [sp, #40]            ; count for 8-in-parallel
+    ldr         r6, [sp, #36]               ; load thresh address
+    sub         sp, sp, #16                 ; create temp buffer
+
+    ldr         r9, [src], pstep            ; p3
+    ldrb        r4, [r2]                    ; blimit
+    ldr         r10, [src], pstep           ; p2
+    ldrb        r2, [r3]                    ; limit
+    ldr         r11, [src], pstep           ; p1
+    orr         r4, r4, r4, lsl #8
+    ldrb        r3, [r6]                    ; thresh
+    orr         r2, r2, r2, lsl #8
+    mov         count, count, lsl #1        ; 4-in-parallel
+    orr         r4, r4, r4, lsl #16
+    orr         r3, r3, r3, lsl #8
+    orr         r2, r2, r2, lsl #16
+    orr         r3, r3, r3, lsl #16
+
+|MBHnext8|
+
+    ; vp9_filter_mask() function
+    ; calculate breakout conditions
+    ldr         r12, [src], pstep           ; p0
+
+    uqsub8      r6, r9, r10                 ; p3 - p2
+    uqsub8      r7, r10, r9                 ; p2 - p3
+    uqsub8      r8, r10, r11                ; p2 - p1
+    uqsub8      r10, r11, r10               ; p1 - p2
+
+    orr         r6, r6, r7                  ; abs (p3-p2)
+    orr         r8, r8, r10                 ; abs (p2-p1)
+    uqsub8      lr, r6, r2                  ; compare to limit. lr: vp9_filter_mask
+    uqsub8      r8, r8, r2                  ; compare to limit
+
+    uqsub8      r6, r11, r12                ; p1 - p0
+    orr         lr, lr, r8
+    uqsub8      r7, r12, r11                ; p0 - p1
+    ldr         r9, [src], pstep            ; q0
+    ldr         r10, [src], pstep           ; q1
+    orr         r6, r6, r7                  ; abs (p1-p0)
+    uqsub8      r7, r6, r2                  ; compare to limit
+    uqsub8      r8, r6, r3                  ; compare to thresh  -- save r8 for later
+    orr         lr, lr, r7
+
+    uqsub8      r6, r11, r10                ; p1 - q1
+    uqsub8      r7, r10, r11                ; q1 - p1
+    uqsub8      r11, r12, r9                ; p0 - q0
+    uqsub8      r12, r9, r12                ; q0 - p0
+    orr         r6, r6, r7                  ; abs (p1-q1)
+    ldr         r7, c0x7F7F7F7F
+    orr         r12, r11, r12               ; abs (p0-q0)
+    ldr         r11, [src], pstep           ; q2
+    uqadd8      r12, r12, r12               ; abs (p0-q0) * 2
+    and         r6, r7, r6, lsr #1          ; abs (p1-q1) / 2
+    uqsub8      r7, r9, r10                 ; q0 - q1
+    uqadd8      r12, r12, r6                ; abs (p0-q0)*2 + abs (p1-q1)/2
+    uqsub8      r6, r10, r9                 ; q1 - q0
+    uqsub8      r12, r12, r4                ; compare to flimit
+    uqsub8      r9, r11, r10                ; q2 - q1
+
+    orr         lr, lr, r12
+
+    ldr         r12, [src], pstep           ; q3
+
+    uqsub8      r10, r10, r11               ; q1 - q2
+    orr         r6, r7, r6                  ; abs (q1-q0)
+    orr         r10, r9, r10                ; abs (q2-q1)
+    uqsub8      r7, r6, r2                  ; compare to limit
+    uqsub8      r10, r10, r2                ; compare to limit
+    uqsub8      r6, r6, r3                  ; compare to thresh -- save r6 for later
+    orr         lr, lr, r7
+    orr         lr, lr, r10
+
+    uqsub8      r10, r12, r11               ; q3 - q2
+    uqsub8      r9, r11, r12                ; q2 - q3
+
+    mvn         r11, #0                     ; r11 == -1
+
+    orr         r10, r10, r9                ; abs (q3-q2)
+    uqsub8      r10, r10, r2                ; compare to limit
+
+    mov         r12, #0
+
+    orr         lr, lr, r10
+
+    usub8       lr, r12, lr                 ; use usub8 instead of ssub8
+    sel         lr, r11, r12                ; filter mask: lr
+
+    cmp         lr, #0
+    beq         mbhskip_filter               ; skip filtering
+
+    ;vp8_hevmask() function
+    ;calculate high edge variance
+    sub         src, src, pstep, lsl #2     ; move src pointer down by 6 lines
+    sub         src, src, pstep, lsl #1
+
+    orr         r10, r6, r8
+    ldr         r7, [src], pstep            ; p1
+
+    usub8       r10, r12, r10
+    sel         r6, r12, r11                ; hev mask: r6
+
+    ;vp8_mbfilter() function
+    ;p2, q2 are only needed at the end. Don't need to load them in now.
+    ldr         r8, [src], pstep            ; p0
+    ldr         r12, c0x80808080
+    ldr         r9, [src], pstep            ; q0
+    ldr         r10, [src]                  ; q1
+
+    eor         r7, r7, r12                 ; ps1
+    eor         r8, r8, r12                 ; ps0
+    eor         r9, r9, r12                 ; qs0
+    eor         r10, r10, r12               ; qs1
+
+    qsub8       r12, r9, r8                 ; vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0))
+    str         r7, [sp, #12]               ; store ps1 temporarily
+    qsub8       r7, r7, r10                 ; vp9_signed_char_clamp(ps1-qs1)
+    str         r10, [sp, #8]               ; store qs1 temporarily
+    qadd8       r7, r7, r12
+    str         r9, [sp]                    ; store qs0 temporarily
+    qadd8       r7, r7, r12
+    str         r8, [sp, #4]                ; store ps0 temporarily
+    qadd8       r7, r7, r12                 ; vp9_filter: r7
+
+    ldr         r10, c0x03030303            ; r10 = 3 --modified for vp8
+    ldr         r9, c0x04040404
+
+    and         r7, r7, lr                  ; vp9_filter &= mask (lr is free)
+
+    mov         r12, r7                     ; Filter2: r12
+    and         r12, r12, r6                ; Filter2 &= hev
+
+    ;modify code for vp8
+    ;save bottom 3 bits so that we round one side +4 and the other +3
+    qadd8       r8 , r12 , r9               ; Filter1 (r8) = vp9_signed_char_clamp(Filter2+4)
+    qadd8       r12 , r12 , r10             ; Filter2 (r12) = vp9_signed_char_clamp(Filter2+3)
+
+    mov         r10, #0
+    shadd8      r8 , r8 , r10               ; Filter1 >>= 3
+    shadd8      r12 , r12 , r10             ; Filter2 >>= 3
+    shadd8      r8 , r8 , r10
+    shadd8      r12 , r12 , r10
+    shadd8      r8 , r8 , r10               ; r8: Filter1
+    shadd8      r12 , r12 , r10             ; r12: Filter2
+
+    ldr         r9, [sp]                    ; load qs0
+    ldr         r11, [sp, #4]               ; load ps0
+
+    qsub8       r9 , r9, r8                 ; qs0 = vp9_signed_char_clamp(qs0 - Filter1)
+    qadd8       r11, r11, r12               ; ps0 = vp9_signed_char_clamp(ps0 + Filter2)
+
+    ;save bottom 3 bits so that we round one side +4 and the other +3
+    ;and            r8, r12, r10                ; s = Filter2 & 7 (s: r8)
+    ;qadd8      r12 , r12 , r9              ; Filter2 = vp9_signed_char_clamp(Filter2+4)
+    ;mov            r10, #0
+    ;shadd8     r12 , r12 , r10             ; Filter2 >>= 3
+    ;usub8      lr, r8, r9                  ; s = (s==4)*-1
+    ;sel            lr, r11, r10
+    ;shadd8     r12 , r12 , r10
+    ;usub8      r8, r9, r8
+    ;sel            r8, r11, r10
+    ;ldr            r9, [sp]                    ; load qs0
+    ;ldr            r11, [sp, #4]               ; load ps0
+    ;shadd8     r12 , r12 , r10
+    ;and            r8, r8, lr                  ; -1 for each element that equals 4
+    ;qadd8      r10, r8, r12                ; u = vp9_signed_char_clamp(s + Filter2)
+    ;qsub8      r9 , r9, r12                ; qs0 = vp9_signed_char_clamp(qs0 - Filter2)
+    ;qadd8      r11, r11, r10               ; ps0 = vp9_signed_char_clamp(ps0 + u)
+
+    ;end of modification for vp8
+
+    bic         r12, r7, r6                 ; vp9_filter &= ~hev    ( r6 is free)
+    ;mov        r12, r7
+
+    ;roughly 3/7th difference across boundary
+    mov         lr, #0x1b                   ; 27
+    mov         r7, #0x3f                   ; 63
+
+    sxtb16      r6, r12
+    sxtb16      r10, r12, ror #8
+    smlabb      r8, r6, lr, r7
+    smlatb      r6, r6, lr, r7
+    smlabb      r7, r10, lr, r7
+    smultb      r10, r10, lr
+    ssat        r8, #8, r8, asr #7
+    ssat        r6, #8, r6, asr #7
+    add         r10, r10, #63
+    ssat        r7, #8, r7, asr #7
+    ssat        r10, #8, r10, asr #7
+
+    ldr         lr, c0x80808080
+
+    pkhbt       r6, r8, r6, lsl #16
+    pkhbt       r10, r7, r10, lsl #16
+    uxtb16      r6, r6
+    uxtb16      r10, r10
+
+    sub         src, src, pstep
+
+    orr         r10, r6, r10, lsl #8        ; u = vp9_signed_char_clamp((63 + Filter2 * 27)>>7)
+
+    qsub8       r8, r9, r10                 ; s = vp9_signed_char_clamp(qs0 - u)
+    qadd8       r10, r11, r10               ; s = vp9_signed_char_clamp(ps0 + u)
+    eor         r8, r8, lr                  ; *oq0 = s^0x80
+    str         r8, [src]                   ; store *oq0
+    sub         src, src, pstep
+    eor         r10, r10, lr                ; *op0 = s^0x80
+    str         r10, [src]                  ; store *op0
+
+    ;roughly 2/7th difference across boundary
+    mov         lr, #0x12                   ; 18
+    mov         r7, #0x3f                   ; 63
+
+    sxtb16      r6, r12
+    sxtb16      r10, r12, ror #8
+    smlabb      r8, r6, lr, r7
+    smlatb      r6, r6, lr, r7
+    smlabb      r9, r10, lr, r7
+    smlatb      r10, r10, lr, r7
+    ssat        r8, #8, r8, asr #7
+    ssat        r6, #8, r6, asr #7
+    ssat        r9, #8, r9, asr #7
+    ssat        r10, #8, r10, asr #7
+
+    ldr         lr, c0x80808080
+
+    pkhbt       r6, r8, r6, lsl #16
+    pkhbt       r10, r9, r10, lsl #16
+
+    ldr         r9, [sp, #8]                ; load qs1
+    ldr         r11, [sp, #12]              ; load ps1
+
+    uxtb16      r6, r6
+    uxtb16      r10, r10
+
+    sub         src, src, pstep
+
+    orr         r10, r6, r10, lsl #8        ; u = vp9_signed_char_clamp((63 + Filter2 * 18)>>7)
+
+    qadd8       r11, r11, r10               ; s = vp9_signed_char_clamp(ps1 + u)
+    qsub8       r8, r9, r10                 ; s = vp9_signed_char_clamp(qs1 - u)
+    eor         r11, r11, lr                ; *op1 = s^0x80
+    str         r11, [src], pstep           ; store *op1
+    eor         r8, r8, lr                  ; *oq1 = s^0x80
+    add         src, src, pstep, lsl #1
+
+    mov         r7, #0x3f                   ; 63
+
+    str         r8, [src], pstep            ; store *oq1
+
+    ;roughly 1/7th difference across boundary
+    mov         lr, #0x9                    ; 9
+    ldr         r9, [src]                   ; load q2
+
+    sxtb16      r6, r12
+    sxtb16      r10, r12, ror #8
+    smlabb      r8, r6, lr, r7
+    smlatb      r6, r6, lr, r7
+    smlabb      r12, r10, lr, r7
+    smlatb      r10, r10, lr, r7
+    ssat        r8, #8, r8, asr #7
+    ssat        r6, #8, r6, asr #7
+    ssat        r12, #8, r12, asr #7
+    ssat        r10, #8, r10, asr #7
+
+    sub         src, src, pstep, lsl #2
+
+    pkhbt       r6, r8, r6, lsl #16
+    pkhbt       r10, r12, r10, lsl #16
+
+    sub         src, src, pstep
+    ldr         lr, c0x80808080
+
+    ldr         r11, [src]                  ; load p2
+
+    uxtb16      r6, r6
+    uxtb16      r10, r10
+
+    eor         r9, r9, lr
+    eor         r11, r11, lr
+
+    orr         r10, r6, r10, lsl #8        ; u = vp9_signed_char_clamp((63 + Filter2 * 9)>>7)
+
+    qadd8       r8, r11, r10                ; s = vp9_signed_char_clamp(ps2 + u)
+    qsub8       r10, r9, r10                ; s = vp9_signed_char_clamp(qs2 - u)
+    eor         r8, r8, lr                  ; *op2 = s^0x80
+    str         r8, [src], pstep, lsl #2    ; store *op2
+    add         src, src, pstep
+    eor         r10, r10, lr                ; *oq2 = s^0x80
+    str         r10, [src], pstep, lsl #1   ; store *oq2
+
+|mbhskip_filter|
+    add         src, src, #4
+    sub         src, src, pstep, lsl #3
+    subs        count, count, #1
+
+    ldrne       r9, [src], pstep            ; p3
+    ldrne       r10, [src], pstep           ; p2
+    ldrne       r11, [src], pstep           ; p1
+
+    bne         MBHnext8
+
+    add         sp, sp, #16
+    ldmia       sp!, {r4 - r11, pc}
+    ENDP        ; |vp9_mbloop_filter_horizontal_edge_armv6|
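The macroblock-edge variant first applies the +4/+3 rounding pair to the hev pixels (Filter1/Filter2 above), then feeds the remaining filter value through the three weighted taps annotated as roughly 3/7, 2/7 and 1/7 of the difference across the boundary. In scalar form (hedged sketch; sclamp as in the previous sketch):

    /* f is the masked filter value with hev pixels removed (filter & ~hev). */
    static void mbfilter_taps(signed char *ps2, signed char *ps1,
                              signed char *ps0, signed char *qs0,
                              signed char *qs1, signed char *qs2, int f) {
        int u;
        u = sclamp((63 + f * 27) >> 7);    /* ~3/7 across the edge */
        *ps0 = (signed char)sclamp(*ps0 + u);
        *qs0 = (signed char)sclamp(*qs0 - u);
        u = sclamp((63 + f * 18) >> 7);    /* ~2/7 */
        *ps1 = (signed char)sclamp(*ps1 + u);
        *qs1 = (signed char)sclamp(*qs1 - u);
        u = sclamp((63 + f * 9) >> 7);     /* ~1/7 */
        *ps2 = (signed char)sclamp(*ps2 + u);
        *qs2 = (signed char)sclamp(*qs2 - u);
    }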
+
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp9_loop_filter_vertical_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+    stmdb       sp!, {r4 - r11, lr}
+
+    sub         src, src, #4                ; move src pointer down by 4
+    ldr         count, [sp, #40]            ; count for 8-in-parallel
+    ldr         r12, [sp, #36]              ; load thresh address
+    sub         sp, sp, #16                 ; create temp buffer
+
+    ldr         r6, [src], pstep            ; load source data
+    ldrb        r4, [r2]                    ; blimit
+    ldr         r7, [src], pstep
+    ldrb        r2, [r3]                    ; limit
+    ldr         r8, [src], pstep
+    orr         r4, r4, r4, lsl #8
+    ldrb        r3, [r12]                   ; thresh
+    orr         r2, r2, r2, lsl #8
+    ldr         lr, [src], pstep
+    mov         count, count, lsl #1        ; 4-in-parallel
+    orr         r4, r4, r4, lsl #16
+    orr         r3, r3, r3, lsl #8
+    orr         r2, r2, r2, lsl #16
+    orr         r3, r3, r3, lsl #16
+
+|Vnext8|
+
+    ; vp9_filter_mask() function
+    ; calculate breakout conditions
+    ; transpose the source data for 4-in-parallel operation
+    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
+
+    uqsub8      r7, r9, r10                 ; p3 - p2
+    uqsub8      r8, r10, r9                 ; p2 - p3
+    uqsub8      r9, r10, r11                ; p2 - p1
+    uqsub8      r10, r11, r10               ; p1 - p2
+    orr         r7, r7, r8                  ; abs (p3-p2)
+    orr         r10, r9, r10                ; abs (p2-p1)
+    uqsub8      lr, r7, r2                  ; compare to limit. lr: vp9_filter_mask
+    uqsub8      r10, r10, r2                ; compare to limit
+
+    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
+
+    orr         lr, lr, r10
+
+    uqsub8      r6, r11, r12                ; p1 - p0
+    uqsub8      r7, r12, r11                ; p0 - p1
+    add         src, src, #4                ; move src pointer up by 4
+    orr         r6, r6, r7                  ; abs (p1-p0)
+    str         r11, [sp, #12]              ; save p1
+    uqsub8      r10, r6, r2                 ; compare to limit
+    uqsub8      r11, r6, r3                 ; compare to thresh
+    orr         lr, lr, r10
+
+    ; transpose uses 8 regs (r6 - r12 and lr), so save the register values now
+    ; transpose the source data for 4-in-parallel operation
+    ldr         r6, [src], pstep            ; load source data
+    str         r11, [sp]                   ; push r11 to stack
+    ldr         r7, [src], pstep
+    str         r12, [sp, #4]               ; save current reg before load q0 - q3 data
+    ldr         r8, [src], pstep
+    str         lr, [sp, #8]
+    ldr         lr, [src], pstep
+
+    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
+
+    ldr         lr, [sp, #8]                ; load back (f)limit accumulator
+
+    uqsub8      r6, r12, r11                ; q3 - q2
+    uqsub8      r7, r11, r12                ; q2 - q3
+    uqsub8      r12, r11, r10               ; q2 - q1
+    uqsub8      r11, r10, r11               ; q1 - q2
+    orr         r6, r6, r7                  ; abs (q3-q2)
+    orr         r7, r12, r11                ; abs (q2-q1)
+    uqsub8      r6, r6, r2                  ; compare to limit
+    uqsub8      r7, r7, r2                  ; compare to limit
+    ldr         r11, [sp, #4]               ; load back p0
+    ldr         r12, [sp, #12]              ; load back p1
+    orr         lr, lr, r6
+    orr         lr, lr, r7
+
+    uqsub8      r6, r11, r9                 ; p0 - q0
+    uqsub8      r7, r9, r11                 ; q0 - p0
+    uqsub8      r8, r12, r10                ; p1 - q1
+    uqsub8      r11, r10, r12               ; q1 - p1
+    orr         r6, r6, r7                  ; abs (p0-q0)
+    ldr         r7, c0x7F7F7F7F
+    orr         r8, r8, r11                 ; abs (p1-q1)
+    uqadd8      r6, r6, r6                  ; abs (p0-q0) * 2
+    and         r8, r7, r8, lsr #1          ; abs (p1-q1) / 2
+    uqsub8      r11, r10, r9                ; q1 - q0
+    uqadd8      r6, r8, r6                  ; abs (p0-q0)*2 + abs (p1-q1)/2
+    uqsub8      r12, r9, r10                ; q0 - q1
+    uqsub8      r6, r6, r4                  ; compare to flimit
+
+    orr         r9, r11, r12                ; abs (q1-q0)
+    uqsub8      r8, r9, r2                  ; compare to limit
+    uqsub8      r10, r9, r3                 ; compare to thresh
+    orr         lr, lr, r6
+    orr         lr, lr, r8
+
+    mvn         r11, #0                     ; r11 == -1
+    mov         r12, #0
+
+    usub8       lr, r12, lr
+    ldr         r9, [sp]                    ; load the compared result
+    sel         lr, r11, r12                ; filter mask: lr
+
+    cmp         lr, #0
+    beq         vskip_filter                 ; skip filtering
+
+    ;vp8_hevmask() function
+    ;calculate high edge variance
+
+    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
+
+    orr         r9, r9, r10
+
+    ldrh        r7, [src, #-2]
+    ldrh        r8, [src], pstep
+
+    usub8       r9, r12, r9
+    sel         r6, r12, r11                ; hev mask: r6
+
+    ;vp9_filter() function
+    ; load source data to r6, r11, r12, lr
+    ldrh        r9, [src, #-2]
+    ldrh        r10, [src], pstep
+
+    pkhbt       r12, r7, r8, lsl #16
+
+    ldrh        r7, [src, #-2]
+    ldrh        r8, [src], pstep
+
+    pkhbt       r11, r9, r10, lsl #16
+
+    ldrh        r9, [src, #-2]
+    ldrh        r10, [src], pstep
+
+    ; Transpose needs 8 regs (r6 - r12, and lr). Save r6 and lr first
+    str         r6, [sp]
+    str         lr, [sp, #4]
+
+    pkhbt       r6, r7, r8, lsl #16
+    pkhbt       lr, r9, r10, lsl #16
+
+    ;transpose r12, r11, r6, lr to r7, r8, r9, r10
+    TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10
+
+    ;load back hev_mask r6 and filter_mask lr
+    ldr         r12, c0x80808080
+    ldr         r6, [sp]
+    ldr         lr, [sp, #4]
+
+    eor         r7, r7, r12                 ; p1 offset to convert to a signed value
+    eor         r8, r8, r12                 ; p0 offset to convert to a signed value
+    eor         r9, r9, r12                 ; q0 offset to convert to a signed value
+    eor         r10, r10, r12               ; q1 offset to convert to a signed value
+
+    str         r9, [sp]                    ; store qs0 temporarily
+    str         r8, [sp, #4]                ; store ps0 temporarily
+    str         r10, [sp, #8]               ; store qs1 temporarily
+    str         r7, [sp, #12]               ; store ps1 temporarily
+
+    qsub8       r7, r7, r10                 ; vp9_signed_char_clamp(ps1-qs1)
+    qsub8       r8, r9, r8                  ; vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0))
+
+    and         r7, r7, r6                  ;  vp9_filter (r7) &= hev (r7 : filter)
+
+    qadd8       r7, r7, r8
+    ldr         r9, c0x03030303             ; r9 = 3 --modified for vp8
+
+    qadd8       r7, r7, r8
+    ldr         r10, c0x04040404
+
+    qadd8       r7, r7, r8
+    ;mvn         r11, #0                     ; r11 == -1
+
+    and         r7, r7, lr                  ; vp9_filter &= mask
+
+    ;modify code for vp8 -- Filter1 = vp9_filter (r7)
+    qadd8       r8 , r7 , r9                ; Filter2 (r8) = vp9_signed_char_clamp(vp9_filter+3)
+    qadd8       r7 , r7 , r10               ; vp9_filter = vp9_signed_char_clamp(vp9_filter+4)
+
+    mov         r9, #0
+    shadd8      r8 , r8 , r9                ; Filter2 >>= 3
+    shadd8      r7 , r7 , r9                ; vp9_filter >>= 3
+    shadd8      r8 , r8 , r9
+    shadd8      r7 , r7 , r9
+    shadd8      lr , r8 , r9                ; lr: filter2
+    shadd8      r7 , r7 , r9                ; r7: filter
+
+    ;usub8      lr, r8, r10                 ; s = (s==4)*-1
+    ;sel            lr, r11, r9
+    ;usub8      r8, r10, r8
+    ;sel            r8, r11, r9
+    ;and            r8, r8, lr                  ; -1 for each element that equals 4 -- r8: s
+
+    ;calculate output
+    ;qadd8      lr, r8, r7                  ; u = vp9_signed_char_clamp(s + vp9_filter)
+
+    ldr         r8, [sp]                    ; load qs0
+    ldr         r9, [sp, #4]                ; load ps0
+
+    ldr         r10, c0x01010101
+
+    qsub8       r8, r8, r7                  ; u = vp9_signed_char_clamp(qs0 - vp9_filter)
+    qadd8       r9, r9, lr                  ; u = vp9_signed_char_clamp(ps0 + Filter2)
+    ;end of modification for vp8
+
+    eor         r8, r8, r12
+    eor         r9, r9, r12
+
+    mov         lr, #0
+
+    sadd8       r7, r7, r10
+    shadd8      r7, r7, lr
+
+    ldr         r10, [sp, #8]               ; load qs1
+    ldr         r11, [sp, #12]              ; load ps1
+
+    bic         r7, r7, r6                  ; r7: vp9_filter
+
+    qsub8       r10 , r10, r7               ; u = vp9_signed_char_clamp(qs1 - vp9_filter)
+    qadd8       r11, r11, r7                ; u = vp9_signed_char_clamp(ps1 + vp9_filter)
+    eor         r10, r10, r12
+    eor         r11, r11, r12
+
+    sub         src, src, pstep, lsl #2
+
+    ;we can use TRANSPOSE_MATRIX macro to transpose output - input: q1, q0, p0, p1
+    ;output is b0, b1, b2, b3
+    ;b0: 03 02 01 00
+    ;b1: 13 12 11 10
+    ;b2: 23 22 21 20
+    ;b3: 33 32 31 30
+    ;    p1 p0 q0 q1
+    ;   (a3 a2 a1 a0)
+    TRANSPOSE_MATRIX r11, r9, r8, r10, r6, r7, r12, lr
+
+    strh        r6, [src, #-2]              ; store the result
+    mov         r6, r6, lsr #16
+    strh        r6, [src], pstep
+
+    strh        r7, [src, #-2]
+    mov         r7, r7, lsr #16
+    strh        r7, [src], pstep
+
+    strh        r12, [src, #-2]
+    mov         r12, r12, lsr #16
+    strh        r12, [src], pstep
+
+    strh        lr, [src, #-2]
+    mov         lr, lr, lsr #16
+    strh        lr, [src], pstep
+
+|vskip_filter|
+    sub         src, src, #4
+    subs        count, count, #1
+
+    ldrne       r6, [src], pstep            ; load source data
+    ldrne       r7, [src], pstep
+    ldrne       r8, [src], pstep
+    ldrne       lr, [src], pstep
+
+    bne         Vnext8
+
+    add         sp, sp, #16
+
+    ldmia       sp!, {r4 - r11, pc}
+    ENDP        ; |vp9_loop_filter_vertical_edge_armv6|
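The vertical-edge routine is the same arithmetic rotated 90 degrees: it gathers 4x4 tiles with TRANSPOSE_MATRIX, runs the identical mask/hev/filter computation on the transposed words, then transposes the four result words back for the halfword stores. Per row, the scalar picture is simply the following (a sketch reusing the hypothetical loop_filter_pixel() above; the per-pixel mask/hev computation is omitted):

    static void filter_vertical_edge_sketch(unsigned char *src, int pitch,
                                            int rows, int mask, int hev) {
        for (int r = 0; r < rows; r++, src += pitch) {
            /* p1 p0 | q0 q1 straddle the column edge; convert to signed form */
            signed char p1 = (signed char)(src[-2] ^ 0x80);
            signed char p0 = (signed char)(src[-1] ^ 0x80);
            signed char q0 = (signed char)(src[0]  ^ 0x80);
            signed char q1 = (signed char)(src[1]  ^ 0x80);
            loop_filter_pixel(&p1, &p0, &q0, &q1, mask, hev);
            src[-2] = (unsigned char)(p1 ^ 0x80);
            src[-1] = (unsigned char)(p0 ^ 0x80);
            src[0]  = (unsigned char)(q0 ^ 0x80);
            src[1]  = (unsigned char)(q1 ^ 0x80);
        }
    }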
+
+
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp9_mbloop_filter_vertical_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+    stmdb       sp!, {r4 - r11, lr}
+
+    sub         src, src, #4                ; move src pointer down by 4
+    ldr         count, [sp, #40]            ; count for 8-in-parallel
+    ldr         r12, [sp, #36]              ; load thresh address
+    pld         [src, #23]                  ; preload for next block
+    sub         sp, sp, #16                 ; create temp buffer
+
+    ldr         r6, [src], pstep            ; load source data
+    ldrb        r4, [r2]                    ; blimit
+    pld         [src, #23]
+    ldr         r7, [src], pstep
+    ldrb        r2, [r3]                    ; limit
+    pld         [src, #23]
+    ldr         r8, [src], pstep
+    orr         r4, r4, r4, lsl #8
+    ldrb        r3, [r12]                   ; thresh
+    orr         r2, r2, r2, lsl #8
+    pld         [src, #23]
+    ldr         lr, [src], pstep
+    mov         count, count, lsl #1        ; 4-in-parallel
+    orr         r4, r4, r4, lsl #16
+    orr         r3, r3, r3, lsl #8
+    orr         r2, r2, r2, lsl #16
+    orr         r3, r3, r3, lsl #16
+
+|MBVnext8|
+    ; vp9_filter_mask() function
+    ; calculate breakout conditions
+    ; transpose the source data for 4-in-parallel operation
+    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
+
+    uqsub8      r7, r9, r10                 ; p3 - p2
+    uqsub8      r8, r10, r9                 ; p2 - p3
+    uqsub8      r9, r10, r11                ; p2 - p1
+    uqsub8      r10, r11, r10               ; p1 - p2
+    orr         r7, r7, r8                  ; abs (p3-p2)
+    orr         r10, r9, r10                ; abs (p2-p1)
+    uqsub8      lr, r7, r2                  ; compare to limit. lr: vp9_filter_mask
+    uqsub8      r10, r10, r2                ; compare to limit
+
+    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
+
+    orr         lr, lr, r10
+
+    uqsub8      r6, r11, r12                ; p1 - p0
+    uqsub8      r7, r12, r11                ; p0 - p1
+    add         src, src, #4                ; move src pointer up by 4
+    orr         r6, r6, r7                  ; abs (p1-p0)
+    str         r11, [sp, #12]              ; save p1
+    uqsub8      r10, r6, r2                 ; compare to limit
+    uqsub8      r11, r6, r3                 ; compare to thresh
+    orr         lr, lr, r10
+
+    ; transpose uses 8 regs (r6 - r12 and lr), so save the register values now
+    ; transpose the source data for 4-in-parallel operation
+    ldr         r6, [src], pstep            ; load source data
+    str         r11, [sp]                   ; push r11 to stack
+    ldr         r7, [src], pstep
+    str         r12, [sp, #4]               ; save current reg before load q0 - q3 data
+    ldr         r8, [src], pstep
+    str         lr, [sp, #8]
+    ldr         lr, [src], pstep
+
+
+    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
+
+    ldr         lr, [sp, #8]                ; load back (f)limit accumulator
+
+    uqsub8      r6, r12, r11                ; q3 - q2
+    uqsub8      r7, r11, r12                ; q2 - q3
+    uqsub8      r12, r11, r10               ; q2 - q1
+    uqsub8      r11, r10, r11               ; q1 - q2
+    orr         r6, r6, r7                  ; abs (q3-q2)
+    orr         r7, r12, r11                ; abs (q2-q1)
+    uqsub8      r6, r6, r2                  ; compare to limit
+    uqsub8      r7, r7, r2                  ; compare to limit
+    ldr         r11, [sp, #4]               ; load back p0
+    ldr         r12, [sp, #12]              ; load back p1
+    orr         lr, lr, r6
+    orr         lr, lr, r7
+
+    uqsub8      r6, r11, r9                 ; p0 - q0
+    uqsub8      r7, r9, r11                 ; q0 - p0
+    uqsub8      r8, r12, r10                ; p1 - q1
+    uqsub8      r11, r10, r12               ; q1 - p1
+    orr         r6, r6, r7                  ; abs (p0-q0)
+    ldr         r7, c0x7F7F7F7F
+    orr         r8, r8, r11                 ; abs (p1-q1)
+    uqadd8      r6, r6, r6                  ; abs (p0-q0) * 2
+    and         r8, r7, r8, lsr #1          ; abs (p1-q1) / 2
+    uqsub8      r11, r10, r9                ; q1 - q0
+    uqadd8      r6, r8, r6                  ; abs (p0-q0)*2 + abs (p1-q1)/2
+    uqsub8      r12, r9, r10                ; q0 - q1
+    uqsub8      r6, r6, r4                  ; compare to flimit
+
+    orr         r9, r11, r12                ; abs (q1-q0)
+    uqsub8      r8, r9, r2                  ; compare to limit
+    uqsub8      r10, r9, r3                 ; compare to thresh
+    orr         lr, lr, r6
+    orr         lr, lr, r8
+
+    mvn         r11, #0                     ; r11 == -1
+    mov         r12, #0
+
+    usub8       lr, r12, lr
+    ldr         r9, [sp]                    ; load the compared result
+    sel         lr, r11, r12                ; filter mask: lr
+
+    cmp         lr, #0
+    beq         mbvskip_filter               ; skip filtering
+
+
+
+    ;vp8_hevmask() function
+    ;calculate high edge variance
+
+    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
+
+    orr         r9, r9, r10
+
+    ldrh        r7, [src, #-2]
+    ldrh        r8, [src], pstep
+
+    usub8       r9, r12, r9
+    sel         r6, r12, r11                ; hev mask: r6
+
+
+    ; vp8_mbfilter() function
+    ; p2, q2 are only needed at the end. Don't need to load them in now.
+    ; Transpose needs 8 regs (r6 - r12, and lr). Save r6 and lr first
+    ; load source data to r6, r11, r12, lr
+    ldrh        r9, [src, #-2]
+    ldrh        r10, [src], pstep
+
+    pkhbt       r12, r7, r8, lsl #16
+
+    ldrh        r7, [src, #-2]
+    ldrh        r8, [src], pstep
+
+    pkhbt       r11, r9, r10, lsl #16
+
+    ldrh        r9, [src, #-2]
+    ldrh        r10, [src], pstep
+
+    str         r6, [sp]                    ; save r6
+    str         lr, [sp, #4]                ; save lr
+
+    pkhbt       r6, r7, r8, lsl #16
+    pkhbt       lr, r9, r10, lsl #16
+
+    ;transpose r12, r11, r6, lr to p1, p0, q0, q1
+    TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10
+
+    ;load back hev_mask r6 and filter_mask lr
+    ldr         r12, c0x80808080
+    ldr         r6, [sp]
+    ldr         lr, [sp, #4]
+
+    eor         r7, r7, r12                 ; ps1
+    eor         r8, r8, r12                 ; ps0
+    eor         r9, r9, r12                 ; qs0
+    eor         r10, r10, r12               ; qs1
+
+    qsub8       r12, r9, r8                 ; vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0))
+    str         r7, [sp, #12]               ; store ps1 temporarily
+    qsub8       r7, r7, r10                 ; vp9_signed_char_clamp(ps1-qs1)
+    str         r10, [sp, #8]               ; store qs1 temporarily
+    qadd8       r7, r7, r12
+    str         r9, [sp]                    ; store qs0 temporarily
+    qadd8       r7, r7, r12
+    str         r8, [sp, #4]                ; store ps0 temporarily
+    qadd8       r7, r7, r12                 ; vp9_filter: r7
+
+    ldr         r10, c0x03030303            ; r10 = 3 --modified for vp8
+    ldr         r9, c0x04040404
+    ;mvn         r11, #0                     ; r11 == -1
+
+    and         r7, r7, lr                  ; vp9_filter &= mask (lr is free)
+
+    mov         r12, r7                     ; Filter2: r12
+    and         r12, r12, r6                ; Filter2 &= hev
+
+    ;modify code for vp8
+    ;save bottom 3 bits so that we round one side +4 and the other +3
+    qadd8       r8 , r12 , r9               ; Filter1 (r8) = vp9_signed_char_clamp(Filter2+4)
+    qadd8       r12 , r12 , r10             ; Filter2 (r12) = vp9_signed_char_clamp(Filter2+3)
+
+    mov         r10, #0
+    shadd8      r8 , r8 , r10               ; Filter1 >>= 3
+    shadd8      r12 , r12 , r10             ; Filter2 >>= 3
+    shadd8      r8 , r8 , r10
+    shadd8      r12 , r12 , r10
+    shadd8      r8 , r8 , r10               ; r8: Filter1
+    shadd8      r12 , r12 , r10             ; r12: Filter2
+
+    ldr         r9, [sp]                    ; load qs0
+    ldr         r11, [sp, #4]               ; load ps0
+
+    qsub8       r9 , r9, r8                 ; qs0 = vp9_signed_char_clamp(qs0 - Filter1)
+    qadd8       r11, r11, r12               ; ps0 = vp9_signed_char_clamp(ps0 + Filter2)
+
+    ;save bottom 3 bits so that we round one side +4 and the other +3
+    ;and            r8, r12, r10                ; s = Filter2 & 7 (s: r8)
+    ;qadd8      r12 , r12 , r9              ; Filter2 = vp9_signed_char_clamp(Filter2+4)
+    ;mov            r10, #0
+    ;shadd8     r12 , r12 , r10             ; Filter2 >>= 3
+    ;usub8      lr, r8, r9                  ; s = (s==4)*-1
+    ;sel            lr, r11, r10
+    ;shadd8     r12 , r12 , r10
+    ;usub8      r8, r9, r8
+    ;sel            r8, r11, r10
+    ;ldr            r9, [sp]                    ; load qs0
+    ;ldr            r11, [sp, #4]               ; load ps0
+    ;shadd8     r12 , r12 , r10
+    ;and            r8, r8, lr                  ; -1 for each element that equals 4
+    ;qadd8      r10, r8, r12                ; u = vp9_signed_char_clamp(s + Filter2)
+    ;qsub8      r9 , r9, r12                ; qs0 = vp9_signed_char_clamp(qs0 - Filter2)
+    ;qadd8      r11, r11, r10               ; ps0 = vp9_signed_char_clamp(ps0 + u)
+
+    ;end of modification for vp8
+
+    bic         r12, r7, r6                 ;vp9_filter &= ~hev    ( r6 is free)
+    ;mov            r12, r7
+
+    ;roughly 3/7th difference across boundary
+    mov         lr, #0x1b                   ; 27
+    mov         r7, #0x3f                   ; 63
+
+    sxtb16      r6, r12
+    sxtb16      r10, r12, ror #8
+    smlabb      r8, r6, lr, r7
+    smlatb      r6, r6, lr, r7
+    smlabb      r7, r10, lr, r7
+    smultb      r10, r10, lr
+    ssat        r8, #8, r8, asr #7
+    ssat        r6, #8, r6, asr #7
+    add         r10, r10, #63
+    ssat        r7, #8, r7, asr #7
+    ssat        r10, #8, r10, asr #7
+
+    ldr         lr, c0x80808080
+
+    pkhbt       r6, r8, r6, lsl #16
+    pkhbt       r10, r7, r10, lsl #16
+    uxtb16      r6, r6
+    uxtb16      r10, r10
+
+    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
+
+    orr         r10, r6, r10, lsl #8        ; u = vp9_signed_char_clamp((63 + Filter2 * 27)>>7)
+
+    qsub8       r8, r9, r10                 ; s = vp9_signed_char_clamp(qs0 - u)
+    qadd8       r10, r11, r10               ; s = vp9_signed_char_clamp(ps0 + u)
+    eor         r8, r8, lr                  ; *oq0 = s^0x80
+    eor         r10, r10, lr                ; *op0 = s^0x80
+
+    strb        r10, [src, #-1]             ; store op0 result
+    strb        r8, [src], pstep            ; store oq0 result
+    mov         r10, r10, lsr #8
+    mov         r8, r8, lsr #8
+    strb        r10, [src, #-1]
+    strb        r8, [src], pstep
+    mov         r10, r10, lsr #8
+    mov         r8, r8, lsr #8
+    strb        r10, [src, #-1]
+    strb        r8, [src], pstep
+    mov         r10, r10, lsr #8
+    mov         r8, r8, lsr #8
+    strb        r10, [src, #-1]
+    strb        r8, [src], pstep
+
+    ;roughly 2/7th difference across boundary
+    mov         lr, #0x12                   ; 18
+    mov         r7, #0x3f                   ; 63
+
+    sxtb16      r6, r12
+    sxtb16      r10, r12, ror #8
+    smlabb      r8, r6, lr, r7
+    smlatb      r6, r6, lr, r7
+    smlabb      r9, r10, lr, r7
+
+    smlatb      r10, r10, lr, r7
+    ssat        r8, #8, r8, asr #7
+    ssat        r6, #8, r6, asr #7
+    ssat        r9, #8, r9, asr #7
+    ssat        r10, #8, r10, asr #7
+
+    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
+
+    pkhbt       r6, r8, r6, lsl #16
+    pkhbt       r10, r9, r10, lsl #16
+
+    ldr         r9, [sp, #8]                ; load qs1
+    ldr         r11, [sp, #12]              ; load ps1
+    ldr         lr, c0x80808080
+
+    uxtb16      r6, r6
+    uxtb16      r10, r10
+
+    add         src, src, #2
+
+    orr         r10, r6, r10, lsl #8        ; u = vp9_signed_char_clamp((63 + Filter2 * 18)>>7)
+
+    qsub8       r8, r9, r10                 ; s = vp9_signed_char_clamp(qs1 - u)
+    qadd8       r10, r11, r10               ; s = vp9_signed_char_clamp(ps1 + u)
+    eor         r8, r8, lr                  ; *oq1 = s^0x80
+    eor         r10, r10, lr                ; *op1 = s^0x80
+
+    ldrb        r11, [src, #-5]             ; load p2 for 1/7th difference across boundary
+    strb        r10, [src, #-4]             ; store op1
+    strb        r8, [src, #-1]              ; store oq1
+    ldrb        r9, [src], pstep            ; load q2 for 1/7th difference across boundary
+
+    mov         r10, r10, lsr #8
+    mov         r8, r8, lsr #8
+
+    ldrb        r6, [src, #-5]
+    strb        r10, [src, #-4]
+    strb        r8, [src, #-1]
+    ldrb        r7, [src], pstep
+
+    mov         r10, r10, lsr #8
+    mov         r8, r8, lsr #8
+    orr         r11, r11, r6, lsl #8
+    orr         r9, r9, r7, lsl #8
+
+    ldrb        r6, [src, #-5]
+    strb        r10, [src, #-4]
+    strb        r8, [src, #-1]
+    ldrb        r7, [src], pstep
+
+    mov         r10, r10, lsr #8
+    mov         r8, r8, lsr #8
+    orr         r11, r11, r6, lsl #16
+    orr         r9, r9, r7, lsl #16
+
+    ldrb        r6, [src, #-5]
+    strb        r10, [src, #-4]
+    strb        r8, [src, #-1]
+    ldrb        r7, [src], pstep
+    orr         r11, r11, r6, lsl #24
+    orr         r9, r9, r7, lsl #24
+
+    ;roughly 1/7th difference across boundary
+    eor         r9, r9, lr
+    eor         r11, r11, lr
+
+    mov         lr, #0x9                    ; 9
+    mov         r7, #0x3f                   ; 63
+
+    sxtb16      r6, r12
+    sxtb16      r10, r12, ror #8
+    smlabb      r8, r6, lr, r7
+    smlatb      r6, r6, lr, r7
+    smlabb      r12, r10, lr, r7
+    smlatb      r10, r10, lr, r7
+    ssat        r8, #8, r8, asr #7
+    ssat        r6, #8, r6, asr #7
+    ssat        r12, #8, r12, asr #7
+    ssat        r10, #8, r10, asr #7
+
+    sub         src, src, pstep, lsl #2
+
+    pkhbt       r6, r8, r6, lsl #16
+    pkhbt       r10, r12, r10, lsl #16
+
+    uxtb16      r6, r6
+    uxtb16      r10, r10
+
+    ldr         lr, c0x80808080
+
+    orr         r10, r6, r10, lsl #8        ; u = vp9_signed_char_clamp((63 + Filter2 * 9)>>7)
+
+    qadd8       r8, r11, r10                ; s = vp9_signed_char_clamp(ps2 + u)
+    qsub8       r10, r9, r10                ; s = vp9_signed_char_clamp(qs2 - u)
+    eor         r8, r8, lr                  ; *op2 = s^0x80
+    eor         r10, r10, lr                ; *oq2 = s^0x80
+
+    strb        r8, [src, #-5]              ; store *op2
+    strb        r10, [src], pstep           ; store *oq2
+    mov         r8, r8, lsr #8
+    mov         r10, r10, lsr #8
+    strb        r8, [src, #-5]
+    strb        r10, [src], pstep
+    mov         r8, r8, lsr #8
+    mov         r10, r10, lsr #8
+    strb        r8, [src, #-5]
+    strb        r10, [src], pstep
+    mov         r8, r8, lsr #8
+    mov         r10, r10, lsr #8
+    strb        r8, [src, #-5]
+    strb        r10, [src], pstep
+
+    ;adjust src pointer for next loop
+    sub         src, src, #2
+
+|mbvskip_filter|
+    sub         src, src, #4
+    subs        count, count, #1
+
+    pld         [src, #23]                  ; preload for next block
+    ldrne       r6, [src], pstep            ; load source data
+    pld         [src, #23]
+    ldrne       r7, [src], pstep
+    pld         [src, #23]
+    ldrne       r8, [src], pstep
+    pld         [src, #23]
+    ldrne       lr, [src], pstep
+
+    bne         MBVnext8
+
+    add         sp, sp, #16
+
+    ldmia       sp!, {r4 - r11, pc}
+    ENDP        ; |vp9_mbloop_filter_vertical_edge_armv6|
+
+; Constant Pool
+c0x80808080 DCD     0x80808080
+c0x03030303 DCD     0x03030303
+c0x04040404 DCD     0x04040404
+c0x01010101 DCD     0x01010101
+c0x7F7F7F7F DCD     0x7F7F7F7F
+
+    END
--- /dev/null
+++ b/vp9/common/arm/armv6/recon_v6.asm
@@ -1,0 +1,281 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_recon_b_armv6|
+    EXPORT  |vp8_recon2b_armv6|
+    EXPORT  |vp8_recon4b_armv6|
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+prd     RN  r0
+dif     RN  r1
+dst     RN  r2
+stride      RN  r3
+
+;void recon_b(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride)
+; R0 unsigned char *pred_ptr
+; R1 short *diff_ptr
+; R2 unsigned char *dst_ptr
+; R3 int stride
+
+; Description:
+; Loop through the block adding the Pred and Diff together.  Clamp and then
+; store back into the Dst.
+
+; Restrictions:
+; all buffers are expected to be 4-byte aligned coming in and
+; going out.
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+;
+;
+;
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp8_recon_b_armv6| PROC
+    stmdb   sp!, {r4 - r9, lr}
+
+    ;0, 1, 2, 3
+    ldr     r4, [prd], #16          ; 3 | 2 | 1 | 0
+    ldr     r6, [dif, #0]           ;     1 |     0
+    ldr     r7, [dif, #4]           ;     3 |     2
+
+    pkhbt   r8, r6, r7, lsl #16     ;     2 |     0
+    pkhtb   r9, r7, r6, asr #16     ;     3 |     1
+
+    uxtab16 r8, r8, r4              ;     2 |     0  +  3 | 2 | 2 | 0
+    uxtab16 r9, r9, r4, ror #8      ;     3 |     1  +  0 | 3 | 2 | 1
+
+    usat16  r8, #8, r8
+    usat16  r9, #8, r9
+    add     dif, dif, #32
+    orr     r8, r8, r9, lsl #8
+
+    str     r8, [dst], stride
+
+    ;0, 1, 2, 3
+    ldr     r4, [prd], #16          ; 3 | 2 | 1 | 0
+;;  ldr     r6, [dif, #8]           ;     1 |     0
+;;  ldr     r7, [dif, #12]          ;     3 |     2
+    ldr     r6, [dif, #0]           ;     1 |     0
+    ldr     r7, [dif, #4]           ;     3 |     2
+
+    pkhbt   r8, r6, r7, lsl #16     ;     2 |     0
+    pkhtb   r9, r7, r6, asr #16     ;     3 |     1
+
+    uxtab16 r8, r8, r4              ;     2 |     0  +  3 | 2 | 2 | 0
+    uxtab16 r9, r9, r4, ror #8      ;     3 |     1  +  0 | 3 | 2 | 1
+
+    usat16  r8, #8, r8
+    usat16  r9, #8, r9
+    add     dif, dif, #32
+    orr     r8, r8, r9, lsl #8
+
+    str     r8, [dst], stride
+
+    ;0, 1, 2, 3
+    ldr     r4, [prd], #16          ; 3 | 2 | 1 | 0
+;;  ldr     r6, [dif, #16]          ;     1 |     0
+;;  ldr     r7, [dif, #20]          ;     3 |     2
+    ldr     r6, [dif, #0]           ;     1 |     0
+    ldr     r7, [dif, #4]           ;     3 |     2
+
+    pkhbt   r8, r6, r7, lsl #16     ;     2 |     0
+    pkhtb   r9, r7, r6, asr #16     ;     3 |     1
+
+    uxtab16 r8, r8, r4              ;     2 |     0  +  3 | 2 | 2 | 0
+    uxtab16 r9, r9, r4, ror #8      ;     3 |     1  +  0 | 3 | 2 | 1
+
+    usat16  r8, #8, r8
+    usat16  r9, #8, r9
+    add     dif, dif, #32
+    orr     r8, r8, r9, lsl #8
+
+    str     r8, [dst], stride
+
+    ;0, 1, 2, 3
+    ldr     r4, [prd], #16          ; 3 | 2 | 1 | 0
+;;  ldr     r6, [dif, #24]          ;     1 |     0
+;;  ldr     r7, [dif, #28]          ;     3 |     2
+    ldr     r6, [dif, #0]           ;     1 |     0
+    ldr     r7, [dif, #4]           ;     3 |     2
+
+    pkhbt   r8, r6, r7, lsl #16     ;     2 |     0
+    pkhtb   r9, r7, r6, asr #16     ;     3 |     1
+
+    uxtab16 r8, r8, r4              ;     2 |     0  +  3 | 2 | 2 | 0
+    uxtab16 r9, r9, r4, ror #8      ;     3 |     1  +  0 | 3 | 2 | 1
+
+    usat16  r8, #8, r8
+    usat16  r9, #8, r9
+    orr     r8, r8, r9, lsl #8
+
+    str     r8, [dst], stride
+
+    ldmia   sp!, {r4 - r9, pc}
+
+    ENDP    ; |recon_b|
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+;
+;
+;
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+; R0 char  *pred_ptr
+; R1 short *dif_ptr
+; R2 char  *dst_ptr
+; R3 int stride
+|vp8_recon4b_armv6| PROC
+    stmdb   sp!, {r4 - r9, lr}
+
+    mov     lr, #4
+
+recon4b_loop
+    ;0, 1, 2, 3
+    ldr     r4, [prd], #4           ; 3 | 2 | 1 | 0
+    ldr     r6, [dif, #0]           ;     1 |     0
+    ldr     r7, [dif, #4]           ;     3 |     2
+
+    pkhbt   r8, r6, r7, lsl #16     ;     2 |     0
+    pkhtb   r9, r7, r6, asr #16     ;     3 |     1
+
+    uxtab16 r8, r8, r4              ;     2 |     0  +  3 | 2 | 2 | 0
+    uxtab16 r9, r9, r4, ror #8      ;     3 |     1  +  0 | 3 | 2 | 1
+
+    usat16  r8, #8, r8
+    usat16  r9, #8, r9
+    orr     r8, r8, r9, lsl #8
+
+    str     r8, [dst]
+
+    ;4, 5, 6, 7
+    ldr     r4, [prd], #4
+;;  ldr     r6, [dif, #32]
+;;  ldr     r7, [dif, #36]
+    ldr     r6, [dif, #8]
+    ldr     r7, [dif, #12]
+
+    pkhbt   r8, r6, r7, lsl #16
+    pkhtb   r9, r7, r6, asr #16
+
+    uxtab16 r8, r8, r4
+    uxtab16 r9, r9, r4, ror #8
+    usat16  r8, #8, r8
+    usat16  r9, #8, r9
+    orr     r8, r8, r9, lsl #8
+
+    str     r8, [dst, #4]
+
+    ;8, 9, 10, 11
+    ldr     r4, [prd], #4
+;;  ldr     r6, [dif, #64]
+;;  ldr     r7, [dif, #68]
+    ldr     r6, [dif, #16]
+    ldr     r7, [dif, #20]
+
+    pkhbt   r8, r6, r7, lsl #16
+    pkhtb   r9, r7, r6, asr #16
+
+    uxtab16 r8, r8, r4
+    uxtab16 r9, r9, r4, ror #8
+    usat16  r8, #8, r8
+    usat16  r9, #8, r9
+    orr     r8, r8, r9, lsl #8
+
+    str     r8, [dst, #8]
+
+    ;12, 13, 14, 15
+    ldr     r4, [prd], #4
+;;  ldr     r6, [dif, #96]
+;;  ldr     r7, [dif, #100]
+    ldr     r6, [dif, #24]
+    ldr     r7, [dif, #28]
+
+    pkhbt   r8, r6, r7, lsl #16
+    pkhtb   r9, r7, r6, asr #16
+
+    uxtab16 r8, r8, r4
+    uxtab16 r9, r9, r4, ror #8
+    usat16  r8, #8, r8
+    usat16  r9, #8, r9
+    orr     r8, r8, r9, lsl #8
+
+    str     r8, [dst, #12]
+
+    add     dst, dst, stride
+;;  add     dif, dif, #8
+    add     dif, dif, #32
+
+    subs    lr, lr, #1
+    bne     recon4b_loop
+
+    ldmia   sp!, {r4 - r9, pc}
+
+    ENDP    ; |Recon4B|
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+;
+;
+;
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+; R0 char  *pred_ptr
+; R1 short *dif_ptr
+; R2 char  *dst_ptr
+; R3 int stride
+|vp8_recon2b_armv6| PROC
+    stmdb   sp!, {r4 - r9, lr}
+
+    mov     lr, #4
+
+recon2b_loop
+    ;0, 1, 2, 3
+    ldr     r4, [prd], #4
+    ldr     r6, [dif, #0]
+    ldr     r7, [dif, #4]
+
+    pkhbt   r8, r6, r7, lsl #16
+    pkhtb   r9, r7, r6, asr #16
+
+    uxtab16 r8, r8, r4
+    uxtab16 r9, r9, r4, ror #8
+    usat16  r8, #8, r8
+    usat16  r9, #8, r9
+    orr     r8, r8, r9, lsl #8
+
+    str     r8, [dst]
+
+    ;4, 5, 6, 7
+    ldr     r4, [prd], #4
+;;  ldr     r6, [dif, #32]
+;;  ldr     r7, [dif, #36]
+    ldr     r6, [dif, #8]
+    ldr     r7, [dif, #12]
+
+    pkhbt   r8, r6, r7, lsl #16
+    pkhtb   r9, r7, r6, asr #16
+
+    uxtab16 r8, r8, r4
+    uxtab16 r9, r9, r4, ror #8
+    usat16  r8, #8, r8
+    usat16  r9, #8, r9
+    orr     r8, r8, r9, lsl #8
+
+    str     r8, [dst, #4]
+
+    add     dst, dst, stride
+;;  add     dif, dif, #8
+    add     dif, dif, #16
+
+    subs    lr, lr, #1
+    bne     recon2b_loop
+
+    ldmia   sp!, {r4 - r9, pc}
+
+    ENDP    ; |Recon2B|
+
+    END
--- /dev/null
+++ b/vp9/common/arm/armv6/simpleloopfilter_v6.asm
@@ -1,0 +1,286 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT |vp9_loop_filter_simple_horizontal_edge_armv6|
+    EXPORT |vp9_loop_filter_simple_vertical_edge_armv6|
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+
+    MACRO
+    TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
+    ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3
+    ; a0: 03 02 01 00
+    ; a1: 13 12 11 10
+    ; a2: 23 22 21 20
+    ; a3: 33 32 31 30
+    ;     b3 b2 b1 b0
+
+    uxtb16      $b1, $a1                    ; xx 12 xx 10
+    uxtb16      $b0, $a0                    ; xx 02 xx 00
+    uxtb16      $b3, $a3                    ; xx 32 xx 30
+    uxtb16      $b2, $a2                    ; xx 22 xx 20
+    orr         $b1, $b0, $b1, lsl #8       ; 12 02 10 00
+    orr         $b3, $b2, $b3, lsl #8       ; 32 22 30 20
+
+    uxtb16      $a1, $a1, ror #8            ; xx 13 xx 11
+    uxtb16      $a3, $a3, ror #8            ; xx 33 xx 31
+    uxtb16      $a0, $a0, ror #8            ; xx 03 xx 01
+    uxtb16      $a2, $a2, ror #8            ; xx 23 xx 21
+    orr         $a0, $a0, $a1, lsl #8       ; 13 03 11 01
+    orr         $a2, $a2, $a3, lsl #8       ; 33 23 31 21
+
+    pkhtb       $b2, $b3, $b1, asr #16      ; 32 22 12 02   -- p1
+    pkhbt       $b0, $b1, $b3, lsl #16      ; 30 20 10 00   -- p3
+
+    pkhtb       $b3, $a2, $a0, asr #16      ; 33 23 13 03   -- p0
+    pkhbt       $b1, $a0, $a2, lsl #16      ; 31 21 11 01   -- p2
+    MEND
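+    ; An illustrative C equivalent of the macro above (sketch only): with
+    ; a0..a3 each holding four packed bytes of one input row, and b0..b3
+    ; starting at zero,
+    ;
+    ;   for (i = 0; i < 4; i++)
+    ;     for (j = 0; j < 4; j++)
+    ;       b[i] |= ((a[j] >> (8 * i)) & 0xff) << (8 * j);
+    ;
+    ; i.e. a 4x4 byte-matrix transpose built from uxtb16/pkhbt/pkhtb packing.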
+
+
+
+src         RN  r0
+pstep       RN  r1
+
+;r0     unsigned char *src_ptr,
+;r1     int src_pixel_step,
+;r2     const char *blimit
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp9_loop_filter_simple_horizontal_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+    stmdb       sp!, {r4 - r11, lr}
+
+    ldrb        r12, [r2]                   ; blimit
+    ldr         r3, [src, -pstep, lsl #1]   ; p1
+    ldr         r4, [src, -pstep]           ; p0
+    ldr         r5, [src]                   ; q0
+    ldr         r6, [src, pstep]            ; q1
+    orr         r12, r12, r12, lsl #8       ; blimit
+    ldr         r2, c0x80808080
+    orr         r12, r12, r12, lsl #16      ; blimit
+    mov         r9, #4                      ; loop count: 16 pixels, 4 at a time
+    mov         lr, #0                      ; need 0 in a couple places
+
+|simple_hnext8|
+    ; vp8_simple_filter_mask()
+
+    uqsub8      r7, r3, r6                  ; p1 - q1
+    uqsub8      r8, r6, r3                  ; q1 - p1
+    uqsub8      r10, r4, r5                 ; p0 - q0
+    uqsub8      r11, r5, r4                 ; q0 - p0
+    orr         r8, r8, r7                  ; abs(p1 - q1)
+    orr         r10, r10, r11               ; abs(p0 - q0)
+    uqadd8      r10, r10, r10               ; abs(p0 - q0) * 2
+    uhadd8      r8, r8, lr                  ; abs(p1 - q1) >> 1
+    uqadd8      r10, r10, r8                ; abs(p0 - q0)*2 + abs(p1 - q1)/2
+    mvn         r8, #0
+    usub8       r10, r12, r10               ; compare to blimit. usub8 sets GE flags
+    sel         r10, r8, lr                 ; filter mask: F or 0
+    cmp         r10, #0
+    beq         simple_hskip_filter         ; skip filtering if all masks are 0x00
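+
+    ; Per pixel, the mask computed above is (illustrative sketch):
+    ;   mask = (2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= blimit) ? 0xff : 0
+    ; usub8 sets the GE flags byte-wise and sel expands them into the mask.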
+
+    ;vp8_simple_filter()
+
+    eor         r3, r3, r2                  ; p1 offset to convert to a signed value
+    eor         r6, r6, r2                  ; q1 offset to convert to a signed value
+    eor         r4, r4, r2                  ; p0 offset to convert to a signed value
+    eor         r5, r5, r2                  ; q0 offset to convert to a signed value
+
+    qsub8       r3, r3, r6                  ; vp9_filter = p1 - q1
+    qsub8       r6, r5, r4                  ; q0 - p0
+    qadd8       r3, r3, r6                  ; += q0 - p0
+    ldr         r7, c0x04040404
+    qadd8       r3, r3, r6                  ; += q0 - p0
+    ldr         r8, c0x03030303
+    qadd8       r3, r3, r6                  ; vp9_filter = p1 - q1 + 3 * (q0 - p0)
+    ;STALL
+    and         r3, r3, r10                 ; vp9_filter &= mask
+
+    qadd8       r7 , r3 , r7                ; Filter1 = vp9_filter + 4
+    qadd8       r8 , r3 , r8                ; Filter2 = vp9_filter + 3
+
+    shadd8      r7 , r7 , lr
+    shadd8      r8 , r8 , lr
+    shadd8      r7 , r7 , lr
+    shadd8      r8 , r8 , lr
+    shadd8      r7 , r7 , lr                ; Filter1 >>= 3
+    shadd8      r8 , r8 , lr                ; Filter2 >>= 3
+
+    qsub8       r5 ,r5, r7                  ; u = q0 - Filter1
+    qadd8       r4, r4, r8                  ; u = p0 + Filter2
+    eor         r5, r5, r2                  ; *oq0 = u^0x80
+    str         r5, [src]                   ; store oq0 result
+    eor         r4, r4, r2                  ; *op0 = u^0x80
+    str         r4, [src, -pstep]           ; store op0 result
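+
+    ; Scalar sketch of the update just applied (per masked pixel, on values
+    ; biased by 0x80 so signed saturating ops apply; clamp_s8 is a
+    ; hypothetical helper saturating to -128..127):
+    ;   f   = clamp_s8(p1 - q1 + 3 * (q0 - p0));
+    ;   q0 -= (f + 4) >> 3;   /* Filter1 */
+    ;   p0 += (f + 3) >> 3;   /* Filter2 */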
+
+|simple_hskip_filter|
+    subs        r9, r9, #1
+    addne       src, src, #4                ; next row
+
+    ldrne       r3, [src, -pstep, lsl #1]   ; p1
+    ldrne       r4, [src, -pstep]           ; p0
+    ldrne       r5, [src]                   ; q0
+    ldrne       r6, [src, pstep]            ; q1
+
+    bne         simple_hnext8
+
+    ldmia       sp!, {r4 - r11, pc}
+    ENDP        ; |vp9_loop_filter_simple_horizontal_edge_armv6|
+
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp9_loop_filter_simple_vertical_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+    stmdb       sp!, {r4 - r11, lr}
+
+    ldrb        r12, [r2]                   ; r12: blimit
+    ldr         r2, c0x80808080
+    orr         r12, r12, r12, lsl #8
+
+    ; load source data into r7, r8, r9, r10
+    ldrh        r3, [src, #-2]
+    pld         [src, #23]                  ; preload for next block
+    ldrh        r4, [src], pstep
+    orr         r12, r12, r12, lsl #16
+
+    ldrh        r5, [src, #-2]
+    pld         [src, #23]
+    ldrh        r6, [src], pstep
+
+    pkhbt       r7, r3, r4, lsl #16
+
+    ldrh        r3, [src, #-2]
+    pld         [src, #23]
+    ldrh        r4, [src], pstep
+
+    pkhbt       r8, r5, r6, lsl #16
+
+    ldrh        r5, [src, #-2]
+    pld         [src, #23]
+    ldrh        r6, [src], pstep
+    mov         r11, #4                     ; loop count: 16 pixels, 4 at a time
+
+|simple_vnext8|
+    ; vp8_simple_filter_mask() function
+    pkhbt       r9, r3, r4, lsl #16
+    pkhbt       r10, r5, r6, lsl #16
+
+    ;transpose r7, r8, r9, r10 to r3, r4, r5, r6
+    TRANSPOSE_MATRIX r7, r8, r9, r10, r3, r4, r5, r6
+
+    uqsub8      r7, r3, r6                  ; p1 - q1
+    uqsub8      r8, r6, r3                  ; q1 - p1
+    uqsub8      r9, r4, r5                  ; p0 - q0
+    uqsub8      r10, r5, r4                 ; q0 - p0
+    orr         r7, r7, r8                  ; abs(p1 - q1)
+    orr         r9, r9, r10                 ; abs(p0 - q0)
+    mov         r8, #0
+    uqadd8      r9, r9, r9                  ; abs(p0 - q0) * 2
+    uhadd8      r7, r7, r8                  ; abs(p1 - q1) / 2
+    uqadd8      r7, r7, r9                  ; abs(p0 - q0)*2 + abs(p1 - q1)/2
+    mvn         r10, #0                     ; r10 == -1
+
+    usub8       r7, r12, r7                 ; compare to blimit
+    sel         lr, r10, r8                 ; filter mask
+
+    cmp         lr, #0
+    beq         simple_vskip_filter         ; skip filtering
+
+    ;vp8_simple_filter() function
+    eor         r3, r3, r2                  ; p1 offset to convert to a signed value
+    eor         r6, r6, r2                  ; q1 offset to convert to a signed value
+    eor         r4, r4, r2                  ; p0 offset to convert to a signed value
+    eor         r5, r5, r2                  ; q0 offset to convert to a signed value
+
+    qsub8       r3, r3, r6                  ; vp9_filter = p1 - q1
+    qsub8       r6, r5, r4                  ; q0 - p0
+
+    qadd8       r3, r3, r6                  ; vp9_filter += q0 - p0
+    ldr         r9, c0x03030303             ; r9 = 3
+
+    qadd8       r3, r3, r6                  ; vp9_filter += q0 - p0
+    ldr         r7, c0x04040404
+
+    qadd8       r3, r3, r6                  ; vp9_filter = p1 - q1 + 3 * (q0 - p0)
+    ;STALL
+    and         r3, r3, lr                  ; vp9_filter &= mask
+
+    qadd8       r9 , r3 , r9                ; Filter2 = vp9_filter + 3
+    qadd8       r3 , r3 , r7                ; Filter1 = vp9_filter + 4
+
+    shadd8      r9 , r9 , r8
+    shadd8      r3 , r3 , r8
+    shadd8      r9 , r9 , r8
+    shadd8      r3 , r3 , r8
+    shadd8      r9 , r9 , r8                ; Filter2 >>= 3
+    shadd8      r3 , r3 , r8                ; Filter1 >>= 3
+
+    ;calculate output
+    sub         src, src, pstep, lsl #2
+
+    qadd8       r4, r4, r9                  ; u = p0 + Filter2
+    qsub8       r5, r5, r3                  ; u = q0 - Filter1
+    eor         r4, r4, r2                  ; *op0 = u^0x80
+    eor         r5, r5, r2                  ; *oq0 = u^0x80
+
+    strb        r4, [src, #-1]              ; store the result
+    mov         r4, r4, lsr #8
+    strb        r5, [src], pstep
+    mov         r5, r5, lsr #8
+
+    strb        r4, [src, #-1]
+    mov         r4, r4, lsr #8
+    strb        r5, [src], pstep
+    mov         r5, r5, lsr #8
+
+    strb        r4, [src, #-1]
+    mov         r4, r4, lsr #8
+    strb        r5, [src], pstep
+    mov         r5, r5, lsr #8
+
+    strb        r4, [src, #-1]
+    strb        r5, [src], pstep
+
+|simple_vskip_filter|
+    subs        r11, r11, #1
+
+    ; load source data into r7, r8, r9, r10
+    ldrneh      r3, [src, #-2]
+    pld         [src, #23]                  ; preload for next block
+    ldrneh      r4, [src], pstep
+
+    ldrneh      r5, [src, #-2]
+    pld         [src, #23]
+    ldrneh      r6, [src], pstep
+
+    pkhbt       r7, r3, r4, lsl #16
+
+    ldrneh      r3, [src, #-2]
+    pld         [src, #23]
+    ldrneh      r4, [src], pstep
+
+    pkhbt       r8, r5, r6, lsl #16
+
+    ldrneh      r5, [src, #-2]
+    pld         [src, #23]
+    ldrneh      r6, [src], pstep
+
+    bne         simple_vnext8
+
+    ldmia       sp!, {r4 - r11, pc}
+    ENDP        ; |vp9_loop_filter_simple_vertical_edge_armv6|
+
+; Constant Pool
+c0x80808080 DCD     0x80808080
+c0x03030303 DCD     0x03030303
+c0x04040404 DCD     0x04040404
+
+    END
--- /dev/null
+++ b/vp9/common/arm/armv6/sixtappredict8x4_v6.asm
@@ -1,0 +1,273 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_sixtap_predict8x4_armv6|
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+;-------------------------------------
+; r0    unsigned char *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; stack unsigned char *dst_ptr,
+; stack int  dst_pitch
+;-------------------------------------
+;note: In the first pass, the result is stored transposed (8 lines x 9 columns) on the
+;stack. The temporary stack size is 184 bytes. The line width is 20 bytes: 9 shorts of
+;data plus 2 bytes of padding for 4-byte alignment. In the second pass, data is loaded
+;from the stack and the result is stored transposed back.
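+;Layout sketch of the temporary buffer (editorial note): the first-pass result
+;for source row r, column c lands at byte offset r*2 + c*20, so each source
+;column becomes one contiguous 20-byte-stride line of 9 shorts that the second
+;pass can walk linearly.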
+|vp8_sixtap_predict8x4_armv6| PROC
+    stmdb       sp!, {r4 - r11, lr}
+    str         r3, [sp, #-184]!            ;reserve space on stack for temporary storage, store yoffset
+
+    cmp         r2, #0                      ;skip first_pass filter if xoffset=0
+    add         lr, sp, #4                  ;point to temporary buffer
+    beq         skip_firstpass_filter
+
+;first-pass filter
+    adr         r12, filter8_coeff
+    sub         r0, r0, r1, lsl #1
+
+    add         r3, r1, #10                 ; preload next row
+    pld         [r0, r3]
+
+    add         r2, r12, r2, lsl #4         ;calculate filter location
+    add         r0, r0, #3                  ;adjust src only for loading convenience
+
+    ldr         r3, [r2]                    ; load up packed filter coefficients
+    ldr         r4, [r2, #4]
+    ldr         r5, [r2, #8]
+
+    mov         r2, #0x90000                ; height=9 is top part of counter
+
+    sub         r1, r1, #8
+
+|first_pass_hloop_v6|
+    ldrb        r6, [r0, #-5]               ; load source data
+    ldrb        r7, [r0, #-4]
+    ldrb        r8, [r0, #-3]
+    ldrb        r9, [r0, #-2]
+    ldrb        r10, [r0, #-1]
+
+    orr         r2, r2, #0x4                ; construct loop counter. width=8=4x2
+
+    pkhbt       r6, r6, r7, lsl #16         ; r7 | r6
+    pkhbt       r7, r7, r8, lsl #16         ; r8 | r7
+
+    pkhbt       r8, r8, r9, lsl #16         ; r9 | r8
+    pkhbt       r9, r9, r10, lsl #16        ; r10 | r9
+
+|first_pass_wloop_v6|
+    smuad       r11, r6, r3                 ; vp9_filter[0], vp9_filter[1]
+    smuad       r12, r7, r3
+
+    ldrb        r6, [r0], #1
+
+    smlad       r11, r8, r4, r11            ; vp9_filter[2], vp9_filter[3]
+    ldrb        r7, [r0], #1
+    smlad       r12, r9, r4, r12
+
+    pkhbt       r10, r10, r6, lsl #16       ; r10 | r9
+    pkhbt       r6, r6, r7, lsl #16         ; r11 | r10
+    smlad       r11, r10, r5, r11           ; vp9_filter[4], vp9_filter[5]
+    smlad       r12, r6, r5, r12
+
+    sub         r2, r2, #1
+
+    add         r11, r11, #0x40             ; round_shift_and_clamp
+    tst         r2, #0xff                   ; test loop counter
+    usat        r11, #8, r11, asr #7
+    add         r12, r12, #0x40
+    strh        r11, [lr], #20              ; result is transposed and stored
+    usat        r12, #8, r12, asr #7
+
+    strh        r12, [lr], #20
+
+    movne       r11, r6
+    movne       r12, r7
+
+    movne       r6, r8
+    movne       r7, r9
+    movne       r8, r10
+    movne       r9, r11
+    movne       r10, r12
+
+    bne         first_pass_wloop_v6
+
+    ;;add       r9, ppl, #30                ; attempt to load 2 adjacent cache lines
+    ;;IF ARCHITECTURE=6
+    ;pld        [src, ppl]
+    ;;pld       [src, r9]
+    ;;ENDIF
+
+    subs        r2, r2, #0x10000
+
+    sub         lr, lr, #158
+
+    add         r0, r0, r1                  ; move to next input line
+
+    add         r11, r1, #18                ; preload next row; add back block width (=8) subtracted earlier
+    pld         [r0, r11]
+
+    bne         first_pass_hloop_v6
+
+;second pass filter
+secondpass_filter
+    ldr         r3, [sp], #4                ; load back yoffset
+    ldr         r0, [sp, #216]              ; load dst address from stack 180+36
+    ldr         r1, [sp, #220]              ; load dst stride from stack 180+40
+
+    cmp         r3, #0
+    beq         skip_secondpass_filter
+
+    adr         r12, filter8_coeff
+    add         lr, r12, r3, lsl #4         ;calculate filter location
+
+    mov         r2, #0x00080000
+
+    ldr         r3, [lr]                    ; load up packed filter coefficients
+    ldr         r4, [lr, #4]
+    ldr         r5, [lr, #8]
+
+    pkhbt       r12, r4, r3                 ; pack the filter differently
+    pkhbt       r11, r5, r4
+
+second_pass_hloop_v6
+    ldr         r6, [sp]                    ; load the data
+    ldr         r7, [sp, #4]
+
+    orr         r2, r2, #2                  ; loop counter
+
+second_pass_wloop_v6
+    smuad       lr, r3, r6                  ; apply filter
+    smulbt      r10, r3, r6
+
+    ldr         r8, [sp, #8]
+
+    smlad       lr, r4, r7, lr
+    smladx      r10, r12, r7, r10
+
+    ldrh        r9, [sp, #12]
+
+    smlad       lr, r5, r8, lr
+    smladx      r10, r11, r8, r10
+
+    add         sp, sp, #4
+    smlatb      r10, r5, r9, r10
+
+    sub         r2, r2, #1
+
+    add         lr, lr, #0x40               ; round_shift_and_clamp
+    tst         r2, #0xff
+    usat        lr, #8, lr, asr #7
+    add         r10, r10, #0x40
+    strb        lr, [r0], r1                ; the result is transposed back and stored
+    usat        r10, #8, r10, asr #7
+
+    strb        r10, [r0],r1
+
+    movne       r6, r7
+    movne       r7, r8
+
+    bne         second_pass_wloop_v6
+
+    subs        r2, r2, #0x10000
+    add         sp, sp, #12                 ; update src pointer for next loop (20-8)
+    sub         r0, r0, r1, lsl #2
+    add         r0, r0, #1
+
+    bne         second_pass_hloop_v6
+
+    add         sp, sp, #20
+    ldmia       sp!, {r4 - r11, pc}
+
+;--------------------
+skip_firstpass_filter
+    sub         r0, r0, r1, lsl #1
+    sub         r1, r1, #8
+    mov         r2, #9
+
+skip_firstpass_hloop
+    ldrb        r4, [r0], #1                ; load data
+    subs        r2, r2, #1
+    ldrb        r5, [r0], #1
+    strh        r4, [lr], #20               ; store it to the intermediate buffer
+    ldrb        r6, [r0], #1                ; load data
+    strh        r5, [lr], #20
+    ldrb        r7, [r0], #1
+    strh        r6, [lr], #20
+    ldrb        r8, [r0], #1
+    strh        r7, [lr], #20
+    ldrb        r9, [r0], #1
+    strh        r8, [lr], #20
+    ldrb        r10, [r0], #1
+    strh        r9, [lr], #20
+    ldrb        r11, [r0], #1
+    strh        r10, [lr], #20
+    add         r0, r0, r1                  ; move to next input line
+    strh        r11, [lr], #20
+
+    sub         lr, lr, #158                ; move over to next column
+    bne         skip_firstpass_hloop
+
+    b           secondpass_filter
+
+;--------------------
+skip_secondpass_filter
+    mov         r2, #8
+    add         sp, sp, #4                  ;start from src[0] instead of src[-2]
+
+skip_secondpass_hloop
+    ldr         r6, [sp], #4
+    subs        r2, r2, #1
+    ldr         r8, [sp], #4
+
+    mov         r7, r6, lsr #16             ; unpack
+    strb        r6, [r0], r1
+    mov         r9, r8, lsr #16
+    strb        r7, [r0], r1
+    add         sp, sp, #12                 ; 20-8
+    strb        r8, [r0], r1
+    strb        r9, [r0], r1
+
+    sub         r0, r0, r1, lsl #2
+    add         r0, r0, #1
+
+    bne         skip_secondpass_hloop
+
+    add         sp, sp, #16                 ; 180 - (160 +4)
+
+    ldmia       sp!, {r4 - r11, pc}
+
+    ENDP
+
+;-----------------
+;One word is reserved for each pair of packed coefficients. The label
+;filter8_coeff can be used to access the data.
+;Data addresses: filter8_coeff, filter8_coeff+4, filter8_coeff+8, ...
+filter8_coeff
+    DCD     0x00000000,     0x00000080,     0x00000000,     0x00000000
+    DCD     0xfffa0000,     0x000c007b,     0x0000ffff,     0x00000000
+    DCD     0xfff50002,     0x0024006c,     0x0001fff8,     0x00000000
+    DCD     0xfff70000,     0x0032005d,     0x0000fffa,     0x00000000
+    DCD     0xfff00003,     0x004d004d,     0x0003fff0,     0x00000000
+    DCD     0xfffa0000,     0x005d0032,     0x0000fff7,     0x00000000
+    DCD     0xfff80001,     0x006c0024,     0x0002fff5,     0x00000000
+    DCD     0xffff0000,     0x007b000c,     0x0000fffa,     0x00000000
+
+    ;DCD        0,  0,  128,    0,   0,  0
+    ;DCD        0, -6,  123,   12,  -1,  0
+    ;DCD        2, -11, 108,   36,  -8,  1
+    ;DCD        0, -9,   93,   50,  -6,  0
+    ;DCD        3, -16,  77,   77, -16,  3
+    ;DCD        0, -6,   50,   93,  -9,  0
+    ;DCD        1, -8,   36,  108, -11,  2
+    ;DCD        0, -1,   12,  123,  -6,  0
+
+    END
--- /dev/null
+++ b/vp9/common/arm/bilinearfilter_arm.c
@@ -1,0 +1,108 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <math.h>
+#include "vp9/common/filter.h"
+#include "vp9/common/subpixel.h"
+#include "bilinearfilter_arm.h"
+
+void vp9_filter_block2d_bil_armv6
+(
+  unsigned char *src_ptr,
+  unsigned char *dst_ptr,
+  unsigned int   src_pitch,
+  unsigned int   dst_pitch,
+  const short   *HFilter,
+  const short   *VFilter,
+  int            Width,
+  int            Height
+) {
+  unsigned short FData[36 * 16]; /* Temp data buffer used in filtering */
+
+  /* First filter 1-D horizontally... */
+  vp9_filter_block2d_bil_first_pass_armv6(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
+
+  /* then 1-D vertically... */
+  vp9_filter_block2d_bil_second_pass_armv6(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
+}
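+
+/* A note on the pass sizes (editorial sketch, not project documentation):
+ * the horizontal pass produces Height + 1 intermediate rows because the
+ * vertical pass blends each output row with the row below it, roughly:
+ *
+ *   dst[r][c] = (mid[r][c] * VFilter[0] +
+ *                mid[r + 1][c] * VFilter[1] + 64) >> 7;
+ */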
+
+
+void vp9_bilinear_predict4x4_armv6
+(
+  unsigned char  *src_ptr,
+  int   src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  unsigned char *dst_ptr,
+  int dst_pitch
+) {
+  const short  *HFilter;
+  const short  *VFilter;
+
+  HFilter = vp8_bilinear_filters[xoffset];
+  VFilter = vp8_bilinear_filters[yoffset];
+
+  vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
+}
+
+void vp9_bilinear_predict8x8_armv6
+(
+  unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  unsigned char *dst_ptr,
+  int  dst_pitch
+) {
+  const short  *HFilter;
+  const short  *VFilter;
+
+  HFilter = vp8_bilinear_filters[xoffset];
+  VFilter = vp8_bilinear_filters[yoffset];
+
+  vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
+}
+
+void vp9_bilinear_predict8x4_armv6
+(
+  unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  unsigned char *dst_ptr,
+  int  dst_pitch
+) {
+  const short  *HFilter;
+  const short  *VFilter;
+
+  HFilter = vp8_bilinear_filters[xoffset];
+  VFilter = vp8_bilinear_filters[yoffset];
+
+  vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
+}
+
+void vp9_bilinear_predict16x16_armv6
+(
+  unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  unsigned char *dst_ptr,
+  int  dst_pitch
+) {
+  const short  *HFilter;
+  const short  *VFilter;
+
+  HFilter = vp8_bilinear_filters[xoffset];
+  VFilter = vp8_bilinear_filters[yoffset];
+
+  vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
+}
--- /dev/null
+++ b/vp9/common/arm/bilinearfilter_arm.h
@@ -1,0 +1,35 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef BILINEARFILTER_ARM_H
+#define BILINEARFILTER_ARM_H
+
+extern void vp9_filter_block2d_bil_first_pass_armv6
+(
+  const unsigned char  *src_ptr,
+  unsigned short       *dst_ptr,
+  unsigned int          src_pitch,
+  unsigned int          height,
+  unsigned int          width,
+  const short          *vp9_filter
+);
+
+extern void vp9_filter_block2d_bil_second_pass_armv6
+(
+  const unsigned short *src_ptr,
+  unsigned char        *dst_ptr,
+  int                   dst_pitch,
+  unsigned int          height,
+  unsigned int          width,
+  const short          *vp9_filter
+);
+
+#endif /* BILINEARFILTER_ARM_H */
--- /dev/null
+++ b/vp9/common/arm/filter_arm.c
@@ -1,0 +1,198 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include <math.h>
+#include "vp9/common/filter.h"
+#include "vp9/common/subpixel.h"
+#include "vpx_ports/mem.h"
+
+extern void vp9_filter_block2d_first_pass_armv6
+(
+  unsigned char *src_ptr,
+  short         *output_ptr,
+  unsigned int src_pixels_per_line,
+  unsigned int output_width,
+  unsigned int output_height,
+  const short *vp9_filter
+);
+
+// 8x8
+extern void vp9_filter_block2d_first_pass_8x8_armv6
+(
+  unsigned char *src_ptr,
+  short         *output_ptr,
+  unsigned int src_pixels_per_line,
+  unsigned int output_width,
+  unsigned int output_height,
+  const short *vp9_filter
+);
+
+// 16x16
+extern void vp9_filter_block2d_first_pass_16x16_armv6
+(
+  unsigned char *src_ptr,
+  short         *output_ptr,
+  unsigned int src_pixels_per_line,
+  unsigned int output_width,
+  unsigned int output_height,
+  const short *vp9_filter
+);
+
+extern void vp9_filter_block2d_second_pass_armv6
+(
+  short         *src_ptr,
+  unsigned char *output_ptr,
+  unsigned int output_pitch,
+  unsigned int cnt,
+  const short *vp9_filter
+);
+
+extern void vp9_filter4_block2d_second_pass_armv6
+(
+  short         *src_ptr,
+  unsigned char *output_ptr,
+  unsigned int output_pitch,
+  unsigned int cnt,
+  const short *vp9_filter
+);
+
+extern void vp9_filter_block2d_first_pass_only_armv6
+(
+  unsigned char *src_ptr,
+  unsigned char *output_ptr,
+  unsigned int src_pixels_per_line,
+  unsigned int cnt,
+  unsigned int output_pitch,
+  const short *vp9_filter
+);
+
+
+extern void vp9_filter_block2d_second_pass_only_armv6
+(
+  unsigned char *src_ptr,
+  unsigned char *output_ptr,
+  unsigned int src_pixels_per_line,
+  unsigned int cnt,
+  unsigned int output_pitch,
+  const short *vp9_filter
+);
+
+#if HAVE_ARMV6
+void vp9_sixtap_predict_armv6
+(
+  unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  unsigned char *dst_ptr,
+  int  dst_pitch
+) {
+  const short  *HFilter;
+  const short  *VFilter;
+  DECLARE_ALIGNED_ARRAY(4, short, FData, 12 * 4); /* Temp data buffer used in filtering */
+
+
+  HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+  VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
+
+  /* Vfilter is null. First pass only */
+  if (xoffset && !yoffset) {
+    /*vp9_filter_block2d_first_pass_armv6 ( src_ptr, FData+2, src_pixels_per_line, 4, 4, HFilter );
+    vp9_filter_block2d_second_pass_armv6 ( FData+2, dst_ptr, dst_pitch, 4, VFilter );*/
+
+    vp9_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, HFilter);
+  }
+  /* Hfilter is null. Second pass only */
+  else if (!xoffset && yoffset) {
+    vp9_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, VFilter);
+  } else {
+    /* Vfilter is a 4-tap filter */
+    if (yoffset & 0x1) {
+      vp9_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 4, 7, HFilter);
+      vp9_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter);
+    }
+    /* Vfilter is a 6-tap filter */
+    else {
+      vp9_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 4, 9, HFilter);
+      vp9_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter);
+    }
+  }
+}
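+
+/* Dispatch summary (editorial sketch): an offset of 0 makes the
+ * corresponding 1-D filter a no-op, so only the other pass runs, straight
+ * from src to dst. An odd yoffset selects the 4-tap vertical path, which
+ * needs only 7 source rows for a 4-row output instead of the 9 rows the
+ * full 6-tap path reads. */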
+
+void vp9_sixtap_predict8x8_armv6
+(
+  unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  unsigned char *dst_ptr,
+  int  dst_pitch
+) {
+  const short  *HFilter;
+  const short  *VFilter;
+  DECLARE_ALIGNED_ARRAY(4, short, FData, 16 * 8); /* Temp data buffer used in filtering */
+
+  HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+  VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
+
+  if (xoffset && !yoffset) {
+    vp9_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, HFilter);
+  }
+  /* Hfilter is null. Second pass only */
+  else if (!xoffset && yoffset) {
+    vp9_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, VFilter);
+  } else {
+    if (yoffset & 0x1) {
+      vp9_filter_block2d_first_pass_8x8_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 8, 11, HFilter);
+      vp9_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
+    } else {
+      vp9_filter_block2d_first_pass_8x8_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 8, 13, HFilter);
+      vp9_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
+    }
+  }
+}
+
+
+void vp9_sixtap_predict16x16_armv6
+(
+  unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  unsigned char *dst_ptr,
+  int  dst_pitch
+) {
+  const short  *HFilter;
+  const short  *VFilter;
+  DECLARE_ALIGNED_ARRAY(4, short, FData, 24 * 16);  /* Temp data buffer used in filtering */
+
+  HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+  VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
+
+  if (xoffset && !yoffset) {
+    vp9_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, HFilter);
+  }
+  /* Hfilter is null. Second pass only */
+  else if (!xoffset && yoffset) {
+    vp9_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, VFilter);
+  } else {
+    if (yoffset & 0x1) {
+      vp9_filter_block2d_first_pass_16x16_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 16, 19, HFilter);
+      vp9_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
+    } else {
+      vp9_filter_block2d_first_pass_16x16_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 16, 21, HFilter);
+      vp9_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
+    }
+  }
+
+}
+#endif
--- /dev/null
+++ b/vp9/common/arm/idct_arm.h
@@ -1,0 +1,65 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef IDCT_ARM_H
+#define IDCT_ARM_H
+
+#if HAVE_ARMV6
+extern prototype_idct(vp9_short_idct4x4llm_1_v6);
+extern prototype_idct(vp9_short_idct4x4llm_v6_dual);
+extern prototype_idct_scalar_add(vp9_dc_only_idct_add_v6);
+extern prototype_second_order(vp9_short_inv_walsh4x4_1_v6);
+extern prototype_second_order(vp9_short_inv_walsh4x4_v6);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp9_idct_idct1
+#define vp9_idct_idct1 vp9_short_idct4x4llm_1_v6
+
+#undef  vp9_idct_idct16
+#define vp9_idct_idct16 vp9_short_idct4x4llm_v6_dual
+
+#undef  vp9_idct_idct1_scalar_add
+#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_v6
+
+#undef  vp8_idct_iwalsh1
+#define vp8_idct_iwalsh1 vp9_short_inv_walsh4x4_1_v6
+
+#undef  vp8_idct_iwalsh16
+#define vp8_idct_iwalsh16 vp9_short_inv_walsh4x4_v6
+#endif
+#endif
+
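+/* With CONFIG_RUNTIME_CPU_DETECT disabled, the generic function-table
+ * names are rebound at compile time to the best available variant, so
+ * the indirection costs nothing at run time. */
+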
+#if HAVE_ARMV7
+extern prototype_idct(vp9_short_idct4x4llm_1_neon);
+extern prototype_idct(vp9_short_idct4x4llm_neon);
+extern prototype_idct_scalar_add(vp9_dc_only_idct_add_neon);
+extern prototype_second_order(vp9_short_inv_walsh4x4_1_neon);
+extern prototype_second_order(vp9_short_inv_walsh4x4_neon);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp9_idct_idct1
+#define vp9_idct_idct1 vp9_short_idct4x4llm_1_neon
+
+#undef  vp9_idct_idct16
+#define vp9_idct_idct16 vp9_short_idct4x4llm_neon
+
+#undef  vp9_idct_idct1_scalar_add
+#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_neon
+
+#undef  vp8_idct_iwalsh1
+#define vp8_idct_iwalsh1 vp9_short_inv_walsh4x4_1_neon
+
+#undef  vp8_idct_iwalsh16
+#define vp8_idct_iwalsh16 vp9_short_inv_walsh4x4_neon
+#endif
+#endif
+
+#endif
--- /dev/null
+++ b/vp9/common/arm/loopfilter_arm.c
@@ -1,0 +1,166 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp9/common/loopfilter.h"
+#include "vp9/common/onyxc_int.h"
+
+#if HAVE_ARMV6
+extern prototype_loopfilter(vp9_loop_filter_horizontal_edge_armv6);
+extern prototype_loopfilter(vp9_loop_filter_vertical_edge_armv6);
+extern prototype_loopfilter(vp9_mbloop_filter_horizontal_edge_armv6);
+extern prototype_loopfilter(vp9_mbloop_filter_vertical_edge_armv6);
+#endif
+
+#if HAVE_ARMV7
+typedef void loopfilter_y_neon(unsigned char *src, int pitch,
+                               unsigned char blimit, unsigned char limit, unsigned char thresh);
+typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
+                                unsigned char blimit, unsigned char limit, unsigned char thresh,
+                                unsigned char *v);
+
+extern loopfilter_y_neon vp9_loop_filter_horizontal_edge_y_neon;
+extern loopfilter_y_neon vp9_loop_filter_vertical_edge_y_neon;
+extern loopfilter_y_neon vp9_mbloop_filter_horizontal_edge_y_neon;
+extern loopfilter_y_neon vp9_mbloop_filter_vertical_edge_y_neon;
+
+extern loopfilter_uv_neon vp9_loop_filter_horizontal_edge_uv_neon;
+extern loopfilter_uv_neon vp9_loop_filter_vertical_edge_uv_neon;
+extern loopfilter_uv_neon vp9_mbloop_filter_horizontal_edge_uv_neon;
+extern loopfilter_uv_neon vp9_mbloop_filter_vertical_edge_uv_neon;
+#endif
+
+#if HAVE_ARMV6
+/*ARMV6 loopfilter functions*/
+/* Horizontal MB filtering */
+void vp9_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                               int y_stride, int uv_stride, loop_filter_info *lfi) {
+  vp9_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+  if (u_ptr)
+    vp9_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+  if (v_ptr)
+    vp9_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
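+
+/* The final argument is the number of 8-pixel runs to filter: 2 covers the
+ * 16-wide y plane, 1 the 8-wide chroma planes (an editorial note). */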
+
+/* Vertical MB Filtering */
+void vp9_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                               int y_stride, int uv_stride, loop_filter_info *lfi) {
+  vp9_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+  if (u_ptr)
+    vp9_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+  if (v_ptr)
+    vp9_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
+
+/* Horizontal B Filtering */
+void vp9_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                              int y_stride, int uv_stride, loop_filter_info *lfi) {
+  vp9_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+  vp9_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+  vp9_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+  if (u_ptr)
+    vp9_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+  if (v_ptr)
+    vp9_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp9_loop_filter_bhs_armv6(unsigned char *y_ptr, int y_stride,
+                               const unsigned char *blimit) {
+  vp9_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, blimit);
+  vp9_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, blimit);
+  vp9_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, blimit);
+}
+
+/* Vertical B Filtering */
+void vp9_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                              int y_stride, int uv_stride, loop_filter_info *lfi) {
+  vp9_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+  vp9_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+  vp9_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+  if (u_ptr)
+    vp9_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+  if (v_ptr)
+    vp9_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp9_loop_filter_bvs_armv6(unsigned char *y_ptr, int y_stride,
+                               const unsigned char *blimit) {
+  vp9_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, blimit);
+  vp9_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, blimit);
+  vp9_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, blimit);
+}
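+
+/* The B (block-edge) filters above operate on the three interior block
+ * edges of a 16x16 macroblock, hence the fixed offsets 4, 8 and 12:
+ * row offsets for the horizontal variants, column offsets for the
+ * vertical ones (an editorial note). */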
+#endif
+
+#if HAVE_ARMV7
+/* NEON loopfilter functions */
+/* Horizontal MB filtering */
+void vp9_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                              int y_stride, int uv_stride, loop_filter_info *lfi) {
+  unsigned char mblim = *lfi->mblim;
+  unsigned char lim = *lfi->lim;
+  unsigned char hev_thr = *lfi->hev_thr;
+  vp9_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr);
+
+  if (u_ptr)
+    vp9_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
+}
+
+/* Vertical MB Filtering */
+void vp9_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                              int y_stride, int uv_stride, loop_filter_info *lfi) {
+  unsigned char mblim = *lfi->mblim;
+  unsigned char lim = *lfi->lim;
+  unsigned char hev_thr = *lfi->hev_thr;
+
+  vp9_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr);
+
+  if (u_ptr)
+    vp9_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
+}
+
+/* Horizontal B Filtering */
+void vp9_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                             int y_stride, int uv_stride, loop_filter_info *lfi) {
+  unsigned char blim = *lfi->blim;
+  unsigned char lim = *lfi->lim;
+  unsigned char hev_thr = *lfi->hev_thr;
+
+  vp9_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, blim, lim, hev_thr);
+  vp9_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, blim, lim, hev_thr);
+  vp9_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, blim, lim, hev_thr);
+
+  if (u_ptr)
+    vp9_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, blim, lim, hev_thr, v_ptr + 4 * uv_stride);
+}
+
+/* Vertical B Filtering */
+void vp9_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                             int y_stride, int uv_stride, loop_filter_info *lfi) {
+  unsigned char blim = *lfi->blim;
+  unsigned char lim = *lfi->lim;
+  unsigned char hev_thr = *lfi->hev_thr;
+
+  vp9_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, blim, lim, hev_thr);
+  vp9_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, blim, lim, hev_thr);
+  vp9_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, blim, lim, hev_thr);
+
+  if (u_ptr)
+    vp9_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, blim, lim, hev_thr, v_ptr + 4);
+}
+#endif
--- /dev/null
+++ b/vp9/common/arm/loopfilter_arm.h
@@ -1,0 +1,41 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef LOOPFILTER_ARM_H
+#define LOOPFILTER_ARM_H
+
+#include "vpx_config.h"
+
+#if HAVE_ARMV6
+extern prototype_loopfilter_block(vp9_loop_filter_mbv_armv6);
+extern prototype_loopfilter_block(vp9_loop_filter_bv_armv6);
+extern prototype_loopfilter_block(vp9_loop_filter_mbh_armv6);
+extern prototype_loopfilter_block(vp9_loop_filter_bh_armv6);
+extern prototype_simple_loopfilter(vp9_loop_filter_bvs_armv6);
+extern prototype_simple_loopfilter(vp9_loop_filter_bhs_armv6);
+extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_armv6);
+extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_armv6);
+
+#endif /* HAVE_ARMV6 */
+
+#if HAVE_ARMV7
+extern prototype_loopfilter_block(vp9_loop_filter_mbv_neon);
+extern prototype_loopfilter_block(vp9_loop_filter_bv_neon);
+extern prototype_loopfilter_block(vp9_loop_filter_mbh_neon);
+extern prototype_loopfilter_block(vp9_loop_filter_bh_neon);
+extern prototype_simple_loopfilter(vp9_loop_filter_mbvs_neon);
+extern prototype_simple_loopfilter(vp9_loop_filter_bvs_neon);
+extern prototype_simple_loopfilter(vp9_loop_filter_mbhs_neon);
+extern prototype_simple_loopfilter(vp9_loop_filter_bhs_neon);
+
+#endif /* HAVE_ARMV7 */
+
+#endif /* LOOPFILTER_ARM_H */
--- /dev/null
+++ b/vp9/common/arm/neon/bilinearpredict16x16_neon.asm
@@ -1,0 +1,357 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_bilinear_predict16x16_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0    unsigned char  *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; r4    unsigned char *dst_ptr,
+; stack(r5) int  dst_pitch
+
+|vp8_bilinear_predict16x16_neon| PROC
+    push            {r4-r5, lr}
+
+    adr             r12, bifilter16_coeff
+    ldr             r4, [sp, #12]           ;load parameters from stack
+    ldr             r5, [sp, #16]           ;load parameters from stack
+
+    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
+    beq             secondpass_bfilter16x16_only
+
+    add             r2, r12, r2, lsl #3     ;calculate filter location
+
+    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
+
+    vld1.s32        {d31}, [r2]             ;load first_pass filter
+
+    beq             firstpass_bfilter16x16_only
+
+    sub             sp, sp, #272            ;reserve space on stack for temporary storage
+    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data
+    mov             lr, sp
+    vld1.u8         {d5, d6, d7}, [r0], r1
+
+    mov             r2, #3                  ;loop counter
+    vld1.u8         {d8, d9, d10}, [r0], r1
+
+    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1)
+    vld1.u8         {d11, d12, d13}, [r0], r1
+
+    vdup.8          d1, d31[4]
+
+;First Pass: output_height lines x output_width columns (17x16)
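+;(17 rows: 16 output rows plus one extra, since the second pass blends each
+; row with the row below it)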
+filt_blk2d_fp16x16_loop_neon
+    pld             [r0]
+    pld             [r0, r1]
+    pld             [r0, r1, lsl #1]
+
+    vmull.u8        q7, d2, d0              ;(src_ptr[0] * vp9_filter[0])
+    vmull.u8        q8, d3, d0
+    vmull.u8        q9, d5, d0
+    vmull.u8        q10, d6, d0
+    vmull.u8        q11, d8, d0
+    vmull.u8        q12, d9, d0
+    vmull.u8        q13, d11, d0
+    vmull.u8        q14, d12, d0
+
+    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
+    vext.8          d5, d5, d6, #1
+    vext.8          d8, d8, d9, #1
+    vext.8          d11, d11, d12, #1
+
+    vmlal.u8        q7, d2, d1              ;(src_ptr[0] * vp9_filter[1])
+    vmlal.u8        q9, d5, d1
+    vmlal.u8        q11, d8, d1
+    vmlal.u8        q13, d11, d1
+
+    vext.8          d3, d3, d4, #1
+    vext.8          d6, d6, d7, #1
+    vext.8          d9, d9, d10, #1
+    vext.8          d12, d12, d13, #1
+
+    vmlal.u8        q8, d3, d1              ;(src_ptr[0] * vp9_filter[1])
+    vmlal.u8        q10, d6, d1
+    vmlal.u8        q12, d9, d1
+    vmlal.u8        q14, d12, d1
+
+    subs            r2, r2, #1
+
+    vqrshrn.u16    d14, q7, #7              ;shift/round/saturate to u8
+    vqrshrn.u16    d15, q8, #7
+    vqrshrn.u16    d16, q9, #7
+    vqrshrn.u16    d17, q10, #7
+    vqrshrn.u16    d18, q11, #7
+    vqrshrn.u16    d19, q12, #7
+    vqrshrn.u16    d20, q13, #7
+
+    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data
+    vqrshrn.u16    d21, q14, #7
+    vld1.u8         {d5, d6, d7}, [r0], r1
+
+    vst1.u8         {d14, d15, d16, d17}, [lr]!     ;store result
+    vld1.u8         {d8, d9, d10}, [r0], r1
+    vst1.u8         {d18, d19, d20, d21}, [lr]!
+    vld1.u8         {d11, d12, d13}, [r0], r1
+
+    bne             filt_blk2d_fp16x16_loop_neon
+
+;First-pass filtering for the remaining 5 lines
+    vld1.u8         {d14, d15, d16}, [r0], r1
+
+    vmull.u8        q9, d2, d0              ;(src_ptr[0] * vp9_filter[0])
+    vmull.u8        q10, d3, d0
+    vmull.u8        q11, d5, d0
+    vmull.u8        q12, d6, d0
+    vmull.u8        q13, d8, d0
+    vmull.u8        q14, d9, d0
+
+    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
+    vext.8          d5, d5, d6, #1
+    vext.8          d8, d8, d9, #1
+
+    vmlal.u8        q9, d2, d1              ;(src_ptr[0] * vp9_filter[1])
+    vmlal.u8        q11, d5, d1
+    vmlal.u8        q13, d8, d1
+
+    vext.8          d3, d3, d4, #1
+    vext.8          d6, d6, d7, #1
+    vext.8          d9, d9, d10, #1
+
+    vmlal.u8        q10, d3, d1             ;(src_ptr[0] * vp9_filter[1])
+    vmlal.u8        q12, d6, d1
+    vmlal.u8        q14, d9, d1
+
+    vmull.u8        q1, d11, d0
+    vmull.u8        q2, d12, d0
+    vmull.u8        q3, d14, d0
+    vmull.u8        q4, d15, d0
+
+    vext.8          d11, d11, d12, #1       ;construct src_ptr[1]
+    vext.8          d14, d14, d15, #1
+
+    vmlal.u8        q1, d11, d1             ;(src_ptr[0] * vp9_filter[1])
+    vmlal.u8        q3, d14, d1
+
+    vext.8          d12, d12, d13, #1
+    vext.8          d15, d15, d16, #1
+
+    vmlal.u8        q2, d12, d1             ;(src_ptr[0] * vp9_filter[1])
+    vmlal.u8        q4, d15, d1
+
+    vqrshrn.u16    d10, q9, #7              ;shift/round/saturate to u8
+    vqrshrn.u16    d11, q10, #7
+    vqrshrn.u16    d12, q11, #7
+    vqrshrn.u16    d13, q12, #7
+    vqrshrn.u16    d14, q13, #7
+    vqrshrn.u16    d15, q14, #7
+    vqrshrn.u16    d16, q1, #7
+    vqrshrn.u16    d17, q2, #7
+    vqrshrn.u16    d18, q3, #7
+    vqrshrn.u16    d19, q4, #7
+
+    vst1.u8         {d10, d11, d12, d13}, [lr]!         ;store result
+    vst1.u8         {d14, d15, d16, d17}, [lr]!
+    vst1.u8         {d18, d19}, [lr]!
+
+;Second pass: 16x16
+;secondpass_filter
+    add             r3, r12, r3, lsl #3
+    sub             lr, lr, #272
+
+    vld1.u32        {d31}, [r3]             ;load second_pass filter
+
+    vld1.u8         {d22, d23}, [lr]!       ;load src data
+
+    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
+    vdup.8          d1, d31[4]
+    mov             r12, #4                 ;loop counter
+
+filt_blk2d_sp16x16_loop_neon
+    vld1.u8         {d24, d25}, [lr]!
+    vmull.u8        q1, d22, d0             ;(src_ptr[0] * vp9_filter[0])
+    vld1.u8         {d26, d27}, [lr]!
+    vmull.u8        q2, d23, d0
+    vld1.u8         {d28, d29}, [lr]!
+    vmull.u8        q3, d24, d0
+    vld1.u8         {d30, d31}, [lr]!
+
+    vmull.u8        q4, d25, d0
+    vmull.u8        q5, d26, d0
+    vmull.u8        q6, d27, d0
+    vmull.u8        q7, d28, d0
+    vmull.u8        q8, d29, d0
+
+    vmlal.u8        q1, d24, d1             ;(src_ptr[pixel_step] * vp9_filter[1])
+    vmlal.u8        q2, d25, d1
+    vmlal.u8        q3, d26, d1
+    vmlal.u8        q4, d27, d1
+    vmlal.u8        q5, d28, d1
+    vmlal.u8        q6, d29, d1
+    vmlal.u8        q7, d30, d1
+    vmlal.u8        q8, d31, d1
+
+    subs            r12, r12, #1
+
+    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
+    vqrshrn.u16    d3, q2, #7
+    vqrshrn.u16    d4, q3, #7
+    vqrshrn.u16    d5, q4, #7
+    vqrshrn.u16    d6, q5, #7
+    vqrshrn.u16    d7, q6, #7
+    vqrshrn.u16    d8, q7, #7
+    vqrshrn.u16    d9, q8, #7
+
+    vst1.u8         {d2, d3}, [r4], r5      ;store result
+    vst1.u8         {d4, d5}, [r4], r5
+    vst1.u8         {d6, d7}, [r4], r5
+    vmov            q11, q15
+    vst1.u8         {d8, d9}, [r4], r5
+
+    bne             filt_blk2d_sp16x16_loop_neon
+
+    add             sp, sp, #272
+
+    pop             {r4-r5,pc}
+
+;--------------------
+firstpass_bfilter16x16_only
+    mov             r2, #4                      ;loop counter
+    vdup.8          d0, d31[0]                  ;first_pass filter (d0 d1)
+    vdup.8          d1, d31[4]
+
+;First Pass: output_height lines x output_width columns (16x16)
+filt_blk2d_fpo16x16_loop_neon
+    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data
+    vld1.u8         {d5, d6, d7}, [r0], r1
+    vld1.u8         {d8, d9, d10}, [r0], r1
+    vld1.u8         {d11, d12, d13}, [r0], r1
+
+    pld             [r0]
+    pld             [r0, r1]
+    pld             [r0, r1, lsl #1]
+
+    vmull.u8        q7, d2, d0              ;(src_ptr[0] * vp9_filter[0])
+    vmull.u8        q8, d3, d0
+    vmull.u8        q9, d5, d0
+    vmull.u8        q10, d6, d0
+    vmull.u8        q11, d8, d0
+    vmull.u8        q12, d9, d0
+    vmull.u8        q13, d11, d0
+    vmull.u8        q14, d12, d0
+
+    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
+    vext.8          d5, d5, d6, #1
+    vext.8          d8, d8, d9, #1
+    vext.8          d11, d11, d12, #1
+
+    vmlal.u8        q7, d2, d1              ;(src_ptr[0] * vp9_filter[1])
+    vmlal.u8        q9, d5, d1
+    vmlal.u8        q11, d8, d1
+    vmlal.u8        q13, d11, d1
+
+    vext.8          d3, d3, d4, #1
+    vext.8          d6, d6, d7, #1
+    vext.8          d9, d9, d10, #1
+    vext.8          d12, d12, d13, #1
+
+    vmlal.u8        q8, d3, d1              ;(src_ptr[0] * vp9_filter[1])
+    vmlal.u8        q10, d6, d1
+    vmlal.u8        q12, d9, d1
+    vmlal.u8        q14, d12, d1
+
+    subs            r2, r2, #1
+
+    vqrshrn.u16    d14, q7, #7              ;shift/round/saturate to u8
+    vqrshrn.u16    d15, q8, #7
+    vqrshrn.u16    d16, q9, #7
+    vqrshrn.u16    d17, q10, #7
+    vqrshrn.u16    d18, q11, #7
+    vqrshrn.u16    d19, q12, #7
+    vqrshrn.u16    d20, q13, #7
+    vst1.u8         {d14, d15}, [r4], r5        ;store result
+    vqrshrn.u16    d21, q14, #7
+
+    vst1.u8         {d16, d17}, [r4], r5
+    vst1.u8         {d18, d19}, [r4], r5
+    vst1.u8         {d20, d21}, [r4], r5
+
+    bne             filt_blk2d_fpo16x16_loop_neon
+    pop             {r4-r5,pc}
+
+;---------------------
+secondpass_bfilter16x16_only
+;Second pass: 16x16
+;secondpass_filter
+    add             r3, r12, r3, lsl #3
+    mov             r12, #4                     ;loop counter
+    vld1.u32        {d31}, [r3]                 ;load second_pass filter
+    vld1.u8         {d22, d23}, [r0], r1        ;load src data
+
+    vdup.8          d0, d31[0]                  ;second_pass filter parameters (d0 d1)
+    vdup.8          d1, d31[4]
+
+filt_blk2d_spo16x16_loop_neon
+    vld1.u8         {d24, d25}, [r0], r1
+    vmull.u8        q1, d22, d0             ;(src_ptr[0] * vp9_filter[0])
+    vld1.u8         {d26, d27}, [r0], r1
+    vmull.u8        q2, d23, d0
+    vld1.u8         {d28, d29}, [r0], r1
+    vmull.u8        q3, d24, d0
+    vld1.u8         {d30, d31}, [r0], r1
+
+    vmull.u8        q4, d25, d0
+    vmull.u8        q5, d26, d0
+    vmull.u8        q6, d27, d0
+    vmull.u8        q7, d28, d0
+    vmull.u8        q8, d29, d0
+
+    vmlal.u8        q1, d24, d1             ;(src_ptr[pixel_step] * vp9_filter[1])
+    vmlal.u8        q2, d25, d1
+    vmlal.u8        q3, d26, d1
+    vmlal.u8        q4, d27, d1
+    vmlal.u8        q5, d28, d1
+    vmlal.u8        q6, d29, d1
+    vmlal.u8        q7, d30, d1
+    vmlal.u8        q8, d31, d1
+
+    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
+    vqrshrn.u16    d3, q2, #7
+    vqrshrn.u16    d4, q3, #7
+    vqrshrn.u16    d5, q4, #7
+    vqrshrn.u16    d6, q5, #7
+    vqrshrn.u16    d7, q6, #7
+    vqrshrn.u16    d8, q7, #7
+    vqrshrn.u16    d9, q8, #7
+
+    vst1.u8         {d2, d3}, [r4], r5      ;store result
+    subs            r12, r12, #1
+    vst1.u8         {d4, d5}, [r4], r5
+    vmov            q11, q15
+    vst1.u8         {d6, d7}, [r4], r5
+    vst1.u8         {d8, d9}, [r4], r5
+
+    bne             filt_blk2d_spo16x16_loop_neon
+    pop             {r4-r5,pc}
+
+    ENDP
+
+;-----------------
+
+bifilter16_coeff
+    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
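+    ; Each pair of entries is (128 - 16*k, 16*k) for k = 0..7, so the two
+    ; taps always sum to 128. In scalar form each filtered pixel is
+    ; (a sketch):
+    ;     out = (src[0] * filter[0] + src[1] * filter[1] + 64) >> 7;
+    ; where the rounding +64 and >>7 are done by vqrshrn above.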
+
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/bilinearpredict4x4_neon.asm
@@ -1,0 +1,130 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_bilinear_predict4x4_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0    unsigned char  *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; r4    unsigned char *dst_ptr,
+; stack(lr) int  dst_pitch
+
+|vp8_bilinear_predict4x4_neon| PROC
+    push            {r4, lr}
+
+    adr             r12, bifilter4_coeff
+    ldr             r4, [sp, #8]            ;load dst_ptr from stack
+    ldr             lr, [sp, #12]           ;load dst_pitch from stack
+
+    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
+    beq             skip_firstpass_filter
+
+;First pass: output_height lines x output_width columns (5x4)
+    vld1.u8         {d2}, [r0], r1          ;load src data
+    add             r2, r12, r2, lsl #3     ;calculate Hfilter location (2coeffsx4bytes=8bytes)
+
+    vld1.u8         {d3}, [r0], r1
+    vld1.u32        {d31}, [r2]             ;first_pass filter
+
+    vld1.u8         {d4}, [r0], r1
+    vdup.8          d0, d31[0]              ;first_pass filter (d0-d1)
+    vld1.u8         {d5}, [r0], r1
+    vdup.8          d1, d31[4]
+    vld1.u8         {d6}, [r0], r1
+
+    vshr.u64        q4, q1, #8              ;construct src_ptr[1]
+    vshr.u64        q5, q2, #8
+    vshr.u64        d12, d6, #8
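+    ; A right shift of each 64-bit lane by 8 bits drops the lowest byte
+    ; (little endian), so q4/q5/d12 hold the same rows advanced by one
+    ; pixel, i.e. src_ptr[1].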
+
+    vzip.32         d2, d3                  ;put 2-line data in 1 register (src_ptr[0])
+    vzip.32         d4, d5
+    vzip.32         d8, d9                  ;put 2-line data in 1 register (src_ptr[1])
+    vzip.32         d10, d11
+
+    vmull.u8        q7, d2, d0              ;(src_ptr[0] * vp9_filter[0])
+    vmull.u8        q8, d4, d0
+    vmull.u8        q9, d6, d0
+
+    vmlal.u8        q7, d8, d1              ;(src_ptr[1] * vp9_filter[1])
+    vmlal.u8        q8, d10, d1
+    vmlal.u8        q9, d12, d1
+
+    vqrshrn.u16    d28, q7, #7              ;shift/round/saturate to u8
+    vqrshrn.u16    d29, q8, #7
+    vqrshrn.u16    d30, q9, #7
+
+;Second pass: 4x4
+secondpass_filter
+    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
+    beq             skip_secondpass_filter
+
+    add             r3, r12, r3, lsl #3 ;calculate Vfilter location
+    vld1.u32        {d31}, [r3]         ;load second_pass filter
+
+    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
+    vdup.8          d1, d31[4]
+
+    vmull.u8        q1, d28, d0
+    vmull.u8        q2, d29, d0
+
+    vext.8          d26, d28, d29, #4       ;construct src_ptr[pixel_step]
+    vext.8          d27, d29, d30, #4
+
+    vmlal.u8        q1, d26, d1
+    vmlal.u8        q2, d27, d1
+
+    add             r0, r4, lr
+    add             r1, r0, lr
+    add             r2, r1, lr
+
+    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
+    vqrshrn.u16    d3, q2, #7
+
+    vst1.32         {d2[0]}, [r4]           ;store result
+    vst1.32         {d2[1]}, [r0]
+    vst1.32         {d3[0]}, [r1]
+    vst1.32         {d3[1]}, [r2]
+
+    pop             {r4, pc}
+
+;--------------------
+skip_firstpass_filter
+
+    vld1.32         {d28[0]}, [r0], r1      ;load src data
+    vld1.32         {d28[1]}, [r0], r1
+    vld1.32         {d29[0]}, [r0], r1
+    vld1.32         {d29[1]}, [r0], r1
+    vld1.32         {d30[0]}, [r0], r1
+
+    b               secondpass_filter
+
+;---------------------
+skip_secondpass_filter
+    vst1.32         {d28[0]}, [r4], lr      ;store result
+    vst1.32         {d28[1]}, [r4], lr
+    vst1.32         {d29[0]}, [r4], lr
+    vst1.32         {d29[1]}, [r4], lr
+
+    pop             {r4, pc}
+
+    ENDP
+
+;-----------------
+
+bifilter4_coeff
+    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
+
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/bilinearpredict8x4_neon.asm
@@ -1,0 +1,135 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_bilinear_predict8x4_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0    unsigned char  *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; r4    unsigned char *dst_ptr,
+; stack(lr) int  dst_pitch
+
+|vp8_bilinear_predict8x4_neon| PROC
+    push            {r4, lr}
+
+    adr             r12, bifilter8x4_coeff
+    ldr             r4, [sp, #8]            ;load dst_ptr from stack
+    ldr             lr, [sp, #12]           ;load dst_pitch from stack
+
+    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
+    beq             skip_firstpass_filter
+
+;First pass: output_height lines x output_width columns (5x8)
+    add             r2, r12, r2, lsl #3     ;calculate filter location
+
+    vld1.u8         {q1}, [r0], r1          ;load src data
+    vld1.u32        {d31}, [r2]             ;load first_pass filter
+    vld1.u8         {q2}, [r0], r1
+    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1)
+    vld1.u8         {q3}, [r0], r1
+    vdup.8          d1, d31[4]
+    vld1.u8         {q4}, [r0], r1
+
+    vmull.u8        q6, d2, d0              ;(src_ptr[0] * vp9_filter[0])
+    vld1.u8         {q5}, [r0], r1
+    vmull.u8        q7, d4, d0
+    vmull.u8        q8, d6, d0
+    vmull.u8        q9, d8, d0
+    vmull.u8        q10, d10, d0
+
+    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
+    vext.8          d5, d4, d5, #1
+    vext.8          d7, d6, d7, #1
+    vext.8          d9, d8, d9, #1
+    vext.8          d11, d10, d11, #1
+
+    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * vp9_filter[1])
+    vmlal.u8        q7, d5, d1
+    vmlal.u8        q8, d7, d1
+    vmlal.u8        q9, d9, d1
+    vmlal.u8        q10, d11, d1
+
+    vqrshrn.u16    d22, q6, #7              ;shift/round/saturate to u8
+    vqrshrn.u16    d23, q7, #7
+    vqrshrn.u16    d24, q8, #7
+    vqrshrn.u16    d25, q9, #7
+    vqrshrn.u16    d26, q10, #7
+
+;Second pass: 4x8
+secondpass_filter
+    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
+    beq             skip_secondpass_filter
+
+    add             r3, r12, r3, lsl #3
+    add             r0, r4, lr
+
+    vld1.u32        {d31}, [r3]             ;load second_pass filter
+    add             r1, r0, lr
+
+    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
+    vdup.8          d1, d31[4]
+
+    vmull.u8        q1, d22, d0             ;(src_ptr[0] * vp9_filter[0])
+    vmull.u8        q2, d23, d0
+    vmull.u8        q3, d24, d0
+    vmull.u8        q4, d25, d0
+
+    vmlal.u8        q1, d23, d1             ;(src_ptr[pixel_step] * vp9_filter[1])
+    vmlal.u8        q2, d24, d1
+    vmlal.u8        q3, d25, d1
+    vmlal.u8        q4, d26, d1
+
+    add             r2, r1, lr
+
+    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
+    vqrshrn.u16    d3, q2, #7
+    vqrshrn.u16    d4, q3, #7
+    vqrshrn.u16    d5, q4, #7
+
+    vst1.u8         {d2}, [r4]              ;store result
+    vst1.u8         {d3}, [r0]
+    vst1.u8         {d4}, [r1]
+    vst1.u8         {d5}, [r2]
+
+    pop             {r4, pc}
+
+;--------------------
+skip_firstpass_filter
+    vld1.u8         {d22}, [r0], r1         ;load src data
+    vld1.u8         {d23}, [r0], r1
+    vld1.u8         {d24}, [r0], r1
+    vld1.u8         {d25}, [r0], r1
+    vld1.u8         {d26}, [r0], r1
+
+    b               secondpass_filter
+
+;---------------------
+skip_secondpass_filter
+    vst1.u8         {d22}, [r4], lr         ;store result
+    vst1.u8         {d23}, [r4], lr
+    vst1.u8         {d24}, [r4], lr
+    vst1.u8         {d25}, [r4], lr
+
+    pop             {r4, pc}
+
+    ENDP
+
+;-----------------
+
+bifilter8x4_coeff
+    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
+
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/bilinearpredict8x8_neon.asm
@@ -1,0 +1,183 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_bilinear_predict8x8_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0    unsigned char  *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; r4    unsigned char *dst_ptr,
+; stack(lr) int  dst_pitch
+
+|vp8_bilinear_predict8x8_neon| PROC
+    push            {r4, lr}
+
+    adr             r12, bifilter8_coeff
+    ldr             r4, [sp, #8]            ;load dst_ptr from stack
+    ldr             lr, [sp, #12]           ;load dst_pitch from stack
+
+    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
+    beq             skip_firstpass_filter
+
+;First pass: output_height lines x output_width columns (9x8)
+    add             r2, r12, r2, lsl #3     ;calculate filter location
+
+    vld1.u8         {q1}, [r0], r1          ;load src data
+    vld1.u32        {d31}, [r2]             ;load first_pass filter
+    vld1.u8         {q2}, [r0], r1
+    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1)
+    vld1.u8         {q3}, [r0], r1
+    vdup.8          d1, d31[4]
+    vld1.u8         {q4}, [r0], r1
+
+    vmull.u8        q6, d2, d0              ;(src_ptr[0] * vp9_filter[0])
+    vmull.u8        q7, d4, d0
+    vmull.u8        q8, d6, d0
+    vmull.u8        q9, d8, d0
+
+    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
+    vext.8          d5, d4, d5, #1
+    vext.8          d7, d6, d7, #1
+    vext.8          d9, d8, d9, #1
+
+    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * vp9_filter[1])
+    vmlal.u8        q7, d5, d1
+    vmlal.u8        q8, d7, d1
+    vmlal.u8        q9, d9, d1
+
+    vld1.u8         {q1}, [r0], r1          ;load src data
+    vqrshrn.u16    d22, q6, #7              ;shift/round/saturate to u8
+    vld1.u8         {q2}, [r0], r1
+    vqrshrn.u16    d23, q7, #7
+    vld1.u8         {q3}, [r0], r1
+    vqrshrn.u16    d24, q8, #7
+    vld1.u8         {q4}, [r0], r1
+    vqrshrn.u16    d25, q9, #7
+
+    ;first_pass filtering on the remaining 5 lines of data
+    vld1.u8         {q5}, [r0], r1
+
+    vmull.u8        q6, d2, d0              ;(src_ptr[0] * vp9_filter[0])
+    vmull.u8        q7, d4, d0
+    vmull.u8        q8, d6, d0
+    vmull.u8        q9, d8, d0
+    vmull.u8        q10, d10, d0
+
+    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
+    vext.8          d5, d4, d5, #1
+    vext.8          d7, d6, d7, #1
+    vext.8          d9, d8, d9, #1
+    vext.8          d11, d10, d11, #1
+
+    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * vp9_filter[1])
+    vmlal.u8        q7, d5, d1
+    vmlal.u8        q8, d7, d1
+    vmlal.u8        q9, d9, d1
+    vmlal.u8        q10, d11, d1
+
+    vqrshrn.u16    d26, q6, #7              ;shift/round/saturate to u8
+    vqrshrn.u16    d27, q7, #7
+    vqrshrn.u16    d28, q8, #7
+    vqrshrn.u16    d29, q9, #7
+    vqrshrn.u16    d30, q10, #7
+
+;Second pass: 8x8
+secondpass_filter
+    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
+    beq             skip_secondpass_filter
+
+    add             r3, r12, r3, lsl #3
+    add             r0, r4, lr
+
+    vld1.u32        {d31}, [r3]             ;load second_pass filter
+    add             r1, r0, lr
+
+    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
+    vdup.8          d1, d31[4]
+
+    vmull.u8        q1, d22, d0             ;(src_ptr[0] * vp9_filter[0])
+    vmull.u8        q2, d23, d0
+    vmull.u8        q3, d24, d0
+    vmull.u8        q4, d25, d0
+    vmull.u8        q5, d26, d0
+    vmull.u8        q6, d27, d0
+    vmull.u8        q7, d28, d0
+    vmull.u8        q8, d29, d0
+
+    vmlal.u8        q1, d23, d1             ;(src_ptr[pixel_step] * vp9_filter[1])
+    vmlal.u8        q2, d24, d1
+    vmlal.u8        q3, d25, d1
+    vmlal.u8        q4, d26, d1
+    vmlal.u8        q5, d27, d1
+    vmlal.u8        q6, d28, d1
+    vmlal.u8        q7, d29, d1
+    vmlal.u8        q8, d30, d1
+
+    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
+    vqrshrn.u16    d3, q2, #7
+    vqrshrn.u16    d4, q3, #7
+    vqrshrn.u16    d5, q4, #7
+    vqrshrn.u16    d6, q5, #7
+    vqrshrn.u16    d7, q6, #7
+    vqrshrn.u16    d8, q7, #7
+    vqrshrn.u16    d9, q8, #7
+
+    vst1.u8         {d2}, [r4]              ;store result
+    vst1.u8         {d3}, [r0]
+    vst1.u8         {d4}, [r1], lr
+    vst1.u8         {d5}, [r1], lr
+    vst1.u8         {d6}, [r1], lr
+    vst1.u8         {d7}, [r1], lr
+    vst1.u8         {d8}, [r1], lr
+    vst1.u8         {d9}, [r1], lr
+
+    pop             {r4, pc}
+
+;--------------------
+skip_firstpass_filter
+    vld1.u8         {d22}, [r0], r1         ;load src data
+    vld1.u8         {d23}, [r0], r1
+    vld1.u8         {d24}, [r0], r1
+    vld1.u8         {d25}, [r0], r1
+    vld1.u8         {d26}, [r0], r1
+    vld1.u8         {d27}, [r0], r1
+    vld1.u8         {d28}, [r0], r1
+    vld1.u8         {d29}, [r0], r1
+    vld1.u8         {d30}, [r0], r1
+
+    b               secondpass_filter
+
+;---------------------
+skip_secondpass_filter
+    vst1.u8         {d22}, [r4], lr         ;store result
+    vst1.u8         {d23}, [r4], lr
+    vst1.u8         {d24}, [r4], lr
+    vst1.u8         {d25}, [r4], lr
+    vst1.u8         {d26}, [r4], lr
+    vst1.u8         {d27}, [r4], lr
+    vst1.u8         {d28}, [r4], lr
+    vst1.u8         {d29}, [r4], lr
+
+    pop             {r4, pc}
+
+    ENDP
+
+;-----------------
+
+bifilter8_coeff
+    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
+
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/buildintrapredictorsmby_neon.asm
@@ -1,0 +1,584 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_build_intra_predictors_mby_neon_func|
+    EXPORT  |vp8_build_intra_predictors_mby_s_neon_func|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0    unsigned char *y_buffer
+; r1    unsigned char *ypred_ptr
+; r2    int y_stride
+; r3    int mode
+; stack int Up
+; stack int Left
+
+|vp8_build_intra_predictors_mby_neon_func| PROC
+    push            {r4-r8, lr}
+
+    cmp             r3, #0
+    beq             case_dc_pred
+    cmp             r3, #1
+    beq             case_v_pred
+    cmp             r3, #2
+    beq             case_h_pred
+    cmp             r3, #3
+    beq             case_tm_pred
+
+case_dc_pred
+    ldr             r4, [sp, #24]       ; Up
+    ldr             r5, [sp, #28]       ; Left
+
+    ; Default the DC average to 128
+    mov             r12, #128
+    vdup.u8         q0, r12
+
+    ; Zero out running sum
+    mov             r12, #0
+
+    ; compute shift and jump
+    adds            r7, r4, r5
+    beq             skip_dc_pred_up_left
+
+    ; Load above row, if it exists
+    cmp             r4, #0
+    beq             skip_dc_pred_up
+
+    sub             r6, r0, r2
+    vld1.8          {q1}, [r6]
+    vpaddl.u8       q2, q1
+    vpaddl.u16      q3, q2
+    vpaddl.u32      q4, q3
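+    ; Successive pairwise adds widen u8 -> u16 -> u32 -> u64, leaving the
+    ; sum of the 16 above-row pixels split across d8[0] and d9[0].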
+
+    vmov.32         r4, d8[0]
+    vmov.32         r6, d9[0]
+
+    add             r12, r4, r6
+
+    ; Move back to integer registers
+
+skip_dc_pred_up
+
+    cmp             r5, #0
+    beq             skip_dc_pred_left
+
+    sub             r0, r0, #1
+
+    ; Load left row, if it exists
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+
+    add             r12, r12, r3
+    add             r12, r12, r4
+    add             r12, r12, r5
+    add             r12, r12, r6
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+
+    add             r12, r12, r3
+    add             r12, r12, r4
+    add             r12, r12, r5
+    add             r12, r12, r6
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+
+    add             r12, r12, r3
+    add             r12, r12, r4
+    add             r12, r12, r5
+    add             r12, r12, r6
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0]
+
+    add             r12, r12, r3
+    add             r12, r12, r4
+    add             r12, r12, r5
+    add             r12, r12, r6
+
+skip_dc_pred_left
+    add             r7, r7, #3          ; Shift
+    sub             r4, r7, #1
+    mov             r5, #1
+    add             r12, r12, r5, lsl r4
+    mov             r5, r12, lsr r7     ; expected_dc
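+    ; In scalar form (a sketch, with Up/Left as 0/1 availability flags):
+    ;     shift = 3 + Up + Left;   /* 16 or 32 samples */
+    ;     expected_dc = (sum + (1 << (shift - 1))) >> shift;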
+
+    vdup.u8         q0, r5
+
+skip_dc_pred_up_left
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+
+    pop             {r4-r8,pc}
+case_v_pred
+    ; Copy down above row
+    sub             r6, r0, r2
+    vld1.8          {q0}, [r6]
+
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    pop             {r4-r8,pc}
+
+case_h_pred
+    ; Load 4x yleft_col
+    sub             r0, r0, #1
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+    vdup.u8         q0, r3
+    vdup.u8         q1, r4
+    vdup.u8         q2, r5
+    vdup.u8         q3, r6
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q1}, [r1]!
+    vst1.u8         {q2}, [r1]!
+    vst1.u8         {q3}, [r1]!
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+    vdup.u8         q0, r3
+    vdup.u8         q1, r4
+    vdup.u8         q2, r5
+    vdup.u8         q3, r6
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q1}, [r1]!
+    vst1.u8         {q2}, [r1]!
+    vst1.u8         {q3}, [r1]!
+
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+    vdup.u8         q0, r3
+    vdup.u8         q1, r4
+    vdup.u8         q2, r5
+    vdup.u8         q3, r6
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q1}, [r1]!
+    vst1.u8         {q2}, [r1]!
+    vst1.u8         {q3}, [r1]!
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+    vdup.u8         q0, r3
+    vdup.u8         q1, r4
+    vdup.u8         q2, r5
+    vdup.u8         q3, r6
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q1}, [r1]!
+    vst1.u8         {q2}, [r1]!
+    vst1.u8         {q3}, [r1]!
+
+    pop             {r4-r8,pc}
+
+case_tm_pred
+    ; Load yabove_row
+    sub             r3, r0, r2
+    vld1.8          {q8}, [r3]
+
+    ; Load ytop_left
+    sub             r3, r3, #1
+    ldrb            r7, [r3]
+
+    vdup.u16        q7, r7
+
+    ; Compute yabove_row - ytop_left
+    mov             r3, #1
+    vdup.u8         q0, r3
+
+    vmull.u8        q4, d16, d0
+    vmull.u8        q5, d17, d0
+
+    vsub.s16        q4, q4, q7
+    vsub.s16        q5, q5, q7
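+    ; TM prediction computes, per pixel (a scalar sketch):
+    ;     pred[r][c] = clamp(left[r] + above[c] - top_left, 0, 255);
+    ; (above - top_left) is kept as s16 in q4/q5 and added to each
+    ; left-column value below; vqshrun does the final clamp to u8.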
+
+    ; Load 4x yleft_col
+    sub             r0, r0, #1
+    mov             r12, #4
+
+case_tm_pred_loop
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+    vdup.u16        q0, r3
+    vdup.u16        q1, r4
+    vdup.u16        q2, r5
+    vdup.u16        q3, r6
+
+    vqadd.s16       q8, q0, q4
+    vqadd.s16       q9, q0, q5
+
+    vqadd.s16       q10, q1, q4
+    vqadd.s16       q11, q1, q5
+
+    vqadd.s16       q12, q2, q4
+    vqadd.s16       q13, q2, q5
+
+    vqadd.s16       q14, q3, q4
+    vqadd.s16       q15, q3, q5
+
+    vqshrun.s16     d0, q8, #0
+    vqshrun.s16     d1, q9, #0
+
+    vqshrun.s16     d2, q10, #0
+    vqshrun.s16     d3, q11, #0
+
+    vqshrun.s16     d4, q12, #0
+    vqshrun.s16     d5, q13, #0
+
+    vqshrun.s16     d6, q14, #0
+    vqshrun.s16     d7, q15, #0
+
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q1}, [r1]!
+    vst1.u8         {q2}, [r1]!
+    vst1.u8         {q3}, [r1]!
+
+    subs            r12, r12, #1
+    bne             case_tm_pred_loop
+
+    pop             {r4-r8,pc}
+
+    ENDP
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; r0    unsigned char *y_buffer
+; r1    unsigned char *ypred_ptr
+; r2    int y_stride
+; r3    int mode
+; stack int Up
+; stack int Left
+
+|vp8_build_intra_predictors_mby_s_neon_func| PROC
+    push            {r4-r8, lr}
+
+    mov             r1, r0      ;   unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor;
+
+    cmp             r3, #0
+    beq             case_dc_pred_s
+    cmp             r3, #1
+    beq             case_v_pred_s
+    cmp             r3, #2
+    beq             case_h_pred_s
+    cmp             r3, #3
+    beq             case_tm_pred_s
+
+case_dc_pred_s
+    ldr             r4, [sp, #24]       ; Up
+    ldr             r5, [sp, #28]       ; Left
+
+    ; Default the DC average to 128
+    mov             r12, #128
+    vdup.u8         q0, r12
+
+    ; Zero out running sum
+    mov             r12, #0
+
+    ; compute shift and jump
+    adds            r7, r4, r5
+    beq             skip_dc_pred_up_left_s
+
+    ; Load above row, if it exists
+    cmp             r4, #0
+    beq             skip_dc_pred_up_s
+
+    sub             r6, r0, r2
+    vld1.8          {q1}, [r6]
+    vpaddl.u8       q2, q1
+    vpaddl.u16      q3, q2
+    vpaddl.u32      q4, q3
+
+    vmov.32         r4, d8[0]
+    vmov.32         r6, d9[0]
+
+    add             r12, r4, r6
+
+    ; Move back to integer registers
+
+skip_dc_pred_up_s
+
+    cmp             r5, #0
+    beq             skip_dc_pred_left_s
+
+    sub             r0, r0, #1
+
+    ; Load left row, if it exists
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+
+    add             r12, r12, r3
+    add             r12, r12, r4
+    add             r12, r12, r5
+    add             r12, r12, r6
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+
+    add             r12, r12, r3
+    add             r12, r12, r4
+    add             r12, r12, r5
+    add             r12, r12, r6
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+
+    add             r12, r12, r3
+    add             r12, r12, r4
+    add             r12, r12, r5
+    add             r12, r12, r6
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0]
+
+    add             r12, r12, r3
+    add             r12, r12, r4
+    add             r12, r12, r5
+    add             r12, r12, r6
+
+skip_dc_pred_left_s
+    add             r7, r7, #3          ; Shift
+    sub             r4, r7, #1
+    mov             r5, #1
+    add             r12, r12, r5, lsl r4
+    mov             r5, r12, lsr r7     ; expected_dc
+
+    vdup.u8         q0, r5
+
+skip_dc_pred_up_left_s
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+
+    pop             {r4-r8,pc}
+case_v_pred_s
+    ; Copy down above row
+    sub             r6, r0, r2
+    vld1.8          {q0}, [r6]
+
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    pop             {r4-r8,pc}
+
+case_h_pred_s
+    ; Load 4x yleft_col
+    sub             r0, r0, #1
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+    vdup.u8         q0, r3
+    vdup.u8         q1, r4
+    vdup.u8         q2, r5
+    vdup.u8         q3, r6
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q1}, [r1], r2
+    vst1.u8         {q2}, [r1], r2
+    vst1.u8         {q3}, [r1], r2
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+    vdup.u8         q0, r3
+    vdup.u8         q1, r4
+    vdup.u8         q2, r5
+    vdup.u8         q3, r6
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q1}, [r1], r2
+    vst1.u8         {q2}, [r1], r2
+    vst1.u8         {q3}, [r1], r2
+
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+    vdup.u8         q0, r3
+    vdup.u8         q1, r4
+    vdup.u8         q2, r5
+    vdup.u8         q3, r6
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q1}, [r1], r2
+    vst1.u8         {q2}, [r1], r2
+    vst1.u8         {q3}, [r1], r2
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+    vdup.u8         q0, r3
+    vdup.u8         q1, r4
+    vdup.u8         q2, r5
+    vdup.u8         q3, r6
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q1}, [r1], r2
+    vst1.u8         {q2}, [r1], r2
+    vst1.u8         {q3}, [r1], r2
+
+    pop             {r4-r8,pc}
+
+case_tm_pred_s
+    ; Load yabove_row
+    sub             r3, r0, r2
+    vld1.8          {q8}, [r3]
+
+    ; Load ytop_left
+    sub             r3, r3, #1
+    ldrb            r7, [r3]
+
+    vdup.u16        q7, r7
+
+    ; Compute yabove_row - ytop_left
+    mov             r3, #1
+    vdup.u8         q0, r3
+
+    vmull.u8        q4, d16, d0
+    vmull.u8        q5, d17, d0
+
+    vsub.s16        q4, q4, q7
+    vsub.s16        q5, q5, q7
+
+    ; Load 4x yleft_col
+    sub             r0, r0, #1
+    mov             r12, #4
+
+case_tm_pred_loop_s
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+    vdup.u16        q0, r3
+    vdup.u16        q1, r4
+    vdup.u16        q2, r5
+    vdup.u16        q3, r6
+
+    vqadd.s16       q8, q0, q4
+    vqadd.s16       q9, q0, q5
+
+    vqadd.s16       q10, q1, q4
+    vqadd.s16       q11, q1, q5
+
+    vqadd.s16       q12, q2, q4
+    vqadd.s16       q13, q2, q5
+
+    vqadd.s16       q14, q3, q4
+    vqadd.s16       q15, q3, q5
+
+    vqshrun.s16     d0, q8, #0
+    vqshrun.s16     d1, q9, #0
+
+    vqshrun.s16     d2, q10, #0
+    vqshrun.s16     d3, q11, #0
+
+    vqshrun.s16     d4, q12, #0
+    vqshrun.s16     d5, q13, #0
+
+    vqshrun.s16     d6, q14, #0
+    vqshrun.s16     d7, q15, #0
+
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q1}, [r1], r2
+    vst1.u8         {q2}, [r1], r2
+    vst1.u8         {q3}, [r1], r2
+
+    subs            r12, r12, #1
+    bne             case_tm_pred_loop_s
+
+    pop             {r4-r8,pc}
+
+    ENDP
+
+
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/copymem16x16_neon.asm
@@ -1,0 +1,59 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_copy_mem16x16_neon|
+    ; ARM
+    ; REQUIRE8
+    ; PRESERVE8
+
+    AREA    Block, CODE, READONLY ; name this block of code
+;void copy_mem16x16_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp9_copy_mem16x16_neon| PROC
+
+    vld1.u8     {q0}, [r0], r1
+    vld1.u8     {q1}, [r0], r1
+    vld1.u8     {q2}, [r0], r1
+    vst1.u8     {q0}, [r2], r3
+    vld1.u8     {q3}, [r0], r1
+    vst1.u8     {q1}, [r2], r3
+    vld1.u8     {q4}, [r0], r1
+    vst1.u8     {q2}, [r2], r3
+    vld1.u8     {q5}, [r0], r1
+    vst1.u8     {q3}, [r2], r3
+    vld1.u8     {q6}, [r0], r1
+    vst1.u8     {q4}, [r2], r3
+    vld1.u8     {q7}, [r0], r1
+    vst1.u8     {q5}, [r2], r3
+    vld1.u8     {q8}, [r0], r1
+    vst1.u8     {q6}, [r2], r3
+    vld1.u8     {q9}, [r0], r1
+    vst1.u8     {q7}, [r2], r3
+    vld1.u8     {q10}, [r0], r1
+    vst1.u8     {q8}, [r2], r3
+    vld1.u8     {q11}, [r0], r1
+    vst1.u8     {q9}, [r2], r3
+    vld1.u8     {q12}, [r0], r1
+    vst1.u8     {q10}, [r2], r3
+    vld1.u8     {q13}, [r0], r1
+    vst1.u8     {q11}, [r2], r3
+    vld1.u8     {q14}, [r0], r1
+    vst1.u8     {q12}, [r2], r3
+    vld1.u8     {q15}, [r0], r1
+    vst1.u8     {q13}, [r2], r3
+    vst1.u8     {q14}, [r2], r3
+    vst1.u8     {q15}, [r2], r3
+
+    mov     pc, lr
+
+    ENDP  ; |vp9_copy_mem16x16_neon|
+
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/copymem8x4_neon.asm
@@ -1,0 +1,34 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_copy_mem8x4_neon|
+    ; ARM
+    ; REQUIRE8
+    ; PRESERVE8
+
+    AREA    Block, CODE, READONLY ; name this block of code
+;void copy_mem8x4_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp9_copy_mem8x4_neon| PROC
+    vld1.u8     {d0}, [r0], r1
+    vld1.u8     {d1}, [r0], r1
+    vst1.u8     {d0}, [r2], r3
+    vld1.u8     {d2}, [r0], r1
+    vst1.u8     {d1}, [r2], r3
+    vld1.u8     {d3}, [r0], r1
+    vst1.u8     {d2}, [r2], r3
+    vst1.u8     {d3}, [r2], r3
+
+    mov     pc, lr
+
+    ENDP  ; |vp9_copy_mem8x4_neon|
+
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/copymem8x8_neon.asm
@@ -1,0 +1,43 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_copy_mem8x8_neon|
+    ; ARM
+    ; REQUIRE8
+    ; PRESERVE8
+
+    AREA    Block, CODE, READONLY ; name this block of code
+;void copy_mem8x8_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp9_copy_mem8x8_neon| PROC
+
+    vld1.u8     {d0}, [r0], r1
+    vld1.u8     {d1}, [r0], r1
+    vst1.u8     {d0}, [r2], r3
+    vld1.u8     {d2}, [r0], r1
+    vst1.u8     {d1}, [r2], r3
+    vld1.u8     {d3}, [r0], r1
+    vst1.u8     {d2}, [r2], r3
+    vld1.u8     {d4}, [r0], r1
+    vst1.u8     {d3}, [r2], r3
+    vld1.u8     {d5}, [r0], r1
+    vst1.u8     {d4}, [r2], r3
+    vld1.u8     {d6}, [r0], r1
+    vst1.u8     {d5}, [r2], r3
+    vld1.u8     {d7}, [r0], r1
+    vst1.u8     {d6}, [r2], r3
+    vst1.u8     {d7}, [r2], r3
+
+    mov     pc, lr
+
+    ENDP  ; |vp9_copy_mem8x8_neon|
+
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/dc_only_idct_add_neon.asm
@@ -1,0 +1,49 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_dc_only_idct_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;void vp8_dc_only_idct_add_neon(short input_dc, unsigned char *pred_ptr,
+;                               unsigned char *dst_ptr, int pitch, int stride)
+; r0  input_dc
+; r1  pred_ptr
+; r2  dst_ptr
+; r3  pitch
+; sp  stride
+|vp8_dc_only_idct_add_neon| PROC
+    add             r0, r0, #4
+    asr             r0, r0, #3
+    ldr             r12, [sp]
+    vdup.16         q0, r0
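+    ; With only the DC coefficient present, the 4x4 IDCT collapses to
+    ; adding one rounded value to every predicted pixel (a scalar sketch):
+    ;     dc = (input_dc + 4) >> 3;
+    ;     dst[r][c] = clamp(pred[r][c] + dc, 0, 255);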
+
+    vld1.32         {d2[0]}, [r1], r3
+    vld1.32         {d2[1]}, [r1], r3
+    vld1.32         {d4[0]}, [r1], r3
+    vld1.32         {d4[1]}, [r1]
+
+    vaddw.u8        q1, q0, d2
+    vaddw.u8        q2, q0, d4
+
+    vqmovun.s16     d2, q1
+    vqmovun.s16     d4, q2
+
+    vst1.32         {d2[0]}, [r2], r12
+    vst1.32         {d2[1]}, [r2], r12
+    vst1.32         {d4[0]}, [r2], r12
+    vst1.32         {d4[1]}, [r2]
+
+    bx             lr
+
+    ENDP
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/iwalsh_neon.asm
@@ -1,0 +1,80 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+    EXPORT  |vp8_short_inv_walsh4x4_neon|
+    EXPORT  |vp8_short_inv_walsh4x4_1_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+
+;short vp8_short_inv_walsh4x4_neon(short *input, short *output)
+|vp8_short_inv_walsh4x4_neon| PROC
+
+    ; read in all four lines of values: d0->d3
+    vld1.i16 {q0-q1}, [r0@128]
+
+    ; first for loop
+    vadd.s16 d4, d0, d3 ;a = [0] + [12]
+    vadd.s16 d6, d1, d2 ;b = [4] + [8]
+    vsub.s16 d5, d0, d3 ;d = [0] - [12]
+    vsub.s16 d7, d1, d2 ;c = [4] - [8]
+
+    vadd.s16 q0, q2, q3 ; a+b d+c
+    vsub.s16 q1, q2, q3 ; a-b d-c
+
+    vtrn.32 d0, d2 ;d0:  0  1  8  9
+                   ;d2:  2  3 10 11
+    vtrn.32 d1, d3 ;d1:  4  5 12 13
+                   ;d3:  6  7 14 15
+
+    vtrn.16 d0, d1 ;d0:  0  4  8 12
+                   ;d1:  1  5  9 13
+    vtrn.16 d2, d3 ;d2:  2  6 10 14
+                   ;d3:  3  7 11 15
+
+    ; second for loop
+
+    vadd.s16 d4, d0, d3 ;a = [0] + [3]
+    vadd.s16 d6, d1, d2 ;b = [1] + [2]
+    vsub.s16 d5, d0, d3 ;d = [0] - [3]
+    vsub.s16 d7, d1, d2 ;c = [1] - [2]
+
+    vmov.i16 q8, #3
+
+    vadd.s16 q0, q2, q3 ; a+b d+c
+    vsub.s16 q1, q2, q3 ; a-b d-c
+
+    vadd.i16 q0, q0, q8 ;e/f += 3
+    vadd.i16 q1, q1, q8 ;g/h += 3
+
+    vshr.s16 q0, q0, #3 ;e/f >> 3
+    vshr.s16 q1, q1, #3 ;g/h >> 3
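+    ; Both passes are 4-point butterflies; the final rounding is
+    ; (x + 3) >> 3, matching the +3 bias added above.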
+
+    vst4.i16 {d0,d1,d2,d3}, [r1@128]
+
+    bx lr
+    ENDP    ; |vp8_short_inv_walsh4x4_neon|
+
+
+;short vp8_short_inv_walsh4x4_1_neon(short *input, short *output)
+|vp8_short_inv_walsh4x4_1_neon| PROC
+    ldrsh r2, [r0]          ; load input[0]
+    add r3, r2, #3          ; add 3
+    add r2, r1, #16         ; base for last 8 output
+    asr r0, r3, #3          ; right shift 3
+    vdup.16 q0, r0          ; load and duplicate
+    vst1.16 {q0}, [r1@128]  ; write back 8
+    vst1.16 {q0}, [r2@128]  ; write back last 8
+    bx lr
+    ENDP    ; |vp8_short_inv_walsh4x4_1_neon|
+
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/loopfilter_neon.asm
@@ -1,0 +1,397 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_loop_filter_horizontal_edge_y_neon|
+    EXPORT  |vp9_loop_filter_horizontal_edge_uv_neon|
+    EXPORT  |vp9_loop_filter_vertical_edge_y_neon|
+    EXPORT  |vp9_loop_filter_vertical_edge_uv_neon|
+    ARM
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src
+; r1    int pitch
+; r2    unsigned char blimit
+; r3    unsigned char limit
+; sp    unsigned char thresh,
+|vp9_loop_filter_horizontal_edge_y_neon| PROC
+    push        {lr}
+    vdup.u8     q0, r2                     ; duplicate blimit
+    vdup.u8     q1, r3                     ; duplicate limit
+    sub         r2, r0, r1, lsl #2         ; move src pointer down by 4 lines
+    ldr         r3, [sp, #4]               ; load thresh
+    add         r12, r2, r1
+    add         r1, r1, r1
+
+    vdup.u8     q2, r3                     ; duplicate thresh
+
+    vld1.u8     {q3}, [r2@128], r1              ; p3
+    vld1.u8     {q4}, [r12@128], r1             ; p2
+    vld1.u8     {q5}, [r2@128], r1              ; p1
+    vld1.u8     {q6}, [r12@128], r1             ; p0
+    vld1.u8     {q7}, [r2@128], r1              ; q0
+    vld1.u8     {q8}, [r12@128], r1             ; q1
+    vld1.u8     {q9}, [r2@128]                  ; q2
+    vld1.u8     {q10}, [r12@128]                ; q3
+
+    sub         r2, r2, r1, lsl #1
+    sub         r12, r12, r1, lsl #1
+
+    bl          vp9_loop_filter_neon
+
+    vst1.u8     {q5}, [r2@128], r1              ; store op1
+    vst1.u8     {q6}, [r12@128], r1             ; store op0
+    vst1.u8     {q7}, [r2@128], r1              ; store oq0
+    vst1.u8     {q8}, [r12@128], r1             ; store oq1
+
+    pop         {pc}
+    ENDP        ; |vp9_loop_filter_horizontal_edge_y_neon|
+
+
+; r0    unsigned char *u,
+; r1    int pitch,
+; r2    unsigned char blimit
+; r3    unsigned char limit
+; sp    unsigned char thresh,
+; sp+4  unsigned char *v
+|vp9_loop_filter_horizontal_edge_uv_neon| PROC
+    push        {lr}
+    vdup.u8     q0, r2                      ; duplicate blimit
+    vdup.u8     q1, r3                      ; duplicate limit
+    ldr         r12, [sp, #4]               ; load thresh
+    ldr         r2, [sp, #8]                ; load v ptr
+    vdup.u8     q2, r12                     ; duplicate thresh
+
+    sub         r3, r0, r1, lsl #2          ; move u pointer down by 4 lines
+    sub         r12, r2, r1, lsl #2         ; move v pointer down by 4 lines
+
+    vld1.u8     {d6}, [r3@64], r1              ; p3
+    vld1.u8     {d7}, [r12@64], r1             ; p3
+    vld1.u8     {d8}, [r3@64], r1              ; p2
+    vld1.u8     {d9}, [r12@64], r1             ; p2
+    vld1.u8     {d10}, [r3@64], r1             ; p1
+    vld1.u8     {d11}, [r12@64], r1            ; p1
+    vld1.u8     {d12}, [r3@64], r1             ; p0
+    vld1.u8     {d13}, [r12@64], r1            ; p0
+    vld1.u8     {d14}, [r3@64], r1             ; q0
+    vld1.u8     {d15}, [r12@64], r1            ; q0
+    vld1.u8     {d16}, [r3@64], r1             ; q1
+    vld1.u8     {d17}, [r12@64], r1            ; q1
+    vld1.u8     {d18}, [r3@64], r1             ; q2
+    vld1.u8     {d19}, [r12@64], r1            ; q2
+    vld1.u8     {d20}, [r3@64]                 ; q3
+    vld1.u8     {d21}, [r12@64]                ; q3
+
+    bl          vp9_loop_filter_neon
+
+    sub         r0, r0, r1, lsl #1
+    sub         r2, r2, r1, lsl #1
+
+    vst1.u8     {d10}, [r0@64], r1             ; store u op1
+    vst1.u8     {d11}, [r2@64], r1             ; store v op1
+    vst1.u8     {d12}, [r0@64], r1             ; store u op0
+    vst1.u8     {d13}, [r2@64], r1             ; store v op0
+    vst1.u8     {d14}, [r0@64], r1             ; store u oq0
+    vst1.u8     {d15}, [r2@64], r1             ; store v oq0
+    vst1.u8     {d16}, [r0@64]                 ; store u oq1
+    vst1.u8     {d17}, [r2@64]                 ; store v oq1
+
+    pop         {pc}
+    ENDP        ; |vp9_loop_filter_horizontal_edge_uv_neon|
+
+; void vp9_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
+;                                           unsigned char blimit,
+;                                           unsigned char limit,
+;                                           unsigned char thresh)
+; r0    unsigned char *src
+; r1    int pitch
+; r2    unsigned char blimit
+; r3    unsigned char limit
+; sp    unsigned char thresh,
+
+|vp9_loop_filter_vertical_edge_y_neon| PROC
+    push        {lr}
+    vdup.u8     q0, r2                     ; duplicate blimit
+    vdup.u8     q1, r3                     ; duplicate limit
+    sub         r2, r0, #4                 ; src ptr down by 4 columns
+    add         r1, r1, r1
+    ldr         r3, [sp, #4]               ; load thresh
+    add         r12, r2, r1, asr #1
+
+    vld1.u8     {d6}, [r2], r1
+    vld1.u8     {d8}, [r12], r1
+    vld1.u8     {d10}, [r2], r1
+    vld1.u8     {d12}, [r12], r1
+    vld1.u8     {d14}, [r2], r1
+    vld1.u8     {d16}, [r12], r1
+    vld1.u8     {d18}, [r2], r1
+    vld1.u8     {d20}, [r12], r1
+
+    vld1.u8     {d7}, [r2], r1              ; load second 8-line src data
+    vld1.u8     {d9}, [r12], r1
+    vld1.u8     {d11}, [r2], r1
+    vld1.u8     {d13}, [r12], r1
+    vld1.u8     {d15}, [r2], r1
+    vld1.u8     {d17}, [r12], r1
+    vld1.u8     {d19}, [r2]
+    vld1.u8     {d21}, [r12]
+
+    ;transpose to 8x16 matrix
+    vtrn.32     q3, q7
+    vtrn.32     q4, q8
+    vtrn.32     q5, q9
+    vtrn.32     q6, q10
+
+    vdup.u8     q2, r3                     ; duplicate thresh
+
+    vtrn.16     q3, q5
+    vtrn.16     q4, q6
+    vtrn.16     q7, q9
+    vtrn.16     q8, q10
+
+    vtrn.8      q3, q4
+    vtrn.8      q5, q6
+    vtrn.8      q7, q8
+    vtrn.8      q9, q10
+
+    bl          vp9_loop_filter_neon
+
+    vswp        d12, d11
+    vswp        d16, d13
+
+    sub         r0, r0, #2                 ; dst ptr
+
+    vswp        d14, d12
+    vswp        d16, d15
+
+    add         r12, r0, r1, asr #1
+
+    ;store op1, op0, oq0, oq1
+    vst4.8      {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
+    vst4.8      {d10[1], d11[1], d12[1], d13[1]}, [r12], r1
+    vst4.8      {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
+    vst4.8      {d10[3], d11[3], d12[3], d13[3]}, [r12], r1
+    vst4.8      {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
+    vst4.8      {d10[5], d11[5], d12[5], d13[5]}, [r12], r1
+    vst4.8      {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
+    vst4.8      {d10[7], d11[7], d12[7], d13[7]}, [r12], r1
+
+    vst4.8      {d14[0], d15[0], d16[0], d17[0]}, [r0], r1
+    vst4.8      {d14[1], d15[1], d16[1], d17[1]}, [r12], r1
+    vst4.8      {d14[2], d15[2], d16[2], d17[2]}, [r0], r1
+    vst4.8      {d14[3], d15[3], d16[3], d17[3]}, [r12], r1
+    vst4.8      {d14[4], d15[4], d16[4], d17[4]}, [r0], r1
+    vst4.8      {d14[5], d15[5], d16[5], d17[5]}, [r12], r1
+    vst4.8      {d14[6], d15[6], d16[6], d17[6]}, [r0]
+    vst4.8      {d14[7], d15[7], d16[7], d17[7]}, [r12]
+
+    pop         {pc}
+    ENDP        ; |vp9_loop_filter_vertical_edge_y_neon|
+
+; void vp9_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
+;                                            unsigned char blimit,
+;                                            unsigned char limit,
+;                                            unsigned char thresh,
+;                                            unsigned char *v)
+; r0    unsigned char *u,
+; r1    int pitch,
+; r2    unsigned char blimit
+; r3    unsigned char limit
+; sp    unsigned char thresh,
+; sp+4  unsigned char *v
+|vp9_loop_filter_vertical_edge_uv_neon| PROC
+    push        {lr}
+    vdup.u8     q0, r2                      ; duplicate blimit
+    sub         r12, r0, #4                 ; move u pointer down by 4 columns
+    ldr         r2, [sp, #8]                ; load v ptr
+    vdup.u8     q1, r3                      ; duplicate limit
+    sub         r3, r2, #4                  ; move v pointer down by 4 columns
+
+    vld1.u8     {d6}, [r12], r1             ;load u data
+    vld1.u8     {d7}, [r3], r1              ;load v data
+    vld1.u8     {d8}, [r12], r1
+    vld1.u8     {d9}, [r3], r1
+    vld1.u8     {d10}, [r12], r1
+    vld1.u8     {d11}, [r3], r1
+    vld1.u8     {d12}, [r12], r1
+    vld1.u8     {d13}, [r3], r1
+    vld1.u8     {d14}, [r12], r1
+    vld1.u8     {d15}, [r3], r1
+    vld1.u8     {d16}, [r12], r1
+    vld1.u8     {d17}, [r3], r1
+    vld1.u8     {d18}, [r12], r1
+    vld1.u8     {d19}, [r3], r1
+    vld1.u8     {d20}, [r12]
+    vld1.u8     {d21}, [r3]
+
+    ldr        r12, [sp, #4]               ; load thresh
+
+    ;transpose to 8x16 matrix
+    vtrn.32     q3, q7
+    vtrn.32     q4, q8
+    vtrn.32     q5, q9
+    vtrn.32     q6, q10
+
+    vdup.u8     q2, r12                     ; duplicate thresh
+
+    vtrn.16     q3, q5
+    vtrn.16     q4, q6
+    vtrn.16     q7, q9
+    vtrn.16     q8, q10
+
+    vtrn.8      q3, q4
+    vtrn.8      q5, q6
+    vtrn.8      q7, q8
+    vtrn.8      q9, q10
+
+    bl          vp9_loop_filter_neon
+
+    vswp        d12, d11
+    vswp        d16, d13
+    vswp        d14, d12
+    vswp        d16, d15
+
+    sub         r0, r0, #2
+    sub         r2, r2, #2
+
+    ;store op1, op0, oq0, oq1
+    vst4.8      {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
+    vst4.8      {d14[0], d15[0], d16[0], d17[0]}, [r2], r1
+    vst4.8      {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
+    vst4.8      {d14[1], d15[1], d16[1], d17[1]}, [r2], r1
+    vst4.8      {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
+    vst4.8      {d14[2], d15[2], d16[2], d17[2]}, [r2], r1
+    vst4.8      {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
+    vst4.8      {d14[3], d15[3], d16[3], d17[3]}, [r2], r1
+    vst4.8      {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
+    vst4.8      {d14[4], d15[4], d16[4], d17[4]}, [r2], r1
+    vst4.8      {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
+    vst4.8      {d14[5], d15[5], d16[5], d17[5]}, [r2], r1
+    vst4.8      {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
+    vst4.8      {d14[6], d15[6], d16[6], d17[6]}, [r2], r1
+    vst4.8      {d10[7], d11[7], d12[7], d13[7]}, [r0]
+    vst4.8      {d14[7], d15[7], d16[7], d17[7]}, [r2]
+
+    pop         {pc}
+    ENDP        ; |vp9_loop_filter_vertical_edge_uv_neon|
+
+; void vp9_loop_filter_neon();
+; This is a helper function for the loop filters. The individual functions do
+; the necessary load, transpose (if necessary) and store.
+
+; r0-r3 PRESERVE
+; q0    flimit
+; q1    limit
+; q2    thresh
+; q3    p3
+; q4    p2
+; q5    p1
+; q6    p0
+; q7    q0
+; q8    q1
+; q9    q2
+; q10   q3
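+;
+; In scalar terms the masks computed below are roughly (a sketch):
+;   mask = max(|p3-p2|, |p2-p1|, |p1-p0|, |q1-q0|, |q2-q1|, |q3-q2|) <= limit
+;          && |p0-q0|*2 + |p1-q1|/2 <= blimit;
+;   hev  = |p1-p0| > thresh || |q1-q0| > thresh;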
+|vp9_loop_filter_neon| PROC
+
+    ; vp9_filter_mask
+    vabd.u8     q11, q3, q4                 ; abs(p3 - p2)
+    vabd.u8     q12, q4, q5                 ; abs(p2 - p1)
+    vabd.u8     q13, q5, q6                 ; abs(p1 - p0)
+    vabd.u8     q14, q8, q7                 ; abs(q1 - q0)
+    vabd.u8     q3, q9, q8                  ; abs(q2 - q1)
+    vabd.u8     q4, q10, q9                 ; abs(q3 - q2)
+
+    vmax.u8     q11, q11, q12
+    vmax.u8     q12, q13, q14
+    vmax.u8     q3, q3, q4
+    vmax.u8     q15, q11, q12
+
+    vabd.u8     q9, q6, q7                  ; abs(p0 - q0)
+
+    ; vp8_hevmask
+    vcgt.u8     q13, q13, q2                ; (abs(p1 - p0) > thresh)*-1
+    vcgt.u8     q14, q14, q2                ; (abs(q1 - q0) > thresh)*-1
+    vmax.u8     q15, q15, q3
+
+    vmov.u8     q10, #0x80                   ; 0x80
+
+    vabd.u8     q2, q5, q8                  ; a = abs(p1 - q1)
+    vqadd.u8    q9, q9, q9                  ; b = abs(p0 - q0) * 2
+
+    vcge.u8     q15, q1, q15
+
+    ; vp9_filter() function
+    ; convert to signed
+    veor        q7, q7, q10                 ; qs0
+    vshr.u8     q2, q2, #1                  ; a = a / 2
+    veor        q6, q6, q10                 ; ps0
+
+    veor        q5, q5, q10                 ; ps1
+    vqadd.u8    q9, q9, q2                  ; a = b + a
+
+    veor        q8, q8, q10                 ; qs1
+
+    vmov.u8     q10, #3                     ; #3
+
+    vsubl.s8    q2, d14, d12                ; ( qs0 - ps0)
+    vsubl.s8    q11, d15, d13
+
+    vcge.u8     q9, q0, q9                  ; (a > flimit * 2 + limit) * -1
+
+    vmovl.u8    q4, d20
+
+    vqsub.s8    q1, q5, q8                  ; vp9_filter = clamp(ps1-qs1)
+    vorr        q14, q13, q14               ; vp8_hevmask
+
+    vmul.i16    q2, q2, q4                  ; 3 * ( qs0 - ps0)
+    vmul.i16    q11, q11, q4
+
+    vand        q1, q1, q14                 ; vp9_filter &= hev
+    vand        q15, q15, q9                ; vp9_filter_mask
+
+    vaddw.s8    q2, q2, d2
+    vaddw.s8    q11, q11, d3
+
+    vmov.u8     q9, #4                      ; #4
+
+    ; vp9_filter = clamp(vp9_filter + 3 * ( qs0 - ps0))
+    vqmovn.s16  d2, q2
+    vqmovn.s16  d3, q11
+    vand        q1, q1, q15                 ; vp9_filter &= mask
+
+    vqadd.s8    q2, q1, q10                 ; Filter2 = clamp(vp9_filter+3)
+    vqadd.s8    q1, q1, q9                  ; Filter1 = clamp(vp9_filter+4)
+    vshr.s8     q2, q2, #3                  ; Filter2 >>= 3
+    vshr.s8     q1, q1, #3                  ; Filter1 >>= 3
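+    ; In scalar form (a sketch, with f the clamped filter value):
+    ;     p0 += clamp(f + 3) >> 3;   /* Filter2 */
+    ;     q0 -= clamp(f + 4) >> 3;   /* Filter1 */
+    ; and for non-hev pixels the outer taps p1/q1 get (Filter1 + 1) >> 1.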
+
+
+    vqadd.s8    q11, q6, q2                 ; u = clamp(ps0 + Filter2)
+    vqsub.s8    q10, q7, q1                 ; u = clamp(qs0 - Filter1)
+
+    ; outer tap adjustments: ++vp9_filter >> 1
+    vrshr.s8    q1, q1, #1
+    vbic        q1, q1, q14                 ; vp9_filter &= ~hev
+    vmov.u8     q0, #0x80                   ; 0x80
+    vqadd.s8    q13, q5, q1                 ; u = clamp(ps1 + vp9_filter)
+    vqsub.s8    q12, q8, q1                 ; u = clamp(qs1 - vp9_filter)
+
+    veor        q6, q11, q0                 ; *op0 = u^0x80
+    veor        q7, q10, q0                 ; *oq0 = u^0x80
+    veor        q5, q13, q0                 ; *op1 = u^0x80
+    veor        q8, q12, q0                 ; *oq1 = u^0x80
+
+    bx          lr
+    ENDP        ; |vp9_loop_filter_neon|
+
+;-----------------
+
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
@@ -1,0 +1,117 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    ;EXPORT  |vp9_loop_filter_simple_horizontal_edge_neon|
+    EXPORT  |vp9_loop_filter_bhs_neon|
+    EXPORT  |vp9_loop_filter_mbhs_neon|
+    ARM
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *s, PRESERVE
+; r1    int p, PRESERVE
+; q1    limit, PRESERVE
+
+|vp9_loop_filter_simple_horizontal_edge_neon| PROC
+
+    sub         r3, r0, r1, lsl #1          ; move src pointer down by 2 lines
+
+    vld1.u8     {q7}, [r0@128], r1          ; q0
+    vld1.u8     {q5}, [r3@128], r1          ; p1
+    vld1.u8     {q8}, [r0@128]              ; q1
+    vld1.u8     {q6}, [r3@128]              ; p0
+
+    vabd.u8     q15, q6, q7                 ; abs(p0 - q0)
+    vabd.u8     q14, q5, q8                 ; abs(p1 - q1)
+
+    vqadd.u8    q15, q15, q15               ; abs(p0 - q0) * 2
+    vshr.u8     q14, q14, #1                ; abs(p1 - q1) / 2
+    vmov.u8     q0, #0x80                   ; 0x80
+    vmov.s16    q13, #3
+    vqadd.u8    q15, q15, q14               ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
+
+    veor        q7, q7, q0                  ; qs0: q0 offset to convert to a signed value
+    veor        q6, q6, q0                  ; ps0: p0 offset to convert to a signed value
+    veor        q5, q5, q0                  ; ps1: p1 offset to convert to a signed value
+    veor        q8, q8, q0                  ; qs1: q1 offset to convert to a signed value
+
+    vcge.u8     q15, q1, q15                ; (abs(p0 - q0)*2 + abs(p1-q1)/2 <= limit)*-1
+
+    vsubl.s8    q2, d14, d12                ; ( qs0 - ps0)
+    vsubl.s8    q3, d15, d13
+
+    vqsub.s8    q4, q5, q8                  ; q4: vp9_filter = vp9_signed_char_clamp(ps1-qs1)
+
+    vmul.s16    q2, q2, q13                 ;  3 * ( qs0 - ps0)
+    vmul.s16    q3, q3, q13
+
+    vmov.u8     q10, #0x03                  ; 0x03
+    vmov.u8     q9, #0x04                   ; 0x04
+
+    vaddw.s8    q2, q2, d8                  ; vp9_filter + 3 * ( qs0 - ps0)
+    vaddw.s8    q3, q3, d9
+
+    vqmovn.s16  d8, q2                      ; vp9_filter = vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0))
+    vqmovn.s16  d9, q3
+
+    vand        q14, q4, q15                ; vp9_filter &= mask
+
+    vqadd.s8    q2, q14, q10                ; Filter2 = vp9_signed_char_clamp(vp9_filter+3)
+    vqadd.s8    q3, q14, q9                 ; Filter1 = vp9_signed_char_clamp(vp9_filter+4)
+    vshr.s8     q2, q2, #3                  ; Filter2 >>= 3
+    vshr.s8     q4, q3, #3                  ; Filter1 >>= 3
+
+    sub         r0, r0, r1
+
+    ;calculate output
+    vqadd.s8    q11, q6, q2                 ; u = vp9_signed_char_clamp(ps0 + Filter2)
+    vqsub.s8    q10, q7, q4                 ; u = vp9_signed_char_clamp(qs0 - Filter1)
+
+    veor        q6, q11, q0                 ; *op0 = u^0x80
+    veor        q7, q10, q0                 ; *oq0 = u^0x80
+
+    vst1.u8     {q6}, [r3@128]              ; store op0
+    vst1.u8     {q7}, [r0@128]              ; store oq0
+
+    bx          lr
+    ENDP        ; |vp9_loop_filter_simple_horizontal_edge_neon|
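+
+; Scalar sketch of the simple filter above (illustrative only; c8() is a
+; hypothetical signed-char saturating clamp):
+;
+;   mask = (2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= blimit) ? -1 : 0;
+;   f    = c8(c8(ps1 - qs1) + 3 * (qs0 - ps0)) & mask;
+;   *op0 = c8(ps0 + (c8(f + 3) >> 3)) ^ 0x80;   /* Filter2 */
+;   *oq0 = c8(qs0 - (c8(f + 4) >> 3)) ^ 0x80;   /* Filter1 */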
+
+; r0    unsigned char *y
+; r1    int ystride
+; r2    const unsigned char *blimit
+
+|vp9_loop_filter_bhs_neon| PROC
+    push        {r4, lr}
+    ldrb        r3, [r2]                    ; load blim from mem
+    vdup.s8     q1, r3                      ; duplicate blim
+
+    add         r0, r0, r1, lsl #2          ; src = y_ptr + 4 * y_stride
+    bl          vp9_loop_filter_simple_horizontal_edge_neon
+    ; vp9_loop_filter_simple_horizontal_edge_neon preserves r0, r1 and q1
+    add         r0, r0, r1, lsl #2          ; src = y_ptr + 8 * y_stride
+    bl          vp9_loop_filter_simple_horizontal_edge_neon
+    add         r0, r0, r1, lsl #2          ; src = y_ptr + 12 * y_stride
+    pop         {r4, lr}
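+    ; tail call: lr was restored by the pop above, so the filter call below
+    ; returns directly to this function's caller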
+    b           vp9_loop_filter_simple_horizontal_edge_neon
+    ENDP        ;|vp9_loop_filter_bhs_neon|
+
+; r0    unsigned char *y
+; r1    int ystride
+; r2    const unsigned char *blimit
+
+|vp9_loop_filter_mbhs_neon| PROC
+    ldrb        r3, [r2]                   ; load mblim from mem
+    vdup.s8     q1, r3                     ; duplicate mblim
+    b           vp9_loop_filter_simple_horizontal_edge_neon
+    ENDP        ;|vp9_loop_filter_mbhs_neon|
+
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
@@ -1,0 +1,154 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    ;EXPORT  |vp9_loop_filter_simple_vertical_edge_neon|
+    EXPORT |vp9_loop_filter_bvs_neon|
+    EXPORT |vp9_loop_filter_mbvs_neon|
+    ARM
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *s, PRESERVE
+; r1    int p, PRESERVE
+; q1    limit, PRESERVE
+
+|vp9_loop_filter_simple_vertical_edge_neon| PROC
+    sub         r0, r0, #2                  ; move src pointer down by 2 columns
+    add         r12, r1, r1
+    add         r3, r0, r1
+
+    vld4.8      {d6[0], d7[0], d8[0], d9[0]}, [r0], r12
+    vld4.8      {d6[1], d7[1], d8[1], d9[1]}, [r3], r12
+    vld4.8      {d6[2], d7[2], d8[2], d9[2]}, [r0], r12
+    vld4.8      {d6[3], d7[3], d8[3], d9[3]}, [r3], r12
+    vld4.8      {d6[4], d7[4], d8[4], d9[4]}, [r0], r12
+    vld4.8      {d6[5], d7[5], d8[5], d9[5]}, [r3], r12
+    vld4.8      {d6[6], d7[6], d8[6], d9[6]}, [r0], r12
+    vld4.8      {d6[7], d7[7], d8[7], d9[7]}, [r3], r12
+
+    vld4.8      {d10[0], d11[0], d12[0], d13[0]}, [r0], r12
+    vld4.8      {d10[1], d11[1], d12[1], d13[1]}, [r3], r12
+    vld4.8      {d10[2], d11[2], d12[2], d13[2]}, [r0], r12
+    vld4.8      {d10[3], d11[3], d12[3], d13[3]}, [r3], r12
+    vld4.8      {d10[4], d11[4], d12[4], d13[4]}, [r0], r12
+    vld4.8      {d10[5], d11[5], d12[5], d13[5]}, [r3], r12
+    vld4.8      {d10[6], d11[6], d12[6], d13[6]}, [r0], r12
+    vld4.8      {d10[7], d11[7], d12[7], d13[7]}, [r3]
+
+    vswp        d7, d10
+    vswp        d12, d9
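+    ; the vld4 lane loads above gathered four adjacent columns from 16 rows;
+    ; after the two vswp, q3/q5/q4/q6 hold p1/p0/q0/q1 for all 16 rows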
+
+    ;vp9_filter_mask() function (the simple filter computes no hev mask)
+    sub         r0, r0, r1, lsl #4
+    vabd.u8     q15, q5, q4                 ; abs(p0 - q0)
+    vabd.u8     q14, q3, q6                 ; abs(p1 - q1)
+
+    vqadd.u8    q15, q15, q15               ; abs(p0 - q0) * 2
+    vshr.u8     q14, q14, #1                ; abs(p1 - q1) / 2
+    vmov.u8     q0, #0x80                   ; 0x80
+    vmov.s16    q11, #3
+    vqadd.u8    q15, q15, q14               ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
+
+    veor        q4, q4, q0                  ; qs0: q0 offset to convert to a signed value
+    veor        q5, q5, q0                  ; ps0: p0 offset to convert to a signed value
+    veor        q3, q3, q0                  ; ps1: p1 offset to convert to a signed value
+    veor        q6, q6, q0                  ; qs1: q1 offset to convert to a signed value
+
+    vcge.u8     q15, q1, q15                ; (abs(p0 - q0)*2 + abs(p1-q1)/2 <= flimit*2 + limit)*-1
+
+    vsubl.s8    q2, d8, d10                 ; ( qs0 - ps0)
+    vsubl.s8    q13, d9, d11
+
+    vqsub.s8    q14, q3, q6                 ; vp9_filter = vp9_signed_char_clamp(ps1-qs1)
+
+    vmul.s16    q2, q2, q11                 ;  3 * ( qs0 - ps0)
+    vmul.s16    q13, q13, q11
+
+    vmov.u8     q11, #0x03                  ; 0x03
+    vmov.u8     q12, #0x04                  ; 0x04
+
+    vaddw.s8    q2, q2, d28                 ; vp9_filter + 3 * ( qs0 - ps0)
+    vaddw.s8    q13, q13, d29
+
+    vqmovn.s16  d28, q2                     ; vp9_filter = vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0))
+    vqmovn.s16  d29, q13
+
+    add         r0, r0, #1
+    add         r3, r0, r1
+
+    vand        q14, q14, q15               ; vp9_filter &= mask
+
+    vqadd.s8    q2, q14, q11                ; Filter2 = vp9_signed_char_clamp(vp9_filter+3)
+    vqadd.s8    q3, q14, q12                ; Filter1 = vp9_signed_char_clamp(vp9_filter+4)
+    vshr.s8     q2, q2, #3                  ; Filter2 >>= 3
+    vshr.s8     q14, q3, #3                 ; Filter1 >>= 3
+
+    ;calculate output
+    vqadd.s8    q11, q5, q2                 ; u = vp9_signed_char_clamp(ps0 + Filter2)
+    vqsub.s8    q10, q4, q14                ; u = vp9_signed_char_clamp(qs0 - Filter1)
+
+    veor        q6, q11, q0                 ; *op0 = u^0x80
+    veor        q7, q10, q0                 ; *oq0 = u^0x80
+    add         r12, r1, r1
+    vswp        d13, d14
+
+    ;store op0, oq0
+    vst2.8      {d12[0], d13[0]}, [r0], r12
+    vst2.8      {d12[1], d13[1]}, [r3], r12
+    vst2.8      {d12[2], d13[2]}, [r0], r12
+    vst2.8      {d12[3], d13[3]}, [r3], r12
+    vst2.8      {d12[4], d13[4]}, [r0], r12
+    vst2.8      {d12[5], d13[5]}, [r3], r12
+    vst2.8      {d12[6], d13[6]}, [r0], r12
+    vst2.8      {d12[7], d13[7]}, [r3], r12
+    vst2.8      {d14[0], d15[0]}, [r0], r12
+    vst2.8      {d14[1], d15[1]}, [r3], r12
+    vst2.8      {d14[2], d15[2]}, [r0], r12
+    vst2.8      {d14[3], d15[3]}, [r3], r12
+    vst2.8      {d14[4], d15[4]}, [r0], r12
+    vst2.8      {d14[5], d15[5]}, [r3], r12
+    vst2.8      {d14[6], d15[6]}, [r0], r12
+    vst2.8      {d14[7], d15[7]}, [r3]
+
+    bx          lr
+    ENDP        ; |vp9_loop_filter_simple_vertical_edge_neon|
+
+; r0    unsigned char *y
+; r1    int ystride
+; r2    const unsigned char *blimit
+
+|vp9_loop_filter_bvs_neon| PROC
+    push        {r4, lr}
+    ldrb        r3, [r2]                   ; load blim from mem
+    mov         r4, r0
+    add         r0, r0, #4
+    vdup.s8     q1, r3                     ; duplicate blim
+    bl          vp9_loop_filter_simple_vertical_edge_neon
+    ; vp9_loop_filter_simple_vertical_edge_neon preserves r1 and q1
+    add         r0, r4, #8
+    bl          vp9_loop_filter_simple_vertical_edge_neon
+    add         r0, r4, #12
+    pop         {r4, lr}
+    b           vp9_loop_filter_simple_vertical_edge_neon
+    ENDP        ;|vp9_loop_filter_bvs_neon|
+
+; r0    unsigned char *y
+; r1    int ystride
+; r2    const unsigned char *blimit
+
+|vp9_loop_filter_mbvs_neon| PROC
+    ldrb        r3, [r2]                   ; load mblim from mem
+    vdup.s8     q1, r3                     ; duplicate mblim
+    b           vp9_loop_filter_simple_vertical_edge_neon
+    ENDP        ;|vp9_loop_filter_mbvs_neon|
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/mbloopfilter_neon.asm
@@ -1,0 +1,469 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_mbloop_filter_horizontal_edge_y_neon|
+    EXPORT  |vp8_mbloop_filter_horizontal_edge_uv_neon|
+    EXPORT  |vp8_mbloop_filter_vertical_edge_y_neon|
+    EXPORT  |vp8_mbloop_filter_vertical_edge_uv_neon|
+    ARM
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; void vp8_mbloop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch,
+;                                               const unsigned char *blimit,
+;                                               const unsigned char *limit,
+;                                               const unsigned char *thresh)
+; r0    unsigned char *src,
+; r1    int pitch,
+; r2    const unsigned char *blimit
+; r3    const unsigned char *limit
+; sp    const unsigned char *thresh,
+|vp8_mbloop_filter_horizontal_edge_y_neon| PROC
+    push        {lr}
+    add         r1, r1, r1                  ; double stride
+    ldr         r12, [sp, #4]               ; load thresh
+    sub         r0, r0, r1, lsl #1          ; move src pointer down by 4 lines
+    vdup.u8     q2, r12                     ; thresh
+    add         r12, r0, r1,  lsr #1        ; move src pointer up by 1 line
+
+    vld1.u8     {q3}, [r0@128], r1              ; p3
+    vld1.u8     {q4}, [r12@128], r1             ; p2
+    vld1.u8     {q5}, [r0@128], r1              ; p1
+    vld1.u8     {q6}, [r12@128], r1             ; p0
+    vld1.u8     {q7}, [r0@128], r1              ; q0
+    vld1.u8     {q8}, [r12@128], r1             ; q1
+    vld1.u8     {q9}, [r0@128], r1              ; q2
+    vld1.u8     {q10}, [r12@128], r1            ; q3
+
+    bl          vp8_mbloop_filter_neon
+
+    sub         r12, r12, r1, lsl #2
+    add         r0, r12, r1, lsr #1
+
+    vst1.u8     {q4}, [r12@128], r1        ; store op2
+    vst1.u8     {q5}, [r0@128], r1         ; store op1
+    vst1.u8     {q6}, [r12@128], r1        ; store op0
+    vst1.u8     {q7}, [r0@128], r1         ; store oq0
+    vst1.u8     {q8}, [r12@128]            ; store oq1
+    vst1.u8     {q9}, [r0@128]             ; store oq2
+
+    pop         {pc}
+    ENDP        ; |vp8_mbloop_filter_horizontal_edge_y_neon|
+
+; void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch,
+;                                                const unsigned char *blimit,
+;                                                const unsigned char *limit,
+;                                                const unsigned char *thresh,
+;                                                unsigned char *v)
+; r0    unsigned char *u,
+; r1    int pitch,
+; r2    const unsigned char *blimit
+; r3    const unsigned char *limit
+; sp    const unsigned char *thresh,
+; sp+4  unsigned char *v
+
+|vp8_mbloop_filter_horizontal_edge_uv_neon| PROC
+    push        {lr}
+    ldr         r12, [sp, #4]                 ; load thresh
+    sub         r0, r0, r1, lsl #2            ; move u pointer down by 4 lines
+    vdup.u8     q2, r12                       ; thresh
+    ldr         r12, [sp, #8]                 ; load v ptr
+    sub         r12, r12, r1, lsl #2          ; move v pointer down by 4 lines
+
+    vld1.u8     {d6}, [r0@64], r1              ; p3
+    vld1.u8     {d7}, [r12@64], r1              ; p3
+    vld1.u8     {d8}, [r0@64], r1              ; p2
+    vld1.u8     {d9}, [r12@64], r1              ; p2
+    vld1.u8     {d10}, [r0@64], r1             ; p1
+    vld1.u8     {d11}, [r12@64], r1             ; p1
+    vld1.u8     {d12}, [r0@64], r1             ; p0
+    vld1.u8     {d13}, [r12@64], r1             ; p0
+    vld1.u8     {d14}, [r0@64], r1             ; q0
+    vld1.u8     {d15}, [r12@64], r1             ; q0
+    vld1.u8     {d16}, [r0@64], r1             ; q1
+    vld1.u8     {d17}, [r12@64], r1             ; q1
+    vld1.u8     {d18}, [r0@64], r1             ; q2
+    vld1.u8     {d19}, [r12@64], r1             ; q2
+    vld1.u8     {d20}, [r0@64], r1             ; q3
+    vld1.u8     {d21}, [r12@64], r1             ; q3
+
+    bl          vp8_mbloop_filter_neon
+
+    sub         r0, r0, r1, lsl #3
+    sub         r12, r12, r1, lsl #3
+
+    add         r0, r0, r1
+    add         r12, r12, r1
+
+    vst1.u8     {d8}, [r0@64], r1              ; store u op2
+    vst1.u8     {d9}, [r12@64], r1              ; store v op2
+    vst1.u8     {d10}, [r0@64], r1             ; store u op1
+    vst1.u8     {d11}, [r12@64], r1             ; store v op1
+    vst1.u8     {d12}, [r0@64], r1             ; store u op0
+    vst1.u8     {d13}, [r12@64], r1             ; store v op0
+    vst1.u8     {d14}, [r0@64], r1             ; store u oq0
+    vst1.u8     {d15}, [r12@64], r1             ; store v oq0
+    vst1.u8     {d16}, [r0@64], r1             ; store u oq1
+    vst1.u8     {d17}, [r12@64], r1             ; store v oq1
+    vst1.u8     {d18}, [r0@64], r1             ; store u oq2
+    vst1.u8     {d19}, [r12@64], r1             ; store v oq2
+
+    pop         {pc}
+    ENDP        ; |vp8_mbloop_filter_horizontal_edge_uv_neon|
+
+; void vp8_mbloop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
+;                                             const unsigned char *blimit,
+;                                             const unsigned char *limit,
+;                                             const unsigned char *thresh)
+; r0    unsigned char *src,
+; r1    int pitch,
+; r2    const unsigned char *blimit
+; r3    const unsigned char *limit
+; sp    const unsigned char *thresh,
+|vp8_mbloop_filter_vertical_edge_y_neon| PROC
+    push        {lr}
+    ldr         r12, [sp, #4]               ; load thresh
+    sub         r0, r0, #4                  ; move src pointer down by 4 columns
+    vdup.s8     q2, r12                     ; thresh
+    add         r12, r0, r1, lsl #3         ; move src pointer down by 8 lines
+
+    vld1.u8     {d6}, [r0], r1              ; load first 8-line src data
+    vld1.u8     {d7}, [r12], r1             ; load second 8-line src data
+    vld1.u8     {d8}, [r0], r1
+    vld1.u8     {d9}, [r12], r1
+    vld1.u8     {d10}, [r0], r1
+    vld1.u8     {d11}, [r12], r1
+    vld1.u8     {d12}, [r0], r1
+    vld1.u8     {d13}, [r12], r1
+    vld1.u8     {d14}, [r0], r1
+    vld1.u8     {d15}, [r12], r1
+    vld1.u8     {d16}, [r0], r1
+    vld1.u8     {d17}, [r12], r1
+    vld1.u8     {d18}, [r0], r1
+    vld1.u8     {d19}, [r12], r1
+    vld1.u8     {d20}, [r0], r1
+    vld1.u8     {d21}, [r12], r1
+
+    ;transpose to 8x16 matrix
+    vtrn.32     q3, q7
+    vtrn.32     q4, q8
+    vtrn.32     q5, q9
+    vtrn.32     q6, q10
+
+    vtrn.16     q3, q5
+    vtrn.16     q4, q6
+    vtrn.16     q7, q9
+    vtrn.16     q8, q10
+
+    vtrn.8      q3, q4
+    vtrn.8      q5, q6
+    vtrn.8      q7, q8
+    vtrn.8      q9, q10
+
+    sub         r0, r0, r1, lsl #3
+
+    bl          vp8_mbloop_filter_neon
+
+    sub         r12, r12, r1, lsl #3
+
+    ;transpose to 16x8 matrix
+    vtrn.32     q3, q7
+    vtrn.32     q4, q8
+    vtrn.32     q5, q9
+    vtrn.32     q6, q10
+
+    vtrn.16     q3, q5
+    vtrn.16     q4, q6
+    vtrn.16     q7, q9
+    vtrn.16     q8, q10
+
+    vtrn.8      q3, q4
+    vtrn.8      q5, q6
+    vtrn.8      q7, q8
+    vtrn.8      q9, q10
+
+    ;store op2, op1, op0, oq0, oq1, oq2
+    vst1.8      {d6}, [r0], r1
+    vst1.8      {d7}, [r12], r1
+    vst1.8      {d8}, [r0], r1
+    vst1.8      {d9}, [r12], r1
+    vst1.8      {d10}, [r0], r1
+    vst1.8      {d11}, [r12], r1
+    vst1.8      {d12}, [r0], r1
+    vst1.8      {d13}, [r12], r1
+    vst1.8      {d14}, [r0], r1
+    vst1.8      {d15}, [r12], r1
+    vst1.8      {d16}, [r0], r1
+    vst1.8      {d17}, [r12], r1
+    vst1.8      {d18}, [r0], r1
+    vst1.8      {d19}, [r12], r1
+    vst1.8      {d20}, [r0]
+    vst1.8      {d21}, [r12]
+
+    pop         {pc}
+    ENDP        ; |vp8_mbloop_filter_vertical_edge_y_neon|
+
+; void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
+;                                              const unsigned char *blimit,
+;                                              const unsigned char *limit,
+;                                              const unsigned char *thresh,
+;                                              unsigned char *v)
+; r0    unsigned char *u,
+; r1    int pitch,
+; r2    const unsigned char *blimit,
+; r3    const unsigned char *limit,
+; sp    const unsigned char *thresh,
+; sp+4  unsigned char *v
+|vp8_mbloop_filter_vertical_edge_uv_neon| PROC
+    push        {lr}
+    ldr         r12, [sp, #4]               ; load thresh
+    sub         r0, r0, #4                  ; move u pointer down by 4 columns
+    vdup.u8     q2, r12                     ; thresh
+    ldr         r12, [sp, #8]               ; load v ptr
+    sub         r12, r12, #4                ; move v pointer down by 4 columns
+
+    vld1.u8     {d6}, [r0], r1              ;load u data
+    vld1.u8     {d7}, [r12], r1             ;load v data
+    vld1.u8     {d8}, [r0], r1
+    vld1.u8     {d9}, [r12], r1
+    vld1.u8     {d10}, [r0], r1
+    vld1.u8     {d11}, [r12], r1
+    vld1.u8     {d12}, [r0], r1
+    vld1.u8     {d13}, [r12], r1
+    vld1.u8     {d14}, [r0], r1
+    vld1.u8     {d15}, [r12], r1
+    vld1.u8     {d16}, [r0], r1
+    vld1.u8     {d17}, [r12], r1
+    vld1.u8     {d18}, [r0], r1
+    vld1.u8     {d19}, [r12], r1
+    vld1.u8     {d20}, [r0], r1
+    vld1.u8     {d21}, [r12], r1
+
+    ;transpose to 8x16 matrix
+    vtrn.32     q3, q7
+    vtrn.32     q4, q8
+    vtrn.32     q5, q9
+    vtrn.32     q6, q10
+
+    vtrn.16     q3, q5
+    vtrn.16     q4, q6
+    vtrn.16     q7, q9
+    vtrn.16     q8, q10
+
+    vtrn.8      q3, q4
+    vtrn.8      q5, q6
+    vtrn.8      q7, q8
+    vtrn.8      q9, q10
+
+    sub         r0, r0, r1, lsl #3
+
+    bl          vp8_mbloop_filter_neon
+
+    sub         r12, r12, r1, lsl #3
+
+    ;transpose to 16x8 matrix
+    vtrn.32     q3, q7
+    vtrn.32     q4, q8
+    vtrn.32     q5, q9
+    vtrn.32     q6, q10
+
+    vtrn.16     q3, q5
+    vtrn.16     q4, q6
+    vtrn.16     q7, q9
+    vtrn.16     q8, q10
+
+    vtrn.8      q3, q4
+    vtrn.8      q5, q6
+    vtrn.8      q7, q8
+    vtrn.8      q9, q10
+
+    ;store op2, op1, op0, oq0, oq1, oq2
+    vst1.8      {d6}, [r0], r1
+    vst1.8      {d7}, [r12], r1
+    vst1.8      {d8}, [r0], r1
+    vst1.8      {d9}, [r12], r1
+    vst1.8      {d10}, [r0], r1
+    vst1.8      {d11}, [r12], r1
+    vst1.8      {d12}, [r0], r1
+    vst1.8      {d13}, [r12], r1
+    vst1.8      {d14}, [r0], r1
+    vst1.8      {d15}, [r12], r1
+    vst1.8      {d16}, [r0], r1
+    vst1.8      {d17}, [r12], r1
+    vst1.8      {d18}, [r0], r1
+    vst1.8      {d19}, [r12], r1
+    vst1.8      {d20}, [r0]
+    vst1.8      {d21}, [r12]
+
+    pop         {pc}
+    ENDP        ; |vp8_mbloop_filter_vertical_edge_uv_neon|
+
+; void vp8_mbloop_filter_neon()
+; This is a helper function for the macroblock loopfilters. The individual
+; functions do the necessary load, transpose (if necessary), preserve (if
+; necessary) and store.
+
+; r0,r1 PRESERVE
+; r2    mblimit
+; r3    limit
+
+; q2    thresh
+; q3    p3 PRESERVE
+; q4    p2
+; q5    p1
+; q6    p0
+; q7    q0
+; q8    q1
+; q9    q2
+; q10   q3 PRESERVE
+
+|vp8_mbloop_filter_neon| PROC
+
+    ; vp9_filter_mask
+    vabd.u8     q11, q3, q4                 ; abs(p3 - p2)
+    vabd.u8     q12, q4, q5                 ; abs(p2 - p1)
+    vabd.u8     q13, q5, q6                 ; abs(p1 - p0)
+    vabd.u8     q14, q8, q7                 ; abs(q1 - q0)
+    vabd.u8     q1, q9, q8                  ; abs(q2 - q1)
+    vabd.u8     q0, q10, q9                 ; abs(q3 - q2)
+
+    vmax.u8     q11, q11, q12
+    vmax.u8     q12, q13, q14
+    vmax.u8     q1, q1, q0
+    vmax.u8     q15, q11, q12
+
+    vabd.u8     q12, q6, q7                 ; abs(p0 - q0)
+
+    ; vp8_hevmask
+    vcgt.u8     q13, q13, q2                ; (abs(p1 - p0) > thresh) * -1
+    vcgt.u8     q14, q14, q2                ; (abs(q1 - q0) > thresh) * -1
+    vmax.u8     q15, q15, q1
+
+    vdup.u8     q1, r3                      ; limit
+    vdup.u8     q2, r2                      ; mblimit
+
+    vmov.u8     q0, #0x80                   ; 0x80
+
+    vcge.u8     q15, q1, q15
+
+    vabd.u8     q1, q5, q8                  ; a = abs(p1 - q1)
+    vqadd.u8    q12, q12, q12               ; b = abs(p0 - q0) * 2
+    vmov.u16    q11, #3                     ; #3
+
+    ; vp9_filter
+    ; convert to signed
+    veor        q7, q7, q0                  ; qs0
+    vshr.u8     q1, q1, #1                  ; a = a / 2
+    veor        q6, q6, q0                  ; ps0
+    veor        q5, q5, q0                  ; ps1
+
+    vqadd.u8    q12, q12, q1                ; a = b + a
+
+    veor        q8, q8, q0                  ; qs1
+    veor        q4, q4, q0                  ; ps2
+    veor        q9, q9, q0                  ; qs2
+
+    vorr        q14, q13, q14               ; vp8_hevmask
+
+    vcge.u8     q12, q2, q12                ; (a <= flimit * 2 + limit) * -1
+
+    vsubl.s8    q2, d14, d12                ; qs0 - ps0
+    vsubl.s8    q13, d15, d13
+
+    vqsub.s8    q1, q5, q8                  ; vp9_filter = clamp(ps1-qs1)
+
+    vmul.i16    q2, q2, q11                 ; 3 * ( qs0 - ps0)
+
+    vand        q15, q15, q12               ; vp9_filter_mask
+
+    vmul.i16    q13, q13, q11
+
+    vmov.u8     q12, #3                     ; #3
+
+    vaddw.s8    q2, q2, d2                  ; vp9_filter + 3 * ( qs0 - ps0)
+    vaddw.s8    q13, q13, d3
+
+    vmov.u8     q11, #4                     ; #4
+
+    ; vp9_filter = clamp(vp9_filter + 3 * ( qs0 - ps0))
+    vqmovn.s16  d2, q2
+    vqmovn.s16  d3, q13
+
+    vand        q1, q1, q15                 ; vp9_filter &= mask
+
+    vmov.u16    q15, #63                    ; #63
+
+    vand        q13, q1, q14                ; Filter2 &= hev
+
+    vqadd.s8    q2, q13, q11                ; Filter1 = clamp(Filter2+4)
+    vqadd.s8    q13, q13, q12               ; Filter2 = clamp(Filter2+3)
+
+    vmov        q0, q15
+
+    vshr.s8     q2, q2, #3                  ; Filter1 >>= 3
+    vshr.s8     q13, q13, #3                ; Filter2 >>= 3
+
+    vmov        q11, q15
+    vmov        q12, q15
+
+    vqsub.s8    q7, q7, q2                  ; qs0 = clamp(qs0 - Filter1)
+
+    vqadd.s8    q6, q6, q13                 ; ps0 = clamp(ps0 + Filter2)
+
+    vbic        q1, q1, q14                 ; vp9_filter &= ~hev
+
+    ; roughly 1/7th difference across boundary
+    ; roughly 2/7th difference across boundary
+    ; roughly 3/7th difference across boundary
+
+    vmov.u8     d5, #9                      ; #9
+    vmov.u8     d4, #18                     ; #18
+
+    vmov        q13, q15
+    vmov        q14, q15
+
+    vmlal.s8    q0, d2, d5                  ; 63 + Filter2 * 9
+    vmlal.s8    q11, d3, d5
+    vmov.u8     d5, #27                     ; #27
+    vmlal.s8    q12, d2, d4                 ; 63 + Filter2 * 18
+    vmlal.s8    q13, d3, d4
+    vmlal.s8    q14, d2, d5                 ; 63 + Filter2 * 27
+    vmlal.s8    q15, d3, d5
+
+    vqshrn.s16  d0, q0, #7                  ; u = clamp((63 + Filter2 * 9)>>7)
+    vqshrn.s16  d1, q11, #7
+    vqshrn.s16  d24, q12, #7                ; u = clamp((63 + Filter2 * 18)>>7)
+    vqshrn.s16  d25, q13, #7
+    vqshrn.s16  d28, q14, #7                ; u = clamp((63 + Filter2 * 27)>>7)
+    vqshrn.s16  d29, q15, #7
+
+    vmov.u8     q1, #0x80                   ; 0x80
+
+    vqsub.s8    q11, q9, q0                 ; s = clamp(qs2 - u)
+    vqadd.s8    q0, q4, q0                  ; s = clamp(ps2 + u)
+    vqsub.s8    q13, q8, q12                ; s = clamp(qs1 - u)
+    vqadd.s8    q12, q5, q12                ; s = clamp(ps1 + u)
+    vqsub.s8    q15, q7, q14                ; s = clamp(qs0 - u)
+    vqadd.s8    q14, q6, q14                ; s = clamp(ps0 + u)
+
+    veor        q9, q11, q1                 ; *oq2 = s^0x80
+    veor        q4, q0, q1                  ; *op2 = s^0x80
+    veor        q8, q13, q1                 ; *oq1 = s^0x80
+    veor        q5, q12, q1                 ; *op1 = s^0x80
+    veor        q7, q15, q1                 ; *oq0 = s^0x80
+    veor        q6, q14, q1                 ; *op0 = s^0x80
+
+    bx          lr
+    ENDP        ; |vp8_mbloop_filter_neon|
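+
+; Scalar sketch of the wide-filter taps above (illustrative only; c8() is a
+; hypothetical signed-char saturating clamp; ps0/qs0 already include the
+; hev-masked Filter1/Filter2 step):
+;
+;   u27 = c8((63 + 27 * f) >> 7);            /* ~3/7 of f, for p0/q0 */
+;   u18 = c8((63 + 18 * f) >> 7);            /* ~2/7 of f, for p1/q1 */
+;   u9  = c8((63 +  9 * f) >> 7);            /* ~1/7 of f, for p2/q2 */
+;   *op0 = c8(ps0 + u27) ^ 0x80;   *oq0 = c8(qs0 - u27) ^ 0x80;
+;   *op1 = c8(ps1 + u18) ^ 0x80;   *oq1 = c8(qs1 - u18) ^ 0x80;
+;   *op2 = c8(ps2 + u9)  ^ 0x80;   *oq2 = c8(qs2 - u9)  ^ 0x80;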
+
+;-----------------
+
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/recon16x16mb_neon.asm
@@ -1,0 +1,131 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_recon16x16mb_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char  *pred_ptr,
+; r1    short *diff_ptr,
+; r2    unsigned char *dst_ptr,
+; r3    int ystride,
+; stack unsigned char *udst_ptr,
+; stack unsigned char *vdst_ptr
+
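+; Per pixel this computes dst = clamp_u8(pred + diff). A scalar sketch of the
+; same operation (illustrative only; clamp_u8() is a hypothetical helper
+; matching vqmovun.s16):
+;
+;   static unsigned char clamp_u8(int v) {
+;     return v < 0 ? 0 : v > 255 ? 255 : (unsigned char)v;
+;   }
+;   dst[r * ystride + c] = clamp_u8(pred[r * 16 + c] + diff[r * 16 + c]);
+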
+|vp8_recon16x16mb_neon| PROC
+    mov             r12, #4             ;loop counter for Y loop
+
+recon16x16mb_loop_y
+    vld1.u8         {q12, q13}, [r0]!   ;load data from pred_ptr
+    vld1.16         {q8, q9}, [r1]!     ;load data from diff_ptr
+    vld1.u8         {q14, q15}, [r0]!
+    vld1.16         {q10, q11}, [r1]!
+
+    vmovl.u8        q0, d24             ;widen Pred data from 8 bits to 16 bits
+    vmovl.u8        q1, d25
+    vmovl.u8        q2, d26
+    vmovl.u8        q3, d27
+    vmovl.u8        q4, d28
+    vmovl.u8        q5, d29
+    vmovl.u8        q6, d30
+    vld1.16         {q12, q13}, [r1]!
+    vmovl.u8        q7, d31
+    vld1.16         {q14, q15}, [r1]!
+
+    pld             [r0]
+    pld             [r1]
+    pld             [r1, #64]
+
+    vadd.s16        q0, q0, q8          ;add Diff data and Pred data together
+    vadd.s16        q1, q1, q9
+    vadd.s16        q2, q2, q10
+    vadd.s16        q3, q3, q11
+    vadd.s16        q4, q4, q12
+    vadd.s16        q5, q5, q13
+    vadd.s16        q6, q6, q14
+    vadd.s16        q7, q7, q15
+
+    vqmovun.s16     d0, q0              ;CLAMP() saturation
+    vqmovun.s16     d1, q1
+    vqmovun.s16     d2, q2
+    vqmovun.s16     d3, q3
+    vqmovun.s16     d4, q4
+    vqmovun.s16     d5, q5
+    vst1.u8         {q0}, [r2], r3      ;store result
+    vqmovun.s16     d6, q6
+    vst1.u8         {q1}, [r2], r3
+    vqmovun.s16     d7, q7
+    vst1.u8         {q2}, [r2], r3
+    subs            r12, r12, #1
+
+    moveq           r12, #2             ;loop counter for UV loop
+
+    vst1.u8         {q3}, [r2], r3
+    bne             recon16x16mb_loop_y
+
+    mov             r3, r3, lsr #1      ;uv_stride = ystride>>1
+    ldr             r2, [sp]            ;load udst_ptr
+
+recon16x16mb_loop_uv
+    vld1.u8         {q12, q13}, [r0]!   ;load data from pred_ptr
+    vld1.16         {q8, q9}, [r1]!     ;load data from diff_ptr
+    vld1.u8         {q14, q15}, [r0]!
+    vld1.16         {q10, q11}, [r1]!
+
+    vmovl.u8        q0, d24             ;widen Pred data from 8 bits to 16 bits
+    vmovl.u8        q1, d25
+    vmovl.u8        q2, d26
+    vmovl.u8        q3, d27
+    vmovl.u8        q4, d28
+    vmovl.u8        q5, d29
+    vmovl.u8        q6, d30
+    vld1.16         {q12, q13}, [r1]!
+    vmovl.u8        q7, d31
+    vld1.16         {q14, q15}, [r1]!
+
+    vadd.s16        q0, q0, q8          ;add Diff data and Pred data together
+    vadd.s16        q1, q1, q9
+    vadd.s16        q2, q2, q10
+    vadd.s16        q3, q3, q11
+    vadd.s16        q4, q4, q12
+    vadd.s16        q5, q5, q13
+    vadd.s16        q6, q6, q14
+
+    vqmovun.s16     d0, q0              ;CLAMP() saturation
+    vadd.s16        q7, q7, q15
+    vqmovun.s16     d1, q1
+    vqmovun.s16     d2, q2
+    vqmovun.s16     d3, q3
+    vst1.u8         {d0}, [r2], r3      ;store result
+    vqmovun.s16     d4, q4
+    vst1.u8         {d1}, [r2], r3
+    vqmovun.s16     d5, q5
+    vst1.u8         {d2}, [r2], r3
+    vqmovun.s16     d6, q6
+    vst1.u8         {d3}, [r2], r3
+    vqmovun.s16     d7, q7
+    vst1.u8         {d4}, [r2], r3
+    subs            r12, r12, #1
+
+    vst1.u8         {d5}, [r2], r3
+    vst1.u8         {d6}, [r2], r3
+    vst1.u8         {d7}, [r2], r3
+
+    ldrne           r2, [sp, #4]        ;load vdst_ptr
+    bne             recon16x16mb_loop_uv
+
+    bx             lr
+
+    ENDP
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/recon2b_neon.asm
@@ -1,0 +1,54 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_recon2b_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char  *pred_ptr,
+; r1    short *diff_ptr,
+; r2    unsigned char *dst_ptr,
+; r3    int stride
+
+|vp8_recon2b_neon| PROC
+    vld1.u8         {q8, q9}, [r0]      ;load data from pred_ptr
+    vld1.16         {q4, q5}, [r1]!     ;load data from diff_ptr
+
+    vmovl.u8        q0, d16             ;widen Pred data from 8 bits to 16 bits
+    vld1.16         {q6, q7}, [r1]!
+    vmovl.u8        q1, d17
+    vmovl.u8        q2, d18
+    vmovl.u8        q3, d19
+
+    vadd.s16        q0, q0, q4          ;add Diff data and Pred data together
+    vadd.s16        q1, q1, q5
+    vadd.s16        q2, q2, q6
+    vadd.s16        q3, q3, q7
+
+    vqmovun.s16     d0, q0              ;CLAMP() saturation
+    vqmovun.s16     d1, q1
+    vqmovun.s16     d2, q2
+    vqmovun.s16     d3, q3
+    add             r0, r2, r3
+
+    vst1.u8         {d0}, [r2]          ;store result
+    vst1.u8         {d1}, [r0], r3
+    add             r2, r0, r3
+    vst1.u8         {d2}, [r0]
+    vst1.u8         {d3}, [r2], r3
+
+    bx             lr
+
+    ENDP
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/recon4b_neon.asm
@@ -1,0 +1,69 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_recon4b_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char  *pred_ptr,
+; r1    short *diff_ptr,
+; r2    unsigned char *dst_ptr,
+; r3    int stride
+
+|vp8_recon4b_neon| PROC
+    vld1.u8         {q12, q13}, [r0]!   ;load data from pred_ptr
+    vld1.16         {q8, q9}, [r1]!     ;load data from diff_ptr
+    vld1.u8         {q14, q15}, [r0]
+    vld1.16         {q10, q11}, [r1]!
+
+    vmovl.u8        q0, d24             ;widen Pred data from 8 bits to 16 bits
+    vmovl.u8        q1, d25
+    vmovl.u8        q2, d26
+    vmovl.u8        q3, d27
+    vmovl.u8        q4, d28
+    vmovl.u8        q5, d29
+    vmovl.u8        q6, d30
+    vld1.16         {q12, q13}, [r1]!
+    vmovl.u8        q7, d31
+    vld1.16         {q14, q15}, [r1]
+
+    vadd.s16        q0, q0, q8          ;add Diff data and Pred data together
+    vadd.s16        q1, q1, q9
+    vadd.s16        q2, q2, q10
+    vadd.s16        q3, q3, q11
+    vadd.s16        q4, q4, q12
+    vadd.s16        q5, q5, q13
+    vadd.s16        q6, q6, q14
+    vadd.s16        q7, q7, q15
+
+    vqmovun.s16     d0, q0              ;CLAMP() saturation
+    vqmovun.s16     d1, q1
+    vqmovun.s16     d2, q2
+    vqmovun.s16     d3, q3
+    vqmovun.s16     d4, q4
+    vqmovun.s16     d5, q5
+    vqmovun.s16     d6, q6
+    vqmovun.s16     d7, q7
+    add             r0, r2, r3
+
+    vst1.u8         {q0}, [r2]          ;store result
+    vst1.u8         {q1}, [r0], r3
+    add             r2, r0, r3
+    vst1.u8         {q2}, [r0]
+    vst1.u8         {q3}, [r2], r3
+
+    bx             lr
+
+    ENDP
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/recon_neon.c
@@ -1,0 +1,29 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vp9/common/recon.h"
+#include "vp9/common/blockd.h"
+
+extern void vp8_recon16x16mb_neon(unsigned char *pred_ptr, short *diff_ptr,
+                                  unsigned char *dst_ptr, int ystride,
+                                  unsigned char *udst_ptr,
+                                  unsigned char *vdst_ptr);
+
+void vp8_recon_mb_neon(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *xd) {
+  unsigned char *pred_ptr = &xd->predictor[0];
+  short *diff_ptr = &xd->diff[0];
+  unsigned char *dst_ptr = xd->dst.y_buffer;
+  unsigned char *udst_ptr = xd->dst.u_buffer;
+  unsigned char *vdst_ptr = xd->dst.v_buffer;
+  int ystride = xd->dst.y_stride;
+  /*int uv_stride = xd->dst.uv_stride;*/
+
+  vp8_recon16x16mb_neon(pred_ptr, diff_ptr, dst_ptr, ystride,
+                        udst_ptr, vdst_ptr);
+}
--- /dev/null
+++ b/vp9/common/arm/neon/reconb_neon.asm
@@ -1,0 +1,61 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_recon_b_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char  *pred_ptr,
+; r1    short *diff_ptr,
+; r2    unsigned char *dst_ptr,
+; r3    int stride
+
+|vp8_recon_b_neon| PROC
+    mov             r12, #16
+
+    vld1.u8         {d28}, [r0], r12    ;load 4 pixels/line from pred_ptr (pitch 16)
+    vld1.16         {q10, q11}, [r1]!   ;load data from diff_ptr
+    vld1.u8         {d29}, [r0], r12
+    vld1.16         {q11, q12}, [r1]!
+    vld1.u8         {d30}, [r0], r12
+    vld1.16         {q12, q13}, [r1]!
+    vld1.u8         {d31}, [r0], r12
+    vld1.16         {q13}, [r1]
+
+    vmovl.u8        q0, d28             ;widen Pred data from 8 bits to 16 bits
+    vmovl.u8        q1, d29             ;Pred data in d0, d2, d4, d6
+    vmovl.u8        q2, d30
+    vmovl.u8        q3, d31
+
+    vadd.s16        d0, d0, d20         ;add Diff data and Pred data together
+    vadd.s16        d2, d2, d22
+    vadd.s16        d4, d4, d24
+    vadd.s16        d6, d6, d26
+
+    vqmovun.s16     d0, q0              ;CLAMP() saturation
+    vqmovun.s16     d1, q1
+    vqmovun.s16     d2, q2
+    vqmovun.s16     d3, q3
+    add             r1, r2, r3
+
+    vst1.32         {d0[0]}, [r2]       ;store result
+    vst1.32         {d1[0]}, [r1], r3
+    add             r2, r1, r3
+    vst1.32         {d2[0]}, [r1]
+    vst1.32         {d3[0]}, [r2], r3
+
+    bx             lr
+
+    ENDP
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/save_neon_reg.asm
@@ -1,0 +1,36 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_push_neon|
+    EXPORT  |vp9_pop_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vp9_push_neon| PROC
+    vst1.i64            {d8, d9, d10, d11}, [r0]!
+    vst1.i64            {d12, d13, d14, d15}, [r0]!
+    bx              lr
+
+    ENDP
+
+|vp9_pop_neon| PROC
+    vld1.i64            {d8, d9, d10, d11}, [r0]!
+    vld1.i64            {d12, d13, d14, d15}, [r0]!
+    bx              lr
+
+    ENDP
+
+    END
+
--- /dev/null
+++ b/vp9/common/arm/neon/shortidct4x4llm_1_neon.asm
@@ -1,0 +1,67 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_short_idct4x4llm_1_neon|
+    EXPORT  |vp8_dc_only_idct_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch);
+; r0    short *input;
+; r1    short *output;
+; r2    int pitch;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+|vp8_short_idct4x4llm_1_neon| PROC
+    vld1.16         {d0[]}, [r0]            ;load input[0]
+
+    add             r3, r1, r2
+    add             r12, r3, r2
+
+    vrshr.s16       d0, d0, #3
+
+    add             r0, r12, r2
+
+    vst1.16         {d0}, [r1]
+    vst1.16         {d0}, [r3]
+    vst1.16         {d0}, [r12]
+    vst1.16         {d0}, [r0]
+
+    bx             lr
+    ENDP
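+
+; the 4x4 block is filled with the rounded DC value (input[0] + 4) >> 3;
+; vrshr.s16 #3 performs the rounding shift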
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void vp8_dc_only_idct_c(short input_dc, short *output, int pitch);
+; r0    short input_dc;
+; r1    short *output;
+; r2    int pitch;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+|vp8_dc_only_idct_neon| PROC
+    vdup.16         d0, r0
+
+    add             r3, r1, r2
+    add             r12, r3, r2
+
+    vrshr.s16       d0, d0, #3
+
+    add             r0, r12, r2
+
+    vst1.16         {d0}, [r1]
+    vst1.16         {d0}, [r3]
+    vst1.16         {d0}, [r12]
+    vst1.16         {d0}, [r0]
+
+    bx             lr
+
+    ENDP
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/shortidct4x4llm_neon.asm
@@ -1,0 +1,122 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_short_idct4x4llm_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;*************************************************************
+;void vp8_short_idct4x4llm_c(short *input, short *output, int pitch)
+;r0 short * input
+;r1 short * output
+;r2 int pitch
+;*************************************************************
+;static const int cospi8sqrt2minus1=20091;
+;static const int sinpi8sqrt2      =35468;
+;static const int rounding = 0;
+;Optimization note: The data resulting from dequantization are signed 13-bit values
+;in the range [-4096, 4095]. This allows the NEON "vqdmulh" instruction to be used,
+;since the product cannot overflow (13+16+1 = 30 bits < 32 bits). vqdmulh returns the
+;high half of the doubled product, which is exactly what the IDCT needs here.
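+;
+;In scalar terms (illustrative only), the vqdmulh/vshr/vqadd sequences below
+;compute, for each input x:
+;   temp1 = x + ((x * 20091) >> 16);   /* x * sqrt(2)*cos(pi/8)          */
+;   temp2 = (x * 35468) >> 16;         /* x * sqrt(2)*sin(pi/8); 35468   */
+;                                      /* is stored as s16 -30068, and   */
+;                                      /* adding x back compensates      */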
+
+|vp8_short_idct4x4llm_neon| PROC
+    adr             r12, idct_coeff
+    vld1.16         {q1, q2}, [r0]
+    vld1.16         {d0}, [r12]
+
+    vswp            d3, d4                  ;q2(vp[4] vp[12])
+
+    vqdmulh.s16     q3, q2, d0[2]
+    vqdmulh.s16     q4, q2, d0[0]
+
+    vqadd.s16       d12, d2, d3             ;a1
+    vqsub.s16       d13, d2, d3             ;b1
+
+    vshr.s16        q3, q3, #1
+    vshr.s16        q4, q4, #1
+
+    vqadd.s16       q3, q3, q2              ;modify since sinpi8sqrt2 > 65536/2 (negative number)
+    vqadd.s16       q4, q4, q2
+
+    ;d6 - c1:temp1
+    ;d7 - d1:temp2
+    ;d8 - d1:temp1
+    ;d9 - c1:temp2
+
+    vqsub.s16       d10, d6, d9             ;c1
+    vqadd.s16       d11, d7, d8             ;d1
+
+    vqadd.s16       d2, d12, d11
+    vqadd.s16       d3, d13, d10
+    vqsub.s16       d4, d13, d10
+    vqsub.s16       d5, d12, d11
+
+    vtrn.32         d2, d4
+    vtrn.32         d3, d5
+    vtrn.16         d2, d3
+    vtrn.16         d4, d5
+
+    vswp            d3, d4
+
+    vqdmulh.s16     q3, q2, d0[2]
+    vqdmulh.s16     q4, q2, d0[0]
+
+    vqadd.s16       d12, d2, d3             ;a1
+    vqsub.s16       d13, d2, d3             ;b1
+
+    vshr.s16        q3, q3, #1
+    vshr.s16        q4, q4, #1
+
+    vqadd.s16       q3, q3, q2              ;modify since sinpi8sqrt2 > 65536/2 (negative number)
+    vqadd.s16       q4, q4, q2
+
+    vqsub.s16       d10, d6, d9             ;c1
+    vqadd.s16       d11, d7, d8             ;d1
+
+    vqadd.s16       d2, d12, d11
+    vqadd.s16       d3, d13, d10
+    vqsub.s16       d4, d13, d10
+    vqsub.s16       d5, d12, d11
+
+    vrshr.s16       d2, d2, #3
+    vrshr.s16       d3, d3, #3
+    vrshr.s16       d4, d4, #3
+    vrshr.s16       d5, d5, #3
+
+    add             r3, r1, r2
+    add             r12, r3, r2
+    add             r0, r12, r2
+
+    vtrn.32         d2, d4
+    vtrn.32         d3, d5
+    vtrn.16         d2, d3
+    vtrn.16         d4, d5
+
+    vst1.16         {d2}, [r1]
+    vst1.16         {d3}, [r3]
+    vst1.16         {d4}, [r12]
+    vst1.16         {d5}, [r0]
+
+    bx             lr
+
+    ENDP
+
+;-----------------
+
+idct_coeff
+    DCD     0x4e7b4e7b, 0x8a8c8a8c
+
+;20091, 20091, 35468, 35468
+
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/sixtappredict16x16_neon.asm
@@ -1,0 +1,490 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_sixtap_predict16x16_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+filter16_coeff
+    DCD     0,  0,  128,    0,   0,  0,   0,  0
+    DCD     0, -6,  123,   12,  -1,  0,   0,  0
+    DCD     2, -11, 108,   36,  -8,  1,   0,  0
+    DCD     0, -9,   93,   50,  -6,  0,   0,  0
+    DCD     3, -16,  77,   77, -16,  3,   0,  0
+    DCD     0, -6,   50,   93,  -9,  0,   0,  0
+    DCD     1, -8,   36,  108, -11,  2,   0,  0
+    DCD     0, -1,   12,  123,  -6,   0,  0,  0
+
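+; each row above is one subpel filter (8 words x 4 bytes = 32 bytes), so the
+; coefficients for a given offset live at filter16_coeff + (offset << 5)
+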
+; r0    unsigned char  *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; r4    unsigned char *dst_ptr,
+; stack(r5) int  dst_pitch
+
+;Note: To take advantage of the 8-bit multiplication instructions in NEON, first apply
+; abs() to the filter coeffs to make them u8, then use vmlsl for the negative coeffs.
+; After multiplication the result can be negative, so it is treated as s16. The result
+; could also be a large positive number (> 2^15-1), which would be misread as a
+; negative s16 value. To avoid that error, apply the filter coeffs in the order
+; 0, 1, 4, 5, 2, which keeps the intermediate sum within s16 range; finally, the
+; src_ptr[1] term (the 3rd filter coeff) is added with saturation (vqadd.s16).
+; The same applies to the other filter functions.
+
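+; A scalar sketch of that ordering (illustrative only; sat16() and clamp_u8()
+; are hypothetical saturation helpers):
+;
+;   acc  = src[-2] * abs(f[0]);                 /* vmull.u8        */
+;   acc -= src[-1] * abs(f[1]);                 /* vmlsl.u8        */
+;   acc -= src[ 2] * abs(f[4]);                 /* vmlsl.u8        */
+;   acc += src[ 3] * abs(f[5]);                 /* vmlal.u8        */
+;   acc += src[ 0] * abs(f[2]);                 /* vmlal.u8        */
+;   acc  = sat16(acc + src[ 1] * abs(f[3]));    /* vqadd.s16       */
+;   dst  = clamp_u8((acc + 64) >> 7);           /* vqrshrun.s16 #7 */
+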
+|vp8_sixtap_predict16x16_neon| PROC
+    push            {r4-r5, lr}
+
+    adr             r12, filter16_coeff
+    ldr             r4, [sp, #12]           ;load dst_ptr from stack
+    ldr             r5, [sp, #16]           ;load dst_pitch from stack
+
+    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
+    beq             secondpass_filter16x16_only
+
+    add             r2, r12, r2, lsl #5     ;calculate filter location
+
+    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
+
+    vld1.s32        {q14, q15}, [r2]        ;load first_pass filter
+
+    beq             firstpass_filter16x16_only
+
+    sub             sp, sp, #336            ;reserve space on stack for temporary storage
+    mov             lr, sp
+
+    vabs.s32        q12, q14
+    vabs.s32        q13, q15
+
+    mov             r2, #7                  ;loop counter
+    sub             r0, r0, #2              ;move srcptr back to (line-2) and (column-2)
+    sub             r0, r0, r1, lsl #1
+
+    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
+    vdup.8          d1, d24[4]
+    vdup.8          d2, d25[0]
+    vdup.8          d3, d25[4]
+    vdup.8          d4, d26[0]
+    vdup.8          d5, d26[4]
+
+;First Pass: output_height lines x output_width columns (21x16)
+filt_blk2d_fp16x16_loop_neon
+    vld1.u8         {d6, d7, d8}, [r0], r1      ;load src data
+    vld1.u8         {d9, d10, d11}, [r0], r1
+    vld1.u8         {d12, d13, d14}, [r0], r1
+
+    pld             [r0]
+    pld             [r0, r1]
+    pld             [r0, r1, lsl #1]
+
+    vmull.u8        q8, d6, d0              ;(src_ptr[-2] * vp9_filter[0])
+    vmull.u8        q9, d7, d0
+    vmull.u8        q10, d9, d0
+    vmull.u8        q11, d10, d0
+    vmull.u8        q12, d12, d0
+    vmull.u8        q13, d13, d0
+
+    vext.8          d28, d6, d7, #1         ;construct src_ptr[-1]
+    vext.8          d29, d9, d10, #1
+    vext.8          d30, d12, d13, #1
+
+    vmlsl.u8        q8, d28, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q10, d29, d1
+    vmlsl.u8        q12, d30, d1
+
+    vext.8          d28, d7, d8, #1
+    vext.8          d29, d10, d11, #1
+    vext.8          d30, d13, d14, #1
+
+    vmlsl.u8        q9, d28, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q11, d29, d1
+    vmlsl.u8        q13, d30, d1
+
+    vext.8          d28, d6, d7, #4         ;construct src_ptr[2]
+    vext.8          d29, d9, d10, #4
+    vext.8          d30, d12, d13, #4
+
+    vmlsl.u8        q8, d28, d4             ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q10, d29, d4
+    vmlsl.u8        q12, d30, d4
+
+    vext.8          d28, d7, d8, #4
+    vext.8          d29, d10, d11, #4
+    vext.8          d30, d13, d14, #4
+
+    vmlsl.u8        q9, d28, d4             ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q11, d29, d4
+    vmlsl.u8        q13, d30, d4
+
+    vext.8          d28, d6, d7, #5         ;construct src_ptr[3]
+    vext.8          d29, d9, d10, #5
+    vext.8          d30, d12, d13, #5
+
+    vmlal.u8        q8, d28, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmlal.u8        q10, d29, d5
+    vmlal.u8        q12, d30, d5
+
+    vext.8          d28, d7, d8, #5
+    vext.8          d29, d10, d11, #5
+    vext.8          d30, d13, d14, #5
+
+    vmlal.u8        q9, d28, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmlal.u8        q11, d29, d5
+    vmlal.u8        q13, d30, d5
+
+    vext.8          d28, d6, d7, #2         ;construct src_ptr[0]
+    vext.8          d29, d9, d10, #2
+    vext.8          d30, d12, d13, #2
+
+    vmlal.u8        q8, d28, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q10, d29, d2
+    vmlal.u8        q12, d30, d2
+
+    vext.8          d28, d7, d8, #2
+    vext.8          d29, d10, d11, #2
+    vext.8          d30, d13, d14, #2
+
+    vmlal.u8        q9, d28, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q11, d29, d2
+    vmlal.u8        q13, d30, d2
+
+    vext.8          d28, d6, d7, #3         ;construct src_ptr[1]
+    vext.8          d29, d9, d10, #3
+    vext.8          d30, d12, d13, #3
+
+    vext.8          d15, d7, d8, #3
+    vext.8          d31, d10, d11, #3
+    vext.8          d6, d13, d14, #3
+
+    vmull.u8        q4, d28, d3             ;(src_ptr[1] * vp9_filter[3])
+    vmull.u8        q5, d29, d3
+    vmull.u8        q6, d30, d3
+
+    vqadd.s16       q8, q4                  ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q10, q5
+    vqadd.s16       q12, q6
+
+    vmull.u8        q6, d15, d3             ;(src_ptr[1] * vp9_filter[3])
+    vmull.u8        q7, d31, d3
+    vmull.u8        q3, d6, d3
+
+    subs            r2, r2, #1
+
+    vqadd.s16       q9, q6
+    vqadd.s16       q11, q7
+    vqadd.s16       q13, q3
+
+    vqrshrun.s16    d6, q8, #7              ;shift/round/saturate to u8
+    vqrshrun.s16    d7, q9, #7
+    vqrshrun.s16    d8, q10, #7
+    vqrshrun.s16    d9, q11, #7
+    vqrshrun.s16    d10, q12, #7
+    vqrshrun.s16    d11, q13, #7
+
+    vst1.u8         {d6, d7, d8}, [lr]!     ;store result
+    vst1.u8         {d9, d10, d11}, [lr]!
+
+    bne             filt_blk2d_fp16x16_loop_neon
+
+;Second pass: 16x16
+;secondpass_filter - do first 8-columns and then second 8-columns
+    add             r3, r12, r3, lsl #5
+    sub             lr, lr, #336
+
+    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
+    mov             r3, #2                  ;loop counter
+
+    vabs.s32        q7, q5
+    vabs.s32        q8, q6
+
+    mov             r2, #16
+
+    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
+    vdup.8          d1, d14[4]
+    vdup.8          d2, d15[0]
+    vdup.8          d3, d15[4]
+    vdup.8          d4, d16[0]
+    vdup.8          d5, d16[4]
+
+filt_blk2d_sp16x16_outloop_neon
+    vld1.u8         {d18}, [lr], r2         ;load src data
+    vld1.u8         {d19}, [lr], r2
+    vld1.u8         {d20}, [lr], r2
+    vld1.u8         {d21}, [lr], r2
+    mov             r12, #4                 ;loop counter
+    vld1.u8         {d22}, [lr], r2
+
+secondpass_inner_loop_neon
+    vld1.u8         {d23}, [lr], r2         ;load src data
+    vld1.u8         {d24}, [lr], r2
+    vld1.u8         {d25}, [lr], r2
+    vld1.u8         {d26}, [lr], r2
+
+    vmull.u8        q3, d18, d0             ;(src_ptr[-2] * vp9_filter[0])
+    vmull.u8        q4, d19, d0
+    vmull.u8        q5, d20, d0
+    vmull.u8        q6, d21, d0
+
+    vmlsl.u8        q3, d19, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q4, d20, d1
+    vmlsl.u8        q5, d21, d1
+    vmlsl.u8        q6, d22, d1
+
+    vmlsl.u8        q3, d22, d4             ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q4, d23, d4
+    vmlsl.u8        q5, d24, d4
+    vmlsl.u8        q6, d25, d4
+
+    vmlal.u8        q3, d20, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q4, d21, d2
+    vmlal.u8        q5, d22, d2
+    vmlal.u8        q6, d23, d2
+
+    vmlal.u8        q3, d23, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmlal.u8        q4, d24, d5
+    vmlal.u8        q5, d25, d5
+    vmlal.u8        q6, d26, d5
+
+    vmull.u8        q7, d21, d3             ;(src_ptr[1] * vp9_filter[3])
+    vmull.u8        q8, d22, d3
+    vmull.u8        q9, d23, d3
+    vmull.u8        q10, d24, d3
+
+    subs            r12, r12, #1
+
+    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q8, q4
+    vqadd.s16       q9, q5
+    vqadd.s16       q10, q6
+
+    vqrshrun.s16    d6, q7, #7              ;shift/round/saturate to u8
+    vqrshrun.s16    d7, q8, #7
+    vqrshrun.s16    d8, q9, #7
+    vqrshrun.s16    d9, q10, #7
+
+    vst1.u8         {d6}, [r4], r5          ;store result
+    vmov            q9, q11
+    vst1.u8         {d7}, [r4], r5
+    vmov            q10, q12
+    vst1.u8         {d8}, [r4], r5
+    vmov            d22, d26
+    vst1.u8         {d9}, [r4], r5
+
+    bne             secondpass_inner_loop_neon
+
+    subs            r3, r3, #1
+    sub             lr, lr, #336
+    add             lr, lr, #8
+
+    sub             r4, r4, r5, lsl #4
+    add             r4, r4, #8
+
+    bne filt_blk2d_sp16x16_outloop_neon
+
+    add             sp, sp, #336
+    pop             {r4-r5,pc}
+
+;--------------------
+firstpass_filter16x16_only
+    vabs.s32        q12, q14
+    vabs.s32        q13, q15
+
+    mov             r2, #8                  ;loop counter
+    sub             r0, r0, #2              ;move srcptr back to (column-2)
+
+    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
+    vdup.8          d1, d24[4]
+    vdup.8          d2, d25[0]
+    vdup.8          d3, d25[4]
+    vdup.8          d4, d26[0]
+    vdup.8          d5, d26[4]
+
+;First Pass: output_height lines x output_width columns (16x16)
+filt_blk2d_fpo16x16_loop_neon
+    vld1.u8         {d6, d7, d8}, [r0], r1      ;load src data
+    vld1.u8         {d9, d10, d11}, [r0], r1
+
+    pld             [r0]
+    pld             [r0, r1]
+
+    vmull.u8        q6, d6, d0              ;(src_ptr[-2] * vp9_filter[0])
+    vmull.u8        q7, d7, d0
+    vmull.u8        q8, d9, d0
+    vmull.u8        q9, d10, d0
+
+    vext.8          d20, d6, d7, #1         ;construct src_ptr[-1]
+    vext.8          d21, d9, d10, #1
+    vext.8          d22, d7, d8, #1
+    vext.8          d23, d10, d11, #1
+    vext.8          d24, d6, d7, #4         ;construct src_ptr[2]
+    vext.8          d25, d9, d10, #4
+    vext.8          d26, d7, d8, #4
+    vext.8          d27, d10, d11, #4
+    vext.8          d28, d6, d7, #5         ;construct src_ptr[3]
+    vext.8          d29, d9, d10, #5
+
+    vmlsl.u8        q6, d20, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q8, d21, d1
+    vmlsl.u8        q7, d22, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q9, d23, d1
+    vmlsl.u8        q6, d24, d4             ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q8, d25, d4
+    vmlsl.u8        q7, d26, d4             ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q9, d27, d4
+    vmlal.u8        q6, d28, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmlal.u8        q8, d29, d5
+
+    vext.8          d20, d7, d8, #5
+    vext.8          d21, d10, d11, #5
+    vext.8          d22, d6, d7, #2         ;construct src_ptr[0]
+    vext.8          d23, d9, d10, #2
+    vext.8          d24, d7, d8, #2
+    vext.8          d25, d10, d11, #2
+
+    vext.8          d26, d6, d7, #3         ;construct src_ptr[1]
+    vext.8          d27, d9, d10, #3
+    vext.8          d28, d7, d8, #3
+    vext.8          d29, d10, d11, #3
+
+    vmlal.u8        q7, d20, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmlal.u8        q9, d21, d5
+    vmlal.u8        q6, d22, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q8, d23, d2
+    vmlal.u8        q7, d24, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q9, d25, d2
+
+    vmull.u8        q10, d26, d3            ;(src_ptr[1] * vp9_filter[3])
+    vmull.u8        q11, d27, d3
+    vmull.u8        q12, d28, d3            ;(src_ptr[1] * vp9_filter[3])
+    vmull.u8        q15, d29, d3
+
+    vqadd.s16       q6, q10                 ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q8, q11
+    vqadd.s16       q7, q12
+    vqadd.s16       q9, q15
+
+    subs            r2, r2, #1
+
+    vqrshrun.s16    d6, q6, #7              ;shift/round/saturate to u8
+    vqrshrun.s16    d7, q7, #7
+    vqrshrun.s16    d8, q8, #7
+    vqrshrun.s16    d9, q9, #7
+
+    vst1.u8         {q3}, [r4], r5              ;store result
+    vst1.u8         {q4}, [r4], r5
+
+    bne             filt_blk2d_fpo16x16_loop_neon
+
+    pop             {r4-r5,pc}
+
+;--------------------
+secondpass_filter16x16_only
+;Second pass: 16x16
+    add             r3, r12, r3, lsl #5
+    sub             r0, r0, r1, lsl #1
+
+    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
+    mov             r3, #2                  ;loop counter
+
+    vabs.s32        q7, q5
+    vabs.s32        q8, q6
+
+    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
+    vdup.8          d1, d14[4]
+    vdup.8          d2, d15[0]
+    vdup.8          d3, d15[4]
+    vdup.8          d4, d16[0]
+    vdup.8          d5, d16[4]
+
+filt_blk2d_spo16x16_outloop_neon
+    vld1.u8         {d18}, [r0], r1         ;load src data
+    vld1.u8         {d19}, [r0], r1
+    vld1.u8         {d20}, [r0], r1
+    vld1.u8         {d21}, [r0], r1
+    mov             r12, #4                 ;loop counter
+    vld1.u8         {d22}, [r0], r1
+
+secondpass_only_inner_loop_neon
+    vld1.u8         {d23}, [r0], r1         ;load src data
+    vld1.u8         {d24}, [r0], r1
+    vld1.u8         {d25}, [r0], r1
+    vld1.u8         {d26}, [r0], r1
+
+    vmull.u8        q3, d18, d0             ;(src_ptr[-2] * vp9_filter[0])
+    vmull.u8        q4, d19, d0
+    vmull.u8        q5, d20, d0
+    vmull.u8        q6, d21, d0
+
+    vmlsl.u8        q3, d19, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q4, d20, d1
+    vmlsl.u8        q5, d21, d1
+    vmlsl.u8        q6, d22, d1
+
+    vmlsl.u8        q3, d22, d4             ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q4, d23, d4
+    vmlsl.u8        q5, d24, d4
+    vmlsl.u8        q6, d25, d4
+
+    vmlal.u8        q3, d20, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q4, d21, d2
+    vmlal.u8        q5, d22, d2
+    vmlal.u8        q6, d23, d2
+
+    vmlal.u8        q3, d23, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmlal.u8        q4, d24, d5
+    vmlal.u8        q5, d25, d5
+    vmlal.u8        q6, d26, d5
+
+    vmull.u8        q7, d21, d3             ;(src_ptr[1] * vp9_filter[3])
+    vmull.u8        q8, d22, d3
+    vmull.u8        q9, d23, d3
+    vmull.u8        q10, d24, d3
+
+    subs            r12, r12, #1
+
+    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q8, q4
+    vqadd.s16       q9, q5
+    vqadd.s16       q10, q6
+
+    vqrshrun.s16    d6, q7, #7              ;shift/round/saturate to u8
+    vqrshrun.s16    d7, q8, #7
+    vqrshrun.s16    d8, q9, #7
+    vqrshrun.s16    d9, q10, #7
+
+    vst1.u8         {d6}, [r4], r5          ;store result
+    vmov            q9, q11
+    vst1.u8         {d7}, [r4], r5
+    vmov            q10, q12
+    vst1.u8         {d8}, [r4], r5
+    vmov            d22, d26
+    vst1.u8         {d9}, [r4], r5
+
+    bne             secondpass_only_inner_loop_neon
+
+    subs            r3, r3, #1
+    sub             r0, r0, r1, lsl #4
+    sub             r0, r0, r1, lsl #2
+    sub             r0, r0, r1
+    add             r0, r0, #8
+
+    sub             r4, r4, r5, lsl #4
+    add             r4, r4, #8
+
+    bne filt_blk2d_spo16x16_outloop_neon
+
+    pop             {r4-r5,pc}
+
+    ENDP
+
+;-----------------
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/sixtappredict4x4_neon.asm
@@ -1,0 +1,422 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_sixtap_predict_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+filter4_coeff
+    DCD     0,  0,  128,    0,   0,  0,   0,  0
+    DCD     0, -6,  123,   12,  -1,  0,   0,  0
+    DCD     2, -11, 108,   36,  -8,  1,   0,  0
+    DCD     0, -9,   93,   50,  -6,  0,   0,  0
+    DCD     3, -16,  77,   77, -16,  3,   0,  0
+    DCD     0, -6,   50,   93,  -9,  0,   0,  0
+    DCD     1, -8,   36,  108, -11,  2,   0,  0
+    DCD     0, -1,   12,  123,  -6,  0,   0,  0
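+
+; Each row above is one sub-pixel phase of the 6-tap filter; the signed taps
+; in every row sum to 128.  As an illustrative C-style sketch (names here are
+; not part of this file), one filtered pixel is:
+;
+;   sum = f[0]*src[-2] + f[1]*src[-1] + f[2]*src[0]
+;       + f[3]*src[1]  + f[4]*src[2]  + f[5]*src[3];
+;   dst = clamp((sum + 64) >> 7, 0, 255);
+;
+; The code takes vabs of the chosen row, uses vmlal/vmlsl for the positive
+; and negative taps, keeps the src_ptr[1]*f[3] products in a separate
+; accumulator joined by a saturating vqadd, and lets vqrshrun.s16 #7 do the
+; final round/shift/saturate in a single instruction.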
+
+; r0    unsigned char  *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; stack(r4) unsigned char *dst_ptr,
+; stack(lr) int  dst_pitch
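+;
+; xoffset/yoffset pick one of the eight sub-pixel phases in the table above
+; (0 means full-pel in that direction, hence the two early-out paths below);
+; each table row is eight words, so row n lives at byte offset n << 5.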
+
+|vp8_sixtap_predict_neon| PROC
+    push            {r4, lr}
+
+    adr             r12, filter4_coeff
+    ldr             r4, [sp, #8]            ;load parameters from stack
+    ldr             lr, [sp, #12]           ;load parameters from stack
+
+    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
+    beq             secondpass_filter4x4_only
+
+    add             r2, r12, r2, lsl #5     ;calculate filter location
+
+    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
+    vld1.s32        {q14, q15}, [r2]        ;load first_pass filter
+
+    beq             firstpass_filter4x4_only
+
+    vabs.s32        q12, q14                ;get abs(filter_parameters)
+    vabs.s32        q13, q15
+
+    sub             r0, r0, #2              ;go back 2 columns of src data
+    sub             r0, r0, r1, lsl #1      ;go back 2 lines of src data
+
+;First pass: output_height lines x output_width columns (9x4)
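+;(with only 4 output columns, two source rows share each q register: vzip.32
+; packs the row pairs and vshr.u64 then plays the role of vext to build the
+; shifted taps, halving the number of multiplies)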
+    vld1.u8         {q3}, [r0], r1          ;load first 4-line src data
+    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
+    vld1.u8         {q4}, [r0], r1
+    vdup.8          d1, d24[4]
+    vld1.u8         {q5}, [r0], r1
+    vdup.8          d2, d25[0]
+    vld1.u8         {q6}, [r0], r1
+    vdup.8          d3, d25[4]
+    vdup.8          d4, d26[0]
+    vdup.8          d5, d26[4]
+
+    pld             [r0]
+    pld             [r0, r1]
+    pld             [r0, r1, lsl #1]
+
+    vext.8          d18, d6, d7, #5         ;construct src_ptr[3]
+    vext.8          d19, d8, d9, #5
+    vext.8          d20, d10, d11, #5
+    vext.8          d21, d12, d13, #5
+
+    vswp            d7, d8                  ;discard 2nd half data after src_ptr[3] is done
+    vswp            d11, d12
+
+    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[3])
+    vzip.32         d20, d21
+    vmull.u8        q7, d18, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmull.u8        q8, d20, d5
+
+    vmov            q4, q3                  ;keep original src data in q4 q6
+    vmov            q6, q5
+
+    vzip.32         d6, d7                  ;construct src_ptr[-2], and put 2-line data together
+    vzip.32         d10, d11
+    vshr.u64        q9, q4, #8              ;construct src_ptr[-1]
+    vshr.u64        q10, q6, #8
+    vmlal.u8        q7, d6, d0              ;+(src_ptr[-2] * vp9_filter[0])
+    vmlal.u8        q8, d10, d0
+
+    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[-1])
+    vzip.32         d20, d21
+    vshr.u64        q3, q4, #32             ;construct src_ptr[2]
+    vshr.u64        q5, q6, #32
+    vmlsl.u8        q7, d18, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q8, d20, d1
+
+    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[2])
+    vzip.32         d10, d11
+    vshr.u64        q9, q4, #16             ;construct src_ptr[0]
+    vshr.u64        q10, q6, #16
+    vmlsl.u8        q7, d6, d4              ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q8, d10, d4
+
+    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[0])
+    vzip.32         d20, d21
+    vshr.u64        q3, q4, #24             ;construct src_ptr[1]
+    vshr.u64        q5, q6, #24
+    vmlal.u8        q7, d18, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q8, d20, d2
+
+    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[1])
+    vzip.32         d10, d11
+    vmull.u8        q9, d6, d3              ;(src_ptr[1] * vp9_filter[3])
+    vmull.u8        q10, d10, d3
+
+    vld1.u8         {q3}, [r0], r1          ;load the remaining 5 lines of src data
+    vld1.u8         {q4}, [r0], r1
+
+    vqadd.s16       q7, q9                  ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q8, q10
+
+    vld1.u8         {q5}, [r0], r1
+    vld1.u8         {q6}, [r0], r1
+
+    vqrshrun.s16    d27, q7, #7             ;shift/round/saturate to u8
+    vqrshrun.s16    d28, q8, #7
+
+    ;First pass on the remaining 5 lines of data
+    vld1.u8         {q11}, [r0], r1
+
+    vext.8          d18, d6, d7, #5         ;construct src_ptr[3]
+    vext.8          d19, d8, d9, #5
+    vext.8          d20, d10, d11, #5
+    vext.8          d21, d12, d13, #5
+
+    vswp            d7, d8                  ;discard 2nd half data after src_ptr[3] is done
+    vswp            d11, d12
+
+    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[3])
+    vzip.32         d20, d21
+    vext.8          d31, d22, d23, #5       ;construct src_ptr[3]
+    vmull.u8        q7, d18, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmull.u8        q8, d20, d5
+    vmull.u8        q12, d31, d5            ;(src_ptr[3] * vp9_filter[5])
+
+    vmov            q4, q3                  ;keep original src data in q4 q6
+    vmov            q6, q5
+
+    vzip.32         d6, d7                  ;construct src_ptr[-2], and put 2-line data together
+    vzip.32         d10, d11
+    vshr.u64        q9, q4, #8              ;construct src_ptr[-1]
+    vshr.u64        q10, q6, #8
+
+    vmlal.u8        q7, d6, d0              ;+(src_ptr[-2] * vp9_filter[0])
+    vmlal.u8        q8, d10, d0
+    vmlal.u8        q12, d22, d0            ;(src_ptr[-2] * vp9_filter[0])
+
+    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[-1])
+    vzip.32         d20, d21
+    vshr.u64        q3, q4, #32             ;construct src_ptr[2]
+    vshr.u64        q5, q6, #32
+    vext.8          d31, d22, d23, #1       ;construct src_ptr[-1]
+
+    vmlsl.u8        q7, d18, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q8, d20, d1
+    vmlsl.u8        q12, d31, d1            ;-(src_ptr[-1] * vp9_filter[1])
+
+    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[2])
+    vzip.32         d10, d11
+    vshr.u64        q9, q4, #16             ;construct src_ptr[0]
+    vshr.u64        q10, q6, #16
+    vext.8          d31, d22, d23, #4       ;construct src_ptr[2]
+
+    vmlsl.u8        q7, d6, d4              ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q8, d10, d4
+    vmlsl.u8        q12, d31, d4            ;-(src_ptr[2] * vp9_filter[4])
+
+    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[0])
+    vzip.32         d20, d21
+    vshr.u64        q3, q4, #24             ;construct src_ptr[1]
+    vshr.u64        q5, q6, #24
+    vext.8          d31, d22, d23, #2       ;construct src_ptr[0]
+
+    vmlal.u8        q7, d18, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q8, d20, d2
+    vmlal.u8        q12, d31, d2            ;(src_ptr[0] * vp9_filter[2])
+
+    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[1])
+    vzip.32         d10, d11
+    vext.8          d31, d22, d23, #3       ;construct src_ptr[1]
+    vmull.u8        q9, d6, d3              ;(src_ptr[1] * vp9_filter[3])
+    vmull.u8        q10, d10, d3
+    vmull.u8        q11, d31, d3            ;(src_ptr[1] * vp9_filter[3])
+
+    add             r3, r12, r3, lsl #5
+
+    vqadd.s16       q7, q9                  ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q8, q10
+    vqadd.s16       q12, q11
+
+    vext.8          d23, d27, d28, #4
+    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
+
+    vqrshrun.s16    d29, q7, #7             ;shift/round/saturate to u8
+    vqrshrun.s16    d30, q8, #7
+    vqrshrun.s16    d31, q12, #7
+
+;Second pass: 4x4
+    vabs.s32        q7, q5
+    vabs.s32        q8, q6
+
+    vext.8          d24, d28, d29, #4
+    vext.8          d25, d29, d30, #4
+    vext.8          d26, d30, d31, #4
+
+    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
+    vdup.8          d1, d14[4]
+    vdup.8          d2, d15[0]
+    vdup.8          d3, d15[4]
+    vdup.8          d4, d16[0]
+    vdup.8          d5, d16[4]
+
+    vmull.u8        q3, d27, d0             ;(src_ptr[-2] * vp9_filter[0])
+    vmull.u8        q4, d28, d0
+
+    vmull.u8        q5, d25, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmull.u8        q6, d26, d5
+
+    vmlsl.u8        q3, d29, d4             ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q4, d30, d4
+
+    vmlsl.u8        q5, d23, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q6, d24, d1
+
+    vmlal.u8        q3, d28, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q4, d29, d2
+
+    vmlal.u8        q5, d24, d3             ;(src_ptr[1] * vp9_filter[3])
+    vmlal.u8        q6, d25, d3
+
+    add             r0, r4, lr
+    add             r1, r0, lr
+    add             r2, r1, lr
+
+    vqadd.s16       q5, q3                  ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q6, q4
+
+    vqrshrun.s16    d3, q5, #7              ;shift/round/saturate to u8
+    vqrshrun.s16    d4, q6, #7
+
+    vst1.32         {d3[0]}, [r4]           ;store result
+    vst1.32         {d3[1]}, [r0]
+    vst1.32         {d4[0]}, [r1]
+    vst1.32         {d4[1]}, [r2]
+
+    pop             {r4, pc}
+
+
+;---------------------
+firstpass_filter4x4_only
+    vabs.s32        q12, q14                ;get abs(filter_parameters)
+    vabs.s32        q13, q15
+
+    sub             r0, r0, #2              ;go back 2 columns of src data
+
+;First pass: output_height lines x output_width columns (4x4)
+    vld1.u8         {q3}, [r0], r1          ;load first 4-line src data
+    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
+    vld1.u8         {q4}, [r0], r1
+    vdup.8          d1, d24[4]
+    vld1.u8         {q5}, [r0], r1
+    vdup.8          d2, d25[0]
+    vld1.u8         {q6}, [r0], r1
+
+    vdup.8          d3, d25[4]
+    vdup.8          d4, d26[0]
+    vdup.8          d5, d26[4]
+
+    vext.8          d18, d6, d7, #5         ;construct src_ptr[3]
+    vext.8          d19, d8, d9, #5
+    vext.8          d20, d10, d11, #5
+    vext.8          d21, d12, d13, #5
+
+    vswp            d7, d8                  ;discard 2nd half data after src_ptr[3] is done
+    vswp            d11, d12
+
+    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[3])
+    vzip.32         d20, d21
+    vmull.u8        q7, d18, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmull.u8        q8, d20, d5
+
+    vmov            q4, q3                  ;keep original src data in q4 q6
+    vmov            q6, q5
+
+    vzip.32         d6, d7                  ;construct src_ptr[-2], and put 2-line data together
+    vzip.32         d10, d11
+    vshr.u64        q9, q4, #8              ;construct src_ptr[-1]
+    vshr.u64        q10, q6, #8
+    vmlal.u8        q7, d6, d0              ;+(src_ptr[-2] * vp9_filter[0])
+    vmlal.u8        q8, d10, d0
+
+    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[-1])
+    vzip.32         d20, d21
+    vshr.u64        q3, q4, #32             ;construct src_ptr[2]
+    vshr.u64        q5, q6, #32
+    vmlsl.u8        q7, d18, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q8, d20, d1
+
+    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[2])
+    vzip.32         d10, d11
+    vshr.u64        q9, q4, #16             ;construct src_ptr[0]
+    vshr.u64        q10, q6, #16
+    vmlsl.u8        q7, d6, d4              ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q8, d10, d4
+
+    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[0])
+    vzip.32         d20, d21
+    vshr.u64        q3, q4, #24             ;construct src_ptr[1]
+    vshr.u64        q5, q6, #24
+    vmlal.u8        q7, d18, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q8, d20, d2
+
+    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[1])
+    vzip.32         d10, d11
+    vmull.u8        q9, d6, d3              ;(src_ptr[1] * vp9_filter[3])
+    vmull.u8        q10, d10, d3
+
+    add             r0, r4, lr
+    add             r1, r0, lr
+    add             r2, r1, lr
+
+    vqadd.s16       q7, q9                  ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q8, q10
+
+    vqrshrun.s16    d27, q7, #7             ;shift/round/saturate to u8
+    vqrshrun.s16    d28, q8, #7
+
+    vst1.32         {d27[0]}, [r4]          ;store result
+    vst1.32         {d27[1]}, [r0]
+    vst1.32         {d28[0]}, [r1]
+    vst1.32         {d28[1]}, [r2]
+
+    pop             {r4, pc}
+
+
+;---------------------
+secondpass_filter4x4_only
+    sub             r0, r0, r1, lsl #1
+    add             r3, r12, r3, lsl #5
+
+    vld1.32         {d27[0]}, [r0], r1      ;load src data
+    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
+    vld1.32         {d27[1]}, [r0], r1
+    vabs.s32        q7, q5
+    vld1.32         {d28[0]}, [r0], r1
+    vabs.s32        q8, q6
+    vld1.32         {d28[1]}, [r0], r1
+    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
+    vld1.32         {d29[0]}, [r0], r1
+    vdup.8          d1, d14[4]
+    vld1.32         {d29[1]}, [r0], r1
+    vdup.8          d2, d15[0]
+    vld1.32         {d30[0]}, [r0], r1
+    vdup.8          d3, d15[4]
+    vld1.32         {d30[1]}, [r0], r1
+    vdup.8          d4, d16[0]
+    vld1.32         {d31[0]}, [r0], r1
+    vdup.8          d5, d16[4]
+
+    vext.8          d23, d27, d28, #4
+    vext.8          d24, d28, d29, #4
+    vext.8          d25, d29, d30, #4
+    vext.8          d26, d30, d31, #4
+
+    vmull.u8        q3, d27, d0             ;(src_ptr[-2] * vp9_filter[0])
+    vmull.u8        q4, d28, d0
+
+    vmull.u8        q5, d25, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmull.u8        q6, d26, d5
+
+    vmlsl.u8        q3, d29, d4             ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q4, d30, d4
+
+    vmlsl.u8        q5, d23, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q6, d24, d1
+
+    vmlal.u8        q3, d28, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q4, d29, d2
+
+    vmlal.u8        q5, d24, d3             ;(src_ptr[1] * vp9_filter[3])
+    vmlal.u8        q6, d25, d3
+
+    add             r0, r4, lr
+    add             r1, r0, lr
+    add             r2, r1, lr
+
+    vqadd.s16       q5, q3                  ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q6, q4
+
+    vqrshrun.s16    d3, q5, #7              ;shift/round/saturate to u8
+    vqrshrun.s16    d4, q6, #7
+
+    vst1.32         {d3[0]}, [r4]           ;store result
+    vst1.32         {d3[1]}, [r0]
+    vst1.32         {d4[0]}, [r1]
+    vst1.32         {d4[1]}, [r2]
+
+    pop             {r4, pc}
+
+    ENDP
+
+;-----------------
+
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/sixtappredict8x4_neon.asm
@@ -1,0 +1,473 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_sixtap_predict8x4_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+filter8_coeff
+    DCD     0,  0,  128,    0,   0,  0,   0,  0
+    DCD     0, -6,  123,   12,  -1,  0,   0,  0
+    DCD     2, -11, 108,   36,  -8,  1,   0,  0
+    DCD     0, -9,   93,   50,  -6,  0,   0,  0
+    DCD     3, -16,  77,   77, -16,  3,   0,  0
+    DCD     0, -6,   50,   93,  -9,  0,   0,  0
+    DCD     1, -8,   36,  108, -11,  2,   0,  0
+    DCD     0, -1,   12,  123,  -6,  0,   0,  0
+
+; r0    unsigned char  *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; stack(r4) unsigned char *dst_ptr,
+; stack(r5) int  dst_pitch
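+;
+; With both offsets non-zero this runs two passes: a horizontal 6-tap over
+; height+5 source rows into a stack buffer, then a vertical 6-tap from that
+; buffer into dst.  Roughly, as an illustrative C sketch (names are not part
+; of this file):
+;
+;   for (r = -2; r < h + 3; r++)
+;       hfilter(src + r * stride, xfilter, tmp + (r + 2) * 8);
+;   for (r = 0; r < h; r++)
+;       vfilter(tmp + r * 8, yfilter, dst + r * dst_pitch);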
+
+|vp8_sixtap_predict8x4_neon| PROC
+    push            {r4-r5, lr}
+
+    adr             r12, filter8_coeff
+    ldr             r4, [sp, #12]           ;load parameters from stack
+    ldr             r5, [sp, #16]           ;load parameters from stack
+
+    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
+    beq             secondpass_filter8x4_only
+
+    add             r2, r12, r2, lsl #5     ;calculate filter location
+
+    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
+
+    vld1.s32        {q14, q15}, [r2]        ;load first_pass filter
+
+    beq             firstpass_filter8x4_only
+
+    sub             sp, sp, #32             ;reserve space on stack for temporary storage
+    vabs.s32        q12, q14
+    vabs.s32        q13, q15
+
+    sub             r0, r0, #2              ;move srcptr back to (line-2) and (column-2)
+    mov             lr, sp
+    sub             r0, r0, r1, lsl #1
+
+    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
+    vdup.8          d1, d24[4]
+    vdup.8          d2, d25[0]
+
+;First pass: output_height lines x output_width columns (9x8)
+    vld1.u8         {q3}, [r0], r1          ;load src data
+    vdup.8          d3, d25[4]
+    vld1.u8         {q4}, [r0], r1
+    vdup.8          d4, d26[0]
+    vld1.u8         {q5}, [r0], r1
+    vdup.8          d5, d26[4]
+    vld1.u8         {q6}, [r0], r1
+
+    pld             [r0]
+    pld             [r0, r1]
+    pld             [r0, r1, lsl #1]
+
+    vmull.u8        q7, d6, d0              ;(src_ptr[-2] * vp9_filter[0])
+    vmull.u8        q8, d8, d0
+    vmull.u8        q9, d10, d0
+    vmull.u8        q10, d12, d0
+
+    vext.8          d28, d6, d7, #1         ;construct src_ptr[-1]
+    vext.8          d29, d8, d9, #1
+    vext.8          d30, d10, d11, #1
+    vext.8          d31, d12, d13, #1
+
+    vmlsl.u8        q7, d28, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q8, d29, d1
+    vmlsl.u8        q9, d30, d1
+    vmlsl.u8        q10, d31, d1
+
+    vext.8          d28, d6, d7, #4         ;construct src_ptr[2]
+    vext.8          d29, d8, d9, #4
+    vext.8          d30, d10, d11, #4
+    vext.8          d31, d12, d13, #4
+
+    vmlsl.u8        q7, d28, d4             ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q8, d29, d4
+    vmlsl.u8        q9, d30, d4
+    vmlsl.u8        q10, d31, d4
+
+    vext.8          d28, d6, d7, #2         ;construct src_ptr[0]
+    vext.8          d29, d8, d9, #2
+    vext.8          d30, d10, d11, #2
+    vext.8          d31, d12, d13, #2
+
+    vmlal.u8        q7, d28, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q8, d29, d2
+    vmlal.u8        q9, d30, d2
+    vmlal.u8        q10, d31, d2
+
+    vext.8          d28, d6, d7, #5         ;construct src_ptr[3]
+    vext.8          d29, d8, d9, #5
+    vext.8          d30, d10, d11, #5
+    vext.8          d31, d12, d13, #5
+
+    vmlal.u8        q7, d28, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmlal.u8        q8, d29, d5
+    vmlal.u8        q9, d30, d5
+    vmlal.u8        q10, d31, d5
+
+    vext.8          d28, d6, d7, #3         ;construct src_ptr[1]
+    vext.8          d29, d8, d9, #3
+    vext.8          d30, d10, d11, #3
+    vext.8          d31, d12, d13, #3
+
+    vmull.u8        q3, d28, d3             ;(src_ptr[1] * vp9_filter[3])
+    vmull.u8        q4, d29, d3
+    vmull.u8        q5, d30, d3
+    vmull.u8        q6, d31, d3
+
+    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q8, q4
+    vqadd.s16       q9, q5
+    vqadd.s16       q10, q6
+
+    vld1.u8         {q3}, [r0], r1          ;load src data
+
+    vqrshrun.s16    d22, q7, #7             ;shift/round/saturate to u8
+    vqrshrun.s16    d23, q8, #7
+    vqrshrun.s16    d24, q9, #7
+    vqrshrun.s16    d25, q10, #7
+
+    vld1.u8         {q4}, [r0], r1
+    vst1.u8         {d22}, [lr]!            ;store result
+    vld1.u8         {q5}, [r0], r1
+    vst1.u8         {d23}, [lr]!
+    vld1.u8         {q6}, [r0], r1
+    vst1.u8         {d24}, [lr]!
+    vld1.u8         {q7}, [r0], r1
+    vst1.u8         {d25}, [lr]!
+
+    ;first_pass filtering on the remaining 5 lines of data
+    vmull.u8        q8, d6, d0              ;(src_ptr[-2] * vp9_filter[0])
+    vmull.u8        q9, d8, d0
+    vmull.u8        q10, d10, d0
+    vmull.u8        q11, d12, d0
+    vmull.u8        q12, d14, d0
+
+    vext.8          d27, d6, d7, #1         ;construct src_ptr[-1]
+    vext.8          d28, d8, d9, #1
+    vext.8          d29, d10, d11, #1
+    vext.8          d30, d12, d13, #1
+    vext.8          d31, d14, d15, #1
+
+    vmlsl.u8        q8, d27, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q9, d28, d1
+    vmlsl.u8        q10, d29, d1
+    vmlsl.u8        q11, d30, d1
+    vmlsl.u8        q12, d31, d1
+
+    vext.8          d27, d6, d7, #4         ;construct src_ptr[2]
+    vext.8          d28, d8, d9, #4
+    vext.8          d29, d10, d11, #4
+    vext.8          d30, d12, d13, #4
+    vext.8          d31, d14, d15, #4
+
+    vmlsl.u8        q8, d27, d4             ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q9, d28, d4
+    vmlsl.u8        q10, d29, d4
+    vmlsl.u8        q11, d30, d4
+    vmlsl.u8        q12, d31, d4
+
+    vext.8          d27, d6, d7, #2         ;construct src_ptr[0]
+    vext.8          d28, d8, d9, #2
+    vext.8          d29, d10, d11, #2
+    vext.8          d30, d12, d13, #2
+    vext.8          d31, d14, d15, #2
+
+    vmlal.u8        q8, d27, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q9, d28, d2
+    vmlal.u8        q10, d29, d2
+    vmlal.u8        q11, d30, d2
+    vmlal.u8        q12, d31, d2
+
+    vext.8          d27, d6, d7, #5         ;construct src_ptr[3]
+    vext.8          d28, d8, d9, #5
+    vext.8          d29, d10, d11, #5
+    vext.8          d30, d12, d13, #5
+    vext.8          d31, d14, d15, #5
+
+    vmlal.u8        q8, d27, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmlal.u8        q9, d28, d5
+    vmlal.u8        q10, d29, d5
+    vmlal.u8        q11, d30, d5
+    vmlal.u8        q12, d31, d5
+
+    vext.8          d27, d6, d7, #3         ;construct src_ptr[1]
+    vext.8          d28, d8, d9, #3
+    vext.8          d29, d10, d11, #3
+    vext.8          d30, d12, d13, #3
+    vext.8          d31, d14, d15, #3
+
+    vmull.u8        q3, d27, d3             ;(src_ptr[1] * vp9_filter[3])
+    vmull.u8        q4, d28, d3
+    vmull.u8        q5, d29, d3
+    vmull.u8        q6, d30, d3
+    vmull.u8        q7, d31, d3
+
+    vqadd.s16       q8, q3                  ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q9, q4
+    vqadd.s16       q10, q5
+    vqadd.s16       q11, q6
+    vqadd.s16       q12, q7
+
+    vqrshrun.s16    d26, q8, #7             ;shift/round/saturate to u8
+    vqrshrun.s16    d27, q9, #7
+    vqrshrun.s16    d28, q10, #7
+    vqrshrun.s16    d29, q11, #7
+    vqrshrun.s16    d30, q12, #7
+
+;Second pass: 8x4
+;secondpass_filter
+    add             r3, r12, r3, lsl #5
+    sub             lr, lr, #32
+
+    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
+    vld1.u8         {q11}, [lr]!
+
+    vabs.s32        q7, q5
+    vabs.s32        q8, q6
+
+    vld1.u8         {q12}, [lr]!
+
+    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
+    vdup.8          d1, d14[4]
+    vdup.8          d2, d15[0]
+    vdup.8          d3, d15[4]
+    vdup.8          d4, d16[0]
+    vdup.8          d5, d16[4]
+
+    vmull.u8        q3, d22, d0             ;(src_ptr[-2] * vp9_filter[0])
+    vmull.u8        q4, d23, d0
+    vmull.u8        q5, d24, d0
+    vmull.u8        q6, d25, d0
+
+    vmlsl.u8        q3, d23, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q4, d24, d1
+    vmlsl.u8        q5, d25, d1
+    vmlsl.u8        q6, d26, d1
+
+    vmlsl.u8        q3, d26, d4             ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q4, d27, d4
+    vmlsl.u8        q5, d28, d4
+    vmlsl.u8        q6, d29, d4
+
+    vmlal.u8        q3, d24, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q4, d25, d2
+    vmlal.u8        q5, d26, d2
+    vmlal.u8        q6, d27, d2
+
+    vmlal.u8        q3, d27, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmlal.u8        q4, d28, d5
+    vmlal.u8        q5, d29, d5
+    vmlal.u8        q6, d30, d5
+
+    vmull.u8        q7, d25, d3             ;(src_ptr[1] * vp9_filter[3])
+    vmull.u8        q8, d26, d3
+    vmull.u8        q9, d27, d3
+    vmull.u8        q10, d28, d3
+
+    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q8, q4
+    vqadd.s16       q9, q5
+    vqadd.s16       q10, q6
+
+    vqrshrun.s16    d6, q7, #7              ;shift/round/saturate to u8
+    vqrshrun.s16    d7, q8, #7
+    vqrshrun.s16    d8, q9, #7
+    vqrshrun.s16    d9, q10, #7
+
+    vst1.u8         {d6}, [r4], r5          ;store result
+    vst1.u8         {d7}, [r4], r5
+    vst1.u8         {d8}, [r4], r5
+    vst1.u8         {d9}, [r4], r5
+
+    add             sp, sp, #32
+    pop             {r4-r5,pc}
+
+;--------------------
+firstpass_filter8x4_only
+    vabs.s32        q12, q14
+    vabs.s32        q13, q15
+
+    sub             r0, r0, #2              ;move srcptr back to (line-2) and (column-2)
+    vld1.u8         {q3}, [r0], r1          ;load src data
+
+    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
+    vld1.u8         {q4}, [r0], r1
+    vdup.8          d1, d24[4]
+    vld1.u8         {q5}, [r0], r1
+    vdup.8          d2, d25[0]
+    vld1.u8         {q6}, [r0], r1
+    vdup.8          d3, d25[4]
+    vdup.8          d4, d26[0]
+    vdup.8          d5, d26[4]
+
+;First pass: output_height lines x output_width columns (4x8)
+    pld             [r0]
+    pld             [r0, r1]
+    pld             [r0, r1, lsl #1]
+
+    vmull.u8        q7, d6, d0              ;(src_ptr[-2] * vp9_filter[0])
+    vmull.u8        q8, d8, d0
+    vmull.u8        q9, d10, d0
+    vmull.u8        q10, d12, d0
+
+    vext.8          d28, d6, d7, #1         ;construct src_ptr[-1]
+    vext.8          d29, d8, d9, #1
+    vext.8          d30, d10, d11, #1
+    vext.8          d31, d12, d13, #1
+
+    vmlsl.u8        q7, d28, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q8, d29, d1
+    vmlsl.u8        q9, d30, d1
+    vmlsl.u8        q10, d31, d1
+
+    vext.8          d28, d6, d7, #4         ;construct src_ptr[2]
+    vext.8          d29, d8, d9, #4
+    vext.8          d30, d10, d11, #4
+    vext.8          d31, d12, d13, #4
+
+    vmlsl.u8        q7, d28, d4             ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q8, d29, d4
+    vmlsl.u8        q9, d30, d4
+    vmlsl.u8        q10, d31, d4
+
+    vext.8          d28, d6, d7, #2         ;construct src_ptr[0]
+    vext.8          d29, d8, d9, #2
+    vext.8          d30, d10, d11, #2
+    vext.8          d31, d12, d13, #2
+
+    vmlal.u8        q7, d28, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q8, d29, d2
+    vmlal.u8        q9, d30, d2
+    vmlal.u8        q10, d31, d2
+
+    vext.8          d28, d6, d7, #5         ;construct src_ptr[3]
+    vext.8          d29, d8, d9, #5
+    vext.8          d30, d10, d11, #5
+    vext.8          d31, d12, d13, #5
+
+    vmlal.u8        q7, d28, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmlal.u8        q8, d29, d5
+    vmlal.u8        q9, d30, d5
+    vmlal.u8        q10, d31, d5
+
+    vext.8          d28, d6, d7, #3         ;construct src_ptr[1]
+    vext.8          d29, d8, d9, #3
+    vext.8          d30, d10, d11, #3
+    vext.8          d31, d12, d13, #3
+
+    vmull.u8        q3, d28, d3             ;(src_ptr[1] * vp9_filter[3])
+    vmull.u8        q4, d29, d3
+    vmull.u8        q5, d30, d3
+    vmull.u8        q6, d31, d3
+
+    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q8, q4
+    vqadd.s16       q9, q5
+    vqadd.s16       q10, q6
+
+    vqrshrun.s16    d22, q7, #7             ;shift/round/saturate to u8
+    vqrshrun.s16    d23, q8, #7
+    vqrshrun.s16    d24, q9, #7
+    vqrshrun.s16    d25, q10, #7
+
+    vst1.u8         {d22}, [r4], r5         ;store result
+    vst1.u8         {d23}, [r4], r5
+    vst1.u8         {d24}, [r4], r5
+    vst1.u8         {d25}, [r4], r5
+
+    pop             {r4-r5,pc}
+
+;---------------------
+secondpass_filter8x4_only
+;Second pass: 8x4
+    add             r3, r12, r3, lsl #5
+    sub             r0, r0, r1, lsl #1
+    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
+    vabs.s32        q7, q5
+    vabs.s32        q8, q6
+
+    vld1.u8         {d22}, [r0], r1
+    vld1.u8         {d23}, [r0], r1
+    vld1.u8         {d24}, [r0], r1
+    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
+    vld1.u8         {d25}, [r0], r1
+    vdup.8          d1, d14[4]
+    vld1.u8         {d26}, [r0], r1
+    vdup.8          d2, d15[0]
+    vld1.u8         {d27}, [r0], r1
+    vdup.8          d3, d15[4]
+    vld1.u8         {d28}, [r0], r1
+    vdup.8          d4, d16[0]
+    vld1.u8         {d29}, [r0], r1
+    vdup.8          d5, d16[4]
+    vld1.u8         {d30}, [r0], r1
+
+    vmull.u8        q3, d22, d0             ;(src_ptr[-2] * vp9_filter[0])
+    vmull.u8        q4, d23, d0
+    vmull.u8        q5, d24, d0
+    vmull.u8        q6, d25, d0
+
+    vmlsl.u8        q3, d23, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q4, d24, d1
+    vmlsl.u8        q5, d25, d1
+    vmlsl.u8        q6, d26, d1
+
+    vmlsl.u8        q3, d26, d4             ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q4, d27, d4
+    vmlsl.u8        q5, d28, d4
+    vmlsl.u8        q6, d29, d4
+
+    vmlal.u8        q3, d24, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q4, d25, d2
+    vmlal.u8        q5, d26, d2
+    vmlal.u8        q6, d27, d2
+
+    vmlal.u8        q3, d27, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmlal.u8        q4, d28, d5
+    vmlal.u8        q5, d29, d5
+    vmlal.u8        q6, d30, d5
+
+    vmull.u8        q7, d25, d3             ;(src_ptr[1] * vp9_filter[3])
+    vmull.u8        q8, d26, d3
+    vmull.u8        q9, d27, d3
+    vmull.u8        q10, d28, d3
+
+    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q8, q4
+    vqadd.s16       q9, q5
+    vqadd.s16       q10, q6
+
+    vqrshrun.s16    d6, q7, #7              ;shift/round/saturate to u8
+    vqrshrun.s16    d7, q8, #7
+    vqrshrun.s16    d8, q9, #7
+    vqrshrun.s16    d9, q10, #7
+
+    vst1.u8         {d6}, [r4], r5          ;store result
+    vst1.u8         {d7}, [r4], r5
+    vst1.u8         {d8}, [r4], r5
+    vst1.u8         {d9}, [r4], r5
+
+    pop             {r4-r5,pc}
+
+    ENDP
+
+;-----------------
+
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/sixtappredict8x8_neon.asm
@@ -1,0 +1,524 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_sixtap_predict8x8_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+filter8_coeff
+    DCD     0,  0,  128,    0,   0,  0,   0,  0
+    DCD     0, -6,  123,   12,  -1,  0,   0,  0
+    DCD     2, -11, 108,   36,  -8,  1,   0,  0
+    DCD     0, -9,   93,   50,  -6,  0,   0,  0
+    DCD     3, -16,  77,   77, -16,  3,   0,  0
+    DCD     0, -6,   50,   93,  -9,  0,   0,  0
+    DCD     1, -8,   36,  108, -11,  2,   0,  0
+    DCD     0, -1,   12,  123,  -6,  0,   0,  0
+
+; r0    unsigned char  *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; stack(r4) unsigned char *dst_ptr,
+; stack(r5) int  dst_pitch
+
+|vp8_sixtap_predict8x8_neon| PROC
+    push            {r4-r5, lr}
+
+    adr             r12, filter8_coeff
+
+    ldr             r4, [sp, #12]           ;load parameters from stack
+    ldr             r5, [sp, #16]           ;load parameters from stack
+
+    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
+    beq             secondpass_filter8x8_only
+
+    add             r2, r12, r2, lsl #5     ;calculate filter location
+
+    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
+
+    vld1.s32        {q14, q15}, [r2]        ;load first_pass filter
+
+    beq             firstpass_filter8x8_only
+
+    sub             sp, sp, #64             ;reserve space on stack for temporary storage
+    mov             lr, sp
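+    ;64 bytes holds the first 8 of the 13 intermediate rows; the last 5
+    ;first-pass rows stay in registers for the second pass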
+
+    vabs.s32        q12, q14
+    vabs.s32        q13, q15
+
+    mov             r2, #2                  ;loop counter
+    sub             r0, r0, #2              ;move srcptr back to (line-2) and (column-2)
+    sub             r0, r0, r1, lsl #1
+
+    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
+    vdup.8          d1, d24[4]
+    vdup.8          d2, d25[0]
+
+;First pass: output_height lines x output_width columns (13x8)
+    vld1.u8         {q3}, [r0], r1          ;load src data
+    vdup.8          d3, d25[4]
+    vld1.u8         {q4}, [r0], r1
+    vdup.8          d4, d26[0]
+    vld1.u8         {q5}, [r0], r1
+    vdup.8          d5, d26[4]
+    vld1.u8         {q6}, [r0], r1
+
+filt_blk2d_fp8x8_loop_neon
+    pld             [r0]
+    pld             [r0, r1]
+    pld             [r0, r1, lsl #1]
+
+    vmull.u8        q7, d6, d0              ;(src_ptr[-2] * vp9_filter[0])
+    vmull.u8        q8, d8, d0
+    vmull.u8        q9, d10, d0
+    vmull.u8        q10, d12, d0
+
+    vext.8          d28, d6, d7, #1         ;construct src_ptr[-1]
+    vext.8          d29, d8, d9, #1
+    vext.8          d30, d10, d11, #1
+    vext.8          d31, d12, d13, #1
+
+    vmlsl.u8        q7, d28, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q8, d29, d1
+    vmlsl.u8        q9, d30, d1
+    vmlsl.u8        q10, d31, d1
+
+    vext.8          d28, d6, d7, #4         ;construct src_ptr[2]
+    vext.8          d29, d8, d9, #4
+    vext.8          d30, d10, d11, #4
+    vext.8          d31, d12, d13, #4
+
+    vmlsl.u8        q7, d28, d4             ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q8, d29, d4
+    vmlsl.u8        q9, d30, d4
+    vmlsl.u8        q10, d31, d4
+
+    vext.8          d28, d6, d7, #2         ;construct src_ptr[0]
+    vext.8          d29, d8, d9, #2
+    vext.8          d30, d10, d11, #2
+    vext.8          d31, d12, d13, #2
+
+    vmlal.u8        q7, d28, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q8, d29, d2
+    vmlal.u8        q9, d30, d2
+    vmlal.u8        q10, d31, d2
+
+    vext.8          d28, d6, d7, #5         ;construct src_ptr[3]
+    vext.8          d29, d8, d9, #5
+    vext.8          d30, d10, d11, #5
+    vext.8          d31, d12, d13, #5
+
+    vmlal.u8        q7, d28, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmlal.u8        q8, d29, d5
+    vmlal.u8        q9, d30, d5
+    vmlal.u8        q10, d31, d5
+
+    vext.8          d28, d6, d7, #3         ;construct src_ptr[1]
+    vext.8          d29, d8, d9, #3
+    vext.8          d30, d10, d11, #3
+    vext.8          d31, d12, d13, #3
+
+    vmull.u8        q3, d28, d3             ;(src_ptr[1] * vp9_filter[3])
+    vmull.u8        q4, d29, d3
+    vmull.u8        q5, d30, d3
+    vmull.u8        q6, d31, d3
+
+    subs            r2, r2, #1
+
+    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q8, q4
+    vqadd.s16       q9, q5
+    vqadd.s16       q10, q6
+
+    vld1.u8         {q3}, [r0], r1          ;load src data
+
+    vqrshrun.s16    d22, q7, #7             ;shift/round/saturate to u8
+    vqrshrun.s16    d23, q8, #7
+    vqrshrun.s16    d24, q9, #7
+    vqrshrun.s16    d25, q10, #7
+
+    vst1.u8         {d22}, [lr]!            ;store result
+    vld1.u8         {q4}, [r0], r1
+    vst1.u8         {d23}, [lr]!
+    vld1.u8         {q5}, [r0], r1
+    vst1.u8         {d24}, [lr]!
+    vld1.u8         {q6}, [r0], r1
+    vst1.u8         {d25}, [lr]!
+
+    bne             filt_blk2d_fp8x8_loop_neon
+
+    ;first_pass filtering on the remaining 5 lines of data
+    ;vld1.u8            {q3}, [r0], r1          ;load src data
+    ;vld1.u8            {q4}, [r0], r1
+    ;vld1.u8            {q5}, [r0], r1
+    ;vld1.u8            {q6}, [r0], r1
+    vld1.u8         {q7}, [r0], r1
+
+    vmull.u8        q8, d6, d0              ;(src_ptr[-2] * vp9_filter[0])
+    vmull.u8        q9, d8, d0
+    vmull.u8        q10, d10, d0
+    vmull.u8        q11, d12, d0
+    vmull.u8        q12, d14, d0
+
+    vext.8          d27, d6, d7, #1         ;construct src_ptr[-1]
+    vext.8          d28, d8, d9, #1
+    vext.8          d29, d10, d11, #1
+    vext.8          d30, d12, d13, #1
+    vext.8          d31, d14, d15, #1
+
+    vmlsl.u8        q8, d27, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q9, d28, d1
+    vmlsl.u8        q10, d29, d1
+    vmlsl.u8        q11, d30, d1
+    vmlsl.u8        q12, d31, d1
+
+    vext.8          d27, d6, d7, #4         ;construct src_ptr[2]
+    vext.8          d28, d8, d9, #4
+    vext.8          d29, d10, d11, #4
+    vext.8          d30, d12, d13, #4
+    vext.8          d31, d14, d15, #4
+
+    vmlsl.u8        q8, d27, d4             ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q9, d28, d4
+    vmlsl.u8        q10, d29, d4
+    vmlsl.u8        q11, d30, d4
+    vmlsl.u8        q12, d31, d4
+
+    vext.8          d27, d6, d7, #2         ;construct src_ptr[0]
+    vext.8          d28, d8, d9, #2
+    vext.8          d29, d10, d11, #2
+    vext.8          d30, d12, d13, #2
+    vext.8          d31, d14, d15, #2
+
+    vmlal.u8        q8, d27, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q9, d28, d2
+    vmlal.u8        q10, d29, d2
+    vmlal.u8        q11, d30, d2
+    vmlal.u8        q12, d31, d2
+
+    vext.8          d27, d6, d7, #5         ;construct src_ptr[3]
+    vext.8          d28, d8, d9, #5
+    vext.8          d29, d10, d11, #5
+    vext.8          d30, d12, d13, #5
+    vext.8          d31, d14, d15, #5
+
+    vmlal.u8        q8, d27, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmlal.u8        q9, d28, d5
+    vmlal.u8        q10, d29, d5
+    vmlal.u8        q11, d30, d5
+    vmlal.u8        q12, d31, d5
+
+    vext.8          d27, d6, d7, #3         ;construct src_ptr[1]
+    vext.8          d28, d8, d9, #3
+    vext.8          d29, d10, d11, #3
+    vext.8          d30, d12, d13, #3
+    vext.8          d31, d14, d15, #3
+
+    vmull.u8        q3, d27, d3             ;(src_ptr[1] * vp9_filter[3])
+    vmull.u8        q4, d28, d3
+    vmull.u8        q5, d29, d3
+    vmull.u8        q6, d30, d3
+    vmull.u8        q7, d31, d3
+
+    vqadd.s16       q8, q3                  ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q9, q4
+    vqadd.s16       q10, q5
+    vqadd.s16       q11, q6
+    vqadd.s16       q12, q7
+
+    add             r3, r12, r3, lsl #5
+
+    vqrshrun.s16    d26, q8, #7             ;shift/round/saturate to u8
+    sub             lr, lr, #64
+    vqrshrun.s16    d27, q9, #7
+    vld1.u8         {q9}, [lr]!             ;load intermediate data from stack
+    vqrshrun.s16    d28, q10, #7
+    vld1.u8         {q10}, [lr]!
+
+    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
+
+    vqrshrun.s16    d29, q11, #7
+    vld1.u8         {q11}, [lr]!
+
+    vabs.s32        q7, q5
+    vabs.s32        q8, q6
+
+    vqrshrun.s16    d30, q12, #7
+    vld1.u8         {q12}, [lr]!
+
+;Second pass: 8x8
+    mov             r3, #2                  ;loop counter
+
+    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
+    vdup.8          d1, d14[4]
+    vdup.8          d2, d15[0]
+    vdup.8          d3, d15[4]
+    vdup.8          d4, d16[0]
+    vdup.8          d5, d16[4]
+
+filt_blk2d_sp8x8_loop_neon
+    vmull.u8        q3, d18, d0             ;(src_ptr[-2] * vp9_filter[0])
+    vmull.u8        q4, d19, d0
+    vmull.u8        q5, d20, d0
+    vmull.u8        q6, d21, d0
+
+    vmlsl.u8        q3, d19, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q4, d20, d1
+    vmlsl.u8        q5, d21, d1
+    vmlsl.u8        q6, d22, d1
+
+    vmlsl.u8        q3, d22, d4             ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q4, d23, d4
+    vmlsl.u8        q5, d24, d4
+    vmlsl.u8        q6, d25, d4
+
+    vmlal.u8        q3, d20, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q4, d21, d2
+    vmlal.u8        q5, d22, d2
+    vmlal.u8        q6, d23, d2
+
+    vmlal.u8        q3, d23, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmlal.u8        q4, d24, d5
+    vmlal.u8        q5, d25, d5
+    vmlal.u8        q6, d26, d5
+
+    vmull.u8        q7, d21, d3             ;(src_ptr[1] * vp9_filter[3])
+    vmull.u8        q8, d22, d3
+    vmull.u8        q9, d23, d3
+    vmull.u8        q10, d24, d3
+
+    subs            r3, r3, #1
+
+    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q8, q4
+    vqadd.s16       q9, q5
+    vqadd.s16       q10, q6
+
+    vqrshrun.s16    d6, q7, #7              ;shift/round/saturate to u8
+    vqrshrun.s16    d7, q8, #7
+    vqrshrun.s16    d8, q9, #7
+    vqrshrun.s16    d9, q10, #7
+
+    vmov            q9, q11
+    vst1.u8         {d6}, [r4], r5          ;store result
+    vmov            q10, q12
+    vst1.u8         {d7}, [r4], r5
+    vmov            q11, q13
+    vst1.u8         {d8}, [r4], r5
+    vmov            q12, q14
+    vst1.u8         {d9}, [r4], r5
+    vmov            d26, d30
+
+    bne filt_blk2d_sp8x8_loop_neon
+
+    add             sp, sp, #64
+    pop             {r4-r5,pc}
+
+;---------------------
+firstpass_filter8x8_only
+    ;add                r2, r12, r2, lsl #5     ;calculate filter location
+    ;vld1.s32       {q14, q15}, [r2]        ;load first_pass filter
+    vabs.s32        q12, q14
+    vabs.s32        q13, q15
+
+    mov             r2, #2                  ;loop counter
+    sub             r0, r0, #2              ;move srcptr back to (line-2) and (column-2)
+
+    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
+    vdup.8          d1, d24[4]
+    vdup.8          d2, d25[0]
+    vdup.8          d3, d25[4]
+    vdup.8          d4, d26[0]
+    vdup.8          d5, d26[4]
+
+;First pass: output_height lines x output_width columns (8x8)
+filt_blk2d_fpo8x8_loop_neon
+    vld1.u8         {q3}, [r0], r1          ;load src data
+    vld1.u8         {q4}, [r0], r1
+    vld1.u8         {q5}, [r0], r1
+    vld1.u8         {q6}, [r0], r1
+
+    pld             [r0]
+    pld             [r0, r1]
+    pld             [r0, r1, lsl #1]
+
+    vmull.u8        q7, d6, d0              ;(src_ptr[-2] * vp9_filter[0])
+    vmull.u8        q8, d8, d0
+    vmull.u8        q9, d10, d0
+    vmull.u8        q10, d12, d0
+
+    vext.8          d28, d6, d7, #1         ;construct src_ptr[-1]
+    vext.8          d29, d8, d9, #1
+    vext.8          d30, d10, d11, #1
+    vext.8          d31, d12, d13, #1
+
+    vmlsl.u8        q7, d28, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q8, d29, d1
+    vmlsl.u8        q9, d30, d1
+    vmlsl.u8        q10, d31, d1
+
+    vext.8          d28, d6, d7, #4         ;construct src_ptr[2]
+    vext.8          d29, d8, d9, #4
+    vext.8          d30, d10, d11, #4
+    vext.8          d31, d12, d13, #4
+
+    vmlsl.u8        q7, d28, d4             ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q8, d29, d4
+    vmlsl.u8        q9, d30, d4
+    vmlsl.u8        q10, d31, d4
+
+    vext.8          d28, d6, d7, #2         ;construct src_ptr[0]
+    vext.8          d29, d8, d9, #2
+    vext.8          d30, d10, d11, #2
+    vext.8          d31, d12, d13, #2
+
+    vmlal.u8        q7, d28, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q8, d29, d2
+    vmlal.u8        q9, d30, d2
+    vmlal.u8        q10, d31, d2
+
+    vext.8          d28, d6, d7, #5         ;construct src_ptr[3]
+    vext.8          d29, d8, d9, #5
+    vext.8          d30, d10, d11, #5
+    vext.8          d31, d12, d13, #5
+
+    vmlal.u8        q7, d28, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmlal.u8        q8, d29, d5
+    vmlal.u8        q9, d30, d5
+    vmlal.u8        q10, d31, d5
+
+    vext.8          d28, d6, d7, #3         ;construct src_ptr[1]
+    vext.8          d29, d8, d9, #3
+    vext.8          d30, d10, d11, #3
+    vext.8          d31, d12, d13, #3
+
+    vmull.u8        q3, d28, d3             ;(src_ptr[1] * vp9_filter[3])
+    vmull.u8        q4, d29, d3
+    vmull.u8        q5, d30, d3
+    vmull.u8        q6, d31, d3
+
+    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q8, q4
+    vqadd.s16       q9, q5
+    vqadd.s16       q10, q6
+
+    subs            r2, r2, #1
+
+    vqrshrun.s16    d22, q7, #7             ;shift/round/saturate to u8
+    vqrshrun.s16    d23, q8, #7
+    vqrshrun.s16    d24, q9, #7
+    vqrshrun.s16    d25, q10, #7
+
+    vst1.u8         {d22}, [r4], r5         ;store result
+    vst1.u8         {d23}, [r4], r5
+    vst1.u8         {d24}, [r4], r5
+    vst1.u8         {d25}, [r4], r5
+
+    bne             filt_blk2d_fpo8x8_loop_neon
+
+    pop             {r4-r5,pc}
+
+;---------------------
+secondpass_filter8x8_only
+    sub             r0, r0, r1, lsl #1
+    add             r3, r12, r3, lsl #5
+
+    vld1.u8         {d18}, [r0], r1         ;load src data
+    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
+    vld1.u8         {d19}, [r0], r1
+    vabs.s32        q7, q5
+    vld1.u8         {d20}, [r0], r1
+    vabs.s32        q8, q6
+    vld1.u8         {d21}, [r0], r1
+    mov             r3, #2                  ;loop counter
+    vld1.u8         {d22}, [r0], r1
+    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
+    vld1.u8         {d23}, [r0], r1
+    vdup.8          d1, d14[4]
+    vld1.u8         {d24}, [r0], r1
+    vdup.8          d2, d15[0]
+    vld1.u8         {d25}, [r0], r1
+    vdup.8          d3, d15[4]
+    vld1.u8         {d26}, [r0], r1
+    vdup.8          d4, d16[0]
+    vld1.u8         {d27}, [r0], r1
+    vdup.8          d5, d16[4]
+    vld1.u8         {d28}, [r0], r1
+    vld1.u8         {d29}, [r0], r1
+    vld1.u8         {d30}, [r0], r1
+
+;Second pass: 8x8
+filt_blk2d_spo8x8_loop_neon
+    vmull.u8        q3, d18, d0             ;(src_ptr[-2] * vp9_filter[0])
+    vmull.u8        q4, d19, d0
+    vmull.u8        q5, d20, d0
+    vmull.u8        q6, d21, d0
+
+    vmlsl.u8        q3, d19, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q4, d20, d1
+    vmlsl.u8        q5, d21, d1
+    vmlsl.u8        q6, d22, d1
+
+    vmlsl.u8        q3, d22, d4             ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q4, d23, d4
+    vmlsl.u8        q5, d24, d4
+    vmlsl.u8        q6, d25, d4
+
+    vmlal.u8        q3, d20, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q4, d21, d2
+    vmlal.u8        q5, d22, d2
+    vmlal.u8        q6, d23, d2
+
+    vmlal.u8        q3, d23, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmlal.u8        q4, d24, d5
+    vmlal.u8        q5, d25, d5
+    vmlal.u8        q6, d26, d5
+
+    vmull.u8        q7, d21, d3             ;(src_ptr[1] * vp9_filter[3])
+    vmull.u8        q8, d22, d3
+    vmull.u8        q9, d23, d3
+    vmull.u8        q10, d24, d3
+
+    subs            r3, r3, #1
+
+    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q8, q4
+    vqadd.s16       q9, q5
+    vqadd.s16       q10, q6
+
+    vqrshrun.s16    d6, q7, #7              ;shift/round/saturate to u8
+    vqrshrun.s16    d7, q8, #7
+    vqrshrun.s16    d8, q9, #7
+    vqrshrun.s16    d9, q10, #7
+
+    vmov            q9, q11
+    vst1.u8         {d6}, [r4], r5          ;store result
+    vmov            q10, q12
+    vst1.u8         {d7}, [r4], r5
+    vmov            q11, q13
+    vst1.u8         {d8}, [r4], r5
+    vmov            q12, q14
+    vst1.u8         {d9}, [r4], r5
+    vmov            d26, d30
+
+    bne filt_blk2d_spo8x8_loop_neon
+
+    pop             {r4-r5,pc}
+
+    ENDP
+
+;-----------------
+
+    END
--- /dev/null
+++ b/vp9/common/arm/recon_arm.h
@@ -1,0 +1,90 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef RECON_ARM_H
+#define RECON_ARM_H
+
+#if HAVE_ARMV6
+extern prototype_recon_block(vp9_recon_b_armv6);
+extern prototype_recon_block(vp9_recon2b_armv6);
+extern prototype_recon_block(vp9_recon4b_armv6);
+
+extern prototype_copy_block(vp9_copy_mem8x8_v6);
+extern prototype_copy_block(vp9_copy_mem8x4_v6);
+extern prototype_copy_block(vp9_copy_mem16x16_v6);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
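+/* Without runtime CPU detection the dispatch macros are rebound at compile
+ * time, so calls through the generic vp8_recon_* hooks resolve directly to
+ * the ARMv6 implementations with no function-pointer indirection. */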
+#undef  vp8_recon_recon
+#define vp8_recon_recon vp9_recon_b_armv6
+
+#undef  vp8_recon_recon2
+#define vp8_recon_recon2 vp9_recon2b_armv6
+
+#undef  vp8_recon_recon4
+#define vp8_recon_recon4 vp9_recon4b_armv6
+
+#undef  vp8_recon_copy8x8
+#define vp8_recon_copy8x8 vp9_copy_mem8x8_v6
+
+#undef  vp8_recon_copy8x4
+#define vp8_recon_copy8x4 vp9_copy_mem8x4_v6
+
+#undef  vp8_recon_copy16x16
+#define vp8_recon_copy16x16 vp9_copy_mem16x16_v6
+#endif
+#endif
+
+#if HAVE_ARMV7
+extern prototype_recon_block(vp9_recon_b_neon);
+extern prototype_recon_block(vp9_recon2b_neon);
+extern prototype_recon_block(vp9_recon4b_neon);
+
+extern prototype_copy_block(vp9_copy_mem8x8_neon);
+extern prototype_copy_block(vp9_copy_mem8x4_neon);
+extern prototype_copy_block(vp9_copy_mem16x16_neon);
+
+extern prototype_recon_macroblock(vp9_recon_mb_neon);
+
+extern prototype_build_intra_predictors(vp9_build_intra_predictors_mby_neon);
+extern prototype_build_intra_predictors(vp9_build_intra_predictors_mby_s_neon);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp8_recon_recon
+#define vp8_recon_recon vp9_recon_b_neon
+
+#undef  vp8_recon_recon2
+#define vp8_recon_recon2 vp9_recon2b_neon
+
+#undef  vp8_recon_recon4
+#define vp8_recon_recon4 vp9_recon4b_neon
+
+#undef  vp8_recon_copy8x8
+#define vp8_recon_copy8x8 vp9_copy_mem8x8_neon
+
+#undef  vp8_recon_copy8x4
+#define vp8_recon_copy8x4 vp9_copy_mem8x4_neon
+
+#undef  vp8_recon_copy16x16
+#define vp8_recon_copy16x16 vp9_copy_mem16x16_neon
+
+#undef  vp8_recon_recon_mb
+#define vp8_recon_recon_mb vp9_recon_mb_neon
+
+#undef  vp9_recon_build_intra_predictors_mby
+#define vp9_recon_build_intra_predictors_mby vp9_build_intra_predictors_mby_neon
+
+#undef  vp9_recon_build_intra_predictors_mby_s
+#define vp9_recon_build_intra_predictors_mby_s vp9_build_intra_predictors_mby_s_neon
+
+#endif
+#endif
+
+#endif
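The #undef/#define pairs above implement compile-time dispatch: when runtime CPU
detection is disabled, the generic function-table slot names (still carrying the
vp8_ prefix at this point in the rename) are re-pointed directly at the best
available ARM implementation, so no function pointers are needed. A minimal
sketch of the pattern (all names hypothetical):

/* Default binding, as a generic RTCD-style header might set it up. */
#define recon_fn recon_c

/* Static override when there is no runtime detection to do. */
#if !CONFIG_RUNTIME_CPU_DETECT && HAVE_ARMV7
#undef  recon_fn
#define recon_fn recon_neon
#endif

/* Call sites always write recon_fn(...); the preprocessor picks the target. */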
--- /dev/null
+++ b/vp9/common/arm/reconintra_arm.c
@@ -1,0 +1,62 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vp9/common/blockd.h"
+#include "vp9/common/reconintra.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp9/common/recon.h"
+
+#if HAVE_ARMV7
+extern void vp9_build_intra_predictors_mby_neon_func(
+  unsigned char *y_buffer,
+  unsigned char *ypred_ptr,
+  int y_stride,
+  int mode,
+  int Up,
+  int Left);
+
+void vp9_build_intra_predictors_mby_neon(MACROBLOCKD *xd) {
+  unsigned char *y_buffer = xd->dst.y_buffer;
+  unsigned char *ypred_ptr = xd->predictor;
+  int y_stride = xd->dst.y_stride;
+  int mode = xd->mode_info_context->mbmi.mode;
+  int Up = xd->up_available;
+  int Left = xd->left_available;
+
+  vp9_build_intra_predictors_mby_neon_func(y_buffer, ypred_ptr,
+                                           y_stride, mode, Up, Left);
+}
+#endif
+
+
+#if HAVE_ARMV7
+extern void vp9_build_intra_predictors_mby_s_neon_func(
+  unsigned char *y_buffer,
+  unsigned char *ypred_ptr,
+  int y_stride,
+  int mode,
+  int Up,
+  int Left);
+
+void vp9_build_intra_predictors_mby_s_neon(MACROBLOCKD *xd) {
+  unsigned char *y_buffer = xd->dst.y_buffer;
+  unsigned char *ypred_ptr = xd->predictor;
+  int y_stride = xd->dst.y_stride;
+  int mode = xd->mode_info_context->mbmi.mode;
+  int Up = xd->up_available;
+  int Left = xd->left_available;
+
+  vp9_build_intra_predictors_mby_s_neon_func(y_buffer, ypred_ptr,
+                                             y_stride, mode, Up, Left);
+}
+
+#endif
--- /dev/null
+++ b/vp9/common/arm/subpixel_arm.h
@@ -1,0 +1,89 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef SUBPIXEL_ARM_H
+#define SUBPIXEL_ARM_H
+
+#if HAVE_ARMV6
+extern prototype_subpixel_predict(vp9_sixtap_predict16x16_armv6);
+extern prototype_subpixel_predict(vp9_sixtap_predict8x8_armv6);
+extern prototype_subpixel_predict(vp9_sixtap_predict8x4_armv6);
+extern prototype_subpixel_predict(vp9_sixtap_predict_armv6);
+extern prototype_subpixel_predict(vp9_bilinear_predict16x16_armv6);
+extern prototype_subpixel_predict(vp9_bilinear_predict8x8_armv6);
+extern prototype_subpixel_predict(vp9_bilinear_predict8x4_armv6);
+extern prototype_subpixel_predict(vp9_bilinear_predict4x4_armv6);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp9_subpix_sixtap16x16
+#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_armv6
+
+#undef  vp9_subpix_sixtap8x8
+#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_armv6
+
+#undef  vp9_subpix_sixtap8x4
+#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_armv6
+
+#undef  vp9_subpix_sixtap4x4
+#define vp9_subpix_sixtap4x4 vp9_sixtap_predict_armv6
+
+#undef  vp9_subpix_bilinear16x16
+#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_armv6
+
+#undef  vp9_subpix_bilinear8x8
+#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_armv6
+
+#undef  vp9_subpix_bilinear8x4
+#define vp9_subpix_bilinear8x4 vp9_bilinear_predict8x4_armv6
+
+#undef  vp9_subpix_bilinear4x4
+#define vp9_subpix_bilinear4x4 vp9_bilinear_predict4x4_armv6
+#endif
+#endif
+
+#if HAVE_ARMV7
+extern prototype_subpixel_predict(vp9_sixtap_predict16x16_neon);
+extern prototype_subpixel_predict(vp9_sixtap_predict8x8_neon);
+extern prototype_subpixel_predict(vp9_sixtap_predict8x4_neon);
+extern prototype_subpixel_predict(vp9_sixtap_predict_neon);
+extern prototype_subpixel_predict(vp9_bilinear_predict16x16_neon);
+extern prototype_subpixel_predict(vp9_bilinear_predict8x8_neon);
+extern prototype_subpixel_predict(vp9_bilinear_predict8x4_neon);
+extern prototype_subpixel_predict(vp9_bilinear_predict4x4_neon);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp9_subpix_sixtap16x16
+#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_neon
+
+#undef  vp9_subpix_sixtap8x8
+#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_neon
+
+#undef  vp9_subpix_sixtap8x4
+#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_neon
+
+#undef  vp9_subpix_sixtap4x4
+#define vp9_subpix_sixtap4x4 vp9_sixtap_predict_neon
+
+#undef  vp9_subpix_bilinear16x16
+#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_neon
+
+#undef  vp9_subpix_bilinear8x8
+#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_neon
+
+#undef  vp9_subpix_bilinear8x4
+#define vp9_subpix_bilinear8x4 vp9_bilinear_predict8x4_neon
+
+#undef  vp9_subpix_bilinear4x4
+#define vp9_subpix_bilinear4x4 vp9_bilinear_predict4x4_neon
+#endif
+#endif
+
+#endif
--- /dev/null
+++ b/vp9/common/asm_com_offsets.c
@@ -1,0 +1,40 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx/vpx_codec.h"
+#include "vpx_ports/asm_offsets.h"
+#include "vpx_scale/yv12config.h"
+
+BEGIN
+
+/* vpx_scale */
+DEFINE(yv12_buffer_config_y_width,              offsetof(YV12_BUFFER_CONFIG, y_width));
+DEFINE(yv12_buffer_config_y_height,             offsetof(YV12_BUFFER_CONFIG, y_height));
+DEFINE(yv12_buffer_config_y_stride,             offsetof(YV12_BUFFER_CONFIG, y_stride));
+DEFINE(yv12_buffer_config_uv_width,             offsetof(YV12_BUFFER_CONFIG, uv_width));
+DEFINE(yv12_buffer_config_uv_height,            offsetof(YV12_BUFFER_CONFIG, uv_height));
+DEFINE(yv12_buffer_config_uv_stride,            offsetof(YV12_BUFFER_CONFIG, uv_stride));
+DEFINE(yv12_buffer_config_y_buffer,             offsetof(YV12_BUFFER_CONFIG, y_buffer));
+DEFINE(yv12_buffer_config_u_buffer,             offsetof(YV12_BUFFER_CONFIG, u_buffer));
+DEFINE(yv12_buffer_config_v_buffer,             offsetof(YV12_BUFFER_CONFIG, v_buffer));
+DEFINE(yv12_buffer_config_border,               offsetof(YV12_BUFFER_CONFIG, border));
+DEFINE(VP8BORDERINPIXELS_VAL,                   VP8BORDERINPIXELS);
+
+END
+
+/* add asserts for any offset that is not supported by assembly code */
+/* add asserts for any size that is not supported by assembly code */
+
+#if HAVE_ARMV7
+/* vp8_yv12_extend_frame_borders_neon makes several assumptions based on this */
+ct_assert(VP8BORDERINPIXELS_VAL, VP8BORDERINPIXELS == 32)
+#endif
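The DEFINE() entries above are evaluated at build time and emitted in a form the
assembler can include, so hand-written assembly (such as the NEON border
extension referenced above) can address YV12_BUFFER_CONFIG fields by symbolic
offset instead of hard-coded numbers; ct_assert() then fails the build if an
assumption baked into the assembly no longer holds. One common way such a
compile-time assert is implemented (a sketch, not necessarily the definition in
vpx_ports/asm_offsets.h):

/* A negative array size is a compile error, so compilation stops if cond is
 * false; the name parameter keeps the dummy symbols distinct. Some compilers
 * may warn about the unused dummy array. */
#define ct_assert(name, cond) \
  static char assert_##name[(cond) ? 1 : -1];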
--- /dev/null
+++ b/vp9/common/blockd.c
@@ -1,0 +1,29 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "blockd.h"
+#include "vpx_mem/vpx_mem.h"
+
+
+const unsigned char vp9_block2left[25] = {
+  0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+};
+const unsigned char vp9_block2above[25] = {
+  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8
+};
+
+const unsigned char vp9_block2left_8x8[25] = {
+  0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8
+};
+const unsigned char vp9_block2above_8x8[25] = {
+  0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8
+};
+
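The four tables above map the 25 per-macroblock block indices (16 Y, 4 U, 4 V,
1 Y2) to slots inside ENTROPY_CONTEXT_PLANES (y1[4], u[2], v[2], y2): for
example, Y block 5 uses left and above context 1, and block 24 maps to slot 8,
the Y2 entry, in both tables. The _8x8 variants collapse each 8x8 transform onto
the context slot of its top-left 4x4 block. A sketch of how such tables are
typically consumed (function name illustrative; types as declared in blockd.h):

/* Look up the above/left entropy contexts for block ib. */
static void get_block_contexts(MACROBLOCKD *xd, int ib,
                               ENTROPY_CONTEXT **a, ENTROPY_CONTEXT **l) {
  *a = (ENTROPY_CONTEXT *)xd->above_context + vp9_block2above[ib];
  *l = (ENTROPY_CONTEXT *)xd->left_context + vp9_block2left[ib];
}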
--- /dev/null
+++ b/vp9/common/blockd.h
@@ -1,0 +1,518 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_BLOCKD_H
+#define __INC_BLOCKD_H
+
+void vpx_log(const char *format, ...);
+
+#include "vpx_ports/config.h"
+#include "vpx_scale/yv12config.h"
+#include "mv.h"
+#include "treecoder.h"
+#include "subpixel.h"
+#include "vpx_ports/mem.h"
+#include "common.h"
+
+#define TRUE    1
+#define FALSE   0
+
+// #define MODE_STATS
+
+/*#define DCPRED 1*/
+#define DCPREDSIMTHRESH 0
+#define DCPREDCNTTHRESH 3
+
+#define MB_FEATURE_TREE_PROBS   3
+#define PREDICTION_PROBS 3
+
+#define MBSKIP_CONTEXTS 3
+
+#define MAX_MB_SEGMENTS         4
+
+#define MAX_REF_LF_DELTAS       4
+#define MAX_MODE_LF_DELTAS      4
+
+/* Segment Feature Masks */
+#define SEGMENT_DELTADATA   0
+#define SEGMENT_ABSDATA     1
+#if CONFIG_NEWBESTREFMV || CONFIG_NEW_MVREF
+#define MAX_MV_REFS 19
+#endif
+
+typedef struct {
+  int r, c;
+} POS;
+
+typedef enum PlaneType {
+  PLANE_TYPE_Y_NO_DC = 0,
+  PLANE_TYPE_Y2,
+  PLANE_TYPE_UV,
+  PLANE_TYPE_Y_WITH_DC,
+} PLANE_TYPE;
+
+typedef char ENTROPY_CONTEXT;
+typedef struct {
+  ENTROPY_CONTEXT y1[4];
+  ENTROPY_CONTEXT u[2];
+  ENTROPY_CONTEXT v[2];
+  ENTROPY_CONTEXT y2;
+} ENTROPY_CONTEXT_PLANES;
+
+extern const unsigned char vp9_block2left[25];
+extern const unsigned char vp9_block2above[25];
+extern const unsigned char vp9_block2left_8x8[25];
+extern const unsigned char vp9_block2above_8x8[25];
+
+#define VP9_COMBINEENTROPYCONTEXTS( Dest, A, B) \
+  Dest = ((A)!=0) + ((B)!=0);
+
+typedef enum {
+  KEY_FRAME = 0,
+  INTER_FRAME = 1
+} FRAME_TYPE;
+
+typedef enum {
+  SIXTAP   = 0,
+  BILINEAR = 1,
+  EIGHTTAP = 2,
+  EIGHTTAP_SHARP = 3,
+  SWITCHABLE  /* should be the last one */
+} INTERPOLATIONFILTERTYPE;
+
+typedef enum {
+  DC_PRED,            /* average of above and left pixels */
+  V_PRED,             /* vertical prediction */
+  H_PRED,             /* horizontal prediction */
+  D45_PRED,           /* Directional 45 deg prediction  [anti-clockwise from 0 deg hor] */
+  D135_PRED,          /* Directional 135 deg prediction [anti-clockwise from 0 deg hor] */
+  D117_PRED,          /* Directional 117 deg prediction [anti-clockwise from 0 deg hor] */
+  D153_PRED,          /* Directional 153 deg prediction [anti-clockwise from 0 deg hor] */
+  D27_PRED,           /* Directional 27 deg prediction  [anti-clockwise from 0 deg hor] */
+  D63_PRED,           /* Directional 63 deg prediction  [anti-clockwise from 0 deg hor] */
+  TM_PRED,            /* Truemotion prediction */
+  I8X8_PRED,          /* 8x8 based prediction, each 8x8 has its own prediction mode */
+  B_PRED,             /* block based prediction, each block has its own prediction mode */
+
+  NEARESTMV,
+  NEARMV,
+  ZEROMV,
+  NEWMV,
+  SPLITMV,
+
+  MB_MODE_COUNT
+} MB_PREDICTION_MODE;
+
+// Segment level features.
+typedef enum {
+  SEG_LVL_ALT_Q = 0,               // Use alternate Quantizer ....
+  SEG_LVL_ALT_LF = 1,              // Use alternate loop filter value...
+  SEG_LVL_REF_FRAME = 2,           // Optional Segment reference frame
+  SEG_LVL_MODE = 3,                // Optional Segment mode
+  SEG_LVL_EOB = 4,                 // EOB end stop marker.
+  SEG_LVL_TRANSFORM = 5,           // Block transform size.
+  SEG_LVL_MAX = 6                  // Number of MB level features supported
+
+} SEG_LVL_FEATURES;
+
+// Segment level features.
+typedef enum {
+  TX_4X4,                      // 4x4 dct transform
+  TX_8X8,                      // 8x8 dct transform
+  TX_16X16,                    // 16x16 dct transform
+  TX_SIZE_MAX                  // Number of different transforms available
+} TX_SIZE;
+
+typedef enum {
+  DCT_DCT   = 0,                      // DCT  in both horizontal and vertical
+  ADST_DCT  = 1,                      // ADST in vertical, DCT in horizontal
+  DCT_ADST  = 2,                      // DCT  in vertical, ADST in horizontal
+  ADST_ADST = 3                       // ADST in both directions
+} TX_TYPE;
+
+#define VP9_YMODES  (B_PRED + 1)
+#define VP9_UV_MODES (TM_PRED + 1)
+#define VP9_I8X8_MODES (TM_PRED + 1)
+#define VP9_I32X32_MODES (TM_PRED + 1)
+
+#define VP9_MVREFS (1 + SPLITMV - NEARESTMV)
+
+typedef enum {
+  B_DC_PRED,          /* average of above and left pixels */
+  B_TM_PRED,
+
+  B_VE_PRED,           /* vertical prediction */
+  B_HE_PRED,           /* horizontal prediction */
+
+  B_LD_PRED,
+  B_RD_PRED,
+
+  B_VR_PRED,
+  B_VL_PRED,
+  B_HD_PRED,
+  B_HU_PRED,
+
+  LEFT4X4,
+  ABOVE4X4,
+  ZERO4X4,
+  NEW4X4,
+
+  B_MODE_COUNT
+} B_PREDICTION_MODE;
+
+#define VP9_BINTRAMODES (B_HU_PRED + 1)  /* 10 */
+#define VP9_SUBMVREFS (1 + NEW4X4 - LEFT4X4)
+
+typedef enum {
+  PARTITIONING_16X8 = 0,
+  PARTITIONING_8X16,
+  PARTITIONING_8X8,
+  PARTITIONING_4X4,
+  NB_PARTITIONINGS,
+} SPLITMV_PARTITIONING_TYPE;
+
+/* For keyframes, intra block modes are predicted by the (already decoded)
+   modes for the Y blocks to the left and above us; for interframes, there
+   is a single probability table. */
+
+union b_mode_info {
+  struct {
+    B_PREDICTION_MODE first;
+    TX_TYPE           tx_type;
+
+#if CONFIG_COMP_INTRA_PRED
+    B_PREDICTION_MODE second;
+#endif
+  } as_mode;
+  struct {
+    int_mv first;
+    int_mv second;
+  } as_mv;
+};
+
+typedef enum {
+  INTRA_FRAME = 0,
+  LAST_FRAME = 1,
+  GOLDEN_FRAME = 2,
+  ALTREF_FRAME = 3,
+  MAX_REF_FRAMES = 4
+} MV_REFERENCE_FRAME;
+
+typedef struct {
+  MB_PREDICTION_MODE mode, uv_mode;
+#if CONFIG_COMP_INTRA_PRED
+  MB_PREDICTION_MODE second_mode, second_uv_mode;
+#endif
+  MV_REFERENCE_FRAME ref_frame, second_ref_frame;
+  TX_SIZE txfm_size;
+  int_mv mv[2]; // for each reference frame used
+#if CONFIG_NEWBESTREFMV || CONFIG_NEW_MVREF
+  int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REFS];
+#endif
+
+  SPLITMV_PARTITIONING_TYPE partitioning;
+  unsigned char mb_skip_coeff;                                /* does this mb have coefficients at all; 1=no coefficients, 0=need to decode tokens */
+  unsigned char need_to_clamp_mvs;
+  unsigned char need_to_clamp_secondmv;
+  unsigned char segment_id;                  /* Which set of segmentation parameters should be used for this MB */
+
+  // Flags used for prediction status of various bitstream signals
+  unsigned char seg_id_predicted;
+  unsigned char ref_predicted;
+
+  // Indicates if the mb is part of the image (1) vs border (0)
+  // This can be useful in determining whether the MB provides
+  // a valid predictor
+  unsigned char mb_in_image;
+
+#if CONFIG_PRED_FILTER
+  // Flag to turn the prediction signal filter on (1) / off (0) at the MB level
+  unsigned int pred_filter_enabled;
+#endif
+  INTERPOLATIONFILTERTYPE interp_filter;
+
+#if CONFIG_SUPERBLOCKS
+  // FIXME: need an SB array of 4 MB_MODE_INFOs that
+  // only needs one encoded_as_sb.
+  unsigned char encoded_as_sb;
+#endif
+} MB_MODE_INFO;
+
+typedef struct {
+  MB_MODE_INFO mbmi;
+  union b_mode_info bmi[16];
+} MODE_INFO;
+
+typedef struct blockd {
+  short *qcoeff;
+  short *dqcoeff;
+  unsigned char  *predictor;
+  short *diff;
+  short *dequant;
+
+  /* 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries */
+  unsigned char **base_pre;
+  unsigned char **base_second_pre;
+  int pre;
+  int pre_stride;
+
+  unsigned char **base_dst;
+  int dst;
+  int dst_stride;
+
+  int eob;
+
+  union b_mode_info bmi;
+} BLOCKD;
+
+typedef struct macroblockd {
+  DECLARE_ALIGNED(16, short, diff[400]);      /* from idct diff */
+  DECLARE_ALIGNED(16, unsigned char,  predictor[384]);
+  DECLARE_ALIGNED(16, short, qcoeff[400]);
+  DECLARE_ALIGNED(16, short, dqcoeff[400]);
+  DECLARE_ALIGNED(16, char,  eobs[25]);
+
+  /* 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries. */
+  BLOCKD block[25];
+  int fullpixel_mask;
+
+  YV12_BUFFER_CONFIG pre; /* Filtered copy of previous frame reconstruction */
+  struct {
+    uint8_t *y_buffer, *u_buffer, *v_buffer;
+  } second_pre;
+  YV12_BUFFER_CONFIG dst;
+
+  MODE_INFO *prev_mode_info_context;
+  MODE_INFO *mode_info_context;
+  int mode_info_stride;
+
+  FRAME_TYPE frame_type;
+
+  int up_available;
+  int left_available;
+
+  /* Y,U,V,Y2 */
+  ENTROPY_CONTEXT_PLANES *above_context;
+  ENTROPY_CONTEXT_PLANES *left_context;
+
+  /* 0 indicates segmentation at MB level is not enabled. Otherwise the individual bits indicate which features are active. */
+  unsigned char segmentation_enabled;
+
+  /* 0 (do not update) 1 (update) the macroblock segmentation map. */
+  unsigned char update_mb_segmentation_map;
+
+  /* 0 (do not update) 1 (update) the macroblock segmentation feature data. */
+  unsigned char update_mb_segmentation_data;
+
+  /* 0 = segment feature data is delta coded, 1 = absolute values (see SEGMENT_DELTADATA / SEGMENT_ABSDATA). */
+  unsigned char mb_segment_abs_delta;
+
+  /* Per-frame flags that define which MB-level features (such as quantizer or
+     loop filter level) are enabled and, when enabled, the probabilities used
+     to decode the per-MB flags in MB_MODE_INFO */
+
+  // Probability Tree used to code Segment number
+  vp9_prob mb_segment_tree_probs[MB_FEATURE_TREE_PROBS];
+
+#if CONFIG_NEW_MVREF
+  vp9_prob mb_mv_ref_id_probs[MAX_REF_FRAMES][3];
+#endif
+
+  // Segment features
+  signed char segment_feature_data[MAX_MB_SEGMENTS][SEG_LVL_MAX];
+  unsigned int segment_feature_mask[MAX_MB_SEGMENTS];
+
+  /* mode_based Loop filter adjustment */
+  unsigned char mode_ref_lf_delta_enabled;
+  unsigned char mode_ref_lf_delta_update;
+
+  /* Delta values have the range +/- MAX_LOOP_FILTER */
+  signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS];                /* 0 = Intra, Last, GF, ARF */
+  signed char ref_lf_deltas[MAX_REF_LF_DELTAS];                     /* 0 = Intra, Last, GF, ARF */
+  signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];              /* 0 = BPRED, ZERO_MV, MV, SPLIT */
+  signed char mode_lf_deltas[MAX_MODE_LF_DELTAS];                   /* 0 = BPRED, ZERO_MV, MV, SPLIT */
+
+  /* Distance of MB away from frame edges */
+  int mb_to_left_edge;
+  int mb_to_right_edge;
+  int mb_to_top_edge;
+  int mb_to_bottom_edge;
+
+  unsigned int frames_since_golden;
+  unsigned int frames_till_alt_ref_frame;
+  vp9_subpix_fn_t  subpixel_predict;
+  vp9_subpix_fn_t  subpixel_predict8x4;
+  vp9_subpix_fn_t  subpixel_predict8x8;
+  vp9_subpix_fn_t  subpixel_predict16x16;
+  vp9_subpix_fn_t  subpixel_predict_avg;
+  vp9_subpix_fn_t  subpixel_predict_avg8x4;
+  vp9_subpix_fn_t  subpixel_predict_avg8x8;
+  vp9_subpix_fn_t  subpixel_predict_avg16x16;
+  int allow_high_precision_mv;
+
+  int corrupted;
+
+#if !CONFIG_SUPERBLOCKS && (ARCH_X86 || ARCH_X86_64)
+  /* This is an intermediate buffer currently used in sub-pixel motion search
+   * to keep a copy of the reference area. This buffer can be used for other
+   * purposes.
+   */
+  DECLARE_ALIGNED(32, unsigned char, y_buf[22 * 32]);
+#endif
+
+#if CONFIG_RUNTIME_CPU_DETECT
+  struct VP9_COMMON_RTCD  *rtcd;
+#endif
+
+  int mb_index;   // Index of the MB in the SB (0..3)
+  int q_index;
+
+} MACROBLOCKD;
+
+#define ACTIVE_HT 110                // quantization stepsize threshold
+
+#define ACTIVE_HT8 300
+
+#define ACTIVE_HT16 300
+
+// convert MB_PREDICTION_MODE to B_PREDICTION_MODE
+static B_PREDICTION_MODE pred_mode_conv(MB_PREDICTION_MODE mode) {
+  B_PREDICTION_MODE b_mode = B_DC_PRED;  /* safe default if asserts are compiled out */
+  switch (mode) {
+    case DC_PRED:
+      b_mode = B_DC_PRED;
+      break;
+    case V_PRED:
+      b_mode = B_VE_PRED;
+      break;
+    case H_PRED:
+      b_mode = B_HE_PRED;
+      break;
+    case TM_PRED:
+      b_mode = B_TM_PRED;
+      break;
+    case D45_PRED:
+      b_mode = B_LD_PRED;
+      break;
+    case D135_PRED:
+      b_mode = B_RD_PRED;
+      break;
+    case D117_PRED:
+      b_mode = B_VR_PRED;
+      break;
+    case D153_PRED:
+      b_mode = B_HD_PRED;
+      break;
+    case D27_PRED:
+      b_mode = B_HU_PRED;
+      break;
+    case D63_PRED:
+      b_mode = B_VL_PRED;
+      break;
+    default :
+      // for debugging purposes; to be removed after full testing
+      assert(0);
+      break;
+  }
+  return b_mode;
+}
+
+// transform mapping
+static TX_TYPE txfm_map(B_PREDICTION_MODE bmode) {
+  // map transform type
+  TX_TYPE tx_type;
+  switch (bmode) {
+    case B_TM_PRED :
+    case B_RD_PRED :
+      tx_type = ADST_ADST;
+      break;
+
+    case B_VE_PRED :
+    case B_VR_PRED :
+      tx_type = ADST_DCT;
+      break;
+
+    case B_HE_PRED :
+    case B_HD_PRED :
+    case B_HU_PRED :
+      tx_type = DCT_ADST;
+      break;
+
+    default :
+      tx_type = DCT_DCT;
+      break;
+  }
+  return tx_type;
+}
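+
+/* Illustrative note (not part of the original patch): the mapping above
+ * follows the hybrid-transform rationale that intra residuals tend to grow
+ * with distance from the predicted edge, which the ADST models better than
+ * the DCT: modes predicting from above take ADST vertically (ADST_DCT),
+ * modes predicting from the left take it horizontally (DCT_ADST), and
+ * TM/down-right modes take it both ways. A self-check sketch, behind a
+ * hypothetical guard: */
+#ifdef VP9_TXFM_MAP_SELFTEST
+static void txfm_map_selftest(void) {
+  assert(txfm_map(B_VE_PRED) == ADST_DCT);   /* predicted from above */
+  assert(txfm_map(B_HE_PRED) == DCT_ADST);   /* predicted from the left */
+  assert(txfm_map(B_TM_PRED) == ADST_ADST);  /* predicted from both edges */
+  assert(txfm_map(B_LD_PRED) == DCT_DCT);    /* remaining modes use the DCT */
+}
+#endif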
+
+static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, const BLOCKD *b) {
+  TX_TYPE tx_type = DCT_DCT;
+  if (xd->mode_info_context->mbmi.mode == B_PRED &&
+      xd->q_index < ACTIVE_HT) {
+    tx_type = txfm_map(b->bmi.as_mode.first);
+  }
+  return tx_type;
+}
+
+static TX_TYPE get_tx_type_8x8(const MACROBLOCKD *xd, const BLOCKD *b) {
+  TX_TYPE tx_type = DCT_DCT;
+  if (xd->mode_info_context->mbmi.mode == I8X8_PRED &&
+      xd->q_index < ACTIVE_HT8) {
+    tx_type = txfm_map(pred_mode_conv(b->bmi.as_mode.first));
+  }
+  return tx_type;
+}
+
+static TX_TYPE get_tx_type_16x16(const MACROBLOCKD *xd, const BLOCKD *b) {
+  TX_TYPE tx_type = DCT_DCT;
+  if (xd->mode_info_context->mbmi.mode < I8X8_PRED &&
+      xd->q_index < ACTIVE_HT16) {
+    tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode));
+  }
+  return tx_type;
+}
+
+static TX_TYPE get_tx_type(const MACROBLOCKD *xd, const BLOCKD *b) {
+  TX_TYPE tx_type = DCT_DCT;
+  int ib = (b - xd->block);
+  if (ib >= 16)
+    return tx_type;
+  if (xd->mode_info_context->mbmi.txfm_size == TX_16X16) {
+    tx_type = get_tx_type_16x16(xd, b);
+  }
+  if (xd->mode_info_context->mbmi.txfm_size  == TX_8X8) {
+    ib = (ib & 8) + ((ib & 4) >> 1);
+    tx_type = get_tx_type_8x8(xd, &xd->block[ib]);
+  }
+  if (xd->mode_info_context->mbmi.txfm_size  == TX_4X4) {
+    tx_type = get_tx_type_4x4(xd, b);
+  }
+  return tx_type;
+}
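+
+/* Illustrative note (not part of the original patch): for TX_8X8, each 8x8
+ * transform's coefficients span four consecutive 4x4 block slots, and the
+ * remap (ib & 8) + ((ib & 4) >> 1) above collapses each run of four onto
+ * the slot that stores that 8x8 block's prediction mode:
+ *   0..3 -> 0,   4..7 -> 2,   8..11 -> 8,   12..15 -> 10.
+ * A self-check sketch, behind a hypothetical guard: */
+#ifdef VP9_TX8X8_REMAP_SELFTEST
+static void tx8x8_remap_selftest(void) {
+  static const int rep[4] = { 0, 2, 8, 10 };
+  int ib;
+  for (ib = 0; ib < 16; ib++)
+    assert(((ib & 8) + ((ib & 4) >> 1)) == rep[ib >> 2]);
+}
+#endif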
+
+extern void vp9_build_block_doffsets(MACROBLOCKD *xd);
+extern void vp9_setup_block_dptrs(MACROBLOCKD *xd);
+
+static void update_blockd_bmi(MACROBLOCKD *xd) {
+  int i;
+  int is_4x4;
+  is_4x4 = (xd->mode_info_context->mbmi.mode == SPLITMV) ||
+           (xd->mode_info_context->mbmi.mode == I8X8_PRED) ||
+           (xd->mode_info_context->mbmi.mode == B_PRED);
+
+  if (is_4x4) {
+    for (i = 0; i < 16; i++) {
+      xd->block[i].bmi = xd->mode_info_context->bmi[i];
+    }
+  }
+}
+#endif  /* __INC_BLOCKD_H */
--- /dev/null
+++ b/vp9/common/coefupdateprobs.h
@@ -1,0 +1,16 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/* Update probabilities for the nodes in the token entropy tree.
+   Generated file included by entropy.c */
+#define COEF_UPDATE_PROB 252
+#define COEF_UPDATE_PROB_8X8 252
+#define COEF_UPDATE_PROB_16X16 252
--- /dev/null
+++ b/vp9/common/common.h
@@ -1,0 +1,41 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef common_h
+#define common_h 1
+
+#include <assert.h>
+#include "vpx_config.h"
+/* Interface header for common constant data structures and lookup tables */
+
+#include "vpx_mem/vpx_mem.h"
+
+#include "common_types.h"
+
+/* Only needed for fixed-size arrays; for structs, just assign. */
+
+#define vp9_copy( Dest, Src) { \
+    assert( sizeof( Dest) == sizeof( Src)); \
+    vpx_memcpy( Dest, Src, sizeof( Src)); \
+  }
+
+/* Use this for variably-sized arrays. */
+
+#define vp9_copy_array( Dest, Src, N) { \
+    assert( sizeof( *Dest) == sizeof( *Src)); \
+    vpx_memcpy( Dest, Src, N * sizeof( *Src)); \
+  }
+
+#define vp9_zero( Dest)  vpx_memset( &Dest, 0, sizeof( Dest));
+
+#define vp9_zero_array( Dest, N)  vpx_memset( Dest, 0, N * sizeof( *Dest));
+
+#endif  /* common_h */
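Because vp9_copy and vp9_zero take sizeof of the destination expression, they
only work when the argument is a genuine array (or struct object); passed a
pointer, the assert in vp9_copy fires because sizeof(pointer) differs from
sizeof(array). vp9_copy_array exists for exactly that case and checks element
sizes instead. A short usage sketch (names hypothetical):

static void copy_macro_examples(void) {
  int dst[16], src[16] = { 0 };
  int *p = dst;

  vp9_copy(dst, src);          /* whole-array copy; total sizes asserted   */
  vp9_zero(dst);               /* zero the entire array                    */
  vp9_copy_array(p, src, 16);  /* pointer is fine here: element size check */
}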
--- /dev/null
+++ b/vp9/common/common_types.h
@@ -1,0 +1,18 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_COMMON_TYPES
+#define __INC_COMMON_TYPES
+
+#define TRUE    1
+#define FALSE   0
+
+#endif
--- /dev/null
+++ b/vp9/common/context.c
@@ -1,0 +1,397 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "entropy.h"
+
+/* *** GENERATED FILE: DO NOT EDIT *** */
+
+#if 0
+int Contexts[vp8_coef_counter_dimen];
+
+const int default_contexts[vp8_coef_counter_dimen] = {
+  {
+    // Block Type ( 0 )
+    {
+      // Coeff Band ( 0 )
+      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
+      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
+      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
+    },
+    {
+      // Coeff Band ( 1 )
+      {30190, 26544, 225,  24,   4,   0,   0,   0,   0,   0,   0, 4171593},
+      {26846, 25157, 1241, 130,  26,   6,   1,   0,   0,   0,   0, 149987},
+      {10484, 9538, 1006, 160,  36,  18,   0,   0,   0,   0,   0, 15104},
+    },
+    {
+      // Coeff Band ( 2 )
+      {25842, 40456, 1126,  83,  11,   2,   0,   0,   0,   0,   0,   0},
+      {9338, 8010, 512,  73,   7,   3,   2,   0,   0,   0,   0, 43294},
+      {1047, 751, 149,  31,  13,   6,   1,   0,   0,   0,   0, 879},
+    },
+    {
+      // Coeff Band ( 3 )
+      {26136, 9826, 252,  13,   0,   0,   0,   0,   0,   0,   0,   0},
+      {8134, 5574, 191,  14,   2,   0,   0,   0,   0,   0,   0, 35302},
+      { 605, 677, 116,   9,   1,   0,   0,   0,   0,   0,   0, 611},
+    },
+    {
+      // Coeff Band ( 4 )
+      {10263, 15463, 283,  17,   0,   0,   0,   0,   0,   0,   0,   0},
+      {2773, 2191, 128,   9,   2,   2,   0,   0,   0,   0,   0, 10073},
+      { 134, 125,  32,   4,   0,   2,   0,   0,   0,   0,   0,  50},
+    },
+    {
+      // Coeff Band ( 5 )
+      {10483, 2663,  23,   1,   0,   0,   0,   0,   0,   0,   0,   0},
+      {2137, 1251,  27,   1,   1,   0,   0,   0,   0,   0,   0, 14362},
+      { 116, 156,  14,   2,   1,   0,   0,   0,   0,   0,   0, 190},
+    },
+    {
+      // Coeff Band ( 6 )
+      {40977, 27614, 412,  28,   0,   0,   0,   0,   0,   0,   0,   0},
+      {6113, 5213, 261,  22,   3,   0,   0,   0,   0,   0,   0, 26164},
+      { 382, 312,  50,  14,   2,   0,   0,   0,   0,   0,   0, 345},
+    },
+    {
+      // Coeff Band ( 7 )
+      {   0,  26,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
+      {   0,  13,   0,   0,   0,   0,   0,   0,   0,   0,   0, 319},
+      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   8},
+    },
+  },
+  {
+    // Block Type ( 1 )
+    {
+      // Coeff Band ( 0 )
+      {3268, 19382, 1043, 250,  93,  82,  49,  26,  17,   8,  25, 82289},
+      {8758, 32110, 5436, 1832, 827, 668, 420, 153,  24,   0,   3, 52914},
+      {9337, 23725, 8487, 3954, 2107, 1836, 1069, 399,  59,   0,   0, 18620},
+    },
+    {
+      // Coeff Band ( 1 )
+      {12419, 8420, 452,  62,   9,   1,   0,   0,   0,   0,   0,   0},
+      {11715, 8705, 693,  92,  15,   7,   2,   0,   0,   0,   0, 53988},
+      {7603, 8585, 2306, 778, 270, 145,  39,   5,   0,   0,   0, 9136},
+    },
+    {
+      // Coeff Band ( 2 )
+      {15938, 14335, 1207, 184,  55,  13,   4,   1,   0,   0,   0,   0},
+      {7415, 6829, 1138, 244,  71,  26,   7,   0,   0,   0,   0, 9980},
+      {1580, 1824, 655, 241,  89,  46,  10,   2,   0,   0,   0, 429},
+    },
+    {
+      // Coeff Band ( 3 )
+      {19453, 5260, 201,  19,   0,   0,   0,   0,   0,   0,   0,   0},
+      {9173, 3758, 213,  22,   1,   1,   0,   0,   0,   0,   0, 9820},
+      {1689, 1277, 276,  51,  17,   4,   0,   0,   0,   0,   0, 679},
+    },
+    {
+      // Coeff Band ( 4 )
+      {12076, 10667, 620,  85,  19,   9,   5,   0,   0,   0,   0,   0},
+      {4665, 3625, 423,  55,  19,   9,   0,   0,   0,   0,   0, 5127},
+      { 415, 440, 143,  34,  20,   7,   2,   0,   0,   0,   0, 101},
+    },
+    {
+      // Coeff Band ( 5 )
+      {12183, 4846, 115,  11,   1,   0,   0,   0,   0,   0,   0,   0},
+      {4226, 3149, 177,  21,   2,   0,   0,   0,   0,   0,   0, 7157},
+      { 375, 621, 189,  51,  11,   4,   1,   0,   0,   0,   0, 198},
+    },
+    {
+      // Coeff Band ( 6 )
+      {61658, 37743, 1203,  94,  10,   3,   0,   0,   0,   0,   0,   0},
+      {15514, 11563, 903, 111,  14,   5,   0,   0,   0,   0,   0, 25195},
+      { 929, 1077, 291,  78,  14,   7,   1,   0,   0,   0,   0, 507},
+    },
+    {
+      // Coeff Band ( 7 )
+      {   0, 990,  15,   3,   0,   0,   0,   0,   0,   0,   0,   0},
+      {   0, 412,  13,   0,   0,   0,   0,   0,   0,   0,   0, 1641},
+      {   0,  18,   7,   1,   0,   0,   0,   0,   0,   0,   0,  30},
+    },
+  },
+  {
+    // Block Type ( 2 )
+    {
+      // Coeff Band ( 0 )
+      { 953, 24519, 628, 120,  28,  12,   4,   0,   0,   0,   0, 2248798},
+      {1525, 25654, 2647, 617, 239, 143,  42,   5,   0,   0,   0, 66837},
+      {1180, 11011, 3001, 1237, 532, 448, 239,  54,   5,   0,   0, 7122},
+    },
+    {
+      // Coeff Band ( 1 )
+      {1356, 2220,  67,  10,   4,   1,   0,   0,   0,   0,   0,   0},
+      {1450, 2544, 102,  18,   4,   3,   0,   0,   0,   0,   0, 57063},
+      {1182, 2110, 470, 130,  41,  21,   0,   0,   0,   0,   0, 6047},
+    },
+    {
+      // Coeff Band ( 2 )
+      { 370, 3378, 200,  30,   5,   4,   1,   0,   0,   0,   0,   0},
+      { 293, 1006, 131,  29,  11,   0,   0,   0,   0,   0,   0, 5404},
+      { 114, 387,  98,  23,   4,   8,   1,   0,   0,   0,   0, 236},
+    },
+    {
+      // Coeff Band ( 3 )
+      { 579, 194,   4,   0,   0,   0,   0,   0,   0,   0,   0,   0},
+      { 395, 213,   5,   1,   0,   0,   0,   0,   0,   0,   0, 4157},
+      { 119, 122,   4,   0,   0,   0,   0,   0,   0,   0,   0, 300},
+    },
+    {
+      // Coeff Band ( 4 )
+      {  38, 557,  19,   0,   0,   0,   0,   0,   0,   0,   0,   0},
+      {  21, 114,  12,   1,   0,   0,   0,   0,   0,   0,   0, 427},
+      {   0,   5,   0,   0,   0,   0,   0,   0,   0,   0,   0,   7},
+    },
+    {
+      // Coeff Band ( 5 )
+      {  52,   7,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
+      {  18,   6,   0,   0,   0,   0,   0,   0,   0,   0,   0, 652},
+      {   1,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,  30},
+    },
+    {
+      // Coeff Band ( 6 )
+      { 640, 569,  10,   0,   0,   0,   0,   0,   0,   0,   0,   0},
+      {  25,  77,   2,   0,   0,   0,   0,   0,   0,   0,   0, 517},
+      {   4,   7,   0,   0,   0,   0,   0,   0,   0,   0,   0,   3},
+    },
+    {
+      // Coeff Band ( 7 )
+      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
+      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
+      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
+    },
+  },
+  {
+    // Block Type ( 3 )
+    {
+      // Coeff Band ( 0 )
+      {2506, 20161, 2707, 767, 261, 178, 107,  30,  14,   3,   0, 100694},
+      {8806, 36478, 8817, 3268, 1280, 850, 401, 114,  42,   0,   0, 58572},
+      {11003, 27214, 11798, 5716, 2482, 2072, 1048, 175,  32,   0,   0, 19284},
+    },
+    {
+      // Coeff Band ( 1 )
+      {9738, 11313, 959, 205,  70,  18,  11,   1,   0,   0,   0,   0},
+      {12628, 15085, 1507, 273,  52,  19,   9,   0,   0,   0,   0, 54280},
+      {10701, 15846, 5561, 1926, 813, 570, 249,  36,   0,   0,   0, 6460},
+    },
+    {
+      // Coeff Band ( 2 )
+      {6781, 22539, 2784, 634, 182, 123,  20,   4,   0,   0,   0,   0},
+      {6263, 11544, 2649, 790, 259, 168,  27,   5,   0,   0,   0, 20539},
+      {3109, 4075, 2031, 896, 457, 386, 158,  29,   0,   0,   0, 1138},
+    },
+    {
+      // Coeff Band ( 3 )
+      {11515, 4079, 465,  73,   5,  14,   2,   0,   0,   0,   0,   0},
+      {9361, 5834, 650,  96,  24,   8,   4,   0,   0,   0,   0, 22181},
+      {4343, 3974, 1360, 415, 132,  96,  14,   1,   0,   0,   0, 1267},
+    },
+    {
+      // Coeff Band ( 4 )
+      {4787, 9297, 823, 168,  44,  12,   4,   0,   0,   0,   0,   0},
+      {3619, 4472, 719, 198,  60,  31,   3,   0,   0,   0,   0, 8401},
+      {1157, 1175, 483, 182,  88,  31,   8,   0,   0,   0,   0, 268},
+    },
+    {
+      // Coeff Band ( 5 )
+      {8299, 1226,  32,   5,   1,   0,   0,   0,   0,   0,   0,   0},
+      {3502, 1568,  57,   4,   1,   1,   0,   0,   0,   0,   0, 9811},
+      {1055, 1070, 166,  29,   6,   1,   0,   0,   0,   0,   0, 527},
+    },
+    {
+      // Coeff Band ( 6 )
+      {27414, 27927, 1989, 347,  69,  26,   0,   0,   0,   0,   0,   0},
+      {5876, 10074, 1574, 341,  91,  24,   4,   0,   0,   0,   0, 21954},
+      {1571, 2171, 778, 324, 124,  65,  16,   0,   0,   0,   0, 979},
+    },
+    {
+      // Coeff Band ( 7 )
+      {   0,  29,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
+      {   0,  23,   0,   0,   0,   0,   0,   0,   0,   0,   0, 459},
+      {   0,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,  13},
+    },
+  },
+};
+
+// Update probabilities for the nodes in the token entropy tree.
+const vp9_prob tree_update_probs[vp9_coef_tree_dimen] = {
+  {
+    {
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255, },
+      {250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255, },
+      {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+  },
+  {
+    {
+      {217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255, },
+      {234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255, },
+    },
+    {
+      {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+  },
+  {
+    {
+      {186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255, },
+      {251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255, },
+    },
+    {
+      {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+  },
+  {
+    {
+      {248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255, },
+      {248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+  },
+};
+#endif
--- /dev/null
+++ b/vp9/common/debugmodes.c
@@ -1,0 +1,146 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+#include "blockd.h"
+
+void vp9_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols,
+                                        int frame) {
+  int mb_row;
+  int mb_col;
+  int mb_index = 0;
+  FILE *mvs = fopen("mvs.stt", "a");
+
+  /* print out the macroblock Y modes */
+  mb_index = 0;
+  fprintf(mvs, "Mb Modes for Frame %d\n", frame);
+
+  for (mb_row = 0; mb_row < rows; mb_row++) {
+    for (mb_col = 0; mb_col < cols; mb_col++) {
+
+      fprintf(mvs, "%2d ", mi[mb_index].mbmi.mode);
+
+      mb_index++;
+    }
+
+    fprintf(mvs, "\n");
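+    /* mode_info rows are (cols + 1) entries wide; skip the border entry */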
+    mb_index++;
+  }
+
+  fprintf(mvs, "\n");
+
+  mb_index = 0;
+  fprintf(mvs, "Mb mv ref for Frame %d\n", frame);
+
+  for (mb_row = 0; mb_row < rows; mb_row++) {
+    for (mb_col = 0; mb_col < cols; mb_col++) {
+
+      fprintf(mvs, "%2d ", mi[mb_index].mbmi.ref_frame);
+
+      mb_index++;
+    }
+
+    fprintf(mvs, "\n");
+    mb_index++;
+  }
+
+  fprintf(mvs, "\n");
+
+  /* print out the macroblock UV modes */
+  mb_index = 0;
+  fprintf(mvs, "UV Modes for Frame %d\n", frame);
+
+  for (mb_row = 0; mb_row < rows; mb_row++) {
+    for (mb_col = 0; mb_col < cols; mb_col++) {
+
+      fprintf(mvs, "%2d ", mi[mb_index].mbmi.uv_mode);
+
+      mb_index++;
+    }
+
+    mb_index++;
+    fprintf(mvs, "\n");
+  }
+
+  fprintf(mvs, "\n");
+
+  /* print out the block modes */
+  mb_index = 0;
+  fprintf(mvs, "Mbs for Frame %d\n", frame);
+  {
+    int b_row;
+
+    for (b_row = 0; b_row < 4 * rows; b_row++) {
+      int b_col;
+      int bindex;
+
+      for (b_col = 0; b_col < 4 * cols; b_col++) {
+        mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
+        bindex = (b_row & 3) * 4 + (b_col & 3);
+
+        if (mi[mb_index].mbmi.mode == B_PRED) {
+          fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].as_mode.first);
+#if CONFIG_COMP_INTRA_PRED
+          fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].as_mode.second);
+#endif
+        } else
+          fprintf(mvs, "xx ");
+
+      }
+
+      fprintf(mvs, "\n");
+    }
+  }
+  fprintf(mvs, "\n");
+
+  /* print out the macroblock mvs */
+  mb_index = 0;
+  fprintf(mvs, "MVs for Frame %d\n", frame);
+
+  for (mb_row = 0; mb_row < rows; mb_row++) {
+    for (mb_col = 0; mb_col < cols; mb_col++) {
+      fprintf(mvs, "%5d:%-5d", mi[mb_index].mbmi.mv[0].as_mv.row / 2,
+          mi[mb_index].mbmi.mv[0].as_mv.col / 2);
+
+      mb_index++;
+    }
+
+    mb_index++;
+    fprintf(mvs, "\n");
+  }
+
+  fprintf(mvs, "\n");
+
+  /* print out the block mvs */
+  mb_index = 0;
+  fprintf(mvs, "MVs for Frame %d\n", frame);
+  {
+    int b_row;
+
+    for (b_row = 0; b_row < 4 * rows; b_row++) {
+      int b_col;
+      int bindex;
+
+      for (b_col = 0; b_col < 4 * cols; b_col++) {
+        mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
+        bindex = (b_row & 3) * 4 + (b_col & 3);
+        fprintf(mvs, "%3d:%-3d ",
+                mi[mb_index].bmi[bindex].as_mv.first.as_mv.row,
+                mi[mb_index].bmi[bindex].as_mv.first.as_mv.col);
+
+      }
+
+      fprintf(mvs, "\n");
+    }
+  }
+  fprintf(mvs, "\n");
+
+  fclose(mvs);
+}
--- /dev/null
+++ b/vp9/common/default_coef_probs.h
@@ -1,0 +1,1377 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+*/
+
+
+/*Generated file, included by entropy.c*/
+
+
+static const vp9_prob default_coef_probs [BLOCK_TYPES]
+                                         [COEF_BANDS]
+                                         [PREV_COEF_CONTEXTS]
+                                         [ENTROPY_NODES] = {
+  {
+    /* Block Type ( 0 ) */
+    {
+      /* Coeff Band ( 0 )*/
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 1 )*/
+      { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 },
+      { 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 },
+      { 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 },
+      { 90, 116, 227, 252, 214, 209, 255, 255, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 2 )*/
+      {   1,  98, 248, 255, 236, 226, 255, 255, 128, 128, 128 },
+      { 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 },
+      {  78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 },
+      {  64, 128, 202, 247, 198, 180, 255, 219, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 3 )*/
+      {   1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 },
+      { 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 },
+      {  77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 },
+      {  64, 100, 216, 255, 236, 230, 128, 128, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 4 )*/
+      {   1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 },
+      { 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 },
+      {  37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 },
+      {  28, 110, 196, 243, 228, 255, 255, 255, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 5 )*/
+      {   1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 },
+      { 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 },
+      { 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 },
+      { 90, 90, 231, 255, 211, 171, 128, 128, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 6 )*/
+      {   1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 },
+      { 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 },
+      {  80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 },
+      {  64, 120, 211, 255, 194, 224, 128, 128, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 7 )*/
+      {   1,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 246,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+    }
+  },
+  {
+    /* Block Type ( 1 ) */
+    {
+      /* Coeff Band ( 0 )*/
+      { 198,  35, 237, 223, 193, 187, 162, 160, 145, 155,  62 },
+      { 131,  45, 198, 221, 172, 176, 220, 157, 252, 221,   1 },
+      {  68,  47, 146, 208, 149, 167, 221, 162, 255, 223, 128 },
+      {  48,  32, 146, 208, 149, 167, 221, 162, 255, 223, 128 },
+    },
+    {
+      /* Coeff Band ( 1 )*/
+      {   1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 },
+      { 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 },
+      {  81,  99, 181, 242, 176, 190, 249, 202, 255, 255, 128 },
+      {  66,  90, 181, 242, 176, 190, 249, 202, 255, 255, 128 },
+    },
+    {
+      /* Coeff Band ( 2 )*/
+      {   1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 },
+      {  99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 },
+      {  23,  91, 163, 242, 170, 187, 247, 210, 255, 255, 128 },
+      {  18,  80, 163, 242, 170, 187, 247, 210, 255, 255, 128 },
+    },
+    {
+      /* Coeff Band ( 3 )*/
+      {   1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 },
+      { 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 },
+      {  44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 },
+      {  36, 120, 201, 253, 205, 192, 255, 255, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 4 )*/
+      {   1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 },
+      {  94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 },
+      {  22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 },
+      {  18, 90, 174, 245, 186, 161, 255, 199, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 5 )*/
+      {   1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 },
+      { 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 },
+      {  35,  77, 181, 251, 193, 211, 255, 205, 128, 128, 128 },
+      {  28,  70, 181, 251, 193, 211, 255, 205, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 6 )*/
+      {   1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 },
+      { 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 },
+      {  45,  99, 188, 251, 195, 217, 255, 224, 128, 128, 128 },
+      {  40,  90, 188, 251, 195, 217, 255, 224, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 7 )*/
+      {   1,   1, 251, 255, 213, 255, 128, 128, 128, 128, 128 },
+      { 203,   1, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+      { 137,   1, 177, 255, 224, 255, 128, 128, 128, 128, 128 },
+      { 137,   1, 177, 255, 224, 255, 128, 128, 128, 128, 128 },
+    }
+  },
+  {
+    /* Block Type ( 2 ) */
+    {
+      /* Coeff Band ( 0 )*/
+      { 253,   9, 248, 251, 207, 208, 255, 192, 128, 128, 128 },
+      { 175,  13, 224, 243, 193, 185, 249, 198, 255, 255, 128 },
+      {  73,  17, 171, 221, 161, 179, 236, 167, 255, 234, 128 },
+      {  64,  17, 171, 221, 161, 179, 236, 167, 255, 234, 128 },
+    },
+    {
+      /* Coeff Band ( 1 )*/
+      {   1,  95, 247, 253, 212, 183, 255, 255, 128, 128, 128 },
+      { 239,  90, 244, 250, 211, 209, 255, 255, 128, 128, 128 },
+      { 155,  77, 195, 248, 188, 195, 255, 255, 128, 128, 128 },
+      { 140,  70, 195, 248, 188, 195, 255, 255, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 2 )*/
+      {   1,  24, 239, 251, 218, 219, 255, 205, 128, 128, 128 },
+      { 201,  51, 219, 255, 196, 186, 128, 128, 128, 128, 128 },
+      {  69,  46, 190, 239, 201, 218, 255, 228, 128, 128, 128 },
+      {  60,  40, 190, 239, 201, 218, 255, 228, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 3 )*/
+      {   1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 },
+      { 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 },
+      { 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+      { 132, 118, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 4 )*/
+      {   1,  16, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+      { 190,  36, 230, 255, 236, 255, 128, 128, 128, 128, 128 },
+      { 149,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 149,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 5 )*/
+      {   1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 6 )*/
+      {   1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 },
+      { 213,  62, 250, 255, 255, 128, 128, 128, 128, 128, 128 },
+      {  55,  93, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      {  48,  85, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 7 )*/
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+    }
+  },
+  {
+    /* Block Type ( 3 ) */
+    {
+      /* Coeff Band ( 0 )*/
+      { 202,  24, 213, 235, 186, 191, 220, 160, 240, 175, 255 },
+      { 126,  38, 182, 232, 169, 184, 228, 174, 255, 187, 128 },
+      {  63,  48, 138, 219, 151, 178, 240, 170, 255, 216, 128 },
+      {  54,  40, 138, 219, 151, 178, 240, 170, 255, 216, 128 },
+    },
+    {
+      /* Coeff Band ( 1 )*/
+      {   1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 },
+      { 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 },
+      {  44,  84, 162, 232, 172, 180, 245, 178, 255, 255, 128 },
+      {  32,  70, 162, 232, 172, 180, 245, 178, 255, 255, 128 },
+    },
+    {
+      /* Coeff Band ( 2 )*/
+      {   1,  52, 220, 246, 198, 199, 249, 220, 255, 255, 128 },
+      { 124,  74, 191, 243, 183, 193, 250, 221, 255, 255, 128 },
+      {  24,  71, 130, 219, 154, 170, 243, 182, 255, 255, 128 },
+      {  24,  71, 130, 219, 154, 170, 243, 182, 255, 255, 128 },
+    },
+    {
+      /* Coeff Band ( 3 )*/
+      {   1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 },
+      { 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 },
+      {  28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 },
+      {  26, 104, 170, 242, 183, 194, 254, 223, 255, 255, 128 },
+    },
+    {
+      /* Coeff Band ( 4 )*/
+      {   1,  81, 230, 252, 204, 203, 255, 192, 128, 128, 128 },
+      { 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 },
+      {  20,  95, 153, 243, 164, 173, 255, 203, 128, 128, 128 },
+      {  20,  95, 153, 243, 164, 173, 255, 203, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 5 )*/
+      {   1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 },
+      { 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 },
+      {  47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 },
+      {  47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 6 )*/
+      {   1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 },
+      { 141,  84, 213, 252, 201, 202, 255, 219, 128, 128, 128 },
+      {  42,  80, 160, 240, 162, 185, 255, 205, 128, 128, 128 },
+      {  42,  80, 160, 240, 162, 185, 255, 205, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 7 )*/
+      {   1,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 244,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 238,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 238,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+    }
+  }
+};
+
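+/* Layout note: the coefficient-probability tables in this file are indexed
+ * as [block type][coefficient band][previous-token context][entropy node].
+ * Entries are 8-bit branch probabilities (1..255) for the ENTROPY_NODES
+ * internal nodes of vp9_coef_tree; 128 (an even split) pads node positions
+ * that are effectively unused in a given context. */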
+static const vp9_prob default_hybrid_coef_probs [BLOCK_TYPES]
+                                                [COEF_BANDS]
+                                                [PREV_COEF_CONTEXTS]
+                                                [ENTROPY_NODES] = {
+  {
+    /* Block Type ( 0 ) */
+    {
+      /* Coeff Band ( 0 )*/
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 1 )*/
+      { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 },
+      { 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 },
+      { 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 },
+      {  90, 116, 227, 252, 214, 209, 255, 255, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 2 )*/
+      {   1,  98, 248, 255, 236, 226, 255, 255, 128, 128, 128 },
+      { 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 },
+      {  78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 },
+      {  64, 128, 202, 247, 198, 180, 255, 219, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 3 )*/
+      {   1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 },
+      { 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 },
+      {  77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 },
+      {  64, 100, 216, 255, 236, 230, 128, 128, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 4 )*/
+      {   1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 },
+      { 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 },
+      {  37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 },
+      {  28, 110, 196, 243, 228, 255, 255, 255, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 5 )*/
+      {   1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 },
+      { 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 },
+      { 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 },
+      {  90,  90, 231, 255, 211, 171, 128, 128, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 6 )*/
+      {   1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 },
+      { 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 },
+      {  80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 },
+      {  64, 120, 211, 255, 194, 224, 128, 128, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 7 )*/
+      {   1,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 246,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+    }
+  },
+  {
+    /* Block Type ( 1 ) */
+    {
+      /* Coeff Band ( 0 )*/
+      { 198,  35, 237, 223, 193, 187, 162, 160, 145, 155,  62 },
+      { 131,  45, 198, 221, 172, 176, 220, 157, 252, 221,   1 },
+      {  68,  47, 146, 208, 149, 167, 221, 162, 255, 223, 128 },
+      {  48,  32, 146, 208, 149, 167, 221, 162, 255, 223, 128 },
+    },
+    {
+      /* Coeff Band ( 1 )*/
+      {   1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 },
+      { 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 },
+      {  81,  99, 181, 242, 176, 190, 249, 202, 255, 255, 128 },
+      {  66,  90, 181, 242, 176, 190, 249, 202, 255, 255, 128 },
+    },
+    {
+      /* Coeff Band ( 2 )*/
+      {   1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 },
+      {  99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 },
+      {  23,  91, 163, 242, 170, 187, 247, 210, 255, 255, 128 },
+      {  18,  80, 163, 242, 170, 187, 247, 210, 255, 255, 128 },
+    },
+    {
+      /* Coeff Band ( 3 )*/
+      {   1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 },
+      { 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 },
+      {  44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 },
+      {  36, 120, 201, 253, 205, 192, 255, 255, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 4 )*/
+      {   1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 },
+      {  94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 },
+      {  22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 },
+      {  18,  90, 174, 245, 186, 161, 255, 199, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 5 )*/
+      {   1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 },
+      { 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 },
+      {  35,  77, 181, 251, 193, 211, 255, 205, 128, 128, 128 },
+      {  28,  70, 181, 251, 193, 211, 255, 205, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 6 )*/
+      {   1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 },
+      { 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 },
+      {  45,  99, 188, 251, 195, 217, 255, 224, 128, 128, 128 },
+      {  40,  90, 188, 251, 195, 217, 255, 224, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 7 )*/
+      {   1,   1, 251, 255, 213, 255, 128, 128, 128, 128, 128 },
+      { 203,   1, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+      { 137,   1, 177, 255, 224, 255, 128, 128, 128, 128, 128 },
+      { 137,   1, 177, 255, 224, 255, 128, 128, 128, 128, 128 },
+    }
+  },
+  {
+    /* Block Type ( 2 ) */
+    {
+      /* Coeff Band ( 0 )*/
+      { 253,   9, 248, 251, 207, 208, 255, 192, 128, 128, 128 },
+      { 175,  13, 224, 243, 193, 185, 249, 198, 255, 255, 128 },
+      {  73,  17, 171, 221, 161, 179, 236, 167, 255, 234, 128 },
+      {  64,  17, 171, 221, 161, 179, 236, 167, 255, 234, 128 },
+    },
+    {
+      /* Coeff Band ( 1 )*/
+      {   1,  95, 247, 253, 212, 183, 255, 255, 128, 128, 128 },
+      { 239,  90, 244, 250, 211, 209, 255, 255, 128, 128, 128 },
+      { 155,  77, 195, 248, 188, 195, 255, 255, 128, 128, 128 },
+      { 140,  70, 195, 248, 188, 195, 255, 255, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 2 )*/
+      {   1,  24, 239, 251, 218, 219, 255, 205, 128, 128, 128 },
+      { 201,  51, 219, 255, 196, 186, 128, 128, 128, 128, 128 },
+      {  69,  46, 190, 239, 201, 218, 255, 228, 128, 128, 128 },
+      {  60,  40, 190, 239, 201, 218, 255, 228, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 3 )*/
+      {   1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 },
+      { 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 },
+      { 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+      { 132, 118, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 4 )*/
+      {   1,  16, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+      { 190,  36, 230, 255, 236, 255, 128, 128, 128, 128, 128 },
+      { 149,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 149,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 5 )*/
+      {   1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 6 )*/
+      {   1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 },
+      { 213,  62, 250, 255, 255, 128, 128, 128, 128, 128, 128 },
+      {  55,  93, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      {  48,  85, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 7 )*/
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+    }
+  },
+  {
+    /* Block Type ( 3 ) */
+    {
+      /* Coeff Band ( 0 )*/
+      { 202,  24, 213, 235, 186, 191, 220, 160, 240, 175, 255 },
+      { 126,  38, 182, 232, 169, 184, 228, 174, 255, 187, 128 },
+      {  63,  48, 138, 219, 151, 178, 240, 170, 255, 216, 128 },
+      {  54,  40, 138, 219, 151, 178, 240, 170, 255, 216, 128 },
+    },
+    {
+      /* Coeff Band ( 1 )*/
+      {   1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 },
+      { 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 },
+      {  44,  84, 162, 232, 172, 180, 245, 178, 255, 255, 128 },
+      {  32,  70, 162, 232, 172, 180, 245, 178, 255, 255, 128 },
+    },
+    {
+      /* Coeff Band ( 2 )*/
+      {   1,  52, 220, 246, 198, 199, 249, 220, 255, 255, 128 },
+      { 124,  74, 191, 243, 183, 193, 250, 221, 255, 255, 128 },
+      {  24,  71, 130, 219, 154, 170, 243, 182, 255, 255, 128 },
+      {  24,  71, 130, 219, 154, 170, 243, 182, 255, 255, 128 },
+    },
+    {
+      /* Coeff Band ( 3 )*/
+      {   1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 },
+      { 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 },
+      {  28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 },
+      {  26, 104, 170, 242, 183, 194, 254, 223, 255, 255, 128 },
+    },
+    {
+      /* Coeff Band ( 4 )*/
+      {   1,  81, 230, 252, 204, 203, 255, 192, 128, 128, 128 },
+      { 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 },
+      {  20,  95, 153, 243, 164, 173, 255, 203, 128, 128, 128 },
+      {  20,  95, 153, 243, 164, 173, 255, 203, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 5 )*/
+      {   1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 },
+      { 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 },
+      {  47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 },
+      {  47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 6 )*/
+      {   1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 },
+      { 141,  84, 213, 252, 201, 202, 255, 219, 128, 128, 128 },
+      {  42,  80, 160, 240, 162, 185, 255, 205, 128, 128, 128 },
+      {  42,  80, 160, 240, 162, 185, 255, 205, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 7 )*/
+      {   1,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 244,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 238,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 238,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+    }
+  }
+};
+
+static const vp9_prob
+default_coef_probs_8x8[BLOCK_TYPES_8X8]
+[COEF_BANDS]
+[PREV_COEF_CONTEXTS]
+[ENTROPY_NODES] = {
+  {
+    /* block Type 0 */
+    {
+      /* Coeff Band 0 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 1 */
+      { 60, 140, 195, 255, 212, 214, 128, 128, 128, 128, 128},
+      { 75, 221, 231, 255, 203, 255, 128, 128, 128, 128, 128},
+      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128},
+      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 2 */
+      { 1, 227, 226, 255, 215, 215, 128, 128, 128, 128, 128},
+      { 5, 163, 209, 255, 212, 212, 255, 255, 128, 128, 128},
+      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128},
+      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 3 */
+      { 1, 226, 225, 255, 228, 236, 128, 128, 128, 128, 128},
+      { 6, 163, 208, 255, 224, 234, 255, 255, 128, 128, 128},
+      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128},
+      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 4 */
+      { 1, 222, 197, 254, 193, 216, 255, 236, 128, 128, 128},
+      { 7, 140, 163, 251, 195, 211, 255, 238, 128, 128, 128},
+      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128},
+      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 5 */
+      { 1, 226, 218, 255, 216, 241, 255, 255, 128, 128, 128},
+      { 6, 154, 191, 255, 218, 240, 255, 255, 128, 128, 128},
+      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
+      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 6 */
+      { 1, 221, 217, 255, 208, 217, 255, 232, 128, 128, 128},
+      { 11, 155, 189, 254, 203, 211, 255, 249, 128, 128, 128},
+      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
+      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 7 */
+      { 1, 207, 235, 255, 232, 240, 128, 128, 128, 128, 128},
+      { 58, 161, 216, 255, 229, 235, 255, 255, 128, 128, 128},
+      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128},
+      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128}
+    }
+  },
+  {
+    /* block Type 1 */
+    {
+      /* Coeff Band 0 */
+      { 134, 152, 233, 224, 234, 52, 255, 166, 128, 128, 128},
+      { 97, 132, 185, 234, 186, 189, 197, 171, 255, 212, 128},
+      { 84, 110, 185, 237, 182, 182, 145, 145, 255, 255, 128},
+      /* The source lists only three rows for this band; the last row is
+         repeated (the pattern used elsewhere in these tables) so that all
+         PREV_COEF_CONTEXTS entries are explicitly initialized. */
+      { 84, 110, 185, 237, 182, 182, 145, 145, 255, 255, 128}
+    },
+    {
+      /* Coeff Band 1 */
+      { 1, 124, 213, 247, 192, 212, 255, 255, 128, 128, 128},
+      { 88, 111, 178, 254, 189, 211, 255, 255, 128, 128, 128},
+      { 12, 59, 129, 236, 150, 179, 239, 195, 255, 255, 128},
+      { 12, 59, 129, 236, 150, 179, 239, 195, 255, 255, 128}
+    },
+    {
+      /* Coeff Band 2 */
+      { 1, 102, 225, 255, 210, 240, 128, 128, 128, 128, 128},
+      { 110, 78, 195, 254, 200, 191, 255, 255, 128, 128, 128},
+      { 37, 63, 177, 255, 194, 195, 128, 128, 128, 128, 128},
+      { 37, 63, 177, 255, 194, 195, 128, 128, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 3 */
+      { 1, 1, 229, 255, 202, 224, 128, 128, 128, 128, 128},
+      { 150, 1, 192, 255, 206, 226, 128, 128, 128, 128, 128},
+      { 75, 1, 138, 255, 172, 228, 128, 128, 128, 128, 128},
+      { 75, 1, 138, 255, 172, 228, 128, 128, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 4 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 5 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 6 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 7 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+    }
+  },
+  {
+    /* block Type 2 */
+    {
+      /* Coeff Band 0 */
+      { 11, 181, 226, 199, 183, 255, 255, 255, 128, 128, 128},
+      { 2, 147, 185, 248, 163, 180, 255, 236, 128, 128, 128},
+      { 1, 123, 157, 238, 154, 176, 255, 226, 255, 255, 128},
+      { 1, 123, 157, 238, 154, 176, 255, 226, 255, 255, 128}
+    },
+    {
+      /* Coeff Band 1 */
+      { 1, 150, 191, 246, 174, 188, 255, 235, 128, 128, 128},
+      { 1, 125, 166, 245, 165, 185, 255, 234, 128, 128, 128},
+      { 1, 79, 125, 240, 148, 179, 255, 234, 255, 255, 128},
+      { 1, 79, 125, 240, 148, 179, 255, 234, 255, 255, 128}
+    },
+    {
+      /* Coeff Band 2 */
+      { 1, 146, 184, 242, 167, 183, 255, 230, 255, 255, 128},
+      { 1, 119, 160, 239, 156, 178, 255, 231, 255, 255, 128},
+      { 1, 75, 115, 234, 142, 173, 255, 225, 255, 255, 128},
+      { 1, 75, 115, 234, 142, 173, 255, 225, 255, 255, 128}
+    },
+    {
+      /* Coeff Band 3 */
+      { 1, 150, 188, 244, 169, 183, 255, 233, 255, 255, 128},
+      { 1, 123, 162, 243, 161, 180, 255, 233, 128, 128, 128},
+      { 1, 76, 120, 238, 148, 178, 255, 230, 255, 255, 128},
+      { 1, 76, 120, 238, 148, 178, 255, 230, 255, 255, 128}
+    },
+    {
+      /* Coeff Band 4 */
+      { 1, 163, 202, 252, 188, 204, 255, 248, 128, 128, 128},
+      { 1, 136, 180, 251, 181, 201, 255, 246, 128, 128, 128},
+      { 1, 92, 146, 249, 170, 197, 255, 245, 128, 128, 128},
+      { 1, 92, 146, 249, 170, 197, 255, 245, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 5 */
+      { 1, 156, 195, 249, 179, 193, 255, 241, 255, 255, 128},
+      { 1, 128, 169, 248, 171, 192, 255, 242, 255, 255, 128},
+      { 1, 84, 132, 245, 158, 187, 255, 240, 255, 255, 128},
+      { 1, 84, 132, 245, 158, 187, 255, 240, 255, 255, 128}
+    },
+    {
+      /* Coeff Band 6 */
+      { 1, 36, 71, 251, 192, 201, 255, 243, 255, 255, 128},
+      { 1, 49, 185, 250, 184, 199, 255, 242, 128, 128, 128},
+      { 1, 95, 147, 247, 168, 190, 255, 239, 255, 255, 128},
+      { 1, 95, 147, 247, 168, 190, 255, 239, 255, 255, 128}
+    },
+    {
+      /* Coeff Band 7 */
+      { 1, 19, 98, 255, 218, 222, 255, 255, 128, 128, 128},
+      { 36, 50, 210, 255, 212, 221, 255, 255, 128, 128, 128},
+      { 6, 117, 180, 254, 199, 216, 255, 251, 128, 128, 128},
+      { 6, 117, 180, 254, 199, 216, 255, 251, 128, 128, 128}
+    }
+  },
+  { /* block Type 3 */
+    { /* Coeff Band 0 */
+      { 192, 18, 155, 172, 145, 164, 192, 135, 246, 223, 255},
+      { 94, 29, 97, 131, 131, 153, 171, 121, 250, 190, 255},
+      { 25, 29, 63, 128, 119, 147, 168, 124, 251, 183, 255},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+    },
+    { /* Coeff Band 1 */
+      { 1, 108, 192, 220, 186, 173, 255, 194, 255, 255, 128},
+      { 123, 104, 188, 221, 165, 171, 247, 180, 255, 255, 128},
+      { 23, 76, 152, 216, 154, 166, 226, 182, 255, 209, 128},
+      { 1, 26, 52, 162, 109, 152, 208, 144, 255, 231, 128}
+    },
+    { /* Coeff Band 2 */
+      { 1, 57, 179, 220, 156, 175, 210, 158, 255, 223, 128},
+      { 48, 57, 134, 212, 151, 170, 219, 185, 255, 248, 128},
+      { 4, 35, 63, 189, 120, 156, 221, 159, 255, 241, 128},
+      { 1, 17, 23, 110, 97, 143, 187, 120, 255, 234, 128}
+    },
+    { /* Coeff Band 3 */
+      { 1, 115, 205, 243, 182, 187, 254, 218, 255, 255, 128},
+      { 80, 101, 186, 241, 183, 186, 249, 182, 255, 255, 128},
+      { 10, 81, 144, 229, 164, 175, 241, 185, 255, 255, 128},
+      { 1, 44, 81, 192, 130, 148, 240, 180, 255, 255, 128}
+    },
+    { /* Coeff Band 4 */
+      { 1, 161, 207, 249, 187, 176, 255, 180, 128, 128, 128},
+      { 79, 148, 196, 240, 186, 182, 253, 171, 255, 255, 128},
+      { 14, 111, 171, 233, 170, 178, 235, 204, 255, 255, 128},
+      { 1, 63, 103, 202, 143, 162, 240, 178, 255, 255, 128}
+    },
+    { /* Coeff Band 5 */
+      { 1, 101, 202, 239, 185, 184, 252, 186, 255, 255, 128},
+      { 43, 67, 166, 237, 178, 190, 246, 194, 255, 255, 128},
+      { 4, 49, 85, 220, 140, 168, 253, 182, 255, 255, 128},
+      { 1, 24, 35, 144, 93, 135, 239, 159, 255, 253, 128}
+    },
+    { /* Coeff Band 6 */
+      { 1, 212, 243, 255, 240, 234, 255, 255, 128, 128, 128},
+      { 98, 168, 234, 255, 229, 234, 255, 255, 128, 128, 128},
+      { 19, 127, 199, 255, 212, 198, 255, 255, 128, 128, 128},
+      { 1, 103, 162, 253, 186, 151, 255, 255, 128, 128, 128}
+    },
+    { /* Coeff Band 7 */
+      { 1, 188, 253, 255, 255, 128, 128, 128, 128, 128, 128},
+      { 191, 68, 242, 255, 255, 128, 128, 128, 128, 128, 128},
+      { 8, 132, 255, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+    }
+  }
+};
+
+static const vp9_prob
+default_hybrid_coef_probs_8x8[BLOCK_TYPES_8X8]
+                             [COEF_BANDS]
+                             [PREV_COEF_CONTEXTS]
+                             [ENTROPY_NODES] = {
+  {
+    /* block Type 0 */
+    {
+      /* Coeff Band 0 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 1 */
+      { 60, 140, 195, 255, 212, 214, 128, 128, 128, 128, 128},
+      { 75, 221, 231, 255, 203, 255, 128, 128, 128, 128, 128},
+      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128},
+      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 2 */
+      { 1, 227, 226, 255, 215, 215, 128, 128, 128, 128, 128},
+      { 5, 163, 209, 255, 212, 212, 255, 255, 128, 128, 128},
+      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128},
+      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 3 */
+      { 1, 226, 225, 255, 228, 236, 128, 128, 128, 128, 128},
+      { 6, 163, 208, 255, 224, 234, 255, 255, 128, 128, 128},
+      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128},
+      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 4 */
+      { 1, 222, 197, 254, 193, 216, 255, 236, 128, 128, 128},
+      { 7, 140, 163, 251, 195, 211, 255, 238, 128, 128, 128},
+      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128},
+      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 5 */
+      { 1, 226, 218, 255, 216, 241, 255, 255, 128, 128, 128},
+      { 6, 154, 191, 255, 218, 240, 255, 255, 128, 128, 128},
+      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
+      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 6 */
+      { 1, 221, 217, 255, 208, 217, 255, 232, 128, 128, 128},
+      { 11, 155, 189, 254, 203, 211, 255, 249, 128, 128, 128},
+      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
+      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 7 */
+      { 1, 207, 235, 255, 232, 240, 128, 128, 128, 128, 128},
+      { 58, 161, 216, 255, 229, 235, 255, 255, 128, 128, 128},
+      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128},
+      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128}
+    }
+  },
+  {
+    /* block Type 1 */
+    {
+      /* Coeff Band 0 */
+      { 134, 152, 233, 224, 234, 52, 255, 166, 128, 128, 128},
+      { 97, 132, 185, 234, 186, 189, 197, 171, 255, 212, 128},
+      { 84, 110, 185, 237, 182, 182, 145, 145, 255, 255, 128},
+      /* As in the non-hybrid 8x8 table above, the source lists only three
+         rows here; the last row is repeated so that all PREV_COEF_CONTEXTS
+         entries are explicitly initialized. */
+      { 84, 110, 185, 237, 182, 182, 145, 145, 255, 255, 128}
+    },
+    {
+      /* Coeff Band 1 */
+      { 1, 124, 213, 247, 192, 212, 255, 255, 128, 128, 128},
+      { 88, 111, 178, 254, 189, 211, 255, 255, 128, 128, 128},
+      { 12, 59, 129, 236, 150, 179, 239, 195, 255, 255, 128},
+      { 12, 59, 129, 236, 150, 179, 239, 195, 255, 255, 128}
+    },
+    {
+      /* Coeff Band 2 */
+      { 1, 102, 225, 255, 210, 240, 128, 128, 128, 128, 128},
+      { 110, 78, 195, 254, 200, 191, 255, 255, 128, 128, 128},
+      { 37, 63, 177, 255, 194, 195, 128, 128, 128, 128, 128},
+      { 37, 63, 177, 255, 194, 195, 128, 128, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 3 */
+      { 1, 1, 229, 255, 202, 224, 128, 128, 128, 128, 128},
+      { 150, 1, 192, 255, 206, 226, 128, 128, 128, 128, 128},
+      { 75, 1, 138, 255, 172, 228, 128, 128, 128, 128, 128},
+      { 75, 1, 138, 255, 172, 228, 128, 128, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 4 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 5 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 6 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 7 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+    }
+  },
+  {
+    /* block Type 2 */
+    {
+      /* Coeff Band 0 */
+      { 11, 181, 226, 199, 183, 255, 255, 255, 128, 128, 128},
+      { 2, 147, 185, 248, 163, 180, 255, 236, 128, 128, 128},
+      { 1, 123, 157, 238, 154, 176, 255, 226, 255, 255, 128},
+      { 1, 123, 157, 238, 154, 176, 255, 226, 255, 255, 128}
+    },
+    {
+      /* Coeff Band 1 */
+      { 1, 150, 191, 246, 174, 188, 255, 235, 128, 128, 128},
+      { 1, 125, 166, 245, 165, 185, 255, 234, 128, 128, 128},
+      { 1, 79, 125, 240, 148, 179, 255, 234, 255, 255, 128},
+      { 1, 79, 125, 240, 148, 179, 255, 234, 255, 255, 128}
+    },
+    {
+      /* Coeff Band 2 */
+      { 1, 146, 184, 242, 167, 183, 255, 230, 255, 255, 128},
+      { 1, 119, 160, 239, 156, 178, 255, 231, 255, 255, 128},
+      { 1, 75, 115, 234, 142, 173, 255, 225, 255, 255, 128},
+      { 1, 75, 115, 234, 142, 173, 255, 225, 255, 255, 128}
+    },
+    {
+      /* Coeff Band 3 */
+      { 1, 150, 188, 244, 169, 183, 255, 233, 255, 255, 128},
+      { 1, 123, 162, 243, 161, 180, 255, 233, 128, 128, 128},
+      { 1, 76, 120, 238, 148, 178, 255, 230, 255, 255, 128},
+      { 1, 76, 120, 238, 148, 178, 255, 230, 255, 255, 128}
+    },
+    {
+      /* Coeff Band 4 */
+      { 1, 163, 202, 252, 188, 204, 255, 248, 128, 128, 128},
+      { 1, 136, 180, 251, 181, 201, 255, 246, 128, 128, 128},
+      { 1, 92, 146, 249, 170, 197, 255, 245, 128, 128, 128},
+      { 1, 92, 146, 249, 170, 197, 255, 245, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 5 */
+      { 1, 156, 195, 249, 179, 193, 255, 241, 255, 255, 128},
+      { 1, 128, 169, 248, 171, 192, 255, 242, 255, 255, 128},
+      { 1, 84, 132, 245, 158, 187, 255, 240, 255, 255, 128},
+      { 1, 84, 132, 245, 158, 187, 255, 240, 255, 255, 128}
+    },
+    {
+      /* Coeff Band 6 */
+      { 1, 36, 71, 251, 192, 201, 255, 243, 255, 255, 128},
+      { 1, 49, 185, 250, 184, 199, 255, 242, 128, 128, 128},
+      { 1, 95, 147, 247, 168, 190, 255, 239, 255, 255, 128},
+      { 1, 95, 147, 247, 168, 190, 255, 239, 255, 255, 128}
+    },
+    {
+      /* Coeff Band 7 */
+      { 1, 19, 98, 255, 218, 222, 255, 255, 128, 128, 128},
+      { 36, 50, 210, 255, 212, 221, 255, 255, 128, 128, 128},
+      { 6, 117, 180, 254, 199, 216, 255, 251, 128, 128, 128},
+      { 6, 117, 180, 254, 199, 216, 255, 251, 128, 128, 128}
+    }
+  },
+  { /* block Type 3 */
+    { /* Coeff Band 0 */
+      { 192, 18, 155, 172, 145, 164, 192, 135, 246, 223, 255},
+      { 94, 29, 97, 131, 131, 153, 171, 121, 250, 190, 255},
+      { 25, 29, 63, 128, 119, 147, 168, 124, 251, 183, 255},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+    },
+    { /* Coeff Band 1 */
+      { 1, 108, 192, 220, 186, 173, 255, 194, 255, 255, 128},
+      { 123, 104, 188, 221, 165, 171, 247, 180, 255, 255, 128},
+      { 23, 76, 152, 216, 154, 166, 226, 182, 255, 209, 128},
+      { 1, 26, 52, 162, 109, 152, 208, 144, 255, 231, 128}
+    },
+    { /* Coeff Band 2 */
+      { 1, 57, 179, 220, 156, 175, 210, 158, 255, 223, 128},
+      { 48, 57, 134, 212, 151, 170, 219, 185, 255, 248, 128},
+      { 4, 35, 63, 189, 120, 156, 221, 159, 255, 241, 128},
+      { 1, 17, 23, 110, 97, 143, 187, 120, 255, 234, 128}
+    },
+    { /* Coeff Band 3 */
+      { 1, 115, 205, 243, 182, 187, 254, 218, 255, 255, 128},
+      { 80, 101, 186, 241, 183, 186, 249, 182, 255, 255, 128},
+      { 10, 81, 144, 229, 164, 175, 241, 185, 255, 255, 128},
+      { 1, 44, 81, 192, 130, 148, 240, 180, 255, 255, 128}
+    },
+    { /* Coeff Band 4 */
+      { 1, 161, 207, 249, 187, 176, 255, 180, 128, 128, 128},
+      { 79, 148, 196, 240, 186, 182, 253, 171, 255, 255, 128},
+      { 14, 111, 171, 233, 170, 178, 235, 204, 255, 255, 128},
+      { 1, 63, 103, 202, 143, 162, 240, 178, 255, 255, 128}
+    },
+    { /* Coeff Band 5 */
+      { 1, 101, 202, 239, 185, 184, 252, 186, 255, 255, 128},
+      { 43, 67, 166, 237, 178, 190, 246, 194, 255, 255, 128},
+      { 4, 49, 85, 220, 140, 168, 253, 182, 255, 255, 128},
+      { 1, 24, 35, 144, 93, 135, 239, 159, 255, 253, 128}
+    },
+    { /* Coeff Band 6 */
+      { 1, 212, 243, 255, 240, 234, 255, 255, 128, 128, 128},
+      { 98, 168, 234, 255, 229, 234, 255, 255, 128, 128, 128},
+      { 19, 127, 199, 255, 212, 198, 255, 255, 128, 128, 128},
+      { 1, 103, 162, 253, 186, 151, 255, 255, 128, 128, 128}
+    },
+    { /* Coeff Band 7 */
+      { 1, 188, 253, 255, 255, 128, 128, 128, 128, 128, 128},
+      { 191, 68, 242, 255, 255, 128, 128, 128, 128, 128, 128},
+      { 8, 132, 255, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+    }
+  }
+};
+
+static const vp9_prob
+  default_coef_probs_16x16[BLOCK_TYPES_16X16]
+                          [COEF_BANDS]
+                          [PREV_COEF_CONTEXTS]
+                          [ENTROPY_NODES] = {
+  { /* block Type 0 */
+    { /* Coeff Band 0 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+    },
+    { /* Coeff Band 1 */
+      { 60, 140, 195, 255, 212, 214, 128, 128, 128, 128, 128},
+      { 75, 221, 231, 255, 203, 255, 128, 128, 128, 128, 128},
+      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128},
+      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128}
+    },
+    { /* Coeff Band 2 */
+      { 1, 227, 226, 255, 215, 215, 128, 128, 128, 128, 128},
+      { 5, 163, 209, 255, 212, 212, 255, 255, 128, 128, 128},
+      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128},
+      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128}
+    },
+    { /* Coeff Band 3 */
+      { 1, 226, 225, 255, 228, 236, 128, 128, 128, 128, 128},
+      { 6, 163, 208, 255, 224, 234, 255, 255, 128, 128, 128},
+      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128},
+      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128}
+    },
+    { /* Coeff Band 4 */
+      { 1, 222, 197, 254, 193, 216, 255, 236, 128, 128, 128},
+      { 7, 140, 163, 251, 195, 211, 255, 238, 128, 128, 128},
+      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128},
+      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128}
+    },
+    { /* Coeff Band 5 */
+      { 1, 226, 218, 255, 216, 241, 255, 255, 128, 128, 128},
+      { 6, 154, 191, 255, 218, 240, 255, 255, 128, 128, 128},
+      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
+      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
+    },
+    { /* Coeff Band 6 */
+      { 1, 221, 217, 255, 208, 217, 255, 232, 128, 128, 128},
+      { 11, 155, 189, 254, 203, 211, 255, 249, 128, 128, 128},
+      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
+      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
+    },
+    { /* Coeff Band 7 */
+      { 1, 207, 235, 255, 232, 240, 128, 128, 128, 128, 128},
+      { 58, 161, 216, 255, 229, 235, 255, 255, 128, 128, 128},
+      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128},
+      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128}
+    }
+  },
+  { /* block Type 1 */
+      { /* Coeff Band 0 */
+        { 1, 30, 103, 204, 142, 168, 235, 161, 255, 228, 128},
+        { 1, 35, 90, 192, 130, 161, 227, 158, 255, 226, 255},
+        { 1, 36, 78, 180, 122, 156, 221, 153, 255, 222, 255},
+        { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+      },
+      { /* Coeff Band 1 */
+        { 1, 163, 228, 253, 212, 194, 255, 205, 128, 128, 128},
+        { 67, 160, 226, 253, 210, 202, 245, 172, 255, 255, 128},
+        { 51, 147, 219, 251, 207, 207, 255, 217, 128, 128, 128},
+        { 25, 107, 175, 245, 183, 190, 254, 209, 255, 255, 128}
+      },
+      { /* Coeff Band 2 */
+        { 1, 66, 170, 240, 177, 186, 252, 203, 255, 245, 128},
+        { 23, 64, 145, 230, 161, 177, 252, 198, 255, 255, 128},
+        { 6, 51, 99, 208, 135, 163, 249, 178, 255, 248, 128},
+        { 1, 33, 59, 161, 104, 151, 238, 164, 255, 237, 128}
+      },
+      { /* Coeff Band 3 */
+        { 1, 76, 216, 250, 198, 199, 255, 226, 255, 255, 128},
+        { 86, 83, 200, 247, 189, 193, 255, 224, 255, 255, 128},
+        { 30, 75, 164, 242, 172, 184, 254, 218, 255, 255, 128},
+        { 3, 54, 103, 227, 140, 172, 253, 201, 255, 255, 128}
+      },
+      { /* Coeff Band 4 */
+        { 1, 241, 247, 255, 233, 223, 255, 255, 128, 128, 128},
+        { 78, 212, 242, 255, 226, 230, 255, 255, 128, 128, 128},
+        { 10, 167, 224, 255, 217, 225, 255, 128, 128, 128, 128},
+        { 1, 104, 176, 250, 166, 219, 255, 255, 128, 128, 128}
+      },
+      { /* Coeff Band 5 */
+        { 1, 194, 241, 254, 228, 214, 248, 237, 255, 255, 128},
+        { 95, 133, 228, 254, 218, 215, 255, 229, 128, 128, 128},
+        { 24, 119, 201, 252, 202, 205, 255, 229, 128, 128, 128},
+        { 1, 88, 155, 246, 183, 193, 255, 205, 128, 128, 128}
+      },
+      { /* Coeff Band 6 */
+        { 1, 204, 236, 255, 222, 220, 255, 239, 128, 128, 128},
+        { 126, 105, 225, 254, 214, 217, 255, 254, 128, 128, 128},
+        { 44, 86, 190, 251, 197, 204, 255, 233, 128, 128, 128},
+        { 6, 71, 130, 240, 164, 188, 255, 246, 128, 128, 128}
+      },
+      { /* Coeff Band 7 */
+        { 1, 195, 250, 255, 239, 197, 128, 128, 128, 128, 128},
+        { 167, 102, 249, 255, 234, 255, 128, 128, 128, 128, 128},
+        { 65, 91, 222, 255, 217, 255, 128, 128, 128, 128, 128},
+        { 1, 59, 128, 255, 154, 255, 128, 128, 128, 128, 128}
+      }
+  },
+  { /* block Type 2 */
+      { /* Coeff Band 0 */
+        { 1, 30, 103, 204, 142, 168, 235, 161, 255, 228, 128},
+        { 1, 35, 90, 192, 130, 161, 227, 158, 255, 226, 255},
+        { 1, 36, 78, 180, 122, 156, 221, 153, 255, 222, 255},
+        { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+      },
+      { /* Coeff Band 1 */
+        { 1, 163, 228, 253, 212, 194, 255, 205, 128, 128, 128},
+        { 67, 160, 226, 253, 210, 202, 245, 172, 255, 255, 128},
+        { 51, 147, 219, 251, 207, 207, 255, 217, 128, 128, 128},
+        { 25, 107, 175, 245, 183, 190, 254, 209, 255, 255, 128}
+      },
+      { /* Coeff Band 2 */
+        { 1, 66, 170, 240, 177, 186, 252, 203, 255, 245, 128},
+        { 23, 64, 145, 230, 161, 177, 252, 198, 255, 255, 128},
+        { 6, 51, 99, 208, 135, 163, 249, 178, 255, 248, 128},
+        { 1, 33, 59, 161, 104, 151, 238, 164, 255, 237, 128}
+      },
+      { /* Coeff Band 3 */
+        { 1, 76, 216, 250, 198, 199, 255, 226, 255, 255, 128},
+        { 86, 83, 200, 247, 189, 193, 255, 224, 255, 255, 128},
+        { 30, 75, 164, 242, 172, 184, 254, 218, 255, 255, 128},
+        { 3, 54, 103, 227, 140, 172, 253, 201, 255, 255, 128}
+      },
+      { /* Coeff Band 4 */
+        { 1, 241, 247, 255, 233, 223, 255, 255, 128, 128, 128},
+        { 78, 212, 242, 255, 226, 230, 255, 255, 128, 128, 128},
+        { 10, 167, 224, 255, 217, 225, 255, 128, 128, 128, 128},
+        { 1, 104, 176, 250, 166, 219, 255, 255, 128, 128, 128}
+      },
+      { /* Coeff Band 5 */
+        { 1, 194, 241, 254, 228, 214, 248, 237, 255, 255, 128},
+        { 95, 133, 228, 254, 218, 215, 255, 229, 128, 128, 128},
+        { 24, 119, 201, 252, 202, 205, 255, 229, 128, 128, 128},
+        { 1, 88, 155, 246, 183, 193, 255, 205, 128, 128, 128}
+      },
+      { /* Coeff Band 6 */
+        { 1, 204, 236, 255, 222, 220, 255, 239, 128, 128, 128},
+        { 126, 105, 225, 254, 214, 217, 255, 254, 128, 128, 128},
+        { 44, 86, 190, 251, 197, 204, 255, 233, 128, 128, 128},
+        { 6, 71, 130, 240, 164, 188, 255, 246, 128, 128, 128}
+      },
+      { /* Coeff Band 7 */
+        { 1, 195, 250, 255, 239, 197, 128, 128, 128, 128, 128},
+        { 167, 102, 249, 255, 234, 255, 128, 128, 128, 128, 128},
+        { 65, 91, 222, 255, 217, 255, 128, 128, 128, 128, 128},
+        { 1, 59, 128, 255, 154, 255, 128, 128, 128, 128, 128}
+      }
+  },
+  { /* block Type 3 */
+    { /* Coeff Band 0 */
+      { 17, 105, 227, 195, 164, 170, 168, 137, 221, 160, 184},
+      { 6, 92, 166, 193, 158, 169, 179, 142, 236, 175, 200},
+      { 2, 68, 118, 193, 147, 168, 187, 149, 241, 178, 247},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+    },
+    { /* Coeff Band 1 */
+      { 1, 193, 221, 246, 198, 194, 244, 176, 255, 192, 128},
+      { 112, 160, 209, 244, 196, 194, 243, 175, 255, 209, 128},
+      { 45, 123, 175, 240, 184, 195, 239, 178, 255, 218, 255},
+      { 16, 53, 75, 169, 119, 152, 209, 146, 255, 219, 255}
+    },
+    { /* Coeff Band 2 */
+      { 1, 141, 183, 240, 176, 187, 246, 198, 255, 218, 128},
+      { 36, 97, 150, 231, 161, 180, 243, 191, 255, 217, 255},
+      { 8, 65, 111, 210, 143, 166, 230, 167, 255, 224, 255},
+      { 2, 35, 61, 157, 113, 149, 208, 142, 255, 217, 255}
+    },
+    { /* Coeff Band 3 */
+      { 1, 173, 196, 245, 184, 191, 252, 211, 255, 240, 128},
+      { 35, 119, 175, 242, 177, 187, 252, 209, 255, 235, 128},
+      { 4, 88, 141, 234, 161, 180, 249, 200, 255, 228, 128},
+      { 1, 57, 95, 203, 133, 161, 235, 167, 255, 231, 255}
+    },
+    { /* Coeff Band 4 */
+      { 1, 208, 227, 249, 209, 204, 248, 188, 255, 248, 128},
+      { 28, 162, 211, 247, 203, 200, 252, 188, 255, 232, 128},
+      { 5, 114, 174, 238, 182, 189, 245, 184, 255, 238, 128},
+      { 1, 61, 100, 205, 136, 164, 235, 163, 255, 239, 128}
+    },
+    { /* Coeff Band 5 */
+      { 1, 195, 218, 252, 208, 207, 250, 205, 255, 245, 128},
+      { 22, 141, 196, 249, 198, 201, 250, 202, 255, 244, 128},
+      { 2, 105, 163, 240, 178, 189, 246, 191, 255, 246, 128},
+      { 1, 70, 112, 206, 144, 167, 232, 162, 255, 239, 128}
+    },
+    { /* Coeff Band 6 */
+      { 1, 204, 215, 251, 204, 203, 255, 222, 255, 225, 128},
+      { 15, 140, 194, 249, 194, 199, 254, 221, 255, 253, 128},
+      { 1, 95, 153, 243, 172, 188, 254, 213, 255, 248, 128},
+      { 1, 59, 99, 216, 135, 166, 247, 190, 255, 237, 255}
+    },
+    { /* Coeff Band 7 */
+      { 1, 7, 231, 255, 227, 223, 255, 240, 255, 255, 128},
+      { 15, 157, 217, 255, 218, 219, 255, 239, 255, 255, 128},
+      { 1, 114, 182, 252, 198, 207, 255, 235, 255, 255, 128},
+      { 1, 71, 122, 238, 154, 181, 255, 216, 255, 255, 128}
+    }
+  }
+};
+
+static const vp9_prob
+  default_hybrid_coef_probs_16x16[BLOCK_TYPES_16X16]
+                                 [COEF_BANDS]
+                                 [PREV_COEF_CONTEXTS]
+                                 [ENTROPY_NODES] = {
+  { /* block Type 0 */
+    { /* Coeff Band 0 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+    },
+    { /* Coeff Band 1 */
+      { 60, 140, 195, 255, 212, 214, 128, 128, 128, 128, 128},
+      { 75, 221, 231, 255, 203, 255, 128, 128, 128, 128, 128},
+      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128},
+      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128}
+    },
+    { /* Coeff Band 2 */
+      { 1, 227, 226, 255, 215, 215, 128, 128, 128, 128, 128},
+      { 5, 163, 209, 255, 212, 212, 255, 255, 128, 128, 128},
+      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128},
+      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128}
+    },
+    { /* Coeff Band 3 */
+      { 1, 226, 225, 255, 228, 236, 128, 128, 128, 128, 128},
+      { 6, 163, 208, 255, 224, 234, 255, 255, 128, 128, 128},
+      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128},
+      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128}
+    },
+    { /* Coeff Band 4 */
+      { 1, 222, 197, 254, 193, 216, 255, 236, 128, 128, 128},
+      { 7, 140, 163, 251, 195, 211, 255, 238, 128, 128, 128},
+      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128},
+      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128}
+    },
+    { /* Coeff Band 5 */
+      { 1, 226, 218, 255, 216, 241, 255, 255, 128, 128, 128},
+      { 6, 154, 191, 255, 218, 240, 255, 255, 128, 128, 128},
+      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
+      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
+    },
+    { /* Coeff Band 6 */
+      { 1, 221, 217, 255, 208, 217, 255, 232, 128, 128, 128},
+      { 11, 155, 189, 254, 203, 211, 255, 249, 128, 128, 128},
+      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
+      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
+    },
+    { /* Coeff Band 7 */
+      { 1, 207, 235, 255, 232, 240, 128, 128, 128, 128, 128},
+      { 58, 161, 216, 255, 229, 235, 255, 255, 128, 128, 128},
+      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128},
+      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128}
+    }
+  },
+  { /* block Type 1 */
+      { /* Coeff Band 0 */
+        { 1, 30, 103, 204, 142, 168, 235, 161, 255, 228, 128},
+        { 1, 35, 90, 192, 130, 161, 227, 158, 255, 226, 255},
+        { 1, 36, 78, 180, 122, 156, 221, 153, 255, 222, 255},
+        { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+      },
+      { /* Coeff Band 1 */
+        { 1, 163, 228, 253, 212, 194, 255, 205, 128, 128, 128},
+        { 67, 160, 226, 253, 210, 202, 245, 172, 255, 255, 128},
+        { 51, 147, 219, 251, 207, 207, 255, 217, 128, 128, 128},
+        { 25, 107, 175, 245, 183, 190, 254, 209, 255, 255, 128}
+      },
+      { /* Coeff Band 2 */
+        { 1, 66, 170, 240, 177, 186, 252, 203, 255, 245, 128},
+        { 23, 64, 145, 230, 161, 177, 252, 198, 255, 255, 128},
+        { 6, 51, 99, 208, 135, 163, 249, 178, 255, 248, 128},
+        { 1, 33, 59, 161, 104, 151, 238, 164, 255, 237, 128}
+      },
+      { /* Coeff Band 3 */
+        { 1, 76, 216, 250, 198, 199, 255, 226, 255, 255, 128},
+        { 86, 83, 200, 247, 189, 193, 255, 224, 255, 255, 128},
+        { 30, 75, 164, 242, 172, 184, 254, 218, 255, 255, 128},
+        { 3, 54, 103, 227, 140, 172, 253, 201, 255, 255, 128}
+      },
+      { /* Coeff Band 4 */
+        { 1, 241, 247, 255, 233, 223, 255, 255, 128, 128, 128},
+        { 78, 212, 242, 255, 226, 230, 255, 255, 128, 128, 128},
+        { 10, 167, 224, 255, 217, 225, 255, 128, 128, 128, 128},
+        { 1, 104, 176, 250, 166, 219, 255, 255, 128, 128, 128}
+      },
+      { /* Coeff Band 5 */
+        { 1, 194, 241, 254, 228, 214, 248, 237, 255, 255, 128},
+        { 95, 133, 228, 254, 218, 215, 255, 229, 128, 128, 128},
+        { 24, 119, 201, 252, 202, 205, 255, 229, 128, 128, 128},
+        { 1, 88, 155, 246, 183, 193, 255, 205, 128, 128, 128}
+      },
+      { /* Coeff Band 6 */
+        { 1, 204, 236, 255, 222, 220, 255, 239, 128, 128, 128},
+        { 126, 105, 225, 254, 214, 217, 255, 254, 128, 128, 128},
+        { 44, 86, 190, 251, 197, 204, 255, 233, 128, 128, 128},
+        { 6, 71, 130, 240, 164, 188, 255, 246, 128, 128, 128}
+      },
+      { /* Coeff Band 7 */
+        { 1, 195, 250, 255, 239, 197, 128, 128, 128, 128, 128},
+        { 167, 102, 249, 255, 234, 255, 128, 128, 128, 128, 128},
+        { 65, 91, 222, 255, 217, 255, 128, 128, 128, 128, 128},
+        { 1, 59, 128, 255, 154, 255, 128, 128, 128, 128, 128}
+      }
+  },
+  { /* block Type 2 */
+      { /* Coeff Band 0 */
+        { 1, 30, 103, 204, 142, 168, 235, 161, 255, 228, 128},
+        { 1, 35, 90, 192, 130, 161, 227, 158, 255, 226, 255},
+        { 1, 36, 78, 180, 122, 156, 221, 153, 255, 222, 255},
+        { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+      },
+      { /* Coeff Band 1 */
+        { 1, 163, 228, 253, 212, 194, 255, 205, 128, 128, 128},
+        { 67, 160, 226, 253, 210, 202, 245, 172, 255, 255, 128},
+        { 51, 147, 219, 251, 207, 207, 255, 217, 128, 128, 128},
+        { 25, 107, 175, 245, 183, 190, 254, 209, 255, 255, 128}
+      },
+      { /* Coeff Band 2 */
+        { 1, 66, 170, 240, 177, 186, 252, 203, 255, 245, 128},
+        { 23, 64, 145, 230, 161, 177, 252, 198, 255, 255, 128},
+        { 6, 51, 99, 208, 135, 163, 249, 178, 255, 248, 128},
+        { 1, 33, 59, 161, 104, 151, 238, 164, 255, 237, 128}
+      },
+      { /* Coeff Band 3 */
+        { 1, 76, 216, 250, 198, 199, 255, 226, 255, 255, 128},
+        { 86, 83, 200, 247, 189, 193, 255, 224, 255, 255, 128},
+        { 30, 75, 164, 242, 172, 184, 254, 218, 255, 255, 128},
+        { 3, 54, 103, 227, 140, 172, 253, 201, 255, 255, 128}
+      },
+      { /* Coeff Band 4 */
+        { 1, 241, 247, 255, 233, 223, 255, 255, 128, 128, 128},
+        { 78, 212, 242, 255, 226, 230, 255, 255, 128, 128, 128},
+        { 10, 167, 224, 255, 217, 225, 255, 128, 128, 128, 128},
+        { 1, 104, 176, 250, 166, 219, 255, 255, 128, 128, 128}
+      },
+      { /* Coeff Band 5 */
+        { 1, 194, 241, 254, 228, 214, 248, 237, 255, 255, 128},
+        { 95, 133, 228, 254, 218, 215, 255, 229, 128, 128, 128},
+        { 24, 119, 201, 252, 202, 205, 255, 229, 128, 128, 128},
+        { 1, 88, 155, 246, 183, 193, 255, 205, 128, 128, 128}
+      },
+      { /* Coeff Band 6 */
+        { 1, 204, 236, 255, 222, 220, 255, 239, 128, 128, 128},
+        { 126, 105, 225, 254, 214, 217, 255, 254, 128, 128, 128},
+        { 44, 86, 190, 251, 197, 204, 255, 233, 128, 128, 128},
+        { 6, 71, 130, 240, 164, 188, 255, 246, 128, 128, 128}
+      },
+      { /* Coeff Band 7 */
+        { 1, 195, 250, 255, 239, 197, 128, 128, 128, 128, 128},
+        { 167, 102, 249, 255, 234, 255, 128, 128, 128, 128, 128},
+        { 65, 91, 222, 255, 217, 255, 128, 128, 128, 128, 128},
+        { 1, 59, 128, 255, 154, 255, 128, 128, 128, 128, 128}
+      }
+  },
+  { /* block Type 3 */
+    { /* Coeff Band 0 */
+      { 17, 105, 227, 195, 164, 170, 168, 137, 221, 160, 184},
+      { 6, 92, 166, 193, 158, 169, 179, 142, 236, 175, 200},
+      { 2, 68, 118, 193, 147, 168, 187, 149, 241, 178, 247},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+    },
+    { /* Coeff Band 1 */
+      { 1, 193, 221, 246, 198, 194, 244, 176, 255, 192, 128},
+      { 112, 160, 209, 244, 196, 194, 243, 175, 255, 209, 128},
+      { 45, 123, 175, 240, 184, 195, 239, 178, 255, 218, 255},
+      { 16, 53, 75, 169, 119, 152, 209, 146, 255, 219, 255}
+    },
+    { /* Coeff Band 2 */
+      { 1, 141, 183, 240, 176, 187, 246, 198, 255, 218, 128},
+      { 36, 97, 150, 231, 161, 180, 243, 191, 255, 217, 255},
+      { 8, 65, 111, 210, 143, 166, 230, 167, 255, 224, 255},
+      { 2, 35, 61, 157, 113, 149, 208, 142, 255, 217, 255}
+    },
+    { /* Coeff Band 3 */
+      { 1, 173, 196, 245, 184, 191, 252, 211, 255, 240, 128},
+      { 35, 119, 175, 242, 177, 187, 252, 209, 255, 235, 128},
+      { 4, 88, 141, 234, 161, 180, 249, 200, 255, 228, 128},
+      { 1, 57, 95, 203, 133, 161, 235, 167, 255, 231, 255}
+    },
+    { /* Coeff Band 4 */
+      { 1, 208, 227, 249, 209, 204, 248, 188, 255, 248, 128},
+      { 28, 162, 211, 247, 203, 200, 252, 188, 255, 232, 128},
+      { 5, 114, 174, 238, 182, 189, 245, 184, 255, 238, 128},
+      { 1, 61, 100, 205, 136, 164, 235, 163, 255, 239, 128}
+    },
+    { /* Coeff Band 5 */
+      { 1, 195, 218, 252, 208, 207, 250, 205, 255, 245, 128},
+      { 22, 141, 196, 249, 198, 201, 250, 202, 255, 244, 128},
+      { 2, 105, 163, 240, 178, 189, 246, 191, 255, 246, 128},
+      { 1, 70, 112, 206, 144, 167, 232, 162, 255, 239, 128}
+    },
+    { /* Coeff Band 6 */
+      { 1, 204, 215, 251, 204, 203, 255, 222, 255, 225, 128},
+      { 15, 140, 194, 249, 194, 199, 254, 221, 255, 253, 128},
+      { 1, 95, 153, 243, 172, 188, 254, 213, 255, 248, 128},
+      { 1, 59, 99, 216, 135, 166, 247, 190, 255, 237, 255}
+    },
+    { /* Coeff Band 7 */
+      { 1, 7, 231, 255, 227, 223, 255, 240, 255, 255, 128},
+      { 15, 157, 217, 255, 218, 219, 255, 239, 255, 255, 128},
+      { 1, 114, 182, 252, 198, 207, 255, 235, 255, 255, 128},
+      { 1, 71, 122, 238, 154, 181, 255, 216, 255, 255, 128}
+    }
+  }
+};
--- /dev/null
+++ b/vp9/common/entropy.c
@@ -1,0 +1,447 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdio.h>
+
+#include "entropy.h"
+#include "string.h"
+#include "blockd.h"
+#include "onyxc_int.h"
+#include "entropymode.h"
+#include "vpx_mem/vpx_mem.h"
+
+#define uchar unsigned char     /* typedefs can clash */
+#define uint  unsigned int
+
+typedef const uchar cuchar;
+typedef const uint cuint;
+
+typedef vp9_prob Prob;
+
+#include "coefupdateprobs.h"
+
+const int vp9_i8x8_block[4] = {0, 2, 8, 10};
+
+DECLARE_ALIGNED(16, const unsigned char, vp9_norm[256]) = {
+  0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
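+/* vp9_norm[x] is the left shift that brings x (the bool coder's 8-bit
+ * range) back into [128, 255] during renormalization, i.e. the number of
+ * leading zero bits of x; e.g. vp9_norm[1] == 7 since 1 << 7 == 128. */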
+
+DECLARE_ALIGNED(16, const int, vp9_coef_bands[16]) = {
+  0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7
+};
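+/* Maps each of the 16 scan positions of a 4x4 block to one of the
+ * COEF_BANDS probability bands, so coefficients early in the scan get
+ * their own statistics while the tail shares a band. */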
+
+DECLARE_ALIGNED(16, cuchar, vp9_prev_token_class[MAX_ENTROPY_TOKENS]) = {
+  0, 1, 2, 2, 3, 3, 3, 3, 3, 3, 3, 0
+};
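+/* Maps the previously decoded token to one of the PREV_COEF_CONTEXTS
+ * contexts (0..3): ZERO -> 0, ONE -> 1, TWO/THREE -> 2, FOUR and the
+ * category tokens -> 3, EOB -> 0. */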
+
+DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d[16]) = {
+  0,  1,  4,  8,
+  5,  2,  3,  6,
+  9, 12, 13, 10,
+  7, 11, 14, 15,
+};
+
+DECLARE_ALIGNED(16, const int, vp9_col_scan[16]) = {
+  0, 4,  8, 12,
+  1, 5,  9, 13,
+  2, 6, 10, 14,
+  3, 7, 11, 15
+};
+DECLARE_ALIGNED(16, const int, vp9_row_scan[16]) = {
+  0,   1,  2,  3,
+  4,   5,  6,  7,
+  8,   9, 10, 11,
+  12, 13, 14, 15
+};
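+/* Scan orders for 4x4 blocks: the default zig-zag is used with the 2-D
+ * DCT, while the column and row scans serve the hybrid (ADST) transforms,
+ * whose energy concentrates along one dimension. */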
+
+
+DECLARE_ALIGNED(64, const int, vp9_coef_bands_8x8[64]) = {
+  0, 1, 2, 3, 5, 4, 4, 5,
+  5, 3, 6, 3, 5, 4, 6, 6,
+  6, 5, 5, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7
+};
+DECLARE_ALIGNED(64, const int, vp9_default_zig_zag1d_8x8[64]) = {
+  0,  1,  8, 16,  9,  2,  3, 10, 17, 24, 32, 25, 18, 11,  4,  5,
+  12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13,  6,  7, 14, 21, 28,
+  35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51,
+  58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63,
+};
+
+// Table can be optimized.
+DECLARE_ALIGNED(16, const int, vp9_coef_bands_16x16[256]) = {
+    0, 1, 2, 3, 5, 4, 4, 5, 5, 3, 6, 3, 5, 4, 6, 6,
+    6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+    6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+};
+DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]) = {
+      0,   1,  16,  32,  17,   2,   3,  18,  33,  48,  64,  49,  34,  19,   4,   5,
+     20,  35,  50,  65,  80,  96,  81,  66,  51,  36,  21,   6,   7,  22,  37,  52,
+     67,  82,  97, 112, 128, 113,  98,  83,  68,  53,  38,  23,   8,   9,  24,  39,
+     54,  69,  84,  99, 114, 129, 144, 160, 145, 130, 115, 100,  85,  70,  55,  40,
+     25,  10,  11,  26,  41,  56,  71,  86, 101, 116, 131, 146, 161, 176, 192, 177,
+    162, 147, 132, 117, 102,  87,  72,  57,  42,  27,  12,  13,  28,  43,  58,  73,
+     88, 103, 118, 133, 148, 163, 178, 193, 208, 224, 209, 194, 179, 164, 149, 134,
+    119, 104,  89,  74,  59,  44,  29,  14,  15,  30,  45,  60,  75,  90, 105, 120,
+    135, 150, 165, 180, 195, 210, 225, 240, 241, 226, 211, 196, 181, 166, 151, 136,
+    121, 106,  91,  76,  61,  46,  31,  47,  62,  77,  92, 107, 122, 137, 152, 167,
+    182, 197, 212, 227, 242, 243, 228, 213, 198, 183, 168, 153, 138, 123, 108,  93,
+     78,  63,  79,  94, 109, 124, 139, 154, 169, 184, 199, 214, 229, 244, 245, 230,
+    215, 200, 185, 170, 155, 140, 125, 110,  95, 111, 126, 141, 156, 171, 186, 201,
+    216, 231, 246, 247, 232, 217, 202, 187, 172, 157, 142, 127, 143, 158, 173, 188,
+    203, 218, 233, 248, 249, 234, 219, 204, 189, 174, 159, 175, 190, 205, 220, 235,
+    250, 251, 236, 221, 206, 191, 207, 222, 237, 252, 253, 238, 223, 239, 254, 255,
+};
+
+
+/* Array indices are identical to previously-existing CONTEXT_NODE indices */
+
+const vp9_tree_index vp9_coef_tree[22] = {    /* corresponding _CONTEXT_NODEs */
+  -DCT_EOB_TOKEN, 2,                          /* 0 = EOB */
+  -ZERO_TOKEN, 4,                             /* 1 = ZERO */
+  -ONE_TOKEN, 6,                              /* 2 = ONE */
+  8, 12,                                      /* 3 = LOW_VAL */
+  -TWO_TOKEN, 10,                             /* 4 = TWO */
+  -THREE_TOKEN, -FOUR_TOKEN,                  /* 5 = THREE */
+  14, 16,                                     /* 6 = HIGH_LOW */
+  -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2,     /* 7 = CAT_ONE */
+  18, 20,                                     /* 8 = CAT_THREEFOUR */
+  -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4,     /* 9 = CAT_THREE */
+  -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6      /* 10 = CAT_FIVE */
+};
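+/* Reading a token walks the tree two entries at a time: a non-negative
+ * entry is the index of the next node pair, a negative entry is the
+ * negated token.  A minimal decode sketch (illustrative only; the reader
+ * type and vp9_read() are assumed names, not defined in this file):
+ *
+ *   int read_token(vp9_reader *r, const vp9_prob p[ENTROPY_NODES]) {
+ *     vp9_tree_index i = 0;
+ *     while ((i = vp9_coef_tree[i + vp9_read(r, p[i >> 1])]) > 0)
+ *       continue;
+ *     return -i;   // leaves store the negated token value
+ *   }
+ */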
+
+struct vp9_token_struct vp9_coef_encodings[MAX_ENTROPY_TOKENS];
+
+/* Trees for extra bits.  Probabilities are constant and
+   do not depend on previously encoded bits */
+
+static const Prob Pcat1[] = { 159};
+static const Prob Pcat2[] = { 165, 145};
+static const Prob Pcat3[] = { 173, 148, 140};
+static const Prob Pcat4[] = { 176, 155, 140, 135};
+static const Prob Pcat5[] = { 180, 157, 141, 134, 130};
+static const Prob Pcat6[] =
+{ 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129};
+
+static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[26];
+
+static void init_bit_tree(vp9_tree_index *p, int n) {
+  int i = 0;
+
+  while (++i < n) {
+    p[0] = p[1] = i << 1;
+    p += 2;
+  }
+
+  p[0] = p[1] = 0;
+}
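+/* init_bit_tree() builds a degenerate tree of n levels in which both
+ * children of every node lead to the next level, so exactly n
+ * probability-coded bits are consumed regardless of their values.
+ * For example, init_bit_tree(cat2, 2) produces { 2, 2, 0, 0 }. */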
+
+static void init_bit_trees(void) {
+  init_bit_tree(cat1, 1);
+  init_bit_tree(cat2, 2);
+  init_bit_tree(cat3, 3);
+  init_bit_tree(cat4, 4);
+  init_bit_tree(cat5, 5);
+  init_bit_tree(cat6, 13);
+}
+
+vp9_extra_bit_struct vp9_extra_bits[12] = {
+  { 0, 0, 0, 0},
+  { 0, 0, 0, 1},
+  { 0, 0, 0, 2},
+  { 0, 0, 0, 3},
+  { 0, 0, 0, 4},
+  { cat1, Pcat1, 1, 5},
+  { cat2, Pcat2, 2, 7},
+  { cat3, Pcat3, 3, 11},
+  { cat4, Pcat4, 4, 19},
+  { cat5, Pcat5, 5, 35},
+  { cat6, Pcat6, 13, 67},
+  { 0, 0, 0, 0}
+};
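+
+/* A token plus its extra bits determines the coefficient magnitude:
+   base_val plus the Len extra bits read with the Pcat probabilities,
+   followed by a sign bit.  For example, DCT_VAL_CATEGORY3 has base_val
+   11 and 3 extra bits, covering the magnitudes 11..18 noted for that
+   token in entropy.h. */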
+
+#include "default_coef_probs.h"
+
+void vp9_default_coef_probs(VP9_COMMON *pc) {
+  vpx_memcpy(pc->fc.coef_probs, default_coef_probs,
+             sizeof(pc->fc.coef_probs));
+  vpx_memcpy(pc->fc.hybrid_coef_probs, default_hybrid_coef_probs,
+             sizeof(pc->fc.hybrid_coef_probs));
+
+  vpx_memcpy(pc->fc.coef_probs_8x8, default_coef_probs_8x8,
+             sizeof(pc->fc.coef_probs_8x8));
+  vpx_memcpy(pc->fc.hybrid_coef_probs_8x8, default_hybrid_coef_probs_8x8,
+             sizeof(pc->fc.hybrid_coef_probs_8x8));
+
+  vpx_memcpy(pc->fc.coef_probs_16x16, default_coef_probs_16x16,
+             sizeof(pc->fc.coef_probs_16x16));
+  vpx_memcpy(pc->fc.hybrid_coef_probs_16x16,
+             default_hybrid_coef_probs_16x16,
+             sizeof(pc->fc.hybrid_coef_probs_16x16));
+}
+
+void vp9_coef_tree_initialize(void) {
+  init_bit_trees();
+  vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree);
+}
+
+// #define COEF_COUNT_TESTING
+
+#define COEF_COUNT_SAT 24
+#define COEF_MAX_UPDATE_FACTOR 112
+#define COEF_COUNT_SAT_KEY 24
+#define COEF_MAX_UPDATE_FACTOR_KEY 112
+#define COEF_COUNT_SAT_AFTER_KEY 24
+#define COEF_MAX_UPDATE_FACTOR_AFTER_KEY 128
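+
+/* The adaptation below blends the previous probability with the one
+   implied by this frame's counts, weighted by how well-observed the
+   branch is: factor = update_factor * min(count, count_sat) / count_sat
+   and prob = (pre_prob * (256 - factor) + new_prob * factor + 128) >> 8,
+   clamped to [1, 255].  For example, pre_prob 128, new_prob 192,
+   count 12, count_sat 24 and update_factor 112 give factor 56 and an
+   adapted probability of (128 * 200 + 192 * 56 + 128) >> 8 = 142. */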
+
+void vp9_adapt_coef_probs(VP9_COMMON *cm) {
+  int t, i, j, k, count;
+  unsigned int branch_ct[ENTROPY_NODES][2];
+  vp9_prob coef_probs[ENTROPY_NODES];
+  int update_factor; /* denominator 256 */
+  int factor;
+  int count_sat;
+
+  // printf("Frame type: %d\n", cm->frame_type);
+  if (cm->frame_type == KEY_FRAME) {
+    update_factor = COEF_MAX_UPDATE_FACTOR_KEY;
+    count_sat = COEF_COUNT_SAT_KEY;
+  } else if (cm->last_frame_type == KEY_FRAME) {
+    update_factor = COEF_MAX_UPDATE_FACTOR_AFTER_KEY;  /* adapt quickly */
+    count_sat = COEF_COUNT_SAT_AFTER_KEY;
+  } else {
+    update_factor = COEF_MAX_UPDATE_FACTOR;
+    count_sat = COEF_COUNT_SAT;
+  }
+
+#ifdef COEF_COUNT_TESTING
+  {
+    printf("static const unsigned int\ncoef_counts"
+           "[BLOCK_TYPES] [COEF_BANDS]"
+           "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {\n");
+    for (i = 0; i < BLOCK_TYPES; ++i) {
+      printf("  {\n");
+      for (j = 0; j < COEF_BANDS; ++j) {
+        printf("    {\n");
+        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+          printf("      {");
+          for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+            printf("%d, ", cm->fc.coef_counts[i][j][k][t]);
+          printf("},\n");
+        }
+        printf("    },\n");
+      }
+      printf("  },\n");
+    }
+    printf("};\n");
+    printf("static const unsigned int\ncoef_counts_8x8"
+           "[BLOCK_TYPES_8X8] [COEF_BANDS]"
+           "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {\n");
+    for (i = 0; i < BLOCK_TYPES_8X8; ++i) {
+      printf("  {\n");
+      for (j = 0; j < COEF_BANDS; ++j) {
+        printf("    {\n");
+        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+          printf("      {");
+          for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+            printf("%d, ", cm->fc.coef_counts_8x8[i][j][k][t]);
+          printf("},\n");
+        }
+        printf("    },\n");
+      }
+      printf("  },\n");
+    }
+    printf("};\n");
+    printf("static const unsigned int\nhybrid_coef_counts"
+           "[BLOCK_TYPES] [COEF_BANDS]"
+           "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {\n");
+    for (i = 0; i < BLOCK_TYPES; ++i) {
+      printf("  {\n");
+      for (j = 0; j < COEF_BANDS; ++j) {
+        printf("    {\n");
+        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+          printf("      {");
+          for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+            printf("%d, ", cm->fc.hybrid_coef_counts[i][j][k][t]);
+          printf("},\n");
+        }
+        printf("    },\n");
+      }
+      printf("  },\n");
+    }
+    printf("};\n");
+  }
+#endif
+
+  for (i = 0; i < BLOCK_TYPES; ++i)
+    for (j = 0; j < COEF_BANDS; ++j)
+      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+        /* Contexts 3 and above cannot occur in the first coded band,
+           where the context is the 0-2 count of nonzero neighboring
+           blocks (see entropy.h); the same check recurs below. */
+        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+          continue;
+        vp9_tree_probs_from_distribution(
+          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+          coef_probs, branch_ct, cm->fc.coef_counts [i][j][k],
+          256, 1);
+        for (t = 0; t < ENTROPY_NODES; ++t) {
+          int prob;
+          count = branch_ct[t][0] + branch_ct[t][1];
+          count = count > count_sat ? count_sat : count;
+          factor = (update_factor * count / count_sat);
+          prob = ((int)cm->fc.pre_coef_probs[i][j][k][t] * (256 - factor) +
+                  (int)coef_probs[t] * factor + 128) >> 8;
+          if (prob <= 0) cm->fc.coef_probs[i][j][k][t] = 1;
+          else if (prob > 255) cm->fc.coef_probs[i][j][k][t] = 255;
+          else cm->fc.coef_probs[i][j][k][t] = prob;
+        }
+      }
+
+  for (i = 0; i < BLOCK_TYPES; ++i)
+    for (j = 0; j < COEF_BANDS; ++j)
+      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+          continue;
+        vp9_tree_probs_from_distribution(
+          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+          coef_probs, branch_ct, cm->fc.hybrid_coef_counts [i][j][k],
+          256, 1);
+        for (t = 0; t < ENTROPY_NODES; ++t) {
+          int prob;
+          count = branch_ct[t][0] + branch_ct[t][1];
+          count = count > count_sat ? count_sat : count;
+          factor = (update_factor * count / count_sat);
+          prob = ((int)cm->fc.pre_hybrid_coef_probs[i][j][k][t] * (256 - factor) +
+                  (int)coef_probs[t] * factor + 128) >> 8;
+          if (prob <= 0) cm->fc.hybrid_coef_probs[i][j][k][t] = 1;
+          else if (prob > 255) cm->fc.hybrid_coef_probs[i][j][k][t] = 255;
+          else cm->fc.hybrid_coef_probs[i][j][k][t] = prob;
+        }
+      }
+
+  for (i = 0; i < BLOCK_TYPES_8X8; ++i)
+    for (j = 0; j < COEF_BANDS; ++j)
+      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+          continue;
+        vp9_tree_probs_from_distribution(
+          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+          coef_probs, branch_ct, cm->fc.coef_counts_8x8 [i][j][k],
+          256, 1);
+        for (t = 0; t < ENTROPY_NODES; ++t) {
+          int prob;
+          count = branch_ct[t][0] + branch_ct[t][1];
+          count = count > count_sat ? count_sat : count;
+          factor = (update_factor * count / count_sat);
+          prob = ((int)cm->fc.pre_coef_probs_8x8[i][j][k][t] * (256 - factor) +
+                  (int)coef_probs[t] * factor + 128) >> 8;
+          if (prob <= 0) cm->fc.coef_probs_8x8[i][j][k][t] = 1;
+          else if (prob > 255) cm->fc.coef_probs_8x8[i][j][k][t] = 255;
+          else cm->fc.coef_probs_8x8[i][j][k][t] = prob;
+        }
+      }
+
+  for (i = 0; i < BLOCK_TYPES_8X8; ++i)
+    for (j = 0; j < COEF_BANDS; ++j)
+      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+          continue;
+        vp9_tree_probs_from_distribution(
+          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+          coef_probs, branch_ct, cm->fc.hybrid_coef_counts_8x8 [i][j][k],
+          256, 1);
+        for (t = 0; t < ENTROPY_NODES; ++t) {
+          int prob;
+          count = branch_ct[t][0] + branch_ct[t][1];
+          count = count > count_sat ? count_sat : count;
+          factor = (update_factor * count / count_sat);
+          prob = ((int)cm->fc.pre_hybrid_coef_probs_8x8[i][j][k][t] *
+                  (256 - factor) +
+                  (int)coef_probs[t] * factor + 128) >> 8;
+          if (prob <= 0) cm->fc.hybrid_coef_probs_8x8[i][j][k][t] = 1;
+          else if (prob > 255) cm->fc.hybrid_coef_probs_8x8[i][j][k][t] = 255;
+          else cm->fc.hybrid_coef_probs_8x8[i][j][k][t] = prob;
+        }
+      }
+
+  for (i = 0; i < BLOCK_TYPES_16X16; ++i)
+    for (j = 0; j < COEF_BANDS; ++j)
+      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+          continue;
+        vp9_tree_probs_from_distribution(
+          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+          coef_probs, branch_ct, cm->fc.coef_counts_16x16[i][j][k], 256, 1);
+        for (t = 0; t < ENTROPY_NODES; ++t) {
+          int prob;
+          count = branch_ct[t][0] + branch_ct[t][1];
+          count = count > count_sat ? count_sat : count;
+          factor = (update_factor * count / count_sat);
+          prob = ((int)cm->fc.pre_coef_probs_16x16[i][j][k][t] *
+                  (256 - factor) +
+                  (int)coef_probs[t] * factor + 128) >> 8;
+          if (prob <= 0) cm->fc.coef_probs_16x16[i][j][k][t] = 1;
+          else if (prob > 255) cm->fc.coef_probs_16x16[i][j][k][t] = 255;
+          else cm->fc.coef_probs_16x16[i][j][k][t] = prob;
+        }
+      }
+
+  for (i = 0; i < BLOCK_TYPES_16X16; ++i)
+    for (j = 0; j < COEF_BANDS; ++j)
+      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+          continue;
+        vp9_tree_probs_from_distribution(
+          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+          coef_probs, branch_ct, cm->fc.hybrid_coef_counts_16x16[i][j][k], 256, 1);
+        for (t = 0; t < ENTROPY_NODES; ++t) {
+          int prob;
+          count = branch_ct[t][0] + branch_ct[t][1];
+          count = count > count_sat ? count_sat : count;
+          factor = (update_factor * count / count_sat);
+          prob = ((int)cm->fc.pre_hybrid_coef_probs_16x16[i][j][k][t] * (256 - factor) +
+                  (int)coef_probs[t] * factor + 128) >> 8;
+          if (prob <= 0) cm->fc.hybrid_coef_probs_16x16[i][j][k][t] = 1;
+          else if (prob > 255) cm->fc.hybrid_coef_probs_16x16[i][j][k][t] = 255;
+          else cm->fc.hybrid_coef_probs_16x16[i][j][k][t] = prob;
+        }
+      }
+}
--- /dev/null
+++ b/vp9/common/entropy.h
@@ -1,0 +1,112 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ENTROPY_H
+#define __INC_ENTROPY_H
+
+#include "treecoder.h"
+#include "blockd.h"
+#include "common.h"
+#include "coefupdateprobs.h"
+
+extern const int vp9_i8x8_block[4];
+
+/* Coefficient token alphabet */
+
+#define ZERO_TOKEN              0       /* 0         Extra Bits 0+0 */
+#define ONE_TOKEN               1       /* 1         Extra Bits 0+1 */
+#define TWO_TOKEN               2       /* 2         Extra Bits 0+1 */
+#define THREE_TOKEN             3       /* 3         Extra Bits 0+1 */
+#define FOUR_TOKEN              4       /* 4         Extra Bits 0+1 */
+#define DCT_VAL_CATEGORY1       5       /* 5-6       Extra Bits 1+1 */
+#define DCT_VAL_CATEGORY2       6       /* 7-10      Extra Bits 2+1 */
+#define DCT_VAL_CATEGORY3       7       /* 11-18     Extra Bits 3+1 */
+#define DCT_VAL_CATEGORY4       8       /* 19-34     Extra Bits 4+1 */
+#define DCT_VAL_CATEGORY5       9       /* 35-66     Extra Bits 5+1 */
+#define DCT_VAL_CATEGORY6       10      /* 67+       Extra Bits 13+1 */
+#define DCT_EOB_TOKEN           11      /* EOB       Extra Bits 0+0 */
+#define MAX_ENTROPY_TOKENS 12
+#define ENTROPY_NODES 11
+#define EOSB_TOKEN              127     /* Not signalled, encoder only */
+
+extern const vp9_tree_index vp9_coef_tree[];
+
+extern struct vp9_token_struct vp9_coef_encodings[MAX_ENTROPY_TOKENS];
+
+typedef struct {
+  vp9_tree_p tree;
+  const vp9_prob *prob;
+  int Len;
+  int base_val;
+} vp9_extra_bit_struct;
+
+extern vp9_extra_bit_struct vp9_extra_bits[12];    /* indexed by token value */
+
+#define PROB_UPDATE_BASELINE_COST   7
+
+#define MAX_PROB                255
+#define DCT_MAX_VALUE           8192
+
+/* Coefficients are predicted via a 3-dimensional probability table. */
+
+/* Outside dimension.  0 = Y no DC, 1 = Y2, 2 = UV, 3 = Y with DC */
+#define BLOCK_TYPES 4
+
+#define BLOCK_TYPES_8X8 4
+
+#define BLOCK_TYPES_16X16 4
+
+/* Middle dimension is a coarsening of the coefficient's
+   position within the 4x4 DCT. */
+
+#define COEF_BANDS 8
+extern DECLARE_ALIGNED(16, const int, vp9_coef_bands[16]);
+extern DECLARE_ALIGNED(64, const int, vp9_coef_bands_8x8[64]);
+extern DECLARE_ALIGNED(16, const int, vp9_coef_bands_16x16[256]);
+
+/* Inside dimension is a 3-valued measure of nearby complexity, that is,
+   the extent to which nearby coefficients are nonzero.  For the first
+   coefficient (DC, unless block type is 0), we look at the (already
+   encoded) blocks above and to the left of the current block.  The
+   context index is then the number (0, 1, or 2) of these blocks having
+   nonzero coefficients.  After decoding a coefficient, the measure is
+   roughly the magnitude of the most recently decoded coefficient
+   (0 for 0, 1 for 1, 2 for >1).  Note that the intuitive meaning of
+   this measure changes as coefficients are decoded: prior to the first
+   token, a zero means the neighboring blocks are empty, while after the
+   first token, because of the use of end-of-block, a zero means we just
+   decoded a zero and hence guarantees that a nonzero coefficient will
+   appear later in this block.  This shift in meaning is harmless,
+   because the context also depends on the coefficient band, and zigzag
+   positions 0, 1, and 2 fall in distinct bands. */
+
+/*# define DC_TOKEN_CONTEXTS        3*/ /* 00, 0!0, !0!0 */
+#define PREV_COEF_CONTEXTS       4
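+
+/* For example, if the blocks above and to the left of the current block
+   both contain nonzero coefficients, the first token is decoded with
+   context 2; after a coefficient of magnitude 3 is decoded, the next
+   token again uses context 2, since any magnitude above one maps to 2. */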
+
+#define SUBEXP_PARAM                4   /* Subexponential code parameter */
+#define MODULUS_PARAM               13  /* Modulus parameter */
+
+extern DECLARE_ALIGNED(16, const unsigned char, vp9_prev_token_class[MAX_ENTROPY_TOKENS]);
+
+struct VP9Common;
+void vp9_default_coef_probs(struct VP9Common *);
+extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d[16]);
+
+extern DECLARE_ALIGNED(16, const int, vp9_col_scan[16]);
+extern DECLARE_ALIGNED(16, const int, vp9_row_scan[16]);
+
+extern DECLARE_ALIGNED(64, const int, vp9_default_zig_zag1d_8x8[64]);
+void vp9_coef_tree_initialize(void);
+
+extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]);
+void vp9_adapt_coef_probs(struct VP9Common *);
+
+#endif
--- /dev/null
+++ b/vp9/common/entropymode.c
@@ -1,0 +1,614 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "onyxc_int.h"
+#include "modecont.h"
+#include "vpx_mem/vpx_mem.h"
+
+
+static const unsigned int kf_y_mode_cts[8][VP9_YMODES] = {
+  /* DC V   H  D45 135 117 153 D27 D63 TM i8x8 BPRED */
+  {12,  6,  5,  5,  5,  5,  5,  5,  5,  2, 22, 200},
+  {25, 13, 13,  7,  7,  7,  7,  7,  7,  6, 27, 160},
+  {31, 17, 18,  8,  8,  8,  8,  8,  8,  9, 26, 139},
+  {40, 22, 23,  8,  8,  8,  8,  8,  8, 12, 27, 116},
+  {53, 26, 28,  8,  8,  8,  8,  8,  8, 13, 26,  94},
+  {68, 33, 35,  8,  8,  8,  8,  8,  8, 17, 20,  68},
+  {78, 38, 38,  8,  8,  8,  8,  8,  8, 19, 16,  52},
+  {89, 42, 42,  8,  8,  8,  8,  8,  8, 21, 12,  34},
+};
+
+static const unsigned int y_mode_cts  [VP9_YMODES] = {
+  /* DC V   H  D45 135 117 153 D27 D63 TM i8x8 BPRED */
+  98, 19, 15, 14, 14, 14, 14, 12, 12, 13, 16, 70
+};
+
+static const unsigned int uv_mode_cts [VP9_YMODES] [VP9_UV_MODES] = {
+  /* DC   V   H  D45 135 117 153 D27 D63 TM */
+  { 200, 15, 15, 10, 10, 10, 10, 10, 10,  6}, /* DC */
+  { 130, 75, 10, 10, 10, 10, 10, 10, 10,  6}, /* V */
+  { 130, 10, 75, 10, 10, 10, 10, 10, 10,  6}, /* H */
+  { 130, 15, 10, 75, 10, 10, 10, 10, 10,  6}, /* D45 */
+  { 150, 15, 10, 10, 75, 10, 10, 10, 10,  6}, /* D135 */
+  { 150, 15, 10, 10, 10, 75, 10, 10, 10,  6}, /* D117 */
+  { 150, 15, 10, 10, 10, 10, 75, 10, 10,  6}, /* D153 */
+  { 150, 15, 10, 10, 10, 10, 10, 75, 10,  6}, /* D27 */
+  { 150, 15, 10, 10, 10, 10, 10, 10, 75,  6}, /* D63 */
+  { 160, 30, 30, 10, 10, 10, 10, 10, 10, 16}, /* TM */
+  { 132, 46, 40, 10, 10, 10, 10, 10, 10, 18}, /* i8x8 - never used */
+  { 150, 35, 41, 10, 10, 10, 10, 10, 10, 10}, /* BPRED */
+};
+
+static const unsigned int i8x8_mode_cts  [VP9_I8X8_MODES] = {
+  /* DC V   H D45 135 117 153 D27 D63  TM */
+  73, 49, 61, 30, 30, 30, 30, 30, 30, 13
+};
+
+static const unsigned int kf_uv_mode_cts [VP9_YMODES] [VP9_UV_MODES] = {
+  // DC   V   H  D45 135 117 153 D27 D63 TM
+  { 160, 24, 24, 20, 20, 20, 20, 20, 20,  8}, /* DC */
+  { 102, 64, 30, 20, 20, 20, 20, 20, 20, 10}, /* V */
+  { 102, 30, 64, 20, 20, 20, 20, 20, 20, 10}, /* H */
+  { 102, 33, 20, 64, 20, 20, 20, 20, 20, 14}, /* D45 */
+  { 102, 33, 20, 20, 64, 20, 20, 20, 20, 14}, /* D135 */
+  { 122, 33, 20, 20, 20, 64, 20, 20, 20, 14}, /* D117 */
+  { 102, 33, 20, 20, 20, 20, 64, 20, 20, 14}, /* D153 */
+  { 102, 33, 20, 20, 20, 20, 20, 64, 20, 14}, /* D27 */
+  { 102, 33, 20, 20, 20, 20, 20, 20, 64, 14}, /* D63 */
+  { 132, 36, 30, 20, 20, 20, 20, 20, 20, 18}, /* TM */
+  { 122, 41, 35, 20, 20, 20, 20, 20, 20, 18}, /* i8x8 - never used */
+  { 122, 41, 35, 20, 20, 20, 20, 20, 20, 18}, /* BPRED */
+};
+
+static const unsigned int bmode_cts[VP9_BINTRAMODES] = {
+  /* DC    TM     VE     HE   LD    RD    VR    VL    HD    HU */
+  43891, 17694, 10036, 3920, 3363, 2546, 5119, 3221, 2471, 1723
+};
+
+typedef enum {
+  SUBMVREF_NORMAL,
+  SUBMVREF_LEFT_ZED,
+  SUBMVREF_ABOVE_ZED,
+  SUBMVREF_LEFT_ABOVE_SAME,
+  SUBMVREF_LEFT_ABOVE_ZED
+} submvref_t;
+
+int vp9_mv_cont(const int_mv *l, const int_mv *a) {
+  int lez = (l->as_int == 0);
+  int aez = (a->as_int == 0);
+  int lea = (l->as_int == a->as_int);
+
+  if (lea && lez)
+    return SUBMVREF_LEFT_ABOVE_ZED;
+
+  if (lea)
+    return SUBMVREF_LEFT_ABOVE_SAME;
+
+  if (aez)
+    return SUBMVREF_ABOVE_ZED;
+
+  if (lez)
+    return SUBMVREF_LEFT_ZED;
+
+  return SUBMVREF_NORMAL;
+}
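+
+/* Example: a zero left mv and a nonzero above mv classify as
+   SUBMVREF_LEFT_ZED, while identical nonzero vectors classify as
+   SUBMVREF_LEFT_ABOVE_SAME.  The earlier checks take priority, so two
+   zero vectors yield SUBMVREF_LEFT_ABOVE_ZED rather than either
+   single-sided ZED case. */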
+
+const vp9_prob vp9_sub_mv_ref_prob [VP9_SUBMVREFS - 1] = { 180, 162, 25};
+
+const vp9_prob vp9_sub_mv_ref_prob2 [SUBMVREF_COUNT][VP9_SUBMVREFS - 1] = {
+  { 147, 136, 18 },
+  { 106, 145, 1  },
+  { 179, 121, 1  },
+  { 223, 1, 34 },
+  { 208, 1, 1  }
+};
+
+vp9_mbsplit vp9_mbsplits [VP9_NUMMBSPLITS] = {
+  {
+    0,  0,  0,  0,
+    0,  0,  0,  0,
+    1,  1,  1,  1,
+    1,  1,  1,  1,
+  }, {
+    0,  0,  1,  1,
+    0,  0,  1,  1,
+    0,  0,  1,  1,
+    0,  0,  1,  1,
+  }, {
+    0,  0,  1,  1,
+    0,  0,  1,  1,
+    2,  2,  3,  3,
+    2,  2,  3,  3,
+  }, {
+    0,  1,  2,  3,
+    4,  5,  6,  7,
+    8,  9,  10, 11,
+    12, 13, 14, 15,
+  },
+};
+
+const int vp9_mbsplit_count [VP9_NUMMBSPLITS] = { 2, 2, 4, 16};
+
+const vp9_prob vp9_mbsplit_probs [VP9_NUMMBSPLITS - 1] = { 110, 111, 150};
+
+/* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */
+
+const vp9_tree_index vp9_bmode_tree[VP9_BINTRAMODES * 2 - 2] = {  /* INTRAMODECONTEXTNODE value */
+  -B_DC_PRED, 2,                          /* 0 = DC_NODE */
+  -B_TM_PRED, 4,                          /* 1 = TM_NODE */
+  -B_VE_PRED, 6,                          /* 2 = VE_NODE */
+  8, 12,                                  /* 3 = COM_NODE */
+  -B_HE_PRED, 10,                         /* 4 = HE_NODE */
+  -B_RD_PRED, -B_VR_PRED,                 /* 5 = RD_NODE */
+  -B_LD_PRED, 14,                         /* 6 = LD_NODE */
+  -B_VL_PRED, 16,                         /* 7 = VL_NODE */
+  -B_HD_PRED, -B_HU_PRED                  /* 8 = HD_NODE */
+};
+
+/* Again, these trees use the same probability indices as their
+   explicitly-programmed predecessors. */
+const vp9_tree_index vp9_ymode_tree[VP9_YMODES * 2 - 2] = {
+  2, 14,
+  -DC_PRED, 4,
+  6, 8,
+  -D45_PRED, -D135_PRED,
+  10, 12,
+  -D117_PRED, -D153_PRED,
+  -D27_PRED, -D63_PRED,
+  16, 18,
+  -V_PRED, -H_PRED,
+  -TM_PRED, 20,
+  -B_PRED, -I8X8_PRED
+};
+
+const vp9_tree_index vp9_kf_ymode_tree[VP9_YMODES * 2 - 2] = {
+  2, 14,
+  -DC_PRED, 4,
+  6, 8,
+  -D45_PRED, -D135_PRED,
+  10, 12,
+  -D117_PRED, -D153_PRED,
+  -D27_PRED, -D63_PRED,
+  16, 18,
+  -V_PRED, -H_PRED,
+  -TM_PRED, 20,
+  -B_PRED, -I8X8_PRED
+};
+
+const vp9_tree_index vp9_i8x8_mode_tree[VP9_I8X8_MODES * 2 - 2] = {
+  2, 14,
+  -DC_PRED, 4,
+  6, 8,
+  -D45_PRED, -D135_PRED,
+  10, 12,
+  -D117_PRED, -D153_PRED,
+  -D27_PRED, -D63_PRED,
+  -V_PRED, 16,
+  -H_PRED, -TM_PRED
+};
+
+const vp9_tree_index vp9_uv_mode_tree[VP9_UV_MODES * 2 - 2] = {
+  2, 14,
+  -DC_PRED, 4,
+  6, 8,
+  -D45_PRED, -D135_PRED,
+  10, 12,
+  -D117_PRED, -D153_PRED,
+  -D27_PRED, -D63_PRED,
+  -V_PRED, 16,
+  -H_PRED, -TM_PRED
+};
+
+const vp9_tree_index vp9_mbsplit_tree[6] = {
+  -PARTITIONING_4X4,   2,
+  -PARTITIONING_8X8,   4,
+  -PARTITIONING_16X8, -PARTITIONING_8X16,
+};
+
+const vp9_tree_index vp9_mv_ref_tree[8] = {
+  -ZEROMV, 2,
+  -NEARESTMV, 4,
+  -NEARMV, 6,
+  -NEWMV, -SPLITMV
+};
+
+#if CONFIG_SUPERBLOCKS
+const vp9_tree_index vp9_sb_mv_ref_tree[6] = {
+  -ZEROMV, 2,
+  -NEARESTMV, 4,
+  -NEARMV, -NEWMV
+};
+#endif
+
+const vp9_tree_index vp9_sub_mv_ref_tree[6] = {
+  -LEFT4X4, 2,
+  -ABOVE4X4, 4,
+  -ZERO4X4, -NEW4X4
+};
+
+struct vp9_token_struct vp9_bmode_encodings   [VP9_BINTRAMODES];
+struct vp9_token_struct vp9_ymode_encodings   [VP9_YMODES];
+#if CONFIG_SUPERBLOCKS
+struct vp9_token_struct vp9_sb_kf_ymode_encodings [VP9_I32X32_MODES];
+#endif
+struct vp9_token_struct vp9_kf_ymode_encodings [VP9_YMODES];
+struct vp9_token_struct vp9_uv_mode_encodings  [VP9_UV_MODES];
+struct vp9_token_struct vp9_i8x8_mode_encodings  [VP9_I8X8_MODES];
+struct vp9_token_struct vp9_mbsplit_encodings [VP9_NUMMBSPLITS];
+
+struct vp9_token_struct vp9_mv_ref_encoding_array    [VP9_MVREFS];
+#if CONFIG_SUPERBLOCKS
+struct vp9_token_struct vp9_sb_mv_ref_encoding_array  [VP9_MVREFS];
+#endif
+struct vp9_token_struct vp9_sub_mv_ref_encoding_array [VP9_SUBMVREFS];
+
+void vp9_init_mbmode_probs(VP9_COMMON *x) {
+  unsigned int bct[VP9_YMODES][2];  /* sized for Y modes, the larger alphabet */
+
+  vp9_tree_probs_from_distribution(VP9_YMODES, vp9_ymode_encodings,
+                                   vp9_ymode_tree, x->fc.ymode_prob,
+                                   bct, y_mode_cts, 256, 1);
+  {
+    int i;
+    for (i = 0; i < 8; i++) {
+      vp9_tree_probs_from_distribution(VP9_YMODES, vp9_kf_ymode_encodings,
+                                       vp9_kf_ymode_tree, x->kf_ymode_prob[i],
+                                       bct, kf_y_mode_cts[i], 256, 1);
+#if CONFIG_SUPERBLOCKS
+      vp9_tree_probs_from_distribution(VP9_I32X32_MODES,
+                                       vp9_sb_kf_ymode_encodings,
+                                       vp9_sb_ymode_tree,
+                                       x->sb_kf_ymode_prob[i], bct,
+                                       kf_y_mode_cts[i], 256, 1);
+#endif
+    }
+  }
+  {
+    int i;
+    for (i = 0; i < VP9_YMODES; i++) {
+      vp9_tree_probs_from_distribution(VP9_UV_MODES, vp9_uv_mode_encodings,
+                                       vp9_uv_mode_tree, x->kf_uv_mode_prob[i],
+                                       bct, kf_uv_mode_cts[i], 256, 1);
+      vp9_tree_probs_from_distribution(VP9_UV_MODES, vp9_uv_mode_encodings,
+                                       vp9_uv_mode_tree, x->fc.uv_mode_prob[i],
+                                       bct, uv_mode_cts[i], 256, 1);
+    }
+  }
+
+  vp9_tree_probs_from_distribution(VP9_I8X8_MODES, vp9_i8x8_mode_encodings,
+                                   vp9_i8x8_mode_tree, x->fc.i8x8_mode_prob,
+                                   bct, i8x8_mode_cts, 256, 1);
+
+  vpx_memcpy(x->fc.sub_mv_ref_prob, vp9_sub_mv_ref_prob2,
+             sizeof(vp9_sub_mv_ref_prob2));
+  vpx_memcpy(x->fc.mbsplit_prob, vp9_mbsplit_probs, sizeof(vp9_mbsplit_probs));
+  vpx_memcpy(x->fc.switchable_interp_prob, vp9_switchable_interp_prob,
+             sizeof(vp9_switchable_interp_prob));
+}
+
+
+static void intra_bmode_probs_from_distribution(
+  vp9_prob p [VP9_BINTRAMODES - 1],
+  unsigned int branch_ct [VP9_BINTRAMODES - 1] [2],
+  const unsigned int events [VP9_BINTRAMODES]) {
+  vp9_tree_probs_from_distribution(VP9_BINTRAMODES, vp9_bmode_encodings,
+                                   vp9_bmode_tree, p, branch_ct,
+                                   events, 256, 1);
+}
+
+void vp9_default_bmode_probs(vp9_prob p [VP9_BINTRAMODES - 1]) {
+  unsigned int branch_ct [VP9_BINTRAMODES - 1] [2];
+  intra_bmode_probs_from_distribution(p, branch_ct, bmode_cts);
+}
+
+void vp9_kf_default_bmode_probs(vp9_prob p[VP9_BINTRAMODES][VP9_BINTRAMODES]
+                                          [VP9_BINTRAMODES - 1]) {
+  unsigned int branch_ct[VP9_BINTRAMODES - 1][2];
+  int i, j;
+
+  for (i = 0; i < VP9_BINTRAMODES; i++) {
+    for (j = 0; j < VP9_BINTRAMODES; j++) {
+      intra_bmode_probs_from_distribution(
+        p[i][j], branch_ct, vp9_kf_default_bmode_counts[i][j]);
+    }
+  }
+}
+
+#if VP9_SWITCHABLE_FILTERS == 3
+const vp9_tree_index vp9_switchable_interp_tree[VP9_SWITCHABLE_FILTERS*2-2] = {
+  -0, 2,
+  -1, -2
+};
+struct vp9_token_struct vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS];
+const INTERPOLATIONFILTERTYPE vp9_switchable_interp[VP9_SWITCHABLE_FILTERS] = {
+  EIGHTTAP, SIXTAP, EIGHTTAP_SHARP};
+const int vp9_switchable_interp_map[SWITCHABLE + 1] = {1, -1, 0, 2, -1};
+const vp9_prob vp9_switchable_interp_prob [VP9_SWITCHABLE_FILTERS+1]
+                                          [VP9_SWITCHABLE_FILTERS-1] = {
+  {248, 192}, { 32, 248}, { 32,  32}, {192, 160}
+};
+#elif VP9_SWITCHABLE_FILTERS == 2
+const vp9_tree_index vp9_switchable_interp_tree[VP9_SWITCHABLE_FILTERS*2-2] = {
+  -0, -1,
+};
+struct vp9_token_struct vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS];
+const vp9_prob vp9_switchable_interp_prob [VP9_SWITCHABLE_FILTERS+1]
+                                          [VP9_SWITCHABLE_FILTERS-1] = {
+  {248},
+  { 64},
+  {192},
+};
+const INTERPOLATIONFILTERTYPE vp9_switchable_interp[VP9_SWITCHABLE_FILTERS] = {
+  EIGHTTAP, EIGHTTAP_SHARP};
+const int vp9_switchable_interp_map[SWITCHABLE + 1] = {-1, -1, 0, 1, -1};  /* EIGHTTAP, EIGHTTAP_SHARP */
+#endif
+
+void vp9_entropy_mode_init(void) {
+  vp9_tokens_from_tree(vp9_bmode_encodings,   vp9_bmode_tree);
+  vp9_tokens_from_tree(vp9_ymode_encodings,   vp9_ymode_tree);
+  vp9_tokens_from_tree(vp9_kf_ymode_encodings, vp9_kf_ymode_tree);
+#if CONFIG_SUPERBLOCKS
+  vp9_tokens_from_tree(vp9_sb_kf_ymode_encodings, vp9_sb_ymode_tree);
+#endif
+  vp9_tokens_from_tree(vp9_uv_mode_encodings,  vp9_uv_mode_tree);
+  vp9_tokens_from_tree(vp9_i8x8_mode_encodings,  vp9_i8x8_mode_tree);
+  vp9_tokens_from_tree(vp9_mbsplit_encodings, vp9_mbsplit_tree);
+  vp9_tokens_from_tree(vp9_switchable_interp_encodings,
+                       vp9_switchable_interp_tree);
+
+  vp9_tokens_from_tree_offset(vp9_mv_ref_encoding_array,
+                              vp9_mv_ref_tree, NEARESTMV);
+#if CONFIG_SUPERBLOCKS
+  vp9_tokens_from_tree_offset(vp9_sb_mv_ref_encoding_array,
+                              vp9_sb_mv_ref_tree, NEARESTMV);
+#endif
+  vp9_tokens_from_tree_offset(vp9_sub_mv_ref_encoding_array,
+                              vp9_sub_mv_ref_tree, LEFT4X4);
+}
+
+void vp9_init_mode_contexts(VP9_COMMON *pc) {
+  vpx_memset(pc->fc.mv_ref_ct, 0, sizeof(pc->fc.mv_ref_ct));
+  vpx_memset(pc->fc.mv_ref_ct_a, 0, sizeof(pc->fc.mv_ref_ct_a));
+
+  vpx_memcpy(pc->fc.mode_context,
+             vp9_default_mode_contexts,
+             sizeof(pc->fc.mode_context));
+  vpx_memcpy(pc->fc.mode_context_a,
+             vp9_default_mode_contexts_a,
+             sizeof(pc->fc.mode_context_a));
+}
+
+void vp9_accum_mv_refs(VP9_COMMON *pc,
+                       MB_PREDICTION_MODE m,
+                       const int ct[4]) {
+  int (*mv_ref_ct)[4][2];
+
+  if (pc->refresh_alt_ref_frame)
+    mv_ref_ct = pc->fc.mv_ref_ct_a;
+  else
+    mv_ref_ct = pc->fc.mv_ref_ct;
+
+  if (m == ZEROMV) {
+    ++mv_ref_ct [ct[0]] [0] [0];
+  } else {
+    ++mv_ref_ct [ct[0]] [0] [1];
+    if (m == NEARESTMV) {
+      ++mv_ref_ct [ct[1]] [1] [0];
+    } else {
+      ++mv_ref_ct [ct[1]] [1] [1];
+      if (m == NEARMV) {
+        ++mv_ref_ct [ct[2]] [2] [0];
+      } else {
+        ++mv_ref_ct [ct[2]] [2] [1];
+        if (m == NEWMV) {
+          ++mv_ref_ct [ct[3]] [3] [0];
+        } else {
+          ++mv_ref_ct [ct[3]] [3] [1];
+        }
+      }
+    }
+  }
+}
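+
+/* Each branch of the mv_ref tree is counted under its own context ct[n];
+   for example, NEARMV increments the "not zero" bin at node 0, the "not
+   nearest" bin at node 1 and the "near" bin at node 2. */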
+
+#define MVREF_COUNT_SAT 20
+#define MVREF_MAX_UPDATE_FACTOR 144
+void vp9_update_mode_context(VP9_COMMON *pc) {
+  int i, j;
+  int (*mv_ref_ct)[4][2];
+  int (*mode_context)[4];
+
+  if (pc->refresh_alt_ref_frame) {
+    mv_ref_ct = pc->fc.mv_ref_ct_a;
+    mode_context = pc->fc.mode_context_a;
+  } else {
+    mv_ref_ct = pc->fc.mv_ref_ct;
+    mode_context = pc->fc.mode_context;
+  }
+
+  for (j = 0; j < 6; j++) {
+    for (i = 0; i < 4; i++) {
+      int count = mv_ref_ct[j][i][0] + mv_ref_ct[j][i][1];
+      int this_prob = count > 0 ? 256 * mv_ref_ct[j][i][0] / count : 128;
+      int factor;
+
+      count = count > MVREF_COUNT_SAT ? MVREF_COUNT_SAT : count;
+      factor = (MVREF_MAX_UPDATE_FACTOR * count / MVREF_COUNT_SAT);
+      this_prob = (pc->fc.vp8_mode_contexts[j][i] * (256 - factor) +
+                   this_prob * factor + 128) >> 8;
+      this_prob = this_prob ? (this_prob < 255 ? this_prob : 255) : 1;
+      mode_context[j][i] = this_prob;
+    }
+  }
+}
+
+#ifdef MODE_STATS
+#include "vp9/common/modecont.h"
+void print_mode_contexts(VP9_COMMON *pc) {
+  int j, i;
+  printf("\n====================\n");
+  for (j = 0; j < 6; j++) {
+    for (i = 0; i < 4; i++) {
+      printf("%4d ", pc->fc.mode_context[j][i]);
+    }
+    printf("\n");
+  }
+  printf("====================\n");
+  for (j = 0; j < 6; j++) {
+    for (i = 0; i < 4; i++) {
+      printf("%4d ", pc->fc.mode_context_a[j][i]);
+    }
+    printf("\n");
+  }
+}
+#endif
+
+// #define MODE_COUNT_TESTING
+#define MODE_COUNT_SAT 20
+#define MODE_MAX_UPDATE_FACTOR 144
+void vp9_adapt_mode_probs(VP9_COMMON *cm) {
+  int i, t, count, factor;
+  unsigned int branch_ct[32][2];
+  vp9_prob ymode_probs[VP9_YMODES - 1];
+  vp9_prob uvmode_probs[VP9_UV_MODES - 1];
+  vp9_prob bmode_probs[VP9_BINTRAMODES - 1];
+  vp9_prob i8x8_mode_probs[VP9_I8X8_MODES - 1];
+  vp9_prob sub_mv_ref_probs[VP9_SUBMVREFS - 1];
+  vp9_prob mbsplit_probs[VP9_NUMMBSPLITS - 1];
+#ifdef MODE_COUNT_TESTING
+  printf("static const unsigned int\nymode_counts"
+         "[VP9_YMODES] = {\n");
+  for (t = 0; t < VP9_YMODES; ++t) printf("%d, ", cm->fc.ymode_counts[t]);
+  printf("};\n");
+  printf("static const unsigned int\nuv_mode_counts"
+         "[VP9_YMODES] [VP9_UV_MODES] = {\n");
+  for (i = 0; i < VP9_YMODES; ++i) {
+    printf("  {");
+    for (t = 0; t < VP9_UV_MODES; ++t) printf("%d, ", cm->fc.uv_mode_counts[i][t]);
+    printf("},\n");
+  }
+  printf("};\n");
+  printf("static const unsigned int\nbmode_counts"
+         "[VP9_BINTRAMODES] = {\n");
+  for (t = 0; t < VP9_BINTRAMODES; ++t) printf("%d, ", cm->fc.bmode_counts[t]);
+  printf("};\n");
+  printf("static const unsigned int\ni8x8_mode_counts"
+         "[VP9_I8X8_MODES] = {\n");
+  for (t = 0; t < VP9_I8X8_MODES; ++t) printf("%d, ", cm->fc.i8x8_mode_counts[t]);
+  printf("};\n");
+  printf("static const unsigned int\nsub_mv_ref_counts"
+         "[SUBMVREF_COUNT] [VP9_SUBMVREFS] = {\n");
+  for (i = 0; i < SUBMVREF_COUNT; ++i) {
+    printf("  {");
+    for (t = 0; t < VP9_SUBMVREFS; ++t) printf("%d, ", cm->fc.sub_mv_ref_counts[i][t]);
+    printf("},\n");
+  }
+  printf("};\n");
+  printf("static const unsigned int\nmbsplit_counts"
+         "[VP9_NUMMBSPLITS] = {\n");
+  for (t = 0; t < VP9_NUMMBSPLITS; ++t) printf("%d, ", cm->fc.mbsplit_counts[t]);
+  printf("};\n");
+#endif
+  vp9_tree_probs_from_distribution(
+    VP9_YMODES, vp9_ymode_encodings, vp9_ymode_tree,
+    ymode_probs, branch_ct, cm->fc.ymode_counts,
+    256, 1);
+  for (t = 0; t < VP9_YMODES - 1; ++t) {
+    int prob;
+    count = branch_ct[t][0] + branch_ct[t][1];
+    count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
+    factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
+    prob = ((int)cm->fc.pre_ymode_prob[t] * (256 - factor) +
+            (int)ymode_probs[t] * factor + 128) >> 8;
+    if (prob <= 0) cm->fc.ymode_prob[t] = 1;
+    else if (prob > 255) cm->fc.ymode_prob[t] = 255;
+    else cm->fc.ymode_prob[t] = prob;
+  }
+  for (i = 0; i < VP9_YMODES; ++i) {
+    vp9_tree_probs_from_distribution(VP9_UV_MODES, vp9_uv_mode_encodings,
+                                     vp9_uv_mode_tree, uvmode_probs, branch_ct,
+                                     cm->fc.uv_mode_counts[i], 256, 1);
+    for (t = 0; t < VP9_UV_MODES - 1; ++t) {
+      int prob;
+      count = branch_ct[t][0] + branch_ct[t][1];
+      count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
+      factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
+      prob = ((int)cm->fc.pre_uv_mode_prob[i][t] * (256 - factor) +
+              (int)uvmode_probs[t] * factor + 128) >> 8;
+      if (prob <= 0) cm->fc.uv_mode_prob[i][t] = 1;
+      else if (prob > 255) cm->fc.uv_mode_prob[i][t] = 255;
+      else cm->fc.uv_mode_prob[i][t] = prob;
+    }
+  }
+  vp9_tree_probs_from_distribution(VP9_BINTRAMODES, vp9_bmode_encodings,
+                                   vp9_bmode_tree, bmode_probs, branch_ct,
+                                   cm->fc.bmode_counts, 256, 1);
+  for (t = 0; t < VP9_BINTRAMODES - 1; ++t) {
+    int prob;
+    count = branch_ct[t][0] + branch_ct[t][1];
+    count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
+    factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
+    prob = ((int)cm->fc.pre_bmode_prob[t] * (256 - factor) +
+            (int)bmode_probs[t] * factor + 128) >> 8;
+    if (prob <= 0) cm->fc.bmode_prob[t] = 1;
+    else if (prob > 255) cm->fc.bmode_prob[t] = 255;
+    else cm->fc.bmode_prob[t] = prob;
+  }
+  vp9_tree_probs_from_distribution(VP9_I8X8_MODES, vp9_i8x8_mode_encodings,
+                                   vp9_i8x8_mode_tree, i8x8_mode_probs,
+                                   branch_ct, cm->fc.i8x8_mode_counts, 256, 1);
+  for (t = 0; t < VP9_I8X8_MODES - 1; ++t) {
+    int prob;
+    count = branch_ct[t][0] + branch_ct[t][1];
+    count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
+    factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
+    prob = ((int)cm->fc.pre_i8x8_mode_prob[t] * (256 - factor) +
+            (int)i8x8_mode_probs[t] * factor + 128) >> 8;
+    if (prob <= 0) cm->fc.i8x8_mode_prob[t] = 1;
+    else if (prob > 255) cm->fc.i8x8_mode_prob[t] = 255;
+    else cm->fc.i8x8_mode_prob[t] = prob;
+  }
+  for (i = 0; i < SUBMVREF_COUNT; ++i) {
+    vp9_tree_probs_from_distribution(VP9_SUBMVREFS,
+                                     vp9_sub_mv_ref_encoding_array,
+                                     vp9_sub_mv_ref_tree, sub_mv_ref_probs,
+                                     branch_ct, cm->fc.sub_mv_ref_counts[i],
+                                     256, 1);
+    for (t = 0; t < VP9_SUBMVREFS - 1; ++t) {
+      int prob;
+      count = branch_ct[t][0] + branch_ct[t][1];
+      count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
+      factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
+      prob = ((int)cm->fc.pre_sub_mv_ref_prob[i][t] * (256 - factor) +
+              (int)sub_mv_ref_probs[t] * factor + 128) >> 8;
+      if (prob <= 0) cm->fc.sub_mv_ref_prob[i][t] = 1;
+      else if (prob > 255) cm->fc.sub_mv_ref_prob[i][t] = 255;
+      else cm->fc.sub_mv_ref_prob[i][t] = prob;
+    }
+  }
+  vp9_tree_probs_from_distribution(VP9_NUMMBSPLITS, vp9_mbsplit_encodings,
+                                   vp9_mbsplit_tree, mbsplit_probs, branch_ct,
+                                   cm->fc.mbsplit_counts, 256, 1);
+  for (t = 0; t < VP9_NUMMBSPLITS - 1; ++t) {
+    int prob;
+    count = branch_ct[t][0] + branch_ct[t][1];
+    count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
+    factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
+    prob = ((int)cm->fc.pre_mbsplit_prob[t] * (256 - factor) +
+            (int)mbsplit_probs[t] * factor + 128) >> 8;
+    if (prob <= 0) cm->fc.mbsplit_prob[t] = 1;
+    else if (prob > 255) cm->fc.mbsplit_prob[t] = 255;
+    else cm->fc.mbsplit_prob[t] = prob;
+  }
+}
--- /dev/null
+++ b/vp9/common/entropymode.h
@@ -1,0 +1,102 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ENTROPYMODE_H
+#define __INC_ENTROPYMODE_H
+
+#include "blockd.h"
+#include "treecoder.h"
+
+#define SUBMVREF_COUNT 5
+#define VP9_NUMMBSPLITS 4
+
+typedef const int vp9_mbsplit[16];
+
+extern vp9_mbsplit vp9_mbsplits[VP9_NUMMBSPLITS];
+
+extern const int vp9_mbsplit_count[VP9_NUMMBSPLITS];    /* # of subsets */
+
+extern const vp9_prob vp9_mbsplit_probs[VP9_NUMMBSPLITS - 1];
+
+extern int vp9_mv_cont(const int_mv *l, const int_mv *a);
+
+extern const vp9_prob vp9_sub_mv_ref_prob[VP9_SUBMVREFS - 1];
+
+extern const vp9_prob vp9_sub_mv_ref_prob2[SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
+
+extern const unsigned int vp9_kf_default_bmode_counts[VP9_BINTRAMODES]
+                                                     [VP9_BINTRAMODES]
+                                                     [VP9_BINTRAMODES];
+
+extern const vp9_tree_index vp9_bmode_tree[];
+
+extern const vp9_tree_index  vp9_ymode_tree[];
+extern const vp9_tree_index  vp9_kf_ymode_tree[];
+extern const vp9_tree_index  vp9_uv_mode_tree[];
+#define vp9_sb_ymode_tree vp9_uv_mode_tree
+extern const vp9_tree_index  vp9_i8x8_mode_tree[];
+extern const vp9_tree_index  vp9_mbsplit_tree[];
+extern const vp9_tree_index  vp9_mv_ref_tree[];
+extern const vp9_tree_index  vp9_sb_mv_ref_tree[];
+extern const vp9_tree_index  vp9_sub_mv_ref_tree[];
+
+extern struct vp9_token_struct vp9_bmode_encodings[VP9_BINTRAMODES];
+extern struct vp9_token_struct vp9_ymode_encodings[VP9_YMODES];
+extern struct vp9_token_struct vp9_sb_kf_ymode_encodings[VP9_I32X32_MODES];
+extern struct vp9_token_struct vp9_kf_ymode_encodings[VP9_YMODES];
+extern struct vp9_token_struct vp9_i8x8_mode_encodings[VP9_I8X8_MODES];
+extern struct vp9_token_struct vp9_uv_mode_encodings[VP9_UV_MODES];
+extern struct vp9_token_struct vp9_mbsplit_encodings[VP9_NUMMBSPLITS];
+
+/* Inter mode values do not start at zero */
+
+extern struct vp9_token_struct vp9_mv_ref_encoding_array[VP9_MVREFS];
+extern struct vp9_token_struct vp9_sb_mv_ref_encoding_array[VP9_MVREFS];
+extern struct vp9_token_struct vp9_sub_mv_ref_encoding_array[VP9_SUBMVREFS];
+
+void vp9_entropy_mode_init(void);
+
+struct VP9Common;
+
+void vp9_init_mbmode_probs(struct VP9Common *x);
+
+extern void vp9_init_mode_contexts(struct VP9Common *pc);
+
+extern void vp9_update_mode_context(struct VP9Common *pc);
+
+extern void vp9_accum_mv_refs(struct VP9Common *pc,
+                              MB_PREDICTION_MODE m,
+                              const int ct[4]);
+
+void vp9_default_bmode_probs(vp9_prob dest[VP9_BINTRAMODES - 1]);
+
+void vp9_kf_default_bmode_probs(vp9_prob dest[VP9_BINTRAMODES][VP9_BINTRAMODES]
+                                             [VP9_BINTRAMODES - 1]);
+
+void vp9_adapt_mode_probs(struct VP9Common *);
+
+#define VP9_SWITCHABLE_FILTERS 2 /* number of switchable filters */
+
+extern const  INTERPOLATIONFILTERTYPE vp9_switchable_interp
+                  [VP9_SWITCHABLE_FILTERS];
+
+extern const  int vp9_switchable_interp_map[SWITCHABLE + 1];
+
+extern const  vp9_tree_index vp9_switchable_interp_tree
+                  [2 * (VP9_SWITCHABLE_FILTERS - 1)];
+
+extern struct vp9_token_struct vp9_switchable_interp_encodings
+                  [VP9_SWITCHABLE_FILTERS];
+
+extern const  vp9_prob vp9_switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
+                                                 [VP9_SWITCHABLE_FILTERS - 1];
+
+#endif
--- /dev/null
+++ b/vp9/common/entropymv.c
@@ -1,0 +1,465 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "onyxc_int.h"
+#include "entropymv.h"
+
+//#define MV_COUNT_TESTING
+
+#define MV_COUNT_SAT 16
+#define MV_MAX_UPDATE_FACTOR 160
+
+#if CONFIG_NEW_MVREF
+/* Integer pel reference mv threshold for use of high-precision 1/8 mv */
+#define COMPANDED_MVREF_THRESH    1000000
+#else
+/* Integer pel reference mv threshold for use of high-precision 1/8 mv */
+#define COMPANDED_MVREF_THRESH    8
+#endif
+
+/* Smooth or bias the mv-counts before prob computation */
+/* #define SMOOTH_MV_COUNTS */
+
+const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2] = {
+  -MV_JOINT_ZERO, 2,
+  -MV_JOINT_HNZVZ, 4,
+  -MV_JOINT_HZVNZ, -MV_JOINT_HNZVNZ
+};
+struct vp9_token_struct vp9_mv_joint_encodings[MV_JOINTS];
+
+const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2] = {
+  -MV_CLASS_0, 2,
+  -MV_CLASS_1, 4,
+  6, 8,
+  -MV_CLASS_2, -MV_CLASS_3,
+  10, 12,
+  -MV_CLASS_4, -MV_CLASS_5,
+  -MV_CLASS_6, -MV_CLASS_7,
+};
+struct vp9_token_struct vp9_mv_class_encodings[MV_CLASSES];
+
+const vp9_tree_index vp9_mv_class0_tree [2 * CLASS0_SIZE - 2] = {
+  -0, -1,
+};
+struct vp9_token_struct vp9_mv_class0_encodings[CLASS0_SIZE];
+
+const vp9_tree_index vp9_mv_fp_tree [2 * 4 - 2] = {
+  -0, 2,
+  -1, 4,
+  -2, -3
+};
+struct vp9_token_struct vp9_mv_fp_encodings[4];
+
+const nmv_context vp9_default_nmv_context = {
+  {32, 64, 96},
+  {
+    { /* vert component */
+      128,                                             /* sign */
+      {224, 144, 192, 168, 192, 176, 192},             /* class */
+      {216},                                           /* class0 */
+      {136, 140, 148, 160, 176, 192, 224},             /* bits */
+      {{128, 128, 64}, {96, 112, 64}},                 /* class0_fp */
+      {64, 96, 64},                                    /* fp */
+      160,                                             /* class0_hp bit */
+      128,                                             /* hp */
+    },
+    { /* hor component */
+      128,                                             /* sign */
+      {216, 128, 176, 160, 176, 176, 192},             /* class */
+      {208},                                           /* class0 */
+      {136, 140, 148, 160, 176, 192, 224},             /* bits */
+      {{128, 128, 64}, {96, 112, 64}},                 /* class0_fp */
+      {64, 96, 64},                                    /* fp */
+      160,                                             /* class0_hp bit */
+      128,                                             /* hp */
+    }
+  },
+};
+
+MV_JOINT_TYPE vp9_get_mv_joint(MV mv) {
+  if (mv.row == 0 && mv.col == 0) return MV_JOINT_ZERO;
+  else if (mv.row == 0 && mv.col != 0) return MV_JOINT_HNZVZ;
+  else if (mv.row != 0 && mv.col == 0) return MV_JOINT_HZVNZ;
+  else return MV_JOINT_HNZVNZ;
+}
+
+#define mv_class_base(c) ((c) ? (CLASS0_SIZE << ((c) + 2)) : 0)
+
+MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) {
+  MV_CLASS_TYPE c;
+  if      (z < CLASS0_SIZE * 8)    c = MV_CLASS_0;
+  else if (z < CLASS0_SIZE * 16)   c = MV_CLASS_1;
+  else if (z < CLASS0_SIZE * 32)   c = MV_CLASS_2;
+  else if (z < CLASS0_SIZE * 64)   c = MV_CLASS_3;
+  else if (z < CLASS0_SIZE * 128)  c = MV_CLASS_4;
+  else if (z < CLASS0_SIZE * 256)  c = MV_CLASS_5;
+  else if (z < CLASS0_SIZE * 512)  c = MV_CLASS_6;
+  else if (z < CLASS0_SIZE * 1024) c = MV_CLASS_7;
+  else assert(0);
+  if (offset)
+    *offset = z - mv_class_base(c);
+  return c;
+}
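+
+/* Example: z = 100 is below CLASS0_SIZE * 64 == 128 but none of the
+   earlier bounds, so it falls in MV_CLASS_3; mv_class_base(MV_CLASS_3)
+   is 64, so the returned offset is 36. */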
+
+int vp9_use_nmv_hp(const MV *ref) {
+  return (abs(ref->row) >> 3) < COMPANDED_MVREF_THRESH &&
+         (abs(ref->col) >> 3) < COMPANDED_MVREF_THRESH;
+}
+
+int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset) {
+  return mv_class_base(c) + offset;
+}
+
+static void increment_nmv_component_count(int v,
+                                          nmv_component_counts *mvcomp,
+                                          int incr,
+                                          int usehp) {
+  assert(v != 0);  /* should not be zero */
+  mvcomp->mvcount[MV_MAX + v] += incr;
+}
+
+static void increment_nmv_component(int v,
+                                    nmv_component_counts *mvcomp,
+                                    int incr,
+                                    int usehp) {
+  int s, z, c, o, d, e, f;
+  assert(v != 0);  /* should not be zero */
+  s = v < 0;
+  mvcomp->sign[s] += incr;
+  z = (s ? -v : v) - 1;       /* magnitude - 1 */
+
+  c = vp9_get_mv_class(z, &o);
+  mvcomp->classes[c] += incr;
+
+  d = (o >> 3);               /* int mv data */
+  f = (o >> 1) & 3;           /* fractional pel mv data */
+  e = (o & 1);                /* high precision mv data */
+  if (c == MV_CLASS_0) {
+    mvcomp->class0[d] += incr;
+  } else {
+    int i, b;
+    b = c + CLASS0_BITS - 1;  /* number of bits */
+    for (i = 0; i < b; ++i)
+      mvcomp->bits[i][((d >> i) & 1)] += incr;
+  }
+
+  /* Code the fractional pel bits */
+  if (c == MV_CLASS_0) {
+    mvcomp->class0_fp[d][f] += incr;
+  } else {
+    mvcomp->fp[f] += incr;
+  }
+
+  /* Code the high precision bit */
+  if (usehp) {
+    if (c == MV_CLASS_0) {
+      mvcomp->class0_hp[e] += incr;
+    } else {
+      mvcomp->hp[e] += incr;
+    }
+  }
+}
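+
+/* The class offset o is split into an integer-pel part d (o >> 3), a
+   fractional quarter-pel part f ((o >> 1) & 3) and a high-precision
+   eighth-pel bit e (o & 1); for example, o == 36 gives d = 4, f = 2,
+   e = 0. */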
+
+#ifdef SMOOTH_MV_COUNTS
+static void smooth_counts(nmv_component_counts *mvcomp) {
+  static const int flen = 3;  // (filter_length + 1) / 2
+  static const int fval[] = {8, 3, 1};
+  static const int fvalbits = 4;
+  int i;
+  unsigned int smvcount[MV_VALS];
+  vpx_memcpy(smvcount, mvcomp->mvcount, sizeof(smvcount));
+  smvcount[MV_MAX] = (smvcount[MV_MAX - 1] + smvcount[MV_MAX + 1]) >> 1;
+  for (i = flen - 1; i <= MV_VALS - flen; ++i) {
+    int j, s = smvcount[i] * fval[0];
+    for (j = 1; j < flen; ++j)
+      s += (smvcount[i - j] + smvcount[i + j]) * fval[j];
+    mvcomp->mvcount[i] = (s + (1 << (fvalbits - 1))) >> fvalbits;
+  }
+}
+#endif
+
+static void counts_to_context(nmv_component_counts *mvcomp, int usehp) {
+  int v;
+  /* Zero every counter except the raw mvcount histogram; this relies on
+     the struct layout keeping all other counters contiguous from sign. */
+  vpx_memset(mvcomp->sign, 0,
+             sizeof(nmv_component_counts) - sizeof(mvcomp->mvcount));
+  for (v = 1; v <= MV_MAX; v++) {
+    increment_nmv_component(-v, mvcomp, mvcomp->mvcount[MV_MAX - v], usehp);
+    increment_nmv_component( v, mvcomp, mvcomp->mvcount[MV_MAX + v], usehp);
+  }
+}
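+
+/* During encoding only the raw magnitude histogram mvcount is bumped
+   (see increment_nmv_component_count() above); counts_to_context() then
+   expands that histogram into the structured sign/class/bit counts in a
+   single pass before the probabilities are adapted. */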
+
+void vp9_increment_nmv(const MV *mv, const MV *ref, nmv_context_counts *mvctx,
+                       int usehp) {
+  MV_JOINT_TYPE j = vp9_get_mv_joint(*mv);
+  mvctx->joints[j]++;
+  usehp = usehp && vp9_use_nmv_hp(ref);
+  if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
+    increment_nmv_component_count(mv->row, &mvctx->comps[0], 1, usehp);
+  }
+  if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {
+    increment_nmv_component_count(mv->col, &mvctx->comps[1], 1, usehp);
+  }
+}
+
+static void adapt_prob(vp9_prob *dest, vp9_prob prep, vp9_prob newp,
+                       unsigned int ct[2]) {
+  int factor;
+  int prob;
+  int count = ct[0] + ct[1];
+  if (count) {
+    count = count > MV_COUNT_SAT ? MV_COUNT_SAT : count;
+    factor = (MV_MAX_UPDATE_FACTOR * count / MV_COUNT_SAT);
+    prob = ((int)prep * (256 - factor) + (int)(newp) * factor + 128) >> 8;
+    prob += !prob;
+    prob = (prob > 255 ? 255 : prob);
+    *dest = prob;
+  }
+}
+
+void vp9_counts_to_nmv_context(
+    nmv_context_counts *NMVcount,
+    nmv_context *prob,
+    int usehp,
+    unsigned int (*branch_ct_joint)[2],
+    unsigned int (*branch_ct_sign)[2],
+    unsigned int (*branch_ct_classes)[MV_CLASSES - 1][2],
+    unsigned int (*branch_ct_class0)[CLASS0_SIZE - 1][2],
+    unsigned int (*branch_ct_bits)[MV_OFFSET_BITS][2],
+    unsigned int (*branch_ct_class0_fp)[CLASS0_SIZE][4 - 1][2],
+    unsigned int (*branch_ct_fp)[4 - 1][2],
+    unsigned int (*branch_ct_class0_hp)[2],
+    unsigned int (*branch_ct_hp)[2]) {
+  int i, j, k;
+  counts_to_context(&NMVcount->comps[0], usehp);
+  counts_to_context(&NMVcount->comps[1], usehp);
+  vp9_tree_probs_from_distribution(MV_JOINTS,
+                                   vp9_mv_joint_encodings,
+                                   vp9_mv_joint_tree,
+                                   prob->joints,
+                                   branch_ct_joint,
+                                   NMVcount->joints,
+                                   256, 1);
+  for (i = 0; i < 2; ++i) {
+    prob->comps[i].sign =
+        vp9_bin_prob_from_distribution(NMVcount->comps[i].sign);
+    branch_ct_sign[i][0] = NMVcount->comps[i].sign[0];
+    branch_ct_sign[i][1] = NMVcount->comps[i].sign[1];
+    vp9_tree_probs_from_distribution(MV_CLASSES,
+                                     vp9_mv_class_encodings,
+                                     vp9_mv_class_tree,
+                                     prob->comps[i].classes,
+                                     branch_ct_classes[i],
+                                     NMVcount->comps[i].classes,
+                                     256, 1);
+    vp9_tree_probs_from_distribution(CLASS0_SIZE,
+                                     vp9_mv_class0_encodings,
+                                     vp9_mv_class0_tree,
+                                     prob->comps[i].class0,
+                                     branch_ct_class0[i],
+                                     NMVcount->comps[i].class0,
+                                     256, 1);
+    for (j = 0; j < MV_OFFSET_BITS; ++j) {
+      prob->comps[i].bits[j] = vp9_bin_prob_from_distribution(
+          NMVcount->comps[i].bits[j]);
+      branch_ct_bits[i][j][0] = NMVcount->comps[i].bits[j][0];
+      branch_ct_bits[i][j][1] = NMVcount->comps[i].bits[j][1];
+    }
+  }
+  for (i = 0; i < 2; ++i) {
+    for (k = 0; k < CLASS0_SIZE; ++k) {
+      vp9_tree_probs_from_distribution(4,
+                                       vp9_mv_fp_encodings,
+                                       vp9_mv_fp_tree,
+                                       prob->comps[i].class0_fp[k],
+                                       branch_ct_class0_fp[i][k],
+                                       NMVcount->comps[i].class0_fp[k],
+                                       256, 1);
+    }
+    vp9_tree_probs_from_distribution(4,
+                                     vp9_mv_fp_encodings,
+                                     vp9_mv_fp_tree,
+                                     prob->comps[i].fp,
+                                     branch_ct_fp[i],
+                                     NMVcount->comps[i].fp,
+                                     256, 1);
+  }
+  if (usehp) {
+    for (i = 0; i < 2; ++i) {
+      prob->comps[i].class0_hp = vp9_bin_prob_from_distribution(
+          NMVcount->comps[i].class0_hp);
+      branch_ct_class0_hp[i][0] = NMVcount->comps[i].class0_hp[0];
+      branch_ct_class0_hp[i][1] = NMVcount->comps[i].class0_hp[1];
+
+      prob->comps[i].hp =
+          vp9_bin_prob_from_distribution(NMVcount->comps[i].hp);
+      branch_ct_hp[i][0] = NMVcount->comps[i].hp[0];
+      branch_ct_hp[i][1] = NMVcount->comps[i].hp[1];
+    }
+  }
+}
+
+void vp9_adapt_nmv_probs(VP9_COMMON *cm, int usehp) {
+  int i, j, k;
+  nmv_context prob;
+  unsigned int branch_ct_joint[MV_JOINTS - 1][2];
+  unsigned int branch_ct_sign[2][2];
+  unsigned int branch_ct_classes[2][MV_CLASSES - 1][2];
+  unsigned int branch_ct_class0[2][CLASS0_SIZE - 1][2];
+  unsigned int branch_ct_bits[2][MV_OFFSET_BITS][2];
+  unsigned int branch_ct_class0_fp[2][CLASS0_SIZE][4 - 1][2];
+  unsigned int branch_ct_fp[2][4 - 1][2];
+  unsigned int branch_ct_class0_hp[2][2];
+  unsigned int branch_ct_hp[2][2];
+#ifdef MV_COUNT_TESTING
+  printf("joints count: ");
+  for (j = 0; j < MV_JOINTS; ++j) printf("%d ", cm->fc.NMVcount.joints[j]);
+  printf("\n"); fflush(stdout);
+  printf("signs count:\n");
+  for (i = 0; i < 2; ++i)
+    printf("%d/%d ", cm->fc.NMVcount.comps[i].sign[0], cm->fc.NMVcount.comps[i].sign[1]);
+  printf("\n"); fflush(stdout);
+  printf("classes count:\n");
+  for (i = 0; i < 2; ++i) {
+    for (j = 0; j < MV_CLASSES; ++j)
+      printf("%d ", cm->fc.NMVcount.comps[i].classes[j]);
+    printf("\n"); fflush(stdout);
+  }
+  printf("class0 count:\n");
+  for (i = 0; i < 2; ++i) {
+    for (j = 0; j < CLASS0_SIZE; ++j)
+      printf("%d ", cm->fc.NMVcount.comps[i].class0[j]);
+    printf("\n"); fflush(stdout);
+  }
+  printf("bits count:\n");
+  for (i = 0; i < 2; ++i) {
+    for (j = 0; j < MV_OFFSET_BITS; ++j)
+      printf("%d/%d ", cm->fc.NMVcount.comps[i].bits[j][0],
+                       cm->fc.NMVcount.comps[i].bits[j][1]);
+    printf("\n"); fflush(stdout);
+  }
+  printf("class0_fp count:\n");
+  for (i = 0; i < 2; ++i) {
+    for (j = 0; j < CLASS0_SIZE; ++j) {
+      printf("{");
+      for (k = 0; k < 4; ++k)
+        printf("%d ", cm->fc.NMVcount.comps[i].class0_fp[j][k]);
+      printf("}, ");
+    }
+    printf("\n"); fflush(stdout);
+  }
+  printf("fp count:\n");
+  for (i = 0; i < 2; ++i) {
+    for (j = 0; j < 4; ++j)
+      printf("%d ", cm->fc.NMVcount.comps[i].fp[j]);
+    printf("\n"); fflush(stdout);
+  }
+  if (usehp) {
+    printf("class0_hp count:\n");
+    for (i = 0; i < 2; ++i)
+      printf("%d/%d ", cm->fc.NMVcount.comps[i].class0_hp[0],
+                       cm->fc.NMVcount.comps[i].class0_hp[1]);
+    printf("\n"); fflush(stdout);
+    printf("hp count:\n");
+    for (i = 0; i < 2; ++i)
+      printf("%d/%d ", cm->fc.NMVcount.comps[i].hp[0],
+                       cm->fc.NMVcount.comps[i].hp[1]);
+    printf("\n"); fflush(stdout);
+  }
+#endif
+#ifdef SMOOTH_MV_COUNTS
+  smooth_counts(&cm->fc.NMVcount.comps[0]);
+  smooth_counts(&cm->fc.NMVcount.comps[1]);
+#endif
+  vp9_counts_to_nmv_context(&cm->fc.NMVcount,
+                            &prob,
+                            usehp,
+                            branch_ct_joint,
+                            branch_ct_sign,
+                            branch_ct_classes,
+                            branch_ct_class0,
+                            branch_ct_bits,
+                            branch_ct_class0_fp,
+                            branch_ct_fp,
+                            branch_ct_class0_hp,
+                            branch_ct_hp);
+
+  for (j = 0; j < MV_JOINTS - 1; ++j) {
+    adapt_prob(&cm->fc.nmvc.joints[j],
+               cm->fc.pre_nmvc.joints[j],
+               prob.joints[j],
+               branch_ct_joint[j]);
+  }
+  for (i = 0; i < 2; ++i) {
+    adapt_prob(&cm->fc.nmvc.comps[i].sign,
+               cm->fc.pre_nmvc.comps[i].sign,
+               prob.comps[i].sign,
+               branch_ct_sign[i]);
+    for (j = 0; j < MV_CLASSES - 1; ++j) {
+      adapt_prob(&cm->fc.nmvc.comps[i].classes[j],
+                 cm->fc.pre_nmvc.comps[i].classes[j],
+                 prob.comps[i].classes[j],
+                 branch_ct_classes[i][j]);
+    }
+    for (j = 0; j < CLASS0_SIZE - 1; ++j) {
+      adapt_prob(&cm->fc.nmvc.comps[i].class0[j],
+                 cm->fc.pre_nmvc.comps[i].class0[j],
+                 prob.comps[i].class0[j],
+                 branch_ct_class0[i][j]);
+    }
+    for (j = 0; j < MV_OFFSET_BITS; ++j) {
+      adapt_prob(&cm->fc.nmvc.comps[i].bits[j],
+                 cm->fc.pre_nmvc.comps[i].bits[j],
+                 prob.comps[i].bits[j],
+                 branch_ct_bits[i][j]);
+    }
+  }
+  for (i = 0; i < 2; ++i) {
+    for (j = 0; j < CLASS0_SIZE; ++j) {
+      for (k = 0; k < 3; ++k) {
+        adapt_prob(&cm->fc.nmvc.comps[i].class0_fp[j][k],
+                   cm->fc.pre_nmvc.comps[i].class0_fp[j][k],
+                   prob.comps[i].class0_fp[j][k],
+                   branch_ct_class0_fp[i][j][k]);
+      }
+    }
+    for (j = 0; j < 3; ++j) {
+      adapt_prob(&cm->fc.nmvc.comps[i].fp[j],
+                 cm->fc.pre_nmvc.comps[i].fp[j],
+                 prob.comps[i].fp[j],
+                 branch_ct_fp[i][j]);
+    }
+  }
+  if (usehp) {
+    for (i = 0; i < 2; ++i) {
+      adapt_prob(&cm->fc.nmvc.comps[i].class0_hp,
+                 cm->fc.pre_nmvc.comps[i].class0_hp,
+                 prob.comps[i].class0_hp,
+                 branch_ct_class0_hp[i]);
+      adapt_prob(&cm->fc.nmvc.comps[i].hp,
+                 cm->fc.pre_nmvc.comps[i].hp,
+                 prob.comps[i].hp,
+                 branch_ct_hp[i]);
+    }
+  }
+}
+
+void vp9_entropy_mv_init(void) {
+  vp9_tokens_from_tree(vp9_mv_joint_encodings, vp9_mv_joint_tree);
+  vp9_tokens_from_tree(vp9_mv_class_encodings, vp9_mv_class_tree);
+  vp9_tokens_from_tree(vp9_mv_class0_encodings, vp9_mv_class0_tree);
+  vp9_tokens_from_tree(vp9_mv_fp_encodings, vp9_mv_fp_tree);
+}
+
+void vp9_init_mv_probs(VP9_COMMON *cm) {
+  vpx_memcpy(&cm->fc.nmvc, &vp9_default_nmv_context, sizeof(nmv_context));
+}
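
Note: the count-weighted update that adapt_prob() applies above can be sketched as follows. This is an illustration only: the weighting cap and rounding are assumptions, and the real routine derives its new estimate from the branch counts produced by vp9_counts_to_nmv_context().

typedef unsigned char vp9_prob;

/* Illustrative sketch (assumed constants; not this patch's adapt_prob()):
 * blend a prior probability with a new estimate derived from branch
 * counts, trusting the estimate more when more samples back it. */
static vp9_prob blend_prob(vp9_prob pre, vp9_prob est,
                           unsigned int ct0, unsigned int ct1) {
  const unsigned int n = ct0 + ct1;
  const unsigned int w = n > 256 ? 256 : n;  /* weight on the new estimate */
  unsigned int p = (pre * (256 - w) + est * w + 128) >> 8;
  if (p < 1) p = 1;     /* keep within the coder's valid (0, 256) range */
  if (p > 255) p = 255;
  return (vp9_prob)p;
}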
--- /dev/null
+++ b/vp9/common/entropymv.h
@@ -1,0 +1,129 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ENTROPYMV_H
+#define __INC_ENTROPYMV_H
+
+#include "treecoder.h"
+#include "vpx_config.h"
+#include "blockd.h"
+
+struct VP9Common;
+
+void vp9_entropy_mv_init(void);
+void vp9_init_mv_probs(struct VP9Common *cm);
+
+void vp9_adapt_nmv_probs(struct VP9Common *cm, int usehp);
+int vp9_use_nmv_hp(const MV *ref);
+
+#define VP9_NMV_UPDATE_PROB  255
+//#define MV_GROUP_UPDATE
+
+#define LOW_PRECISION_MV_UPDATE  /* Use 7 bit forward update */
+
+/* Symbols for coding which components are zero jointly */
+#define MV_JOINTS     4
+typedef enum {
+  MV_JOINT_ZERO = 0,             /* Zero vector */
+  MV_JOINT_HNZVZ = 1,            /* Vert zero, hor nonzero */
+  MV_JOINT_HZVNZ = 2,            /* Hor zero, vert nonzero */
+  MV_JOINT_HNZVNZ = 3,           /* Both components nonzero */
+} MV_JOINT_TYPE;
+
+extern const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2];
+extern struct vp9_token_struct vp9_mv_joint_encodings[MV_JOINTS];
+
+/* Symbols for coding magnitude class of nonzero components */
+#define MV_CLASSES     8
+typedef enum {
+  MV_CLASS_0 = 0,      /* (0, 2]     integer pel */
+  MV_CLASS_1 = 1,      /* (2, 4]     integer pel */
+  MV_CLASS_2 = 2,      /* (4, 8]     integer pel */
+  MV_CLASS_3 = 3,      /* (8, 16]    integer pel */
+  MV_CLASS_4 = 4,      /* (16, 32]   integer pel */
+  MV_CLASS_5 = 5,      /* (32, 64]   integer pel */
+  MV_CLASS_6 = 6,      /* (64, 128]  integer pel */
+  MV_CLASS_7 = 7,      /* (128, 256] integer pel */
+} MV_CLASS_TYPE;
+
+extern const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2];
+extern struct vp9_token_struct vp9_mv_class_encodings[MV_CLASSES];
+
+#define CLASS0_BITS    1  /* bits at integer precision for class 0 */
+#define CLASS0_SIZE    (1 << CLASS0_BITS)
+#define MV_OFFSET_BITS (MV_CLASSES + CLASS0_BITS - 2)
+
+#define MV_MAX_BITS    (MV_CLASSES + CLASS0_BITS + 2)
+#define MV_MAX         ((1 << MV_MAX_BITS) - 1)
+#define MV_VALS        ((MV_MAX << 1) + 1)
+
+extern const vp9_tree_index vp9_mv_class0_tree[2 * CLASS0_SIZE - 2];
+extern struct vp9_token_struct vp9_mv_class0_encodings[CLASS0_SIZE];
+
+extern const vp9_tree_index vp9_mv_fp_tree[2 * 4 - 2];
+extern struct vp9_token_struct vp9_mv_fp_encodings[4];
+
+typedef struct {
+  vp9_prob sign;
+  vp9_prob classes[MV_CLASSES - 1];
+  vp9_prob class0[CLASS0_SIZE - 1];
+  vp9_prob bits[MV_OFFSET_BITS];
+  vp9_prob class0_fp[CLASS0_SIZE][4 - 1];
+  vp9_prob fp[4 - 1];
+  vp9_prob class0_hp;
+  vp9_prob hp;
+} nmv_component;
+
+typedef struct {
+  vp9_prob joints[MV_JOINTS - 1];
+  nmv_component comps[2];
+} nmv_context;
+
+MV_JOINT_TYPE vp9_get_mv_joint(MV mv);
+MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset);
+int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset);
+
+
+typedef struct {
+  unsigned int mvcount[MV_VALS];
+  unsigned int sign[2];
+  unsigned int classes[MV_CLASSES];
+  unsigned int class0[CLASS0_SIZE];
+  unsigned int bits[MV_OFFSET_BITS][2];
+  unsigned int class0_fp[CLASS0_SIZE][4];
+  unsigned int fp[4];
+  unsigned int class0_hp[2];
+  unsigned int hp[2];
+} nmv_component_counts;
+
+typedef struct {
+  unsigned int joints[MV_JOINTS];
+  nmv_component_counts comps[2];
+} nmv_context_counts;
+
+void vp9_increment_nmv(const MV *mv, const MV *ref, nmv_context_counts *mvctx,
+                       int usehp);
+extern const nmv_context vp9_default_nmv_context;
+void vp9_counts_to_nmv_context(
+    nmv_context_counts *NMVcount,
+    nmv_context *prob,
+    int usehp,
+    unsigned int (*branch_ct_joint)[2],
+    unsigned int (*branch_ct_sign)[2],
+    unsigned int (*branch_ct_classes)[MV_CLASSES - 1][2],
+    unsigned int (*branch_ct_class0)[CLASS0_SIZE - 1][2],
+    unsigned int (*branch_ct_bits)[MV_OFFSET_BITS][2],
+    unsigned int (*branch_ct_class0_fp)[CLASS0_SIZE][4 - 1][2],
+    unsigned int (*branch_ct_fp)[4 - 1][2],
+    unsigned int (*branch_ct_class0_hp)[2],
+    unsigned int (*branch_ct_hp)[2]);
+
+#endif  // __INC_ENTROPYMV_H
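
Note: the MV_CLASS_TYPE ranges above imply a simple magnitude-to-class mapping; a minimal sketch follows. It only illustrates the integer-pel ranges documented in the enum: the real vp9_get_mv_class() declared in this header operates on the codec's internal sub-pel units and also returns the within-class offset.

/* Sketch: class c >= 1 covers (2^c, 2^(c+1)] integer pel and class 0
 * covers (0, 2]. Assumes the MV_CLASS_TYPE enum from this header. */
static MV_CLASS_TYPE sketch_mv_class(int z) {
  MV_CLASS_TYPE c = MV_CLASS_0;
  while (c < MV_CLASS_7 && z > (2 << c))  /* 2 << c == 2^(c + 1) */
    c = (MV_CLASS_TYPE)(c + 1);
  return c;  /* e.g. z == 37 lands in (32, 64], i.e. MV_CLASS_5 */
}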
--- /dev/null
+++ b/vp9/common/extend.c
@@ -1,0 +1,169 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "extend.h"
+#include "vpx_mem/vpx_mem.h"
+
+static void copy_and_extend_plane(unsigned char *s, /* source */
+                                  int sp,           /* source pitch */
+                                  unsigned char *d, /* destination */
+                                  int dp,           /* destination pitch */
+                                  int h,            /* height */
+                                  int w,            /* width */
+                                  int et,           /* extend top border */
+                                  int el,           /* extend left border */
+                                  int eb,           /* extend bottom border */
+                                  int er) {         /* extend right border */
+  int i;
+  unsigned char *src_ptr1, *src_ptr2;
+  unsigned char *dest_ptr1, *dest_ptr2;
+  int linesize;
+
+  /* copy the left and right most columns out */
+  src_ptr1 = s;
+  src_ptr2 = s + w - 1;
+  dest_ptr1 = d - el;
+  dest_ptr2 = d + w;
+
+  for (i = 0; i < h; i++) {
+    vpx_memset(dest_ptr1, src_ptr1[0], el);
+    vpx_memcpy(dest_ptr1 + el, src_ptr1, w);
+    vpx_memset(dest_ptr2, src_ptr2[0], er);
+    src_ptr1  += sp;
+    src_ptr2  += sp;
+    dest_ptr1 += dp;
+    dest_ptr2 += dp;
+  }
+
+  /* Now copy the top and bottom lines into each line of the respective
+   * borders
+   */
+  src_ptr1 = d - el;
+  src_ptr2 = d + dp * (h - 1) - el;
+  dest_ptr1 = d + dp * (-et) - el;
+  dest_ptr2 = d + dp * (h) - el;
+  linesize = el + er + w;
+
+  for (i = 0; i < et; i++) {
+    vpx_memcpy(dest_ptr1, src_ptr1, linesize);
+    dest_ptr1 += dp;
+  }
+
+  for (i = 0; i < eb; i++) {
+    vpx_memcpy(dest_ptr2, src_ptr2, linesize);
+    dest_ptr2 += dp;
+  }
+}
+
+void vp9_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
+                               YV12_BUFFER_CONFIG *dst) {
+  int et = dst->border;
+  int el = dst->border;
+  int eb = dst->border + dst->y_height - src->y_height;
+  int er = dst->border + dst->y_width - src->y_width;
+
+  copy_and_extend_plane(src->y_buffer, src->y_stride,
+                        dst->y_buffer, dst->y_stride,
+                        src->y_height, src->y_width,
+                        et, el, eb, er);
+
+  et = dst->border >> 1;
+  el = dst->border >> 1;
+  eb = (dst->border >> 1) + dst->uv_height - src->uv_height;
+  er = (dst->border >> 1) + dst->uv_width - src->uv_width;
+
+  copy_and_extend_plane(src->u_buffer, src->uv_stride,
+                        dst->u_buffer, dst->uv_stride,
+                        src->uv_height, src->uv_width,
+                        et, el, eb, er);
+
+  copy_and_extend_plane(src->v_buffer, src->uv_stride,
+                        dst->v_buffer, dst->uv_stride,
+                        src->uv_height, src->uv_width,
+                        et, el, eb, er);
+}
+
+void vp9_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src,
+                                         YV12_BUFFER_CONFIG *dst,
+                                         int srcy, int srcx,
+                                         int srch, int srcw) {
+  int et = dst->border;
+  int el = dst->border;
+  int eb = dst->border + dst->y_height - src->y_height;
+  int er = dst->border + dst->y_width - src->y_width;
+  int src_y_offset = srcy * src->y_stride + srcx;
+  int dst_y_offset = srcy * dst->y_stride + srcx;
+  int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1);
+  int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1);
+
+  // If a side does not touch the frame border, don't extend it.
+  if (srcy)
+    et = 0;
+  if (srcx)
+    el = 0;
+  if (srcy + srch != src->y_height)
+    eb = 0;
+  if (srcx + srcw != src->y_width)
+    er = 0;
+
+  copy_and_extend_plane(src->y_buffer + src_y_offset,
+                        src->y_stride,
+                        dst->y_buffer + dst_y_offset,
+                        dst->y_stride,
+                        srch, srcw,
+                        et, el, eb, er);
+
+  et = (et + 1) >> 1;
+  el = (el + 1) >> 1;
+  eb = (eb + 1) >> 1;
+  er = (er + 1) >> 1;
+  srch = (srch + 1) >> 1;
+  srcw = (srcw + 1) >> 1;
+
+  copy_and_extend_plane(src->u_buffer + src_uv_offset,
+                        src->uv_stride,
+                        dst->u_buffer + dst_uv_offset,
+                        dst->uv_stride,
+                        srch, srcw,
+                        et, el, eb, er);
+
+  copy_and_extend_plane(src->v_buffer + src_uv_offset,
+                        src->uv_stride,
+                        dst->v_buffer + dst_uv_offset,
+                        dst->uv_stride,
+                        srch, srcw,
+                        et, el, eb, er);
+}
+
+/* Note: the extension is only for the last row, for intra-prediction purposes. */
+void vp9_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr,
+                       unsigned char *UPtr, unsigned char *VPtr) {
+  int i;
+
+  YPtr += ybf->y_stride * 14;
+  UPtr += ybf->uv_stride * 6;
+  VPtr += ybf->uv_stride * 6;
+
+  for (i = 0; i < 4; i++) {
+    YPtr[i] = YPtr[-1];
+    UPtr[i] = UPtr[-1];
+    VPtr[i] = VPtr[-1];
+  }
+
+  YPtr += ybf->y_stride;
+  UPtr += ybf->uv_stride;
+  VPtr += ybf->uv_stride;
+
+  for (i = 0; i < 4; i++) {
+    YPtr[i] = YPtr[-1];
+    UPtr[i] = UPtr[-1];
+    VPtr[i] = VPtr[-1];
+  }
+}
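
Note: per line, the left/right extension in copy_and_extend_plane() reduces to two memsets around a copied row. A self-contained sketch of that inner step (the helper name is hypothetical):

#include <string.h>

/* Replicate the edge pixels of one w-pixel line into `el` columns on the
 * left and `er` columns on the right. `row` must have el writable bytes
 * before it and er after it, as the bordered YV12 planes do. */
static void extend_row(unsigned char *row, int w, int el, int er) {
  memset(row - el, row[0], el);      /* left border  */
  memset(row + w, row[w - 1], er);   /* right border */
}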
--- /dev/null
+++ b/vp9/common/extend.h
@@ -1,0 +1,27 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef __INC_EXTEND_H
+#define __INC_EXTEND_H
+
+#include "vpx_scale/yv12config.h"
+
+void vp9_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr,
+                       unsigned char *UPtr, unsigned char *VPtr);
+
+void vp9_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
+                               YV12_BUFFER_CONFIG *dst);
+
+void vp9_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src,
+                                         YV12_BUFFER_CONFIG *dst,
+                                         int srcy, int srcx,
+                                         int srch, int srcw);
+
+#endif  // __INC_EXTEND_H
--- /dev/null
+++ b/vp9/common/filter.c
@@ -1,0 +1,1159 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include "filter.h"
+#include "vpx_ports/mem.h"
+#include "vpx_rtcd.h"
+
+DECLARE_ALIGNED(16, const short, vp9_bilinear_filters[SUBPEL_SHIFTS][2]) = {
+  { 128,   0 },
+  { 120,   8 },
+  { 112,  16 },
+  { 104,  24 },
+  {  96,  32 },
+  {  88,  40 },
+  {  80,  48 },
+  {  72,  56 },
+  {  64,  64 },
+  {  56,  72 },
+  {  48,  80 },
+  {  40,  88 },
+  {  32,  96 },
+  {  24, 104 },
+  {  16, 112 },
+  {   8, 120 }
+};
+
+#define FILTER_ALPHA       0
+#define FILTER_ALPHA_SHARP 1
+DECLARE_ALIGNED(16, const short, vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]) = {
+#if FILTER_ALPHA == 0
+  /* Lagrangian interpolation filter */
+  { 0,   0,   0, 128,   0,   0,   0,  0},
+  { 0,   1,  -5, 126,   8,  -3,   1,  0},
+  { -1,   3, -10, 122,  18,  -6,   2,  0},
+  { -1,   4, -13, 118,  27,  -9,   3, -1},
+  { -1,   4, -16, 112,  37, -11,   4, -1},
+  { -1,   5, -18, 105,  48, -14,   4, -1},
+  { -1,   5, -19,  97,  58, -16,   5, -1},
+  { -1,   6, -19,  88,  68, -18,   5, -1},
+  { -1,   6, -19,  78,  78, -19,   6, -1},
+  { -1,   5, -18,  68,  88, -19,   6, -1},
+  { -1,   5, -16,  58,  97, -19,   5, -1},
+  { -1,   4, -14,  48, 105, -18,   5, -1},
+  { -1,   4, -11,  37, 112, -16,   4, -1},
+  { -1,   3,  -9,  27, 118, -13,   4, -1},
+  { 0,   2,  -6,  18, 122, -10,   3, -1},
+  { 0,   1,  -3,   8, 126,  -5,   1,  0}
+#elif FILTER_ALPHA == 50
+  /* Generated using MATLAB:
+   * alpha = 0.5;
+   * b=intfilt(8,4,alpha);
+   * bi=round(128*b);
+   * ba=flipud(reshape([bi 0], 8, 8));
+   * disp(num2str(ba, '%d,'))
+   */
+  { 0,   0,   0, 128,   0,   0,   0,  0},
+  { 0,   1,  -5, 126,   8,  -3,   1,  0},
+  { 0,   2, -10, 122,  18,  -6,   2,  0},
+  { -1,   3, -13, 118,  27,  -9,   3,  0},
+  { -1,   4, -16, 112,  37, -11,   3,  0},
+  { -1,   5, -17, 104,  48, -14,   4, -1},
+  { -1,   5, -18,  96,  58, -16,   5, -1},
+  { -1,   5, -19,  88,  68, -17,   5, -1},
+  { -1,   5, -18,  78,  78, -18,   5, -1},
+  { -1,   5, -17,  68,  88, -19,   5, -1},
+  { -1,   5, -16,  58,  96, -18,   5, -1},
+  { -1,   4, -14,  48, 104, -17,   5, -1},
+  { 0,   3, -11,  37, 112, -16,   4, -1},
+  { 0,   3,  -9,  27, 118, -13,   3, -1},
+  { 0,   2,  -6,  18, 122, -10,   2,  0},
+  { 0,   1,  -3,   8, 126,  -5,   1,  0}
+#endif  /* FILTER_ALPHA */
+};
+
+DECLARE_ALIGNED(16, const short, vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8]) = {
+#if FILTER_ALPHA_SHARP == 1
+  /* dct based filter */
+  {0,   0,   0, 128,   0,   0,   0, 0},
+  {-1,   3,  -7, 127,   8,  -3,   1, 0},
+  {-2,   5, -13, 125,  17,  -6,   3, -1},
+  {-3,   7, -17, 121,  27, -10,   5, -2},
+  {-4,   9, -20, 115,  37, -13,   6, -2},
+  {-4,  10, -23, 108,  48, -16,   8, -3},
+  {-4,  10, -24, 100,  59, -19,   9, -3},
+  {-4,  11, -24,  90,  70, -21,  10, -4},
+  {-4,  11, -23,  80,  80, -23,  11, -4},
+  {-4,  10, -21,  70,  90, -24,  11, -4},
+  {-3,   9, -19,  59, 100, -24,  10, -4},
+  {-3,   8, -16,  48, 108, -23,  10, -4},
+  {-2,   6, -13,  37, 115, -20,   9, -4},
+  {-2,   5, -10,  27, 121, -17,   7, -3},
+  {-1,   3,  -6,  17, 125, -13,   5, -2},
+  {0,   1,  -3,   8, 127,  -7,   3, -1}
+#elif FILTER_ALPHA_SHARP == 75
+  /* alpha = 0.75 */
+  {0,   0,   0, 128,   0,   0,   0, 0},
+  {-1,   2,  -6, 126,   9,  -3,   2, -1},
+  {-1,   4, -11, 123,  18,  -7,   3, -1},
+  {-2,   6, -16, 119,  28, -10,   5, -2},
+  {-2,   7, -19, 113,  38, -13,   6, -2},
+  {-3,   8, -21, 106,  49, -16,   7, -2},
+  {-3,   9, -22,  99,  59, -19,   8, -3},
+  {-3,   9, -23,  90,  70, -21,   9, -3},
+  {-3,   9, -22,  80,  80, -22,   9, -3},
+  {-3,   9, -21,  70,  90, -23,   9, -3},
+  {-3,   8, -19,  59,  99, -22,   9, -3},
+  {-2,   7, -16,  49, 106, -21,   8, -3},
+  {-2,   6, -13,  38, 113, -19,   7, -2},
+  {-2,   5, -10,  28, 119, -16,   6, -2},
+  {-1,   3,  -7,  18, 123, -11,   4, -1},
+  {-1,   2,  -3,   9, 126,  -6,   2, -1}
+#endif  /* FILTER_ALPHA_SHARP */
+};
+
+DECLARE_ALIGNED(16, const short, vp9_sub_pel_filters_6[SUBPEL_SHIFTS][6]) = {
+  {0,   0, 128,   0,   0, 0},
+  {1,  -5, 125,   8,  -2, 1},
+  {1,  -8, 122,  17,  -5, 1},
+  {2, -11, 116,  27,  -8, 2},
+  {3, -14, 110,  37, -10, 2},
+  {3, -15, 103,  47, -12, 2},
+  {3, -16,  95,  57, -14, 3},
+  {3, -16,  86,  67, -15, 3},
+  {3, -16,  77,  77, -16, 3},
+  {3, -15,  67,  86, -16, 3},
+  {3, -14,  57,  95, -16, 3},
+  {2, -12,  47, 103, -15, 3},
+  {2, -10,  37, 110, -14, 3},
+  {2,  -8,  27, 116, -11, 2},
+  {1,  -5,  17, 122,  -8, 1},
+  {1,  -2,   8, 125,  -5, 1}
+};
+
+static void filter_block2d_first_pass_6(unsigned char *src_ptr,
+                                        int *output_ptr,
+                                        unsigned int src_pixels_per_line,
+                                        unsigned int pixel_step,
+                                        unsigned int output_height,
+                                        unsigned int output_width,
+                                        const short *vp9_filter) {
+  unsigned int i, j;
+  int  Temp;
+
+  for (i = 0; i < output_height; i++) {
+    for (j = 0; j < output_width; j++) {
+      Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) +
+             ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) +
+             ((int)src_ptr[0]                    * vp9_filter[2]) +
+             ((int)src_ptr[pixel_step]           * vp9_filter[3]) +
+             ((int)src_ptr[2 * pixel_step]       * vp9_filter[4]) +
+             ((int)src_ptr[3 * pixel_step]       * vp9_filter[5]) +
+             (VP9_FILTER_WEIGHT >> 1);      /* Rounding */
+
+      /* Normalize back to 0-255 */
+      Temp = Temp >> VP9_FILTER_SHIFT;
+
+      if (Temp < 0)
+        Temp = 0;
+      else if (Temp > 255)
+        Temp = 255;
+
+      output_ptr[j] = Temp;
+      src_ptr++;
+    }
+
+    /* Next row... */
+    src_ptr    += src_pixels_per_line - output_width;
+    output_ptr += output_width;
+  }
+}
+
+static void filter_block2d_second_pass_6(int *src_ptr,
+                                         unsigned char *output_ptr,
+                                         int output_pitch,
+                                         unsigned int src_pixels_per_line,
+                                         unsigned int pixel_step,
+                                         unsigned int output_height,
+                                         unsigned int output_width,
+                                         const short *vp9_filter) {
+  unsigned int i, j;
+  int  Temp;
+
+  for (i = 0; i < output_height; i++) {
+    for (j = 0; j < output_width; j++) {
+      /* Apply filter */
+      Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) +
+             ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) +
+             ((int)src_ptr[0]                    * vp9_filter[2]) +
+             ((int)src_ptr[pixel_step]           * vp9_filter[3]) +
+             ((int)src_ptr[2 * pixel_step]         * vp9_filter[4]) +
+             ((int)src_ptr[3 * pixel_step]         * vp9_filter[5]) +
+             (VP9_FILTER_WEIGHT >> 1);   /* Rounding */
+
+      /* Normalize back to 0-255 */
+      Temp = Temp >> VP9_FILTER_SHIFT;
+
+      if (Temp < 0)
+        Temp = 0;
+      else if (Temp > 255)
+        Temp = 255;
+
+      output_ptr[j] = (unsigned char)Temp;
+      src_ptr++;
+    }
+
+    /* Start next row */
+    src_ptr    += src_pixels_per_line - output_width;
+    output_ptr += output_pitch;
+  }
+}
+
+/*
+ * The only functional difference between filter_block2d_second_pass_6()
+ * and this function is that filter_block2d_second_pass_6() applies a
+ * six-tap filter to the input and stores the result in the output. This
+ * function (filter_block2d_second_pass_avg_6()) applies the same six-tap
+ * filter, then averages the result with the content already present in
+ * the output ((filter_result + dest + 1) >> 1) and stores that in the
+ * output.
+ */
+static void filter_block2d_second_pass_avg_6(int *src_ptr,
+                                             unsigned char *output_ptr,
+                                             int output_pitch,
+                                             unsigned int src_pixels_per_line,
+                                             unsigned int pixel_step,
+                                             unsigned int output_height,
+                                             unsigned int output_width,
+                                             const short *vp9_filter) {
+  unsigned int i, j;
+  int  Temp;
+
+  for (i = 0; i < output_height; i++) {
+    for (j = 0; j < output_width; j++) {
+      /* Apply filter */
+      Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) +
+             ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) +
+             ((int)src_ptr[0]                    * vp9_filter[2]) +
+             ((int)src_ptr[pixel_step]           * vp9_filter[3]) +
+             ((int)src_ptr[2 * pixel_step]         * vp9_filter[4]) +
+             ((int)src_ptr[3 * pixel_step]         * vp9_filter[5]) +
+             (VP9_FILTER_WEIGHT >> 1);   /* Rounding */
+
+      /* Normalize back to 0-255 */
+      Temp = Temp >> VP9_FILTER_SHIFT;
+
+      if (Temp < 0)
+        Temp = 0;
+      else if (Temp > 255)
+        Temp = 255;
+
+      output_ptr[j] = (unsigned char)((output_ptr[j] + Temp + 1) >> 1);
+      src_ptr++;
+    }
+
+    /* Start next row */
+    src_ptr    += src_pixels_per_line - output_width;
+    output_ptr += output_pitch;
+  }
+}
+
+#define Interp_Extend 3
+static void filter_block2d_6(unsigned char  *src_ptr,
+                             unsigned char  *output_ptr,
+                             unsigned int src_pixels_per_line,
+                             int output_pitch,
+                             const short  *HFilter,
+                             const short  *VFilter) {
+  int FData[(3 + Interp_Extend * 2) * 4]; /* Temp data buffer used in filtering */
+
+  /* First filter 1-D horizontally... */
+  filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1,
+                              3 + Interp_Extend * 2, 4, HFilter);
+
+  /* then filter vertically... */
+  filter_block2d_second_pass_6(FData + 4 * (Interp_Extend - 1), output_ptr, output_pitch, 4, 4, 4, 4, VFilter);
+}
+
+
+void vp9_sixtap_predict_c(unsigned char  *src_ptr,
+                          int   src_pixels_per_line,
+                          int  xoffset,
+                          int  yoffset,
+                          unsigned char *dst_ptr,
+                          int dst_pitch) {
+  const short  *HFilter;
+  const short  *VFilter;
+
+  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
+  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
+
+  filter_block2d_6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter);
+}
+
+/*
+ * The difference between filter_block2d_6() and filter_block2d_avg_6() is
+ * that filter_block2d_6() does a 6-tap filter and stores the result in the
+ * output buffer, whereas filter_block2d_avg_6() does the same 6-tap filter
+ * and then averages the result with the content already present in the
+ * output ((filter_result + dest + 1) >> 1) before storing it.
+ */
+static void filter_block2d_avg_6(unsigned char  *src_ptr,
+                                 unsigned char  *output_ptr,
+                                 unsigned int src_pixels_per_line,
+                                 int output_pitch,
+                                 const short  *HFilter,
+                                 const short  *VFilter) {
+  int FData[(3 + Interp_Extend * 2) * 4]; /* Temp data buffer used in filtering */
+
+  /* First filter 1-D horizontally... */
+  filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line),
+                              FData, src_pixels_per_line, 1,
+                              3 + Interp_Extend * 2, 4, HFilter);
+
+  /* then filter vertically... */
+  filter_block2d_second_pass_avg_6(FData + 4 * (Interp_Extend - 1), output_ptr,
+                                   output_pitch, 4, 4, 4, 4, VFilter);
+}
+
+void vp9_sixtap_predict_avg_c(unsigned char  *src_ptr,
+                              int   src_pixels_per_line,
+                              int  xoffset,
+                              int  yoffset,
+                              unsigned char *dst_ptr,
+                              int dst_pitch) {
+  const short  *HFilter;
+  const short  *VFilter;
+
+  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
+  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
+
+  filter_block2d_avg_6(src_ptr, dst_ptr, src_pixels_per_line,
+                       dst_pitch, HFilter, VFilter);
+}
+
+void vp9_sixtap_predict8x8_c(unsigned char  *src_ptr,
+                             int  src_pixels_per_line,
+                             int  xoffset,
+                             int  yoffset,
+                             unsigned char *dst_ptr,
+                             int  dst_pitch) {
+  const short  *HFilter;
+  const short  *VFilter;
+  // int FData[(7+Interp_Extend*2)*16];   /* Temp data buffer used in filtering */
+  int FData[(7 + Interp_Extend * 2) * 8]; /* Temp data buffer used in filtering */
+
+  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
+  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
+
+  /* First filter 1-D horizontally... */
+  filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1,
+                              7 + Interp_Extend * 2, 8, HFilter);
+
+  /* then filter vertically... */
+  filter_block2d_second_pass_6(FData + 8 * (Interp_Extend - 1), dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter);
+}
+
+void vp9_sixtap_predict_avg8x8_c(unsigned char  *src_ptr,
+                                 int  src_pixels_per_line,
+                                 int  xoffset,
+                                 int  yoffset,
+                                 unsigned char *dst_ptr,
+                                 int  dst_pitch) {
+  const short  *HFilter;
+  const short  *VFilter;
+  // int FData[(7+Interp_Extend*2)*16];   /* Temp data buffer used in filtering */
+  int FData[(7 + Interp_Extend * 2) * 8]; /* Temp data buffer used in filtering */
+
+  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
+  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
+
+  /* First filter 1-D horizontally... */
+  filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1,
+                              7 + Interp_Extend * 2, 8, HFilter);
+
+  /* then filter vertically... */
+  filter_block2d_second_pass_avg_6(FData + 8 * (Interp_Extend - 1), dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter);
+}
+
+void vp9_sixtap_predict8x4_c(unsigned char  *src_ptr,
+                             int  src_pixels_per_line,
+                             int  xoffset,
+                             int  yoffset,
+                             unsigned char *dst_ptr,
+                             int  dst_pitch) {
+  const short  *HFilter;
+  const short  *VFilter;
+  // int FData[(7+Interp_Extend*2)*16];   /* Temp data buffer used in filtering */
+  int FData[(3 + Interp_Extend * 2) * 8]; /* Temp data buffer used in filtering */
+
+  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
+  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
+
+  /* First filter 1-D horizontally... */
+  filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1,
+                              3 + Interp_Extend * 2, 8, HFilter);
+
+  /* then filter vertically... */
+  filter_block2d_second_pass_6(FData + 8 * (Interp_Extend - 1), dst_ptr, dst_pitch, 8, 8, 4, 8, VFilter);
+}
+
+void vp9_sixtap_predict16x16_c(unsigned char  *src_ptr,
+                               int  src_pixels_per_line,
+                               int  xoffset,
+                               int  yoffset,
+                               unsigned char *dst_ptr,
+                               int  dst_pitch) {
+  const short  *HFilter;
+  const short  *VFilter;
+  // int FData[(15+Interp_Extend*2)*24];   /* Temp data buffer used in filtering */
+  int FData[(15 + Interp_Extend * 2) * 16]; /* Temp data buffer used in filtering */
+
+  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
+  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
+
+  /* First filter 1-D horizontally... */
+  filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1,
+                              15 + Interp_Extend * 2, 16, HFilter);
+
+  /* then filter vertically... */
+  filter_block2d_second_pass_6(FData + 16 * (Interp_Extend - 1), dst_ptr, dst_pitch, 16, 16, 16, 16, VFilter);
+}
+
+void vp9_sixtap_predict_avg16x16_c(unsigned char  *src_ptr,
+                                   int  src_pixels_per_line,
+                                   int  xoffset,
+                                   int  yoffset,
+                                   unsigned char *dst_ptr,
+                                   int  dst_pitch) {
+  const short  *HFilter;
+  const short  *VFilter;
+  // int FData[(15+Interp_Extend*2)*24];   /* Temp data buffer used in filtering */
+  int FData[(15 + Interp_Extend * 2) * 16]; /* Temp data buffer used in filtering */
+
+  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
+  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
+
+  /* First filter 1-D horizontally... */
+  filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,
+                              src_pixels_per_line, 1, 15 + Interp_Extend * 2, 16, HFilter);
+
+  /* then filter vertically... */
+  filter_block2d_second_pass_avg_6(FData + 16 * (Interp_Extend - 1), dst_ptr, dst_pitch,
+                                   16, 16, 16, 16, VFilter);
+}
+
+typedef enum {
+  VPX_FILTER_4x4 = 0,
+  VPX_FILTER_8x8 = 1,
+  VPX_FILTER_8x4 = 2,
+  VPX_FILTER_16x16 = 3,
+} filter_size_t;
+
+static const unsigned int filter_size_to_wh[][2] = {
+  {4, 4},
+  {8, 8},
+  {8, 4},
+  {16, 16},
+};
+
+static const unsigned int filter_max_height = 16;
+static const unsigned int filter_max_width = 16;
+
+static void filter_block2d_8_c(const unsigned char *src_ptr,
+                               const unsigned int   src_stride,
+                               const short *HFilter,
+                               const short *VFilter,
+                               const filter_size_t filter_size,
+                               unsigned char *dst_ptr,
+                               unsigned int   dst_stride) {
+  const unsigned int output_width = filter_size_to_wh[filter_size][0];
+  const unsigned int output_height = filter_size_to_wh[filter_size][1];
+
+  // Between passes, we use an intermediate buffer whose height is extended to
+  // have enough horizontally filtered values as input for the vertical pass.
+  // This buffer is allocated to be big enough for the largest block type we
+  // support.
+  const int kInterp_Extend = 4;
+  const unsigned int intermediate_height =
+    (kInterp_Extend - 1) +     output_height + kInterp_Extend;
+  const unsigned int max_intermediate_height =
+    (kInterp_Extend - 1) + filter_max_height + kInterp_Extend;
+#ifdef _MSC_VER
+  // MSVC does not support C99 variable-length arrays; use a fixed bound
+  unsigned char intermediate_buffer[23 * 16];
+#else
+  unsigned char intermediate_buffer[max_intermediate_height * filter_max_width];
+#endif
+  const int intermediate_next_stride = 1 - intermediate_height * output_width;
+
+  // Horizontal pass (src -> transposed intermediate).
+  {
+    unsigned char *output_ptr = intermediate_buffer;
+    const int src_next_row_stride = src_stride - output_width;
+    unsigned int i, j;
+    src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
+    for (i = 0; i < intermediate_height; i++) {
+      for (j = 0; j < output_width; j++) {
+        // Apply filter...
+        int temp = ((int)src_ptr[0] * HFilter[0]) +
+                   ((int)src_ptr[1] * HFilter[1]) +
+                   ((int)src_ptr[2] * HFilter[2]) +
+                   ((int)src_ptr[3] * HFilter[3]) +
+                   ((int)src_ptr[4] * HFilter[4]) +
+                   ((int)src_ptr[5] * HFilter[5]) +
+                   ((int)src_ptr[6] * HFilter[6]) +
+                   ((int)src_ptr[7] * HFilter[7]) +
+                   (VP9_FILTER_WEIGHT >> 1); // Rounding
+
+        // Normalize back to 0-255...
+        temp >>= VP9_FILTER_SHIFT;
+        if (temp < 0) {
+          temp = 0;
+        } else if (temp > 255) {
+          temp = 255;
+        }
+        src_ptr++;
+        *output_ptr = temp;
+        output_ptr += intermediate_height;
+      }
+      src_ptr += src_next_row_stride;
+      output_ptr += intermediate_next_stride;
+    }
+  }
+
+  // Vertical pass (transposed intermediate -> dst).
+  {
+    unsigned char *src_ptr = intermediate_buffer;
+    const int dst_next_row_stride = dst_stride - output_width;
+    unsigned int i, j;
+    for (i = 0; i < output_height; i++) {
+      for (j = 0; j < output_width; j++) {
+        // Apply filter...
+        int temp = ((int)src_ptr[0] * VFilter[0]) +
+                   ((int)src_ptr[1] * VFilter[1]) +
+                   ((int)src_ptr[2] * VFilter[2]) +
+                   ((int)src_ptr[3] * VFilter[3]) +
+                   ((int)src_ptr[4] * VFilter[4]) +
+                   ((int)src_ptr[5] * VFilter[5]) +
+                   ((int)src_ptr[6] * VFilter[6]) +
+                   ((int)src_ptr[7] * VFilter[7]) +
+                   (VP9_FILTER_WEIGHT >> 1); // Rounding
+
+        // Normalize back to 0-255...
+        temp >>= VP9_FILTER_SHIFT;
+        if (temp < 0) {
+          temp = 0;
+        } else if (temp > 255) {
+          temp = 255;
+        }
+
+        src_ptr += intermediate_height;
+        *dst_ptr++ = (unsigned char)temp;
+      }
+      src_ptr += intermediate_next_stride;
+      dst_ptr += dst_next_row_stride;
+    }
+  }
+}
+
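+/* Sizing note (illustrative): with kInterp_Extend = 4, an 8x8 block needs
+ * (4 - 1) + 8 + 4 = 15 intermediate rows and the largest 16x16 block needs
+ * (4 - 1) + 16 + 4 = 23 rows, which matches the fixed 23 * 16 fallback
+ * buffer declared above for MSVC. */
+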
+void vp9_filter_block2d_4x4_8_c(const unsigned char *src_ptr,
+                                const unsigned int src_stride,
+                                const short *HFilter_aligned16,
+                                const short *VFilter_aligned16,
+                                unsigned char *dst_ptr,
+                                unsigned int dst_stride) {
+  filter_block2d_8_c(src_ptr, src_stride,
+                     HFilter_aligned16, VFilter_aligned16,
+                     VPX_FILTER_4x4, dst_ptr, dst_stride);
+}
+
+void vp9_filter_block2d_8x4_8_c(const unsigned char *src_ptr,
+                                const unsigned int src_stride,
+                                const short *HFilter_aligned16,
+                                const short *VFilter_aligned16,
+                                unsigned char *dst_ptr,
+                                unsigned int dst_stride) {
+  filter_block2d_8_c(src_ptr, src_stride,
+                     HFilter_aligned16, VFilter_aligned16,
+                     VPX_FILTER_8x4, dst_ptr, dst_stride);
+}
+
+void vp9_filter_block2d_8x8_8_c(const unsigned char *src_ptr,
+                                const unsigned int src_stride,
+                                const short *HFilter_aligned16,
+                                const short *VFilter_aligned16,
+                                unsigned char *dst_ptr,
+                                unsigned int dst_stride) {
+  filter_block2d_8_c(src_ptr, src_stride,
+                     HFilter_aligned16, VFilter_aligned16,
+                     VPX_FILTER_8x8, dst_ptr, dst_stride);
+}
+
+void vp9_filter_block2d_16x16_8_c(const unsigned char *src_ptr,
+                                  const unsigned int src_stride,
+                                  const short *HFilter_aligned16,
+                                  const short *VFilter_aligned16,
+                                  unsigned char *dst_ptr,
+                                  unsigned int dst_stride) {
+  filter_block2d_8_c(src_ptr, src_stride,
+                     HFilter_aligned16, VFilter_aligned16,
+                     VPX_FILTER_16x16, dst_ptr, dst_stride);
+}
+
+static void block2d_average_c(unsigned char *src,
+                              unsigned int   src_stride,
+                              unsigned char *output_ptr,
+                              unsigned int output_stride,
+                              const filter_size_t filter_size) {
+  const unsigned int output_width = filter_size_to_wh[filter_size][0];
+  const unsigned int output_height = filter_size_to_wh[filter_size][1];
+
+  unsigned int i, j;
+  for (i = 0; i < output_height; i++) {
+    for (j = 0; j < output_width; j++) {
+      output_ptr[j] = (output_ptr[j] + src[i * src_stride + j] + 1) >> 1;
+    }
+    output_ptr += output_stride;
+  }
+}
+
+#define block2d_average block2d_average_c
+
+void vp9_eighttap_predict_c(unsigned char  *src_ptr,
+                            int   src_pixels_per_line,
+                            int  xoffset,
+                            int  yoffset,
+                            unsigned char *dst_ptr,
+                            int dst_pitch) {
+  const short  *HFilter;
+  const short  *VFilter;
+
+  HFilter = vp9_sub_pel_filters_8[xoffset];
+  VFilter = vp9_sub_pel_filters_8[yoffset];
+
+  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line,
+                           HFilter, VFilter,
+                           dst_ptr, dst_pitch);
+}
+
+void vp9_eighttap_predict_avg4x4_c(unsigned char  *src_ptr,
+                                   int   src_pixels_per_line,
+                                   int  xoffset,
+                                   int  yoffset,
+                                   unsigned char *dst_ptr,
+                                   int dst_pitch) {
+  const short  *HFilter = vp9_sub_pel_filters_8[xoffset];
+  const short  *VFilter = vp9_sub_pel_filters_8[yoffset];
+  unsigned char tmp[4 * 4];
+
+  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line,
+                           HFilter, VFilter,
+                           tmp, 4);
+  block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4);
+}
+
+void vp9_eighttap_predict_sharp_c(unsigned char  *src_ptr,
+                                  int   src_pixels_per_line,
+                                  int  xoffset,
+                                  int  yoffset,
+                                  unsigned char *dst_ptr,
+                                  int dst_pitch) {
+  const short  *HFilter;
+  const short  *VFilter;
+
+  HFilter = vp9_sub_pel_filters_8s[xoffset];
+  VFilter = vp9_sub_pel_filters_8s[yoffset];
+
+  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line,
+                           HFilter, VFilter,
+                           dst_ptr, dst_pitch);
+}
+
+void vp9_eighttap_predict_avg4x4_sharp_c(unsigned char  *src_ptr,
+                                         int   src_pixels_per_line,
+                                         int  xoffset,
+                                         int  yoffset,
+                                         unsigned char *dst_ptr,
+                                         int dst_pitch) {
+  const short  *HFilter = vp9_sub_pel_filters_8s[xoffset];
+  const short  *VFilter = vp9_sub_pel_filters_8s[yoffset];
+  unsigned char tmp[4 * 4];
+
+  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line,
+                           HFilter, VFilter,
+                           tmp, 4);
+  block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4);
+}
+
+void vp9_eighttap_predict8x8_c(unsigned char  *src_ptr,
+                               int  src_pixels_per_line,
+                               int  xoffset,
+                               int  yoffset,
+                               unsigned char *dst_ptr,
+                               int  dst_pitch) {
+  const short  *HFilter = vp9_sub_pel_filters_8[xoffset];
+  const short  *VFilter = vp9_sub_pel_filters_8[yoffset];
+
+  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line,
+                           HFilter, VFilter,
+                           dst_ptr, dst_pitch);
+}
+
+void vp9_eighttap_predict8x8_sharp_c(unsigned char  *src_ptr,
+                                     int  src_pixels_per_line,
+                                     int  xoffset,
+                                     int  yoffset,
+                                     unsigned char *dst_ptr,
+                                     int  dst_pitch) {
+  const short  *HFilter = vp9_sub_pel_filters_8s[xoffset];
+  const short  *VFilter = vp9_sub_pel_filters_8s[yoffset];
+
+  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line,
+                           HFilter, VFilter,
+                           dst_ptr, dst_pitch);
+}
+
+void vp9_eighttap_predict_avg8x8_c(unsigned char  *src_ptr,
+                                   int  src_pixels_per_line,
+                                   int  xoffset,
+                                   int  yoffset,
+                                   unsigned char *dst_ptr,
+                                   int  dst_pitch) {
+  unsigned char tmp[8 * 8];
+  const short  *HFilter = vp9_sub_pel_filters_8[xoffset];
+  const short  *VFilter = vp9_sub_pel_filters_8[yoffset];
+
+  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line,
+                           HFilter, VFilter,
+                           tmp, 8);
+  block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8);
+}
+
+void vp9_eighttap_predict_avg8x8_sharp_c(unsigned char  *src_ptr,
+                                         int  src_pixels_per_line,
+                                         int  xoffset,
+                                         int  yoffset,
+                                         unsigned char *dst_ptr,
+                                         int  dst_pitch) {
+  unsigned char tmp[8 * 8];
+  const short  *HFilter = vp9_sub_pel_filters_8s[xoffset];
+  const short  *VFilter = vp9_sub_pel_filters_8s[yoffset];
+
+  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line,
+                           HFilter, VFilter,
+                           tmp, 8);
+  block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8);
+}
+
+void vp9_eighttap_predict8x4_c(unsigned char  *src_ptr,
+                               int  src_pixels_per_line,
+                               int  xoffset,
+                               int  yoffset,
+                               unsigned char *dst_ptr,
+                               int  dst_pitch) {
+  const short  *HFilter = vp9_sub_pel_filters_8[xoffset];
+  const short  *VFilter = vp9_sub_pel_filters_8[yoffset];
+
+  vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line,
+                           HFilter, VFilter,
+                           dst_ptr, dst_pitch);
+}
+
+void vp9_eighttap_predict8x4_sharp_c(unsigned char  *src_ptr,
+                                     int  src_pixels_per_line,
+                                     int  xoffset,
+                                     int  yoffset,
+                                     unsigned char *dst_ptr,
+                                     int  dst_pitch) {
+  const short  *HFilter = vp9_sub_pel_filters_8s[xoffset];
+  const short  *VFilter = vp9_sub_pel_filters_8s[yoffset];
+
+  vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line,
+                           HFilter, VFilter,
+                           dst_ptr, dst_pitch);
+}
+
+void vp9_eighttap_predict16x16_c(unsigned char  *src_ptr,
+                                 int  src_pixels_per_line,
+                                 int  xoffset,
+                                 int  yoffset,
+                                 unsigned char *dst_ptr,
+                                 int  dst_pitch) {
+  const short  *HFilter = vp9_sub_pel_filters_8[xoffset];
+  const short  *VFilter = vp9_sub_pel_filters_8[yoffset];
+
+  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line,
+                             HFilter, VFilter,
+                             dst_ptr, dst_pitch);
+}
+
+void vp9_eighttap_predict16x16_sharp_c(unsigned char  *src_ptr,
+                                       int  src_pixels_per_line,
+                                       int  xoffset,
+                                       int  yoffset,
+                                       unsigned char *dst_ptr,
+                                       int  dst_pitch) {
+  const short  *HFilter = vp9_sub_pel_filters_8s[xoffset];
+  const short  *VFilter = vp9_sub_pel_filters_8s[yoffset];
+
+  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line,
+                             HFilter, VFilter,
+                             dst_ptr, dst_pitch);
+}
+
+void vp9_eighttap_predict_avg16x16_c(unsigned char  *src_ptr,
+                                     int  src_pixels_per_line,
+                                     int  xoffset,
+                                     int  yoffset,
+                                     unsigned char *dst_ptr,
+                                     int  dst_pitch) {
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 16 * 16);
+  const short  *HFilter = vp9_sub_pel_filters_8[xoffset];
+  const short  *VFilter = vp9_sub_pel_filters_8[yoffset];
+
+  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line,
+                             HFilter, VFilter,
+                             tmp, 16);
+  block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16);
+}
+
+void vp9_eighttap_predict_avg16x16_sharp_c(unsigned char  *src_ptr,
+                                           int  src_pixels_per_line,
+                                           int  xoffset,
+                                           int  yoffset,
+                                           unsigned char *dst_ptr,
+                                           int  dst_pitch) {
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 16 * 16);
+  const short  *HFilter = vp9_sub_pel_filters_8s[xoffset];
+  const short  *VFilter = vp9_sub_pel_filters_8s[yoffset];
+
+  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line,
+                             HFilter, VFilter,
+                             tmp, 16);
+  block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16);
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : filter_block2d_bil_first_pass
+ *
+ *  INPUTS        : UINT8  *src_ptr    : Pointer to source block.
+ *                  UINT32  src_stride : Stride of source block.
+ *                  UINT32  height     : Block height.
+ *                  UINT32  width      : Block width.
+ *                  INT16  *vp9_filter : Array of 2 bi-linear filter taps.
+ *
+ *  OUTPUTS       : UINT16 *dst_ptr    : Pointer to filtered block.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block
+ *                  in the horizontal direction to produce the filtered output
+ *                  block. Used to implement the first pass of a 2-D separable
+ *                  filter.
+ *
+ *  SPECIAL NOTES : Produces UINT16 output to retain precision for the next
+ *                  pass. The two filter taps should sum to VP9_FILTER_WEIGHT.
+ *
+ ****************************************************************************/
+static void filter_block2d_bil_first_pass(unsigned char  *src_ptr,
+                                          unsigned short *dst_ptr,
+                                          unsigned int    src_stride,
+                                          unsigned int    height,
+                                          unsigned int    width,
+                                          const short    *vp9_filter) {
+  unsigned int i, j;
+
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      /* Apply bilinear filter */
+      dst_ptr[j] = (((int)src_ptr[0] * vp9_filter[0]) +
+                    ((int)src_ptr[1] * vp9_filter[1]) +
+                    (VP9_FILTER_WEIGHT / 2)) >> VP9_FILTER_SHIFT;
+      src_ptr++;
+    }
+
+    /* Next row... */
+    src_ptr += src_stride - width;
+    dst_ptr += width;
+  }
+}
+
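+/* Worked example (illustrative): with vp9_bilinear_filters[2] = {112, 16}
+ * and neighboring source pixels 100 and 104, one first-pass output is
+ *   (100 * 112 + 104 * 16 + VP9_FILTER_WEIGHT / 2) >> VP9_FILTER_SHIFT
+ *     = (11200 + 1664 + 64) >> 7 = 12928 >> 7 = 101,
+ * leaning toward the left pixel, as expected for a small sub-pel offset. */
+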
+/****************************************************************************
+ *
+ *  ROUTINE       : filter_block2d_bil_second_pass
+ *
+ *  INPUTS        : UINT16 *src_ptr    : Pointer to source block.
+ *                  UINT32  dst_pitch  : Destination block pitch.
+ *                  UINT32  height     : Block height.
+ *                  UINT32  width      : Block width.
+ *                  INT16  *vp9_filter : Array of 2 bi-linear filter taps.
+ *
+ *  OUTPUTS       : UINT8  *dst_ptr    : Pointer to filtered block.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block
+ *                  in the vertical direction to produce the filtered output
+ *                  block. Used to implement the second pass of a 2-D separable
+ *                  filter.
+ *
+ *  SPECIAL NOTES : Requires 16-bit input as produced by
+ *                  filter_block2d_bil_first_pass. The two filter taps should
+ *                  sum to VP9_FILTER_WEIGHT.
+ *
+ ****************************************************************************/
+static void filter_block2d_bil_second_pass(unsigned short *src_ptr,
+                                           unsigned char  *dst_ptr,
+                                           int             dst_pitch,
+                                           unsigned int    height,
+                                           unsigned int    width,
+                                           const short    *vp9_filter) {
+  unsigned int  i, j;
+  int  Temp;
+
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      /* Apply filter */
+      Temp = ((int)src_ptr[0]     * vp9_filter[0]) +
+             ((int)src_ptr[width] * vp9_filter[1]) +
+             (VP9_FILTER_WEIGHT / 2);
+      dst_ptr[j] = (unsigned char)(Temp >> VP9_FILTER_SHIFT);
+      src_ptr++;
+    }
+
+    /* Next row... */
+    dst_ptr += dst_pitch;
+  }
+}
+
+/*
+ * As with filter_block2d_second_pass_avg_6(), the functional difference
+ * between filter_block2d_bil_second_pass() and
+ * filter_block2d_bil_second_pass_avg() is that the former applies a
+ * bilinear filter to the input and stores the result in the output,
+ * while the latter applies the same bilinear filter, averages the result
+ * with the values already present in the output
+ * ((filter_result + dest + 1) >> 1), and stores that average back into
+ * the output.
+ */
+static void filter_block2d_bil_second_pass_avg(unsigned short *src_ptr,
+                                               unsigned char  *dst_ptr,
+                                               int             dst_pitch,
+                                               unsigned int    height,
+                                               unsigned int    width,
+                                               const short    *vp9_filter) {
+  unsigned int  i, j;
+  int  Temp;
+
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      /* Apply filter */
+      Temp = ((int)src_ptr[0]     * vp9_filter[0]) +
+             ((int)src_ptr[width] * vp9_filter[1]) +
+             (VP9_FILTER_WEIGHT / 2);
+      dst_ptr[j] = (unsigned char)(((Temp >> VP9_FILTER_SHIFT) + dst_ptr[j] + 1) >> 1);
+      src_ptr++;
+    }
+
+    /* Next row... */
+    dst_ptr += dst_pitch;
+  }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : filter_block2d_bil
+ *
+ *  INPUTS        : UINT8  *src_ptr   : Pointer to source block.
+ *                  UINT32  src_pitch : Stride of source block.
+ *                  UINT32  dst_pitch : Stride of destination block.
+ *                  INT16  *HFilter   : Array of 2 horizontal filter taps.
+ *                  INT16  *VFilter   : Array of 2 vertical filter taps.
+ *                  INT32   Width     : Block width.
+ *                  INT32   Height    : Block height.
+ *
+ *  OUTPUTS       : UINT8  *dst_ptr   : Pointer to filtered block.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : 2-D filters an input block by applying a 2-tap
+ *                  bi-linear filter horizontally followed by a 2-tap
+ *                  bi-linear filter vertically on the result.
+ *
+ *  SPECIAL NOTES : The largest block size that can be handled here is 16x16.
+ *
+ ****************************************************************************/
+static void filter_block2d_bil(unsigned char *src_ptr,
+                               unsigned char *dst_ptr,
+                               unsigned int   src_pitch,
+                               unsigned int   dst_pitch,
+                               const short   *HFilter,
+                               const short   *VFilter,
+                               int            Width,
+                               int            Height) {
+  unsigned short FData[17 * 16];  /* Temp data buffer used in filtering */
+
+  /* First filter 1-D horizontally... */
+  filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
+
+  /* then 1-D vertically... */
+  filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
+}
+
+static void filter_block2d_bil_avg(unsigned char *src_ptr,
+                                   unsigned char *dst_ptr,
+                                   unsigned int   src_pitch,
+                                   unsigned int   dst_pitch,
+                                   const short   *HFilter,
+                                   const short   *VFilter,
+                                   int            Width,
+                                   int            Height) {
+  unsigned short FData[17 * 16];  /* Temp data buffer used in filtering */
+
+  /* First filter 1-D horizontally... */
+  filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
+
+  /* then 1-D vertically... */
+  filter_block2d_bil_second_pass_avg(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
+}
+
+void vp9_bilinear_predict4x4_c(unsigned char  *src_ptr,
+                               int   src_pixels_per_line,
+                               int  xoffset,
+                               int  yoffset,
+                               unsigned char *dst_ptr,
+                               int dst_pitch) {
+  const short *HFilter;
+  const short *VFilter;
+
+  HFilter = vp9_bilinear_filters[xoffset];
+  VFilter = vp9_bilinear_filters[yoffset];
+
+  filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
+}
+
+void vp9_bilinear_predict_avg4x4_c(unsigned char  *src_ptr,
+                                   int   src_pixels_per_line,
+                                   int  xoffset,
+                                   int  yoffset,
+                                   unsigned char *dst_ptr,
+                                   int dst_pitch) {
+  const short *HFilter;
+  const short *VFilter;
+
+  HFilter = vp9_bilinear_filters[xoffset];
+  VFilter = vp9_bilinear_filters[yoffset];
+
+  filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line,
+                         dst_pitch, HFilter, VFilter, 4, 4);
+}
+
+void vp9_bilinear_predict8x8_c(unsigned char  *src_ptr,
+                               int  src_pixels_per_line,
+                               int  xoffset,
+                               int  yoffset,
+                               unsigned char *dst_ptr,
+                               int  dst_pitch) {
+  const short *HFilter;
+  const short *VFilter;
+
+  HFilter = vp9_bilinear_filters[xoffset];
+  VFilter = vp9_bilinear_filters[yoffset];
+
+  filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
+}
+
+void vp9_bilinear_predict_avg8x8_c(unsigned char  *src_ptr,
+                                   int  src_pixels_per_line,
+                                   int  xoffset,
+                                   int  yoffset,
+                                   unsigned char *dst_ptr,
+                                   int  dst_pitch) {
+  const short *HFilter;
+  const short *VFilter;
+
+  HFilter = vp9_bilinear_filters[xoffset];
+  VFilter = vp9_bilinear_filters[yoffset];
+
+  filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line,
+                         dst_pitch, HFilter, VFilter, 8, 8);
+}
+
+void vp9_bilinear_predict8x4_c(unsigned char  *src_ptr,
+                               int  src_pixels_per_line,
+                               int  xoffset,
+                               int  yoffset,
+                               unsigned char *dst_ptr,
+                               int  dst_pitch) {
+  const short *HFilter;
+  const short *VFilter;
+
+  HFilter = vp9_bilinear_filters[xoffset];
+  VFilter = vp9_bilinear_filters[yoffset];
+
+  filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
+}
+
+void vp9_bilinear_predict16x16_c(unsigned char  *src_ptr,
+                                 int  src_pixels_per_line,
+                                 int  xoffset,
+                                 int  yoffset,
+                                 unsigned char *dst_ptr,
+                                 int  dst_pitch) {
+  const short *HFilter;
+  const short *VFilter;
+
+  HFilter = vp9_bilinear_filters[xoffset];
+  VFilter = vp9_bilinear_filters[yoffset];
+
+  filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
+}
+
+void vp9_bilinear_predict_avg16x16_c(unsigned char  *src_ptr,
+                                     int  src_pixels_per_line,
+                                     int  xoffset,
+                                     int  yoffset,
+                                     unsigned char *dst_ptr,
+                                     int  dst_pitch) {
+  const short *HFilter;
+  const short *VFilter;
+
+  HFilter = vp9_bilinear_filters[xoffset];
+  VFilter = vp9_bilinear_filters[yoffset];
+
+  filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line,
+                         dst_pitch, HFilter, VFilter, 16, 16);
+}
--- /dev/null
+++ b/vp9/common/filter.h
@@ -1,0 +1,28 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef FILTER_H
+#define FILTER_H
+
+#include "vpx_config.h"
+#include "vpx_scale/yv12config.h"
+
+#define BLOCK_HEIGHT_WIDTH 4
+#define VP9_FILTER_WEIGHT 128
+#define VP9_FILTER_SHIFT  7
+
+#define SUBPEL_SHIFTS 16
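+
+/* Each kernel below is indexed by one of the SUBPEL_SHIFTS (16) subpel
+ * positions, and its taps sum to VP9_FILTER_WEIGHT (1 << VP9_FILTER_SHIFT),
+ * so filtered outputs are renormalized with a >> VP9_FILTER_SHIFT. */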
+
+extern const short vp9_bilinear_filters[SUBPEL_SHIFTS][2];
+extern const short vp9_sub_pel_filters_6[SUBPEL_SHIFTS][6];
+extern const short vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8];
+extern const short vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8];
+
+#endif // FILTER_H
--- /dev/null
+++ b/vp9/common/findnearmv.c
@@ -1,0 +1,327 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "findnearmv.h"
+#include "vp9/common/sadmxn.h"
+#include <limits.h>
+
+const unsigned char vp9_mbsplit_offset[4][16] = {
+  { 0,  8,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
+  { 0,  2,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
+  { 0,  2,  8, 10,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
+  { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15}
+};
+
+static void lower_mv_precision(int_mv *mv, int usehp) {
+  if (!usehp || !vp9_use_nmv_hp(&mv->as_mv)) {
+    if (mv->as_mv.row & 1)
+      mv->as_mv.row += (mv->as_mv.row > 0 ? -1 : 1);
+    if (mv->as_mv.col & 1)
+      mv->as_mv.col += (mv->as_mv.col > 0 ? -1 : 1);
+  }
+}
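+
+/* For example, with high-precision MVs disabled a row of 5 (5/8 pel) is
+ * truncated towards zero to 4 (1/2 pel), and a row of -5 becomes -4. */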
+
+/* Predict motion vectors using those from already-decoded nearby blocks.
+   Note that we only consider one 4x4 subblock from each candidate 16x16
+   macroblock.   */
+
+void vp9_find_near_mvs(MACROBLOCKD *xd,
+                       const MODE_INFO *here,
+                       const MODE_INFO *lf_here,
+                       int_mv *nearest,
+                       int_mv *nearby,
+                       int_mv *best_mv,
+                       int cnt[4],
+                       int refframe,
+                       int *ref_frame_sign_bias) {
+  const MODE_INFO *above = here - xd->mode_info_stride;
+  const MODE_INFO *left = here - 1;
+  const MODE_INFO *aboveleft = above - 1;
+  const MODE_INFO *third = NULL;
+  int_mv            near_mvs[4];
+  int_mv           *mv = near_mvs;
+  int             *cntx = cnt;
+  enum {CNT_INTRA, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV};
+
+  /* Zero accumulators */
+  mv[0].as_int = mv[1].as_int = mv[2].as_int = 0;
+  cnt[0] = cnt[1] = cnt[2] = cnt[3] = 0;
+
+  /* Process above */
+  if (above->mbmi.ref_frame != INTRA_FRAME) {
+    if (above->mbmi.mv[0].as_int) {
+      ++mv;
+      mv->as_int = above->mbmi.mv[0].as_int;
+      mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame],
+              refframe, mv, ref_frame_sign_bias);
+      ++cntx;
+    }
+    *cntx += 2;
+  }
+
+  /* Process left */
+  if (left->mbmi.ref_frame != INTRA_FRAME) {
+    if (left->mbmi.mv[0].as_int) {
+      int_mv this_mv;
+      this_mv.as_int = left->mbmi.mv[0].as_int;
+      mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame],
+              refframe, &this_mv, ref_frame_sign_bias);
+
+      if (this_mv.as_int != mv->as_int) {
+        ++mv;
+        mv->as_int = this_mv.as_int;
+        ++cntx;
+      }
+      *cntx += 2;
+    } else
+      cnt[CNT_INTRA] += 2;
+  }
+  /* Process above left or the one from last frame */
+  if (aboveleft->mbmi.ref_frame != INTRA_FRAME ||
+      (lf_here->mbmi.ref_frame == LAST_FRAME && refframe == LAST_FRAME)) {
+    if (aboveleft->mbmi.mv[0].as_int) {
+      third = aboveleft;
+    } else if (lf_here->mbmi.mv[0].as_int) {
+      third = lf_here;
+    }
+    if (third) {
+      int_mv this_mv;
+      this_mv.as_int = third->mbmi.mv[0].as_int;
+      mv_bias(ref_frame_sign_bias[third->mbmi.ref_frame],
+              refframe, &this_mv, ref_frame_sign_bias);
+
+      if (this_mv.as_int != mv->as_int) {
+        ++mv;
+        mv->as_int = this_mv.as_int;
+        ++cntx;
+      }
+      *cntx += 1;
+    } else
+      cnt[CNT_INTRA] += 1;
+  }
+
+  /* If we have three distinct MV's ... */
+  if (cnt[CNT_SPLITMV]) {
+    /* See if the third MV can be merged with NEAREST */
+    if (mv->as_int == near_mvs[CNT_NEAREST].as_int)
+      cnt[CNT_NEAREST] += 1;
+  }
+
+  cnt[CNT_SPLITMV] = ((above->mbmi.mode == SPLITMV)
+                      + (left->mbmi.mode == SPLITMV)) * 2
+                     + (
+                       lf_here->mbmi.mode == SPLITMV ||
+                       aboveleft->mbmi.mode == SPLITMV);
+
+  /* Swap near and nearest if necessary */
+  if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
+    int tmp;
+    tmp = cnt[CNT_NEAREST];
+    cnt[CNT_NEAREST] = cnt[CNT_NEAR];
+    cnt[CNT_NEAR] = tmp;
+    tmp = near_mvs[CNT_NEAREST].as_int;
+    near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int;
+    near_mvs[CNT_NEAR].as_int = tmp;
+  }
+
+  /* Use near_mvs[0] to store the "best" MV */
+  if (cnt[CNT_NEAREST] >= cnt[CNT_INTRA])
+    near_mvs[CNT_INTRA] = near_mvs[CNT_NEAREST];
+
+  /* Set up return values */
+  best_mv->as_int = near_mvs[0].as_int;
+  nearest->as_int = near_mvs[CNT_NEAREST].as_int;
+  nearby->as_int = near_mvs[CNT_NEAR].as_int;
+
+  /* Make sure the 1/8-pel bits of the MVs are zero if high precision is not
+   * in use, by truncating the last bit towards 0.
+   */
+  lower_mv_precision(best_mv, xd->allow_high_precision_mv);
+  lower_mv_precision(nearest, xd->allow_high_precision_mv);
+  lower_mv_precision(nearby, xd->allow_high_precision_mv);
+
+  // TODO: move clamp outside findnearmv
+  clamp_mv2(nearest, xd);
+  clamp_mv2(nearby, xd);
+  clamp_mv2(best_mv, xd);
+}
+
+vp9_prob *vp9_mv_ref_probs(VP9_COMMON *pc,
+                           vp9_prob p[VP9_MVREFS - 1],
+                           const int near_mv_ref_ct[4]) {
+  p[0] = pc->fc.vp8_mode_contexts[near_mv_ref_ct[0]][0];
+  p[1] = pc->fc.vp8_mode_contexts[near_mv_ref_ct[1]][1];
+  p[2] = pc->fc.vp8_mode_contexts[near_mv_ref_ct[2]][2];
+  p[3] = pc->fc.vp8_mode_contexts[near_mv_ref_ct[3]][3];
+  return p;
+}
+
+#if CONFIG_NEWBESTREFMV
+#define SP(x) (((x) & 7) << 1)
+unsigned int vp9_sad3x16_c(
+  const unsigned char *src_ptr,
+  int  src_stride,
+  const unsigned char *ref_ptr,
+  int  ref_stride,
+  int max_sad) {
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 3, 16);
+}
+unsigned int vp9_sad16x3_c(
+  const unsigned char *src_ptr,
+  int  src_stride,
+  const unsigned char *ref_ptr,
+  int  ref_stride,
+  int max_sad) {
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 3);
+}
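+
+/* For reference, sad_mx_n_c() (vp9/common/sadmxn.h) is assumed to be the
+ * plain SAD loop, roughly:
+ *   for (r = 0; r < n; r++, src_ptr += src_stride, ref_ptr += ref_stride)
+ *     for (c = 0; c < m; c++)
+ *       sad += abs(src_ptr[c] - ref_ptr[c]);
+ * The max_sad argument is accepted only for signature compatibility; these
+ * wrappers do not use it for early termination. */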
+
+/* Check a list of candidate motion vectors by computing a SAD score against a
+ * few rows of pixels above and a few columns of pixels to the left of the
+ * block, and select the best-scoring vector as the reference motion vector.
+ */
+void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
+                           unsigned char *ref_y_buffer,
+                           int ref_y_stride,
+                           int_mv *mvlist,
+                           int_mv *best_mv,
+                           int_mv *nearest,
+                           int_mv *near) {
+  int i, j;
+  unsigned char *above_src;
+  unsigned char *left_src;
+  unsigned char *above_ref;
+  unsigned char *left_ref;
+  int score;
+  int sse;
+  int ref_scores[MAX_MV_REFS] = {0};
+  int_mv sorted_mvs[MAX_MV_REFS];
+  int zero_seen = FALSE;
+
+  // Default all to 0,0 if nothing else available
+  best_mv->as_int = nearest->as_int = near->as_int = 0;
+  vpx_memset(sorted_mvs, 0, sizeof(sorted_mvs));
+
+#if CONFIG_SUBPELREFMV
+  above_src = xd->dst.y_buffer - xd->dst.y_stride * 2;
+  left_src  = xd->dst.y_buffer - 2;
+  above_ref = ref_y_buffer - ref_y_stride * 2;
+  left_ref  = ref_y_buffer - 2;
+#else
+  above_src = xd->dst.y_buffer - xd->dst.y_stride * 3;
+  left_src  = xd->dst.y_buffer - 3;
+  above_ref = ref_y_buffer - ref_y_stride * 3;
+  left_ref  = ref_y_buffer - 3;
+#endif
+
+  // Limit the search to the predicted best 4 candidates rather than all
+  // MAX_MV_REFS entries in the list.
+  for (i = 0; i < 4; ++i) {
+    int_mv this_mv;
+    int offset = 0;
+    int row_offset, col_offset;
+
+    this_mv.as_int = mvlist[i].as_int;
+
+    // If we see a 0,0 vector for a second time we have reached the end of
+    // the list of valid candidate vectors.
+    if (!this_mv.as_int && zero_seen)
+      break;
+
+    zero_seen = zero_seen || !this_mv.as_int;
+
+    clamp_mv(&this_mv,
+             xd->mb_to_left_edge - LEFT_TOP_MARGIN + 24,
+             xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
+             xd->mb_to_top_edge - LEFT_TOP_MARGIN + 24,
+             xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
+#if CONFIG_SUBPELREFMV
+    row_offset = this_mv.as_mv.row >> 3;
+    col_offset = this_mv.as_mv.col >> 3;
+    offset = ref_y_stride * row_offset + col_offset;
+    score = 0;
+    if (xd->up_available) {
+      vp9_sub_pixel_variance16x2_c(above_ref + offset, ref_y_stride,
+                                   SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+                                   above_src, xd->dst.y_stride, &sse);
+      score += sse;
+    }
+    if (xd->left_available) {
+      vp9_sub_pixel_variance2x16_c(left_ref + offset, ref_y_stride,
+                                   SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+                                   left_src, xd->dst.y_stride, &sse);
+      score += sse;
+    }
+#else
+    row_offset = (this_mv.as_mv.row > 0) ? ((this_mv.as_mv.row + 3) >> 3)
+                                         : ((this_mv.as_mv.row + 4) >> 3);
+    col_offset = (this_mv.as_mv.col > 0) ? ((this_mv.as_mv.col + 3) >> 3)
+                                         : ((this_mv.as_mv.col + 4) >> 3);
+    offset = ref_y_stride * row_offset + col_offset;
+    score = 0;
+    if (xd->up_available) {
+      score += vp9_sad16x3(above_src, xd->dst.y_stride,
+                           above_ref + offset, ref_y_stride, INT_MAX);
+    }
+    if (xd->left_available) {
+      score += vp9_sad3x16(left_src, xd->dst.y_stride,
+                           left_ref + offset, ref_y_stride, INT_MAX);
+    }
+#endif
+    // Add the entry to our list and then resort the list on score.
+    ref_scores[i] = score;
+    sorted_mvs[i].as_int = this_mv.as_int;
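+    // Insertion sort: bubble the new entry towards the front until the list
+    // is ordered by ascending score (a lower score is a better candidate).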
+    j = i;
+    while (j > 0) {
+      if (ref_scores[j] < ref_scores[j-1]) {
+        ref_scores[j] = ref_scores[j-1];
+        sorted_mvs[j].as_int = sorted_mvs[j-1].as_int;
+        ref_scores[j-1] = score;
+        sorted_mvs[j-1].as_int = this_mv.as_int;
+        j--;
+      } else
+        break;
+    }
+  }
+
+  // Make sure all the candidates are properly clamped etc
+  for (i = 0; i < 4; ++i) {
+    lower_mv_precision(&sorted_mvs[i], xd->allow_high_precision_mv);
+    clamp_mv2(&sorted_mvs[i], xd);
+  }
+
+  // Set the best mv to the first entry in the sorted list
+  best_mv->as_int = sorted_mvs[0].as_int;
+
+  // Provided there are non-zero vectors available, there will not be more
+  // than one 0,0 entry in the sorted list.
+  // The best ref mv is always set to the first entry (which gave the best
+  // result). The nearest is set to the first non-zero vector, if available,
+  // and near to the second non-zero vector, if available.
+  // We do not use 0,0 as nearest or near, as 0,0 has its own mode.
+  if (sorted_mvs[0].as_int) {
+    nearest->as_int = sorted_mvs[0].as_int;
+    if (sorted_mvs[1].as_int)
+      near->as_int = sorted_mvs[1].as_int;
+    else
+      near->as_int = sorted_mvs[2].as_int;
+  } else {
+    nearest->as_int = sorted_mvs[1].as_int;
+    near->as_int = sorted_mvs[2].as_int;
+  }
+
+  // Copy back the re-ordered mv list
+  vpx_memcpy(mvlist, sorted_mvs, sizeof(sorted_mvs));
+}
+
+#endif  // CONFIG_NEWBESTREFMV
--- /dev/null
+++ b/vp9/common/findnearmv.h
@@ -1,0 +1,188 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_FINDNEARMV_H
+#define __INC_FINDNEARMV_H
+
+#include "mv.h"
+#include "blockd.h"
+#include "modecont.h"
+#include "treecoder.h"
+#include "onyxc_int.h"
+
+#if CONFIG_NEWBESTREFMV
+/* Check a list of candidate motion vectors by computing a SAD score against a
+ * few rows of pixels above and a few columns of pixels to the left of the
+ * block, and select the best-scoring vector as the reference motion vector.
+ */
+void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
+                           unsigned char *ref_y_buffer,
+                           int ref_y_stride,
+                           int_mv *mvlist,
+                           int_mv *best_mv,
+                           int_mv *nearest,
+                           int_mv *near);
+#endif
+
+static void mv_bias(int refmb_ref_frame_sign_bias, int refframe, int_mv *mvp,
+                    const int *ref_frame_sign_bias) {
+  MV xmv;
+  xmv = mvp->as_mv;
+
+  if (refmb_ref_frame_sign_bias != ref_frame_sign_bias[refframe]) {
+    xmv.row *= -1;
+    xmv.col *= -1;
+  }
+
+  mvp->as_mv = xmv;
+}
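+
+/* Example: if the candidate MV was coded against a reference whose sign bias
+ * differs from that of the frame we are predicting from, the vector points
+ * the opposite way in time, so it is negated before use. */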
+
+#define LEFT_TOP_MARGIN (16 << 3)
+#define RIGHT_BOTTOM_MARGIN (16 << 3)
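+/* The margins are in 1/8-pel units: an MV may point up to 16 pixels outside
+ * the frame border before clamp_mv() pulls it back in. */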
+
+static void clamp_mv(int_mv *mv,
+                     int mb_to_left_edge,
+                     int mb_to_right_edge,
+                     int mb_to_top_edge,
+                     int mb_to_bottom_edge) {
+  mv->as_mv.col = (mv->as_mv.col < mb_to_left_edge) ?
+                  mb_to_left_edge : mv->as_mv.col;
+  mv->as_mv.col = (mv->as_mv.col > mb_to_right_edge) ?
+                  mb_to_right_edge : mv->as_mv.col;
+  mv->as_mv.row = (mv->as_mv.row < mb_to_top_edge) ?
+                  mb_to_top_edge : mv->as_mv.row;
+  mv->as_mv.row = (mv->as_mv.row > mb_to_bottom_edge) ?
+                  mb_to_bottom_edge : mv->as_mv.row;
+}
+
+static void clamp_mv2(int_mv *mv, const MACROBLOCKD *xd) {
+  clamp_mv(mv,
+           xd->mb_to_left_edge - LEFT_TOP_MARGIN,
+           xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
+           xd->mb_to_top_edge - LEFT_TOP_MARGIN,
+           xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
+}
+
+static unsigned int check_mv_bounds(int_mv *mv,
+                                    int mb_to_left_edge,
+                                    int mb_to_right_edge,
+                                    int mb_to_top_edge,
+                                    int mb_to_bottom_edge) {
+  return (mv->as_mv.col < mb_to_left_edge) ||
+         (mv->as_mv.col > mb_to_right_edge) ||
+         (mv->as_mv.row < mb_to_top_edge) ||
+         (mv->as_mv.row > mb_to_bottom_edge);
+}
+
+void vp9_find_near_mvs(MACROBLOCKD *xd,
+                       const MODE_INFO *here,
+                       const MODE_INFO *lfhere,
+                       int_mv *nearest, int_mv *nearby, int_mv *best,
+                       int near_mv_ref_cts[4],
+                       int refframe,
+                       int *ref_frame_sign_bias);
+
+vp9_prob *vp9_mv_ref_probs(VP9_COMMON *pc,
+                           vp9_prob p[VP9_MVREFS - 1],
+                           const int near_mv_ref_ct[4]);
+
+extern const unsigned char vp9_mbsplit_offset[4][16];
+
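+/* The helpers below address the 16 4x4 subblocks of a macroblock in raster
+ * order, so (b & 3) == 0 means subblock b lies on the left edge of the MB
+ * and (b >> 2) == 0 means it lies on the top edge. */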
+static int left_block_mv(const MODE_INFO *cur_mb, int b) {
+  if (!(b & 3)) {
+    /* On L edge, get from MB to left of us */
+    --cur_mb;
+
+    if (cur_mb->mbmi.mode != SPLITMV)
+      return cur_mb->mbmi.mv[0].as_int;
+    b += 4;
+  }
+
+  return (cur_mb->bmi + b - 1)->as_mv.first.as_int;
+}
+
+static int left_block_second_mv(const MODE_INFO *cur_mb, int b) {
+  if (!(b & 3)) {
+    /* On L edge, get from MB to left of us */
+    --cur_mb;
+
+    if (cur_mb->mbmi.mode != SPLITMV)
+      return cur_mb->mbmi.second_ref_frame ? cur_mb->mbmi.mv[1].as_int
+                                           : cur_mb->mbmi.mv[0].as_int;
+    b += 4;
+  }
+
+  return cur_mb->mbmi.second_ref_frame
+             ? (cur_mb->bmi + b - 1)->as_mv.second.as_int
+             : (cur_mb->bmi + b - 1)->as_mv.first.as_int;
+}
+
+static int above_block_mv(const MODE_INFO *cur_mb, int b, int mi_stride) {
+  if (!(b >> 2)) {
+    /* On top edge, get from MB above us */
+    cur_mb -= mi_stride;
+
+    if (cur_mb->mbmi.mode != SPLITMV)
+      return cur_mb->mbmi.mv[0].as_int;
+    b += 16;
+  }
+
+  return (cur_mb->bmi + b - 4)->as_mv.first.as_int;
+}
+
+static int above_block_second_mv(const MODE_INFO *cur_mb, int b, int mi_stride) {
+  if (!(b >> 2)) {
+    /* On top edge, get from MB above us */
+    cur_mb -= mi_stride;
+
+    if (cur_mb->mbmi.mode != SPLITMV)
+      return cur_mb->mbmi.second_ref_frame ? cur_mb->mbmi.mv[1].as_int
+                                           : cur_mb->mbmi.mv[0].as_int;
+    b += 16;
+  }
+
+  return cur_mb->mbmi.second_ref_frame
+             ? (cur_mb->bmi + b - 4)->as_mv.second.as_int
+             : (cur_mb->bmi + b - 4)->as_mv.first.as_int;
+}
+
+static B_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) {
+  if (!(b & 3)) {
+    /* On L edge, get from MB to left of us */
+    --cur_mb;
+
+    if (cur_mb->mbmi.mode < I8X8_PRED) {
+      return pred_mode_conv(cur_mb->mbmi.mode);
+    } else if (cur_mb->mbmi.mode == I8X8_PRED) {
+      return pred_mode_conv((cur_mb->bmi + 3 + b)->as_mode.first);
+    } else if (cur_mb->mbmi.mode == B_PRED) {
+      return ((cur_mb->bmi + 3 + b)->as_mode.first);
+    } else {
+      return B_DC_PRED;
+    }
+  }
+  return (cur_mb->bmi + b - 1)->as_mode.first;
+}
+
+static B_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb,
+                                          int b, int mi_stride) {
+  if (!(b >> 2)) {
+    /* On top edge, get from MB above us */
+    cur_mb -= mi_stride;
+
+    if (cur_mb->mbmi.mode < I8X8_PRED) {
+      return pred_mode_conv(cur_mb->mbmi.mode);
+    } else if (cur_mb->mbmi.mode == I8X8_PRED) {
+      return pred_mode_conv((cur_mb->bmi + 12 + b)->as_mode.first);
+    } else if (cur_mb->mbmi.mode == B_PRED) {
+      return ((cur_mb->bmi + 12 + b)->as_mode.first);
+    } else {
+      return B_DC_PRED;
+    }
+  }
+
+  return (cur_mb->bmi + b - 4)->as_mode.first;
+}
+
+#endif  // __INC_FINDNEARMV_H
--- /dev/null
+++ b/vp9/common/generic/systemdependent.c
@@ -1,0 +1,87 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_rtcd.h"
+#include "vp9/common/subpixel.h"
+#include "vp9/common/loopfilter.h"
+#include "vp9/common/idct.h"
+#include "vp9/common/onyxc_int.h"
+
+extern void vp9_arch_x86_common_init(VP9_COMMON *ctx);
+extern void vp9_arch_arm_common_init(VP9_COMMON *ctx);
+
+void vp9_machine_specific_config(VP9_COMMON *ctx) {
+#if CONFIG_RUNTIME_CPU_DETECT
+  VP9_COMMON_RTCD *rtcd = &ctx->rtcd;
+
+  rtcd->idct.idct1        = vp9_short_idct4x4llm_1_c;
+  rtcd->idct.idct16       = vp9_short_idct4x4llm_c;
+  rtcd->idct.idct1_scalar_add = vp9_dc_only_idct_add_c;
+  rtcd->idct.iwalsh1      = vp9_short_inv_walsh4x4_1_c;
+  rtcd->idct.iwalsh16     = vp9_short_inv_walsh4x4_c;
+  rtcd->idct.idct8        = vp9_short_idct8x8_c;
+  rtcd->idct.idct1_scalar_add_8x8 = vp9_dc_only_idct_add_8x8_c;
+  rtcd->idct.ihaar2       = vp9_short_ihaar2x2_c;
+  rtcd->idct.idct16x16    = vp9_short_idct16x16_c;
+
+  rtcd->subpix.eighttap16x16       = vp9_eighttap_predict16x16_c;
+  rtcd->subpix.eighttap8x8         = vp9_eighttap_predict8x8_c;
+  rtcd->subpix.eighttap_avg16x16   = vp9_eighttap_predict_avg16x16_c;
+  rtcd->subpix.eighttap_avg8x8     = vp9_eighttap_predict_avg8x8_c;
+  rtcd->subpix.eighttap_avg4x4     = vp9_eighttap_predict_avg4x4_c;
+  rtcd->subpix.eighttap8x4         = vp9_eighttap_predict8x4_c;
+  rtcd->subpix.eighttap4x4         = vp9_eighttap_predict_c;
+  rtcd->subpix.eighttap16x16_sharp     = vp9_eighttap_predict16x16_sharp_c;
+  rtcd->subpix.eighttap8x8_sharp       = vp9_eighttap_predict8x8_sharp_c;
+  rtcd->subpix.eighttap_avg16x16_sharp = vp9_eighttap_predict_avg16x16_sharp_c;
+  rtcd->subpix.eighttap_avg8x8_sharp   = vp9_eighttap_predict_avg8x8_sharp_c;
+  rtcd->subpix.eighttap_avg4x4_sharp   = vp9_eighttap_predict_avg4x4_sharp_c;
+  rtcd->subpix.eighttap8x4_sharp       = vp9_eighttap_predict8x4_sharp_c;
+  rtcd->subpix.eighttap4x4_sharp       = vp9_eighttap_predict_sharp_c;
+
+  rtcd->subpix.sixtap16x16       = vp9_sixtap_predict16x16_c;
+  rtcd->subpix.sixtap8x8         = vp9_sixtap_predict8x8_c;
+  rtcd->subpix.sixtap_avg16x16   = vp9_sixtap_predict_avg16x16_c;
+  rtcd->subpix.sixtap_avg8x8     = vp9_sixtap_predict_avg8x8_c;
+  rtcd->subpix.sixtap8x4         = vp9_sixtap_predict8x4_c;
+  rtcd->subpix.sixtap4x4         = vp9_sixtap_predict_c;
+  rtcd->subpix.sixtap_avg4x4     = vp9_sixtap_predict_avg_c;
+  rtcd->subpix.bilinear16x16     = vp9_bilinear_predict16x16_c;
+  rtcd->subpix.bilinear8x8       = vp9_bilinear_predict8x8_c;
+  rtcd->subpix.bilinear_avg16x16 = vp9_bilinear_predict_avg16x16_c;
+  rtcd->subpix.bilinear_avg8x8   = vp9_bilinear_predict_avg8x8_c;
+  rtcd->subpix.bilinear8x4       = vp9_bilinear_predict8x4_c;
+  rtcd->subpix.bilinear4x4       = vp9_bilinear_predict4x4_c;
+  rtcd->subpix.bilinear_avg4x4   = vp9_bilinear_predict_avg4x4_c;
+
+#if CONFIG_POSTPROC || (CONFIG_VP9_ENCODER && CONFIG_INTERNAL_STATS)
+  rtcd->postproc.down             = vp9_mbpost_proc_down_c;
+  rtcd->postproc.across           = vp9_mbpost_proc_across_ip_c;
+  rtcd->postproc.downacross       = vp9_post_proc_down_and_across_c;
+  rtcd->postproc.addnoise         = vp9_plane_add_noise_c;
+  rtcd->postproc.blend_mb_inner   = vp9_blend_mb_inner_c;
+  rtcd->postproc.blend_mb_outer   = vp9_blend_mb_outer_c;
+  rtcd->postproc.blend_b          = vp9_blend_b_c;
+#endif
+
+#endif
+
+#if ARCH_X86 || ARCH_X86_64
+  vp9_arch_x86_common_init(ctx);
+#endif
+
+#if ARCH_ARM
+  vp9_arch_arm_common_init(ctx);
+#endif
+
+  vpx_rtcd();
+}
--- /dev/null
+++ b/vp9/common/header.h
@@ -1,0 +1,42 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_HEADER_H
+#define __INC_HEADER_H
+
+/* 24 bits total */
+typedef struct {
+  unsigned int type: 1;
+  unsigned int version: 3;
+  unsigned int show_frame: 1;
+
+  /* Allow 2^20 bytes = 8 megabits for first partition */
+
+  unsigned int first_partition_length_in_bytes: 19;
+
+#ifdef PACKET_TESTING
+  unsigned int frame_number;
+  unsigned int update_gold: 1;
+  unsigned int uses_gold: 1;
+  unsigned int update_last: 1;
+  unsigned int uses_last: 1;
+#endif
+
+} VP9_HEADER;
+
+#ifdef PACKET_TESTING
+#define VP9_HEADER_SIZE 8
+#else
+#define VP9_HEADER_SIZE 3
+#endif
+
+
+#endif
--- /dev/null
+++ b/vp9/common/idct.h
@@ -1,0 +1,144 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_IDCT_H
+#define __INC_IDCT_H
+
+#include "vp9/common/blockd.h"
+
+#define prototype_second_order(sym) \
+  void sym(short *input, short *output)
+
+#define prototype_idct(sym) \
+  void sym(short *input, short *output, int pitch)
+
+#define prototype_idct_scalar_add(sym) \
+  void sym(short input, \
+           unsigned char *pred, unsigned char *output, \
+           int pitch, int stride)
+
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/idct_x86.h"
+#endif
+
+#ifdef _MSC_VER
+/* TODO: remove these after integer implementations are done */
+#define M_PI       3.14159265358979323846
+#define round(x) (((x)>0)? floor((x)+0.5): ceil((x)-0.5))
+#endif
+
+
+#if ARCH_ARM
+#include "arm/idct_arm.h"
+#endif
+
+#if CONFIG_LOSSLESS
+#define WHT_UPSCALE_FACTOR 3
+#define Y2_WHT_UPSCALE_FACTOR 2
+#endif
+
+#ifndef vp9_idct_idct16x16
+#define vp9_idct_idct16x16 vp9_short_idct16x16_c
+#endif
+extern prototype_idct(vp9_idct_idct16x16);
+
+#ifndef vp9_idct_idct8
+#define vp9_idct_idct8 vp9_short_idct8x8_c
+#endif
+extern prototype_idct(vp9_idct_idct8);
+
+#ifndef vp9_idct_idct8_1
+#define vp9_idct_idct8_1 vp9_short_idct8x8_1_c
+#endif
+extern prototype_idct(vp9_idct_idct8_1);
+
+#ifndef vp9_idct_ihaar2
+#define vp9_idct_ihaar2 vp9_short_ihaar2x2_c
+#endif
+extern prototype_idct(vp9_idct_ihaar2);
+
+#ifndef vp9_idct_ihaar2_1
+#define vp9_idct_ihaar2_1 vp9_short_ihaar2x2_1_c
+#endif
+extern prototype_idct(vp9_idct_ihaar2_1);
+
+#ifndef vp9_idct_idct1_scalar_add_8x8
+#define vp9_idct_idct1_scalar_add_8x8 vp9_dc_only_idct_add_8x8_c
+#endif
+extern prototype_idct_scalar_add(vp9_idct_idct1_scalar_add_8x8);
+
+#ifndef vp9_idct_idct1
+#define vp9_idct_idct1 vp9_short_idct4x4llm_1_c
+#endif
+extern prototype_idct(vp9_idct_idct1);
+
+#ifndef vp9_idct_idct16
+#define vp9_idct_idct16 vp9_short_idct4x4llm_c
+#endif
+extern prototype_idct(vp9_idct_idct16);
+
+#ifndef vp9_idct_idct1_scalar_add
+#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_c
+#endif
+extern prototype_idct_scalar_add(vp9_idct_idct1_scalar_add);
+
+
+#ifndef vp9_idct_iwalsh1
+#define vp9_idct_iwalsh1 vp9_short_inv_walsh4x4_1_c
+#endif
+extern prototype_second_order(vp9_idct_iwalsh1);
+
+#ifndef vp9_idct_iwalsh16
+#define vp9_idct_iwalsh16 vp9_short_inv_walsh4x4_c
+#endif
+extern prototype_second_order(vp9_idct_iwalsh16);
+
+#if CONFIG_LOSSLESS
+extern prototype_idct(vp9_short_inv_walsh4x4_x8_c);
+extern prototype_idct(vp9_short_inv_walsh4x4_1_x8_c);
+extern prototype_idct_scalar_add(vp9_dc_only_inv_walsh_add_c);
+extern prototype_second_order(vp9_short_inv_walsh4x4_lossless_c);
+extern prototype_second_order(vp9_short_inv_walsh4x4_1_lossless_c);
+#endif
+
+void vp9_ihtllm_c(const int16_t *input, int16_t *output, int pitch,
+                  TX_TYPE tx_type, int tx_dim);
+
+typedef prototype_idct((*vp9_idct_fn_t));
+typedef prototype_idct_scalar_add((*vp9_idct_scalar_add_fn_t));
+typedef prototype_second_order((*vp9_second_order_fn_t));
+
+typedef struct {
+  vp9_idct_fn_t            idct1;
+  vp9_idct_fn_t            idct16;
+  vp9_idct_scalar_add_fn_t idct1_scalar_add;
+
+  vp9_second_order_fn_t iwalsh1;
+  vp9_second_order_fn_t iwalsh16;
+
+  vp9_idct_fn_t            idct8;
+  vp9_idct_fn_t            idct8_1;
+  vp9_idct_scalar_add_fn_t idct1_scalar_add_8x8;
+  vp9_idct_fn_t ihaar2;
+  vp9_idct_fn_t ihaar2_1;
+
+  vp9_idct_fn_t            idct16x16;
+} vp9_idct_rtcd_vtable_t;
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define IDCT_INVOKE(ctx,fn) (ctx)->fn
+#else
+#define IDCT_INVOKE(ctx,fn) vp9_idct_##fn
+#endif
+
+#endif
--- /dev/null
+++ b/vp9/common/idctllm.c
@@ -1,0 +1,1275 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+ * Notes:
+ *
+ * This implementation makes use of 16-bit fixed-point versions of two
+ * multiply constants:
+ *         1.   sqrt(2) * cos (pi/8)
+ *         2.   sqrt(2) * sin (pi/8)
+ * Because the first constant is bigger than 1, to maintain the same 16-bit
+ * fixed-point precision as the second one, we use a trick of
+ *         x * a = x + x*(a-1)
+ * so
+ *         x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
+ **************************************************************************/
+#include <assert.h>
+#include <math.h>
+#include "vpx_ports/config.h"
+#include "vp9/common/idct.h"
+#include "vp9/common/systemdependent.h"
+
+#include "vp9/common/blockd.h"
+
+static const int cospi8sqrt2minus1 = 20091;
+static const int sinpi8sqrt2      = 35468;
+static const int rounding = 0;
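+
+/* Worked example of the fixed-point trick above:
+ * 20091 / 65536 ~= sqrt(2)*cos(pi/8) - 1 and 35468 / 65536 ~= sqrt(2)*sin(pi/8),
+ * so the butterflies in vp9_short_idct4x4llm_c() below compute
+ * x * sqrt(2) * cos(pi/8) as x + ((x * 20091) >> 16), and
+ * x * sqrt(2) * sin(pi/8) as (x * 35468) >> 16. */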
+
+// TODO: these transforms can be further converted into integer forms to
+//       reduce complexity
+static const float idct_4[16] = {
+  0.500000000000000,   0.653281482438188,   0.500000000000000,   0.270598050073099,
+  0.500000000000000,   0.270598050073099,  -0.500000000000000,  -0.653281482438188,
+  0.500000000000000,  -0.270598050073099,  -0.500000000000000,   0.653281482438188,
+  0.500000000000000,  -0.653281482438188,   0.500000000000000,  -0.270598050073099
+};
+
+static const float iadst_4[16] = {
+  0.228013428883779,   0.577350269189626,   0.656538502008139,   0.428525073124360,
+  0.428525073124360,   0.577350269189626,  -0.228013428883779,  -0.656538502008139,
+  0.577350269189626,                   0,  -0.577350269189626,   0.577350269189626,
+  0.656538502008139,  -0.577350269189626,   0.428525073124359,  -0.228013428883779
+};
+
+static const float idct_8[64] = {
+  0.353553390593274,   0.490392640201615,   0.461939766255643,   0.415734806151273,
+  0.353553390593274,   0.277785116509801,   0.191341716182545,   0.097545161008064,
+  0.353553390593274,   0.415734806151273,   0.191341716182545,  -0.097545161008064,
+ -0.353553390593274,  -0.490392640201615,  -0.461939766255643,  -0.277785116509801,
+  0.353553390593274,   0.277785116509801,  -0.191341716182545,  -0.490392640201615,
+ -0.353553390593274,   0.097545161008064,   0.461939766255643,   0.415734806151273,
+  0.353553390593274,   0.097545161008064,  -0.461939766255643,  -0.277785116509801,
+  0.353553390593274,   0.415734806151273,  -0.191341716182545,  -0.490392640201615,
+  0.353553390593274,  -0.097545161008064,  -0.461939766255643,   0.277785116509801,
+  0.353553390593274,  -0.415734806151273,  -0.191341716182545,   0.490392640201615,
+  0.353553390593274,  -0.277785116509801,  -0.191341716182545,   0.490392640201615,
+ -0.353553390593274,  -0.097545161008064,   0.461939766255643,  -0.415734806151273,
+  0.353553390593274,  -0.415734806151273,   0.191341716182545,   0.097545161008064,
+ -0.353553390593274,   0.490392640201615,  -0.461939766255643,   0.277785116509801,
+  0.353553390593274,  -0.490392640201615,   0.461939766255643,  -0.415734806151273,
+  0.353553390593274,  -0.277785116509801,   0.191341716182545,  -0.097545161008064
+};
+
+static const float iadst_8[64] = {
+  0.089131608307533,   0.255357107325376,   0.387095214016349,   0.466553967085785,
+  0.483002021635509,   0.434217976756762,   0.326790388032145,   0.175227946595735,
+  0.175227946595735,   0.434217976756762,   0.466553967085785,   0.255357107325376,
+ -0.089131608307533,  -0.387095214016348,  -0.483002021635509,  -0.326790388032145,
+  0.255357107325376,   0.483002021635509,   0.175227946595735,  -0.326790388032145,
+ -0.466553967085785,  -0.089131608307533,   0.387095214016349,   0.434217976756762,
+  0.326790388032145,   0.387095214016349,  -0.255357107325376,  -0.434217976756762,
+  0.175227946595735,   0.466553967085786,  -0.089131608307534,  -0.483002021635509,
+  0.387095214016349,   0.175227946595735,  -0.483002021635509,   0.089131608307533,
+  0.434217976756762,  -0.326790388032145,  -0.255357107325377,   0.466553967085785,
+  0.434217976756762,  -0.089131608307533,  -0.326790388032145,   0.483002021635509,
+ -0.255357107325376,  -0.175227946595735,   0.466553967085785,  -0.387095214016348,
+  0.466553967085785,  -0.326790388032145,   0.089131608307533,   0.175227946595735,
+ -0.387095214016348,   0.483002021635509,  -0.434217976756762,   0.255357107325376,
+  0.483002021635509,  -0.466553967085785,   0.434217976756762,  -0.387095214016348,
+  0.326790388032145,  -0.255357107325375,   0.175227946595736,  -0.089131608307532
+};
+
+static const int16_t idct_i4[16] = {
+  8192,  10703,  8192,   4433,
+  8192,   4433, -8192, -10703,
+  8192,  -4433, -8192,  10703,
+  8192, -10703,  8192,  -4433
+};
+
+static const int16_t iadst_i4[16] = {
+   3736,  9459, 10757,   7021,
+   7021,  9459, -3736, -10757,
+   9459,     0, -9459,   9459,
+  10757, -9459,  7021,  -3736
+};
+
+static const int16_t idct_i8[64] = {
+   5793,  8035,  7568,  6811,
+   5793,  4551,  3135,  1598,
+   5793,  6811,  3135, -1598,
+  -5793, -8035, -7568, -4551,
+   5793,  4551, -3135, -8035,
+  -5793,  1598,  7568,  6811,
+   5793,  1598, -7568, -4551,
+   5793,  6811, -3135, -8035,
+   5793, -1598, -7568,  4551,
+   5793, -6811, -3135,  8035,
+   5793, -4551, -3135,  8035,
+  -5793, -1598,  7568, -6811,
+   5793, -6811,  3135,  1598,
+  -5793,  8035, -7568,  4551,
+   5793, -8035,  7568, -6811,
+   5793, -4551,  3135, -1598
+};
+
+static const int16_t iadst_i8[64] = {
+   1460,  4184,  6342,  7644,
+   7914,  7114,  5354,  2871,
+   2871,  7114,  7644,  4184,
+  -1460, -6342, -7914, -5354,
+   4184,  7914,  2871, -5354,
+  -7644, -1460,  6342,  7114,
+   5354,  6342, -4184, -7114,
+   2871,  7644, -1460, -7914,
+   6342,  2871, -7914,  1460,
+   7114, -5354, -4184,  7644,
+   7114, -1460, -5354,  7914,
+  -4184, -2871,  7644, -6342,
+   7644, -5354,  1460,  2871,
+  -6342,  7914, -7114,  4184,
+   7914, -7644,  7114, -6342,
+   5354, -4184,  2871, -1460
+};
+
+static float idct_16[256] = {
+  0.250000,  0.351851,  0.346760,  0.338330,  0.326641,  0.311806,  0.293969,  0.273300,
+  0.250000,  0.224292,  0.196424,  0.166664,  0.135299,  0.102631,  0.068975,  0.034654,
+  0.250000,  0.338330,  0.293969,  0.224292,  0.135299,  0.034654, -0.068975, -0.166664,
+ -0.250000, -0.311806, -0.346760, -0.351851, -0.326641, -0.273300, -0.196424, -0.102631,
+  0.250000,  0.311806,  0.196424,  0.034654, -0.135299, -0.273300, -0.346760, -0.338330,
+ -0.250000, -0.102631,  0.068975,  0.224292,  0.326641,  0.351851,  0.293969,  0.166664,
+  0.250000,  0.273300,  0.068975, -0.166664, -0.326641, -0.338330, -0.196424,  0.034654,
+  0.250000,  0.351851,  0.293969,  0.102631, -0.135299, -0.311806, -0.346760, -0.224292,
+  0.250000,  0.224292, -0.068975, -0.311806, -0.326641, -0.102631,  0.196424,  0.351851,
+  0.250000, -0.034654, -0.293969, -0.338330, -0.135299,  0.166664,  0.346760,  0.273300,
+  0.250000,  0.166664, -0.196424, -0.351851, -0.135299,  0.224292,  0.346760,  0.102631,
+ -0.250000, -0.338330, -0.068975,  0.273300,  0.326641,  0.034654, -0.293969, -0.311806,
+  0.250000,  0.102631, -0.293969, -0.273300,  0.135299,  0.351851,  0.068975, -0.311806,
+ -0.250000,  0.166664,  0.346760,  0.034654, -0.326641, -0.224292,  0.196424,  0.338330,
+  0.250000,  0.034654, -0.346760, -0.102631,  0.326641,  0.166664, -0.293969, -0.224292,
+  0.250000,  0.273300, -0.196424, -0.311806,  0.135299,  0.338330, -0.068975, -0.351851,
+  0.250000, -0.034654, -0.346760,  0.102631,  0.326641, -0.166664, -0.293969,  0.224292,
+  0.250000, -0.273300, -0.196424,  0.311806,  0.135299, -0.338330, -0.068975,  0.351851,
+  0.250000, -0.102631, -0.293969,  0.273300,  0.135299, -0.351851,  0.068975,  0.311806,
+ -0.250000, -0.166664,  0.346760, -0.034654, -0.326641,  0.224292,  0.196424, -0.338330,
+  0.250000, -0.166664, -0.196424,  0.351851, -0.135299, -0.224292,  0.346760, -0.102631,
+ -0.250000,  0.338330, -0.068975, -0.273300,  0.326641, -0.034654, -0.293969,  0.311806,
+  0.250000, -0.224292, -0.068975,  0.311806, -0.326641,  0.102631,  0.196424, -0.351851,
+  0.250000,  0.034654, -0.293969,  0.338330, -0.135299, -0.166664,  0.346760, -0.273300,
+  0.250000, -0.273300,  0.068975,  0.166664, -0.326641,  0.338330, -0.196424, -0.034654,
+  0.250000, -0.351851,  0.293969, -0.102631, -0.135299,  0.311806, -0.346760,  0.224292,
+  0.250000, -0.311806,  0.196424, -0.034654, -0.135299,  0.273300, -0.346760,  0.338330,
+ -0.250000,  0.102631,  0.068975, -0.224292,  0.326641, -0.351851,  0.293969, -0.166664,
+  0.250000, -0.338330,  0.293969, -0.224292,  0.135299, -0.034654, -0.068975,  0.166664,
+ -0.250000,  0.311806, -0.346760,  0.351851, -0.326641,  0.273300, -0.196424,  0.102631,
+  0.250000, -0.351851,  0.346760, -0.338330,  0.326641, -0.311806,  0.293969, -0.273300,
+  0.250000, -0.224292,  0.196424, -0.166664,  0.135299, -0.102631,  0.068975, -0.034654
+};
+
+static float iadst_16[256] = {
+  0.033094,  0.098087,  0.159534,  0.215215,  0.263118,  0.301511,  0.329007,  0.344612,
+  0.347761,  0.338341,  0.316693,  0.283599,  0.240255,  0.188227,  0.129396,  0.065889,
+  0.065889,  0.188227,  0.283599,  0.338341,  0.344612,  0.301511,  0.215215,  0.098087,
+ -0.033094, -0.159534, -0.263118, -0.329007, -0.347761, -0.316693, -0.240255, -0.129396,
+  0.098087,  0.263118,  0.344612,  0.316693,  0.188227,  0.000000, -0.188227, -0.316693,
+ -0.344612, -0.263118, -0.098087,  0.098087,  0.263118,  0.344612,  0.316693,  0.188227,
+  0.129396,  0.316693,  0.329007,  0.159534, -0.098087, -0.301511, -0.338341, -0.188227,
+  0.065889,  0.283599,  0.344612,  0.215215, -0.033094, -0.263118, -0.347761, -0.240255,
+  0.159534,  0.344612,  0.240255, -0.065889, -0.316693, -0.301511, -0.033094,  0.263118,
+  0.338341,  0.129396, -0.188227, -0.347761, -0.215215,  0.098087,  0.329007,  0.283599,
+  0.188227,  0.344612,  0.098087, -0.263118, -0.316693, -0.000000,  0.316693,  0.263118,
+ -0.098087, -0.344612, -0.188227,  0.188227,  0.344612,  0.098087, -0.263118, -0.316693,
+  0.215215,  0.316693, -0.065889, -0.347761, -0.098087,  0.301511,  0.240255, -0.188227,
+ -0.329007,  0.033094,  0.344612,  0.129396, -0.283599, -0.263118,  0.159534,  0.338341,
+  0.240255,  0.263118, -0.215215, -0.283599,  0.188227,  0.301511, -0.159534, -0.316693,
+  0.129396,  0.329007, -0.098087, -0.338341,  0.065889,  0.344612, -0.033094, -0.347761,
+  0.263118,  0.188227, -0.316693, -0.098087,  0.344612,  0.000000, -0.344612,  0.098087,
+  0.316693, -0.188227, -0.263118,  0.263118,  0.188227, -0.316693, -0.098087,  0.344612,
+  0.283599,  0.098087, -0.347761,  0.129396,  0.263118, -0.301511, -0.065889,  0.344612,
+ -0.159534, -0.240255,  0.316693,  0.033094, -0.338341,  0.188227,  0.215215, -0.329007,
+  0.301511,  0.000000, -0.301511,  0.301511,  0.000000, -0.301511,  0.301511,  0.000000,
+ -0.301511,  0.301511,  0.000000, -0.301511,  0.301511,  0.000000, -0.301511,  0.301511,
+  0.316693, -0.098087, -0.188227,  0.344612, -0.263118, -0.000000,  0.263118, -0.344612,
+  0.188227,  0.098087, -0.316693,  0.316693, -0.098087, -0.188227,  0.344612, -0.263118,
+  0.329007, -0.188227, -0.033094,  0.240255, -0.344612,  0.301511, -0.129396, -0.098087,
+  0.283599, -0.347761,  0.263118, -0.065889, -0.159534,  0.316693, -0.338341,  0.215215,
+  0.338341, -0.263118,  0.129396,  0.033094, -0.188227,  0.301511, -0.347761,  0.316693,
+ -0.215215,  0.065889,  0.098087, -0.240255,  0.329007, -0.344612,  0.283599, -0.159534,
+  0.344612, -0.316693,  0.263118, -0.188227,  0.098087,  0.000000, -0.098087,  0.188227,
+ -0.263118,  0.316693, -0.344612,  0.344612, -0.316693,  0.263118, -0.188227,  0.098087,
+  0.347761, -0.344612,  0.338341, -0.329007,  0.316693, -0.301511,  0.283599, -0.263118,
+  0.240255, -0.215215,  0.188227, -0.159534,  0.129396, -0.098087,  0.065889, -0.033094
+};
+
+static const int16_t idct_i16[256] = {
+   4096,  5765,  5681,  5543,  5352,  5109,  4816,  4478,
+   4096,  3675,  3218,  2731,  2217,  1682,  1130,   568,
+   4096,  5543,  4816,  3675,  2217,   568, -1130, -2731,
+  -4096, -5109, -5681, -5765, -5352, -4478, -3218, -1682,
+   4096,  5109,  3218,   568, -2217, -4478, -5681, -5543,
+  -4096, -1682,  1130,  3675,  5352,  5765,  4816,  2731,
+   4096,  4478,  1130, -2731, -5352, -5543, -3218,   568,
+   4096,  5765,  4816,  1682, -2217, -5109, -5681, -3675,
+   4096,  3675, -1130, -5109, -5352, -1682,  3218,  5765,
+   4096,  -568, -4816, -5543, -2217,  2731,  5681,  4478,
+   4096,  2731, -3218, -5765, -2217,  3675,  5681,  1682,
+  -4096, -5543, -1130,  4478,  5352,   568, -4816, -5109,
+   4096,  1682, -4816, -4478,  2217,  5765,  1130, -5109,
+  -4096,  2731,  5681,   568, -5352, -3675,  3218,  5543,
+   4096,   568, -5681, -1682,  5352,  2731, -4816, -3675,
+   4096,  4478, -3218, -5109,  2217,  5543, -1130, -5765,
+   4096,  -568, -5681,  1682,  5352, -2731, -4816,  3675,
+   4096, -4478, -3218,  5109,  2217, -5543, -1130,  5765,
+   4096, -1682, -4816,  4478,  2217, -5765,  1130,  5109,
+  -4096, -2731,  5681,  -568, -5352,  3675,  3218, -5543,
+   4096, -2731, -3218,  5765, -2217, -3675,  5681, -1682,
+  -4096,  5543, -1130, -4478,  5352,  -568, -4816,  5109,
+   4096, -3675, -1130,  5109, -5352,  1682,  3218, -5765,
+   4096,   568, -4816,  5543, -2217, -2731,  5681, -4478,
+   4096, -4478,  1130,  2731, -5352,  5543, -3218,  -568,
+   4096, -5765,  4816, -1682, -2217,  5109, -5681,  3675,
+   4096, -5109,  3218,  -568, -2217,  4478, -5681,  5543,
+  -4096,  1682,  1130, -3675,  5352, -5765,  4816, -2731,
+   4096, -5543,  4816, -3675,  2217,  -568, -1130,  2731,
+  -4096,  5109, -5681,  5765, -5352,  4478, -3218,  1682,
+   4096, -5765,  5681, -5543,  5352, -5109,  4816, -4478,
+   4096, -3675,  3218, -2731,  2217, -1682,  1130,  -568
+};
+
+static const int16_t iadst_i16[256] = {
+    542,  1607,  2614,  3526,  4311,  4940,  5390,  5646,
+   5698,  5543,  5189,  4646,  3936,  3084,  2120,  1080,
+   1080,  3084,  4646,  5543,  5646,  4940,  3526,  1607,
+   -542, -2614, -4311, -5390, -5698, -5189, -3936, -2120,
+   1607,  4311,  5646,  5189,  3084,     0, -3084, -5189,
+  -5646, -4311, -1607,  1607,  4311,  5646,  5189,  3084,
+   2120,  5189,  5390,  2614, -1607, -4940, -5543, -3084,
+   1080,  4646,  5646,  3526, -542,  -4311, -5698, -3936,
+   2614,  5646,  3936, -1080, -5189, -4940,  -542,  4311,
+   5543,  2120, -3084, -5698, -3526,  1607,  5390,  4646,
+   3084,  5646,  1607, -4311, -5189,     0,  5189,  4311,
+  -1607, -5646, -3084,  3084,  5646,  1607, -4311, -5189,
+   3526,  5189, -1080, -5698, -1607,  4940,  3936, -3084,
+  -5390,   542,  5646,  2120, -4646, -4311,  2614,  5543,
+   3936,  4311, -3526, -4646,  3084,  4940, -2614, -5189,
+   2120,  5390, -1607, -5543,  1080,  5646,  -542, -5698,
+   4311,  3084, -5189, -1607,  5646,     0, -5646,  1607,
+   5189, -3084, -4311,  4311,  3084, -5189, -1607,  5646,
+   4646,  1607, -5698,  2120,  4311, -4940, -1080,  5646,
+  -2614, -3936,  5189,   542, -5543,  3084,  3526, -5390,
+   4940,     0, -4940,  4940,     0, -4940,  4940,     0,
+  -4940,  4940,     0, -4940,  4940,     0, -4940,  4940,
+   5189, -1607, -3084,  5646, -4311,     0,  4311, -5646,
+   3084,  1607, -5189,  5189, -1607, -3084,  5646, -4311,
+   5390, -3084,  -542,  3936, -5646,  4940, -2120, -1607,
+   4646, -5698,  4311, -1080, -2614,  5189, -5543,  3526,
+   5543, -4311,  2120,   542, -3084,  4940, -5698,  5189,
+  -3526,  1080,  1607, -3936,  5390, -5646,  4646, -2614,
+   5646, -5189,  4311, -3084,  1607,     0, -1607,  3084,
+  -4311,  5189, -5646,  5646, -5189,  4311, -3084,  1607,
+   5698, -5646,  5543, -5390,  5189, -4940,  4646, -4311,
+   3936, -3526,  3084, -2614,  2120, -1607,  1080,  -542
+};
+
+/* For testing: select which implementation below is built as vp9_ihtllm_c. */
+#define TEST_INT 1
+#if TEST_INT
+#define vp9_ihtllm_int_c vp9_ihtllm_c
+#else
+#define vp9_ihtllm_float_c vp9_ihtllm_c
+#endif
+
+void vp9_ihtllm_float_c(const int16_t *input, int16_t *output, int pitch,
+                  TX_TYPE tx_type, int tx_dim) {
+  vp9_clear_system_state();  // Make it simd safe : __asm emms;
+  {
+    int i, j, k;
+    float bufa[256], bufb[256];  // buffers for floating-point test purposes;
+                                 // the implementation could be simplified in
+                                 // conjunction with the integer transform
+    const int16_t *ip = input;
+    int16_t *op = output;
+    int shortpitch = pitch >> 1;
+
+    float *pfa = &bufa[0];
+    float *pfb = &bufb[0];
+
+    // pointers to vertical and horizontal transforms
+    const float *ptv, *pth;
+
+    assert(tx_type != DCT_DCT);
+    // load and convert residual array into floating-point
+    for (j = 0; j < tx_dim; j++) {
+      for (i = 0; i < tx_dim; i++) {
+        pfa[i] = (float)ip[i];
+      }
+      pfa += tx_dim;
+      ip  += tx_dim;
+    }
+
+    // vertical transformation
+    pfa = &bufa[0];
+    pfb = &bufb[0];
+
+    switch (tx_type) {
+      case ADST_ADST :
+      case ADST_DCT  :
+        ptv = (tx_dim == 4) ? &iadst_4[0] :
+                              ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);
+        break;
+
+      default :
+        ptv = (tx_dim == 4) ? &idct_4[0] :
+                              ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
+        break;
+    }
+
+    for (j = 0; j < tx_dim; j++) {
+      for (i = 0; i < tx_dim; i++) {
+        pfb[i] = 0;
+        for (k = 0; k < tx_dim; k++) {
+          pfb[i] += ptv[k] * pfa[(k * tx_dim)];
+        }
+        pfa += 1;
+      }
+
+      pfb += tx_dim;
+      ptv += tx_dim;
+      pfa = &bufa[0];
+    }
+
+    // horizontal transformation
+    pfa = &bufa[0];
+    pfb = &bufb[0];
+
+    switch (tx_type) {
+      case ADST_ADST :
+      case  DCT_ADST :
+        pth = (tx_dim == 4) ? &iadst_4[0] :
+                              ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);
+        break;
+
+      default :
+        pth = (tx_dim == 4) ? &idct_4[0] :
+                              ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
+        break;
+    }
+
+    for (j = 0; j < tx_dim; j++) {
+      for (i = 0; i < tx_dim; i++) {
+        pfa[i] = 0;
+        for (k = 0; k < tx_dim; k++) {
+          pfa[i] += pfb[k] * pth[k];
+        }
+        pth += tx_dim;
+      }
+
+      pfa += tx_dim;
+      pfb += tx_dim;
+
+      switch (tx_type) {
+        case ADST_ADST :
+        case  DCT_ADST :
+          pth = (tx_dim == 4) ? &iadst_4[0] :
+                                ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);
+          break;
+
+        default :
+          pth = (tx_dim == 4) ? &idct_4[0] :
+                                ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
+          break;
+      }
+    }
+
+    // convert to short integer format and load BLOCKD buffer
+    op  = output;
+    pfa = &bufa[0];
+
+    for (j = 0; j < tx_dim; j++) {
+      for (i = 0; i < tx_dim; i++) {
+        op[i] = (pfa[i] > 0) ? (int16_t)(pfa[i] / 8 + 0.49)
+                             : -(int16_t)(-pfa[i] / 8 + 0.49);
+      }
+
+      op += shortpitch;
+      pfa += tx_dim;
+    }
+  }
+  vp9_clear_system_state(); // Make it simd safe : __asm emms;
+}
+
+/* Converted the transforms to integer form. */
+#define VERTICAL_SHIFT 14  // 16
+#define VERTICAL_ROUNDING ((1 << (VERTICAL_SHIFT - 1)) - 1)
+#define HORIZONTAL_SHIFT 17  // 15
+#define HORIZONTAL_ROUNDING ((1 << (HORIZONTAL_SHIFT - 1)) - 1)
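+/* Note that the rounding offsets are (1 << (shift - 1)) - 1, one below a
+ * conventional round-to-nearest offset, so exact ties round downwards. */
+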
+void vp9_ihtllm_int_c(const int16_t *input, int16_t *output, int pitch,
+                      TX_TYPE tx_type, int tx_dim) {
+  int i, j, k;
+  int16_t imbuf[256];
+
+  const int16_t *ip = input;
+  int16_t *op = output;
+  int16_t *im = &imbuf[0];
+
+  /* pointers to vertical and horizontal transforms. */
+  const int16_t *ptv = NULL, *pth = NULL;
+  int shortpitch = pitch >> 1;
+
+  switch (tx_type) {
+    case ADST_ADST :
+      ptv = pth = (tx_dim == 4) ? &iadst_i4[0]
+                                  : ((tx_dim == 8) ? &iadst_i8[0]
+                                                     : &iadst_i16[0]);
+      break;
+    case ADST_DCT  :
+      ptv = (tx_dim == 4) ? &iadst_i4[0]
+                            : ((tx_dim == 8) ? &iadst_i8[0] : &iadst_i16[0]);
+      pth = (tx_dim == 4) ? &idct_i4[0]
+                            : ((tx_dim == 8) ? &idct_i8[0] : &idct_i16[0]);
+      break;
+    case  DCT_ADST :
+      ptv = (tx_dim == 4) ? &idct_i4[0]
+                            : ((tx_dim == 8) ? &idct_i8[0] : &idct_i16[0]);
+      pth = (tx_dim == 4) ? &iadst_i4[0]
+                            : ((tx_dim == 8) ? &iadst_i8[0] : &iadst_i16[0]);
+      break;
+    case  DCT_DCT :
+      ptv = pth = (tx_dim == 4) ? &idct_i4[0]
+                                  : ((tx_dim == 8) ? &idct_i8[0]
+                                                     : &idct_i16[0]);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+
+  /* vertical transformation */
+  for (j = 0; j < tx_dim; j++) {
+    for (i = 0; i < tx_dim; i++) {
+      int temp = 0;
+
+      for (k = 0; k < tx_dim; k++) {
+        temp += ptv[k] * ip[(k * tx_dim)];
+      }
+
+      im[i] = (int16_t)((temp + VERTICAL_ROUNDING) >> VERTICAL_SHIFT);
+      ip++;
+    }
+    im += tx_dim;  // 16
+    ptv += tx_dim;
+    ip = input;
+  }
+
+  /* horizontal transformation */
+  im = &imbuf[0];
+
+  for (j = 0; j < tx_dim; j++) {
+    const int16_t *pthc = pth;
+
+    for (i = 0; i < tx_dim; i++) {
+      int temp = 0;
+
+      for (k = 0; k < tx_dim; k++) {
+        temp += im[k] * pthc[k];
+      }
+
+      op[i] = (int16_t)((temp + HORIZONTAL_ROUNDING) >> HORIZONTAL_SHIFT);
+      pthc += tx_dim;
+    }
+
+    im += tx_dim;  // 16
+    op += shortpitch;
+  }
+}
+
+void vp9_short_idct4x4llm_c(short *input, short *output, int pitch) {
+  int i;
+  int a1, b1, c1, d1;
+
+  short *ip = input;
+  short *op = output;
+  int temp1, temp2;
+  int shortpitch = pitch >> 1;
+
+  for (i = 0; i < 4; i++) {
+    a1 = ip[0] + ip[8];
+    b1 = ip[0] - ip[8];
+
+    temp1 = (ip[4] * sinpi8sqrt2 + rounding) >> 16;
+    temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1 + rounding) >> 16);
+    c1 = temp1 - temp2;
+
+    temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1 + rounding) >> 16);
+    temp2 = (ip[12] * sinpi8sqrt2 + rounding) >> 16;
+    d1 = temp1 + temp2;
+
+    op[shortpitch * 0] = a1 + d1;
+    op[shortpitch * 3] = a1 - d1;
+
+    op[shortpitch * 1] = b1 + c1;
+    op[shortpitch * 2] = b1 - c1;
+
+    ip++;
+    op++;
+  }
+
+  ip = output;
+  op = output;
+
+  for (i = 0; i < 4; i++) {
+    a1 = ip[0] + ip[2];
+    b1 = ip[0] - ip[2];
+
+    temp1 = (ip[1] * sinpi8sqrt2 + rounding) >> 16;
+    temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1 + rounding) >> 16);
+    c1 = temp1 - temp2;
+
+    temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1 + rounding) >> 16);
+    temp2 = (ip[3] * sinpi8sqrt2 + rounding) >> 16;
+    d1 = temp1 + temp2;
+
+    op[0] = (a1 + d1 + 16) >> 5;
+    op[3] = (a1 - d1 + 16) >> 5;
+
+    op[1] = (b1 + c1 + 16) >> 5;
+    op[2] = (b1 - c1 + 16) >> 5;
+
+    ip += shortpitch;
+    op += shortpitch;
+  }
+}
+
+void vp9_short_idct4x4llm_1_c(short *input, short *output, int pitch) {
+  int i;
+  int a1;
+  short *op = output;
+  int shortpitch = pitch >> 1;
+  a1 = ((input[0] + 16) >> 5);
+  for (i = 0; i < 4; i++) {
+    op[0] = a1;
+    op[1] = a1;
+    op[2] = a1;
+    op[3] = a1;
+    op += shortpitch;
+  }
+}
+
+void vp9_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
+                            unsigned char *dst_ptr, int pitch, int stride) {
+  int a1 = ((input_dc + 16) >> 5);
+  int r, c;
+
+  for (r = 0; r < 4; r++) {
+    for (c = 0; c < 4; c++) {
+      int a = a1 + pred_ptr[c];
+
+      if (a < 0)
+        a = 0;
+
+      if (a > 255)
+        a = 255;
+
+      dst_ptr[c] = (unsigned char) a;
+    }
+
+    dst_ptr += stride;
+    pred_ptr += pitch;
+  }
+}
+
+void vp9_short_inv_walsh4x4_c(short *input, short *output) {
+  int i;
+  int a1, b1, c1, d1;
+  short *ip = input;
+  short *op = output;
+
+  for (i = 0; i < 4; i++) {
+    a1 = ((ip[0] + ip[3]));
+    b1 = ((ip[1] + ip[2]));
+    c1 = ((ip[1] - ip[2]));
+    d1 = ((ip[0] - ip[3]));
+
+    op[0] = (a1 + b1 + 1) >> 1;
+    op[1] = (c1 + d1) >> 1;
+    op[2] = (a1 - b1) >> 1;
+    op[3] = (d1 - c1) >> 1;
+
+    ip += 4;
+    op += 4;
+  }
+
+  ip = output;
+  op = output;
+  for (i = 0; i < 4; i++) {
+    a1 = ip[0] + ip[12];
+    b1 = ip[4] + ip[8];
+    c1 = ip[4] - ip[8];
+    d1 = ip[0] - ip[12];
+    op[0] = (a1 + b1 + 1) >> 1;
+    op[4] = (c1 + d1) >> 1;
+    op[8] = (a1 - b1) >> 1;
+    op[12] = (d1 - c1) >> 1;
+    ip++;
+    op++;
+  }
+}
+
+void vp9_short_inv_walsh4x4_1_c(short *in, short *out) {
+  int i;
+  short tmp[4];
+  short *ip = in;
+  short *op = tmp;
+
+  op[0] = (ip[0] + 1) >> 1;
+  op[1] = op[2] = op[3] = (ip[0] >> 1);
+
+  ip = tmp;
+  op = out;
+  for (i = 0; i < 4; i++) {
+    op[0] = (ip[0] + 1) >> 1;
+    op[4] = op[8] = op[12] = (ip[0] >> 1);
+    ip++;
+    op++;
+  }
+}
+
+#if CONFIG_LOSSLESS
+void vp9_short_inv_walsh4x4_lossless_c(short *input, short *output) {
+  int i;
+  int a1, b1, c1, d1;
+  short *ip = input;
+  short *op = output;
+
+  for (i = 0; i < 4; i++) {
+    a1 = ((ip[0] + ip[3])) >> Y2_WHT_UPSCALE_FACTOR;
+    b1 = ((ip[1] + ip[2])) >> Y2_WHT_UPSCALE_FACTOR;
+    c1 = ((ip[1] - ip[2])) >> Y2_WHT_UPSCALE_FACTOR;
+    d1 = ((ip[0] - ip[3])) >> Y2_WHT_UPSCALE_FACTOR;
+
+    op[0] = (a1 + b1 + 1) >> 1;
+    op[1] = (c1 + d1) >> 1;
+    op[2] = (a1 - b1) >> 1;
+    op[3] = (d1 - c1) >> 1;
+
+    ip += 4;
+    op += 4;
+  }
+
+  ip = output;
+  op = output;
+  for (i = 0; i < 4; i++) {
+    a1 = ip[0] + ip[12];
+    b1 = ip[4] + ip[8];
+    c1 = ip[4] - ip[8];
+    d1 = ip[0] - ip[12];
+
+
+    op[0] = ((a1 + b1 + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
+    op[4] = ((c1 + d1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
+    op[8] = ((a1 - b1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
+    op[12] = ((d1 - c1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
+
+    ip++;
+    op++;
+  }
+}
+
+void vp9_short_inv_walsh4x4_1_lossless_c(short *in, short *out) {
+  int i;
+  short tmp[4];
+  short *ip = in;
+  short *op = tmp;
+
+  op[0] = ((ip[0] >> Y2_WHT_UPSCALE_FACTOR) + 1) >> 1;
+  op[1] = op[2] = op[3] = ((ip[0] >> Y2_WHT_UPSCALE_FACTOR) >> 1);
+
+  ip = tmp;
+  op = out;
+  for (i = 0; i < 4; i++) {
+    op[0] = ((ip[0] + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
+    op[4] = op[8] = op[12] = ((ip[0] >> 1)) << Y2_WHT_UPSCALE_FACTOR;
+    ip++;
+    op++;
+  }
+}
+
+void vp9_short_inv_walsh4x4_x8_c(short *input, short *output, int pitch) {
+  int i;
+  int a1, b1, c1, d1;
+  short *ip = input;
+  short *op = output;
+  int shortpitch = pitch >> 1;
+
+  for (i = 0; i < 4; i++) {
+    a1 = ((ip[0] + ip[3])) >> WHT_UPSCALE_FACTOR;
+    b1 = ((ip[1] + ip[2])) >> WHT_UPSCALE_FACTOR;
+    c1 = ((ip[1] - ip[2])) >> WHT_UPSCALE_FACTOR;
+    d1 = ((ip[0] - ip[3])) >> WHT_UPSCALE_FACTOR;
+
+    op[0] = (a1 + b1 + 1) >> 1;
+    op[1] = (c1 + d1) >> 1;
+    op[2] = (a1 - b1) >> 1;
+    op[3] = (d1 - c1) >> 1;
+
+    ip += 4;
+    op += shortpitch;
+  }
+
+  ip = output;
+  op = output;
+  for (i = 0; i < 4; i++) {
+    a1 = ip[shortpitch * 0] + ip[shortpitch * 3];
+    b1 = ip[shortpitch * 1] + ip[shortpitch * 2];
+    c1 = ip[shortpitch * 1] - ip[shortpitch * 2];
+    d1 = ip[shortpitch * 0] - ip[shortpitch * 3];
+
+
+    op[shortpitch * 0] = (a1 + b1 + 1) >> 1;
+    op[shortpitch * 1] = (c1 + d1) >> 1;
+    op[shortpitch * 2] = (a1 - b1) >> 1;
+    op[shortpitch * 3] = (d1 - c1) >> 1;
+
+    ip++;
+    op++;
+  }
+}
+
+void vp9_short_inv_walsh4x4_1_x8_c(short *in, short *out, int pitch) {
+  int i;
+  short tmp[4];
+  short *ip = in;
+  short *op = tmp;
+  int shortpitch = pitch >> 1;
+
+  op[0] = ((ip[0] >> WHT_UPSCALE_FACTOR) + 1) >> 1;
+  op[1] = op[2] = op[3] = ((ip[0] >> WHT_UPSCALE_FACTOR) >> 1);
+
+  ip = tmp;
+  op = out;
+  for (i = 0; i < 4; i++) {
+    op[shortpitch * 0] = (ip[0] + 1) >> 1;
+    op[shortpitch * 1] = op[shortpitch * 2] = op[shortpitch * 3] = ip[0] >> 1;
+    ip++;
+    op++;
+  }
+}
+
+void vp9_dc_only_inv_walsh_add_c(short input_dc, unsigned char *pred_ptr,
+                                 unsigned char *dst_ptr,
+                                 int pitch, int stride) {
+  int r, c;
+  short tmp[16];
+  vp9_short_inv_walsh4x4_1_x8_c(&input_dc, tmp, 4 << 1);
+
+  for (r = 0; r < 4; r++) {
+    for (c = 0; c < 4; c++) {
+      int a = tmp[r * 4 + c] + pred_ptr[c];
+      if (a < 0)
+        a = 0;
+
+      if (a > 255)
+        a = 255;
+
+      dst_ptr[c] = (unsigned char) a;
+    }
+
+    dst_ptr += stride;
+    pred_ptr += pitch;
+  }
+}
+#endif
+
+void vp9_dc_only_idct_add_8x8_c(short input_dc,
+                                unsigned char *pred_ptr,
+                                unsigned char *dst_ptr,
+                                int pitch, int stride) {
+  int a1 = (input_dc + 16) >> 5;
+  int r, c, b;
+  unsigned char *orig_pred = pred_ptr;
+  unsigned char *orig_dst = dst_ptr;
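+  /* The 8x8 area is covered as four 4x4 quadrants; the rebase at the
+   * bottom of the loop, (b + 1) % 2 * 4 columns across and
+   * (b + 1) / 2 * 4 rows down, walks them in the order top-left,
+   * top-right, bottom-left, bottom-right.
+   */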
+  for (b = 0; b < 4; b++) {
+    for (r = 0; r < 4; r++) {
+      for (c = 0; c < 4; c++) {
+        int a = a1 + pred_ptr[c];
+
+        if (a < 0)
+          a = 0;
+
+        if (a > 255)
+          a = 255;
+
+        dst_ptr[c] = (unsigned char) a;
+      }
+
+      dst_ptr += stride;
+      pred_ptr += pitch;
+    }
+    dst_ptr = orig_dst + (b + 1) % 2 * 4 + (b + 1) / 2 * 4 * stride;
+    pred_ptr = orig_pred + (b + 1) % 2 * 4 + (b + 1) / 2 * 4 * pitch;
+  }
+}
+
+#define W1 2841                 /* 2048*sqrt(2)*cos(1*pi/16) */
+#define W2 2676                 /* 2048*sqrt(2)*cos(2*pi/16) */
+#define W3 2408                 /* 2048*sqrt(2)*cos(3*pi/16) */
+#define W5 1609                 /* 2048*sqrt(2)*cos(5*pi/16) */
+#define W6 1108                 /* 2048*sqrt(2)*cos(6*pi/16) */
+#define W7 565                  /* 2048*sqrt(2)*cos(7*pi/16) */
+
+/* row (horizontal) IDCT
+ *
+ *           7                       pi         1
+ * dst[k] = sum c[l] * src[l] * cos( -- * ( k + - ) * l )
+ *          l=0                      8          2
+ *
+ * where: c[0]    = 128
+ *        c[1..7] = 128*sqrt(2)
+ */
+
+static void idctrow(int *blk) {
+  int x0, x1, x2, x3, x4, x5, x6, x7, x8;
+  /* shortcut */
+  if (!((x1 = blk[4] << 11) | (x2 = blk[6]) | (x3 = blk[2]) |
+        (x4 = blk[1]) | (x5 = blk[7]) | (x6 = blk[5]) | (x7 = blk[3]))) {
+    blk[0] = blk[1] = blk[2] = blk[3] = blk[4] =
+        blk[5] = blk[6] = blk[7] = blk[0] << 3;
+    return;
+  }
+
+  x0 = (blk[0] << 11) + 128;    /* for proper rounding in the fourth stage */
+  /* first stage */
+  x8 = W7 * (x4 + x5);
+  x4 = x8 + (W1 - W7) * x4;
+  x5 = x8 - (W1 + W7) * x5;
+  x8 = W3 * (x6 + x7);
+  x6 = x8 - (W3 - W5) * x6;
+  x7 = x8 - (W3 + W5) * x7;
+
+  /* second stage */
+  x8 = x0 + x1;
+  x0 -= x1;
+  x1 = W6 * (x3 + x2);
+  x2 = x1 - (W2 + W6) * x2;
+  x3 = x1 + (W2 - W6) * x3;
+  x1 = x4 + x6;
+  x4 -= x6;
+  x6 = x5 + x7;
+  x5 -= x7;
+
+  /* third stage */
+  x7 = x8 + x3;
+  x8 -= x3;
+  x3 = x0 + x2;
+  x0 -= x2;
+  x2 = (181 * (x4 + x5) + 128) >> 8;
+  x4 = (181 * (x4 - x5) + 128) >> 8;
+
+  /* fourth stage */
+  blk[0] = (x7 + x1) >> 8;
+  blk[1] = (x3 + x2) >> 8;
+  blk[2] = (x0 + x4) >> 8;
+  blk[3] = (x8 + x6) >> 8;
+  blk[4] = (x8 - x6) >> 8;
+  blk[5] = (x0 - x4) >> 8;
+  blk[6] = (x3 - x2) >> 8;
+  blk[7] = (x7 - x1) >> 8;
+}
+
+/* column (vertical) IDCT
+ *
+ *             7                         pi         1
+ * dst[8*k] = sum c[l] * src[8*l] * cos( -- * ( k + - ) * l )
+ *            l=0                        8          2
+ *
+ * where: c[0]    = 1/1024
+ *        c[1..7] = (1/1024)*sqrt(2)
+ */
+static void idctcol(int *blk) {
+  int x0, x1, x2, x3, x4, x5, x6, x7, x8;
+
+  /* shortcut */
+  if (!((x1 = (blk[8 * 4] << 8)) | (x2 = blk[8 * 6]) | (x3 = blk[8 * 2]) |
+        (x4 = blk[8 * 1]) | (x5 = blk[8 * 7]) | (x6 = blk[8 * 5]) |
+        (x7 = blk[8 * 3]))) {
+    blk[8 * 0] = blk[8 * 1] = blk[8 * 2] = blk[8 * 3] =
+        blk[8 * 4] = blk[8 * 5] = blk[8 * 6] = blk[8 * 7] =
+        ((blk[8 * 0] + 32) >> 6);
+    return;
+  }
+
+  x0 = (blk[8 * 0] << 8) + 16384;
+
+  /* first stage */
+  x8 = W7 * (x4 + x5) + 4;
+  x4 = (x8 + (W1 - W7) * x4) >> 3;
+  x5 = (x8 - (W1 + W7) * x5) >> 3;
+  x8 = W3 * (x6 + x7) + 4;
+  x6 = (x8 - (W3 - W5) * x6) >> 3;
+  x7 = (x8 - (W3 + W5) * x7) >> 3;
+
+  /* second stage */
+  x8 = x0 + x1;
+  x0 -= x1;
+  x1 = W6 * (x3 + x2) + 4;
+  x2 = (x1 - (W2 + W6) * x2) >> 3;
+  x3 = (x1 + (W2 - W6) * x3) >> 3;
+  x1 = x4 + x6;
+  x4 -= x6;
+  x6 = x5 + x7;
+  x5 -= x7;
+
+  /* third stage */
+  x7 = x8 + x3;
+  x8 -= x3;
+  x3 = x0 + x2;
+  x0 -= x2;
+  x2 = (181 * (x4 + x5) + 128) >> 8;
+  x4 = (181 * (x4 - x5) + 128) >> 8;
+
+  /* fourth stage */
+  blk[8 * 0] = (x7 + x1) >> 14;
+  blk[8 * 1] = (x3 + x2) >> 14;
+  blk[8 * 2] = (x0 + x4) >> 14;
+  blk[8 * 3] = (x8 + x6) >> 14;
+  blk[8 * 4] = (x8 - x6) >> 14;
+  blk[8 * 5] = (x0 - x4) >> 14;
+  blk[8 * 6] = (x3 - x2) >> 14;
+  blk[8 * 7] = (x7 - x1) >> 14;
+}
+
+#define TX_DIM 8
+void vp9_short_idct8x8_c(short *coefs, short *block, int pitch) {
+  int X[TX_DIM * TX_DIM];
+  int i, j;
+  int shortpitch = pitch >> 1;
+
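+  /* The (c + 1 + (c < 0)) >> 2 below divides each coefficient by 4
+   * with rounding; the (c < 0) term keeps the bias similar for
+   * negative values (e.g. 7 -> 2 and -7 -> -2).
+   */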
+  for (i = 0; i < TX_DIM; i++) {
+    for (j = 0; j < TX_DIM; j++) {
+      X[i * TX_DIM + j] = (int)(coefs[i * TX_DIM + j] + 1
+                                + (coefs[i * TX_DIM + j] < 0)) >> 2;
+    }
+  }
+  for (i = 0; i < 8; i++)
+    idctrow(X + 8 * i);
+
+  for (i = 0; i < 8; i++)
+    idctcol(X + i);
+
+  for (i = 0; i < TX_DIM; i++) {
+    for (j = 0; j < TX_DIM; j++) {
+      block[i * shortpitch + j]  = X[i * TX_DIM + j] >> 1;
+    }
+  }
+}
+
+
+void vp9_short_ihaar2x2_c(short *input, short *output, int pitch) {
+  int i;
+  short *ip = input;  // only coefficients 0, 1, 4 and 8 are used
+  short *op = output;
+  for (i = 0; i < 16; i++) {
+    op[i] = 0;
+  }
+
+  op[0] = (ip[0] + ip[1] + ip[4] + ip[8] + 1) >> 1;
+  op[1] = (ip[0] - ip[1] + ip[4] - ip[8]) >> 1;
+  op[4] = (ip[0] + ip[1] - ip[4] - ip[8]) >> 1;
+  op[8] = (ip[0] - ip[1] - ip[4] + ip[8]) >> 1;
+}
+
+
+#if 0
+// Keep a really bad float version as reference for now.
+void vp9_short_idct16x16_c(short *input, short *output, int pitch) {
+
+  vp9_clear_system_state(); // Make it simd safe : __asm emms;
+  {
+    double x;
+    const int short_pitch = pitch >> 1;
+    int i, j, k, l;
+    for (l = 0; l < 16; ++l) {
+      for (k = 0; k < 16; ++k) {
+        double s = 0;
+        for (i = 0; i < 16; ++i) {
+          for (j = 0; j < 16; ++j) {
+            x=cos(PI*j*(l+0.5)/16.0)*cos(PI*i*(k+0.5)/16.0)*input[i*16+j]/32;
+            if (i != 0)
+              x *= sqrt(2.0);
+            if (j != 0)
+              x *= sqrt(2.0);
+            s += x;
+          }
+        }
+        output[k*short_pitch+l] = (short)round(s);
+      }
+    }
+  }
+  vp9_clear_system_state(); // Make it simd safe : __asm emms;
+}
+#endif
+
+static const double C1 = 0.995184726672197;
+static const double C2 = 0.98078528040323;
+static const double C3 = 0.956940335732209;
+static const double C4 = 0.923879532511287;
+static const double C5 = 0.881921264348355;
+static const double C6 = 0.831469612302545;
+static const double C7 = 0.773010453362737;
+static const double C8 = 0.707106781186548;
+static const double C9 = 0.634393284163646;
+static const double C10 = 0.555570233019602;
+static const double C11 = 0.471396736825998;
+static const double C12 = 0.38268343236509;
+static const double C13 = 0.290284677254462;
+static const double C14 = 0.195090322016128;
+static const double C15 = 0.098017140329561;
+
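+/* The constants above are C[k] = cos(k * pi / 32) for k = 1..15
+ * (so C8 = 1/sqrt(2)): the cosine table used by the 16-point
+ * butterfly below.
+ */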
+
+static void butterfly_16x16_idct_1d(double input[16], double output[16]) {
+
+  vp9_clear_system_state(); // Make it simd safe : __asm emms;
+  {
+    double step[16];
+    double intermediate[16];
+    double temp1, temp2;
+
+    // step 1 and 2
+    step[ 0] = input[0] + input[8];
+    step[ 1] = input[0] - input[8];
+
+    temp1 = input[4]*C12;
+    temp2 = input[12]*C4;
+
+    temp1 -= temp2;
+    temp1 *= C8;
+
+    step[ 2] = 2*(temp1);
+
+    temp1 = input[4]*C4;
+    temp2 = input[12]*C12;
+    temp1 += temp2;
+    temp1 *= C8;
+    step[ 3] = 2*(temp1);
+
+    temp1 = input[2]*C8;
+    temp1 = 2*(temp1);
+    temp2 = input[6] + input[10];
+
+    step[ 4] = temp1 + temp2;
+    step[ 5] = temp1 - temp2;
+
+    temp1 = input[14]*C8;
+    temp1 = 2*(temp1);
+    temp2 = input[6] - input[10];
+
+    step[ 6] = temp2 - temp1;
+    step[ 7] = temp2 + temp1;
+
+    // for odd input
+    temp1 = input[3]*C12;
+    temp2 = input[13]*C4;
+    temp1 += temp2;
+    temp1 *= C8;
+    intermediate[ 8] = 2*(temp1);
+
+    temp1 = input[3]*C4;
+    temp2 = input[13]*C12;
+    temp2 -= temp1;
+    temp2 *= C8;
+    intermediate[ 9] = 2*(temp2);
+
+    intermediate[10] = 2*(input[9]*C8);
+    intermediate[11] = input[15] - input[1];
+    intermediate[12] = input[15] + input[1];
+    intermediate[13] = 2*(input[7]*C8);
+
+    temp1 = input[11]*C12;
+    temp2 = input[5]*C4;
+    temp2 -= temp1;
+    temp2 *= C8;
+    intermediate[14] = 2*(temp2);
+
+    temp1 = input[11]*C4;
+    temp2 = input[5]*C12;
+    temp1 += temp2;
+    temp1 *= C8;
+    intermediate[15] = 2*(temp1);
+
+    step[ 8] = intermediate[ 8] + intermediate[14];
+    step[ 9] = intermediate[ 9] + intermediate[15];
+    step[10] = intermediate[10] + intermediate[11];
+    step[11] = intermediate[10] - intermediate[11];
+    step[12] = intermediate[12] + intermediate[13];
+    step[13] = intermediate[12] - intermediate[13];
+    step[14] = intermediate[ 8] - intermediate[14];
+    step[15] = intermediate[ 9] - intermediate[15];
+
+    // step 3
+    output[0] = step[ 0] + step[ 3];
+    output[1] = step[ 1] + step[ 2];
+    output[2] = step[ 1] - step[ 2];
+    output[3] = step[ 0] - step[ 3];
+
+    temp1 = step[ 4]*C14;
+    temp2 = step[ 7]*C2;
+    temp1 -= temp2;
+    output[4] = temp1;
+
+    temp1 = step[ 4]*C2;
+    temp2 = step[ 7]*C14;
+    temp1 += temp2;
+    output[7] = temp1;
+
+    temp1 = step[ 5]*C10;
+    temp2 = step[ 6]*C6;
+    temp1 -= temp2;
+    output[5] = temp1;
+
+    temp1 = step[ 5]*C6;
+    temp2 = step[ 6]*C10;
+    temp1 += temp2;
+    output[6] = temp1;
+
+    output[8] = step[ 8] + step[11];
+    output[9] = step[ 9] + step[10];
+    output[10] = step[ 9] - step[10];
+    output[11] = step[ 8] - step[11];
+    output[12] = step[12] + step[15];
+    output[13] = step[13] + step[14];
+    output[14] = step[13] - step[14];
+    output[15] = step[12] - step[15];
+
+    // output 4
+    step[ 0] = output[0] + output[7];
+    step[ 1] = output[1] + output[6];
+    step[ 2] = output[2] + output[5];
+    step[ 3] = output[3] + output[4];
+    step[ 4] = output[3] - output[4];
+    step[ 5] = output[2] - output[5];
+    step[ 6] = output[1] - output[6];
+    step[ 7] = output[0] - output[7];
+
+    temp1 = output[8]*C7;
+    temp2 = output[15]*C9;
+    temp1 -= temp2;
+    step[ 8] = temp1;
+
+    temp1 = output[9]*C11;
+    temp2 = output[14]*C5;
+    temp1 += temp2;
+    step[ 9] = temp1;
+
+    temp1 = output[10]*C3;
+    temp2 = output[13]*C13;
+    temp1 -= temp2;
+    step[10] = temp1;
+
+    temp1 = output[11]*C15;
+    temp2 = output[12]*C1;
+    temp1 += temp2;
+    step[11] = temp1;
+
+    temp1 = output[11]*C1;
+    temp2 = output[12]*C15;
+    temp2 -= temp1;
+    step[12] = temp2;
+
+    temp1 = output[10]*C13;
+    temp2 = output[13]*C3;
+    temp1 += temp2;
+    step[13] = temp1;
+
+    temp1 = output[9]*C5;
+    temp2 = output[14]*C11;
+    temp2 -= temp1;
+    step[14] = temp2;
+
+    temp1 = output[8]*C9;
+    temp2 = output[15]*C7;
+    temp1 += temp2;
+    step[15] = temp1;
+
+    // step 5
+    output[0] = (step[0] + step[15]);
+    output[1] = (step[1] + step[14]);
+    output[2] = (step[2] + step[13]);
+    output[3] = (step[3] + step[12]);
+    output[4] = (step[4] + step[11]);
+    output[5] = (step[5] + step[10]);
+    output[6] = (step[6] + step[ 9]);
+    output[7] = (step[7] + step[ 8]);
+
+    output[15] = (step[0] - step[15]);
+    output[14] = (step[1] - step[14]);
+    output[13] = (step[2] - step[13]);
+    output[12] = (step[3] - step[12]);
+    output[11] = (step[4] - step[11]);
+    output[10] = (step[5] - step[10]);
+    output[9] = (step[6] - step[ 9]);
+    output[8] = (step[7] - step[ 8]);
+  }
+  vp9_clear_system_state(); // Make it simd safe : __asm emms;
+}
+
+// Remove once an int version of iDCT is written
+#if 0
+void reference_16x16_idct_1d(double input[16], double output[16]) {
+
+  vp9_clear_system_state(); // Make it simd safe : __asm emms;
+  {
+    const double kPi = 3.141592653589793238462643383279502884;
+    const double kSqrt2 = 1.414213562373095048801688724209698;
+    for (int k = 0; k < 16; k++) {
+      output[k] = 0.0;
+      for (int n = 0; n < 16; n++) {
+        output[k] += input[n]*cos(kPi*(2*k+1)*n/32.0);
+        if (n == 0)
+          output[k] = output[k]/kSqrt2;
+      }
+    }
+  }
+  vp9_clear_system_state(); // Make it simd safe : __asm emms;
+}
+#endif
+
+void vp9_short_idct16x16_c(short *input, short *output, int pitch) {
+
+  vp9_clear_system_state(); // Make it simd safe : __asm emms;
+  {
+    double out[16*16], out2[16*16];
+    const int short_pitch = pitch >> 1;
+    int i, j;
+      // First transform rows
+    for (i = 0; i < 16; ++i) {
+      double temp_in[16], temp_out[16];
+      for (j = 0; j < 16; ++j)
+        temp_in[j] = input[j + i*short_pitch];
+      butterfly_16x16_idct_1d(temp_in, temp_out);
+      for (j = 0; j < 16; ++j)
+        out[j + i*16] = temp_out[j];
+    }
+    // Then transform columns
+    for (i = 0; i < 16; ++i) {
+      double temp_in[16], temp_out[16];
+      for (j = 0; j < 16; ++j)
+        temp_in[j] = out[j*16 + i];
+      butterfly_16x16_idct_1d(temp_in, temp_out);
+      for (j = 0; j < 16; ++j)
+        out2[j*16 + i] = temp_out[j];
+    }
+    for (i = 0; i < 16*16; ++i)
+      output[i] = round(out2[i]/128);
+  }
+  vp9_clear_system_state(); // Make it simd safe : __asm emms;
+}
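+
+/* Like the #if 0 reference above, this is still a floating-point
+ * 16x16 inverse (rows, then columns, through butterfly_16x16_idct_1d,
+ * with a final round(x / 128) rescale); presumably it is meant to be
+ * replaced once an integer version exists.
+ */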
--- /dev/null
+++ b/vp9/common/implicit_segmentation.c
@@ -1,0 +1,255 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/common/onyxc_int.h"
+
+#define MAX_REGIONS 24000
+#ifndef NULL
+#define NULL 0
+#endif
+
+#define min_mbs_in_region 3
+
+// this linked list structure holds equivalences for connected
+// component labeling
+struct list_el {
+  int label;
+  int seg_value;
+  int count;
+  struct list_el *next;
+};
+typedef struct list_el item;
+
+// connected color segments
+typedef struct {
+  int min_x;
+  int min_y;
+  int max_x;
+  int max_y;
+  long long sum_x;
+  long long sum_y;
+  int pixels;
+  int seg_value;
+  int label;
+} segment_info;
+
+
+typedef enum {
+  SEGMENT_MODE,
+  SEGMENT_MV,
+  SEGMENT_REFFRAME,
+  SEGMENT_SKIPPED
+} SEGMENT_TYPE;
+
+
+// this merges the two equivalence lists and
+// then makes sure that every label points to the same
+// equivalence list
+void merge(item *labels, int u, int v) {
+  item *a = labels[u].next;
+  item *b = labels[v].next;
+  item c;
+  item *it = &c;
+  int count;
+
+  // check if they are already merged
+  if (u == v || a == b)
+    return;
+
+  count = a->count + b->count;
+
+  // merge 2 sorted linked lists.
+  while (a != NULL && b != NULL) {
+    if (a->label < b->label) {
+      it->next = a;
+      a = a->next;
+    } else {
+      it->next = b;
+      b = b->next;
+    }
+
+    it = it->next;
+  }
+
+  if (a == NULL)
+    it->next = b;
+  else
+    it->next = a;
+
+  it = c.next;
+
+  // make sure every equivalence in the linked list points to this new ll
+  while (it != NULL) {
+    labels[it->label].next = c.next;
+    it = it->next;
+  }
+  c.next->count = count;
+}
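+
+/* Example: when labels 3 and 5 are found to touch, merge(labels, 3, 5)
+ * splices their two sorted equivalence lists into one and repoints
+ * every member label at the combined list, so later lookups through
+ * either label reach the same head (a union step without path
+ * compression).
+ */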
+
+void segment_via_mode_info(VP9_COMMON *oci, int how) {
+  MODE_INFO *mi = oci->mi;
+  int i, j;
+  int mb_index = 0;
+
+  int label = 1;
+  int pitch = oci->mb_cols;
+
+  // holds linked list equivalences
+  // the max should probably be allocated at a higher level in oci
+  item equivalences[MAX_REGIONS];
+  int eq_ptr = 0;
+  item labels[MAX_REGIONS];
+  segment_info segments[MAX_REGIONS];
+  int label_count = 1;
+  int labeling[400 * 300];
+  int *lp = labeling;
+
+  label_count = 1;
+  memset(labels, 0, sizeof(labels));
+  memset(segments, 0, sizeof(segments));
+
+  /* Go through each macroblock, first-pass labelling */
+  for (i = 0; i < oci->mb_rows; i++, lp += pitch) {
+    for (j = 0; j < oci->mb_cols; j++) {
+      // seg_values: above (a), left (l) and current (n)
+      int a = -1, l = -1, n = -1;
+
+      // above label, left label
+      int al = -1, ll = -1;
+      if (i) {
+        al = lp[j - pitch];
+        a = labels[al].next->seg_value;
+      }
+      if (j) {
+        ll = lp[j - 1];
+        l = labels[ll].next->seg_value;
+      }
+
+      // what setting are we going to do the implicit segmentation on
+      switch (how) {
+        case SEGMENT_MODE:
+          n = mi[mb_index].mbmi.mode;
+          break;
+        case SEGMENT_MV:
+          n = mi[mb_index].mbmi.mv[0].as_int;
+          if (mi[mb_index].mbmi.ref_frame == INTRA_FRAME)
+            n = -9999999;
+          break;
+        case SEGMENT_REFFRAME:
+          n = mi[mb_index].mbmi.ref_frame;
+          break;
+        case SEGMENT_SKIPPED:
+          n = mi[mb_index].mbmi.mb_skip_coeff;
+          break;
+      }
+
+      // above and left both have the same seg_value
+      if (n == a && n == l) {
+        // pick the lowest label
+        lp[j] = (al < ll ? al : ll);
+        labels[lp[j]].next->count++;
+
+        // merge the above and left equivalencies
+        merge(labels, al, ll);
+      }
+      // this matches above seg_value
+      else if (n == a) {
+        // give it the same label as above
+        lp[j] = al;
+        labels[al].next->count++;
+      }
+      // this matches left seg_value
+      else if (n == l) {
+        // give it the same label as left
+        lp[j] = ll;
+        labels[ll].next->count++;
+      } else {
+        // new label doesn't match either
+        item *e = &labels[label];
+        item *nl = &equivalences[eq_ptr++];
+        lp[j] = label;
+        nl->label = label;
+        nl->next = 0;
+        nl->seg_value = n;
+        nl->count = 1;
+        e->next = nl;
+        label++;
+      }
+      mb_index++;
+    }
+    mb_index++;
+  }
+  lp = labeling;
+
+  // give new labels to regions
+  for (i = 1; i < label; i++)
+    if (labels[i].next->count > min_mbs_in_region &&
+        labels[labels[i].next->label].label == 0) {
+      segment_info *cs = &segments[label_count];
+      cs->label = label_count;
+      labels[labels[i].next->label].label = label_count++;
+      labels[labels[i].next->label].seg_value  = labels[i].next->seg_value;
+      cs->seg_value = labels[labels[i].next->label].seg_value;
+      cs->min_x = oci->mb_cols;
+      cs->min_y = oci->mb_rows;
+      cs->max_x = 0;
+      cs->max_y = 0;
+      cs->sum_x = 0;
+      cs->sum_y = 0;
+      cs->pixels = 0;
+    }
+  lp = labeling;
+
+  // this is just to gather stats...
+  for (i = 0; i < oci->mb_rows; i++, lp += pitch) {
+    for (j = 0; j < oci->mb_cols; j++) {
+      segment_info *cs;
+      int oldlab = labels[lp[j]].next->label;
+      int lab = labels[oldlab].label;
+      lp[j] = lab;
+
+      cs = &segments[lab];
+
+      cs->min_x = (j < cs->min_x ? j : cs->min_x);
+      cs->max_x = (j > cs->max_x ? j : cs->max_x);
+      cs->min_y = (i < cs->min_y ? i : cs->min_y);
+      cs->max_y = (i > cs->max_y ? i : cs->max_y);
+      cs->sum_x += j;
+      cs->sum_y += i;
+      cs->pixels++;
+
+      lp[j] = lab;
+      mb_index++;
+    }
+    mb_index++;
+  }
+
+  {
+    lp = labeling;
+    printf("labelling \n");
+    mb_index = 0;
+    for (i = 0; i < oci->mb_rows; i++, lp += pitch) {
+      for (j = 0; j < oci->mb_cols; j++) {
+        printf("%4d", lp[j]);
+      }
+      printf("            ");
+      for (j = 0; j < oci->mb_cols; j++, mb_index++) {
+        // printf("%3d",mi[mb_index].mbmi.mode );
+        printf("%4d:%4d", mi[mb_index].mbmi.mv[0].as_mv.row,
+            mi[mb_index].mbmi.mv[0].as_mv.col);
+      }
+      printf("\n");
+      ++mb_index;
+    }
+    printf("\n");
+  }
+}
+
--- /dev/null
+++ b/vp9/common/invtrans.c
@@ -1,0 +1,135 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "invtrans.h"
+
+static void recon_dcblock(MACROBLOCKD *xd) {
+  BLOCKD *b = &xd->block[24];
+  int i;
+
+  for (i = 0; i < 16; i++) {
+    xd->block[i].dqcoeff[0] = b->diff[i];
+  }
+}
+
+static void recon_dcblock_8x8(MACROBLOCKD *xd) {
+  BLOCKD *b = &xd->block[24]; // for coeff 0, 2, 8, 10
+
+  xd->block[0].dqcoeff[0] = b->diff[0];
+  xd->block[4].dqcoeff[0] = b->diff[1];
+  xd->block[8].dqcoeff[0] = b->diff[4];
+  xd->block[12].dqcoeff[0] = b->diff[8];
+}
+
+void vp9_inverse_transform_b_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
+                                 BLOCKD *b, int pitch) {
+  if (b->eob <= 1)
+    IDCT_INVOKE(rtcd, idct1)(b->dqcoeff, b->diff, pitch);
+  else
+    IDCT_INVOKE(rtcd, idct16)(b->dqcoeff, b->diff, pitch);
+}
+
+void vp9_inverse_transform_mby_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
+                                   MACROBLOCKD *xd) {
+  int i;
+  BLOCKD *blockd = xd->block;
+
+  if (xd->mode_info_context->mbmi.mode != SPLITMV) {
+    /* do 2nd order transform on the dc block */
+    IDCT_INVOKE(rtcd, iwalsh16)(blockd[24].dqcoeff, blockd[24].diff);
+    recon_dcblock(xd);
+  }
+
+  for (i = 0; i < 16; i++) {
+    vp9_inverse_transform_b_4x4(rtcd, &blockd[i], 32);
+  }
+}
+
+void vp9_inverse_transform_mbuv_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
+                                    MACROBLOCKD *xd) {
+  int i;
+  BLOCKD *blockd = xd->block;
+
+  for (i = 16; i < 24; i++) {
+    vp9_inverse_transform_b_4x4(rtcd, &blockd[i], 16);
+  }
+}
+
+void vp9_inverse_transform_mb_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
+                                  MACROBLOCKD *xd) {
+  vp9_inverse_transform_mby_4x4(rtcd, xd);
+  vp9_inverse_transform_mbuv_4x4(rtcd, xd);
+}
+
+void vp9_inverse_transform_b_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
+                                 short *input_dqcoeff, short *output_coeff,
+                                 int pitch) {
+  // int b,i;
+  // if (b->eob > 1)
+  IDCT_INVOKE(rtcd, idct8)(input_dqcoeff, output_coeff, pitch);
+  // else
+  // IDCT_INVOKE(rtcd, idct8_1)(b->dqcoeff, b->diff, pitch);//pitch
+}
+
+void vp9_inverse_transform_mby_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
+                                   MACROBLOCKD *xd) {
+  int i;
+  BLOCKD *blockd = xd->block;
+
+  if (xd->mode_info_context->mbmi.mode != SPLITMV) {
+    // do 2nd order transform on the dc block
+    IDCT_INVOKE(rtcd, ihaar2)(blockd[24].dqcoeff, blockd[24].diff, 8);
+    recon_dcblock_8x8(xd); // need to change for 8x8
+  }
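+
+  /* Four 8x8 luma inverses: dqcoeff blocks 0, 4, 8, 12 feed the diff
+   * buffers at block offsets 0, 2, 8, 10 (left and right halves of the
+   * top and bottom 8 rows), matching recon_dcblock_8x8() above.
+   */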
+
+  for (i = 0; i < 9; i += 8) {
+    vp9_inverse_transform_b_8x8(rtcd, &blockd[i].dqcoeff[0],
+                                &blockd[i].diff[0], 32);
+  }
+  for (i = 2; i < 11; i += 8) {
+    vp9_inverse_transform_b_8x8(rtcd, &blockd[i + 2].dqcoeff[0],
+                                &blockd[i].diff[0], 32);
+  }
+}
+
+void vp9_inverse_transform_mbuv_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
+                                    MACROBLOCKD *xd) {
+  int i;
+  BLOCKD *blockd = xd->block;
+
+  for (i = 16; i < 24; i += 4) {
+    vp9_inverse_transform_b_8x8(rtcd, &blockd[i].dqcoeff[0],
+                                &blockd[i].diff[0], 16);
+  }
+}
+
+void vp9_inverse_transform_mb_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
+                                  MACROBLOCKD *xd) {
+  vp9_inverse_transform_mby_8x8(rtcd, xd);
+  vp9_inverse_transform_mbuv_8x8(rtcd, xd);
+}
+
+void vp9_inverse_transform_b_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
+                                   short *input_dqcoeff,
+                                   short *output_coeff, int pitch) {
+  IDCT_INVOKE(rtcd, idct16x16)(input_dqcoeff, output_coeff, pitch);
+}
+
+void vp9_inverse_transform_mby_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
+                                     MACROBLOCKD *xd) {
+  vp9_inverse_transform_b_16x16(rtcd, &xd->block[0].dqcoeff[0],
+                                &xd->block[0].diff[0], 32);
+}
+
+void vp9_inverse_transform_mb_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
+                                    MACROBLOCKD *xd) {
+  vp9_inverse_transform_mby_16x16(rtcd, xd);
+  vp9_inverse_transform_mbuv_8x8(rtcd, xd);
+}
--- /dev/null
+++ b/vp9/common/invtrans.h
@@ -1,0 +1,53 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef __INC_INVTRANS_H
+#define __INC_INVTRANS_H
+
+#include "vpx_ports/config.h"
+#include "idct.h"
+#include "blockd.h"
+
+extern void vp9_inverse_transform_b_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
+                                        BLOCKD *b, int pitch);
+
+extern void vp9_inverse_transform_mb_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
+                                         MACROBLOCKD *xd);
+
+extern void vp9_inverse_transform_mby_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
+                                          MACROBLOCKD *xd);
+
+extern void vp9_inverse_transform_mbuv_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
+                                           MACROBLOCKD *xd);
+
+extern void vp9_inverse_transform_b_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
+                                        short *input_dqcoeff,
+                                        short *output_coeff, int pitch);
+
+extern void vp9_inverse_transform_mb_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
+                                         MACROBLOCKD *xd);
+
+extern void vp9_inverse_transform_mby_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
+                                          MACROBLOCKD *xd);
+
+extern void vp9_inverse_transform_mbuv_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
+                                           MACROBLOCKD *xd);
+
+extern void vp9_inverse_transform_b_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
+                                          short *input_dqcoeff,
+                                          short *output_coeff, int pitch);
+
+extern void vp9_inverse_transform_mb_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
+                                           MACROBLOCKD *xd);
+
+extern void vp9_inverse_transform_mby_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
+                                            MACROBLOCKD *xd);
+
+#endif  // __INC_INVTRANS_H
--- /dev/null
+++ b/vp9/common/loopfilter.c
@@ -1,0 +1,524 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "loopfilter.h"
+#include "onyxc_int.h"
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp9/common/seg_common.h"
+
+static void lf_init_lut(loop_filter_info_n *lfi) {
+  int filt_lvl;
+
+  for (filt_lvl = 0; filt_lvl <= MAX_LOOP_FILTER; filt_lvl++) {
+    if (filt_lvl >= 40) {
+      lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 2;
+      lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 3;
+    } else if (filt_lvl >= 20) {
+      lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
+      lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 2;
+    } else if (filt_lvl >= 15) {
+      lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
+      lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 1;
+    } else {
+      lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 0;
+      lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 0;
+    }
+  }
+
+  lfi->mode_lf_lut[DC_PRED] = 1;
+  lfi->mode_lf_lut[D45_PRED] = 1;
+  lfi->mode_lf_lut[D135_PRED] = 1;
+  lfi->mode_lf_lut[D117_PRED] = 1;
+  lfi->mode_lf_lut[D153_PRED] = 1;
+  lfi->mode_lf_lut[D27_PRED] = 1;
+  lfi->mode_lf_lut[D63_PRED] = 1;
+  lfi->mode_lf_lut[V_PRED] = 1;
+  lfi->mode_lf_lut[H_PRED] = 1;
+  lfi->mode_lf_lut[TM_PRED] = 1;
+  lfi->mode_lf_lut[B_PRED]  = 0;
+  lfi->mode_lf_lut[I8X8_PRED] = 0;
+  lfi->mode_lf_lut[ZEROMV]  = 1;
+  lfi->mode_lf_lut[NEARESTMV] = 2;
+  lfi->mode_lf_lut[NEARMV] = 2;
+  lfi->mode_lf_lut[NEWMV] = 2;
+  lfi->mode_lf_lut[SPLITMV] = 3;
+}
+
+void vp9_loop_filter_update_sharpness(loop_filter_info_n *lfi,
+                                      int sharpness_lvl) {
+  int i;
+
+  /* For each possible value for the loop filter fill out limits */
+  for (i = 0; i <= MAX_LOOP_FILTER; i++) {
+    int filt_lvl = i;
+    int block_inside_limit = 0;
+
+    /* Set loop filter parameters that control sharpness. */
+    block_inside_limit = filt_lvl >> (sharpness_lvl > 0);
+    block_inside_limit = block_inside_limit >> (sharpness_lvl > 4);
+
+    if (sharpness_lvl > 0) {
+      if (block_inside_limit > (9 - sharpness_lvl))
+        block_inside_limit = (9 - sharpness_lvl);
+    }
+
+    if (block_inside_limit < 1)
+      block_inside_limit = 1;
+
+    vpx_memset(lfi->lim[i], block_inside_limit, SIMD_WIDTH);
+    vpx_memset(lfi->blim[i], (2 * filt_lvl + block_inside_limit),
+               SIMD_WIDTH);
+    vpx_memset(lfi->mblim[i], (2 * (filt_lvl + 2) + block_inside_limit),
+               SIMD_WIDTH);
+  }
+}
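+
+/* Example: filt_lvl = 32 with sharpness_lvl = 5 gives
+ * block_inside_limit = (32 >> 1) >> 1 = 8, clamped to 9 - 5 = 4, so
+ * lim = 4, blim = 2 * 32 + 4 = 68 and mblim = 2 * (32 + 2) + 4 = 72,
+ * each replicated across SIMD_WIDTH lanes.
+ */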
+
+void vp9_loop_filter_init(VP9_COMMON *cm) {
+  loop_filter_info_n *lfi = &cm->lf_info;
+  int i;
+
+  /* init limits for given sharpness*/
+  vp9_loop_filter_update_sharpness(lfi, cm->sharpness_level);
+  cm->last_sharpness_level = cm->sharpness_level;
+
+  /* init LUT for lvl  and hev thr picking */
+  lf_init_lut(lfi);
+
+  /* init hev threshold const vectors */
+  for (i = 0; i < 4; i++) {
+    vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH);
+  }
+}
+
+void vp9_loop_filter_frame_init(VP9_COMMON *cm,
+                                MACROBLOCKD *xd,
+                                int default_filt_lvl) {
+  int seg,  /* segment number */
+      ref,  /* index in ref_lf_deltas */
+      mode; /* index in mode_lf_deltas */
+
+  loop_filter_info_n *lfi = &cm->lf_info;
+
+  /* update limits if sharpness has changed */
+  if (cm->last_sharpness_level != cm->sharpness_level) {
+    vp9_loop_filter_update_sharpness(lfi, cm->sharpness_level);
+    cm->last_sharpness_level = cm->sharpness_level;
+  }
+
+  for (seg = 0; seg < MAX_MB_SEGMENTS; seg++) {
+    int lvl_seg = default_filt_lvl;
+    int lvl_ref, lvl_mode;
+
+
+    // Set the baseline filter values for each segment
+    if (vp9_segfeature_active(xd, seg, SEG_LVL_ALT_LF)) {
+      /* Abs value */
+      if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) {
+        lvl_seg = vp9_get_segdata(xd, seg, SEG_LVL_ALT_LF);
+      } else { /* Delta Value */
+        lvl_seg += vp9_get_segdata(xd, seg, SEG_LVL_ALT_LF);
+        lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 63 : lvl_seg) : 0;
+      }
+    }
+
+    if (!xd->mode_ref_lf_delta_enabled) {
+      /* we could get rid of this if we assume that deltas are set to
+       * zero when not in use; encoder always uses deltas
+       */
+      vpx_memset(lfi->lvl[seg][0], lvl_seg, 4 * 4);
+      continue;
+    }
+
+    lvl_ref = lvl_seg;
+
+    /* INTRA_FRAME */
+    ref = INTRA_FRAME;
+
+    /* Apply delta for reference frame */
+    lvl_ref += xd->ref_lf_deltas[ref];
+
+    /* Apply delta for Intra modes */
+    mode = 0; /* B_PRED */
+    /* Only the split mode BPRED has a further special case */
+    lvl_mode = lvl_ref +  xd->mode_lf_deltas[mode];
+    lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */
+
+    lfi->lvl[seg][ref][mode] = lvl_mode;
+
+    mode = 1; /* all the rest of Intra modes */
+    lvl_mode = (lvl_ref > 0) ? (lvl_ref > 63 ? 63 : lvl_ref)  : 0; /* clamp */
+    lfi->lvl[seg][ref][mode] = lvl_mode;
+
+    /* LAST, GOLDEN, ALT */
+    for (ref = 1; ref < MAX_REF_FRAMES; ref++) {
+      int lvl_ref = lvl_seg;
+
+      /* Apply delta for reference frame */
+      lvl_ref += xd->ref_lf_deltas[ref];
+
+      /* Apply delta for Inter modes */
+      for (mode = 1; mode < 4; mode++) {
+        lvl_mode = lvl_ref + xd->mode_lf_deltas[mode];
+        lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */
+
+        lfi->lvl[seg][ref][mode] = lvl_mode;
+      }
+    }
+  }
+}
+
+void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd) {
+  YV12_BUFFER_CONFIG *post = cm->frame_to_show;
+  loop_filter_info_n *lfi_n = &cm->lf_info;
+  struct loop_filter_info lfi;
+
+  FRAME_TYPE frame_type = cm->frame_type;
+
+  int mb_row;
+  int mb_col;
+
+  int filter_level;
+
+  unsigned char *y_ptr, *u_ptr, *v_ptr;
+
+  /* Point at base of Mb MODE_INFO list */
+  const MODE_INFO *mode_info_context = cm->mi;
+
+  /* Initialize the loop filter for this frame. */
+  vp9_loop_filter_frame_init(cm, xd, cm->filter_level);
+
+  /* Set up the buffer pointers */
+  y_ptr = post->y_buffer;
+  u_ptr = post->u_buffer;
+  v_ptr = post->v_buffer;
+
+  /* vp9_filter each macro block */
+  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+      int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+                     mode_info_context->mbmi.mode != I8X8_PRED &&
+                     mode_info_context->mbmi.mode != SPLITMV &&
+                     mode_info_context->mbmi.mb_skip_coeff);
+
+      const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+      const int seg = mode_info_context->mbmi.segment_id;
+      const int ref_frame = mode_info_context->mbmi.ref_frame;
+      int tx_type = mode_info_context->mbmi.txfm_size;
+      filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+
+      if (filter_level) {
+        if (cm->filter_type == NORMAL_LOOPFILTER) {
+          const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+          lfi.mblim = lfi_n->mblim[filter_level];
+          lfi.blim = lfi_n->blim[filter_level];
+          lfi.lim = lfi_n->lim[filter_level];
+          lfi.hev_thr = lfi_n->hev_thr[hev_index];
+
+          if (mb_col > 0
+#if CONFIG_SUPERBLOCKS
+              && !((mb_col & 1) && mode_info_context->mbmi.encoded_as_sb &&
+                   mode_info_context[0].mbmi.mb_skip_coeff &&
+                   mode_info_context[-1].mbmi.mb_skip_coeff)
+#endif
+              )
+            vp9_loop_filter_mbv(y_ptr, u_ptr, v_ptr, post->y_stride,
+                                post->uv_stride, &lfi);
+
+          if (!skip_lf && tx_type != TX_16X16) {
+            if (tx_type == TX_8X8)
+              vp9_loop_filter_bv8x8(y_ptr, u_ptr, v_ptr, post->y_stride,
+                                    post->uv_stride, &lfi);
+            else
+              vp9_loop_filter_bv(y_ptr, u_ptr, v_ptr, post->y_stride,
+                                 post->uv_stride, &lfi);
+
+          }
+
+          /* don't apply across umv border */
+          if (mb_row > 0
+#if CONFIG_SUPERBLOCKS
+              && !((mb_row & 1) && mode_info_context->mbmi.encoded_as_sb &&
+                   mode_info_context[0].mbmi.mb_skip_coeff &&
+                   mode_info_context[-cm->mode_info_stride].mbmi.mb_skip_coeff)
+#endif
+              )
+            vp9_loop_filter_mbh(y_ptr, u_ptr, v_ptr, post->y_stride,
+                                post->uv_stride, &lfi);
+
+          if (!skip_lf && tx_type != TX_16X16) {
+            if (tx_type == TX_8X8)
+              vp9_loop_filter_bh8x8(y_ptr, u_ptr, v_ptr, post->y_stride,
+                                    post->uv_stride, &lfi);
+            else
+              vp9_loop_filter_bh(y_ptr, u_ptr, v_ptr, post->y_stride,
+                                 post->uv_stride, &lfi);
+          }
+        } else {
+          // FIXME: Not 8x8 aware
+          if (mb_col > 0
+#if CONFIG_SUPERBLOCKS
+              && !((mb_col & 1) && mode_info_context->mbmi.encoded_as_sb &&
+                   mode_info_context[0].mbmi.mb_skip_coeff &&
+                   mode_info_context[-1].mbmi.mb_skip_coeff)
+#endif
+              )
+            vp9_loop_filter_simple_mbv(y_ptr, post->y_stride,
+                                       lfi_n->mblim[filter_level]);
+
+          if (!skip_lf)
+            vp9_loop_filter_simple_bv(y_ptr, post->y_stride,
+                                      lfi_n->blim[filter_level]);
+
+          /* don't apply across umv border */
+          if (mb_row > 0
+#if CONFIG_SUPERBLOCKS
+              && !((mb_row & 1) && mode_info_context->mbmi.encoded_as_sb &&
+                   mode_info_context[0].mbmi.mb_skip_coeff &&
+                   mode_info_context[-cm->mode_info_stride].mbmi.mb_skip_coeff)
+#endif
+              )
+            vp9_loop_filter_simple_mbh(y_ptr, post->y_stride,
+                                       lfi_n->mblim[filter_level]);
+
+          if (!skip_lf)
+            vp9_loop_filter_simple_bh(y_ptr, post->y_stride,
+                                      lfi_n->blim[filter_level]);
+        }
+      }
+
+      y_ptr += 16;
+      u_ptr += 8;
+      v_ptr += 8;
+
+      mode_info_context++;     /* step to next MB */
+    }
+
+    y_ptr += post->y_stride  * 16 - post->y_width;
+    u_ptr += post->uv_stride *  8 - post->uv_width;
+    v_ptr += post->uv_stride *  8 - post->uv_width;
+
+    mode_info_context++;         /* Skip border mb */
+  }
+}
+
+void vp9_loop_filter_frame_yonly(VP9_COMMON *cm, MACROBLOCKD *xd,
+                                 int default_filt_lvl) {
+  YV12_BUFFER_CONFIG *post = cm->frame_to_show;
+
+  unsigned char *y_ptr;
+  int mb_row;
+  int mb_col;
+
+  loop_filter_info_n *lfi_n = &cm->lf_info;
+  struct loop_filter_info lfi;
+
+  int filter_level;
+  FRAME_TYPE frame_type = cm->frame_type;
+
+  /* Point at base of Mb MODE_INFO list */
+  const MODE_INFO *mode_info_context = cm->mi;
+
+#if 0
+  if (default_filt_lvl == 0) /* no filter applied */
+    return;
+#endif
+
+  /* Initialize the loop filter for this frame. */
+  vp9_loop_filter_frame_init(cm, xd, default_filt_lvl);
+
+  /* Set up the buffer pointers */
+  y_ptr = post->y_buffer;
+
+  /* vp9_filter each macro block */
+  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+      int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+                     mode_info_context->mbmi.mode != I8X8_PRED &&
+                     mode_info_context->mbmi.mode != SPLITMV &&
+                     mode_info_context->mbmi.mb_skip_coeff);
+
+      const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+      const int seg = mode_info_context->mbmi.segment_id;
+      const int ref_frame = mode_info_context->mbmi.ref_frame;
+      int tx_type = mode_info_context->mbmi.txfm_size;
+      filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+
+      if (filter_level) {
+        if (cm->filter_type == NORMAL_LOOPFILTER) {
+          const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+          lfi.mblim = lfi_n->mblim[filter_level];
+          lfi.blim = lfi_n->blim[filter_level];
+          lfi.lim = lfi_n->lim[filter_level];
+          lfi.hev_thr = lfi_n->hev_thr[hev_index];
+
+          if (mb_col > 0)
+            vp9_loop_filter_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+          if (!skip_lf && tx_type != TX_16X16) {
+            if (tx_type == TX_8X8)
+              vp9_loop_filter_bv8x8(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+            else
+              vp9_loop_filter_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+          }
+
+          /* don't apply across umv border */
+          if (mb_row > 0)
+            vp9_loop_filter_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+          if (!skip_lf && tx_type != TX_16X16) {
+            if (tx_type == TX_8X8)
+              vp9_loop_filter_bh8x8(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+            else
+              vp9_loop_filter_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+          }
+        } else {
+          // FIXME: Not 8x8 aware
+          if (mb_col > 0)
+            vp9_loop_filter_simple_mbv(y_ptr, post->y_stride,
+                                       lfi_n->mblim[filter_level]);
+
+          if (!skip_lf)
+            vp9_loop_filter_simple_bv(y_ptr, post->y_stride,
+                                      lfi_n->blim[filter_level]);
+
+          /* don't apply across umv border */
+          if (mb_row > 0)
+            vp9_loop_filter_simple_mbh(y_ptr, post->y_stride,
+                                       lfi_n->mblim[filter_level]);
+
+          if (!skip_lf)
+            vp9_loop_filter_simple_bh(y_ptr, post->y_stride,
+                                      lfi_n->blim[filter_level]);
+        }
+      }
+
+      y_ptr += 16;
+      mode_info_context++;        /* step to next MB */
+    }
+
+    y_ptr += post->y_stride  * 16 - post->y_width;
+    mode_info_context++;            /* Skip border mb */
+  }
+}
+
+void vp9_loop_filter_partial_frame(VP9_COMMON *cm, MACROBLOCKD *xd,
+                                   int default_filt_lvl) {
+  YV12_BUFFER_CONFIG *post = cm->frame_to_show;
+
+  unsigned char *y_ptr;
+  int mb_row;
+  int mb_col;
+  int mb_cols = post->y_width  >> 4;
+
+  int linestocopy, i;
+
+  loop_filter_info_n *lfi_n = &cm->lf_info;
+  struct loop_filter_info lfi;
+
+  int filter_level;
+  int alt_flt_enabled = xd->segmentation_enabled;
+  FRAME_TYPE frame_type = cm->frame_type;
+
+  const MODE_INFO *mode_info_context;
+
+  int lvl_seg[MAX_MB_SEGMENTS];
+
+  mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1);
+
+  /* 3 is a magic number. 4 is probably magic too */
+  linestocopy = (post->y_height >> (4 + 3));
+
+  if (linestocopy < 1)
+    linestocopy = 1;
+
+  linestocopy <<= 4;
+
+  /* Note the baseline filter values for each segment */
+  /* See vp9_loop_filter_frame_init. Rather than call that for each change
+   * to default_filt_lvl, copy the relevant calculation here.
+   */
+  if (alt_flt_enabled) {
+    for (i = 0; i < MAX_MB_SEGMENTS; i++) {
+      /* Abs value */
+      if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) {
+        lvl_seg[i] = vp9_get_segdata(xd, i, SEG_LVL_ALT_LF);
+      }
+      /* Delta Value */
+      else {
+        lvl_seg[i] = default_filt_lvl +
+                     vp9_get_segdata(xd, i, SEG_LVL_ALT_LF);
+        lvl_seg[i] = (lvl_seg[i] > 0) ?
+                     ((lvl_seg[i] > 63) ? 63 : lvl_seg[i]) : 0;
+      }
+    }
+  }
+
+  /* Set up the buffer pointers */
+  y_ptr = post->y_buffer + (post->y_height >> 5) * 16 * post->y_stride;
+
+  /* vp9_filter each macro block */
+  for (mb_row = 0; mb_row < (linestocopy >> 4); mb_row++) {
+    for (mb_col = 0; mb_col < mb_cols; mb_col++) {
+      int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+                     mode_info_context->mbmi.mode != I8X8_PRED &&
+                     mode_info_context->mbmi.mode != SPLITMV &&
+                     mode_info_context->mbmi.mb_skip_coeff);
+
+      if (alt_flt_enabled)
+        filter_level = lvl_seg[mode_info_context->mbmi.segment_id];
+      else
+        filter_level = default_filt_lvl;
+
+      if (filter_level) {
+        if (cm->filter_type == NORMAL_LOOPFILTER) {
+          const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+          lfi.mblim = lfi_n->mblim[filter_level];
+          lfi.blim = lfi_n->blim[filter_level];
+          lfi.lim = lfi_n->lim[filter_level];
+          lfi.hev_thr = lfi_n->hev_thr[hev_index];
+
+          if (mb_col > 0)
+            vp9_loop_filter_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+          if (!skip_lf)
+            vp9_loop_filter_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+          vp9_loop_filter_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+          if (!skip_lf)
+            vp9_loop_filter_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+        } else {
+          if (mb_col > 0)
+            vp9_loop_filter_simple_mbv(y_ptr, post->y_stride,
+                                        lfi_n->mblim[filter_level]);
+
+          if (!skip_lf)
+            vp9_loop_filter_simple_bv(y_ptr, post->y_stride,
+                                      lfi_n->blim[filter_level]);
+
+          vp9_loop_filter_simple_mbh(y_ptr, post->y_stride,
+                                     lfi_n->mblim[filter_level]);
+
+          if (!skip_lf)
+            vp9_loop_filter_simple_bh(y_ptr, post->y_stride,
+                                      lfi_n->blim[filter_level]);
+        }
+      }
+
+      y_ptr += 16;
+      mode_info_context += 1;      /* step to next MB */
+    }
+
+    y_ptr += post->y_stride  * 16 - post->y_width;
+    mode_info_context += 1;          /* Skip border mb */
+  }
+}
--- /dev/null
+++ b/vp9/common/loopfilter.h
@@ -1,0 +1,104 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef loopfilter_h
+#define loopfilter_h
+
+#include "vpx_ports/mem.h"
+#include "vpx_config.h"
+#include "blockd.h"
+
+#define MAX_LOOP_FILTER 63
+
+typedef enum {
+  NORMAL_LOOPFILTER = 0,
+  SIMPLE_LOOPFILTER = 1
+} LOOPFILTERTYPE;
+
+#if ARCH_ARM
+#define SIMD_WIDTH 1
+#else
+#define SIMD_WIDTH 16
+#endif
+
+/* Need to align this structure so when it is declared and
+ * passed it can be loaded into vector registers.
+ */
+typedef struct {
+  DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
+                  mblim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
+  DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
+                  blim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
+  DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
+                  lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
+  DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
+                  hev_thr[4][SIMD_WIDTH]);
+  unsigned char lvl[4][4][4];
+  unsigned char hev_thr_lut[2][MAX_LOOP_FILTER + 1];
+  unsigned char mode_lf_lut[MB_MODE_COUNT];
+} loop_filter_info_n;
+
+struct loop_filter_info {
+  const unsigned char *mblim;
+  const unsigned char *blim;
+  const unsigned char *lim;
+  const unsigned char *hev_thr;
+};
+
+#define prototype_loopfilter(sym) \
+  void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
+           const unsigned char *limit, const unsigned char *thresh, int count)
+
+#define prototype_loopfilter_block(sym) \
+  void sym(unsigned char *y, unsigned char *u, unsigned char *v, \
+           int ystride, int uv_stride, struct loop_filter_info *lfi)
+
+#define prototype_simple_loopfilter(sym) \
+  void sym(unsigned char *y, int ystride, const unsigned char *blimit)
+
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/loopfilter_x86.h"
+#endif
+
+#if ARCH_ARM
+#include "arm/loopfilter_arm.h"
+#endif
+
+typedef void loop_filter_uvfunction(unsigned char *u,   /* source pointer */
+                                    int p,              /* pitch */
+                                    const unsigned char *blimit,
+                                    const unsigned char *limit,
+                                    const unsigned char *thresh,
+                                    unsigned char *v);
+
+/* assorted loopfilter functions which get used elsewhere */
+struct VP9Common;
+struct macroblockd;
+
+void vp9_loop_filter_init(struct VP9Common *cm);
+
+void vp9_loop_filter_frame_init(struct VP9Common *cm,
+                                struct macroblockd *mbd,
+                                int default_filt_lvl);
+
+void vp9_loop_filter_frame(struct VP9Common *cm, struct macroblockd *mbd);
+
+void vp9_loop_filter_partial_frame(struct VP9Common *cm,
+                                   struct macroblockd *mbd,
+                                   int default_filt_lvl);
+
+void vp9_loop_filter_frame_yonly(struct VP9Common *cm,
+                                 struct macroblockd *mbd,
+                                 int default_filt_lvl);
+
+void vp9_loop_filter_update_sharpness(loop_filter_info_n *lfi,
+                                      int sharpness_lvl);
+
+#endif  // loopfilter_h
--- /dev/null
+++ b/vp9/common/loopfilter_filters.c
@@ -1,0 +1,480 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include "vpx_config.h"
+#include "loopfilter.h"
+#include "onyxc_int.h"
+
+typedef unsigned char uc;
+
+static __inline signed char signed_char_clamp(int t) {
+  t = (t < -128 ? -128 : t);
+  t = (t > 127 ? 127 : t);
+  return (signed char) t;
+}
+
+
+/* should we apply any filter at all ( 11111111 yes, 00000000 no) */
+static __inline signed char filter_mask(uc limit, uc blimit,
+                                        uc p3, uc p2, uc p1, uc p0,
+                                        uc q0, uc q1, uc q2, uc q3) {
+  signed char mask = 0;
+  mask |= (abs(p3 - p2) > limit) * -1;
+  mask |= (abs(p2 - p1) > limit) * -1;
+  mask |= (abs(p1 - p0) > limit) * -1;
+  mask |= (abs(q1 - q0) > limit) * -1;
+  mask |= (abs(q2 - q1) > limit) * -1;
+  mask |= (abs(q3 - q2) > limit) * -1;
+  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+  mask = ~mask;
+  return mask;
+}
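+
+/* E.g. with limit = 2 and blimit = 4, a run of equal pixels p3..q3
+ * trips none of the tests, so the mask comes back as ~0 (11111111,
+ * apply the filter); any single step beyond the limits flips it to
+ * 00000000.
+ */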
+
+/* is there high variance internal edge ( 11111111 yes, 00000000 no) */
+static __inline signed char hevmask(uc thresh, uc p1, uc p0, uc q0, uc q1) {
+  signed char hev = 0;
+  hev  |= (abs(p1 - p0) > thresh) * -1;
+  hev  |= (abs(q1 - q0) > thresh) * -1;
+  return hev;
+}
+
+static __inline void filter(signed char mask, uc hev, uc *op1,
+                            uc *op0, uc *oq0, uc *oq1) {
+  signed char ps0, qs0;
+  signed char ps1, qs1;
+  signed char filter, Filter1, Filter2;
+  signed char u;
+
+  ps1 = (signed char) * op1 ^ 0x80;
+  ps0 = (signed char) * op0 ^ 0x80;
+  qs0 = (signed char) * oq0 ^ 0x80;
+  qs1 = (signed char) * oq1 ^ 0x80;
+
+  /* add outer taps if we have high edge variance */
+  filter = signed_char_clamp(ps1 - qs1);
+  filter &= hev;
+
+  /* inner taps */
+  filter = signed_char_clamp(filter + 3 * (qs0 - ps0));
+  filter &= mask;
+
+  /* save bottom 3 bits so that we round one side +4 and the other +3
+   * if it equals 4 we'll set to adjust by -1 to account for the fact
+   * we'd round 3 the other way
+   */
+  Filter1 = signed_char_clamp(filter + 4);
+  Filter2 = signed_char_clamp(filter + 3);
+  Filter1 >>= 3;
+  Filter2 >>= 3;
+  u = signed_char_clamp(qs0 - Filter1);
+  *oq0 = u ^ 0x80;
+  u = signed_char_clamp(ps0 + Filter2);
+  *op0 = u ^ 0x80;
+  filter = Filter1;
+
+  /* outer tap adjustments */
+  filter += 1;
+  filter >>= 1;
+  filter &= ~hev;
+
+  u = signed_char_clamp(qs1 - filter);
+  *oq1 = u ^ 0x80;
+  u = signed_char_clamp(ps1 + filter);
+  *op1 = u ^ 0x80;
+}
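+
+/* Rounding check for the +4/+3 trick above: filter = 5 gives
+ * Filter1 = 9 >> 3 = 1 and Filter2 = 8 >> 3 = 1, so q0 and p0 move by
+ * the same step here; the (Filter1 + 1) >> 1 value then roughly halves
+ * that step for the outer taps when hev is not set.
+ */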
+
+void vp9_loop_filter_horizontal_edge_c
+(
+  unsigned char *s,
+  int p, /* pitch */
+  const unsigned char *blimit,
+  const unsigned char *limit,
+  const unsigned char *thresh,
+  int count
+) {
+  int  hev = 0; /* high edge variance */
+  signed char mask = 0;
+  int i = 0;
+
+  /* loop filter designed to work using chars so that we can make maximum use
+   * of 8 bit simd instructions.
+   */
+  do {
+    mask = filter_mask(limit[0], blimit[0],
+                       s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
+                       s[0 * p], s[1 * p], s[2 * p], s[3 * p]);
+
+    hev = hevmask(thresh[0], s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]);
+
+    filter(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p);
+
+    ++s;
+  } while (++i < count * 8);
+}
+
+void vp9_loop_filter_vertical_edge_c(unsigned char *s,
+                                     int p,
+                                     const unsigned char *blimit,
+                                     const unsigned char *limit,
+                                     const unsigned char *thresh,
+                                     int count) {
+  int  hev = 0; /* high edge variance */
+  signed char mask = 0;
+  int i = 0;
+
+  /* loop filter designed to work using chars so that we can make maximum use
+   * of 8 bit simd instructions.
+   */
+  do {
+    mask = filter_mask(limit[0], blimit[0],
+                       s[-4], s[-3], s[-2], s[-1],
+                       s[0], s[1], s[2], s[3]);
+
+    hev = hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
+
+    filter(mask, hev, s - 2, s - 1, s, s + 1);
+
+    s += p;
+  } while (++i < count * 8);
+}
+
+static __inline signed char flatmask(uc thresh,
+                                     uc p4, uc p3, uc p2, uc p1, uc p0,
+                                     uc q0, uc q1, uc q2, uc q3, uc q4) {
+  signed char flat = 0;
+  flat |= (abs(p1 - p0) > 1) * -1;
+  flat |= (abs(q1 - q0) > 1) * -1;
+  flat |= (abs(p0 - p2) > 1) * -1;
+  flat |= (abs(q0 - q2) > 1) * -1;
+  flat |= (abs(p3 - p0) > 1) * -1;
+  flat |= (abs(q3 - q0) > 1) * -1;
+  flat |= (abs(p4 - p0) > 1) * -1;
+  flat |= (abs(q4 - q0) > 1) * -1;
+  flat = ~flat;
+  return flat;
+}
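+
+/* Note that flatmask() receives thresh but tests against a hard +/-1:
+ * the edge only counts as flat when every tap is within one level of
+ * the center pixels.
+ */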
+
+static __inline void mbfilter(signed char mask, uc hev, uc flat,
+                              uc *op4, uc *op3, uc *op2, uc *op1, uc *op0,
+                              uc *oq0, uc *oq1, uc *oq2, uc *oq3, uc *oq4) {
+  /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
+  if (flat && mask) {
+    unsigned char p0, q0;
+    unsigned char p1, q1;
+    unsigned char p2, q2;
+    unsigned char p3, q3;
+    unsigned char p4, q4;
+
+    p4 = *op4;
+    p3 = *op3;
+    p2 = *op2;
+    p1 = *op1;
+    p0 = *op0;
+    q0 = *oq0;
+    q1 = *oq1;
+    q2 = *oq2;
+    q3 = *oq3;
+    q4 = *oq4;
+
+    *op2 = (p4 + p4 + p3 + p2 + p2 + p1 + p0 + q0 + 4) >> 3;
+    *op1 = (p4 + p3 + p2 + p1 + p1 + p0 + q0 + q1 + 4) >> 3;
+    *op0 = (p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2 + 4) >> 3;
+    *oq0 = (p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3 + 4) >> 3;
+    *oq1 = (p1 + p0 + q0 + q1 + q1 + q2 + q3 + q4 + 4) >> 3;
+    *oq2 = (p0 + q0 + q1 + q2 + q2 + q3 + q4 + q4 + 4) >> 3;
+  } else {
+    signed char ps0, qs0;
+    signed char ps1, qs1;
+    signed char filter, Filter1, Filter2;
+    signed char u;
+
+    ps1 = (signed char) * op1 ^ 0x80;
+    ps0 = (signed char) * op0 ^ 0x80;
+    qs0 = (signed char) * oq0 ^ 0x80;
+    qs1 = (signed char) * oq1 ^ 0x80;
+
+    /* add outer taps if we have high edge variance */
+    filter = signed_char_clamp(ps1 - qs1);
+    filter &= hev;
+
+    /* inner taps */
+    filter = signed_char_clamp(filter + 3 * (qs0 - ps0));
+    filter &= mask;
+
+    Filter1 = signed_char_clamp(filter + 4);
+    Filter2 = signed_char_clamp(filter + 3);
+    Filter1 >>= 3;
+    Filter2 >>= 3;
+
+    u = signed_char_clamp(qs0 - Filter1);
+    *oq0 = u ^ 0x80;
+    u = signed_char_clamp(ps0 + Filter2);
+    *op0 = u ^ 0x80;
+    filter = Filter1;
+
+    /* outer tap adjustments */
+    filter += 1;
+    filter >>= 1;
+    filter &= ~hev;
+
+    u = signed_char_clamp(qs1 - filter);
+    *oq1 = u ^ 0x80;
+    u = signed_char_clamp(ps1 + filter);
+    *op1 = u ^ 0x80;
+  }
+}
+void vp9_mbloop_filter_horizontal_edge_c(unsigned char *s,
+                                         int p,
+                                         const unsigned char *blimit,
+                                         const unsigned char *limit,
+                                         const unsigned char *thresh,
+                                         int count) {
+  signed char hev = 0; /* high edge variance */
+  signed char mask = 0;
+  signed char flat = 0;
+  int i = 0;
+
+  /* loop filter designed to work using chars so that we can make maximum use
+   * of 8 bit simd instructions.
+   */
+  do {
+    mask = filter_mask(limit[0], blimit[0],
+                       s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
+                       s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p]);
+
+    hev = hevmask(thresh[0], s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]);
+
+    flat = flatmask(thresh[0],
+                    s[-5 * p], s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
+                    s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p], s[ 4 * p]);
+    mbfilter(mask, hev, flat,
+             s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
+             s,       s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p);
+
+    ++s;
+  } while (++i < count * 8);
+}
+void vp9_mbloop_filter_vertical_edge_c(unsigned char *s,
+                                       int p,
+                                       const unsigned char *blimit,
+                                       const unsigned char *limit,
+                                       const unsigned char *thresh,
+                                       int count) {
+  signed char hev = 0; /* high edge variance */
+  signed char mask = 0;
+  signed char flat = 0;
+  int i = 0;
+
+  do {
+    mask = filter_mask(limit[0], blimit[0],
+                       s[-4], s[-3], s[-2], s[-1],
+                       s[0], s[1], s[2], s[3]);
+
+    hev = hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
+    flat = flatmask(thresh[0],
+                    s[-5], s[-4], s[-3], s[-2], s[-1],
+                    s[ 0], s[ 1], s[ 2], s[ 3], s[ 4]);
+    mbfilter(mask, hev, flat,
+             s - 5, s - 4, s - 3, s - 2, s - 1,
+             s,     s + 1, s + 2, s + 3, s + 4);
+    s += p;
+  } while (++i < count * 8);
+}
+
+/* should we apply any filter at all ( 11111111 yes, 00000000 no) */
+static __inline signed char simple_filter_mask(uc blimit,
+                                               uc p1, uc p0,
+                                               uc q0, uc q1) {
+  /* Why does this cause problems for win32?
+   * error C2143: syntax error : missing ';' before 'type'
+   *  (void) limit;
+   */
+  signed char mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  <= blimit) * -1;
+  return mask;
+}
+
+static __inline void simple_filter(signed char mask,
+                                   uc *op1, uc *op0,
+                                   uc *oq0, uc *oq1) {
+  signed char filter, Filter1, Filter2;
+  signed char p1 = (signed char) *op1 ^ 0x80;
+  signed char p0 = (signed char) *op0 ^ 0x80;
+  signed char q0 = (signed char) *oq0 ^ 0x80;
+  signed char q1 = (signed char) *oq1 ^ 0x80;
+  signed char u;
+
+  filter = signed_char_clamp(p1 - q1);
+  filter = signed_char_clamp(filter + 3 * (q0 - p0));
+  filter &= mask;
+
+  /* save bottom 3 bits so that we round one side +4 and the other +3 */
+  Filter1 = signed_char_clamp(filter + 4);
+  Filter1 >>= 3;
+  u = signed_char_clamp(q0 - Filter1);
+  *oq0  = u ^ 0x80;
+
+  Filter2 = signed_char_clamp(filter + 3);
+  Filter2 >>= 3;
+  u = signed_char_clamp(p0 + Filter2);
+  *op0 = u ^ 0x80;
+}
+
+void vp9_loop_filter_simple_horizontal_edge_c(unsigned char *s,
+                                              int p,
+                                              const unsigned char *blimit) {
+  signed char mask = 0;
+  int i = 0;
+
+  do {
+    mask = simple_filter_mask(blimit[0],
+                              s[-2 * p], s[-1 * p],
+                              s[0 * p], s[1 * p]);
+    simple_filter(mask,
+                  s - 2 * p, s - 1 * p,
+                  s, s + 1 * p);
+    ++s;
+  } while (++i < 16);
+}
+
+void vp9_loop_filter_simple_vertical_edge_c(unsigned char *s,
+                                            int p,
+                                            const unsigned char *blimit) {
+  signed char mask = 0;
+  int i = 0;
+
+  do {
+    mask = simple_filter_mask(blimit[0], s[-2], s[-1], s[0], s[1]);
+    simple_filter(mask, s - 2, s - 1, s, s + 1);
+    s += p;
+  } while (++i < 16);
+}
+
+/* Vertical MB Filtering */
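+/* For all the wrappers below, count is in units of 8 pixels along the
+ * edge: 2 covers the full 16-pixel luma edge of a macroblock, 1 covers
+ * an 8-pixel chroma edge. */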
+void vp9_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                           unsigned char *v_ptr, int y_stride, int uv_stride,
+                           struct loop_filter_info *lfi) {
+  vp9_mbloop_filter_vertical_edge_c(y_ptr, y_stride,
+                                    lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+  if (u_ptr)
+    vp9_mbloop_filter_vertical_edge_c(u_ptr, uv_stride,
+                                      lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+  if (v_ptr)
+    vp9_mbloop_filter_vertical_edge_c(v_ptr, uv_stride,
+                                      lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
+
+/* Vertical B Filtering */
+void vp9_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                          unsigned char *v_ptr, int y_stride, int uv_stride,
+                          struct loop_filter_info *lfi) {
+  vp9_loop_filter_vertical_edge_c(y_ptr + 4, y_stride,
+                                  lfi->blim, lfi->lim, lfi->hev_thr, 2);
+  vp9_loop_filter_vertical_edge_c(y_ptr + 8, y_stride,
+                                  lfi->blim, lfi->lim, lfi->hev_thr, 2);
+  vp9_loop_filter_vertical_edge_c(y_ptr + 12, y_stride,
+                                  lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+  if (u_ptr)
+    vp9_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride,
+                                    lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+  if (v_ptr)
+    vp9_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride,
+                                    lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+/* Horizontal MB filtering */
+void vp9_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                           unsigned char *v_ptr, int y_stride, int uv_stride,
+                           struct loop_filter_info *lfi) {
+  vp9_mbloop_filter_horizontal_edge_c(y_ptr, y_stride,
+                                      lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+  if (u_ptr)
+    vp9_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride,
+                                        lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+  if (v_ptr)
+    vp9_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride,
+                                        lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
+
+/* Horizontal B Filtering */
+void vp9_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                          unsigned char *v_ptr, int y_stride, int uv_stride,
+                          struct loop_filter_info *lfi) {
+  vp9_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride,
+                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);
+  vp9_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride,
+                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);
+  vp9_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride,
+                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+  if (u_ptr)
+    vp9_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride,
+                                      lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+  if (v_ptr)
+    vp9_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride,
+                                      lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp9_loop_filter_bh8x8_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                             unsigned char *v_ptr, int y_stride, int uv_stride,
+                             struct loop_filter_info *lfi) {
+  vp9_mbloop_filter_horizontal_edge_c(
+    y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+}
+
+void vp9_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride,
+                           const unsigned char *blimit) {
+  vp9_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride,
+                                           y_stride, blimit);
+  vp9_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride,
+                                           y_stride, blimit);
+  vp9_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride,
+                                           y_stride, blimit);
+}
+
+void vp9_loop_filter_bv8x8_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                             unsigned char *v_ptr, int y_stride, int uv_stride,
+                             struct loop_filter_info *lfi) {
+  vp9_mbloop_filter_vertical_edge_c(
+    y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+}
+
+void vp9_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride,
+                           const unsigned char *blimit) {
+  vp9_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, blimit);
+  vp9_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, blimit);
+  vp9_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, blimit);
+}
--- /dev/null
+++ b/vp9/common/maskingmv.c
@@ -1,0 +1,806 @@
+/*
+ ============================================================================
+ Name        : maskingmv.c
+ Author      : jimbankoski
+ Version     :
+ Copyright   :
+ Description : Standalone experiment with color-segmentation based masked
+               motion search and prediction
+ ============================================================================
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+extern unsigned int vp9_sad16x16_sse3(
+  unsigned char *src_ptr,
+  int  src_stride,
+  unsigned char *ref_ptr,
+  int  ref_stride,
+  int  max_err);
+
+extern void vp9_sad16x16x3_sse3(
+  unsigned char *src_ptr,
+  int  src_stride,
+  unsigned char *ref_ptr,
+  int  ref_stride,
+  int  *results);
+
+extern int vp8_growmaskmb_sse3(
+  unsigned char *om,
+  unsigned char *nm);
+
+extern void vp8_makemask_sse3(
+  unsigned char *y,
+  unsigned char *u,
+  unsigned char *v,
+  unsigned char *ym,
+  int yp,
+  int uvp,
+  int ys,
+  int us,
+  int vs,
+  int yt,
+  int ut,
+  int vt);
+
+unsigned int vp9_sad16x16_unmasked_wmt(
+  unsigned char *src_ptr,
+  int  src_stride,
+  unsigned char *ref_ptr,
+  int  ref_stride,
+  unsigned char *mask);
+
+unsigned int vp9_sad16x16_masked_wmt(
+  unsigned char *src_ptr,
+  int  src_stride,
+  unsigned char *ref_ptr,
+  int  ref_stride,
+  unsigned char *mask);
+
+unsigned int vp8_masked_predictor_wmt(
+  unsigned char *masked,
+  unsigned char *unmasked,
+  int  src_stride,
+  unsigned char *dst_ptr,
+  int  dst_stride,
+  unsigned char *mask);
+unsigned int vp8_masked_predictor_uv_wmt(
+  unsigned char *masked,
+  unsigned char *unmasked,
+  int  src_stride,
+  unsigned char *dst_ptr,
+  int  dst_stride,
+  unsigned char *mask);
+unsigned int vp8_uv_from_y_mask(
+  unsigned char *ymask,
+  unsigned char *uvmask);
+int yp = 16;
+unsigned char sxy[] = {
+  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90
+};
+
+unsigned char sts[] = {
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+};
+unsigned char str[] = {
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+};
+
+unsigned char y[] = {
+  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
+  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
+  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40,
+  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40,
+  40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40,
+  60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40,
+  60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40,
+  60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40,
+  40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40,
+  40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40,
+  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40,
+  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40,
+  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
+  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
+  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
+  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40
+};
+int uvp = 8;
+unsigned char u[] = {
+  90, 80, 70, 70, 90, 90, 90, 17,
+  90, 80, 70, 70, 90, 90, 90, 17,
+  84, 70, 70, 90, 90, 90, 17, 17,
+  84, 70, 70, 90, 90, 90, 17, 17,
+  80, 70, 70, 90, 90, 90, 17, 17,
+  90, 80, 70, 70, 90, 90, 90, 17,
+  90, 80, 70, 70, 90, 90, 90, 17,
+  90, 80, 70, 70, 90, 90, 90, 17
+};
+
+unsigned char v[] = {
+  80, 80, 80, 80, 80, 80, 80, 80,
+  80, 80, 80, 80, 80, 80, 80, 80,
+  80, 80, 80, 80, 80, 80, 80, 80,
+  80, 80, 80, 80, 80, 80, 80, 80,
+  80, 80, 80, 80, 80, 80, 80, 80,
+  80, 80, 80, 80, 80, 80, 80, 80,
+  80, 80, 80, 80, 80, 80, 80, 80,
+  80, 80, 80, 80, 80, 80, 80, 80
+};
+
+unsigned char ym[256];
+unsigned char uvm[64];
+typedef struct {
+  unsigned char y;
+  unsigned char yt;
+  unsigned char u;
+  unsigned char ut;
+  unsigned char v;
+  unsigned char vt;
+  unsigned char use;
+} COLOR_SEG_ELEMENT;
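+/* One color segment: (y, u, v) is the segment's center color and
+ * (yt, ut, vt) are per-channel absolute-difference thresholds used by
+ * pixel_mask(); 'use' appears intended as an enable flag but is not
+ * consulted in this file. */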
+
+/*
+COLOR_SEG_ELEMENT segmentation[]=
+{
+    { 60,4,80,17,80,10, 1},
+    { 40,4,15,10,80,10, 1},
+};
+*/
+
+COLOR_SEG_ELEMENT segmentation[] = {
+  { 79, 44, 92, 44, 237, 60, 1},
+};
+
+unsigned char pixel_mask(unsigned char y, unsigned char u, unsigned char v,
+                         COLOR_SEG_ELEMENT sgm[],
+                         int c) {
+  COLOR_SEG_ELEMENT *s = sgm;
+  unsigned char m = 0;
+  int i;
+  for (i = 0; i < c; i++, s++)
+    m |= (abs(y - s->y) < s->yt &&
+          abs(u - s->u) < s->ut &&
+          abs(v - s->v) < s->vt ? 255 : 0);
+
+  return m;
+}
+int neighbors[256][8];
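+/* neighbors[i] lists the indices within a 16x16 block (row = i >> 4,
+ * col = i & 15) that lie within one pel of position i; unused slots keep
+ * the value i. Built once by makeneighbors() and used by grow_ymask()
+ * to dilate a mask by one pixel. */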
+int makeneighbors(void) {
+  int i, j;
+  for (i = 0; i < 256; i++) {
+    int r = (i >> 4), c = (i & 15);
+    int ni = 0;
+    for (j = 0; j < 8; j++)
+      neighbors[i][j] = i;
+    for (j = 0; j < 256; j++) {
+      int nr = (j >> 4), nc = (j & 15);
+      /* skip j == i: the slots are preinitialized to i above, and counting
+       * it would give interior pixels 9 in-range positions, overrunning
+       * the 8-entry row */
+      if (j != i && abs(nr - r) < 2 && abs(nc - c) < 2)
+        neighbors[i][ni++] = j;
+    }
+  }
+  return 0;
+}
+void grow_ymask(unsigned char *ym) {
+  unsigned char nym[256];
+  int i, j;
+
+  for (i = 0; i < 256; i++) {
+    nym[i] = ym[i];
+    for (j = 0; j < 8; j++) {
+      nym[i] |= ym[neighbors[i][j]];
+    }
+  }
+  for (i = 0; i < 256; i++)
+    ym[i] = nym[i];
+}
+void make_mb_mask(unsigned char *y, unsigned char *u, unsigned char *v,
+                  unsigned char *ym, unsigned char *uvm,
+                  int yp, int uvp,
+                  COLOR_SEG_ELEMENT sgm[],
+                  int count) {
+  int r, c;
+  unsigned char *oym = ym;
+
+  memset(ym, 20, 256);
+  for (r = 0; r < 8; r++, uvm += 8, u += uvp, v += uvp, y += (yp + yp), ym += 32)
+    for (c = 0; c < 8; c++) {
+      int y1 = y[c << 1];
+      int u1 = u[c];
+      int v1 = v[c];
+      int m = pixel_mask(y1, u1, v1, sgm, count);
+      uvm[c] = m;
+      ym[c << 1] = uvm[c]; // = pixel_mask(y[c<<1],u[c],v[c],sgm,count);
+      ym[(c << 1) + 1] = pixel_mask(y[1 + (c << 1)], u[c], v[c], sgm, count);
+      ym[(c << 1) + 16] = pixel_mask(y[yp + (c << 1)], u[c], v[c], sgm, count);
+      ym[(c << 1) + 17] = pixel_mask(y[1 + yp + (c << 1)], u[c], v[c], sgm, count);
+    }
+  grow_ymask(oym);
+}
+
+int masked_sad(unsigned char *src, int p, unsigned char *dst, int dp,
+               unsigned char *ym) {
+  int i, j;
+  unsigned sad = 0;
+  for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16)
+    for (j = 0; j < 16; j++)
+      if (ym[j])
+        sad += abs(src[j] - dst[j]);
+
+  return sad;
+}
+
+int compare_masks(unsigned char *sym, unsigned char *ym) {
+  int i, j;
+  unsigned sad = 0;
+  for (i = 0; i < 16; i++, sym += 16, ym += 16)
+    for (j = 0; j < 16; j++)
+      sad += (sym[j] != ym[j] ? 1 : 0);
+
+  return sad;
+}
+int unmasked_sad(unsigned char *src, int p, unsigned char *dst, int dp,
+                 unsigned char *ym) {
+  int i, j;
+  unsigned sad = 0;
+  for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16)
+    for (j = 0; j < 16; j++)
+      if (!ym[j])
+        sad += abs(src[j] - dst[j]);
+
+  return sad;
+}
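+/* Exhaustive +/-32 pel search that tries two strategies and keeps the
+ * cheaper combined SAD: (a) find the best unmasked mv, derive the mask
+ * from that destination, then find the best masked mv; (b) derive the
+ * mask from the source block, find the destination whose mask matches
+ * best, then find the best unmasked mv for the remainder. Outputs the
+ * masked mv (*mi,*mj), the unmasked mv (*ui,*uj), and *wm, which appears
+ * to select which destination the prediction mask is rebuilt from. */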
+int masked_motion_search(unsigned char *y, unsigned char *u, unsigned char *v,
+                         int yp, int uvp,
+                         unsigned char *dy, unsigned char *du, unsigned char *dv,
+                         int dyp, int duvp,
+                         COLOR_SEG_ELEMENT sgm[],
+                         int count,
+                         int *mi,
+                         int *mj,
+                         int *ui,
+                         int *uj,
+                         int *wm) {
+  int i, j;
+
+  unsigned char ym[256];
+  unsigned char uvm[64];
+  unsigned char dym[256];
+  unsigned char duvm[64];
+  unsigned int e = 0;
+  int beste = 256;
+  int bmi = -32, bmj = -32;
+  int bui = -32, buj = -32;
+  int beste1 = 256;
+  int bmi1 = -32, bmj1 = -32;
+  int bui1 = -32, buj1 = -32;
+  int obeste;
+
+  // first try finding best mask and then unmasked
+  beste = 0xffffffff;
+
+  // find best unmasked mv
+  for (i = -32; i < 32; i++) {
+    unsigned char *dyz = i * dyp + dy;
+    unsigned char *duz = i / 2 * duvp + du;
+    unsigned char *dvz = i / 2 * duvp + dv;
+    for (j = -32; j < 32; j++) {
+      // 0,0  masked destination
+      make_mb_mask(dyz + j, duz + j / 2, dvz + j / 2, dym, duvm, dyp, duvp, sgm, count);
+
+      e = unmasked_sad(y, yp, dyz + j, dyp, dym);
+
+      if (e < beste) {
+        bui = i;
+        buj = j;
+        beste = e;
+      }
+    }
+  }
+  // bui=0;buj=0;
+  // best mv masked destination
+  make_mb_mask(dy + bui * dyp + buj, du + bui / 2 * duvp + buj / 2, dv + bui / 2 * duvp + buj / 2,
+               dym, duvm, dyp, duvp, sgm, count);
+
+  obeste = beste;
+  beste = 0xffffffff;
+
+  // find best masked
+  for (i = -32; i < 32; i++) {
+    unsigned char *dyz = i * dyp + dy;
+    for (j = -32; j < 32; j++) {
+      e = masked_sad(y, yp, dyz + j, dyp, dym);
+
+      if (e < beste) {
+        bmi = i;
+        bmj = j;
+        beste = e;
+      }
+    }
+  }
+  beste1 = beste + obeste;
+  bmi1 = bmi;
+  bmj1 = bmj;
+  bui1 = bui;
+  buj1 = buj;
+
+  beste = 0xffffffff;
+  // source mask
+  make_mb_mask(y, u, v, ym, uvm, yp, uvp, sgm, count);
+
+  // find best mask
+  for (i = -32; i < 32; i++) {
+    unsigned char *dyz = i * dyp + dy;
+    unsigned char *duz = i / 2 * duvp + du;
+    unsigned char *dvz = i / 2 * duvp + dv;
+    for (j = -32; j < 32; j++) {
+      // 0,0  masked destination
+      make_mb_mask(dyz + j, duz + j / 2, dvz + j / 2, dym, duvm, dyp, duvp, sgm, count);
+
+      e = compare_masks(ym, dym);
+
+      if (e < beste) {
+        bmi = i;
+        bmj = j;
+        beste = e;
+      }
+    }
+  }
+
+
+  // best mv masked destination
+  make_mb_mask(dy + bmi * dyp + bmj, du + bmi / 2 * duvp + bmj / 2, dv + bmi / 2 * duvp + bmj / 2,
+               dym, duvm, dyp, duvp, sgm, count);
+
+  obeste = masked_sad(y, yp, dy + bmi * dyp + bmj, dyp, dym);
+
+  beste = 0xffffffff;
+
+  // find best unmasked mv
+  for (i = -32; i < 32; i++) {
+    unsigned char *dyz = i * dyp + dy;
+    for (j = -32; j < 32; j++) {
+      e = unmasked_sad(y, yp, dyz + j, dyp, dym);
+
+      if (e < beste) {
+        bui = i;
+        buj = j;
+        beste = e;
+      }
+    }
+  }
+  beste += obeste;
+
+
+  if (beste < beste1) {
+    *mi = bmi;
+    *mj = bmj;
+    *ui = bui;
+    *uj = buj;
+    *wm = 1;
+  } else {
+    *mi = bmi1;
+    *mj = bmj1;
+    *ui = bui1;
+    *uj = buj1;
+    *wm = 0;
+  }
+  return 0;
+}
+
+int predict(unsigned char *src, int p, unsigned char *dst, int dp,
+            unsigned char *ym, unsigned char *prd) {
+  int i, j;
+  for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16, prd += 16)
+    for (j = 0; j < 16; j++)
+      prd[j] = (ym[j] ? src[j] : dst[j]);
+  return 0;
+}
+
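+/* Same two-strategy search as masked_motion_search() above, but driven by
+ * the SSE3 mask/SAD primitives declared at the top of the file; unlike
+ * the C version it returns the winning combined SAD. */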
+int fast_masked_motion_search(unsigned char *y, unsigned char *u, unsigned char *v,
+                              int yp, int uvp,
+                              unsigned char *dy, unsigned char *du, unsigned char *dv,
+                              int dyp, int duvp,
+                              COLOR_SEG_ELEMENT sgm[],
+                              int count,
+                              int *mi,
+                              int *mj,
+                              int *ui,
+                              int *uj,
+                              int *wm) {
+  int i, j;
+
+  unsigned char ym[256];
+  unsigned char ym2[256];
+  unsigned char uvm[64];
+  unsigned char dym2[256];
+  unsigned char dym[256];
+  unsigned char duvm[64];
+  unsigned int e = 0;
+  int beste = 256;
+  int bmi = -32, bmj = -32;
+  int bui = -32, buj = -32;
+  int beste1 = 256;
+  int bmi1 = -32, bmj1 = -32;
+  int bui1 = -32, buj1 = -32;
+  int obeste;
+
+  // first try finding best mask and then unmasked
+  beste = 0xffffffff;
+
+#if 0
+  for (i = 0; i < 16; i++) {
+    unsigned char *dy = i * yp + y;
+    for (j = 0; j < 16; j++)
+      printf("%2x", dy[j]);
+    printf("\n");
+  }
+  printf("\n");
+
+  for (i = -32; i < 48; i++) {
+    unsigned char *dyz = i * dyp + dy;
+    for (j = -32; j < 48; j++)
+      printf("%2x", dyz[j]);
+    printf("\n");
+  }
+#endif
+
+  // find best unmasked mv
+  for (i = -32; i < 32; i++) {
+    unsigned char *dyz = i * dyp + dy;
+    unsigned char *duz = i / 2 * duvp + du;
+    unsigned char *dvz = i / 2 * duvp + dv;
+    for (j = -32; j < 32; j++) {
+      // 0,0  masked destination
+      vp8_makemask_sse3(dyz + j, duz + j / 2, dvz + j / 2, dym, dyp, duvp,
+                        sgm[0].y, sgm[0].u, sgm[0].v,
+                        sgm[0].yt, sgm[0].ut, sgm[0].vt);
+
+      vp8_growmaskmb_sse3(dym, dym2);
+
+      e = vp9_sad16x16_unmasked_wmt(y, yp, dyz + j, dyp, dym2);
+
+      if (e < beste) {
+        bui = i;
+        buj = j;
+        beste = e;
+      }
+    }
+  }
+  // bui=0;buj=0;
+  // best mv masked destination
+
+  vp8_makemask_sse3(dy + bui * dyp + buj, du + bui / 2 * duvp + buj / 2, dv + bui / 2 * duvp + buj / 2,
+                    dym, dyp, duvp,
+                    sgm[0].y, sgm[0].u, sgm[0].v,
+                    sgm[0].yt, sgm[0].ut, sgm[0].vt);
+
+  vp8_growmaskmb_sse3(dym, dym2);
+
+  obeste = beste;
+  beste = 0xffffffff;
+
+  // find best masked
+  for (i = -32; i < 32; i++) {
+    unsigned char *dyz = i * dyp + dy;
+    for (j = -32; j < 32; j++) {
+      e = vp9_sad16x16_masked_wmt(y, yp, dyz + j, dyp, dym2);
+      if (e < beste) {
+        bmi = i;
+        bmj = j;
+        beste = e;
+      }
+    }
+  }
+  beste1 = beste + obeste;
+  bmi1 = bmi;
+  bmj1 = bmj;
+  bui1 = bui;
+  buj1 = buj;
+
+  // source mask
+  vp8_makemask_sse3(y, u, v,
+                    ym, yp, uvp,
+                    sgm[0].y, sgm[0].u, sgm[0].v,
+                    sgm[0].yt, sgm[0].ut, sgm[0].vt);
+
+  vp8_growmaskmb_sse3(ym, ym2);
+
+  // find best mask
+  for (i = -32; i < 32; i++) {
+    unsigned char *dyz = i * dyp + dy;
+    unsigned char *duz = i / 2 * duvp + du;
+    unsigned char *dvz = i / 2 * duvp + dv;
+    for (j = -32; j < 32; j++) {
+      // 0,0  masked destination
+      vp8_makemask_sse3(dyz + j, duz + j / 2, dvz + j / 2, dym, dyp, duvp,
+                        sgm[0].y, sgm[0].u, sgm[0].v,
+                        sgm[0].yt, sgm[0].ut, sgm[0].vt);
+
+      vp8_growmaskmb_sse3(dym, dym2);
+
+      e = compare_masks(ym2, dym2);
+
+      if (e < beste) {
+        bmi = i;
+        bmj = j;
+        beste = e;
+      }
+    }
+  }
+
+  vp8_makemask_sse3(dy + bmi * dyp + bmj, du + bmi / 2 * duvp + bmj / 2, dv + bmi / 2 * duvp + bmj / 2,
+                    dym, dyp, duvp,
+                    sgm[0].y, sgm[0].u, sgm[0].v,
+                    sgm[0].yt, sgm[0].ut, sgm[0].vt);
+
+  vp8_growmaskmb_sse3(dym, dym2);
+
+  obeste = vp9_sad16x16_masked_wmt(y, yp, dy + bmi * dyp + bmj, dyp, dym2);
+
+  beste = 0xffffffff;
+
+  // find best unmasked mv
+  for (i = -32; i < 32; i++) {
+    unsigned char *dyz = i * dyp + dy;
+    for (j = -32; j < 32; j++) {
+      e = vp9_sad16x16_unmasked_wmt(y, yp, dyz + j, dyp, dym2);
+
+      if (e < beste) {
+        bui = i;
+        buj = j;
+        beste = e;
+      }
+    }
+  }
+  beste += obeste;
+
+  if (beste < beste1) {
+    *mi = bmi;
+    *mj = bmj;
+    *ui = bui;
+    *uj = buj;
+    *wm = 1;
+  } else {
+    *mi = bmi1;
+    *mj = bmj1;
+    *ui = bui1;
+    *uj = buj1;
+    *wm = 0;
+    beste = beste1;
+  }
+  return beste;
+}
+
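+/* Builds the final prediction for one macroblock: regenerates the mask
+ * from whichever destination won (wm), dilates it, then blends the
+ * masked-mv and unmasked-mv predictors for Y, deriving the UV mask from
+ * the Y mask for chroma. */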
+int predict_all(unsigned char *ym, unsigned char *um, unsigned char *vm,
+                int ymp, int uvmp,
+                unsigned char *yp, unsigned char *up, unsigned char *vp,
+                int ypp, int uvpp,
+                COLOR_SEG_ELEMENT sgm[],
+                int count,
+                int mi,
+                int mj,
+                int ui,
+                int uj,
+                int wm) {
+  int i, j;
+  unsigned char dym[256];
+  unsigned char dym2[256];
+  unsigned char duvm[64];
+  unsigned char *yu = ym, *uu = um, *vu = vm;
+
+  unsigned char *dym3 = dym2;
+
+  ym += mi * ymp + mj;
+  um += mi / 2 * uvmp + mj / 2;
+  vm += mi / 2 * uvmp + mj / 2;
+
+  yu += ui * ymp + uj;
+  uu += ui / 2 * uvmp + uj / 2;
+  vu += ui / 2 * uvmp + uj / 2;
+
+  // best mv masked destination
+  if (wm)
+    vp8_makemask_sse3(ym, um, vm, dym, ymp, uvmp,
+                      sgm[0].y, sgm[0].u, sgm[0].v,
+                      sgm[0].yt, sgm[0].ut, sgm[0].vt);
+  else
+    vp8_makemask_sse3(yu, uu, vu, dym, ymp, uvmp,
+                      sgm[0].y, sgm[0].u, sgm[0].v,
+                      sgm[0].yt, sgm[0].ut, sgm[0].vt);
+
+  vp8_growmaskmb_sse3(dym, dym2);
+  vp8_masked_predictor_wmt(ym, yu, ymp, yp, ypp, dym3);
+  vp8_uv_from_y_mask(dym3, duvm);
+  vp8_masked_predictor_uv_wmt(um, uu, uvmp, up, uvpp, duvm);
+  vp8_masked_predictor_uv_wmt(vm, vu, uvmp, vp, uvpp, duvm);
+
+  return 0;
+}
+
+unsigned char f0p[1280 * 720 * 3 / 2];
+unsigned char f1p[1280 * 720 * 3 / 2];
+unsigned char prd[1280 * 720 * 3 / 2];
+unsigned char msk[1280 * 720 * 3 / 2];
+
+
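+/* Standalone test driver (not wired into the codec): reads raw I420
+ * frames sized by argv[3]/argv[4], runs the masked search over interior
+ * 16x16 blocks of each frame pair and writes the blended prediction to
+ * argv[2]. Note the local pointer yp shadows the file-scope int yp. */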
+int mainz(int argc, char *argv[]) {
+  FILE *f = fopen(argv[1], "rb");
+  FILE *g = fopen(argv[2], "wb");
+  int w = atoi(argv[3]), h = atoi(argv[4]);
+  int y_stride = w, uv_stride = w / 2;
+  int r, c;
+  unsigned char *f0 = f0p, *f1 = f1p, *t;
+  unsigned char ym[256], uvm[64];
+  unsigned char ym2[256], uvm2[64];
+  unsigned char ym3[256], uvm3[64];
+  int a, b;
+
+  COLOR_SEG_ELEMENT last = { 20, 20, 20, 20, 230, 20, 1}, best;
+#if 0
+  makeneighbors();
+  COLOR_SEG_ELEMENT segmentation[] = {
+    { 60, 4, 80, 17, 80, 10, 1},
+    { 40, 4, 15, 10, 80, 10, 1},
+  };
+  make_mb_mask(y, u, v, ym2, uvm2, 16, 8, segmentation, 1);
+
+  vp8_makemask_sse3(y, u, v, ym, (int) 16, (int) 8,
+                    (int) segmentation[0].y, (int) segmentation[0].u, (int) segmentation[0].v,
+                    segmentation[0].yt, segmentation[0].ut, segmentation[0].vt);
+
+  vp8_growmaskmb_sse3(ym, ym3);
+
+  a = vp9_sad16x16_masked_wmt(str, 16, sts, 16, ym3);
+  b = vp9_sad16x16_unmasked_wmt(str, 16, sts, 16, ym3);
+
+  vp8_masked_predictor_wmt(str, sts, 16, ym, 16, ym3);
+
+  vp8_uv_from_y_mask(ym3, uvm3);
+
+  return 4;
+#endif
+  makeneighbors();
+
+
+  memset(prd, 128, w * h * 3 / 2);
+
+  fread(f0, w * h * 3 / 2, 1, f);
+
+  while (!feof(f)) {
+    unsigned char *ys = f1, *yd = f0, *yp = prd;
+    unsigned char *us = f1 + w * h, *ud = f0 + w * h, *up = prd + w * h;
+    unsigned char *vs = f1 + w * h * 5 / 4, *vd = f0 + w * h * 5 / 4, *vp = prd + w * h * 5 / 4;
+    fread(f1, w * h * 3 / 2, 1, f);
+
+    ys += 32 * y_stride;
+    yd += 32 * y_stride;
+    yp += 32 * y_stride;
+    us += 16 * uv_stride;
+    ud += 16 * uv_stride;
+    up += 16 * uv_stride;
+    vs += 16 * uv_stride;
+    vd += 16 * uv_stride;
+    vp += 16 * uv_stride;
+    for (r = 32; r < h - 32; r += 16,
+         ys += 16 * w, yd += 16 * w, yp += 16 * w,
+         us += 8 * uv_stride, ud += 8 * uv_stride, up += 8 * uv_stride,
+         vs += 8 * uv_stride, vd += 8 * uv_stride, vp += 8 * uv_stride) {
+      for (c = 32; c < w - 32; c += 16) {
+        int mi, mj, ui, uj, wm;
+        int bmi, bmj, bui, buj, bwm;
+        unsigned char ym[256];
+
+        if (vp9_sad16x16_sse3(ys + c, y_stride, yd + c, y_stride, 0xffff) == 0)
+          bmi = bmj = bui = buj = bwm = 0;
+        else {
+          COLOR_SEG_ELEMENT cs[5];
+          int j;
+          unsigned int beste = 0xffffffff;
+          unsigned int bestj = 0;
+
+          // try color from last mb segmentation
+          cs[0] = last;
+
+          // try color segs from 4 pixels in mb recon as segmentation
+          cs[1].y = yd[c + y_stride + 1];
+          cs[1].u = ud[c / 2 + uv_stride];
+          cs[1].v = vd[c / 2 + uv_stride];
+          cs[1].yt = cs[1].ut = cs[1].vt = 20;
+          cs[2].y = yd[c + w + 14];
+          cs[2].u = ud[c / 2 + uv_stride + 7];
+          cs[2].v = vd[c / 2 + uv_stride + 7];
+          cs[2].yt = cs[2].ut = cs[2].vt = 20;
+          cs[3].y = yd[c + w * 14 + 1];
+          cs[3].u = ud[c / 2 + uv_stride * 7];
+          cs[3].v = vd[c / 2 + uv_stride * 7];
+          cs[3].yt = cs[3].ut = cs[3].vt = 20;
+          cs[4].y = yd[c + w * 14 + 14];
+          cs[4].u = ud[c / 2 + uv_stride * 7 + 7];
+          cs[4].v = vd[c / 2 + uv_stride * 7 + 7];
+          cs[4].yt = cs[4].ut = cs[4].vt = 20;
+
+          for (j = 0; j < 5; j++) {
+            int e;
+
+            e = fast_masked_motion_search(
+                  ys + c, us + c / 2, vs + c / 2, y_stride, uv_stride,
+                  yd + c, ud + c / 2, vd + c / 2, y_stride, uv_stride,
+                  &cs[j], 1, &mi, &mj, &ui, &uj, &wm);
+
+            if (e < beste) {
+              bmi = mi;
+              bmj = mj;
+              bui = ui;
+              buj = uj;
+              bwm = wm;
+              bestj = j;
+              beste = e;
+            }
+          }
+          best = cs[bestj];
+          // best = segmentation[0];
+          last = best;
+        }
+        predict_all(yd + c, ud + c / 2, vd + c / 2, w, uv_stride,
+                    yp + c, up + c / 2, vp + c / 2, w, uv_stride,
+                    &best, 1, bmi, bmj, bui, buj, bwm);
+
+      }
+    }
+    fwrite(prd, w * h * 3 / 2, 1, g);
+    t = f0;
+    f0 = f1;
+    f1 = t;
+  }
+  fclose(f);
+  fclose(g);
+  return 0;
+}
--- /dev/null
+++ b/vp9/common/mbpitch.c
@@ -1,0 +1,124 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "blockd.h"
+
+typedef enum {
+  PRED = 0,
+  DEST = 1
+} BLOCKSET;
+
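+/* Selects which set of buffer pointers setup_block() wires up: the
+ * prediction source (PRED, including the second reference used for
+ * compound prediction) or the reconstruction destination (DEST). base2
+ * is only consulted for PRED; mv_stride is currently unused. */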
+static void setup_block(BLOCKD *b,
+                        int mv_stride,
+                        unsigned char **base,
+                        unsigned char **base2,
+                        int Stride,
+                        int offset,
+                        BLOCKSET bs) {
+  if (bs == DEST) {
+    b->dst_stride = Stride;
+    b->dst = offset;
+    b->base_dst = base;
+  } else {
+    b->pre_stride = Stride;
+    b->pre = offset;
+    b->base_pre = base;
+    b->base_second_pre = base2;
+  }
+}
+
+
+static void setup_macroblock(MACROBLOCKD *xd, BLOCKSET bs) {
+  int block;
+
+  unsigned char **y, **u, **v;
+  unsigned char **y2, **u2, **v2;
+  BLOCKD *blockd = xd->block;
+  int stride;
+
+  if (bs == DEST) {
+    y = &xd->dst.y_buffer;
+    u = &xd->dst.u_buffer;
+    v = &xd->dst.v_buffer;
+  } else {
+    y = &xd->pre.y_buffer;
+    u = &xd->pre.u_buffer;
+    v = &xd->pre.v_buffer;
+
+    y2 = &xd->second_pre.y_buffer;
+    u2 = &xd->second_pre.u_buffer;
+    v2 = &xd->second_pre.v_buffer;
+  }
+
+  stride = xd->dst.y_stride;
+  for (block = 0; block < 16; block++) { /* y blocks */
+    setup_block(&blockd[block], stride, y, y2, stride,
+                (block >> 2) * 4 * stride + (block & 3) * 4, bs);
+  }
+
+  stride = xd->dst.uv_stride;
+  for (block = 16; block < 20; block++) { /* U and V blocks */
+    setup_block(&blockd[block], stride, u, u2, stride,
+      ((block - 16) >> 1) * 4 * stride + (block & 1) * 4, bs);
+
+    setup_block(&blockd[block + 4], stride, v, v2, stride,
+      ((block - 16) >> 1) * 4 * stride + (block & 1) * 4, bs);
+  }
+}
+
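+/* Wire each of the 25 BLOCKDs to its slice of the shared diff/predictor
+ * buffers: the 16 Y blocks occupy the first 256 entries, the four U and
+ * four V blocks start at offsets 256 and 320, and block 24 (the
+ * second-order block) sits at 384. */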
+void vp9_setup_block_dptrs(MACROBLOCKD *xd) {
+  int r, c;
+  BLOCKD *blockd = xd->block;
+
+  for (r = 0; r < 4; r++) {
+    for (c = 0; c < 4; c++) {
+      blockd[r * 4 + c].diff = &xd->diff[r * 4 * 16 + c * 4];
+      blockd[r * 4 + c].predictor = xd->predictor + r * 4 * 16 + c * 4;
+    }
+  }
+
+  for (r = 0; r < 2; r++) {
+    for (c = 0; c < 2; c++) {
+      blockd[16 + r * 2 + c].diff = &xd->diff[256 + r * 4 * 8 + c * 4];
+      blockd[16 + r * 2 + c].predictor =
+        xd->predictor + 256 + r * 4 * 8 + c * 4;
+
+    }
+  }
+
+  for (r = 0; r < 2; r++) {
+    for (c = 0; c < 2; c++) {
+      blockd[20 + r * 2 + c].diff = &xd->diff[320 + r * 4 * 8 + c * 4];
+      blockd[20 + r * 2 + c].predictor =
+        xd->predictor + 320 + r * 4 * 8 + c * 4;
+
+    }
+  }
+
+  blockd[24].diff = &xd->diff[384];
+
+  for (r = 0; r < 25; r++) {
+    blockd[r].qcoeff  = xd->qcoeff  + r * 16;
+    blockd[r].dqcoeff = xd->dqcoeff + r * 16;
+  }
+}
+
+void vp9_build_block_doffsets(MACROBLOCKD *xd) {
+  /* set up both the destination and prediction block pointers/strides */
+  setup_macroblock(xd, DEST);
+  setup_macroblock(xd, PRED);
+}
--- /dev/null
+++ b/vp9/common/modecont.c
@@ -1,0 +1,64 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "entropy.h"
+const int vp9_default_mode_contexts[6][4] = {
+  {
+    /* 0 */
+    7,     1,     1,   183
+  },
+  {
+    /* 1 */
+    14,    18,    14,   147
+  },
+  {
+    /* 2 */
+    135,    64,    57,    68
+  },
+  {
+    /* 3 */
+    60,    56,   128,   65
+  },
+  {
+    /* 4 */
+    159,   134,   128,   34
+  },
+  {
+    /* 5 */
+    234,   188,   128,   28
+  },
+};
+const int vp9_default_mode_contexts_a[6][4] = {
+  {
+    /* 0 */
+    4,     1,    1,   143
+  },
+  {
+    /* 1 */
+    7,     9,    7,   107
+  },
+  {
+    /* 2 */
+    95,    34,   57,    68
+  },
+  {
+    /* 3 */
+    95,    56,   128,   65
+  },
+  {
+    /* 4 */
+    159,   67,   128,   34
+  },
+  {
+    /* 5 */
+    234,   94,   128,   28
+  },
+};
--- /dev/null
+++ b/vp9/common/modecont.h
@@ -1,0 +1,17 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_MODECONT_H
+#define __INC_MODECONT_H
+
+extern const int vp9_default_mode_contexts[6][4];
+extern const int vp9_default_mode_contexts_a[6][4];
+#endif
--- /dev/null
+++ b/vp9/common/modecontext.c
@@ -1,0 +1,145 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "entropymode.h"
+
+const unsigned int vp9_kf_default_bmode_counts[VP9_BINTRAMODES][VP9_BINTRAMODES][VP9_BINTRAMODES] = {
+  {
+    /*Above Mode :  0*/
+    { 43438,   2195,    470,    316,    615,    171,    217,    412,    124,    160, }, /* left_mode 0 */
+    {  5722,   2751,    296,    291,     81,     68,     80,    101,    100,    170, }, /* left_mode 1 */
+    {  1629,    201,    307,     25,     47,     16,     34,     72,     19,     28, }, /* left_mode 2 */
+    {   332,    266,     36,    500,     20,     65,     23,     14,    154,    106, }, /* left_mode 3 */
+    {   450,     97,     10,     24,    117,     10,      2,     12,      8,     71, }, /* left_mode 4 */
+    {   384,     49,     29,     44,     12,    162,     51,      5,     87,     42, }, /* left_mode 5 */
+    {   495,     53,    157,     27,     14,     57,    180,     17,     17,     34, }, /* left_mode 6 */
+    {   695,     64,     62,      9,     27,      5,      3,    147,     10,     26, }, /* left_mode 7 */
+    {   230,     54,     20,    124,     16,    125,     29,     12,    283,     37, }, /* left_mode 8 */
+    {   260,     87,     21,    120,     32,     16,     33,     16,     33,    203, }, /* left_mode 9 */
+  },
+  {
+    /*Above Mode :  1*/
+    {  3934,   2573,    355,    137,    128,     87,    133,    117,     37,     27, }, /* left_mode 0 */
+    {  1036,   1929,    278,    135,     27,     37,     48,     55,     41,     91, }, /* left_mode 1 */
+    {   223,    256,    253,     15,     13,      9,     28,     64,      3,      3, }, /* left_mode 2 */
+    {   120,    129,     17,    316,     15,     11,      9,      4,     53,     74, }, /* left_mode 3 */
+    {   129,     58,      6,     11,     38,      2,      0,      5,      2,     67, }, /* left_mode 4 */
+    {    53,     22,     11,     16,      8,     26,     14,      3,     19,     12, }, /* left_mode 5 */
+    {    59,     26,     61,     11,      4,      9,     35,     13,      8,      8, }, /* left_mode 6 */
+    {   101,     52,     40,      8,      5,      2,      8,     59,      2,     20, }, /* left_mode 7 */
+    {    48,     34,     10,     52,      8,     15,      6,      6,     63,     20, }, /* left_mode 8 */
+    {    96,     48,     22,     63,     11,     14,      5,      8,      9,     96, }, /* left_mode 9 */
+  },
+  {
+    /*Above Mode :  2*/
+    {   709,    461,    506,     36,     27,     33,    151,     98,     24,      6, }, /* left_mode 0 */
+    {   201,    375,    442,     27,     13,      8,     46,     58,      6,     19, }, /* left_mode 1 */
+    {   122,    140,    417,      4,     13,      3,     33,     59,      4,      2, }, /* left_mode 2 */
+    {    36,     17,     22,     16,      6,      8,     12,     17,      9,     21, }, /* left_mode 3 */
+    {    51,     15,      7,      1,     14,      0,      4,      5,      3,     22, }, /* left_mode 4 */
+    {    18,     11,     30,      9,      7,     20,     11,      5,      2,      6, }, /* left_mode 5 */
+    {    38,     21,    103,      9,      4,     12,     79,     13,      2,      5, }, /* left_mode 6 */
+    {    64,     17,     66,      2,     12,      4,      2,     65,      4,      5, }, /* left_mode 7 */
+    {    14,      7,      7,     16,      3,     11,      4,     13,     15,     16, }, /* left_mode 8 */
+    {    36,      8,     32,      9,      9,      4,     14,      7,      6,     24, }, /* left_mode 9 */
+  },
+  {
+    /*Above Mode :  3*/
+    {  1340,    173,     36,    119,     30,     10,     13,     10,     20,     26, }, /* left_mode 0 */
+    {   156,    293,     26,    108,      5,     16,      2,      4,     23,     30, }, /* left_mode 1 */
+    {    60,     34,     13,      7,      3,      3,      0,      8,      4,      5, }, /* left_mode 2 */
+    {    72,     64,      1,    235,      3,      9,      2,      7,     28,     38, }, /* left_mode 3 */
+    {    29,     14,      1,      3,      5,      0,      2,      2,      5,     13, }, /* left_mode 4 */
+    {    22,      7,      4,     11,      2,      5,      1,      2,      6,      4, }, /* left_mode 5 */
+    {    18,     14,      5,      6,      4,      3,     14,      0,      9,      2, }, /* left_mode 6 */
+    {    41,     10,      7,      1,      2,      0,      0,     10,      2,      1, }, /* left_mode 7 */
+    {    23,     19,      2,     33,      1,      5,      2,      0,     51,      8, }, /* left_mode 8 */
+    {    33,     26,      7,     53,      3,      9,      3,      3,      9,     19, }, /* left_mode 9 */
+  },
+  {
+    /*Above Mode :  4*/
+    {   410,    165,     43,     31,     66,     15,     30,     54,      8,     17, }, /* left_mode 0 */
+    {   115,     64,     27,     18,     30,      7,     11,     15,      4,     19, }, /* left_mode 1 */
+    {    31,     23,     25,      1,      7,      2,      2,     10,      0,      5, }, /* left_mode 2 */
+    {    17,      4,      1,      6,      8,      2,      7,      5,      5,     21, }, /* left_mode 3 */
+    {   120,     12,      1,      2,     83,      3,      0,      4,      1,     40, }, /* left_mode 4 */
+    {     4,      3,      1,      2,      1,      2,      5,      0,      3,      6, }, /* left_mode 5 */
+    {    10,      2,     13,      6,      6,      6,      8,      2,      4,      5, }, /* left_mode 6 */
+    {    58,     10,      5,      1,     28,      1,      1,     33,      1,      9, }, /* left_mode 7 */
+    {     8,      2,      1,      4,      2,      5,      1,      1,      2,     10, }, /* left_mode 8 */
+    {    76,      7,      5,      7,     18,      2,      2,      0,      5,     45, }, /* left_mode 9 */
+  },
+  {
+    /*Above Mode :  5*/
+    {   444,     46,     47,     20,     14,    110,     60,     14,     60,      7, }, /* left_mode 0 */
+    {    59,     57,     25,     18,      3,     17,     21,      6,     14,      6, }, /* left_mode 1 */
+    {    24,     17,     20,      6,      4,     13,      7,      2,      3,      2, }, /* left_mode 2 */
+    {    13,     11,      5,     14,      4,      9,      2,      4,     15,      7, }, /* left_mode 3 */
+    {     8,      5,      2,      1,      4,      0,      1,      1,      2,     12, }, /* left_mode 4 */
+    {    19,      5,      5,      7,      4,     40,      6,      3,     10,      4, }, /* left_mode 5 */
+    {    16,      5,      9,      1,      1,     16,     26,      2,     10,      4, }, /* left_mode 6 */
+    {    11,      4,      8,      1,      1,      4,      4,      5,      4,      1, }, /* left_mode 7 */
+    {    15,      1,      3,      7,      3,     21,      7,      1,     34,      5, }, /* left_mode 8 */
+    {    18,      5,      1,      3,      4,      3,      7,      1,      2,      9, }, /* left_mode 9 */
+  },
+  {
+    /*Above Mode :  6*/
+    {   476,    149,     94,     13,     14,     77,    291,     27,     23,      3, }, /* left_mode 0 */
+    {    79,     83,     42,     14,      2,     12,     63,      2,      4,     14, }, /* left_mode 1 */
+    {    43,     36,     55,      1,      3,      8,     42,     11,      5,      1, }, /* left_mode 2 */
+    {     9,      9,      6,     16,      1,      5,      6,      3,     11,     10, }, /* left_mode 3 */
+    {    10,      3,      1,      3,     10,      1,      0,      1,      1,      4, }, /* left_mode 4 */
+    {    14,      6,     15,      5,      1,     20,     25,      2,      5,      0, }, /* left_mode 5 */
+    {    28,      7,     51,      1,      0,      8,    127,      6,      2,      5, }, /* left_mode 6 */
+    {    13,      3,      3,      2,      3,      1,      2,      8,      1,      2, }, /* left_mode 7 */
+    {    10,      3,      3,      3,      3,      8,      2,      2,      9,      3, }, /* left_mode 8 */
+    {    13,      7,     11,      4,      0,      4,      6,      2,      5,      8, }, /* left_mode 9 */
+  },
+  {
+    /*Above Mode :  7*/
+    {   376,    135,    119,      6,     32,      8,     31,    224,      9,      3, }, /* left_mode 0 */
+    {    93,     60,     54,      6,     13,      7,      8,     92,      2,     12, }, /* left_mode 1 */
+    {    74,     36,     84,      0,      3,      2,      9,     67,      2,      1, }, /* left_mode 2 */
+    {    19,      4,      4,      8,      8,      2,      4,      7,      6,     16, }, /* left_mode 3 */
+    {    51,      7,      4,      1,     77,      3,      0,     14,      1,     15, }, /* left_mode 4 */
+    {     7,      7,      5,      7,      4,      7,      4,      5,      0,      3, }, /* left_mode 5 */
+    {    18,      2,     19,      2,      2,      4,     12,     11,      1,      2, }, /* left_mode 6 */
+    {   129,      6,     27,      1,     21,      3,      0,    189,      0,      6, }, /* left_mode 7 */
+    {     9,      1,      2,      8,      3,      7,      0,      5,      3,      3, }, /* left_mode 8 */
+    {    20,      4,      5,     10,      4,      2,      7,     17,      3,     16, }, /* left_mode 9 */
+  },
+  {
+    /*Above Mode :  8*/
+    {   617,     68,     34,     79,     11,     27,     25,     14,     75,     13, }, /* left_mode 0 */
+    {    51,     82,     21,     26,      6,     12,     13,      1,     26,     16, }, /* left_mode 1 */
+    {    29,      9,     12,     11,      3,      7,      1,     10,      2,      2, }, /* left_mode 2 */
+    {    17,     19,     11,     74,      4,      3,      2,      0,     58,     13, }, /* left_mode 3 */
+    {    10,      1,      1,      3,      4,      1,      0,      2,      1,      8, }, /* left_mode 4 */
+    {    14,      4,      5,      5,      1,     13,      2,      0,     27,      8, }, /* left_mode 5 */
+    {    10,      3,      5,      4,      1,      7,      6,      4,      5,      1, }, /* left_mode 6 */
+    {    10,      2,      6,      2,      1,      1,      1,      4,      2,      1, }, /* left_mode 7 */
+    {    14,      8,      5,     23,      2,     12,      6,      2,    117,      5, }, /* left_mode 8 */
+    {     9,      6,      2,     19,      1,      6,      3,      2,      9,      9, }, /* left_mode 9 */
+  },
+  {
+    /*Above Mode :  9*/
+    {   680,     73,     22,     38,     42,      5,     11,      9,      6,     28, }, /* left_mode 0 */
+    {   113,    112,     21,     22,     10,      2,      8,      4,      6,     42, }, /* left_mode 1 */
+    {    44,     20,     24,      6,      5,      4,      3,      3,      1,      2, }, /* left_mode 2 */
+    {    40,     23,      7,     71,      5,      2,      4,      1,      7,     22, }, /* left_mode 3 */
+    {    85,      9,      4,      4,     17,      2,      0,      3,      2,     23, }, /* left_mode 4 */
+    {    13,      4,      2,      6,      1,      7,      0,      1,      7,      6, }, /* left_mode 5 */
+    {    26,      6,      8,      3,      2,      3,      8,      1,      5,      4, }, /* left_mode 6 */
+    {    54,      8,      9,      6,      7,      0,      1,     11,      1,      3, }, /* left_mode 7 */
+    {     9,     10,      4,     13,      2,      5,      4,      2,     14,      8, }, /* left_mode 8 */
+    {    92,      9,      5,     19,     15,      3,      3,      1,      6,     58, }, /* left_mode 9 */
+  },
+};
--- /dev/null
+++ b/vp9/common/mv.h
@@ -1,0 +1,26 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_MV_H
+#define __INC_MV_H
+#include "vpx/vpx_integer.h"
+
+typedef struct {
+  short row;
+  short col;
+} MV;
+
+typedef union {
+  uint32_t  as_int;
+  MV        as_mv;
+} int_mv;        /* facilitates faster equality tests and copies */
+
+#endif
--- /dev/null
+++ b/vp9/common/mvref_common.c
@@ -1,0 +1,342 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "mvref_common.h"
+
+#if CONFIG_NEWBESTREFMV
+
+#define MVREF_NEIGHBOURS 8
+static int mv_ref_search[MVREF_NEIGHBOURS][2] =
+  { {0, -1}, {-1, 0}, {-1, -1}, {0, -2},
+    {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2} };
+static int ref_distance_weight[MVREF_NEIGHBOURS] =
+  { 3, 3, 2, 1, 1, 1, 1, 1 };
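+// mv_ref_search holds the (col, row) offsets, in macroblock units, of the
+// eight neighbours examined; ref_distance_weight gives the nearer
+// neighbours a larger base score.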
+
+// clamp_mv
+#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units
+static void clamp_mv(const MACROBLOCKD *xd, int_mv *mv) {
+
+  if (mv->as_mv.col < (xd->mb_to_left_edge - MV_BORDER))
+    mv->as_mv.col = xd->mb_to_left_edge - MV_BORDER;
+  else if (mv->as_mv.col > xd->mb_to_right_edge + MV_BORDER)
+    mv->as_mv.col = xd->mb_to_right_edge + MV_BORDER;
+
+  if (mv->as_mv.row < (xd->mb_to_top_edge - MV_BORDER))
+    mv->as_mv.row = xd->mb_to_top_edge - MV_BORDER;
+  else if (mv->as_mv.row > xd->mb_to_bottom_edge + MV_BORDER)
+    mv->as_mv.row = xd->mb_to_bottom_edge + MV_BORDER;
+}
+
+
+// Gets the best matching candidate reference motion vector
+// from the given mode info structure (if available)
+static int get_candidate_mvref(
+  const MODE_INFO *candidate_mi,
+  MV_REFERENCE_FRAME ref_frame,
+  MV_REFERENCE_FRAME *c_ref_frame,
+  int_mv *c_mv,
+  MV_REFERENCE_FRAME *c2_ref_frame,
+  int_mv *c2_mv
+) {
+
+  int ret_val = FALSE;
+  c2_mv->as_int = 0;
+  *c2_ref_frame = INTRA_FRAME;
+
+  // Target ref frame matches candidate first ref frame
+  if (ref_frame == candidate_mi->mbmi.ref_frame) {
+    c_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
+    *c_ref_frame = ref_frame;
+    ret_val = TRUE;
+
+    // Is there a second non zero vector we can use.
+    if ((candidate_mi->mbmi.second_ref_frame != INTRA_FRAME) &&
+        (candidate_mi->mbmi.mv[1].as_int != 0) &&
+        (candidate_mi->mbmi.mv[1].as_int != c_mv->as_int)) {
+      c2_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
+      *c2_ref_frame = candidate_mi->mbmi.second_ref_frame;
+    }
+
+  // Target ref frame matches candidate second ref frame
+  } else if (ref_frame == candidate_mi->mbmi.second_ref_frame) {
+    c_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
+    *c_ref_frame = ref_frame;
+    ret_val = TRUE;
+
+    // Is there a second non zero vector we can use.
+    if ((candidate_mi->mbmi.ref_frame != INTRA_FRAME) &&
+        (candidate_mi->mbmi.mv[0].as_int != 0) &&
+        (candidate_mi->mbmi.mv[0].as_int != c_mv->as_int)) {
+      c2_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
+      *c2_ref_frame = candidate_mi->mbmi.ref_frame;
+    }
+
+  // No ref frame matches so use first ref mv as first choice
+  } else if (candidate_mi->mbmi.ref_frame != INTRA_FRAME) {
+    c_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
+    *c_ref_frame = candidate_mi->mbmi.ref_frame;
+    ret_val = TRUE;
+
+    // Is there a second non zero vector we can use.
+    if ((candidate_mi->mbmi.second_ref_frame != INTRA_FRAME) &&
+        (candidate_mi->mbmi.mv[1].as_int != 0) &&
+        (candidate_mi->mbmi.mv[1].as_int != c_mv->as_int)) {
+      c2_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
+      *c2_ref_frame = candidate_mi->mbmi.second_ref_frame;
+    }
+
+  // If only the second ref mv is valid (this should not trigger in the
+  // current code base, given the possible compound prediction options).
+  } else if (candidate_mi->mbmi.second_ref_frame != INTRA_FRAME) {
+    c_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
+    *c_ref_frame = candidate_mi->mbmi.second_ref_frame;
+    ret_val = TRUE;
+  }
+
+  return ret_val;
+}
+
+// Performs mv adjustment based on reference frame and clamps the MV
+// if it goes off the edge of the buffer.
+static void scale_mv(
+  MACROBLOCKD *xd,
+  MV_REFERENCE_FRAME this_ref_frame,
+  MV_REFERENCE_FRAME candidate_ref_frame,
+  int_mv *candidate_mv,
+  int *ref_sign_bias
+) {
+
+  if (candidate_ref_frame != this_ref_frame) {
+
+    //int frame_distances[MAX_REF_FRAMES];
+    //int last_distance = 1;
+    //int gf_distance = xd->frames_since_golden;
+    //int arf_distance = xd->frames_till_alt_ref_frame;
+
+    // Sign inversion where appropriate.
+    if (ref_sign_bias[candidate_ref_frame] != ref_sign_bias[this_ref_frame]) {
+      candidate_mv->as_mv.row = -candidate_mv->as_mv.row;
+      candidate_mv->as_mv.col = -candidate_mv->as_mv.col;
+    }
+
+    // Scale based on frame distance if the reference frames are not the same.
+    /*frame_distances[INTRA_FRAME] = 1;   // should never be used
+    frame_distances[LAST_FRAME] = 1;
+    frame_distances[GOLDEN_FRAME] =
+      (xd->frames_since_golden) ? xd->frames_since_golden : 1;
+    frame_distances[ALTREF_FRAME] =
+      (xd->frames_till_alt_ref_frame) ? xd->frames_till_alt_ref_frame : 1;
+
+    if (frame_distances[this_ref_frame] &&
+        frame_distances[candidate_ref_frame]) {
+      candidate_mv->as_mv.row =
+        (short)(((int)(candidate_mv->as_mv.row) *
+                 frame_distances[this_ref_frame]) /
+                frame_distances[candidate_ref_frame]);
+
+      candidate_mv->as_mv.col =
+        (short)(((int)(candidate_mv->as_mv.col) *
+                 frame_distances[this_ref_frame]) /
+                frame_distances[candidate_ref_frame]);
+    }
+    */
+  }
+
+  // Clamp the MV so it does not point out of the frame buffer
+  clamp_mv(xd, candidate_mv);
+}
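+
+// A worked example of the sign inversion above (illustrative; the actual
+// bias values come from the ref_sign_bias array passed in): if ALTREF_FRAME
+// carries the opposite sign bias to LAST_FRAME, a candidate ALTREF vector of
+// (row, col) = (4, -8) is negated to (-4, 8) before being reused as a
+// LAST_FRAME prediction, since the two references lie on opposite sides of
+// the current frame in display order.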
+
+// Adds a new candidate reference vector to the list if indeed it is new.
+// If it is not new then the score of the existing candidate that it matches
+// is increased and the list is resorted.
+static void addmv_and_shuffle(
+  int_mv *mv_list,
+  int *mv_scores,
+  int *index,
+  int_mv candidate_mv,
+  int weight
+) {
+
+  int i = *index;
+  int duplicate_found = FALSE;
+
+  // Check for duplicates. If there is one, increment its score.
+  // A duplicate is defined as a vector with an identical as_int value.
+  while (i > 0) {
+    i--;
+
+    if (candidate_mv.as_int == mv_list[i].as_int) {
+      duplicate_found = TRUE;
+      mv_scores[i] += weight;
+      break;
+    }
+  }
+
+  // If no duplicate was found, add the new vector and give it a weight.
+  if (!duplicate_found) {
+    mv_list[*index].as_int = candidate_mv.as_int;
+    mv_scores[*index] = weight;
+    i = *index;
+    (*index)++;
+  }
+
+  // Reshuffle the list so that the highest scoring mvs are at the top.
+  while (i > 0) {
+    if (mv_scores[i] > mv_scores[i-1]) {
+      int tmp_score = mv_scores[i-1];
+      int_mv tmp_mv = mv_list[i-1];
+
+      mv_scores[i-1] = mv_scores[i];
+      mv_list[i-1] = mv_list[i];
+      mv_scores[i] = tmp_score;
+      mv_list[i] = tmp_mv;
+      i--;
+    } else
+      break;
+  }
+}
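+
+// A short worked trace (illustrative values): with mv_list = {A, B} and
+// mv_scores = {16, 3}, adding B again with weight 14 finds the duplicate,
+// raises B's score to 17, and the reshuffle loop swaps B ahead of A, giving
+// mv_list = {B, A}, mv_scores = {17, 16}. A genuinely new vector C would
+// instead be appended at *index and bubbled up according to its own weight.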
+
+// This function searches the neighbourhood of a given MB/SB and populates a
+// list of candidate reference vectors.
+//
+void vp9_find_mv_refs(
+  MACROBLOCKD *xd,
+  MODE_INFO *here,
+  MODE_INFO *lf_here,
+  MV_REFERENCE_FRAME ref_frame,
+  int_mv *mv_ref_list,
+  int *ref_sign_bias
+) {
+
+  int i;
+  MODE_INFO *candidate_mi;
+  int_mv candidate_mvs[MAX_MV_REFS];
+  int_mv c_refmv;
+  MV_REFERENCE_FRAME c_ref_frame;
+  int_mv c2_refmv;
+  MV_REFERENCE_FRAME c2_ref_frame;
+  int candidate_scores[MAX_MV_REFS];
+  int index = 0;
+  int ref_weight = 0;
+  int valid_mv_ref;
+
+  // Blank the reference vector lists and other local structures.
+  vpx_memset(mv_ref_list, 0, sizeof(int_mv) * MAX_MV_REFS);
+  vpx_memset(candidate_mvs, 0, sizeof(int_mv) * MAX_MV_REFS);
+  vpx_memset(candidate_scores, 0, sizeof(candidate_scores));
+
+  // Populate the list with candidate reference vectors from the two
+  // nearest spatial neighbours.
+  for (i = 0; i < 2; ++i) {
+    if (((mv_ref_search[i][0] << 7) >= xd->mb_to_left_edge) &&
+        ((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) {
+
+      candidate_mi = here + mv_ref_search[i][0] +
+                     (mv_ref_search[i][1] * xd->mode_info_stride);
+
+      valid_mv_ref = get_candidate_mvref(candidate_mi, ref_frame,
+                                         &c_ref_frame, &c_refmv,
+                                         &c2_ref_frame, &c2_refmv);
+
+      // If there is a valid MV candidate then add it to the list
+      if (valid_mv_ref) {
+        scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias);
+        ref_weight = ref_distance_weight[i] +
+                     ((c_ref_frame == ref_frame) << 4);
+
+        addmv_and_shuffle(candidate_mvs, candidate_scores,
+                          &index, c_refmv, ref_weight);
+
+        // If there is a second valid mv then add it as well.
+        if (c2_ref_frame != INTRA_FRAME) {
+          scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias);
+          ref_weight = ref_distance_weight[i] +
+                       ((c2_ref_frame == ref_frame) << 4);
+
+          addmv_and_shuffle(candidate_mvs, candidate_scores,
+                            &index, c2_refmv, ref_weight);
+        }
+      }
+    }
+  }
+
+  // Look at the corresponding vector in the last frame
+  candidate_mi = lf_here;
+  valid_mv_ref = get_candidate_mvref(candidate_mi, ref_frame,
+                                     &c_ref_frame, &c_refmv,
+                                     &c2_ref_frame, &c2_refmv);
+
+  // If there is a valid MV candidate then add it to the list
+  if (valid_mv_ref) {
+    scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias);
+    ref_weight = 2 + ((c_ref_frame == ref_frame) << 4);
+    addmv_and_shuffle(candidate_mvs, candidate_scores,
+                      &index, c_refmv, ref_weight);
+
+    // If there is a second valid mv then add it as well.
+    if (c2_ref_frame != INTRA_FRAME) {
+      scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias);
+      ref_weight = 2 + ((c2_ref_frame == ref_frame) << 4);
+
+      addmv_and_shuffle(candidate_mvs, candidate_scores,
+                        &index, c2_refmv, ref_weight);
+    }
+  }
+
+  // Populate the list with candidate reference vectors from the
+  // remaining spatial neighbours.
+  for (i = 2; i < MVREF_NEIGHBOURS; ++i) {
+    if (((mv_ref_search[i][0] << 7) >= xd->mb_to_left_edge) &&
+        ((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) {
+
+      candidate_mi = here + mv_ref_search[i][0] +
+                     (mv_ref_search[i][1] * xd->mode_info_stride);
+
+      valid_mv_ref = get_candidate_mvref(candidate_mi, ref_frame,
+                                         &c_ref_frame, &c_refmv,
+                                         &c2_ref_frame, &c2_refmv);
+
+      // If there is a valid MV candidate then add it to the list
+      if (valid_mv_ref) {
+        scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias);
+        ref_weight = ref_distance_weight[i] +
+                     ((c_ref_frame == ref_frame) << 4);
+
+        addmv_and_shuffle(candidate_mvs, candidate_scores,
+                          &index, c_refmv, ref_weight);
+
+        // If there is a second valid mv then add it as well.
+        if (c2_ref_frame != INTRA_FRAME) {
+          scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias);
+          ref_weight = ref_distance_weight[i] +
+                       ((c2_ref_frame == ref_frame) << 4);
+
+          addmv_and_shuffle(candidate_mvs, candidate_scores,
+                            &index, c2_refmv, ref_weight);
+        }
+      }
+    }
+  }
+
+  // 0,0 is always a valid reference.
+  for (i = 0; i < index; ++i)
+    if (candidate_mvs[i].as_int == 0)
+      break;
+  if (i == index) {
+    c_refmv.as_int = 0;
+    addmv_and_shuffle(candidate_mvs, candidate_scores,
+                      &index, c_refmv, candidate_scores[3] + 1);
+  }
+
+  // Copy over the candidate list.
+  vpx_memcpy(mv_ref_list, candidate_mvs, sizeof(candidate_mvs));
+}
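+
+// Note on the weighting above (a sketch of the arithmetic, not a spec): a
+// candidate whose reference frame matches ref_frame receives a +16 bonus
+// ((c_ref_frame == ref_frame) << 4) on top of its distance weight. Assuming
+// the entries of ref_distance_weight stay below 16, any matching candidate
+// therefore outranks every non-matching one; the temporal (last-frame)
+// candidate uses a fixed base weight of 2 instead of a distance weight.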
+
+#endif
--- /dev/null
+++ b/vp9/common/mvref_common.h
@@ -1,0 +1,31 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "onyxc_int.h"
+#include "blockd.h"
+
+// MV reference entropy header file.
+#if CONFIG_NEWBESTREFMV
+
+#ifndef __INC_MVREF_COMMON_H
+#define __INC_MVREF_COMMON_H
+
+void vp9_find_mv_refs(
+  MACROBLOCKD *xd,
+  MODE_INFO *here,
+  MODE_INFO *lf_here,
+  MV_REFERENCE_FRAME ref_frame,
+  int_mv *mv_ref_list,
+  int *ref_sign_bias
+);
+
+#endif
+
+#endif
--- /dev/null
+++ b/vp9/common/onyx.h
@@ -1,0 +1,225 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ONYX_H
+#define __INC_ONYX_H
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx/vp8cx.h"
+#include "vpx_scale/yv12config.h"
+#include "type_aliases.h"
+#include "ppflags.h"
+  typedef int *VP9_PTR;
+
+  /* Create/destroy static data structures. */
+
+  typedef enum {
+    NORMAL      = 0,
+    FOURFIVE    = 1,
+    THREEFIVE   = 2,
+    ONETWO      = 3
+
+  } VPX_SCALING;
+
+  typedef enum {
+    VP9_LAST_FLAG = 1,
+    VP9_GOLD_FLAG = 2,
+    VP9_ALT_FLAG = 4
+  } VP9_REFFRAME;
+
+
+  typedef enum {
+    USAGE_STREAM_FROM_SERVER    = 0x0,
+    USAGE_LOCAL_FILE_PLAYBACK   = 0x1,
+    USAGE_CONSTRAINED_QUALITY   = 0x2
+  } END_USAGE;
+
+
+  typedef enum {
+    MODE_GOODQUALITY    = 0x1,
+    MODE_BESTQUALITY    = 0x2,
+    MODE_FIRSTPASS      = 0x3,
+    MODE_SECONDPASS     = 0x4,
+    MODE_SECONDPASS_BEST = 0x5,
+  } MODE;
+
+  typedef enum {
+    FRAMEFLAGS_KEY    = 1,
+    FRAMEFLAGS_GOLDEN = 2,
+    FRAMEFLAGS_ALTREF = 4,
+  } FRAMETYPE_FLAGS;
+
+
+#include <assert.h>
+  static __inline void Scale2Ratio(int mode, int *hr, int *hs) {
+    switch (mode) {
+      case    NORMAL:
+        *hr = 1;
+        *hs = 1;
+        break;
+      case    FOURFIVE:
+        *hr = 4;
+        *hs = 5;
+        break;
+      case    THREEFIVE:
+        *hr = 3;
+        *hs = 5;
+        break;
+      case    ONETWO:
+        *hr = 1;
+        *hs = 2;
+        break;
+      default:
+        *hr = 1;
+        *hs = 1;
+        assert(0);
+        break;
+    }
+  }
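+
+  // Usage sketch (illustrative values): the returned ratio is applied as
+  // dimension * hr / hs, e.g.
+  //
+  //   int hr, hs;
+  //   Scale2Ratio(FOURFIVE, &hr, &hs);    // hr = 4, hs = 5
+  //   int scaled_width = 640 * hr / hs;   // 512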
+
+  typedef struct {
+    int Version;            // 4 bitstream versions defined: 0 = best quality/slowest decode, 3 = lowest quality/fastest decode
+    int Width;              // width of data passed to the compressor
+    int Height;             // height of data passed to the compressor
+    double frame_rate;       // set to the passed-in framerate
+    int target_bandwidth;    // bandwidth to be used in kilobits per second
+
+    int noise_sensitivity;   // parameter used for applying pre-processing blur: recommendation 0
+    int Sharpness;          // parameter used for sharpening output: recommendation 0
+    int cpu_used;
+    unsigned int rc_max_intra_bitrate_pct;
+
+    // mode ->
+    // (0)=Realtime/Live Encoding. This mode is optimized for realtime encoding (for example, capturing
+    //    a television signal or feed from a live camera). ( speed setting controls how fast )
+    // (1)=Good Quality Fast Encoding. The encoder balances quality with the amount of time it takes to
+    //    encode the output. ( speed setting controls how fast )
+    // (2)=One Pass - Best Quality. The encoder places priority on the quality of the output over encoding
+    //    speed. The output is compressed at the highest possible quality. This option takes the longest
+    //    amount of time to encode. ( speed setting ignored )
+    // (3)=Two Pass - First Pass. The encoder generates a file of statistics for use in the second encoding
+    //    pass. ( speed setting controls how fast )
+    // (4)=Two Pass - Second Pass. The encoder uses the statistics that were generated in the first encoding
+    //    pass to create the compressed output. ( speed setting controls how fast )
+    // (5)=Two Pass - Second Pass Best.  The encoder uses the statistics that were generated in the first
+    //    encoding pass to create the compressed output using the highest possible quality, and taking a
+    //    longer amount of time to encode. ( speed setting ignored )
+    int Mode;               //
+
+    // Key Framing Operations
+    int auto_key;            // automatically detect cut scenes and set the keyframes
+    int key_freq;            // maximum distance to key frame.
+
+    int allow_lag;           // allow lagged compression (if 0, lag_in_frames is ignored)
+    int lag_in_frames;        // how many frames to lag before we start encoding
+
+    // ----------------------------------------------------------------
+    // DATARATE CONTROL OPTIONS
+
+    int end_usage; // vbr or cbr
+
+    // buffer targeting aggressiveness
+    int under_shoot_pct;
+    int over_shoot_pct;
+
+    // buffering parameters
+    int starting_buffer_level;  // in seconds
+    int optimal_buffer_level;
+    int maximum_buffer_size;
+
+    // controlling quality
+    int fixed_q;
+    int worst_allowed_q;
+    int best_allowed_q;
+    int cq_level;
+    int lossless;
+
+    // two pass datarate control
+    int two_pass_vbrbias;        // two pass datarate control tweaks
+    int two_pass_vbrmin_section;
+    int two_pass_vbrmax_section;
+    // END DATARATE CONTROL OPTIONS
+    // ----------------------------------------------------------------
+
+
+    // these parameters aren't to be used in the final build; don't use!
+    int play_alternate;
+    int alt_freq;
+
+    int encode_breakout;  // early breakout encode threshold: for video conferencing, recommend 800
+
+    int arnr_max_frames;
+    int arnr_strength;
+    int arnr_type;
+
+    struct vpx_fixed_buf         two_pass_stats_in;
+    struct vpx_codec_pkt_list  *output_pkt_list;
+
+    vp8e_tuning tuning;
+  } VP9_CONFIG;
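+
+  // A minimal configuration sketch (illustrative values only; see the
+  // per-field comments above for the recommended settings):
+  //
+  //   VP9_CONFIG oxcf = {0};
+  //   oxcf.Width = 640;
+  //   oxcf.Height = 480;
+  //   oxcf.frame_rate = 30.0;
+  //   oxcf.target_bandwidth = 1000;   // kilobits per second
+  //   oxcf.Mode = MODE_GOODQUALITY;
+  //   VP9_PTR enc = vp9_create_compressor(&oxcf);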
+
+
+  void vp9_initialize_enc();
+
+  VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf);
+  void vp9_remove_compressor(VP9_PTR *comp);
+
+  void vp9_change_config(VP9_PTR onyx, VP9_CONFIG *oxcf);
+
+// Receive a frame's worth of data. The caller can assume that a copy of this
+// frame is made and not just a copy of the pointer.
+  int vp9_receive_raw_frame(VP9_PTR comp, unsigned int frame_flags,
+                            YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
+                            int64_t end_time_stamp);
+
+  int vp9_get_compressed_data(VP9_PTR comp, unsigned int *frame_flags,
+                              unsigned long *size, unsigned char *dest,
+                              int64_t *time_stamp, int64_t *time_end,
+                              int flush);
+
+  int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest,
+                                vp9_ppflags_t *flags);
+
+  int vp9_use_as_reference(VP9_PTR comp, int ref_frame_flags);
+
+  int vp9_update_reference(VP9_PTR comp, int ref_frame_flags);
+
+  int vp9_get_reference_enc(VP9_PTR comp, VP9_REFFRAME ref_frame_flag,
+                            YV12_BUFFER_CONFIG *sd);
+
+  int vp9_set_reference_enc(VP9_PTR comp, VP9_REFFRAME ref_frame_flag,
+                            YV12_BUFFER_CONFIG *sd);
+
+  int vp9_update_entropy(VP9_PTR comp, int update);
+
+  int vp9_set_roimap(VP9_PTR comp, unsigned char *map,
+                     unsigned int rows, unsigned int cols,
+                     int delta_q[4], int delta_lf[4],
+                     unsigned int threshold[4]);
+
+  int vp9_set_active_map(VP9_PTR comp, unsigned char *map,
+                         unsigned int rows, unsigned int cols);
+
+  int vp9_set_internal_size(VP9_PTR comp,
+                            VPX_SCALING horiz_mode, VPX_SCALING vert_mode);
+
+  int vp9_get_quantizer(VP9_PTR c);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // __INC_ONYX_H
--- /dev/null
+++ b/vp9/common/onyxc_int.h
@@ -1,0 +1,314 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ONYXC_INT_H
+#define __INC_ONYXC_INT_H
+
+#include "vpx_config.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx_rtcd.h"
+#include "loopfilter.h"
+#include "entropymv.h"
+#include "entropy.h"
+#include "entropymode.h"
+#include "idct.h"
+#if CONFIG_POSTPROC
+#include "postproc.h"
+#endif
+
+/*#ifdef PACKET_TESTING*/
+#include "header.h"
+/*#endif*/
+
+/* Create/destroy static data structures. */
+
+void vp9_initialize_common(void);
+
+#define MINQ 0
+
+#define MAXQ 255
+#define QINDEX_BITS 8
+
+#define QINDEX_RANGE (MAXQ + 1)
+
+#define NUM_YV12_BUFFERS 4
+
+#define COMP_PRED_CONTEXTS   2
+
+typedef struct frame_contexts {
+  vp9_prob bmode_prob [VP9_BINTRAMODES - 1];
+  vp9_prob ymode_prob [VP9_YMODES - 1]; /* interframe intra mode probs */
+  vp9_prob uv_mode_prob [VP9_YMODES][VP9_UV_MODES - 1];
+  vp9_prob i8x8_mode_prob [VP9_I8X8_MODES - 1];
+  vp9_prob sub_mv_ref_prob [SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
+  vp9_prob mbsplit_prob [VP9_NUMMBSPLITS - 1];
+  vp9_prob coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+  vp9_prob hybrid_coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+  vp9_prob coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+  vp9_prob hybrid_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+  vp9_prob coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+  vp9_prob hybrid_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+
+  nmv_context nmvc;
+  nmv_context pre_nmvc;
+  vp9_prob pre_bmode_prob [VP9_BINTRAMODES - 1];
+  vp9_prob pre_ymode_prob [VP9_YMODES - 1]; /* interframe intra mode probs */
+  vp9_prob pre_uv_mode_prob [VP9_YMODES][VP9_UV_MODES - 1];
+  vp9_prob pre_i8x8_mode_prob [VP9_I8X8_MODES - 1];
+  vp9_prob pre_sub_mv_ref_prob [SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
+  vp9_prob pre_mbsplit_prob [VP9_NUMMBSPLITS - 1];
+  unsigned int bmode_counts [VP9_BINTRAMODES];
+  unsigned int ymode_counts [VP9_YMODES];   /* interframe intra mode probs */
+  unsigned int uv_mode_counts [VP9_YMODES][VP9_UV_MODES];
+  unsigned int i8x8_mode_counts [VP9_I8X8_MODES];   /* interframe intra mode probs */
+  unsigned int sub_mv_ref_counts [SUBMVREF_COUNT][VP9_SUBMVREFS];
+  unsigned int mbsplit_counts [VP9_NUMMBSPLITS];
+
+  vp9_prob pre_coef_probs [BLOCK_TYPES] [COEF_BANDS]
+      [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+  vp9_prob pre_hybrid_coef_probs [BLOCK_TYPES] [COEF_BANDS]
+      [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+
+  vp9_prob pre_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS]
+      [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+  vp9_prob pre_hybrid_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS]
+      [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+
+  vp9_prob pre_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]
+      [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+  vp9_prob pre_hybrid_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]
+      [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+
+  unsigned int coef_counts [BLOCK_TYPES] [COEF_BANDS]
+      [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+  unsigned int hybrid_coef_counts [BLOCK_TYPES] [COEF_BANDS]
+      [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+
+  unsigned int coef_counts_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS]
+      [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+  unsigned int hybrid_coef_counts_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS]
+      [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+
+  unsigned int coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]
+      [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+  unsigned int hybrid_coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]
+      [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+
+  nmv_context_counts NMVcount;
+  vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
+                                 [VP9_SWITCHABLE_FILTERS - 1];
+
+  int mode_context[6][4];
+  int mode_context_a[6][4];
+  int vp8_mode_contexts[6][4];
+  int mv_ref_ct[6][4][2];
+  int mv_ref_ct_a[6][4][2];
+} FRAME_CONTEXT;
+
+typedef enum {
+  RECON_CLAMP_REQUIRED        = 0,
+  RECON_CLAMP_NOTREQUIRED     = 1
+} CLAMP_TYPE;
+
+typedef enum {
+  SINGLE_PREDICTION_ONLY = 0,
+  COMP_PREDICTION_ONLY   = 1,
+  HYBRID_PREDICTION      = 2,
+  NB_PREDICTION_TYPES    = 3,
+} COMPPREDMODE_TYPE;
+
+typedef enum {
+  ONLY_4X4            = 0,
+  ALLOW_8X8           = 1,
+  ALLOW_16X16         = 2,
+  TX_MODE_SELECT      = 3,
+  NB_TXFM_MODES       = 4,
+} TXFM_MODE;
+
+typedef struct VP9_COMMON_RTCD {
+#if CONFIG_RUNTIME_CPU_DETECT
+  vp9_idct_rtcd_vtable_t        idct;
+  vp9_subpix_rtcd_vtable_t      subpix;
+#if CONFIG_POSTPROC
+  vp9_postproc_rtcd_vtable_t    postproc;
+#endif
+  int                           flags;
+#else
+  int unused;
+#endif
+} VP9_COMMON_RTCD;
+
+typedef struct VP9Common {
+  struct vpx_internal_error_info  error;
+
+  DECLARE_ALIGNED(16, short, Y1dequant[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, Y2dequant[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, UVdequant[QINDEX_RANGE][16]);
+
+  int Width;
+  int Height;
+  int horiz_scale;
+  int vert_scale;
+
+  YUV_TYPE clr_type;
+  CLAMP_TYPE  clamp_type;
+
+  YV12_BUFFER_CONFIG *frame_to_show;
+
+  YV12_BUFFER_CONFIG yv12_fb[NUM_YV12_BUFFERS];
+  int fb_idx_ref_cnt[NUM_YV12_BUFFERS];
+  int new_fb_idx, lst_fb_idx, gld_fb_idx, alt_fb_idx;
+
+  YV12_BUFFER_CONFIG post_proc_buffer;
+  YV12_BUFFER_CONFIG temp_scale_frame;
+
+
+  FRAME_TYPE last_frame_type;  /* Save last frame's frame type for motion search. */
+  FRAME_TYPE frame_type;
+
+  int show_frame;
+
+  int frame_flags;
+  int MBs;
+  int mb_rows;
+  int mb_cols;
+  int mode_info_stride;
+
+  /* profile settings */
+  int experimental;
+  int mb_no_coeff_skip;
+  TXFM_MODE txfm_mode;
+  COMPPREDMODE_TYPE comp_pred_mode;
+  int no_lpf;
+  int use_bilinear_mc_filter;
+  int full_pixel;
+
+  int base_qindex;
+  int last_kf_gf_q;  /* Q used on the last GF or KF */
+
+  int y1dc_delta_q;
+  int y2dc_delta_q;
+  int y2ac_delta_q;
+  int uvdc_delta_q;
+  int uvac_delta_q;
+
+  unsigned int frames_since_golden;
+  unsigned int frames_till_alt_ref_frame;
+
+  /* We allocate a MODE_INFO struct for each macroblock, together with
+     an extra row on top and column on the left to simplify prediction. */
+
+  MODE_INFO *mip; /* Base of allocated array */
+  MODE_INFO *mi;  /* Corresponds to upper left visible macroblock */
+  MODE_INFO *prev_mip; /* MODE_INFO array 'mip' from last decoded frame */
+  MODE_INFO *prev_mi;  /* 'mi' from last frame (points into prev_mip) */
+
+
+  // Persistent mb segment id map used in prediction.
+  unsigned char *last_frame_seg_map;
+
+  INTERPOLATIONFILTERTYPE mcomp_filter_type;
+  LOOPFILTERTYPE filter_type;
+
+  loop_filter_info_n lf_info;
+
+  int filter_level;
+  int last_sharpness_level;
+  int sharpness_level;
+
+  int refresh_last_frame;       /* Two state 0 = NO, 1 = YES */
+  int refresh_golden_frame;     /* Two state 0 = NO, 1 = YES */
+  int refresh_alt_ref_frame;     /* Two state 0 = NO, 1 = YES */
+
+  int copy_buffer_to_gf;         /* 0 none, 1 Last to GF, 2 ARF to GF */
+  int copy_buffer_to_arf;        /* 0 none, 1 Last to ARF, 2 GF to ARF */
+
+  int refresh_entropy_probs;    /* Two state 0 = NO, 1 = YES */
+
+  int ref_frame_sign_bias[MAX_REF_FRAMES];    /* Two state 0, 1 */
+
+  /* Y,U,V,Y2 */
+  ENTROPY_CONTEXT_PLANES *above_context;   /* row of context for each plane */
+  ENTROPY_CONTEXT_PLANES left_context[2];  /* (up to) 4 contexts "" */
+
+  /* keyframe block modes are predicted by their above, left neighbors */
+
+  vp9_prob kf_bmode_prob [VP9_BINTRAMODES] [VP9_BINTRAMODES] [VP9_BINTRAMODES - 1];
+  vp9_prob kf_ymode_prob[8][VP9_YMODES - 1]; /* keyframe "" */
+#if CONFIG_SUPERBLOCKS
+  vp9_prob sb_kf_ymode_prob[8][VP9_I32X32_MODES - 1];
+#endif
+  int kf_ymode_probs_index;
+  int kf_ymode_probs_update;
+  vp9_prob kf_uv_mode_prob[VP9_YMODES] [VP9_UV_MODES - 1];
+
+  vp9_prob prob_intra_coded;
+  vp9_prob prob_last_coded;
+  vp9_prob prob_gf_coded;
+#if CONFIG_SUPERBLOCKS
+  vp9_prob sb_coded;
+#endif
+
+  // Context probabilities when using predictive coding of segment id
+  vp9_prob segment_pred_probs[PREDICTION_PROBS];
+  unsigned char temporal_update;
+
+  // Context probabilities for reference frame prediction
+  unsigned char ref_scores[MAX_REF_FRAMES];
+  vp9_prob ref_pred_probs[PREDICTION_PROBS];
+  vp9_prob mod_refprobs[MAX_REF_FRAMES][PREDICTION_PROBS];
+
+  vp9_prob prob_comppred[COMP_PRED_CONTEXTS];
+
+  // FIXME contextualize
+  vp9_prob prob_tx[TX_SIZE_MAX - 1];
+
+  vp9_prob mbskip_pred_probs[MBSKIP_CONTEXTS];
+
+  FRAME_CONTEXT lfc_a; /* last alt ref entropy */
+  FRAME_CONTEXT lfc; /* last frame entropy */
+  FRAME_CONTEXT fc;  /* this frame entropy */
+
+  // int mv_ref_ct[6][4][2];
+  // int mv_ref_ct_a[6][4][2];
+  // int mode_context[6][4];
+  // int mode_context_a[6][4];
+  // int vp8_mode_contexts[6][4];
+
+  unsigned int current_video_frame;
+  int near_boffset[3];
+  int version;
+
+#ifdef PACKET_TESTING
+  VP9_HEADER oh;
+#endif
+  double bitrate;
+  double framerate;
+
+#if CONFIG_RUNTIME_CPU_DETECT
+  VP9_COMMON_RTCD rtcd;
+#endif
+
+#if CONFIG_POSTPROC
+  struct postproc_state  postproc_state;
+#endif
+
+#if CONFIG_PRED_FILTER
+  /* Prediction filter variables */
+  int pred_filter_mode;   // 0=disabled at the frame level (no MB filtered)
+                          // 1=enabled at the frame level (all MB filtered)
+                          // 2=specified per MB (1=filtered, 0=non-filtered)
+  vp9_prob prob_pred_filter_off;
+#endif
+
+} VP9_COMMON;
+
+#endif  // __INC_ONYXC_INT_H
--- /dev/null
+++ b/vp9/common/onyxd.h
@@ -1,0 +1,68 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ONYXD_H
+#define __INC_ONYXD_H
+
+
+/* Create/destroy static data structures. */
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+#include "type_aliases.h"
+#include "vpx_scale/yv12config.h"
+#include "ppflags.h"
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_codec.h"
+
+  typedef void   *VP9D_PTR;
+  typedef struct {
+    int     Width;
+    int     Height;
+    int     Version;
+    int     postprocess;
+    int     max_threads;
+    int     input_partition;
+  } VP9D_CONFIG;
+  typedef enum {
+    VP9_LAST_FLAG = 1,
+    VP9_GOLD_FLAG = 2,
+    VP9_ALT_FLAG = 4
+  } VP9_REFFRAME;
+
+  void vp9_initialize_dec(void);
+
+  int vp9_receive_compressed_data(VP9D_PTR comp, unsigned long size,
+                                  const unsigned char *dest,
+                                  int64_t time_stamp);
+
+  int vp9_get_raw_frame(VP9D_PTR comp, YV12_BUFFER_CONFIG *sd,
+                        int64_t *time_stamp, int64_t *time_end_stamp,
+                        vp9_ppflags_t *flags);
+
+  vpx_codec_err_t vp9_get_reference_dec(VP9D_PTR comp,
+                                        VP9_REFFRAME ref_frame_flag,
+                                        YV12_BUFFER_CONFIG *sd);
+
+  vpx_codec_err_t vp9_set_reference_dec(VP9D_PTR comp,
+                                        VP9_REFFRAME ref_frame_flag,
+                                        YV12_BUFFER_CONFIG *sd);
+
+  VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf);
+
+  void vp9_remove_decompressor(VP9D_PTR comp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // __INC_ONYXD_H
--- /dev/null
+++ b/vp9/common/postproc.c
@@ -1,0 +1,1035 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_scale/yv12config.h"
+#include "postproc.h"
+#include "vpx_scale/yv12extend.h"
+#include "vpx_scale/vpxscale.h"
+#include "systemdependent.h"
+
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#define RGB_TO_YUV(t)                                            \
+  ( (0.257*(float)(t >> 16))  + (0.504*(float)(t >> 8 & 0xff)) + \
+    (0.098*(float)(t & 0xff)) + 16),                             \
+  (-(0.148*(float)(t >> 16))  - (0.291*(float)(t >> 8 & 0xff)) + \
+    (0.439*(float)(t & 0xff)) + 128),                            \
+  ( (0.439*(float)(t >> 16))  - (0.368*(float)(t >> 8 & 0xff)) - \
+    (0.071*(float)(t & 0xff)) + 128)
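+
+/* A sanity check of the macro above (BT.601 studio-swing constants): for
+ * white, t = 0xFFFFFF, every channel is 255, so
+ *   Y = (0.257 + 0.504 + 0.098) * 255 + 16 ~= 235
+ *   U = (-0.148 - 0.291 + 0.439) * 255 + 128 = 128
+ *   V = (0.439 - 0.368 - 0.071) * 255 + 128 = 128
+ * i.e. white maps to (Y, U, V) ~= (235, 128, 128), the studio-range peak. */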
+
+/* global constants */
+#if CONFIG_POSTPROC_VISUALIZER
+static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] = {
+  { RGB_TO_YUV(0x98FB98) },   /* PaleGreen */
+  { RGB_TO_YUV(0x00FF00) },   /* Green */
+  { RGB_TO_YUV(0xADFF2F) },   /* GreenYellow */
+  { RGB_TO_YUV(0x8F0000) },   /* Dark Red */
+  { RGB_TO_YUV(0x008F8F) },   /* Dark Cyan */
+  { RGB_TO_YUV(0x008F8F) },   /* Dark Cyan */
+  { RGB_TO_YUV(0x008F8F) },   /* Dark Cyan */
+  { RGB_TO_YUV(0x8F0000) },   /* Dark Red */
+  { RGB_TO_YUV(0x8F0000) },   /* Dark Red */
+  { RGB_TO_YUV(0x228B22) },   /* ForestGreen */
+  { RGB_TO_YUV(0x006400) },   /* DarkGreen */
+  { RGB_TO_YUV(0x98F5FF) },   /* Cadet Blue */
+  { RGB_TO_YUV(0x6CA6CD) },   /* Sky Blue */
+  { RGB_TO_YUV(0x00008B) },   /* Dark blue */
+  { RGB_TO_YUV(0x551A8B) },   /* Purple */
+  { RGB_TO_YUV(0xCC33FF) },   /* Magenta */
+  { RGB_TO_YUV(0xFF0000) }    /* Red */
+};
+
+static const unsigned char B_PREDICTION_MODE_colors[B_MODE_COUNT][3] = {
+  { RGB_TO_YUV(0x6633ff) },   /* Purple */
+  { RGB_TO_YUV(0xcc33ff) },   /* Magenta */
+  { RGB_TO_YUV(0xff33cc) },   /* Pink */
+  { RGB_TO_YUV(0xff3366) },   /* Coral */
+  { RGB_TO_YUV(0x3366ff) },   /* Blue */
+  { RGB_TO_YUV(0xed00f5) },   /* Dark Blue */
+  { RGB_TO_YUV(0x2e00b8) },   /* Dark Purple */
+  { RGB_TO_YUV(0xff6633) },   /* Orange */
+  { RGB_TO_YUV(0x33ccff) },   /* Light Blue */
+  { RGB_TO_YUV(0x8ab800) },   /* Green */
+  { RGB_TO_YUV(0xffcc33) },   /* Light Orange */
+  { RGB_TO_YUV(0x33ffcc) },   /* Aqua */
+  { RGB_TO_YUV(0x66ff33) },   /* Light Green */
+  { RGB_TO_YUV(0xccff33) },   /* Yellow */
+};
+
+static const unsigned char MV_REFERENCE_FRAME_colors[MAX_REF_FRAMES][3] = {
+  { RGB_TO_YUV(0x00ff00) },   /* Green */
+  { RGB_TO_YUV(0x0000ff) },   /* Blue */
+  { RGB_TO_YUV(0xffff00) },   /* Yellow */
+  { RGB_TO_YUV(0xff0000) },   /* Red */
+};
+#endif
+
+static const short kernel5[] = {
+  1, 1, 4, 1, 1
+};
+
+const short vp9_rv[] = {
+  8, 5, 2, 2, 8, 12, 4, 9, 8, 3,
+  0, 3, 9, 0, 0, 0, 8, 3, 14, 4,
+  10, 1, 11, 14, 1, 14, 9, 6, 12, 11,
+  8, 6, 10, 0, 0, 8, 9, 0, 3, 14,
+  8, 11, 13, 4, 2, 9, 0, 3, 9, 6,
+  1, 2, 3, 14, 13, 1, 8, 2, 9, 7,
+  3, 3, 1, 13, 13, 6, 6, 5, 2, 7,
+  11, 9, 11, 8, 7, 3, 2, 0, 13, 13,
+  14, 4, 12, 5, 12, 10, 8, 10, 13, 10,
+  4, 14, 4, 10, 0, 8, 11, 1, 13, 7,
+  7, 14, 6, 14, 13, 2, 13, 5, 4, 4,
+  0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
+  8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
+  3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
+  3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
+  13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
+  5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
+  9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
+  4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
+  3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
+  11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
+  5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
+  0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
+  10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
+  4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
+  0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
+  8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
+  3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
+  3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
+  13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
+  5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
+  9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
+  4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
+  3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
+  11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
+  5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
+  0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
+  10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
+  4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
+  3, 8, 3, 7, 8, 5, 11, 4, 12, 3,
+  11, 9, 14, 8, 14, 13, 4, 3, 1, 2,
+  14, 6, 5, 4, 4, 11, 4, 6, 2, 1,
+  5, 8, 8, 12, 13, 5, 14, 10, 12, 13,
+  0, 9, 5, 5, 11, 10, 13, 9, 10, 13,
+};
+
+
+extern void vp9_blit_text(const char *msg, unsigned char *address,
+                          const int pitch);
+extern void vp9_blit_line(int x0, int x1, int y0, int y1,
+                          unsigned char *image, const int pitch);
+/****************************************************************************
+ */
+void vp9_post_proc_down_and_across_c(unsigned char *src_ptr,
+                                     unsigned char *dst_ptr,
+                                     int src_pixels_per_line,
+                                     int dst_pixels_per_line,
+                                     int rows,
+                                     int cols,
+                                     int flimit) {
+  unsigned char *p_src, *p_dst;
+  int row;
+  int col;
+  int i;
+  int v;
+  int pitch = src_pixels_per_line;
+  unsigned char d[8];
+  (void)dst_pixels_per_line;
+
+  for (row = 0; row < rows; row++) {
+    /* post_proc_down for one row */
+    p_src = src_ptr;
+    p_dst = dst_ptr;
+
+    for (col = 0; col < cols; col++) {
+
+      int kernel = 4;
+      int v = p_src[col];
+
+      for (i = -2; i <= 2; i++) {
+        if (abs(v - p_src[col + i * pitch]) > flimit)
+          goto down_skip_convolve;
+
+        kernel += kernel5[2 + i] * p_src[col + i * pitch];
+      }
+
+      v = (kernel >> 3);
+    down_skip_convolve:
+      p_dst[col] = v;
+    }
+
+    /* now post_proc_across */
+    p_src = dst_ptr;
+    p_dst = dst_ptr;
+
+    for (i = 0; i < 8; i++)
+      d[i] = p_src[i];
+
+    for (col = 0; col < cols; col++) {
+      int kernel = 4;
+      v = p_src[col];
+
+      d[col & 7] = v;
+
+      for (i = -2; i <= 2; i++) {
+        if (abs(v - p_src[col + i]) > flimit)
+          goto across_skip_convolve;
+
+        kernel += kernel5[2 + i] * p_src[col + i];
+      }
+
+      d[col & 7] = (kernel >> 3);
+    across_skip_convolve:
+
+      if (col >= 2)
+        p_dst[col - 2] = d[(col - 2) & 7];
+    }
+
+    /* handle the last two pixels */
+    p_dst[col - 2] = d[(col - 2) & 7];
+    p_dst[col - 1] = d[(col - 1) & 7];
+
+
+    /* next row */
+    src_ptr += pitch;
+    dst_ptr += pitch;
+  }
+}
+
+static int q2mbl(int x) {
+  if (x < 20) x = 20;
+
+  x = 50 + (x - 50) * 10 / 8;
+  return x * x / 3;
+}
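+
+// Worked example (plain integer arithmetic): q2mbl(100) remaps x to
+// 50 + (100 - 50) * 10 / 8 = 112 and returns 112 * 112 / 3 = 4181, so the
+// macroblock-level filter threshold grows roughly quadratically in q.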
+
+void vp9_mbpost_proc_across_ip_c(unsigned char *src, int pitch,
+                                 int rows, int cols, int flimit) {
+  int r, c, i;
+
+  unsigned char *s = src;
+  unsigned char d[16];
+
+
+  for (r = 0; r < rows; r++) {
+    int sumsq = 0;
+    int sum   = 0;
+
+    for (i = -8; i <= 6; i++) {
+      sumsq += s[i] * s[i];
+      sum   += s[i];
+      d[i + 8] = 0;
+    }
+
+    for (c = 0; c < cols + 8; c++) {
+      int x = s[c + 7] - s[c - 8];
+      int y = s[c + 7] + s[c - 8];
+
+      sum  += x;
+      sumsq += x * y;
+
+      d[c & 15] = s[c];
+
+      if (sumsq * 15 - sum * sum < flimit) {
+        d[c & 15] = (8 + sum + s[c]) >> 4;
+      }
+
+      s[c - 8] = d[(c - 8) & 15];
+    }
+
+    s += pitch;
+  }
+}
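+
+// The running update above uses the identity (a - b) * (a + b) = a^2 - b^2,
+// so sumsq tracks the sum of squares over the sliding 15-tap window without
+// re-summing it. The test sumsq * 15 - sum * sum < flimit is
+// n * sum(x^2) - (sum x)^2 with n = 15, i.e. n^2 times the window variance:
+// only smooth spans are replaced by (8 + sum + s[c]) >> 4, the rounded mean
+// of the window with the centre pixel counted twice.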
+
+void vp9_mbpost_proc_down_c(unsigned char *dst, int pitch,
+                            int rows, int cols, int flimit) {
+  int r, c, i;
+  const short *rv3 = &vp9_rv[63 & rand()];
+
+  for (c = 0; c < cols; c++) {
+    unsigned char *s = &dst[c];
+    int sumsq = 0;
+    int sum   = 0;
+    unsigned char d[16];
+    const short *rv2 = rv3 + ((c * 17) & 127);
+
+    for (i = -8; i <= 6; i++) {
+      sumsq += s[i * pitch] * s[i * pitch];
+      sum   += s[i * pitch];
+    }
+
+    for (r = 0; r < rows + 8; r++) {
+      sumsq += s[7 * pitch] * s[ 7 * pitch] - s[-8 * pitch] * s[-8 * pitch];
+      sum  += s[7 * pitch] - s[-8 * pitch];
+      d[r & 15] = s[0];
+
+      if (sumsq * 15 - sum * sum < flimit) {
+        d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4;
+      }
+
+      s[-8 * pitch] = d[(r - 8) & 15];
+      s += pitch;
+    }
+  }
+}
+
+static void deblock_and_de_macro_block(YV12_BUFFER_CONFIG   *source,
+                                       YV12_BUFFER_CONFIG   *post,
+                                       int                   q,
+                                       int                   low_var_thresh,
+                                       int                   flag,
+                                       vp9_postproc_rtcd_vtable_t *rtcd) {
+  double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
+  int ppl = (int)(level + .5);
+  (void) low_var_thresh;
+  (void) flag;
+
+  POSTPROC_INVOKE(rtcd, downacross)(source->y_buffer, post->y_buffer,
+                                    source->y_stride,  post->y_stride,
+                                    source->y_height, source->y_width,  ppl);
+  POSTPROC_INVOKE(rtcd, across)(post->y_buffer, post->y_stride,
+                                post->y_height, post->y_width, q2mbl(q));
+  POSTPROC_INVOKE(rtcd, down)(post->y_buffer, post->y_stride,
+                              post->y_height, post->y_width, q2mbl(q));
+
+  POSTPROC_INVOKE(rtcd, downacross)(source->u_buffer, post->u_buffer,
+                                    source->uv_stride, post->uv_stride,
+                                    source->uv_height, source->uv_width, ppl);
+  POSTPROC_INVOKE(rtcd, downacross)(source->v_buffer, post->v_buffer,
+                                    source->uv_stride, post->uv_stride,
+                                    source->uv_height, source->uv_width, ppl);
+}
+
+void vp9_deblock(YV12_BUFFER_CONFIG         *source,
+                 YV12_BUFFER_CONFIG         *post,
+                 int                         q,
+                 int                         low_var_thresh,
+                 int                         flag,
+                 vp9_postproc_rtcd_vtable_t *rtcd) {
+  double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
+  int ppl = (int)(level + .5);
+  (void) low_var_thresh;
+  (void) flag;
+
+  POSTPROC_INVOKE(rtcd, downacross)(source->y_buffer, post->y_buffer,
+                                    source->y_stride,  post->y_stride,
+                                    source->y_height, source->y_width,   ppl);
+  POSTPROC_INVOKE(rtcd, downacross)(source->u_buffer, post->u_buffer,
+                                    source->uv_stride, post->uv_stride,
+                                    source->uv_height, source->uv_width, ppl);
+  POSTPROC_INVOKE(rtcd, downacross)(source->v_buffer, post->v_buffer,
+                                    source->uv_stride, post->uv_stride,
+                                    source->uv_height, source->uv_width, ppl);
+}
+
+void vp9_de_noise(YV12_BUFFER_CONFIG         *src,
+                  YV12_BUFFER_CONFIG         *post,
+                  int                         q,
+                  int                         low_var_thresh,
+                  int                         flag,
+                  vp9_postproc_rtcd_vtable_t *rtcd) {
+  double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
+  int ppl = (int)(level + .5);
+  (void) post;
+  (void) low_var_thresh;
+  (void) flag;
+
+  POSTPROC_INVOKE(rtcd, downacross)(src->y_buffer + 2 * src->y_stride + 2,
+                                    src->y_buffer + 2 * src->y_stride + 2,
+                                    src->y_stride,
+                                    src->y_stride,
+                                    src->y_height - 4,
+                                    src->y_width - 4,
+                                    ppl);
+  POSTPROC_INVOKE(rtcd, downacross)(src->u_buffer + 2 * src->uv_stride + 2,
+                                    src->u_buffer + 2 * src->uv_stride + 2,
+                                    src->uv_stride,
+                                    src->uv_stride,
+                                    src->uv_height - 4,
+                                    src->uv_width - 4, ppl);
+  POSTPROC_INVOKE(rtcd, downacross)(src->v_buffer + 2 * src->uv_stride + 2,
+                                    src->v_buffer + 2 * src->uv_stride + 2,
+                                    src->uv_stride,
+                                    src->uv_stride,
+                                    src->uv_height - 4,
+                                    src->uv_width - 4, ppl);
+}
+
+double vp9_gaussian(double sigma, double mu, double x) {
+  return 1 / (sigma * sqrt(2.0 * 3.14159265)) *
+         (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma)));
+}
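+
+// Spot check: vp9_gaussian(1.0, 0.0, 0.0) evaluates the standard normal
+// density at its peak, 1 / sqrt(2 * pi) ~= 0.3989; fillrd() below scales
+// this by 256 to build the noise lookup table.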
+
+static void fillrd(struct postproc_state *state, int q, int a) {
+  char char_dist[300];
+
+  double sigma;
+  int ai = a, qi = q, i;
+
+  vp9_clear_system_state();
+
+  sigma = ai + .5 + .6 * (63 - qi) / 63.0;
+
+  /* set up a lookup table of 256 entries that matches
+   * a gaussian distribution with sigma determined by q.
+   */
+  {
+    double i;
+    int next, j;
+
+    next = 0;
+
+    for (i = -32; i < 32; i++) {
+      int a = (int)(.5 + 256 * vp9_gaussian(sigma, 0, i));
+
+      if (a) {
+        for (j = 0; j < a; j++) {
+          char_dist[next + j] = (char) i;
+        }
+
+        next = next + j;
+      }
+
+    }
+
+    for (; next < 256; next++)
+      char_dist[next] = 0;
+  }
+
+  for (i = 0; i < 3072; i++) {
+    state->noise[i] = char_dist[rand() & 0xff];
+  }
+
+  for (i = 0; i < 16; i++) {
+    state->blackclamp[i] = -char_dist[0];
+    state->whiteclamp[i] = -char_dist[0];
+    state->bothclamp[i] = -2 * char_dist[0];
+  }
+
+  state->last_q = q;
+  state->last_noise = a;
+}
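+
+// The table built above is an inverse-CDF-style sampler (a sketch of the
+// idea): each noise value i in [-32, 32) occupies roughly
+// 256 * vp9_gaussian(sigma, 0, i) consecutive slots of char_dist, so
+// indexing the table with a uniform rand() & 0xff yields approximately
+// Gaussian-distributed noise without evaluating exp() per pixel.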
+
+/****************************************************************************
+ *
+ *  ROUTINE       : plane_add_noise_c
+ *
+ *  INPUTS        : unsigned char *Start  starting address of buffer to
+ *                                        add gaussian noise to
+ *                  unsigned int Width    width of plane
+ *                  unsigned int Height   height of plane
+ *                  int  Pitch    distance between subsequent lines of frame
+ *                  int  q        quantizer used to determine amount of noise
+ *                                  to add
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void.
+ *
+ *  FUNCTION      : adds gaussian noise to a plane of pixels
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+void vp9_plane_add_noise_c(unsigned char *Start, char *noise,
+                           char blackclamp[16],
+                           char whiteclamp[16],
+                           char bothclamp[16],
+                           unsigned int Width, unsigned int Height, int Pitch) {
+  unsigned int i, j;
+
+  for (i = 0; i < Height; i++) {
+    unsigned char *Pos = Start + i * Pitch;
+    char  *Ref = (char *)(noise + (rand() & 0xff));
+
+    for (j = 0; j < Width; j++) {
+      if (Pos[j] < blackclamp[0])
+        Pos[j] = blackclamp[0];
+
+      if (Pos[j] > 255 + whiteclamp[0])
+        Pos[j] = 255 + whiteclamp[0];
+
+      Pos[j] += Ref[j];
+    }
+  }
+}
+
+/* Blend the macro block with a solid colored square.  Leave the
+ * edges unblended to give distinction to macro blocks in areas
+ * filled with the same color block.
+ */
+void vp9_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v,
+                          int y1, int u1, int v1, int alpha, int stride) {
+  int i, j;
+  int y1_const = y1 * ((1 << 16) - alpha);
+  int u1_const = u1 * ((1 << 16) - alpha);
+  int v1_const = v1 * ((1 << 16) - alpha);
+
+  y += 2 * stride + 2;
+  for (i = 0; i < 12; i++) {
+    for (j = 0; j < 12; j++) {
+      y[j] = (y[j] * alpha + y1_const) >> 16;
+    }
+    y += stride;
+  }
+
+  stride >>= 1;
+
+  u += stride + 1;
+  v += stride + 1;
+
+  for (i = 0; i < 6; i++) {
+    for (j = 0; j < 6; j++) {
+      u[j] = (u[j] * alpha + u1_const) >> 16;
+      v[j] = (v[j] * alpha + v1_const) >> 16;
+    }
+    u += stride;
+    v += stride;
+  }
+}
+
+/* Blend only the edge of the macro block.  Leave center
+ * unblended to allow for other visualizations to be layered.
+ */
+void vp9_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v,
+                          int y1, int u1, int v1, int alpha, int stride) {
+  int i, j;
+  int y1_const = y1 * ((1 << 16) - alpha);
+  int u1_const = u1 * ((1 << 16) - alpha);
+  int v1_const = v1 * ((1 << 16) - alpha);
+
+  for (i = 0; i < 2; i++) {
+    for (j = 0; j < 16; j++) {
+      y[j] = (y[j] * alpha + y1_const) >> 16;
+    }
+    y += stride;
+  }
+
+  for (i = 0; i < 12; i++) {
+    y[0]  = (y[0] * alpha  + y1_const) >> 16;
+    y[1]  = (y[1] * alpha  + y1_const) >> 16;
+    y[14] = (y[14] * alpha + y1_const) >> 16;
+    y[15] = (y[15] * alpha + y1_const) >> 16;
+    y += stride;
+  }
+
+  for (i = 0; i < 2; i++) {
+    for (j = 0; j < 16; j++) {
+      y[j] = (y[j] * alpha + y1_const) >> 16;
+    }
+    y += stride;
+  }
+
+  stride >>= 1;
+
+  for (j = 0; j < 8; j++) {
+    u[j] = (u[j] * alpha + u1_const) >> 16;
+    v[j] = (v[j] * alpha + v1_const) >> 16;
+  }
+  u += stride;
+  v += stride;
+
+  for (i = 0; i < 6; i++) {
+    u[0] = (u[0] * alpha + u1_const) >> 16;
+    v[0] = (v[0] * alpha + v1_const) >> 16;
+
+    u[7] = (u[7] * alpha + u1_const) >> 16;
+    v[7] = (v[7] * alpha + v1_const) >> 16;
+
+    u += stride;
+    v += stride;
+  }
+
+  for (j = 0; j < 8; j++) {
+    u[j] = (u[j] * alpha + u1_const) >> 16;
+    v[j] = (v[j] * alpha + v1_const) >> 16;
+  }
+}
+
+void vp9_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v,
+                   int y1, int u1, int v1, int alpha, int stride) {
+  int i, j;
+  int y1_const = y1 * ((1 << 16) - alpha);
+  int u1_const = u1 * ((1 << 16) - alpha);
+  int v1_const = v1 * ((1 << 16) - alpha);
+
+  for (i = 0; i < 4; i++) {
+    for (j = 0; j < 4; j++) {
+      y[j] = (y[j] * alpha + y1_const) >> 16;
+    }
+    y += stride;
+  }
+
+  stride >>= 1;
+
+  for (i = 0; i < 2; i++) {
+    for (j = 0; j < 2; j++) {
+      u[j] = (u[j] * alpha + u1_const) >> 16;
+      v[j] = (v[j] * alpha + v1_const) >> 16;
+    }
+    u += stride;
+    v += stride;
+  }
+}
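+
+/* All three blend helpers above compute the same 16-bit fixed-point lerp:
+ * with y1_const = y1 * (65536 - alpha), the update
+ *   y[j] = (y[j] * alpha + y1_const) >> 16
+ * equals y[j] * (alpha / 65536) + y1 * (1 - alpha / 65536). The visualizer
+ * code below passes alpha = 0xc000, so the source pixel keeps a 3/4 weight
+ * and the overlay colour contributes 1/4. */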
+
+static void constrain_line(int x0, int *x1, int y0, int *y1,
+                           int width, int height) {
+  int dx;
+  int dy;
+
+  if (*x1 > width) {
+    dx = *x1 - x0;
+    dy = *y1 - y0;
+
+    *x1 = width;
+    if (dx)
+      *y1 = ((width - x0) * dy) / dx + y0;
+  }
+  if (*x1 < 0) {
+    dx = *x1 - x0;
+    dy = *y1 - y0;
+
+    *x1 = 0;
+    if (dx)
+      *y1 = ((0 - x0) * dy) / dx + y0;
+  }
+  if (*y1 > height) {
+    dx = *x1 - x0;
+    dy = *y1 - y0;
+
+    *y1 = height;
+    if (dy)
+      *x1 = ((height - y0) * dx) / dy + x0;
+  }
+  if (*y1 < 0) {
+    dx = *x1 - x0;
+    dy = *y1 - y0;
+
+    *y1 = 0;
+    if (dy)
+      *x1 = ((0 - y0) * dx) / dy + x0;
+  }
+}
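+
+// Worked example (illustrative numbers): clipping a segment from
+// (x0, y0) = (10, 0) to (*x1, *y1) = (30, 10) against a 20-pixel-wide frame
+// takes the first branch, clamping *x1 to 20 and moving *y1 to
+// ((20 - 10) * 10) / 20 + 0 = 5, so the drawn motion-vector line keeps its
+// original slope.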
+
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define RTCD_VTABLE(oci) (&(oci)->rtcd.postproc)
+#else
+#define RTCD_VTABLE(oci) NULL
+#endif
+
+int vp9_post_proc_frame(VP9_COMMON *oci, YV12_BUFFER_CONFIG *dest,
+                        vp9_ppflags_t *ppflags) {
+  int q = oci->filter_level * 10 / 6;
+  int flags = ppflags->post_proc_flag;
+  int deblock_level = ppflags->deblocking_level;
+  int noise_level = ppflags->noise_level;
+
+  if (!oci->frame_to_show)
+    return -1;
+
+  if (q > 63)
+    q = 63;
+
+  if (!flags) {
+    *dest = *oci->frame_to_show;
+
+    /* handle problem with extending borders */
+    dest->y_width = oci->Width;
+    dest->y_height = oci->Height;
+    dest->uv_height = dest->y_height / 2;
+    return 0;
+
+  }
+
+#if ARCH_X86 || ARCH_X86_64
+  vpx_reset_mmx_state();
+#endif
+
+  if (flags & VP9D_DEMACROBLOCK) {
+    deblock_and_de_macro_block(oci->frame_to_show, &oci->post_proc_buffer,
+                               q + (deblock_level - 5) * 10, 1, 0,
+                               RTCD_VTABLE(oci));
+  } else if (flags & VP9D_DEBLOCK) {
+    vp9_deblock(oci->frame_to_show, &oci->post_proc_buffer,
+                q, 1, 0, RTCD_VTABLE(oci));
+  } else {
+    vp8_yv12_copy_frame_ptr(oci->frame_to_show, &oci->post_proc_buffer);
+  }
+
+  if (flags & VP9D_ADDNOISE) {
+    if (oci->postproc_state.last_q != q
+        || oci->postproc_state.last_noise != noise_level) {
+      fillrd(&oci->postproc_state, 63 - q, noise_level);
+    }
+
+    POSTPROC_INVOKE(RTCD_VTABLE(oci), addnoise)(oci->post_proc_buffer.y_buffer,
+                                                oci->postproc_state.noise,
+                                                oci->postproc_state.blackclamp,
+                                                oci->postproc_state.whiteclamp,
+                                                oci->postproc_state.bothclamp,
+                                                oci->post_proc_buffer.y_width,
+                                                oci->post_proc_buffer.y_height,
+                                                oci->post_proc_buffer.y_stride);
+  }
+
+#if CONFIG_POSTPROC_VISUALIZER
+  if (flags & VP9D_DEBUG_TXT_FRAME_INFO) {
+    char message[512];
+    sprintf(message, "F%1dG%1dQ%3dF%3dP%d_s%dx%d",
+            (oci->frame_type == KEY_FRAME),
+            oci->refresh_golden_frame,
+            oci->base_qindex,
+            oci->filter_level,
+            flags,
+            oci->mb_cols, oci->mb_rows);
+    vp9_blit_text(message, oci->post_proc_buffer.y_buffer,
+                  oci->post_proc_buffer.y_stride);
+  }
+
+  if (flags & VP9D_DEBUG_TXT_MBLK_MODES) {
+    int i, j;
+    unsigned char *y_ptr;
+    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+    int mb_rows = post->y_height >> 4;
+    int mb_cols = post->y_width  >> 4;
+    int mb_index = 0;
+    MODE_INFO *mi = oci->mi;
+
+    y_ptr = post->y_buffer + 4 * post->y_stride + 4;
+
+    /* vp9_filter each macro block */
+    for (i = 0; i < mb_rows; i++) {
+      for (j = 0; j < mb_cols; j++) {
+        char zz[4];
+
+        sprintf(zz, "%c", mi[mb_index].mbmi.mode + 'a');
+
+        vp9_blit_text(zz, y_ptr, post->y_stride);
+        mb_index++;
+        y_ptr += 16;
+      }
+
+      mb_index++; /* border */
+      y_ptr += post->y_stride  * 16 - post->y_width;
+
+    }
+  }
+
+  if (flags & VP9D_DEBUG_TXT_DC_DIFF) {
+    int i, j;
+    unsigned char *y_ptr;
+    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+    int mb_rows = post->y_height >> 4;
+    int mb_cols = post->y_width  >> 4;
+    int mb_index = 0;
+    MODE_INFO *mi = oci->mi;
+
+    y_ptr = post->y_buffer + 4 * post->y_stride + 4;
+
+    /* vp9_filter each macro block */
+    for (i = 0; i < mb_rows; i++) {
+      for (j = 0; j < mb_cols; j++) {
+        char zz[4];
+        int dc_diff = !(mi[mb_index].mbmi.mode != B_PRED &&
+                        mi[mb_index].mbmi.mode != SPLITMV &&
+                        mi[mb_index].mbmi.mb_skip_coeff);
+
+        if (oci->frame_type == KEY_FRAME)
+          sprintf(zz, "a");
+        else
+          sprintf(zz, "%c", dc_diff + '0');
+
+        vp9_blit_text(zz, y_ptr, post->y_stride);
+        mb_index++;
+        y_ptr += 16;
+      }
+
+      mb_index++; /* border */
+      y_ptr += post->y_stride  * 16 - post->y_width;
+
+    }
+  }
+
+  if (flags & VP9D_DEBUG_TXT_RATE_INFO) {
+    char message[512];
+    snprintf(message, sizeof(message),
+             "Bitrate: %10.2f frame_rate: %10.2f ",
+             oci->bitrate, oci->framerate);
+    vp9_blit_text(message, oci->post_proc_buffer.y_buffer,
+                  oci->post_proc_buffer.y_stride);
+  }
+
+  /* Draw motion vectors */
+  if ((flags & VP9D_DEBUG_DRAW_MV) && ppflags->display_mv_flag) {
+    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+    int width  = post->y_width;
+    int height = post->y_height;
+    unsigned char *y_buffer = oci->post_proc_buffer.y_buffer;
+    int y_stride = oci->post_proc_buffer.y_stride;
+    MODE_INFO *mi = oci->mi;
+    int x0, y0;
+
+    for (y0 = 0; y0 < height; y0 += 16) {
+      for (x0 = 0; x0 < width; x0 += 16) {
+        int x1, y1;
+
+        if (!(ppflags->display_mv_flag & (1 << mi->mbmi.mode))) {
+          mi++;
+          continue;
+        }
+
+        if (mi->mbmi.mode == SPLITMV) {
+          switch (mi->mbmi.partitioning) {
+            case PARTITIONING_16X8 : {  /* mv_top_bottom */
+              union b_mode_info *bmi = &mi->bmi[0];
+              MV *mv = &bmi->mv.as_mv;
+
+              x1 = x0 + 8 + (mv->col >> 3);
+              y1 = y0 + 4 + (mv->row >> 3);
+
+              constrain_line(x0 + 8, &x1, y0 + 4, &y1, width, height);
+              vp9_blit_line(x0 + 8,  x1, y0 + 4,  y1, y_buffer, y_stride);
+
+              bmi = &mi->bmi[8];
+              mv = &bmi->mv.as_mv;
+
+              x1 = x0 + 8 + (mv->col >> 3);
+              y1 = y0 + 12 + (mv->row >> 3);
+
+              constrain_line(x0 + 8, &x1, y0 + 12, &y1, width, height);
+              vp9_blit_line(x0 + 8,  x1, y0 + 12,  y1, y_buffer, y_stride);
+
+              break;
+            }
+            case PARTITIONING_8X16 : {  /* mv_left_right */
+              union b_mode_info *bmi = &mi->bmi[0];
+              MV *mv = &bmi->mv.as_mv;
+
+              x1 = x0 + 4 + (mv->col >> 3);
+              y1 = y0 + 8 + (mv->row >> 3);
+
+              constrain_line(x0 + 4, &x1, y0 + 8, &y1, width, height);
+              vp9_blit_line(x0 + 4,  x1, y0 + 8,  y1, y_buffer, y_stride);
+
+              bmi = &mi->bmi[2];
+              mv = &bmi->mv.as_mv;
+
+              x1 = x0 + 12 + (mv->col >> 3);
+              y1 = y0 + 8 + (mv->row >> 3);
+
+              constrain_line(x0 + 12, &x1, y0 + 8, &y1, width, height);
+              vp9_blit_line(x0 + 12,  x1, y0 + 8,  y1, y_buffer, y_stride);
+
+              break;
+            }
+            case PARTITIONING_8X8 : {  /* mv_quarters   */
+              union b_mode_info *bmi = &mi->bmi[0];
+              MV *mv = &bmi->mv.as_mv;
+
+              x1 = x0 + 4 + (mv->col >> 3);
+              y1 = y0 + 4 + (mv->row >> 3);
+
+              constrain_line(x0 + 4, &x1, y0 + 4, &y1, width, height);
+              vp9_blit_line(x0 + 4,  x1, y0 + 4,  y1, y_buffer, y_stride);
+
+              bmi = &mi->bmi[2];
+              mv = &bmi->mv.as_mv;
+
+              x1 = x0 + 12 + (mv->col >> 3);
+              y1 = y0 + 4 + (mv->row >> 3);
+
+              constrain_line(x0 + 12, &x1, y0 + 4, &y1, width, height);
+              vp9_blit_line(x0 + 12,  x1, y0 + 4,  y1, y_buffer, y_stride);
+
+              bmi = &mi->bmi[8];
+              mv = &bmi->mv.as_mv;
+
+              x1 = x0 + 4 + (mv->col >> 3);
+              y1 = y0 + 12 + (mv->row >> 3);
+
+              constrain_line(x0 + 4, &x1, y0 + 12, &y1, width, height);
+              vp9_blit_line(x0 + 4,  x1, y0 + 12,  y1, y_buffer, y_stride);
+
+              bmi = &mi->bmi[10];
+              mv = &bmi->mv.as_mv;
+
+              x1 = x0 + 12 + (mv->col >> 3);
+              y1 = y0 + 12 + (mv->row >> 3);
+
+              constrain_line(x0 + 12, &x1, y0 + 12, &y1, width, height);
+              vp9_blit_line(x0 + 12,  x1, y0 + 12,  y1, y_buffer, y_stride);
+              break;
+            }
+            case PARTITIONING_4X4:
+            default : {
+              union b_mode_info *bmi = mi->bmi;
+              int bx0, by0;
+
+              for (by0 = y0; by0 < (y0 + 16); by0 += 4) {
+                for (bx0 = x0; bx0 < (x0 + 16); bx0 += 4) {
+                  MV *mv = &bmi->mv.as_mv;
+
+                  x1 = bx0 + 2 + (mv->col >> 3);
+                  y1 = by0 + 2 + (mv->row >> 3);
+
+                  constrain_line(bx0 + 2, &x1, by0 + 2, &y1, width, height);
+                  vp9_blit_line(bx0 + 2,  x1, by0 + 2,  y1, y_buffer, y_stride);
+
+                  bmi++;
+                }
+              }
+            }
+          }
+        } else if (mi->mbmi.mode >= NEARESTMV) {
+          MV *mv = &mi->mbmi.mv.as_mv;
+          const int lx0 = x0 + 8;
+          const int ly0 = y0 + 8;
+
+          x1 = lx0 + (mv->col >> 3);
+          y1 = ly0 + (mv->row >> 3);
+
+          if (x1 != lx0 && y1 != ly0) {
+            constrain_line(lx0, &x1, ly0 - 1, &y1, width, height);
+            vp9_blit_line(lx0,  x1, ly0 - 1,  y1, y_buffer, y_stride);
+
+            constrain_line(lx0, &x1, ly0 + 1, &y1, width, height);
+            vp9_blit_line(lx0,  x1, ly0 + 1,  y1, y_buffer, y_stride);
+          } else
+            vp9_blit_line(lx0,  x1, ly0,  y1, y_buffer, y_stride);
+        }
+
+        mi++;
+      }
+      mi++;
+    }
+  }
+
+  /* Color in block modes */
+  if ((flags & VP9D_DEBUG_CLR_BLK_MODES)
+      && (ppflags->display_mb_modes_flag || ppflags->display_b_modes_flag)) {
+    int y, x;
+    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+    int width  = post->y_width;
+    int height = post->y_height;
+    unsigned char *y_ptr = oci->post_proc_buffer.y_buffer;
+    unsigned char *u_ptr = oci->post_proc_buffer.u_buffer;
+    unsigned char *v_ptr = oci->post_proc_buffer.v_buffer;
+    int y_stride = oci->post_proc_buffer.y_stride;
+    MODE_INFO *mi = oci->mi;
+
+    for (y = 0; y < height; y += 16) {
+      for (x = 0; x < width; x += 16) {
+        int Y = 0, U = 0, V = 0;
+
+        if (mi->mbmi.mode == B_PRED &&
+            ((ppflags->display_mb_modes_flag & B_PRED) ||
+             ppflags->display_b_modes_flag)) {
+          int by, bx;
+          unsigned char *yl, *ul, *vl;
+          union b_mode_info *bmi = mi->bmi;
+
+          yl = y_ptr + x;
+          ul = u_ptr + (x >> 1);
+          vl = v_ptr + (x >> 1);
+
+          for (by = 0; by < 16; by += 4) {
+            for (bx = 0; bx < 16; bx += 4) {
+              if ((ppflags->display_b_modes_flag & (1 << mi->mbmi.mode))
+                  || (ppflags->display_mb_modes_flag & B_PRED)) {
+                Y = B_PREDICTION_MODE_colors[bmi->as_mode.first][0];
+                U = B_PREDICTION_MODE_colors[bmi->as_mode.first][1];
+                V = B_PREDICTION_MODE_colors[bmi->as_mode.first][2];
+
+                POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_b)(yl + bx,
+                                                           ul + (bx >> 1),
+                                                           vl + (bx >> 1),
+                                                           Y, U, V,
+                                                           0xc000, y_stride);
+              }
+              bmi++;
+            }
+
+            yl += y_stride * 4;
+            ul += y_stride * 1;
+            vl += y_stride * 1;
+          }
+        } else if (ppflags->display_mb_modes_flag & (1 << mi->mbmi.mode)) {
+          Y = MB_PREDICTION_MODE_colors[mi->mbmi.mode][0];
+          U = MB_PREDICTION_MODE_colors[mi->mbmi.mode][1];
+          V = MB_PREDICTION_MODE_colors[mi->mbmi.mode][2];
+
+          POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb_inner)(y_ptr + x,
+                                                            u_ptr + (x >> 1),
+                                                            v_ptr + (x >> 1),
+                                                            Y, U, V,
+                                                            0xc000, y_stride);
+        }
+
+        mi++;
+      }
+      y_ptr += y_stride * 16;
+      u_ptr += y_stride * 4;
+      v_ptr += y_stride * 4;
+
+      mi++;
+    }
+  }
+
+  /* Color in frame reference blocks */
+  if ((flags & VP9D_DEBUG_CLR_FRM_REF_BLKS) &&
+      ppflags->display_ref_frame_flag) {
+    int y, x;
+    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+    int width  = post->y_width;
+    int height = post->y_height;
+    unsigned char *y_ptr = oci->post_proc_buffer.y_buffer;
+    unsigned char *u_ptr = oci->post_proc_buffer.u_buffer;
+    unsigned char *v_ptr = oci->post_proc_buffer.v_buffer;
+    int y_stride = oci->post_proc_buffer.y_stride;
+    MODE_INFO *mi = oci->mi;
+
+    for (y = 0; y < height; y += 16) {
+      for (x = 0; x < width; x += 16) {
+        int Y = 0, U = 0, V = 0;
+
+        if (ppflags->display_ref_frame_flag & (1 << mi->mbmi.ref_frame)) {
+          Y = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][0];
+          U = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][1];
+          V = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][2];
+
+          POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb_outer)(y_ptr + x,
+                                                            u_ptr + (x >> 1),
+                                                            v_ptr + (x >> 1),
+                                                            Y, U, V,
+                                                            0xc000, y_stride);
+        }
+
+        mi++;
+      }
+      y_ptr += y_stride * 16;
+      u_ptr += y_stride * 4;
+      v_ptr += y_stride * 4;
+
+      mi++;
+    }
+  }
+#endif
+
+  *dest = oci->post_proc_buffer;
+
+  /* the post-proc buffer has extended borders; report the real frame size */
+  dest->y_width = oci->Width;
+  dest->y_height = oci->Height;
+  dest->uv_height = dest->y_height / 2;
+
+  return 0;
+}
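
The debug overlays above blend a fixed Y/U/V color into each block with
a constant alpha of 0xc000 (roughly 0.75 in Q16).  A minimal scalar
sketch of a blend helper of this shape, assuming a 4x4 luma / 2x2
chroma footprint for blend_b; the actual vp9_blend_b_c may differ:

    /* hypothetical reference model, not the patch's vp9_blend_b_c */
    static void blend_b_sketch(unsigned char *y, unsigned char *u,
                               unsigned char *v, int y1, int u1, int v1,
                               int alpha, int stride) {
      int i, j;
      for (i = 0; i < 4; i++) {          /* 4x4 luma block */
        for (j = 0; j < 4; j++)
          y[j] = (y[j] * (0x10000 - alpha) + y1 * alpha) >> 16;
        y += stride;
      }
      for (i = 0; i < 2; i++) {          /* 2x2 chroma block */
        for (j = 0; j < 2; j++) {
          u[j] = (u[j] * (0x10000 - alpha) + u1 * alpha) >> 16;
          v[j] = (v[j] * (0x10000 - alpha) + v1 * alpha) >> 16;
        }
        u += stride >> 1;                /* chroma pitch is half of luma */
        v += stride >> 1;
      }
    }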
--- /dev/null
+++ b/vp9/common/postproc.h
@@ -1,0 +1,128 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef POSTPROC_H
+#define POSTPROC_H
+
+#define prototype_postproc_inplace(sym)\
+  void sym(unsigned char *dst, int pitch, int rows, int cols, int flimit)
+
+#define prototype_postproc(sym)\
+  void sym(unsigned char *src, unsigned char *dst, int src_pitch, \
+           int dst_pitch, int rows, int cols, int flimit)
+
+#define prototype_postproc_addnoise(sym) \
+  void sym(unsigned char *s, char *noise, char blackclamp[16], \
+           char whiteclamp[16], char bothclamp[16], \
+           unsigned int w, unsigned int h, int pitch)
+
+#define prototype_postproc_blend_mb_inner(sym)\
+  void sym(unsigned char *y, unsigned char *u, unsigned char *v, \
+           int y1, int u1, int v1, int alpha, int stride)
+
+#define prototype_postproc_blend_mb_outer(sym)\
+  void sym(unsigned char *y, unsigned char *u, unsigned char *v, \
+           int y1, int u1, int v1, int alpha, int stride)
+
+#define prototype_postproc_blend_b(sym)\
+  void sym(unsigned char *y, unsigned char *u, unsigned char *v, \
+           int y1, int u1, int v1, int alpha, int stride)
+
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/postproc_x86.h"
+#endif
+
+#ifndef vp9_postproc_down
+#define vp9_postproc_down vp9_mbpost_proc_down_c
+#endif
+extern prototype_postproc_inplace(vp9_postproc_down);
+
+#ifndef vp9_postproc_across
+#define vp9_postproc_across vp9_mbpost_proc_across_ip_c
+#endif
+extern prototype_postproc_inplace(vp9_postproc_across);
+
+#ifndef vp9_postproc_downacross
+#define vp9_postproc_downacross vp9_post_proc_down_and_across_c
+#endif
+extern prototype_postproc(vp9_postproc_downacross);
+
+#ifndef vp9_postproc_addnoise
+#define vp9_postproc_addnoise vp9_plane_add_noise_c
+#endif
+extern prototype_postproc_addnoise(vp9_postproc_addnoise);
+
+#ifndef vp9_postproc_blend_mb_inner
+#define vp9_postproc_blend_mb_inner vp9_blend_mb_inner_c
+#endif
+extern prototype_postproc_blend_mb_inner(vp9_postproc_blend_mb_inner);
+
+#ifndef vp9_postproc_blend_mb_outer
+#define vp9_postproc_blend_mb_outer vp9_blend_mb_outer_c
+#endif
+extern prototype_postproc_blend_mb_outer(vp9_postproc_blend_mb_outer);
+
+#ifndef vp9_postproc_blend_b
+#define vp9_postproc_blend_b vp9_blend_b_c
+#endif
+extern prototype_postproc_blend_b(vp9_postproc_blend_b);
+
+typedef prototype_postproc((*vp9_postproc_fn_t));
+typedef prototype_postproc_inplace((*vp9_postproc_inplace_fn_t));
+typedef prototype_postproc_addnoise((*vp9_postproc_addnoise_fn_t));
+typedef prototype_postproc_blend_mb_inner((*vp9_postproc_blend_mb_inner_fn_t));
+typedef prototype_postproc_blend_mb_outer((*vp9_postproc_blend_mb_outer_fn_t));
+typedef prototype_postproc_blend_b((*vp9_postproc_blend_b_fn_t));
+typedef struct {
+  vp9_postproc_inplace_fn_t           down;
+  vp9_postproc_inplace_fn_t           across;
+  vp9_postproc_fn_t                   downacross;
+  vp9_postproc_addnoise_fn_t          addnoise;
+  vp9_postproc_blend_mb_inner_fn_t    blend_mb_inner;
+  vp9_postproc_blend_mb_outer_fn_t    blend_mb_outer;
+  vp9_postproc_blend_b_fn_t           blend_b;
+} vp9_postproc_rtcd_vtable_t;
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define POSTPROC_INVOKE(ctx,fn) (ctx)->fn
+#else
+#define POSTPROC_INVOKE(ctx,fn) vp9_postproc_##fn
+#endif
+
+#include "vpx_ports/mem.h"
+struct postproc_state {
+  int           last_q;
+  int           last_noise;
+  char          noise[3072];
+  DECLARE_ALIGNED(16, char, blackclamp[16]);
+  DECLARE_ALIGNED(16, char, whiteclamp[16]);
+  DECLARE_ALIGNED(16, char, bothclamp[16]);
+};
+#include "onyxc_int.h"
+#include "ppflags.h"
+int vp9_post_proc_frame(struct VP9Common *oci, YV12_BUFFER_CONFIG *dest,
+                        vp9_ppflags_t *flags);
+
+
+void vp9_de_noise(YV12_BUFFER_CONFIG         *source,
+                  YV12_BUFFER_CONFIG         *post,
+                  int                         q,
+                  int                         low_var_thresh,
+                  int                         flag,
+                  vp9_postproc_rtcd_vtable_t *rtcd);
+
+void vp9_deblock(YV12_BUFFER_CONFIG         *source,
+                 YV12_BUFFER_CONFIG         *post,
+                 int                         q,
+                 int                         low_var_thresh,
+                 int                         flag,
+                 vp9_postproc_rtcd_vtable_t *rtcd);
+#endif
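
POSTPROC_INVOKE abstracts over runtime CPU detection: with
CONFIG_RUNTIME_CPU_DETECT it indexes the vtable carried by the context,
otherwise it resolves at compile time to the vp9_postproc_* defaults
#defined above.  A hedged usage sketch (rtcd here stands in for the
value the RTCD_VTABLE() macro produces; that macro is defined
elsewhere):

    /* rtcd: pointer to the context's vp9_postproc_rtcd_vtable_t.
     * Under runtime detection this expands to rtcd->downacross(...);
     * otherwise it becomes a direct call to
     * vp9_post_proc_down_and_across_c(...). */
    POSTPROC_INVOKE(rtcd, downacross)(src, dst, src_pitch, dst_pitch,
                                      rows, cols, flimit);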
--- /dev/null
+++ b/vp9/common/ppc/copy_altivec.asm
@@ -1,0 +1,47 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    .globl copy_mem16x16_ppc
+
+;# r3 unsigned char *src
+;# r4 int src_stride
+;# r5 unsigned char *dst
+;# r6 int dst_stride
+
+;# Make the assumption that input will not be aligned,
+;#  but the output will be.  So two reads and a perm
+;#  for the input, but only one store for the output.
+copy_mem16x16_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xe000
+    mtspr   256, r12            ;# set VRSAVE
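+    ;# (VRSAVE flags which vector registers are live so the kernel
+    ;#  preserves them across context switches.)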
+
+    li      r10, 16
+    mtctr   r10
+
+cp_16x16_loop:
+    lvsl    v0,  0, r3          ;# permute vector for alignment
+
+    lvx     v1,   0, r3
+    lvx     v2, r10, r3
+
+    vperm   v1, v1, v2, v0
+
+    stvx    v1,  0, r5
+
+    add     r3, r3, r4          ;# increment source pointer
+    add     r5, r5, r6          ;# increment destination pointer
+
+    bdnz    cp_16x16_loop
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
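
For reference, the routine above is equivalent to the scalar C below (a
sketch, not part of the patch).  The AltiVec version needs two loads
and a vperm per row because the source may be unaligned, while the
destination is assumed 16-byte aligned, so a single stvx per row
suffices:

    #include <string.h>

    /* scalar model of copy_mem16x16_ppc */
    static void copy_mem16x16_sketch(unsigned char *src, int src_stride,
                                     unsigned char *dst, int dst_stride) {
      int r;
      for (r = 0; r < 16; r++) {    /* one 16-byte row per iteration */
        memcpy(dst, src, 16);
        src += src_stride;
        dst += dst_stride;
      }
    }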
--- /dev/null
+++ b/vp9/common/ppc/filter_altivec.asm
@@ -1,0 +1,1013 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    .globl sixtap_predict_ppc
+    .globl sixtap_predict8x4_ppc
+    .globl sixtap_predict8x8_ppc
+    .globl sixtap_predict16x16_ppc
+
+.macro load_c V, LABEL, OFF, R0, R1
+    lis     \R0, \LABEL@ha
+    la      \R1, \LABEL@l(\R0)
+    lvx     \V, \OFF, \R1
+.endm
+
+.macro load_hfilter V0, V1
+    load_c \V0, HFilter, r5, r9, r10
+
+    addi    r5,  r5, 16
+    lvx     \V1, r5, r10
+.endm
+
+;# Vertical filtering
+.macro Vprolog
+    load_c v0, VFilter, r6, r3, r10
+
+    vspltish v5, 8
+    vspltish v6, 3
+    vslh    v6, v5, v6      ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+    vspltb  v1, v0, 1
+    vspltb  v2, v0, 2
+    vspltb  v3, v0, 3
+    vspltb  v4, v0, 4
+    vspltb  v5, v0, 5
+    vspltb  v0, v0, 0
+.endm
+
+.macro vpre_load
+    Vprolog
+    li      r10,  16
+    lvx     v10,   0, r9    ;# v10..v14 = first 5 rows
+    lvx     v11, r10, r9
+    addi    r9,   r9, 32
+    lvx     v12,   0, r9
+    lvx     v13, r10, r9
+    addi    r9,   r9, 32
+    lvx     v14,   0, r9
+.endm
+
+.macro Msum Re, Ro, V, T, TMP
+                                ;# (Re,Ro) += (V*T)
+    vmuleub \TMP, \V, \T        ;# trashes \TMP
+    vadduhm \Re, \Re, \TMP      ;# Re = evens, saturation unnecessary
+    vmuloub \TMP, \V, \T
+    vadduhm \Ro, \Ro, \TMP      ;# Ro = odds
+.endm
+
+.macro vinterp_no_store P0 P1 P2 P3 P4 P5
+    vmuleub  v8, \P0, v0        ;# 64 + 4 positive taps
+    vadduhm v16, v6, v8
+    vmuloub  v8, \P0, v0
+    vadduhm v17, v6, v8
+    Msum v16, v17, \P2, v2, v8
+    Msum v16, v17, \P3, v3, v8
+    Msum v16, v17, \P5, v5, v8
+
+    vmuleub v18, \P1, v1        ;# 2 negative taps
+    vmuloub v19, \P1, v1
+    Msum v18, v19, \P4, v4, v8
+
+    vsubuhs v16, v16, v18       ;# subtract neg from pos
+    vsubuhs v17, v17, v19
+    vsrh    v16, v16, v7        ;# divide by 128
+    vsrh    v17, v17, v7        ;# v16 v17 = evens, odds
+    vmrghh  v18, v16, v17       ;# v18 v19 = 16-bit result in order
+    vmrglh  v19, v16, v17
+    vpkuhus  \P0, v18, v19      ;# P0 = 8-bit result
+.endm
+
+.macro vinterp_no_store_8x8 P0 P1 P2 P3 P4 P5
+    vmuleub v24, \P0, v13       ;# 64 + 4 positive taps
+    vadduhm v21, v20, v24
+    vmuloub v24, \P0, v13
+    vadduhm v22, v20, v24
+    Msum v21, v22, \P2, v15, v25
+    Msum v21, v22, \P3, v16, v25
+    Msum v21, v22, \P5, v18, v25
+
+    vmuleub v23, \P1, v14       ;# 2 negative taps
+    vmuloub v24, \P1, v14
+    Msum v23, v24, \P4, v17, v25
+
+    vsubuhs v21, v21, v23       ;# subtract neg from pos
+    vsubuhs v22, v22, v24
+    vsrh    v21, v21, v19       ;# divide by 128
+    vsrh    v22, v22, v19       ;# v21 v22 = evens, odds
+    vmrghh  v23, v21, v22       ;# v23 v24 = 16-bit result in order
+    vmrglh  v24, v21, v22
+    vpkuhus \P0, v23, v24       ;# P0 = 8-bit result
+.endm
+
+
+.macro Vinterp P0 P1 P2 P3 P4 P5
+    vinterp_no_store \P0, \P1, \P2, \P3, \P4, \P5
+    stvx    \P0, 0, r7
+    add     r7, r7, r8      ;# 33 ops per 16 pels
+.endm
+
+
+.macro luma_v P0, P1, P2, P3, P4, P5
+    addi    r9,   r9, 16        ;# P5 = newest input row
+    lvx     \P5,   0, r9
+    Vinterp \P0, \P1, \P2, \P3, \P4, \P5
+.endm
+
+.macro luma_vtwo
+    luma_v v10, v11, v12, v13, v14, v15
+    luma_v v11, v12, v13, v14, v15, v10
+.endm
+
+.macro luma_vfour
+    luma_vtwo
+    luma_v v12, v13, v14, v15, v10, v11
+    luma_v v13, v14, v15, v10, v11, v12
+.endm
+
+.macro luma_vsix
+    luma_vfour
+    luma_v v14, v15, v10, v11, v12, v13
+    luma_v v15, v10, v11, v12, v13, v14
+.endm
+
+.macro Interp4 R I I4
+    vmsummbm \R, v13, \I, v15
+    vmsummbm \R, v14, \I4, \R
+.endm
+
+.macro Read8x8 VD, RS, RP, increment_counter
+    lvsl    v21,  0, \RS        ;# permute vector for alignment
+
+    ;# input to the filter is 13 bytes wide, output is 8 bytes.
+    ;#  the unaligned input can span two vectors.
+    lvx     \VD,   0, \RS
+    lvx     v20, r10, \RS
+
+.if \increment_counter
+    add     \RS, \RS, \RP
+.endif
+
+    vperm   \VD, \VD, v20, v21
+.endm
+
+.macro interp_8x8 R
+    vperm   v20, \R, \R, v16    ;# v20 = 0123 1234 2345 3456
+    vperm   v21, \R, \R, v17    ;# v21 = 4567 5678 6789 789A
+    Interp4 v20, v20,  v21      ;# v20 = result 0 1 2 3
+    vperm   \R, \R, \R, v18     ;# R   = 89AB 9ABC ABCx BCxx
+    Interp4 v21, v21, \R        ;# v21 = result 4 5 6 7
+
+    vpkswus \R, v20, v21        ;#  R = 0 1 2 3 4 5 6 7
+    vsrh    \R, \R, v19
+
+    vpkuhus \R, \R, \R          ;# saturate and pack
+
+.endm
+
+.macro Read4x4 VD, RS, RP, increment_counter
+    lvsl    v21,  0, \RS        ;# permute vector for alignment
+
+    ;# input to the filter is 9 bytes wide, output is 4 bytes.
+    ;#  the row is left-justified with a single load and vperm.
+    lvx     v20,   0, \RS
+
+.if \increment_counter
+    add     \RS, \RS, \RP
+.endif
+
+    vperm   \VD, v20, v20, v21
+.endm
+    .text
+
+    .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+sixtap_predict_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xff87
+    ori     r12, r12, 0xffc0
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1,-32(r1)          ;# create space on the stack
+
+    slwi.   r5, r5, 5           ;# index into horizontal filter array
+
+    vspltish v19, 7
+
+    ;# If there is no horizontal filtering to do, skip straight
+    ;#  to the second pass.
+    beq-    vertical_only_4x4
+
+    ;# load up horizontal filter
+    load_hfilter v13, v14
+
+    ;# rounding added in on the multiply
+    vspltisw v16, 8
+    vspltisw v15, 3
+    vslw    v15, v16, v15       ;# 0x00000040000000400000004000000040
+
+    ;# Load up permutation constants
+    load_c v16, B_0123, 0, r9, r10
+    load_c v17, B_4567, 0, r9, r10
+    load_c v18, B_89AB, 0, r9, r10
+
+    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
+    addi    r3, r3, -2
+
+    addi    r9, r3, 0
+    li      r10, 16
+    Read8x8 v2, r3, r4, 1
+    Read8x8 v3, r3, r4, 1
+    Read8x8 v4, r3, r4, 1
+    Read8x8 v5, r3, r4, 1
+
+    slwi.   r6, r6, 4           ;# index into vertical filter array
+
+    ;# filter a line
+    interp_8x8 v2
+    interp_8x8 v3
+    interp_8x8 v4
+    interp_8x8 v5
+
+    ;# Finished filtering main horizontal block.  If there is no
+    ;#  vertical filtering, jump to storing the data.  Otherwise
+    ;#  load up and filter the additional 5 lines that are needed
+    ;#  for the vertical filter.
+    beq-    store_4x4
+
+    ;# only needed if there is a vertical filter present
+    ;# if the second filter is not null then need to back off by 2*pitch
+    sub     r9, r9, r4
+    sub     r9, r9, r4
+
+    Read8x8 v0, r9, r4, 1
+    Read8x8 v1, r9, r4, 0
+    Read8x8 v6, r3, r4, 1
+    Read8x8 v7, r3, r4, 1
+    Read8x8 v8, r3, r4, 0
+
+    interp_8x8 v0
+    interp_8x8 v1
+    interp_8x8 v6
+    interp_8x8 v7
+    interp_8x8 v8
+
+    b       second_pass_4x4
+
+vertical_only_4x4:
+    ;# only needed if there is a vertical filter present
+    ;# if the second filter is not null then need to back off by 2*pitch
+    sub     r3, r3, r4
+    sub     r3, r3, r4
+    li      r10, 16
+
+    Read8x8 v0, r3, r4, 1
+    Read8x8 v1, r3, r4, 1
+    Read8x8 v2, r3, r4, 1
+    Read8x8 v3, r3, r4, 1
+    Read8x8 v4, r3, r4, 1
+    Read8x8 v5, r3, r4, 1
+    Read8x8 v6, r3, r4, 1
+    Read8x8 v7, r3, r4, 1
+    Read8x8 v8, r3, r4, 0
+
+    slwi    r6, r6, 4           ;# index into vertical filter array
+
+second_pass_4x4:
+    load_c   v20, b_hilo_4x4, 0, r9, r10
+    load_c   v21, b_hilo, 0, r9, r10
+
+    ;# reposition input so that it can go through the
+    ;# filtering phase with one pass.
+    vperm   v0, v0, v1, v20     ;# 0 1 x x
+    vperm   v2, v2, v3, v20     ;# 2 3 x x
+    vperm   v4, v4, v5, v20     ;# 4 5 x x
+    vperm   v6, v6, v7, v20     ;# 6 7 x x
+
+    vperm   v0, v0, v2, v21     ;# 0 1 2 3
+    vperm   v4, v4, v6, v21     ;# 4 5 6 7
+
+    vsldoi  v1, v0, v4, 4
+    vsldoi  v2, v0, v4, 8
+    vsldoi  v3, v0, v4, 12
+
+    vsldoi  v5, v4, v8, 4
+
+    load_c   v13, VFilter, r6, r9, r10
+
+    vspltish v15, 8
+    vspltish v20, 3
+    vslh    v20, v15, v20       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+    vspltb  v14, v13, 1
+    vspltb  v15, v13, 2
+    vspltb  v16, v13, 3
+    vspltb  v17, v13, 4
+    vspltb  v18, v13, 5
+    vspltb  v13, v13, 0
+
+    vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
+
+    stvx    v0, 0, r1
+
+    lwz     r0, 0(r1)
+    stw     r0, 0(r7)
+    add     r7, r7, r8
+
+    lwz     r0, 4(r1)
+    stw     r0, 0(r7)
+    add     r7, r7, r8
+
+    lwz     r0, 8(r1)
+    stw     r0, 0(r7)
+    add     r7, r7, r8
+
+    lwz     r0, 12(r1)
+    stw     r0, 0(r7)
+
+    b       exit_4x4
+
+store_4x4:
+
+    stvx    v2, 0, r1
+    lwz     r0, 0(r1)
+    stw     r0, 0(r7)
+    add     r7, r7, r8
+
+    stvx    v3, 0, r1
+    lwz     r0, 0(r1)
+    stw     r0, 0(r7)
+    add     r7, r7, r8
+
+    stvx    v4, 0, r1
+    lwz     r0, 0(r1)
+    stw     r0, 0(r7)
+    add     r7, r7, r8
+
+    stvx    v5, 0, r1
+    lwz     r0, 0(r1)
+    stw     r0, 0(r7)
+
+exit_4x4:
+
+    addi    r1, r1, 32          ;# recover stack
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+.macro w_8x8 V, D, R, P
+    stvx    \V, 0, r1
+    lwz     \R, 0(r1)
+    stw     \R, 0(r7)
+    lwz     \R, 4(r1)
+    stw     \R, 4(r7)
+    add     \D, \D, \P
+.endm
+
+    .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+
+sixtap_predict8x4_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffff
+    ori     r12, r12, 0xffc0
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1,-32(r1)          ;# create space on the stack
+
+    slwi.   r5, r5, 5           ;# index into horizontal filter array
+
+    vspltish v19, 7
+
+    ;# If there is no horizontal filtering to do, skip straight
+    ;#  to the second pass.
+    beq-    second_pass_pre_copy_8x4
+
+    load_hfilter v13, v14
+
+    ;# rounding added in on the multiply
+    vspltisw v16, 8
+    vspltisw v15, 3
+    vslw    v15, v16, v15       ;# 0x00000040000000400000004000000040
+
+    ;# Load up permutation constants
+    load_c v16, B_0123, 0, r9, r10
+    load_c v17, B_4567, 0, r9, r10
+    load_c v18, B_89AB, 0, r9, r10
+
+    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
+    addi    r3, r3, -2
+
+    addi    r9, r3, 0
+    li      r10, 16
+    Read8x8 v2, r3, r4, 1
+    Read8x8 v3, r3, r4, 1
+    Read8x8 v4, r3, r4, 1
+    Read8x8 v5, r3, r4, 1
+
+    slwi.   r6, r6, 4           ;# index into vertical filter array
+
+    ;# filter a line
+    interp_8x8 v2
+    interp_8x8 v3
+    interp_8x8 v4
+    interp_8x8 v5
+
+    ;# Finished filtering main horizontal block.  If there is no
+    ;#  vertical filtering, jump to storing the data.  Otherwise
+    ;#  load up and filter the additional 5 lines that are needed
+    ;#  for the vertical filter.
+    beq-    store_8x4
+
+    ;# only needed if there is a vertical filter present
+    ;# if the second filter is not null then need to back off by 2*pitch
+    sub     r9, r9, r4
+    sub     r9, r9, r4
+
+    Read8x8 v0, r9, r4, 1
+    Read8x8 v1, r9, r4, 0
+    Read8x8 v6, r3, r4, 1
+    Read8x8 v7, r3, r4, 1
+    Read8x8 v8, r3, r4, 0
+
+    interp_8x8 v0
+    interp_8x8 v1
+    interp_8x8 v6
+    interp_8x8 v7
+    interp_8x8 v8
+
+    b       second_pass_8x4
+
+second_pass_pre_copy_8x4:
+    ;# only needed if there is a vertical filter present
+    ;# if the second filter is not null then need to back off by 2*pitch
+    sub     r3, r3, r4
+    sub     r3, r3, r4
+    li      r10, 16
+
+    Read8x8 v0,  r3, r4, 1
+    Read8x8 v1,  r3, r4, 1
+    Read8x8 v2,  r3, r4, 1
+    Read8x8 v3,  r3, r4, 1
+    Read8x8 v4,  r3, r4, 1
+    Read8x8 v5,  r3, r4, 1
+    Read8x8 v6,  r3, r4, 1
+    Read8x8 v7,  r3, r4, 1
+    Read8x8 v8,  r3, r4, 1
+
+    slwi    r6, r6, 4           ;# index into vertical filter array
+
+second_pass_8x4:
+    load_c v13, VFilter, r6, r9, r10
+
+    vspltish v15, 8
+    vspltish v20, 3
+    vslh    v20, v15, v20       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+    vspltb  v14, v13, 1
+    vspltb  v15, v13, 2
+    vspltb  v16, v13, 3
+    vspltb  v17, v13, 4
+    vspltb  v18, v13, 5
+    vspltb  v13, v13, 0
+
+    vinterp_no_store_8x8 v0, v1, v2, v3,  v4,  v5
+    vinterp_no_store_8x8 v1, v2, v3, v4,  v5,  v6
+    vinterp_no_store_8x8 v2, v3, v4, v5,  v6,  v7
+    vinterp_no_store_8x8 v3, v4, v5, v6,  v7,  v8
+
+    cmpi    cr0, r8, 8
+    beq     cr0, store_aligned_8x4
+
+    w_8x8   v0, r7, r0, r8
+    w_8x8   v1, r7, r0, r8
+    w_8x8   v2, r7, r0, r8
+    w_8x8   v3, r7, r0, r8
+
+    b       exit_8x4
+
+store_aligned_8x4:
+
+    load_c v10, b_hilo, 0, r9, r10
+
+    vperm   v0, v0, v1, v10
+    vperm   v2, v2, v3, v10
+
+    stvx    v0, 0, r7
+    addi    r7, r7, 16
+    stvx    v2, 0, r7
+
+    b       exit_8x4
+
+store_8x4:
+    cmpi    cr0, r8, 8
+    beq     cr0, store_aligned2_8x4
+
+    w_8x8   v2, r7, r0, r8
+    w_8x8   v3, r7, r0, r8
+    w_8x8   v4, r7, r0, r8
+    w_8x8   v5, r7, r0, r8
+
+    b       exit_8x4
+
+store_aligned2_8x4:
+    load_c v10, b_hilo, 0, r9, r10
+
+    vperm   v2, v2, v3, v10
+    vperm   v4, v4, v5, v10
+
+    stvx    v2, 0, r7
+    addi    r7, r7, 16
+    stvx    v4, 0, r7
+
+exit_8x4:
+
+    addi    r1, r1, 32          ;# recover stack
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+
+    blr
+
+    .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+
+;# Because the width that needs to be filtered fits in a single AltiVec
+;#  register, there is no need to loop.  Everything can stay in registers.
+sixtap_predict8x8_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffff
+    ori     r12, r12, 0xffc0
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1,-32(r1)          ;# create space on the stack
+
+    slwi.   r5, r5, 5           ;# index into horizontal filter array
+
+    vspltish v19, 7
+
+    ;# If there is no horizontal filtering to do, skip straight
+    ;#  to the second pass.
+    beq-    second_pass_pre_copy_8x8
+
+    load_hfilter v13, v14
+
+    ;# rounding added in on the multiply
+    vspltisw v16, 8
+    vspltisw v15, 3
+    vslw    v15, v16, v15       ;# 0x00000040000000400000004000000040
+
+    ;# Load up permutation constants
+    load_c v16, B_0123, 0, r9, r10
+    load_c v17, B_4567, 0, r9, r10
+    load_c v18, B_89AB, 0, r9, r10
+
+    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
+    addi    r3, r3, -2
+
+    addi    r9, r3, 0
+    li      r10, 16
+    Read8x8 v2, r3, r4, 1
+    Read8x8 v3, r3, r4, 1
+    Read8x8 v4, r3, r4, 1
+    Read8x8 v5, r3, r4, 1
+    Read8x8 v6, r3, r4, 1
+    Read8x8 v7, r3, r4, 1
+    Read8x8 v8, r3, r4, 1
+    Read8x8 v9, r3, r4, 1
+
+    slwi.   r6, r6, 4           ;# index into vertical filter array
+
+    ;# filter a line
+    interp_8x8 v2
+    interp_8x8 v3
+    interp_8x8 v4
+    interp_8x8 v5
+    interp_8x8 v6
+    interp_8x8 v7
+    interp_8x8 v8
+    interp_8x8 v9
+
+    ;# Finished filtering main horizontal block.  If there is no
+    ;#  vertical filtering, jump to storing the data.  Otherwise
+    ;#  load up and filter the additional 5 lines that are needed
+    ;#  for the vertical filter.
+    beq-    store_8x8
+
+    ;# only needed if there is a vertical filter present
+    ;# if the second filter is not null then need to back off by 2*pitch
+    sub     r9, r9, r4
+    sub     r9, r9, r4
+
+    Read8x8 v0,  r9, r4, 1
+    Read8x8 v1,  r9, r4, 0
+    Read8x8 v10, r3, r4, 1
+    Read8x8 v11, r3, r4, 1
+    Read8x8 v12, r3, r4, 0
+
+    interp_8x8 v0
+    interp_8x8 v1
+    interp_8x8 v10
+    interp_8x8 v11
+    interp_8x8 v12
+
+    b       second_pass_8x8
+
+second_pass_pre_copy_8x8:
+    ;# only needed if there is a vertical filter present
+    ;# if the second filter is not null then need to back off by 2*pitch
+    sub     r3, r3, r4
+    sub     r3, r3, r4
+    li      r10, 16
+
+    Read8x8 v0,  r3, r4, 1
+    Read8x8 v1,  r3, r4, 1
+    Read8x8 v2,  r3, r4, 1
+    Read8x8 v3,  r3, r4, 1
+    Read8x8 v4,  r3, r4, 1
+    Read8x8 v5,  r3, r4, 1
+    Read8x8 v6,  r3, r4, 1
+    Read8x8 v7,  r3, r4, 1
+    Read8x8 v8,  r3, r4, 1
+    Read8x8 v9,  r3, r4, 1
+    Read8x8 v10, r3, r4, 1
+    Read8x8 v11, r3, r4, 1
+    Read8x8 v12, r3, r4, 0
+
+    slwi    r6, r6, 4           ;# index into vertical filter array
+
+second_pass_8x8:
+    load_c v13, VFilter, r6, r9, r10
+
+    vspltish v15, 8
+    vspltish v20, 3
+    vslh    v20, v15, v20       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+    vspltb  v14, v13, 1
+    vspltb  v15, v13, 2
+    vspltb  v16, v13, 3
+    vspltb  v17, v13, 4
+    vspltb  v18, v13, 5
+    vspltb  v13, v13, 0
+
+    vinterp_no_store_8x8 v0, v1, v2, v3,  v4,  v5
+    vinterp_no_store_8x8 v1, v2, v3, v4,  v5,  v6
+    vinterp_no_store_8x8 v2, v3, v4, v5,  v6,  v7
+    vinterp_no_store_8x8 v3, v4, v5, v6,  v7,  v8
+    vinterp_no_store_8x8 v4, v5, v6, v7,  v8,  v9
+    vinterp_no_store_8x8 v5, v6, v7, v8,  v9,  v10
+    vinterp_no_store_8x8 v6, v7, v8, v9,  v10, v11
+    vinterp_no_store_8x8 v7, v8, v9, v10, v11, v12
+
+    cmpi    cr0, r8, 8
+    beq     cr0, store_aligned_8x8
+
+    w_8x8   v0, r7, r0, r8
+    w_8x8   v1, r7, r0, r8
+    w_8x8   v2, r7, r0, r8
+    w_8x8   v3, r7, r0, r8
+    w_8x8   v4, r7, r0, r8
+    w_8x8   v5, r7, r0, r8
+    w_8x8   v6, r7, r0, r8
+    w_8x8   v7, r7, r0, r8
+
+    b       exit_8x8
+
+store_aligned_8x8:
+
+    load_c v10, b_hilo, 0, r9, r10
+
+    vperm   v0, v0, v1, v10
+    vperm   v2, v2, v3, v10
+    vperm   v4, v4, v5, v10
+    vperm   v6, v6, v7, v10
+
+    stvx    v0, 0, r7
+    addi    r7, r7, 16
+    stvx    v2, 0, r7
+    addi    r7, r7, 16
+    stvx    v4, 0, r7
+    addi    r7, r7, 16
+    stvx    v6, 0, r7
+
+    b       exit_8x8
+
+store_8x8:
+    cmpi    cr0, r8, 8
+    beq     cr0, store_aligned2_8x8
+
+    w_8x8   v2, r7, r0, r8
+    w_8x8   v3, r7, r0, r8
+    w_8x8   v4, r7, r0, r8
+    w_8x8   v5, r7, r0, r8
+    w_8x8   v6, r7, r0, r8
+    w_8x8   v7, r7, r0, r8
+    w_8x8   v8, r7, r0, r8
+    w_8x8   v9, r7, r0, r8
+
+    b       exit_8x8
+
+store_aligned2_8x8:
+    load_c v10, b_hilo, 0, r9, r10
+
+    vperm   v2, v2, v3, v10
+    vperm   v4, v4, v5, v10
+    vperm   v6, v6, v7, v10
+    vperm   v8, v8, v9, v10
+
+    stvx    v2, 0, r7
+    addi    r7, r7, 16
+    stvx    v4, 0, r7
+    addi    r7, r7, 16
+    stvx    v6, 0, r7
+    addi    r7, r7, 16
+    stvx    v8, 0, r7
+
+exit_8x8:
+
+    addi    r1, r1, 32          ;# recover stack
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+    .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+
+;# Two pass filtering.  First pass is Horizontal edges, second pass is vertical
+;#  edges.  One of the filters can be null, but both won't be.  Needs to use a
+;#  temporary buffer because the source buffer can't be modified and the buffer
+;#  for the destination is not large enough to hold the temporary data.
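+;#  The temporary buffer is carved out of the 416-byte stack frame set
+;#  up below: 21 rows x 16 bytes of first-pass output, stored at 48(r1).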
+sixtap_predict16x16_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffff
+    ori     r12, r12, 0xf000
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1,-416(r1)         ;# create space on the stack
+
+    ;# Three possibilities:
+    ;#  1. First filter is null.  Don't use a temp buffer.
+    ;#  2. Second filter is null.  Don't use a temp buffer.
+    ;#  3. Neither is null; use a temp buffer.
+
+    ;# First pass (horizontal edge)
+    ;#  set up pointers for src
+    ;#  if possibility (1) holds, point src at the original buffer and
+    ;#  jump to the second pass.  this depends on whether x_offset is 0.
+
+    ;# load up horizontal filter
+    slwi.   r5, r5, 5           ;# index into horizontal filter array
+
+    load_hfilter v4, v5
+
+    beq-    copy_horizontal_16x21
+
+    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
+    addi    r3, r3, -2
+
+    slwi.   r6, r6, 4           ;# index into vertical filter array
+
+    ;# setup constants
+    ;# v14 permute vector to reorder the filter output
+    load_c v14, b_hperm, 0, r9, r10
+
+    ;# These statements assume there won't be a second pass; if there
+    ;#  is one, they are set again inside the bypass below.
+    li      r0, 16              ;# prepare for no vertical filter
+
+    ;# Change the output pointer and pitch to be the actual
+    ;#  destination instead of a temporary buffer.
+    addi    r9, r7, 0
+    addi    r5, r8, 0
+
+    ;# no vertical filter, so write the output from the first pass
+    ;#  directly into the output buffer.
+    beq-    no_vertical_filter_bypass
+
+    ;# if the second filter is not null then need to back off by 2*pitch
+    sub     r3, r3, r4
+    sub     r3, r3, r4
+
+    ;# setup counter for the number of lines that are going to be filtered
+    li      r0, 21
+
+    ;# use the stack as temporary storage
+    la      r9, 48(r1)
+    li      r5, 16
+
+no_vertical_filter_bypass:
+
+    mtctr   r0
+
+    ;# rounding added in on the multiply
+    vspltisw v10, 8
+    vspltisw v12, 3
+    vslw    v12, v10, v12       ;# 0x00000040000000400000004000000040
+
+    ;# downshift by 7 ( divide by 128 ) at the end
+    vspltish v13, 7
+
+    ;# index to the next set of vectors in the row.
+    li      r10, 16
+    li      r12, 32
+
+horizontal_loop_16x16:
+
+    lvsl    v15,  0, r3         ;# permute vector for alignment
+
+    ;# input to the filter is 21 bytes wide, output is 16 bytes.
+    ;#  the unaligned input can span three vectors.
+    lvx     v1,   0, r3
+    lvx     v2, r10, r3
+    lvx     v3, r12, r3
+
+    vperm   v8, v1, v2, v15
+    vperm   v9, v2, v3, v15     ;# v8 v9 = 21 input pixels left-justified
+
+    vsldoi  v11, v8, v9, 4
+
+    ;# set 0
+    vmsummbm v6, v4, v8, v12    ;# taps times elements
+    vmsummbm v0, v5, v11, v6
+
+    ;# set 1
+    vsldoi  v10, v8, v9, 1
+    vsldoi  v11, v8, v9, 5
+
+    vmsummbm v6, v4, v10, v12
+    vmsummbm v1, v5, v11, v6
+
+    ;# set 2
+    vsldoi  v10, v8, v9, 2
+    vsldoi  v11, v8, v9, 6
+
+    vmsummbm v6, v4, v10, v12
+    vmsummbm v2, v5, v11, v6
+
+    ;# set 3
+    vsldoi  v10, v8, v9, 3
+    vsldoi  v11, v8, v9, 7
+
+    vmsummbm v6, v4, v10, v12
+    vmsummbm v3, v5, v11, v6
+
+    vpkswus v0, v0, v1          ;# v0 = 0 4 8 C 1 5 9 D (16-bit)
+    vpkswus v1, v2, v3          ;# v1 = 2 6 A E 3 7 B F
+
+    vsrh    v0, v0, v13         ;# divide v0, v1 by 128
+    vsrh    v1, v1, v13
+
+    vpkuhus v0, v0, v1          ;# v0 = scrambled 8-bit result
+    vperm   v0, v0, v0, v14     ;# v0 = correctly-ordered result
+
+    stvx    v0,  0, r9
+    add     r9, r9, r5
+
+    add     r3, r3, r4
+
+    bdnz    horizontal_loop_16x16
+
+    ;# check again to see if vertical filter needs to be done.
+    cmpi    cr0, r6, 0
+    beq     cr0, end_16x16
+
+    ;# yes there is, so go to the second pass
+    b       second_pass_16x16
+
+copy_horizontal_16x21:
+    li      r10, 21
+    mtctr   r10
+
+    li      r10, 16
+
+    sub     r3, r3, r4
+    sub     r3, r3, r4
+
+    ;# this is done above if there is a horizontal filter;
+    ;#  if not, it needs to be done down here.
+    slwi    r6, r6, 4           ;# index into vertical filter array
+
+    ;# always write to the stack when doing a horizontal copy
+    la      r9, 48(r1)
+
+copy_horizontal_loop_16x21:
+    lvsl    v15,  0, r3         ;# permute vector for alignment
+
+    lvx     v1,   0, r3
+    lvx     v2, r10, r3
+
+    vperm   v8, v1, v2, v15
+
+    stvx    v8,  0, r9
+    addi    r9, r9, 16
+
+    add     r3, r3, r4
+
+    bdnz    copy_horizontal_loop_16x21
+
+second_pass_16x16:
+
+    ;# always read from the stack when doing a vertical filter
+    la      r9, 48(r1)
+
+    ;# downshift by 7 ( divide by 128 ) at the end
+    vspltish v7, 7
+
+    vpre_load
+
+    luma_vsix
+    luma_vsix
+    luma_vfour
+
+end_16x16:
+
+    addi    r1, r1, 416         ;# recover stack
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+    .data
+
+    .align 4
+HFilter:
+    .byte     0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0
+    .byte     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
+    .byte     0, -6,123, 12,  0, -6,123, 12,  0, -6,123, 12,  0, -6,123, 12
+    .byte    -1,  0,  0,  0, -1,  0,  0,  0, -1,  0,  0,  0, -1,  0,  0,  0
+    .byte     2,-11,108, 36,  2,-11,108, 36,  2,-11,108, 36,  2,-11,108, 36
+    .byte    -8,  1,  0,  0, -8,  1,  0,  0, -8,  1,  0,  0, -8,  1,  0,  0
+    .byte     0, -9, 93, 50,  0, -9, 93, 50,  0, -9, 93, 50,  0, -9, 93, 50
+    .byte    -6,  0,  0,  0, -6,  0,  0,  0, -6,  0,  0,  0, -6,  0,  0,  0
+    .byte     3,-16, 77, 77,  3,-16, 77, 77,  3,-16, 77, 77,  3,-16, 77, 77
+    .byte   -16,  3,  0,  0,-16,  3,  0,  0,-16,  3,  0,  0,-16,  3,  0,  0
+    .byte     0, -6, 50, 93,  0, -6, 50, 93,  0, -6, 50, 93,  0, -6, 50, 93
+    .byte    -9,  0,  0,  0, -9,  0,  0,  0, -9,  0,  0,  0, -9,  0,  0,  0
+    .byte     1, -8, 36,108,  1, -8, 36,108,  1, -8, 36,108,  1, -8, 36,108
+    .byte   -11,  2,  0,  0,-11,  2,  0,  0,-11,  2,  0,  0,-11,  2,  0,  0
+    .byte     0, -1, 12,123,  0, -1, 12,123,  0, -1, 12,123,  0, -1, 12,123
+    .byte    -6,  0,  0,  0, -6,  0,  0,  0, -6,  0,  0,  0, -6,  0,  0,  0
+
+    .align 4
+VFilter:
+    .byte     0,  0,128,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
+    .byte     0,  6,123, 12,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
+    .byte     2, 11,108, 36,  8,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
+    .byte     0,  9, 93, 50,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
+    .byte     3, 16, 77, 77, 16,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
+    .byte     0,  6, 50, 93,  9,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
+    .byte     1,  8, 36,108, 11,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
+    .byte     0,  1, 12,123,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
+
+    .align 4
+b_hperm:
+    .byte     0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
+
+    .align 4
+B_0123:
+    .byte     0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
+
+    .align 4
+B_4567:
+    .byte     4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
+
+    .align 4
+B_89AB:
+    .byte     8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+
+    .align 4
+b_hilo:
+    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
+
+    .align 4
+b_hilo_4x4:
+    .byte     0,  1,  2,  3, 16, 17, 18, 19,  0,  0,  0,  0,  0,  0,  0,  0
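
The HFilter/VFilter tables above hold 128-scaled 6-tap coefficients
replicated across lanes (the horizontal table splits each row into
positive and negated taps for the two vmsummbm passes).  Per output
pixel the arithmetic reduces to the scalar model below, a sketch under
the rounding rules the comments describe: +64 folded in before the
shift, and "divide by 128" as a final shift right by 7.  The vector
code synthesizes the 64 as 8 << 3 because vector splat immediates top
out at 15:

    /* scalar model of one 6-tap output pixel; taps[] is one row of
     * the filter bank and always sums to 128 */
    static unsigned char sixtap_pixel(const unsigned char *src,
                                      const int taps[6]) {
      /* src points two samples before the output position */
      int i, sum = 64;              /* rounding term */
      for (i = 0; i < 6; i++)
        sum += src[i] * taps[i];
      sum >>= 7;                    /* divide by 128 */
      if (sum < 0) sum = 0;         /* saturate, as vpkuhus does */
      if (sum > 255) sum = 255;
      return (unsigned char)sum;
    }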
--- /dev/null
+++ b/vp9/common/ppc/filter_bilinear_altivec.asm
@@ -1,0 +1,677 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    .globl bilinear_predict4x4_ppc
+    .globl bilinear_predict8x4_ppc
+    .globl bilinear_predict8x8_ppc
+    .globl bilinear_predict16x16_ppc
+
+.macro load_c V, LABEL, OFF, R0, R1
+    lis     \R0, \LABEL@ha
+    la      \R1, \LABEL@l(\R0)
+    lvx     \V, \OFF, \R1
+.endm
+
+.macro load_vfilter V0, V1
+    load_c \V0, vfilter_b, r6, r9, r10
+
+    addi    r6,  r6, 16
+    lvx     \V1, r6, r10
+.endm
+
+.macro HProlog jump_label
+    ;# load up horizontal filter
+    slwi.   r5, r5, 4           ;# index into horizontal filter array
+
+    ;# index to the next set of vectors in the row.
+    li      r10, 16
+    li      r12, 32
+
+    ;# downshift by 7 ( divide by 128 ) at the end
+    vspltish v19, 7
+
+    ;# If there is no horizontal filtering to do, skip straight
+    ;#  to the second pass.
+    beq     \jump_label
+
+    load_c v20, hfilter_b, r5, r9, r0
+
+    ;# setup constants
+    ;# v14 permutation value for alignment
+    load_c v28, b_hperm_b, 0, r9, r0
+
+    ;# rounding added in on the multiply
+    vspltisw v21, 8
+    vspltisw v18, 3
+    vslw    v18, v21, v18       ;# 0x00000040000000400000004000000040
+
+    slwi.   r6, r6, 5           ;# index into vertical filter array
+.endm
+
+;# Filters a horizontal line
+;# expects:
+;#  r3  src_ptr
+;#  r4  pitch
+;#  r10 16
+;#  r12 32
+;#  v17 perm input
+;#  v18 rounding
+;#  v19 shift
+;#  v20 filter taps
+;#  v21 tmp
+;#  v22 tmp
+;#  v23 tmp
+;#  v24 tmp
+;#  v25 tmp
+;#  v26 tmp
+;#  v27 tmp
+;#  v28 perm output
+;#
+.macro HFilter V
+    vperm   v24, v21, v21, v10  ;# v24 = 0123 1234 2345 3456
+    vperm   v25, v21, v21, v11  ;# v25 = 4567 5678 6789 789A
+
+    vmsummbm v24, v20, v24, v18
+    vmsummbm v25, v20, v25, v18
+
+    vpkswus v24, v24, v25       ;# v24 = 0 1 2 3 4 5 6 7 (16-bit)
+
+    vsrh    v24, v24, v19       ;# divide v24 by 128
+
+    vpkuhus \V, v24, v24        ;# \V = 8-bit result, duplicated
+.endm
+
+.macro hfilter_8 V, increment_counter
+    lvsl    v17,  0, r3         ;# permute vector for alignment
+
+    ;# input to filter is 9 bytes wide, output is 8 bytes.
+    lvx     v21,   0, r3
+    lvx     v22, r10, r3
+
+.if \increment_counter
+    add     r3, r3, r4
+.endif
+    vperm   v21, v21, v22, v17
+
+    HFilter \V
+.endm
+
+
+.macro load_and_align_8 V, increment_counter
+    lvsl    v17,  0, r3         ;# permute vector for alignment
+
+    ;# load and left-justify one row; the unaligned input can
+    ;#  span two vectors.
+    lvx     v21,   0, r3
+    lvx     v22, r10, r3
+
+.if \increment_counter
+    add     r3, r3, r4
+.endif
+
+    vperm   \V, v21, v22, v17
+.endm
+
+.macro write_aligned_8 V, increment_counter
+    stvx    \V,  0, r7
+
+.if \increment_counter
+    add     r7, r7, r8
+.endif
+.endm
+
+.macro vfilter_16 P0 P1
+    vmuleub v22, \P0, v20       ;# 64 + 4 positive taps
+    vadduhm v22, v18, v22
+    vmuloub v23, \P0, v20
+    vadduhm v23, v18, v23
+
+    vmuleub v24, \P1, v21
+    vadduhm v22, v22, v24       ;# Re = evens, saturation unnecessary
+    vmuloub v25, \P1, v21
+    vadduhm v23, v23, v25       ;# Ro = odds
+
+    vsrh    v22, v22, v19       ;# divide by 128
+    vsrh    v23, v23, v19       ;# v22 v23 = evens, odds
+    vmrghh  \P0, v22, v23       ;# \P0 v23 = 16-bit result in order
+    vmrglh  v23, v22, v23
+    vpkuhus \P0, \P0, v23       ;# P0 = 8-bit result
+.endm
+
+
+.macro w_8x8 V, D, R, P
+    stvx    \V, 0, r1
+    lwz     \R, 0(r1)
+    stw     \R, 0(r7)
+    lwz     \R, 4(r1)
+    stw     \R, 4(r7)
+    add     \D, \D, \P
+.endm
+
+
+    .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+bilinear_predict4x4_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xf830
+    ori     r12, r12, 0xfff8
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1,-32(r1)          ;# create space on the stack
+
+    HProlog second_pass_4x4_pre_copy_b
+
+    ;# Load up permutation constants
+    load_c v10, b_0123_b, 0, r9, r12
+    load_c v11, b_4567_b, 0, r9, r12
+
+    hfilter_8 v0, 1
+    hfilter_8 v1, 1
+    hfilter_8 v2, 1
+    hfilter_8 v3, 1
+
+    ;# Finished filtering main horizontal block.  If there is no
+    ;#  vertical filtering, jump to storing the data.  Otherwise
+    ;#  load up and filter the additional line that is needed
+    ;#  for the vertical filter.
+    beq     store_out_4x4_b
+
+    hfilter_8 v4, 0
+
+    b   second_pass_4x4_b
+
+second_pass_4x4_pre_copy_b:
+    slwi    r6, r6, 5           ;# index into vertical filter array
+
+    load_and_align_8  v0, 1
+    load_and_align_8  v1, 1
+    load_and_align_8  v2, 1
+    load_and_align_8  v3, 1
+    load_and_align_8  v4, 1
+
+second_pass_4x4_b:
+    vspltish v20, 8
+    vspltish v18, 3
+    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+    load_vfilter v20, v21
+
+    vfilter_16 v0,  v1
+    vfilter_16 v1,  v2
+    vfilter_16 v2,  v3
+    vfilter_16 v3,  v4
+
+store_out_4x4_b:
+
+    stvx    v0, 0, r1
+    lwz     r0, 0(r1)
+    stw     r0, 0(r7)
+    add     r7, r7, r8
+
+    stvx    v1, 0, r1
+    lwz     r0, 0(r1)
+    stw     r0, 0(r7)
+    add     r7, r7, r8
+
+    stvx    v2, 0, r1
+    lwz     r0, 0(r1)
+    stw     r0, 0(r7)
+    add     r7, r7, r8
+
+    stvx    v3, 0, r1
+    lwz     r0, 0(r1)
+    stw     r0, 0(r7)
+
+exit_4x4:
+
+    addi    r1, r1, 32          ;# recover stack
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+    .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+bilinear_predict8x4_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xf830
+    ori     r12, r12, 0xfff8
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1,-32(r1)          ;# create space on the stack
+
+    HProlog second_pass_8x4_pre_copy_b
+
+    ;# Load up permutation constants
+    load_c v10, b_0123_b, 0, r9, r12
+    load_c v11, b_4567_b, 0, r9, r12
+
+    hfilter_8 v0, 1
+    hfilter_8 v1, 1
+    hfilter_8 v2, 1
+    hfilter_8 v3, 1
+
+    ;# Finished filtering main horizontal block.  If there is no
+    ;#  vertical filtering, jump to storing the data.  Otherwise
+    ;#  load up and filter the additional line that is needed
+    ;#  for the vertical filter.
+    beq     store_out_8x4_b
+
+    hfilter_8 v4, 0
+
+    b   second_pass_8x4_b
+
+second_pass_8x4_pre_copy_b:
+    slwi    r6, r6, 5           ;# index into vertical filter array
+
+    load_and_align_8  v0, 1
+    load_and_align_8  v1, 1
+    load_and_align_8  v2, 1
+    load_and_align_8  v3, 1
+    load_and_align_8  v4, 1
+
+second_pass_8x4_b:
+    vspltish v20, 8
+    vspltish v18, 3
+    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+    load_vfilter v20, v21
+
+    vfilter_16 v0,  v1
+    vfilter_16 v1,  v2
+    vfilter_16 v2,  v3
+    vfilter_16 v3,  v4
+
+store_out_8x4_b:
+
+    cmpi    cr0, r8, 8
+    beq     cr0, store_aligned_8x4_b
+
+    w_8x8   v0, r7, r0, r8
+    w_8x8   v1, r7, r0, r8
+    w_8x8   v2, r7, r0, r8
+    w_8x8   v3, r7, r0, r8
+
+    b       exit_8x4
+
+store_aligned_8x4_b:
+    load_c v10, b_hilo_b, 0, r9, r10
+
+    vperm   v0, v0, v1, v10
+    vperm   v2, v2, v3, v10
+
+    stvx    v0, 0, r7
+    addi    r7, r7, 16
+    stvx    v2, 0, r7
+
+exit_8x4:
+
+    addi    r1, r1, 32          ;# recover stack
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+    .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+bilinear_predict8x8_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xfff0
+    ori     r12, r12, 0xffff
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1,-32(r1)          ;# create space on the stack
+
+    HProlog second_pass_8x8_pre_copy_b
+
+    ;# Load up permutation constants
+    load_c v10, b_0123_b, 0, r9, r12
+    load_c v11, b_4567_b, 0, r9, r12
+
+    hfilter_8 v0, 1
+    hfilter_8 v1, 1
+    hfilter_8 v2, 1
+    hfilter_8 v3, 1
+    hfilter_8 v4, 1
+    hfilter_8 v5, 1
+    hfilter_8 v6, 1
+    hfilter_8 v7, 1
+
+    ;# Finished filtering main horizontal block.  If there is no
+    ;#  vertical filtering, jump to storing the data.  Otherwise
+    ;#  load up and filter the additional line that is needed
+    ;#  for the vertical filter.
+    beq     store_out_8x8_b
+
+    hfilter_8 v8, 0
+
+    b   second_pass_8x8_b
+
+second_pass_8x8_pre_copy_b:
+    slwi    r6, r6, 5           ;# index into vertical filter array
+
+    load_and_align_8  v0, 1
+    load_and_align_8  v1, 1
+    load_and_align_8  v2, 1
+    load_and_align_8  v3, 1
+    load_and_align_8  v4, 1
+    load_and_align_8  v5, 1
+    load_and_align_8  v6, 1
+    load_and_align_8  v7, 1
+    load_and_align_8  v8, 0
+
+second_pass_8x8_b:
+    vspltish v20, 8
+    vspltish v18, 3
+    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+    load_vfilter v20, v21
+
+    vfilter_16 v0,  v1
+    vfilter_16 v1,  v2
+    vfilter_16 v2,  v3
+    vfilter_16 v3,  v4
+    vfilter_16 v4,  v5
+    vfilter_16 v5,  v6
+    vfilter_16 v6,  v7
+    vfilter_16 v7,  v8
+
+store_out_8x8_b:
+
+    cmpi    cr0, r8, 8
+    beq     cr0, store_aligned_8x8_b
+
+    w_8x8   v0, r7, r0, r8
+    w_8x8   v1, r7, r0, r8
+    w_8x8   v2, r7, r0, r8
+    w_8x8   v3, r7, r0, r8
+    w_8x8   v4, r7, r0, r8
+    w_8x8   v5, r7, r0, r8
+    w_8x8   v6, r7, r0, r8
+    w_8x8   v7, r7, r0, r8
+
+    b       exit_8x8
+
+store_aligned_8x8_b:
+    load_c v10, b_hilo_b, 0, r9, r10
+
+    vperm   v0, v0, v1, v10
+    vperm   v2, v2, v3, v10
+    vperm   v4, v4, v5, v10
+    vperm   v6, v6, v7, v10
+
+    stvx    v0, 0, r7
+    addi    r7, r7, 16
+    stvx    v2, 0, r7
+    addi    r7, r7, 16
+    stvx    v4, 0, r7
+    addi    r7, r7, 16
+    stvx    v6, 0, r7
+
+exit_8x8:
+
+    addi    r1, r1, 32          ;# recover stack
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+;# Filters a horizontal line
+;# expects:
+;#  r3  src_ptr
+;#  r4  pitch
+;#  r10 16
+;#  r12 32
+;#  v17 perm input
+;#  v18 rounding
+;#  v19 shift
+;#  v20 filter taps
+;#  v21 tmp
+;#  v22 tmp
+;#  v23 tmp
+;#  v24 tmp
+;#  v25 tmp
+;#  v26 tmp
+;#  v27 tmp
+;#  v28 perm output
+;#
+.macro hfilter_16 V, increment_counter
+
+    lvsl    v17,  0, r3         ;# permute vector for alignment
+
+    ;# input to the filter is 21 bytes wide, output is 16 bytes.
+    ;#  the unaligned input can span three vectors.
+    lvx     v21,   0, r3
+    lvx     v22, r10, r3
+    lvx     v23, r12, r3
+
+.if \increment_counter
+    add     r3, r3, r4
+.endif
+    vperm   v21, v21, v22, v17
+    vperm   v22, v22, v23, v17  ;# v21 v22 = 21 input pixels left-justified
+
+    ;# set 0
+    vmsummbm v24, v20, v21, v18 ;# taps times elements
+
+    ;# set 1
+    vsldoi  v23, v21, v22, 1
+    vmsummbm v25, v20, v23, v18
+
+    ;# set 2
+    vsldoi  v23, v21, v22, 2
+    vmsummbm v26, v20, v23, v18
+
+    ;# set 3
+    vsldoi  v23, v21, v22, 3
+    vmsummbm v27, v20, v23, v18
+
+    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
+    vpkswus v25, v26, v27       ;# v25 = 2 6 A E 3 7 B F
+
+    vsrh    v24, v24, v19       ;# divide v24, v25 by 128
+    vsrh    v25, v25, v19
+
+    vpkuhus \V, v24, v25        ;# \V = scrambled 8-bit result
+    vperm   \V, \V, v0, v28     ;# \V = correctly-ordered result
+.endm
+
+.macro load_and_align_16 V, increment_counter
+    lvsl    v17,  0, r3         ;# permute vector for alignment
+
+    ;# load and left-justify one 16-byte row; the unaligned input
+    ;#  can span two vectors.
+    lvx     v21,   0, r3
+    lvx     v22, r10, r3
+
+.if \increment_counter
+    add     r3, r3, r4
+.endif
+
+    vperm   \V, v21, v22, v17
+.endm
+
+.macro write_16 V, increment_counter
+    stvx    \V,  0, r7
+
+.if \increment_counter
+    add     r7, r7, r8
+.endif
+.endm
+
+    .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+bilinear_predict16x16_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffff
+    ori     r12, r12, 0xfff8
+    mtspr   256, r12            ;# set VRSAVE
+
+    HProlog second_pass_16x16_pre_copy_b
+
+    hfilter_16 v0,  1
+    hfilter_16 v1,  1
+    hfilter_16 v2,  1
+    hfilter_16 v3,  1
+    hfilter_16 v4,  1
+    hfilter_16 v5,  1
+    hfilter_16 v6,  1
+    hfilter_16 v7,  1
+    hfilter_16 v8,  1
+    hfilter_16 v9,  1
+    hfilter_16 v10, 1
+    hfilter_16 v11, 1
+    hfilter_16 v12, 1
+    hfilter_16 v13, 1
+    hfilter_16 v14, 1
+    hfilter_16 v15, 1
+
+    ;# Finished filtering main horizontal block.  If there is no
+    ;#  vertical filtering, jump to storing the data.  Otherwise
+    ;#  load up and filter the additional line that is needed
+    ;#  for the vertical filter.
+    beq     store_out_16x16_b
+
+    hfilter_16 v16, 0
+
+    b   second_pass_16x16_b
+
+second_pass_16x16_pre_copy_b:
+    slwi    r6, r6, 5           ;# index into vertical filter array
+
+    load_and_align_16  v0,  1
+    load_and_align_16  v1,  1
+    load_and_align_16  v2,  1
+    load_and_align_16  v3,  1
+    load_and_align_16  v4,  1
+    load_and_align_16  v5,  1
+    load_and_align_16  v6,  1
+    load_and_align_16  v7,  1
+    load_and_align_16  v8,  1
+    load_and_align_16  v9,  1
+    load_and_align_16  v10, 1
+    load_and_align_16  v11, 1
+    load_and_align_16  v12, 1
+    load_and_align_16  v13, 1
+    load_and_align_16  v14, 1
+    load_and_align_16  v15, 1
+    load_and_align_16  v16, 0
+
+second_pass_16x16_b:
+    vspltish v20, 8
+    vspltish v18, 3
+    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+    load_vfilter v20, v21
+
+    vfilter_16 v0,  v1
+    vfilter_16 v1,  v2
+    vfilter_16 v2,  v3
+    vfilter_16 v3,  v4
+    vfilter_16 v4,  v5
+    vfilter_16 v5,  v6
+    vfilter_16 v6,  v7
+    vfilter_16 v7,  v8
+    vfilter_16 v8,  v9
+    vfilter_16 v9,  v10
+    vfilter_16 v10, v11
+    vfilter_16 v11, v12
+    vfilter_16 v12, v13
+    vfilter_16 v13, v14
+    vfilter_16 v14, v15
+    vfilter_16 v15, v16
+
+store_out_16x16_b:
+
+    write_16 v0,  1
+    write_16 v1,  1
+    write_16 v2,  1
+    write_16 v3,  1
+    write_16 v4,  1
+    write_16 v5,  1
+    write_16 v6,  1
+    write_16 v7,  1
+    write_16 v8,  1
+    write_16 v9,  1
+    write_16 v10, 1
+    write_16 v11, 1
+    write_16 v12, 1
+    write_16 v13, 1
+    write_16 v14, 1
+    write_16 v15, 0
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+    .data
+
+    .align 4
+hfilter_b:
+    .byte   128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0
+    .byte   112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0
+    .byte    96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0
+    .byte    80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0
+    .byte    64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0
+    .byte    48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0
+    .byte    32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0
+    .byte    16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0
+
+    .align 4
+vfilter_b:
+    .byte   128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
+    .byte     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
+    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
+    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
+    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
+    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
+    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
+    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
+    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
+    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
+    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
+    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
+
+    .align 4
+b_hperm_b:
+    .byte     0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
+
+    .align 4
+b_0123_b:
+    .byte     0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
+
+    .align 4
+b_4567_b:
+    .byte     4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
+
+b_hilo_b:
+    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
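
Each weight pair in hfilter_b/vfilter_b sums to 128 (for example 112/16
at sub-pel offset 1), so the bilinear output is a two-tap weighted
average with the same +64, shift-right-by-7 rounding as the 6-tap code.
A scalar model (a sketch, not part of the patch):

    /* one bilinear output pixel for sub-pel offset k in [0, 8) */
    static unsigned char bilinear_pixel(unsigned char a, unsigned char b,
                                        int k) {
      const int w0 = 128 - (k << 4);    /* e.g. k == 1 -> 112 */
      const int w1 = k << 4;            /*                16  */
      return (unsigned char)((a * w0 + b * w1 + 64) >> 7);
    }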
--- /dev/null
+++ b/vp9/common/ppc/idctllm_altivec.asm
@@ -1,0 +1,189 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    .globl short_idct4x4llm_ppc
+
+.macro load_c V, LABEL, OFF, R0, R1
+    lis     \R0, \LABEL@ha
+    la      \R1, \LABEL@l(\R0)
+    lvx     \V, \OFF, \R1
+.endm
+
+;# r3 short *input
+;# r4 short *output
+;# r5 int pitch
+    .align 2
+short_idct4x4llm_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xfff8
+    mtspr   256, r12            ;# set VRSAVE
+
+    load_c v8, sinpi8sqrt2, 0, r9, r10
+    load_c v9, cospi8sqrt2minus1, 0, r9, r10
+    load_c v10, hi_hi, 0, r9, r10
+    load_c v11, lo_lo, 0, r9, r10
+    load_c v12, shift_16, 0, r9, r10
+
+    li      r10,  16
+    lvx     v0,   0, r3         ;# input ip[0], ip[ 4]
+    lvx     v1, r10, r3         ;# input ip[8], ip[12]
+
+    ;# first pass
+    vupkhsh v2, v0
+    vupkhsh v3, v1
+    vaddsws v6, v2, v3          ;# a1 = ip[0]+ip[8]
+    vsubsws v7, v2, v3          ;# b1 = ip[0]-ip[8]
+
+    vupklsh v0, v0
+    vmulosh v4, v0, v8
+    vsraw   v4, v4, v12
+    vaddsws v4, v4, v0          ;# ip[ 4] * sin(pi/8) * sqrt(2)
+
+    vupklsh v1, v1
+    vmulosh v5, v1, v9
+    vsraw   v5, v5, v12         ;# ip[12] * cos(pi/8) * sqrt(2)
+    vaddsws v5, v5, v1
+
+    vsubsws v4, v4, v5          ;# c1
+
+    vmulosh v3, v1, v8
+    vsraw   v3, v3, v12
+    vaddsws v3, v3, v1          ;# ip[12] * sin(pi/8) * sqrt(2)
+
+    vmulosh v5, v0, v9
+    vsraw   v5, v5, v12         ;# ip[ 4] * cos(pi/8) * sqrt(2)
+    vaddsws v5, v5, v0
+
+    vaddsws v3, v3, v5          ;# d1
+
+    vaddsws v0, v6, v3          ;# a1 + d1
+    vsubsws v3, v6, v3          ;# a1 - d1
+
+    vaddsws v1, v7, v4          ;# b1 + c1
+    vsubsws v2, v7, v4          ;# b1 - c1
+
+    ;# transpose input
+    vmrghw  v4, v0, v1          ;# a0 b0 a1 b1
+    vmrghw  v5, v2, v3          ;# c0 d0 c1 d1
+
+    vmrglw  v6, v0, v1          ;# a2 b2 a3 b3
+    vmrglw  v7, v2, v3          ;# c2 d2 c3 d3
+
+    vperm   v0, v4, v5, v10     ;# a0 b0 c0 d0
+    vperm   v1, v4, v5, v11     ;# a1 b1 c1 d1
+
+    vperm   v2, v6, v7, v10     ;# a2 b2 c2 d2
+    vperm   v3, v6, v7, v11     ;# a3 b3 c3 d3
+
+    ;# second pass
+    vaddsws v6, v0, v2          ;# a1 = ip[0]+ip[8]
+    vsubsws v7, v0, v2          ;# b1 = ip[0]-ip[8]
+
+    vmulosh v4, v1, v8
+    vsraw   v4, v4, v12
+    vaddsws v4, v4, v1          ;# ip[ 4] * sin(pi/8) * sqrt(2)
+
+    vmulosh v5, v3, v9
+    vsraw   v5, v5, v12         ;# ip[12] * cos(pi/8) * sqrt(2)
+    vaddsws v5, v5, v3
+
+    vsubsws v4, v4, v5          ;# c1
+
+    vmulosh v2, v3, v8
+    vsraw   v2, v2, v12
+    vaddsws v2, v2, v3          ;# ip[12] * sin(pi/8) * sqrt(2)
+
+    vmulosh v5, v1, v9
+    vsraw   v5, v5, v12         ;# ip[ 4] * cos(pi/8) * sqrt(2)
+    vaddsws v5, v5, v1
+
+    vaddsws v3, v2, v5          ;# d1
+
+    vaddsws v0, v6, v3          ;# a1 + d1
+    vsubsws v3, v6, v3          ;# a1 - d1
+
+    vaddsws v1, v7, v4          ;# b1 + c1
+    vsubsws v2, v7, v4          ;# b1 - c1
+
+    vspltish v6, 4
+    vspltish v7, 3
+
+    vpkswss v0, v0, v1
+    vpkswss v1, v2, v3
+
+    vaddshs v0, v0, v6
+    vaddshs v1, v1, v6
+
+    vsrah   v0, v0, v7
+    vsrah   v1, v1, v7
+
+    ;# transpose output
+    vmrghh  v2, v0, v1          ;# a0 c0 a1 c1 a2 c2 a3 c3
+    vmrglh  v3, v0, v1          ;# b0 d0 b1 d1 b2 d2 b3 d3
+
+    vmrghh  v0, v2, v3          ;# a0 b0 c0 d0 a1 b1 c1 d1
+    vmrglh  v1, v2, v3          ;# a2 b2 c2 d2 a3 b3 c3 d3
+
+    stwu    r1,-416(r1)         ;# create space on the stack
+
+    stvx    v0,  0, r1
+    lwz     r6, 0(r1)
+    stw     r6, 0(r4)
+    lwz     r6, 4(r1)
+    stw     r6, 4(r4)
+
+    add     r4, r4, r5
+
+    lwz     r6,  8(r1)
+    stw     r6,  0(r4)
+    lwz     r6, 12(r1)
+    stw     r6,  4(r4)
+
+    add     r4, r4, r5
+
+    stvx    v1,  0, r1
+    lwz     r6, 0(r1)
+    stw     r6, 0(r4)
+    lwz     r6, 4(r1)
+    stw     r6, 4(r4)
+
+    add     r4, r4, r5
+
+    lwz     r6,  8(r1)
+    stw     r6,  0(r4)
+    lwz     r6, 12(r1)
+    stw     r6,  4(r4)
+
+    addi    r1, r1, 416         ;# recover stack
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+    .align 4
+sinpi8sqrt2:
+    .short  35468, 35468, 35468, 35468, 35468, 35468, 35468, 35468
+
+    .align 4
+cospi8sqrt2minus1:
+    .short  20091, 20091, 20091, 20091, 20091, 20091, 20091, 20091
+
+    .align 4
+shift_16:
+    .long      16,    16,    16,    16
+
+    .align 4
+hi_hi:
+    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
+
+    .align 4
+lo_lo:
+    .byte     8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
--- /dev/null
+++ b/vp9/common/ppc/loopfilter_altivec.c
@@ -1,0 +1,127 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "loopfilter.h"
+#include "onyxc_int.h"
+
+typedef void loop_filter_function_y_ppc
+(
+  unsigned char *s,   // source pointer
+  int p,              // pitch
+  const signed char *flimit,
+  const signed char *limit,
+  const signed char *thresh
+);
+
+typedef void loop_filter_function_uv_ppc
+(
+  unsigned char *u,   // source pointer
+  unsigned char *v,   // source pointer
+  int p,              // pitch
+  const signed char *flimit,
+  const signed char *limit,
+  const signed char *thresh
+);
+
+typedef void loop_filter_function_s_ppc
+(
+  unsigned char *s,   // source pointer
+  int p,              // pitch
+  const signed char *flimit
+);
+
+loop_filter_function_y_ppc mbloop_filter_horizontal_edge_y_ppc;
+loop_filter_function_y_ppc mbloop_filter_vertical_edge_y_ppc;
+loop_filter_function_y_ppc loop_filter_horizontal_edge_y_ppc;
+loop_filter_function_y_ppc loop_filter_vertical_edge_y_ppc;
+
+loop_filter_function_uv_ppc mbloop_filter_horizontal_edge_uv_ppc;
+loop_filter_function_uv_ppc mbloop_filter_vertical_edge_uv_ppc;
+loop_filter_function_uv_ppc loop_filter_horizontal_edge_uv_ppc;
+loop_filter_function_uv_ppc loop_filter_vertical_edge_uv_ppc;
+
+loop_filter_function_s_ppc loop_filter_simple_horizontal_edge_ppc;
+loop_filter_function_s_ppc loop_filter_simple_vertical_edge_ppc;
+
+// Horizontal MB filtering
+void loop_filter_mbh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                         int y_stride, int uv_stride, loop_filter_info *lfi) {
+  mbloop_filter_horizontal_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
+
+  if (u_ptr)
+    mbloop_filter_horizontal_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
+}
+
+void loop_filter_mbhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                          int y_stride, int uv_stride, loop_filter_info *lfi) {
+  (void)u_ptr;
+  (void)v_ptr;
+  (void)uv_stride;
+  loop_filter_simple_horizontal_edge_ppc(y_ptr, y_stride, lfi->mbflim);
+}
+
+// Vertical MB Filtering
+void loop_filter_mbv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                         int y_stride, int uv_stride, loop_filter_info *lfi) {
+  mbloop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
+
+  if (u_ptr)
+    mbloop_filter_vertical_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
+}
+
+void loop_filter_mbvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                          int y_stride, int uv_stride, loop_filter_info *lfi) {
+  (void)u_ptr;
+  (void)v_ptr;
+  (void)uv_stride;
+  loop_filter_simple_vertical_edge_ppc(y_ptr, y_stride, lfi->mbflim);
+}
+
+// Horizontal B Filtering
+void loop_filter_bh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                        int y_stride, int uv_stride, loop_filter_info *lfi) {
+  // These should all be done at once with one call, instead of 3
+  loop_filter_horizontal_edge_y_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
+  loop_filter_horizontal_edge_y_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
+  loop_filter_horizontal_edge_y_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
+
+  if (u_ptr)
+    loop_filter_horizontal_edge_uv_ppc(u_ptr + 4 * uv_stride, v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr);
+}
+
+void loop_filter_bhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                         int y_stride, int uv_stride, loop_filter_info *lfi) {
+  (void)u_ptr;
+  (void)v_ptr;
+  (void)uv_stride;
+  loop_filter_simple_horizontal_edge_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim);
+  loop_filter_simple_horizontal_edge_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim);
+  loop_filter_simple_horizontal_edge_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim);
+}
+
+// Vertical B Filtering
+void loop_filter_bv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                        int y_stride, int uv_stride, loop_filter_info *lfi) {
+  loop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->flim, lfi->lim, lfi->thr);
+
+  if (u_ptr)
+    loop_filter_vertical_edge_uv_ppc(u_ptr + 4, v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr);
+}
+
+void loop_filter_bvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                         int y_stride, int uv_stride, loop_filter_info *lfi) {
+  (void)u_ptr;
+  (void)v_ptr;
+  (void)uv_stride;
+  loop_filter_simple_vertical_edge_ppc(y_ptr + 4,  y_stride, lfi->flim);
+  loop_filter_simple_vertical_edge_ppc(y_ptr + 8,  y_stride, lfi->flim);
+  loop_filter_simple_vertical_edge_ppc(y_ptr + 12, y_stride, lfi->flim);
+}
--- /dev/null
+++ b/vp9/common/ppc/loopfilter_filters_altivec.asm
@@ -1,0 +1,1253 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    .globl mbloop_filter_horizontal_edge_y_ppc
+    .globl loop_filter_horizontal_edge_y_ppc
+    .globl mbloop_filter_vertical_edge_y_ppc
+    .globl loop_filter_vertical_edge_y_ppc
+
+    .globl mbloop_filter_horizontal_edge_uv_ppc
+    .globl loop_filter_horizontal_edge_uv_ppc
+    .globl mbloop_filter_vertical_edge_uv_ppc
+    .globl loop_filter_vertical_edge_uv_ppc
+
+    .globl loop_filter_simple_horizontal_edge_ppc
+    .globl loop_filter_simple_vertical_edge_ppc
+
+    .text
+;# We often need to perform transposes (and other transpose-like operations)
+;#   on matrices of data.  This is simplified by the fact that we usually
+;#   operate on hunks of data whose dimensions are powers of 2, or at least
+;#   divisible by highish powers of 2.
+;#
+;#   These operations can be very confusing.  They become more straightforward
+;#   when we think of them as permutations of address bits: Concatenate a
+;#   group of vector registers and think of it as occupying a block of
+;#   memory beginning at address zero.  The low four bits 0...3 of the
+;#   address then correspond to position within a register, the higher-order
+;#   address bits select the register.
+;#
+;#   Although register selection, at the code level, is arbitrary, things
+;#   are simpler if we use contiguous ranges of register numbers, simpler
+;#   still if the low-order bits of the register number correspond to
+;#   conceptual address bits.  We do this whenever reasonable.
+;#
+;#   A 16x16 transpose can then be thought of as an operation on
+;#   a 256-element block of memory.  It takes 8 bits 0...7 to address this
+;#   memory and the effect of a transpose is to interchange address bit
+;#   0 with 4, 1 with 5, 2 with 6, and 3 with 7.  Bits 0...3 index the
+;#   column, which is interchanged with the row addressed by bits 4..7.
+;#
+;#   The altivec merge instructions provide a rapid means of effecting
+;#   many of these transforms.  They operate at three widths (8,16,32).
+;#   Writing V(x) for vector register #x, paired merges permute address
+;#   indices as follows.
+;#
+;#   0->1  1->2  2->3  3->(4+d)  (4+s)->0:
+;#
+;#      vmrghb  V( x),          V( y), V( y + (1<<s))
+;#      vmrglb  V( x + (1<<d)), V( y), V( y + (1<<s))
+;#
+;#
+;#   =0=   1->2  2->3  3->(4+d)  (4+s)->1:
+;#
+;#      vmrghh  V( x),          V( y), V( y + (1<<s))
+;#      vmrglh  V( x + (1<<d)), V( y), V( y + (1<<s))
+;#
+;#
+;#   =0=   =1=   2->3  3->(4+d)  (4+s)->2:
+;#
+;#      vmrghw  V( x),          V( y), V( y + (1<<s))
+;#      vmrglw  V( x + (1<<d)), V( y), V( y + (1<<s))
+;#
+;#
+;#   Unfortunately, there is no doubleword merge instruction.
+;#   The following sequence uses "vperm" as a substitute.
+;#   Assuming that the selection masks b_hihi and b_lolo (defined as
+;#   _B_hihi/_B_lolo in the data section at the end of this file)
+;#   are in registers Vhihi and Vlolo, we can also effect the permutation
+;#
+;#   =0=   =1=   =2=   3->(4+d)  (4+s)->3   by the sequence:
+;#
+;#      vperm   V( x),          V( y), V( y + (1<<s)), Vhihi
+;#      vperm   V( x + (1<<d)), V( y), V( y + (1<<s)), Vlolo
+;#
+;#
+;#   Except for bits s and d, the other relationships between register
+;#   number (= high-order part of address) bits are at the disposal of
+;#   the programmer.
+;#
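+;#   As a scalar sketch of this address-bit view (illustration only, not
+;#   code from the tree): a 16x16 byte transpose of in[256] is just
+;#
+;#      for (i = 0; i < 256; i++)
+;#          out[((i & 15) << 4) | (i >> 4)] = in[i];
+;#
+;#   i.e. address bits 0..3 and bits 4..7 trade places.
+;#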
+
+;# To avoid excess transposes, we filter all 3 vertical luma subblock
+;#   edges together.  This requires a single 16x16 transpose, which, in
+;#   the above language, amounts to the following permutation of address
+;#   indices:  0<->4   1<->5  2<->6  3<->7, which we accomplish by
+;#   4 iterations of the cyclic transform 0->1->2->3->4->5->6->7->0.
+;#
+;#   Except for the fact that the destination registers get written
+;#   before we are done referencing the old contents, the cyclic transform
+;#   is effected by
+;#
+;#      x = 0;  do {
+;#          vmrghb V(2x),   V(x), V(x+8);
+;#          vmrglb V(2x+1), V(x), V(x+8);
+;#      } while( ++x < 8);
+;#
+;#   For clarity, and because we can afford it, we do this transpose
+;#   using all 32 registers, alternating the banks 0..15  and  16 .. 31,
+;#   leaving the final result in 16 .. 31, as the lower registers are
+;#   used in the filtering itself.
+;#
+.macro Tpair A, B, X, Y
+    vmrghb  \A, \X, \Y
+    vmrglb  \B, \X, \Y
+.endm
+
+;# Each step takes 8*2 = 16 instructions
+
+.macro t16_even
+    Tpair v16,v17,  v0,v8
+    Tpair v18,v19,  v1,v9
+    Tpair v20,v21,  v2,v10
+    Tpair v22,v23,  v3,v11
+    Tpair v24,v25,  v4,v12
+    Tpair v26,v27,  v5,v13
+    Tpair v28,v29,  v6,v14
+    Tpair v30,v31,  v7,v15
+.endm
+
+.macro t16_odd
+    Tpair v0,v1, v16,v24
+    Tpair v2,v3, v17,v25
+    Tpair v4,v5, v18,v26
+    Tpair v6,v7, v19,v27
+    Tpair v8,v9, v20,v28
+    Tpair v10,v11, v21,v29
+    Tpair v12,v13, v22,v30
+    Tpair v14,v15, v23,v31
+.endm
+
+;# Whole transpose takes 4*16 = 64 instructions
+
+.macro t16_full
+    t16_odd
+    t16_even
+    t16_odd
+    t16_even
+.endm
+
+;# Vertical edge filtering requires transposes.  For the simple filter,
+;#   we need to convert 16 rows of 4 pels each into 4 registers of 16 pels
+;#   each.  Writing 0 ... 63 for the pixel indices, the desired result is:
+;#
+;#  v0 =  0  1 ... 14 15
+;#  v1 = 16 17 ... 30 31
+;#  v2 = 32 33 ... 46 47
+;#  v3 = 48 49 ... 62 63
+;#
+;#  In frame-buffer memory, the layout is:
+;#
+;#     0  16  32  48
+;#     1  17  33  49
+;#     ...
+;#    15  31  47  63.
+;#
+;#  We begin by reading the data 32 bits at a time (using scalar operations)
+;#  into a temporary array, reading the rows of the array into vector registers,
+;#  with the following layout:
+;#
+;#  v0 =  0 16 32 48  4 20 36 52  8 24 40 56  12 28 44 60
+;#  v1 =  1 17 33 49  5 21 ...                      45 61
+;#  v2 =  2 18 ...                                  46 62
+;#  v3 =  3 19 ...                                  47 63
+;#
+;#  From the "address-bit" perspective discussed above, we simply need to
+;#  interchange bits 0 <-> 4 and 1 <-> 5, leaving bits 2 and 3 alone.
+;#  In other words, we transpose each of the four 4x4 submatrices.
+;#
+;#  This transformation is its own inverse, and we need to perform it
+;#  again before writing the pixels back into the frame buffer.
+;#
+;#  It acts in place on registers v0...v3, uses v4...v7 as temporaries,
+;#  and assumes that v14/v15 contain the b_hihi/b_lolo selectors
+;#  defined above.  We think of both groups of 4 registers as having
+;#  "addresses" {0,1,2,3} * 16.
+;#
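+;#  A scalar model of the same permutation (illustration only): with the
+;#  64 bytes of v0...v3 indexed 0..63,
+;#
+;#      for (i = 0; i < 64; i++)
+;#          out[((i & 3) << 4) | (i & 0x0c) | (i >> 4)] = in[i];
+;#
+;#  swaps address bits 0<->4 and 1<->5 and leaves bits 2..3 alone.
+;#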
+.macro Transpose4times4x4 Vlo, Vhi
+
+    ;# d=s=0        0->1  1->2  2->3  3->4  4->0  =5=
+
+    vmrghb  v4, v0, v1
+    vmrglb  v5, v0, v1
+    vmrghb  v6, v2, v3
+    vmrglb  v7, v2, v3
+
+    ;# d=0 s=1      =0=   1->2  2->3  3->4  4->5  5->1
+
+    vmrghh  v0, v4, v6
+    vmrglh  v1, v4, v6
+    vmrghh  v2, v5, v7
+    vmrglh  v3, v5, v7
+
+    ;# d=s=0        =0=   =1=   2->3  3->4  4->2  =5=
+
+    vmrghw  v4, v0, v1
+    vmrglw  v5, v0, v1
+    vmrghw  v6, v2, v3
+    vmrglw  v7, v2, v3
+
+    ;# d=0  s=1     =0=   =1=   =2=   3->4  4->5  5->3
+
+    vperm   v0, v4, v6, \Vlo
+    vperm   v1, v4, v6, \Vhi
+    vperm   v2, v5, v7, \Vlo
+    vperm   v3, v5, v7, \Vhi
+.endm
+;# end Transpose4times4x4
+
+
+;# Normal mb vertical edge filter transpose.
+;#
+;#   We read 8 columns of data, initially in the following pattern:
+;#
+;#  (0,0)  (1,0) ... (7,0)  (0,1)  (1,1) ... (7,1)
+;#  (0,2)  (1,2) ... (7,2)  (0,3)  (1,3) ... (7,3)
+;#  ...
+;#  (0,14) (1,14) .. (7,14) (0,15) (1,15) .. (7,15)
+;#
+;#   and wish to convert to:
+;#
+;#  (0,0) ... (0,15)
+;#  (1,0) ... (1,15)
+;#  ...
+;#  (7,0) ... (7,15).
+;#
+;#  In "address bit" language, we wish to map
+;#
+;#  0->4  1->5  2->6  3->0  4->1  5->2  6->3, i.e., I -> (I+4) mod 7.
+;#
+;#  This can be accomplished by 4 iterations of the cyclic transform
+;#
+;#  I -> (I+1) mod 7;
+;#
+;#  each iteration can be realized by (d=0, s=2):
+;#
+;#  x = 0;  do  Tpair( V(2x),V(2x+1),  V(x),V(x+4))  while( ++x < 4);
+;#
+;#  The input/output is in registers v0...v7.  We use v10...v17 as mirrors;
+;#  preserving v8 = sign converter.
+;#
+;#  Inverse transpose is similar, except here I -> (I+3) mod 7 and the
+;#  result lands in the "mirror" registers v10...v17
+;#
+.macro t8x16_odd
+    Tpair v10, v11,  v0, v4
+    Tpair v12, v13,  v1, v5
+    Tpair v14, v15,  v2, v6
+    Tpair v16, v17,  v3, v7
+.endm
+
+.macro t8x16_even
+    Tpair v0, v1,  v10, v14
+    Tpair v2, v3,  v11, v15
+    Tpair v4, v5,  v12, v16
+    Tpair v6, v7,  v13, v17
+.endm
+
+.macro transpose8x16_fwd
+    t8x16_odd
+    t8x16_even
+    t8x16_odd
+    t8x16_even
+.endm
+
+.macro transpose8x16_inv
+    t8x16_odd
+    t8x16_even
+    t8x16_odd
+.endm
+
+.macro Transpose16x16
+    vmrghb  v0, v16, v24
+    vmrglb  v1, v16, v24
+    vmrghb  v2, v17, v25
+    vmrglb  v3, v17, v25
+    vmrghb  v4, v18, v26
+    vmrglb  v5, v18, v26
+    vmrghb  v6, v19, v27
+    vmrglb  v7, v19, v27
+    vmrghb  v8, v20, v28
+    vmrglb  v9, v20, v28
+    vmrghb  v10, v21, v29
+    vmrglb  v11, v21, v29
+    vmrghb  v12, v22, v30
+    vmrglb  v13, v22, v30
+    vmrghb  v14, v23, v31
+    vmrglb  v15, v23, v31
+    vmrghb  v16, v0, v8
+    vmrglb  v17, v0, v8
+    vmrghb  v18, v1, v9
+    vmrglb  v19, v1, v9
+    vmrghb  v20, v2, v10
+    vmrglb  v21, v2, v10
+    vmrghb  v22, v3, v11
+    vmrglb  v23, v3, v11
+    vmrghb  v24, v4, v12
+    vmrglb  v25, v4, v12
+    vmrghb  v26, v5, v13
+    vmrglb  v27, v5, v13
+    vmrghb  v28, v6, v14
+    vmrglb  v29, v6, v14
+    vmrghb  v30, v7, v15
+    vmrglb  v31, v7, v15
+    vmrghb  v0, v16, v24
+    vmrglb  v1, v16, v24
+    vmrghb  v2, v17, v25
+    vmrglb  v3, v17, v25
+    vmrghb  v4, v18, v26
+    vmrglb  v5, v18, v26
+    vmrghb  v6, v19, v27
+    vmrglb  v7, v19, v27
+    vmrghb  v8, v20, v28
+    vmrglb  v9, v20, v28
+    vmrghb  v10, v21, v29
+    vmrglb  v11, v21, v29
+    vmrghb  v12, v22, v30
+    vmrglb  v13, v22, v30
+    vmrghb  v14, v23, v31
+    vmrglb  v15, v23, v31
+    vmrghb  v16, v0, v8
+    vmrglb  v17, v0, v8
+    vmrghb  v18, v1, v9
+    vmrglb  v19, v1, v9
+    vmrghb  v20, v2, v10
+    vmrglb  v21, v2, v10
+    vmrghb  v22, v3, v11
+    vmrglb  v23, v3, v11
+    vmrghb  v24, v4, v12
+    vmrglb  v25, v4, v12
+    vmrghb  v26, v5, v13
+    vmrglb  v27, v5, v13
+    vmrghb  v28, v6, v14
+    vmrglb  v29, v6, v14
+    vmrghb  v30, v7, v15
+    vmrglb  v31, v7, v15
+.endm
+
+;# load_g loads a global vector (whose address is in the local variable Gptr)
+;#   into vector register Vreg.  Trashes r0
+.macro load_g Vreg, Gptr
+    lwz     r0, \Gptr
+    lvx     \Vreg, 0, r0
+.endm
+
+;# Exploit the saturation here: if A-B is negative, vsububs clamps it
+;# to 0, and ORing 0 with the positive difference from the other
+;# subtraction order yields the absolute value.
+;# RES = abs( A-B), trashes TMP
+.macro Abs RES, TMP, A, B
+    vsububs \RES, \A, \B
+    vsububs \TMP, \B, \A
+    vor     \RES, \RES, \TMP
+.endm
+
+;# RES = Max( RES, abs( A-B)), trashes TMP
+.macro max_abs RES, TMP, A, B
+    vsububs \TMP, \A, \B
+    vmaxub  \RES, \RES, \TMP
+    vsububs \TMP, \B, \A
+    vmaxub  \RES, \RES, \TMP
+.endm
+
+.macro Masks
+    ;# build masks
+    ;# Input is all 8-bit unsigned (0-255).  We need abs(val_a - val_b)
+    ;# > limit, but there is no need to compare each difference to the
+    ;# limit: find the max of the absolute differences and compare that
+    ;# to the limit once.
+    ;# First hev
+    Abs     v14, v13, v2, v3    ;# |P1 - P0|
+    max_abs  v14, v13, v5, v4    ;# |Q1 - Q0|
+
+    vcmpgtub v10, v14, v10      ;# HEV = true if thresh exceeded
+
+    ;# Next limit
+    max_abs  v14, v13, v0, v1    ;# |P3 - P2|
+    max_abs  v14, v13, v1, v2    ;# |P2 - P1|
+    max_abs  v14, v13, v6, v5    ;# |Q2 - Q1|
+    max_abs  v14, v13, v7, v6    ;# |Q3 - Q2|
+
+    vcmpgtub v9, v14, v9        ;# R = true if limit exceeded
+
+    ;# flimit
+    Abs     v14, v13, v3, v4    ;# |P0 - Q0|
+
+    vcmpgtub v8, v14, v8        ;# X = true if flimit exceeded
+
+    vor     v8, v8, v9          ;# R = true if flimit or limit exceeded
+    ;# done building masks
+.endm
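+
+;# Rough C model of what Masks computes (illustration only):
+;#   hev  = max(|P1-P0|, |Q1-Q0|) > thresh                         (v10)
+;#   over = max(|P1-P0|, |Q1-Q0|, |P3-P2|, |P2-P1|,
+;#              |Q2-Q1|, |Q3-Q2|) > limit
+;#   over = over || (|P0-Q0| > flimit)                             (v8)
+;# The adjustments below apply only where !over (the vandc in
+;# common_adjust clears the filter value wherever over is true).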
+
+.macro build_constants RFL, RLI, RTH, FL, LI, TH
+    ;# build constants
+    lvx     \FL, 0, \RFL        ;# flimit
+    lvx     \LI, 0, \RLI        ;# limit
+    lvx     \TH, 0, \RTH        ;# thresh
+
+    vspltisb v11, 8
+    vspltisb v12, 4
+    vslb    v11, v11, v12       ;# 0x80808080808080808080808080808080
+.endm
+
+.macro load_data_y
+    ;# setup strides/pointers to be able to access
+    ;# all of the data
+    add     r5, r4, r4          ;# r5 = 2 * stride
+    sub     r6, r3, r5          ;# r6 -> 2 rows back
+    neg     r7, r4              ;# r7 = -stride
+
+    ;# load 16 pixels worth of data to work on
+    sub     r0, r6, r5          ;# r0 -> 4 rows back (temp)
+    lvx     v0,  0, r0          ;# P3  (read only)
+    lvx     v1, r7, r6          ;# P2
+    lvx     v2,  0, r6          ;# P1
+    lvx     v3, r7, r3          ;# P0
+    lvx     v4,  0, r3          ;# Q0
+    lvx     v5, r4, r3          ;# Q1
+    lvx     v6, r5, r3          ;# Q2
+    add     r0, r3, r5          ;# r0 -> 2 rows fwd (temp)
+    lvx     v7, r4, r0          ;# Q3  (read only)
+.endm
+
+;# Expects
+;#  v10 == HEV
+;#  v13 == tmp
+;#  v14 == tmp
+.macro common_adjust P0, Q0, P1, Q1, HEV_PRESENT
+    vxor    \P1, \P1, v11       ;# SP1
+    vxor    \P0, \P0, v11       ;# SP0
+    vxor    \Q0, \Q0, v11       ;# SQ0
+    vxor    \Q1, \Q1, v11       ;# SQ1
+
+    vsubsbs v13, \P1, \Q1       ;# f  = c (P1 - Q1)
+.if \HEV_PRESENT
+    vand    v13, v13, v10       ;# f &= hev
+.endif
+    vsubsbs v14, \Q0, \P0       ;# -126 <=  X = Q0-P0  <= +126
+    vaddsbs v13, v13, v14
+    vaddsbs v13, v13, v14
+    vaddsbs v13, v13, v14       ;# A = c( c(P1-Q1) + 3*(Q0-P0))
+
+    vandc   v13, v13, v8        ;# f &= mask
+
+    vspltisb v8, 3
+    vspltisb v9, 4
+
+    vaddsbs v14, v13, v9        ;# f1 = c (f+4)
+    vaddsbs v15, v13, v8        ;# f2 = c (f+3)
+
+    vsrab   v13, v14, v8        ;# f1 >>= 3
+    vsrab   v15, v15, v8        ;# f2 >>= 3
+
+    vsubsbs \Q0, \Q0, v13       ;# u1 = c (SQ0 - f1)
+    vaddsbs \P0, \P0, v15       ;# u2 = c (SP0 + f2)
+.endm
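+
+;# Equivalent C sketch of the adjustment (illustration only; the macro
+;# first XORs the pels with 0x80 to make them signed, and c() saturates
+;# to [-128,127]):
+;#   f  = c(P1 - Q1);              /* optionally f &= hev */
+;#   f  = c(f + 3*(Q0 - P0)) & ~over;
+;#   Q0 = c(Q0 - (c(f + 4) >> 3));
+;#   P0 = c(P0 + (c(f + 3) >> 3));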
+
+.macro vp8_mbfilter
+    Masks
+
+    ;# start the filtering here
+    vxor    v1, v1, v11         ;# SP2
+    vxor    v2, v2, v11         ;# SP1
+    vxor    v3, v3, v11         ;# SP0
+    vxor    v4, v4, v11         ;# SQ0
+    vxor    v5, v5, v11         ;# SQ1
+    vxor    v6, v6, v11         ;# SQ2
+
+    ;# add outer taps if we have high edge variance
+    vsubsbs v13, v2, v5         ;# f  = c (SP1-SQ1)
+
+    vsubsbs v14, v4, v3         ;# SQ0-SP0
+    vaddsbs v13, v13, v14
+    vaddsbs v13, v13, v14
+    vaddsbs v13, v13, v14       ;# f  = c( c(SP1-SQ1) + 3*(SQ0-SP0))
+
+    vandc   v13, v13, v8        ;# f &= mask
+    vand    v15, v13, v10       ;# f2 = f & hev
+
+    ;# save bottom 3 bits so that we round one side +4 and the other +3
+    vspltisb v8, 3
+    vspltisb v9, 4
+
+    vaddsbs v14, v15, v9        ;# f1 = c (f+4)
+    vaddsbs v15, v15, v8        ;# f2 = c (f+3)
+
+    vsrab   v14, v14, v8        ;# f1 >>= 3
+    vsrab   v15, v15, v8        ;# f2 >>= 3
+
+    vsubsbs v4, v4, v14         ;# u1 = c (SQ0 - f1)
+    vaddsbs v3, v3, v15         ;# u2 = c (SP0 + f2)
+
+    ;# only apply wider filter if not high edge variance
+    vandc   v13, v13, v10       ;# f &= ~hev
+
+    vspltisb v9, 2
+    vnor    v8, v8, v8
+    vsrb    v9, v8, v9          ;# 0x3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f
+    vupkhsb v9, v9              ;# 0x003f003f003f003f003f003f003f003f
+    vspltisb v8, 9
+
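+    ;# The three passes below each compute, in C terms (illustration only),
+    ;#   u = (w * f + 63) >> 7   with w = 9, then 18, then 27
+    ;# (v12 steps w by 9 each pass), applied to P2/Q2, P1/Q1, P0/Q0 in turn.
+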
+    ;# roughly 1/7th difference across boundary
+    vspltish v10, 7
+    vmulosb v14, v8, v13        ;# A = c( c(P1-Q1) + 3*(Q0-P0))
+    vmulesb v15, v8, v13
+    vaddshs v14, v14, v9        ;# +=  63
+    vaddshs v15, v15, v9
+    vsrah   v14, v14, v10       ;# >>= 7
+    vsrah   v15, v15, v10
+    vmrglh  v10, v15, v14
+    vmrghh  v15, v15, v14
+
+    vpkshss v10, v15, v10       ;# X = saturated down to bytes
+
+    vsubsbs v6, v6, v10         ;# subtract from Q and add to P
+    vaddsbs v1, v1, v10
+
+    vxor    v6, v6, v11
+    vxor    v1, v1, v11
+
+    ;# roughly 2/7th difference across boundary
+    vspltish v10, 7
+    vaddubm v12, v8, v8
+    vmulosb v14, v12, v13       ;# A = c( c(P1-Q1) + 3*(Q0-P0))
+    vmulesb v15, v12, v13
+    vaddshs v14, v14, v9
+    vaddshs v15, v15, v9
+    vsrah   v14, v14, v10       ;# >>= 7
+    vsrah   v15, v15, v10
+    vmrglh  v10, v15, v14
+    vmrghh  v15, v15, v14
+
+    vpkshss v10, v15, v10       ;# X = saturated down to bytes
+
+    vsubsbs v5, v5, v10         ;# subtract from Q and add to P
+    vaddsbs v2, v2, v10
+
+    vxor    v5, v5, v11
+    vxor    v2, v2, v11
+
+    ;# roughly 3/7th difference across boundary
+    vspltish v10, 7
+    vaddubm v12, v12, v8
+    vmulosb v14, v12, v13       ;# A = c( c(P1-Q1) + 3*(Q0-P0))
+    vmulesb v15, v12, v13
+    vaddshs v14, v14, v9
+    vaddshs v15, v15, v9
+    vsrah   v14, v14, v10       ;# >>= 7
+    vsrah   v15, v15, v10
+    vmrglh  v10, v15, v14
+    vmrghh  v15, v15, v14
+
+    vpkshss v10, v15, v10       ;# X = saturated down to bytes
+
+    vsubsbs v4, v4, v10         ;# subtract from Q and add to P
+    vaddsbs v3, v3, v10
+
+    vxor    v4, v4, v11
+    vxor    v3, v3, v11
+.endm
+
+.macro SBFilter
+    Masks
+
+    common_adjust v3, v4, v2, v5, 1
+
+    ;# outer tap adjustments
+    vspltisb v8, 1
+
+    vaddubm v13, v13, v8        ;# f  += 1
+    vsrab   v13, v13, v8        ;# f >>= 1
+
+    vandc   v13, v13, v10       ;# f &= ~hev
+
+    vsubsbs v5, v5, v13         ;# u1 = c (SQ1 - f)
+    vaddsbs v2, v2, v13         ;# u2 = c (SP1 + f)
+
+    vxor    v2, v2, v11
+    vxor    v3, v3, v11
+    vxor    v4, v4, v11
+    vxor    v5, v5, v11
+.endm
+
+    .align 2
+mbloop_filter_horizontal_edge_y_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffff
+    mtspr   256, r12            ;# set VRSAVE
+
+    build_constants r5, r6, r7, v8, v9, v10
+
+    load_data_y
+
+    vp8_mbfilter
+
+    stvx     v1, r7, r6         ;# P2
+    stvx     v2,  0, r6         ;# P1
+    stvx     v3, r7, r3         ;# P0
+    stvx     v4,  0, r3         ;# Q0
+    stvx     v5, r4, r3         ;# Q1
+    stvx     v6, r5, r3         ;# Q2
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+    .align 2
+;#  r3 unsigned char *s
+;#  r4 int p
+;#  r5 const signed char *flimit
+;#  r6 const signed char *limit
+;#  r7 const signed char *thresh
+loop_filter_horizontal_edge_y_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffff
+    mtspr   256, r12            ;# set VRSAVE
+
+    build_constants r5, r6, r7, v8, v9, v10
+
+    load_data_y
+
+    SBFilter
+
+    stvx     v2,  0, r6         ;# P1
+    stvx     v3, r7, r3         ;# P0
+    stvx     v4,  0, r3         ;# Q0
+    stvx     v5, r4, r3         ;# Q1
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+;# Filtering a vertical mb.  Each mb is aligned on a 16-byte boundary, so
+;#  an entire mb can be read with aligned loads.  Filtering the mb edge is
+;#  the problem: the loopfilter needs 4 bytes before the mb and 4 after,
+;#  8 in total, and reading 16 bytes in order to get 4 is a bit of a
+;#  waste.  So this is an even uglier way to get around that:
+;# using the regular register file, words are read in and then saved back
+;#  out to memory to align and order them up, and only then read in again
+;#  through the vector register file.
+.macro RLVmb V, R
+    lwzux   r0, r3, r4
+    stw     r0, 4(\R)
+    lwz     r0,-4(r3)
+    stw     r0, 0(\R)
+    lwzux   r0, r3, r4
+    stw     r0,12(\R)
+    lwz     r0,-4(r3)
+    stw     r0, 8(\R)
+    lvx     \V, 0, \R
+.endm
+
+.macro WLVmb V, R
+    stvx    \V, 0, \R
+    lwz     r0,12(\R)
+    stwux   r0, r3, r4
+    lwz     r0, 8(\R)
+    stw     r0,-4(r3)
+    lwz     r0, 4(\R)
+    stwux   r0, r3, r4
+    lwz     r0, 0(\R)
+    stw     r0,-4(r3)
+.endm
+
+    .align 2
+;#  r3 unsigned char *s
+;#  r4 int p
+;#  r5 const signed char *flimit
+;#  r6 const signed char *limit
+;#  r7 const signed char *thresh
+mbloop_filter_vertical_edge_y_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffff
+    ori     r12, r12, 0xc000
+    mtspr   256, r12            ;# set VRSAVE
+
+    la      r9, -48(r1)         ;# temporary space for reading in vectors
+    sub     r3, r3, r4
+
+    RLVmb v0, r9
+    RLVmb v1, r9
+    RLVmb v2, r9
+    RLVmb v3, r9
+    RLVmb v4, r9
+    RLVmb v5, r9
+    RLVmb v6, r9
+    RLVmb v7, r9
+
+    transpose8x16_fwd
+
+    build_constants r5, r6, r7, v8, v9, v10
+
+    vp8_mbfilter
+
+    transpose8x16_inv
+
+    add r3, r3, r4
+    neg r4, r4
+
+    WLVmb v17, r9
+    WLVmb v16, r9
+    WLVmb v15, r9
+    WLVmb v14, r9
+    WLVmb v13, r9
+    WLVmb v12, r9
+    WLVmb v11, r9
+    WLVmb v10, r9
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+.macro RL V, R, P
+    lvx     \V, 0,  \R
+    add     \R, \R, \P
+.endm
+
+.macro WL V, R, P
+    stvx    \V, 0,  \R
+    add     \R, \R, \P
+.endm
+
+.macro Fil P3, P2, P1, P0, Q0, Q1, Q2, Q3
+                                ;# K = |P0-P1| already
+    Abs     v14, v13, \Q0, \Q1  ;# M = |Q0-Q1|
+    vmaxub  v14, v14, v4        ;# M = max( |P0-P1|, |Q0-Q1|)
+    vcmpgtub v10, v14, v0
+
+    Abs     v4, v5, \Q2, \Q3    ;# K = |Q2-Q3| = next |P0-P1|
+
+    max_abs  v14, v13, \Q1, \Q2  ;# M = max( M, |Q1-Q2|)
+    max_abs  v14, v13, \P1, \P2  ;# M = max( M, |P1-P2|)
+    max_abs  v14, v13, \P2, \P3  ;# M = max( M, |P2-P3|)
+
+    vmaxub   v14, v14, v4       ;# M = max interior abs diff
+    vcmpgtub v9, v14, v2        ;# M = true if int_l exceeded
+
+    Abs     v14, v13, \P0, \Q0  ;# X = Abs( P0-Q0)
+    vcmpgtub v8, v14, v3        ;# X = true if edge_l exceeded
+    vor     v8, v8, v9          ;# M = true if edge_l or int_l exceeded
+
+    ;# replace P1,Q1 w/signed versions
+    common_adjust \P0, \Q0, \P1, \Q1, 1
+
+    vaddubm v13, v13, v1        ;# -16 <= M <= 15, saturation irrelevant
+    vsrab   v13, v13, v1
+    vandc   v13, v13, v10       ;# adjust P1,Q1 by (M+1)>>1  if ! hev
+    vsubsbs \Q1, \Q1, v13
+    vaddsbs \P1, \P1, v13
+
+    vxor    \P1, \P1, v11       ;# P1
+    vxor    \P0, \P0, v11       ;# P0
+    vxor    \Q0, \Q0, v11       ;# Q0
+    vxor    \Q1, \Q1, v11       ;# Q1
+.endm
+
+
+    .align 2
+;#  r3 unsigned char *s
+;#  r4 int p
+;#  r5 const signed char *flimit
+;#  r6 const signed char *limit
+;#  r7 const signed char *thresh
+loop_filter_vertical_edge_y_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffff
+    ori     r12, r12, 0xffff
+    mtspr   256, r12            ;# set VRSAVE
+
+    addi    r9, r3, 0
+    RL      v16, r9, r4
+    RL      v17, r9, r4
+    RL      v18, r9, r4
+    RL      v19, r9, r4
+    RL      v20, r9, r4
+    RL      v21, r9, r4
+    RL      v22, r9, r4
+    RL      v23, r9, r4
+    RL      v24, r9, r4
+    RL      v25, r9, r4
+    RL      v26, r9, r4
+    RL      v27, r9, r4
+    RL      v28, r9, r4
+    RL      v29, r9, r4
+    RL      v30, r9, r4
+    lvx     v31, 0, r9
+
+    Transpose16x16
+
+    vspltisb v1, 1
+
+    build_constants r5, r6, r7, v3, v2, v0
+
+    Abs v4, v5, v19, v18                            ;# K(v4) = first |P0-P1|
+
+    Fil v16, v17, v18, v19,  v20, v21, v22, v23
+    Fil v20, v21, v22, v23,  v24, v25, v26, v27
+    Fil v24, v25, v26, v27,  v28, v29, v30, v31
+
+    Transpose16x16
+
+    addi    r9, r3, 0
+    WL      v16, r9, r4
+    WL      v17, r9, r4
+    WL      v18, r9, r4
+    WL      v19, r9, r4
+    WL      v20, r9, r4
+    WL      v21, r9, r4
+    WL      v22, r9, r4
+    WL      v23, r9, r4
+    WL      v24, r9, r4
+    WL      v25, r9, r4
+    WL      v26, r9, r4
+    WL      v27, r9, r4
+    WL      v28, r9, r4
+    WL      v29, r9, r4
+    WL      v30, r9, r4
+    stvx    v31, 0, r9
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+;# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- UV FILTERING -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+.macro active_chroma_sel V
+    andi.   r7, r3, 8       ;# row origin modulo 16
+    add     r7, r7, r7      ;# selects selectors
+    lis     r12, _chromaSelectors@ha
+    la      r0,  _chromaSelectors@l(r12)
+    lwzux   r0, r7, r0      ;# leave selector addr in r7
+
+    lvx     \V, 0, r0       ;# mask to concatenate active U,V pels
+.endm
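+
+;# (r3 & 8) selects one of the two 16-byte rows of _chromaSelectors in the
+;# data section below:
+;#   offset  0: _B_hihi, _B_Ures0, _B_Vres0  -- active pels in the low  half
+;#   offset 16: _B_lolo, _B_Ures8, _B_Vres8  -- active pels in the high half
+;# uresult_sel/vresult_sel later fetch the U/V writeback masks from r7+4
+;# and r7+8.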
+
+.macro hread_uv Dest, U, V, Offs, VMask
+    lvx     \U, \Offs, r3
+    lvx     \V, \Offs, r4
+    vperm   \Dest, \U, \V, \VMask   ;# Dest = active part of U then V
+.endm
+
+.macro hwrite_uv New, U, V, Offs, Umask, Vmask
+    vperm   \U, \New, \U, \Umask    ;# Combine new pels with siblings
+    vperm   \V, \New, \V, \Vmask
+    stvx    \U, \Offs, r3           ;# Write to frame buffer
+    stvx    \V, \Offs, r4
+.endm
+
+;# Process U,V in parallel.
+.macro load_chroma_h
+    neg     r9, r5          ;# r9 = -1 * stride
+    add     r8, r9, r9      ;# r8 = -2 * stride
+    add     r10, r5, r5     ;# r10 = 2 * stride
+
+    active_chroma_sel v12
+
+    ;# P3, Q3 are read-only; need not save addresses or sibling pels
+    add     r6, r8, r8      ;# r6 = -4 * stride
+    hread_uv v0, v14, v15, r6, v12
+    add     r6, r10, r5     ;# r6 =  3 * stride
+    hread_uv v7, v14, v15, r6, v12
+
+    ;# Others are read/write; save addresses and sibling pels
+
+    add     r6, r8, r9      ;# r6 = -3 * stride
+    hread_uv v1, v16, v17, r6,  v12
+    hread_uv v2, v18, v19, r8,  v12
+    hread_uv v3, v20, v21, r9,  v12
+    hread_uv v4, v22, v23, 0,   v12
+    hread_uv v5, v24, v25, r5,  v12
+    hread_uv v6, v26, v27, r10, v12
+.endm
+
+.macro uresult_sel V
+    load_g   \V, 4(r7)
+.endm
+
+.macro vresult_sel V
+    load_g   \V, 8(r7)
+.endm
+
+;# always write P1,P0,Q0,Q1
+.macro store_chroma_h
+    uresult_sel v11
+    vresult_sel v12
+    hwrite_uv v2, v18, v19, r8, v11, v12
+    hwrite_uv v3, v20, v21, r9, v11, v12
+    hwrite_uv v4, v22, v23, 0,  v11, v12
+    hwrite_uv v5, v24, v25, r5, v11, v12
+.endm
+
+    .align 2
+;#  r3 unsigned char *u
+;#  r4 unsigned char *v
+;#  r5 int p
+;#  r6 const signed char *flimit
+;#  r7 const signed char *limit
+;#  r8 const signed char *thresh
+mbloop_filter_horizontal_edge_uv_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffff
+    ori     r12, r12, 0xffff
+    mtspr   256, r12            ;# set VRSAVE
+
+    build_constants r6, r7, r8, v8, v9, v10
+
+    load_chroma_h
+
+    vp8_mbfilter
+
+    store_chroma_h
+
+    hwrite_uv v1, v16, v17, r6,  v11, v12    ;# v1 == P2
+    hwrite_uv v6, v26, v27, r10, v11, v12    ;# v6 == Q2
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+    .align 2
+;#  r3 unsigned char *u
+;#  r4 unsigned char *v
+;#  r5 int p
+;#  r6 const signed char *flimit
+;#  r7 const signed char *limit
+;#  r8 const signed char *thresh
+loop_filter_horizontal_edge_uv_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffff
+    ori     r12, r12, 0xffff
+    mtspr   256, r12            ;# set VRSAVE
+
+    build_constants r6, r7, r8, v8, v9, v10
+
+    load_chroma_h
+
+    SBFilter
+
+    store_chroma_h
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+.macro R V, R
+    lwzux   r0, r3, r5
+    stw     r0, 4(\R)
+    lwz     r0,-4(r3)
+    stw     r0, 0(\R)
+    lwzux   r0, r4, r5
+    stw     r0,12(\R)
+    lwz     r0,-4(r4)
+    stw     r0, 8(\R)
+    lvx     \V, 0, \R
+.endm
+
+
+.macro W V, R
+    stvx    \V, 0, \R
+    lwz     r0,12(\R)
+    stwux   r0, r4, r5
+    lwz     r0, 8(\R)
+    stw     r0,-4(r4)
+    lwz     r0, 4(\R)
+    stwux   r0, r3, r5
+    lwz     r0, 0(\R)
+    stw     r0,-4(r3)
+.endm
+
+.macro chroma_vread R
+    sub r3, r3, r5          ;# back up one line for simplicity
+    sub r4, r4, r5
+
+    R v0, \R
+    R v1, \R
+    R v2, \R
+    R v3, \R
+    R v4, \R
+    R v5, \R
+    R v6, \R
+    R v7, \R
+
+    transpose8x16_fwd
+.endm
+
+.macro chroma_vwrite R
+
+    transpose8x16_inv
+
+    add     r3, r3, r5
+    add     r4, r4, r5
+    neg     r5, r5          ;# Write rows back in reverse order
+
+    W v17, \R
+    W v16, \R
+    W v15, \R
+    W v14, \R
+    W v13, \R
+    W v12, \R
+    W v11, \R
+    W v10, \R
+.endm
+
+    .align 2
+;#  r3 unsigned char *u
+;#  r4 unsigned char *v
+;#  r5 int p
+;#  r6 const signed char *flimit
+;#  r7 const signed char *limit
+;#  r8 const signed char *thresh
+mbloop_filter_vertical_edge_uv_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffff
+    ori     r12, r12, 0xc000
+    mtspr   256, r12            ;# set VRSAVE
+
+    la      r9, -48(r1)         ;# temporary space for reading in vectors
+
+    chroma_vread r9
+
+    build_constants r6, r7, r8, v8, v9, v10
+
+    vp8_mbfilter
+
+    chroma_vwrite r9
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+    .align 2
+;#  r3 unsigned char *u
+;#  r4 unsigned char *v
+;#  r5 int p
+;#  r6 const signed char *flimit
+;#  r7 const signed char *limit
+;#  r8 const signed char *thresh
+loop_filter_vertical_edge_uv_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffff
+    ori     r12, r12, 0xc000
+    mtspr   256, r12            ;# set VRSAVE
+
+    la      r9, -48(r1)         ;# temporary space for reading in vectors
+
+    chroma_vread r9
+
+    build_constants r6, r7, r8, v8, v9, v10
+
+    SBFilter
+
+    chroma_vwrite r9
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+;# -=-=-=-=-=-=-=-=-=-=-=-=-=-= SIMPLE LOOP FILTER =-=-=-=-=-=-=-=-=-=-=-=-=-=-
+
+.macro vp8_simple_filter
+    Abs v14, v13, v1, v2    ;# M = abs( P0 - Q0)
+    vcmpgtub v8, v14, v8    ;# v8 = true if _over_ limit
+
+    ;# preserve unsigned v0 and v3
+    common_adjust v1, v2, v0, v3, 0
+
+    vxor v1, v1, v11
+    vxor v2, v2, v11        ;# cvt Q0, P0 back to pels
+.endm
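+
+;# Net effect (rough model): only P0/Q0 are adjusted, the mask is just
+;# |P0-Q0| > flimit, and there is no high-edge-variance term (the final
+;# macro argument 0 assembles the hev AND out of common_adjust).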
+
+.macro simple_vertical
+    addi    r8,  0, 16
+    addi    r7, r5, 32
+
+    lvx     v0,  0, r5
+    lvx     v1, r8, r5
+    lvx     v2,  0, r7
+    lvx     v3, r8, r7
+
+    lis     r12, _B_hihi@ha
+    la      r0,  _B_hihi@l(r12)
+    lvx     v16, 0, r0
+
+    lis     r12, _B_lolo@ha
+    la      r0,  _B_lolo@l(r12)
+    lvx     v17, 0, r0
+
+    Transpose4times4x4 v16, v17
+    vp8_simple_filter
+
+    vxor v0, v0, v11
+    vxor v3, v3, v11        ;# cvt Q0, P0 back to pels
+
+    Transpose4times4x4 v16, v17
+
+    stvx    v0,  0, r5
+    stvx    v1, r8, r5
+    stvx    v2,  0, r7
+    stvx    v3, r8, r7
+.endm
+
+    .align 2
+;#  r3 unsigned char *s
+;#  r4 int p
+;#  r5 const signed char *flimit
+loop_filter_simple_horizontal_edge_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffff
+    mtspr   256, r12            ;# set VRSAVE
+
+    ;# build constants
+    lvx     v8, 0, r5           ;# flimit
+
+    vspltisb v11, 8
+    vspltisb v12, 4
+    vslb    v11, v11, v12       ;# 0x80808080808080808080808080808080
+
+    neg     r5, r4              ;# r5 = -1 * stride
+    add     r6, r5, r5          ;# r6 = -2 * stride
+
+    lvx     v0, r6, r3          ;# v0 = P1 = 16 pels two rows above edge
+    lvx     v1, r5, r3          ;# v1 = P0 = 16 pels one row  above edge
+    lvx     v2,  0, r3          ;# v2 = Q0 = 16 pels one row  below edge
+    lvx     v3, r4, r3          ;# v3 = Q1 = 16 pels two rows below edge
+
+    vp8_simple_filter
+
+    stvx    v1, r5, r3          ;# store P0
+    stvx    v2,  0, r3          ;# store Q0
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+.macro RLV Offs
+    stw     r0, (\Offs*4)(r5)
+    lwzux   r0, r7, r4
+.endm
+
+.macro WLV Offs
+    lwz     r0, (\Offs*4)(r5)
+    stwux   r0, r7, r4
+.endm
+
+    .align 2
+;#  r3 unsigned char *s
+;#  r4 int p
+;#  r5 const signed char *flimit
+loop_filter_simple_vertical_edge_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffff
+    ori     r12, r12, 0xc000
+    mtspr   256, r12            ;# set VRSAVE
+
+    ;# build constants
+    lvx     v8, 0, r5           ;# flimit
+
+    vspltisb v11, 8
+    vspltisb v12, 4
+    vslb    v11, v11, v12       ;# 0x80808080808080808080808080808080
+
+    la r5, -96(r1)              ;# temporary space for reading in vectors
+
+    ;# Store 4 pels at word "Offs" in temp array, then advance r7
+    ;#   to next row and read another 4 pels from the frame buffer.
+
+    subi    r7, r3,  2          ;# r7 -> 2 pels before start
+    lwzx    r0,  0, r7          ;# read first 4 pels
+
+    ;# 16 unaligned word accesses
+    RLV 0
+    RLV 4
+    RLV 8
+    RLV 12
+    RLV 1
+    RLV 5
+    RLV 9
+    RLV 13
+    RLV 2
+    RLV 6
+    RLV 10
+    RLV 14
+    RLV 3
+    RLV 7
+    RLV 11
+
+    stw     r0, (15*4)(r5)      ;# write last 4 pels
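+
+    ;# Row k of the frame has landed at temp word (k%4)*4 + k/4, so the
+    ;# four vectors read by simple_vertical hold rows {0,4,8,12},
+    ;# {1,5,9,13}, {2,6,10,14} and {3,7,11,15} -- the interleaved layout
+    ;# that Transpose4times4x4 expects.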
+
+    simple_vertical
+
+    ;# Read temp array, write frame buffer.
+    subi    r7, r3,  2          ;# r7 -> 2 pels before start
+    lwzx    r0,  0, r5          ;# read/write first 4 pels
+    stwx    r0,  0, r7
+
+    WLV 4
+    WLV 8
+    WLV 12
+    WLV 1
+    WLV 5
+    WLV 9
+    WLV 13
+    WLV 2
+    WLV 6
+    WLV 10
+    WLV 14
+    WLV 3
+    WLV 7
+    WLV 11
+    WLV 15
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+    .data
+
+_chromaSelectors:
+    .long   _B_hihi
+    .long   _B_Ures0
+    .long   _B_Vres0
+    .long   0
+    .long   _B_lolo
+    .long   _B_Ures8
+    .long   _B_Vres8
+    .long   0
+
+    .align 4
+_B_Vres8:
+    .byte   16, 17, 18, 19, 20, 21, 22, 23,  8,  9, 10, 11, 12, 13, 14, 15
+
+    .align 4
+_B_Ures8:
+    .byte   16, 17, 18, 19, 20, 21, 22, 23,  0,  1,  2,  3,  4,  5,  6,  7
+
+    .align 4
+_B_lolo:
+    .byte    8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
+
+    .align 4
+_B_Vres0:
+    .byte    8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
+
+    .align 4
+_B_Ures0:
+    .byte    0,  1,  2,  3,  4,  5,  6,  7, 24, 25, 26, 27, 28, 29, 30, 31
+
+    .align 4
+_B_hihi:
+    .byte    0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
--- /dev/null
+++ b/vp9/common/ppc/platform_altivec.asm
@@ -1,0 +1,59 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    .globl save_platform_context
+    .globl restore_platform_context
+
+.macro W V P
+    stvx    \V,  0, \P
+    addi    \P, \P, 16
+.endm
+
+.macro R V P
+    lvx     \V,  0, \P
+    addi    \P, \P, 16
+.endm
+
+;# r3 context_ptr
+    .align 2
+save_platform_context:
+    W v20, r3
+    W v21, r3
+    W v22, r3
+    W v23, r3
+    W v24, r3
+    W v25, r3
+    W v26, r3
+    W v27, r3
+    W v28, r3
+    W v29, r3
+    W v30, r3
+    W v31, r3
+
+    blr
+
+;# r3 context_ptr
+    .align 2
+restore_platform_context:
+    R v20, r3
+    R v21, r3
+    R v22, r3
+    R v23, r3
+    R v24, r3
+    R v25, r3
+    R v26, r3
+    R v27, r3
+    R v28, r3
+    R v29, r3
+    R v30, r3
+    R v31, r3
+
+    blr
--- /dev/null
+++ b/vp9/common/ppc/recon_altivec.asm
@@ -1,0 +1,175 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    .globl recon4b_ppc
+    .globl recon2b_ppc
+    .globl recon_b_ppc
+
+.macro row_of16 Diff Pred Dst Stride
+    lvx     v1,  0, \Pred           ;# v1 = pred = p0..p15
+    addi    \Pred, \Pred, 16        ;# next pred
+    vmrghb  v2, v0, v1              ;# v2 = 16-bit p0..p7
+    lvx     v3,  0, \Diff           ;# v3 = d0..d7
+    vaddshs v2, v2, v3              ;# v2 = r0..r7
+    vmrglb  v1, v0, v1              ;# v1 = 16-bit p8..p15
+    lvx     v3, r8, \Diff           ;# v3 = d8..d15
+    addi    \Diff, \Diff, 32        ;# next diff
+    vaddshs v3, v3, v1              ;# v3 = r8..r15
+    vpkshus v2, v2, v3              ;# v2 = 8-bit r0..r15
+    stvx    v2,  0, \Dst            ;# to dst
+    add     \Dst, \Dst, \Stride     ;# next dst
+.endm
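+
+;# Per row this is dst[i] = clip_to_u8(pred[i] + diff[i]) for i = 0..15:
+;# bytes are widened to 16 bits (merged with the zero vector v0, cleared
+;# by the caller), added to the 16-bit residual, then packed back down
+;# with unsigned saturation.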
+
+    .text
+    .align 2
+;#  r3 = short *diff_ptr,
+;#  r4 = unsigned char *pred_ptr,
+;#  r5 = unsigned char *dst_ptr,
+;#  r6 = int stride
+recon4b_ppc:
+    mfspr   r0, 256                     ;# get old VRSAVE
+    stw     r0, -8(r1)                  ;# save old VRSAVE to stack
+    oris    r0, r0, 0xf000
+    mtspr   256,r0                      ;# set VRSAVE
+
+    vxor    v0, v0, v0
+    li      r8, 16
+
+    row_of16 r3, r4, r5, r6
+    row_of16 r3, r4, r5, r6
+    row_of16 r3, r4, r5, r6
+    row_of16 r3, r4, r5, r6
+
+    lwz     r12, -8(r1)                 ;# restore old VRSAVE from stack
+    mtspr   256, r12                    ;# reset old VRSAVE
+
+    blr
+
+.macro two_rows_of8 Diff Pred Dst Stride write_first_four_pels
+    lvx     v1,  0, \Pred       ;# v1 = pred = p0..p15
+    vmrghb  v2, v0, v1          ;# v2 = 16-bit p0..p7
+    lvx     v3,  0, \Diff       ;# v3 = d0..d7
+    vaddshs v2, v2, v3          ;# v2 = r0..r7
+    vmrglb  v1, v0, v1          ;# v1 = 16-bit p8..p15
+    lvx     v3, r8, \Diff       ;# v3 = d8..d15
+    vaddshs v3, v3, v1          ;# v3 = r8..r15
+    vpkshus v2, v2, v3          ;# v2 = 8-bit r0..r15
+    stvx    v2,  0, r10         ;# 2 rows to dst from buf
+    lwz     r0, 0(r10)
+.if \write_first_four_pels
+    stw     r0, 0(\Dst)
+    .else
+    stwux   r0, \Dst, \Stride
+.endif
+    lwz     r0, 4(r10)
+    stw     r0, 4(\Dst)
+    lwz     r0, 8(r10)
+    stwux   r0, \Dst, \Stride       ;# advance dst to next row
+    lwz     r0, 12(r10)
+    stw     r0, 4(\Dst)
+.endm
+
+    .align 2
+;#  r3 = short *diff_ptr,
+;#  r4 = unsigned char *pred_ptr,
+;#  r5 = unsigned char *dst_ptr,
+;#  r6 = int stride
+
+recon2b_ppc:
+    mfspr   r0, 256                     ;# get old VRSAVE
+    stw     r0, -8(r1)                  ;# save old VRSAVE to stack
+    oris    r0, r0, 0xf000
+    mtspr   256,r0                      ;# set VRSAVE
+
+    vxor    v0, v0, v0
+    li      r8, 16
+
+    la      r10, -48(r1)                ;# buf
+
+    two_rows_of8 r3, r4, r5, r6, 1
+
+    addi    r4, r4, 16                  ;# next pred
+    addi    r3, r3, 32                  ;# next diff
+
+    two_rows_of8 r3, r4, r5, r6, 0
+
+    lwz     r12, -8(r1)                 ;# restore old VRSAVE from stack
+    mtspr   256, r12                    ;# reset old VRSAVE
+
+    blr
+
+.macro get_two_diff_rows
+    stw     r0, 0(r10)
+    lwz     r0, 4(r3)
+    stw     r0, 4(r10)
+    lwzu    r0, 32(r3)
+    stw     r0, 8(r10)
+    lwz     r0, 4(r3)
+    stw     r0, 12(r10)
+    lvx     v3, 0, r10
+.endm
+
+    .align 2
+;#  r3 = short *diff_ptr,
+;#  r4 = unsigned char *pred_ptr,
+;#  r5 = unsigned char *dst_ptr,
+;#  r6 = int stride
+recon_b_ppc:
+    mfspr   r0, 256                     ;# get old VRSAVE
+    stw     r0, -8(r1)                  ;# save old VRSAVE to stack
+    oris    r0, r0, 0xf000
+    mtspr   256,r0                      ;# set VRSAVE
+
+    vxor    v0, v0, v0
+
+    la      r10, -48(r1)    ;# buf
+
+    lwz     r0, 0(r4)
+    stw     r0, 0(r10)
+    lwz     r0, 16(r4)
+    stw     r0, 4(r10)
+    lwz     r0, 32(r4)
+    stw     r0, 8(r10)
+    lwz     r0, 48(r4)
+    stw     r0, 12(r10)
+
+    lvx     v1,  0, r10     ;# v1 = pred = p0..p15
+
+    lwz r0, 0(r3)           ;# v3 = d0..d7
+
+    get_two_diff_rows
+
+    vmrghb  v2, v0, v1      ;# v2 = 16-bit p0..p7
+    vaddshs v2, v2, v3      ;# v2 = r0..r7
+
+    lwzu r0, 32(r3)         ;# v3 = d8..d15
+
+    get_two_diff_rows
+
+    vmrglb  v1, v0, v1      ;# v1 = 16-bit p8..p15
+    vaddshs v3, v3, v1      ;# v3 = r8..r15
+
+    vpkshus v2, v2, v3      ;# v2 = 8-bit r0..r15
+    stvx    v2,  0, r10     ;# 16 pels to dst from buf
+
+    lwz     r0, 0(r10)
+    stw     r0, 0(r5)
+    lwz     r0, 4(r10)
+    stwux   r0, r5, r6
+    lwz     r0, 8(r10)
+    stwux   r0, r5, r6
+    lwz     r0, 12(r10)
+    stwx    r0, r5, r6
+
+    lwz     r12, -8(r1)                 ;# restore old VRSAVE from stack
+    mtspr   256, r12                    ;# reset old VRSAVE
+
+    blr
--- /dev/null
+++ b/vp9/common/ppc/systemdependent.c
@@ -1,0 +1,167 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "subpixel.h"
+#include "loopfilter.h"
+#include "recon.h"
+#include "idct.h"
+#include "onyxc_int.h"
+
+void (*vp8_short_idct4x4)(short *input, short *output, int pitch);
+void (*vp8_short_idct4x4_1)(short *input, short *output, int pitch);
+void (*vp8_dc_only_idct)(short input_dc, short *output, int pitch);
+
+extern void (*vp9_post_proc_down_and_across)(
+  unsigned char *src_ptr,
+  unsigned char *dst_ptr,
+  int src_pixels_per_line,
+  int dst_pixels_per_line,
+  int rows,
+  int cols,
+  int flimit
+);
+
+extern void (*vp9_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols, int flimit);
+extern void vp9_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit);
+extern void (*vp9_mbpost_proc_across_ip)(unsigned char *src, int pitch, int rows, int cols, int flimit);
+extern void vp9_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit);
+
+extern void vp9_post_proc_down_and_across_c
+(
+  unsigned char *src_ptr,
+  unsigned char *dst_ptr,
+  int src_pixels_per_line,
+  int dst_pixels_per_line,
+  int rows,
+  int cols,
+  int flimit
+);
+void vp9_plane_add_noise_c(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a);
+
+extern copy_mem_block_function *vp9_copy_mem16x16;
+extern copy_mem_block_function *vp9_copy_mem8x8;
+extern copy_mem_block_function *vp9_copy_mem8x4;
+
+// PPC
+extern subpixel_predict_function sixtap_predict_ppc;
+extern subpixel_predict_function sixtap_predict8x4_ppc;
+extern subpixel_predict_function sixtap_predict8x8_ppc;
+extern subpixel_predict_function sixtap_predict16x16_ppc;
+extern subpixel_predict_function bilinear_predict4x4_ppc;
+extern subpixel_predict_function bilinear_predict8x4_ppc;
+extern subpixel_predict_function bilinear_predict8x8_ppc;
+extern subpixel_predict_function bilinear_predict16x16_ppc;
+
+extern copy_mem_block_function copy_mem16x16_ppc;
+
+void recon_b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+void recon2b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+void recon4b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+
+extern void short_idct4x4llm_ppc(short *input, short *output, int pitch);
+
+// Generic C
+extern subpixel_predict_function vp9_sixtap_predict_c;
+extern subpixel_predict_function vp9_sixtap_predict8x4_c;
+extern subpixel_predict_function vp9_sixtap_predict8x8_c;
+extern subpixel_predict_function vp9_sixtap_predict16x16_c;
+extern subpixel_predict_function vp9_bilinear_predict4x4_c;
+extern subpixel_predict_function vp9_bilinear_predict8x4_c;
+extern subpixel_predict_function vp9_bilinear_predict8x8_c;
+extern subpixel_predict_function vp9_bilinear_predict16x16_c;
+
+extern copy_mem_block_function vp9_copy_mem16x16_c;
+extern copy_mem_block_function vp9_copy_mem8x8_c;
+extern copy_mem_block_function vp9_copy_mem8x4_c;
+
+void vp9_recon_b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+void vp9_recon2b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+void vp9_recon4b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+
+extern void vp9_short_idct4x4llm_1_c(short *input, short *output, int pitch);
+extern void vp9_short_idct4x4llm_c(short *input, short *output, int pitch);
+extern void vp8_dc_only_idct_c(short input_dc, short *output, int pitch);
+
+// PPC
+extern loop_filter_block_function loop_filter_mbv_ppc;
+extern loop_filter_block_function loop_filter_bv_ppc;
+extern loop_filter_block_function loop_filter_mbh_ppc;
+extern loop_filter_block_function loop_filter_bh_ppc;
+
+extern loop_filter_block_function loop_filter_mbvs_ppc;
+extern loop_filter_block_function loop_filter_bvs_ppc;
+extern loop_filter_block_function loop_filter_mbhs_ppc;
+extern loop_filter_block_function loop_filter_bhs_ppc;
+
+// Generic C
+extern loop_filter_block_function vp9_loop_filter_mbv_c;
+extern loop_filter_block_function vp9_loop_filter_bv_c;
+extern loop_filter_block_function vp9_loop_filter_mbh_c;
+extern loop_filter_block_function vp9_loop_filter_bh_c;
+
+extern loop_filter_block_function vp9_loop_filter_mbvs_c;
+extern loop_filter_block_function vp9_loop_filter_bvs_c;
+extern loop_filter_block_function vp9_loop_filter_mbhs_c;
+extern loop_filter_block_function vp9_loop_filter_bhs_c;
+
+extern loop_filter_block_function *vp8_lf_mbvfull;
+extern loop_filter_block_function *vp8_lf_mbhfull;
+extern loop_filter_block_function *vp8_lf_bvfull;
+extern loop_filter_block_function *vp8_lf_bhfull;
+
+extern loop_filter_block_function *vp8_lf_mbvsimple;
+extern loop_filter_block_function *vp8_lf_mbhsimple;
+extern loop_filter_block_function *vp8_lf_bvsimple;
+extern loop_filter_block_function *vp8_lf_bhsimple;
+
+void vp9_clear_c(void) {
+}
+
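+// RTCD-style dispatch: every function pointer above is bound exactly once
+// here (presumably at codec initialization) rather than per call.  PPC
+// AltiVec routines are plugged in where they exist; the rest fall back to
+// the generic C versions.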
+void vp9_machine_specific_config(void) {
+  // Pure C:
+  vp9_clear_system_state        = vp9_clear_c;
+  vp9_recon_b                   = vp9_recon_b_c;
+  vp9_recon4b                   = vp9_recon4b_c;
+  vp9_recon2b                   = vp9_recon2b_c;
+
+  vp9_bilinear_predict16x16     = bilinear_predict16x16_ppc;
+  vp9_bilinear_predict8x8       = bilinear_predict8x8_ppc;
+  vp9_bilinear_predict8x4       = bilinear_predict8x4_ppc;
+  vp8_bilinear_predict          = bilinear_predict4x4_ppc;
+
+  vp9_sixtap_predict16x16       = sixtap_predict16x16_ppc;
+  vp9_sixtap_predict8x8         = sixtap_predict8x8_ppc;
+  vp9_sixtap_predict8x4         = sixtap_predict8x4_ppc;
+  vp9_sixtap_predict            = sixtap_predict_ppc;
+
+  vp8_short_idct4x4_1           = vp9_short_idct4x4llm_1_c;
+  vp8_short_idct4x4             = short_idct4x4llm_ppc;
+  vp8_dc_only_idct              = vp8_dc_only_idct_c;
+
+  vp8_lf_mbvfull                = loop_filter_mbv_ppc;
+  vp8_lf_bvfull                 = loop_filter_bv_ppc;
+  vp8_lf_mbhfull                = loop_filter_mbh_ppc;
+  vp8_lf_bhfull                 = loop_filter_bh_ppc;
+
+  vp8_lf_mbvsimple              = loop_filter_mbvs_ppc;
+  vp8_lf_bvsimple               = loop_filter_bvs_ppc;
+  vp8_lf_mbhsimple              = loop_filter_mbhs_ppc;
+  vp8_lf_bhsimple               = loop_filter_bhs_ppc;
+
+  vp9_post_proc_down_and_across = vp9_post_proc_down_and_across_c;
+  vp9_mbpost_proc_down          = vp9_mbpost_proc_down_c;
+  vp9_mbpost_proc_across_ip     = vp9_mbpost_proc_across_ip_c;
+  vp9_plane_add_noise           = vp9_plane_add_noise_c;
+
+  vp9_copy_mem16x16             = copy_mem16x16_ppc;
+  vp9_copy_mem8x8               = vp9_copy_mem8x8_c;
+  vp9_copy_mem8x4               = vp9_copy_mem8x4_c;
+}
--- /dev/null
+++ b/vp9/common/ppflags.h
@@ -1,0 +1,38 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_PPFLAGS_H
+#define __INC_PPFLAGS_H
+enum {
+  VP9D_NOFILTERING            = 0,
+  VP9D_DEBLOCK                = 1 << 0,
+  VP9D_DEMACROBLOCK           = 1 << 1,
+  VP9D_ADDNOISE               = 1 << 2,
+  VP9D_DEBUG_TXT_FRAME_INFO   = 1 << 3,
+  VP9D_DEBUG_TXT_MBLK_MODES   = 1 << 4,
+  VP9D_DEBUG_TXT_DC_DIFF      = 1 << 5,
+  VP9D_DEBUG_TXT_RATE_INFO    = 1 << 6,
+  VP9D_DEBUG_DRAW_MV          = 1 << 7,
+  VP9D_DEBUG_CLR_BLK_MODES    = 1 << 8,
+  VP9D_DEBUG_CLR_FRM_REF_BLKS = 1 << 9
+};
+
+typedef struct {
+  int post_proc_flag;
+  int deblocking_level;
+  int noise_level;
+  int display_ref_frame_flag;
+  int display_mb_modes_flag;
+  int display_b_modes_flag;
+  int display_mv_flag;
+} vp9_ppflags_t;
+
+#endif
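
Because each VP9D_* constant occupies its own bit, callers compose a post-processing request by OR-ing flags into post_proc_flag, and the decoder tests them with a mask. A hedged usage sketch with local stand-in names (the real enum is the one in this header):

    #include <stdio.h>

    enum { DEBLOCK = 1 << 0, DEMACROBLOCK = 1 << 1, ADDNOISE = 1 << 2 };

    int main(void) {
      int post_proc_flag = DEBLOCK | ADDNOISE;      /* request two passes */

      if (post_proc_flag & DEBLOCK)
        printf("deblocking requested\n");
      if (!(post_proc_flag & DEMACROBLOCK))
        printf("demacroblocking not requested\n");
      return 0;
    }
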
--- /dev/null
+++ b/vp9/common/pragmas.h
@@ -1,0 +1,19 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifdef __INTEL_COMPILER
+#pragma warning(disable:997 1011 170)
+#endif
+#ifdef _MSC_VER
+#pragma warning(disable:4799)
+#endif
--- /dev/null
+++ b/vp9/common/pred_common.c
@@ -1,0 +1,463 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/common/pred_common.h"
+#include "vp9/common/seg_common.h"
+
+// TBD prediction functions for various bitstream signals
+
+// Returns a context number for the given MB prediction signal
+unsigned char vp9_get_pred_context(const VP9_COMMON *const cm,
+                                   const MACROBLOCKD *const xd,
+                                   PRED_ID pred_id) {
+  int pred_context;
+  MODE_INFO *m = xd->mode_info_context;
+
+  // Note:
+  // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialised to 0.
+  switch (pred_id) {
+    case PRED_SEG_ID:
+      pred_context = (m - 1)->mbmi.seg_id_predicted +
+                     (m - cm->mode_info_stride)->mbmi.seg_id_predicted;
+      break;
+
+
+    case PRED_REF:
+      pred_context = (m - 1)->mbmi.ref_predicted +
+                     (m - cm->mode_info_stride)->mbmi.ref_predicted;
+      break;
+
+    case PRED_COMP:
+      // Context based on use of comp pred flag by neighbours
+      // pred_context =
+      //   ((m - 1)->mbmi.second_ref_frame != INTRA_FRAME) +
+      //    ((m - cm->mode_info_stride)->mbmi.second_ref_frame != INTRA_FRAME);
+
+      // Context based on mode and reference frame
+      // if ( m->mbmi.ref_frame == LAST_FRAME )
+      //    pred_context = 0 + (m->mbmi.mode != ZEROMV);
+      // else if ( m->mbmi.ref_frame == GOLDEN_FRAME )
+      //    pred_context = 2 + (m->mbmi.mode != ZEROMV);
+      // else
+      //    pred_context = 4 + (m->mbmi.mode != ZEROMV);
+
+      if (m->mbmi.ref_frame == LAST_FRAME)
+        pred_context = 0;
+      else
+        pred_context = 1;
+
+      break;
+
+    case PRED_MBSKIP:
+      pred_context = (m - 1)->mbmi.mb_skip_coeff +
+                     (m - cm->mode_info_stride)->mbmi.mb_skip_coeff;
+      break;
+
+    case PRED_SWITCHABLE_INTERP:
+      {
+        int left_in_image = (m - 1)->mbmi.mb_in_image;
+        int above_in_image = (m - cm->mode_info_stride)->mbmi.mb_in_image;
+        int left_mode = (m - 1)->mbmi.mode;
+        int above_mode = (m - cm->mode_info_stride)->mbmi.mode;
+        int left_interp, above_interp;
+        if (left_in_image && left_mode >= NEARESTMV && left_mode <= SPLITMV)
+          left_interp = vp9_switchable_interp_map[(m - 1)->mbmi.interp_filter];
+        else
+          left_interp = VP9_SWITCHABLE_FILTERS;
+        if (above_in_image && above_mode >= NEARESTMV && above_mode <= SPLITMV)
+          above_interp = vp9_switchable_interp_map[
+              (m - cm->mode_info_stride)->mbmi.interp_filter];
+        else
+          above_interp = VP9_SWITCHABLE_FILTERS;
+
+        if (left_interp == above_interp)
+          pred_context = left_interp;
+        else if (left_interp == VP9_SWITCHABLE_FILTERS &&
+                 above_interp != VP9_SWITCHABLE_FILTERS)
+          pred_context = above_interp;
+        else if (left_interp != VP9_SWITCHABLE_FILTERS &&
+                 above_interp == VP9_SWITCHABLE_FILTERS)
+          pred_context = left_interp;
+        else
+          pred_context = VP9_SWITCHABLE_FILTERS;
+      }
+      break;
+
+    default:
+      // TODO *** add error trap code.
+      pred_context = 0;
+      break;
+  }
+
+  return pred_context;
+}
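
For the flag-type cases above (PRED_SEG_ID, PRED_REF, PRED_MBSKIP) the context is simply the sum of the left and above neighbours' flags, so it takes the values 0, 1 or 2; the zero-initialised one-entry border guarantees the neighbour reads stay in bounds. A toy model of that rule (names illustrative, not the library's):

    #include <assert.h>

    /* flags[] models per-MB flags stored with a zeroed one-entry border,
     * so the left/above reads below never leave the array. */
    static int pred_context(const unsigned char *flags, int stride, int pos) {
      return flags[pos - 1] + flags[pos - stride];  /* 0, 1 or 2 */
    }

    int main(void) {
      unsigned char flags[3 * 3] = {0};  /* row 0 and column 0 are the border */
      flags[1 * 3 + 1] = 1;              /* left neighbour of entry (1,2) */
      assert(pred_context(flags, 3, 1 * 3 + 2) == 1);
      return 0;
    }
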
+
+// This function returns a context probability for coding a given
+// prediction signal
+vp9_prob vp9_get_pred_prob(const VP9_COMMON *const cm,
+                          const MACROBLOCKD *const xd,
+                          PRED_ID pred_id) {
+  vp9_prob pred_probability;
+  int pred_context;
+
+  // Get the appropriate prediction context
+  pred_context = vp9_get_pred_context(cm, xd, pred_id);
+
+  switch (pred_id) {
+    case PRED_SEG_ID:
+      pred_probability = cm->segment_pred_probs[pred_context];
+      break;
+
+    case PRED_REF:
+      pred_probability = cm->ref_pred_probs[pred_context];
+      break;
+
+    case PRED_COMP:
+      // In keeping with convention elsewhere the probability returned is
+      // the probability of a "0" outcome which in this case means the
+      // probability of comp pred off.
+      pred_probability = cm->prob_comppred[pred_context];
+      break;
+
+    case PRED_MBSKIP:
+      pred_probability = cm->mbskip_pred_probs[pred_context];
+      break;
+
+    default:
+      // TODO *** add error trap code.
+      pred_probability = 128;
+      break;
+  }
+
+  return pred_probability;
+}
+
+// This function returns a context probability ptr for coding a given
+// prediction signal
+const vp9_prob *vp9_get_pred_probs(const VP9_COMMON *const cm,
+                                   const MACROBLOCKD *const xd,
+                                   PRED_ID pred_id) {
+  const vp9_prob *pred_probability;
+  int pred_context;
+
+  // Get the appropriate prediction context
+  pred_context = vp9_get_pred_context(cm, xd, pred_id);
+
+  switch (pred_id) {
+    case PRED_SEG_ID:
+      pred_probability = &cm->segment_pred_probs[pred_context];
+      break;
+
+    case PRED_REF:
+      pred_probability = &cm->ref_pred_probs[pred_context];
+      break;
+
+    case PRED_COMP:
+      // In keeping with convention elsewhere the probability returned is
+      // the probability of a "0" outcome which in this case means the
+      // probability of comp pred off.
+      pred_probability = &cm->prob_comppred[pred_context];
+      break;
+
+    case PRED_MBSKIP:
+      pred_probability = &cm->mbskip_pred_probs[pred_context];
+      break;
+
+    case PRED_SWITCHABLE_INTERP:
+      pred_probability = &cm->fc.switchable_interp_prob[pred_context][0];
+      break;
+
+    default:
+      // TODO *** add error trap code.
+      pred_probability = NULL;
+      break;
+  }
+
+  return pred_probability;
+}
+
+// This function returns the status of the given prediction signal,
+// i.e. whether the predicted value for the given signal is correct.
+unsigned char vp9_get_pred_flag(const MACROBLOCKD *const xd,
+                                PRED_ID pred_id) {
+  unsigned char pred_flag = 0;
+
+  switch (pred_id) {
+    case PRED_SEG_ID:
+      pred_flag = xd->mode_info_context->mbmi.seg_id_predicted;
+      break;
+
+    case PRED_REF:
+      pred_flag = xd->mode_info_context->mbmi.ref_predicted;
+      break;
+
+    case PRED_MBSKIP:
+      pred_flag = xd->mode_info_context->mbmi.mb_skip_coeff;
+      break;
+
+    default:
+      // TODO *** add error trap code.
+      pred_flag = 0;
+      break;
+  }
+
+  return pred_flag;
+}
+
+// This function sets the status of the given prediction signal,
+// i.e. whether the predicted value for the given signal was correct.
+void vp9_set_pred_flag(MACROBLOCKD *const xd,
+                       PRED_ID pred_id,
+                       unsigned char pred_flag) {
+#if CONFIG_SUPERBLOCKS
+  const int mis = xd->mode_info_stride;
+#endif
+
+  switch (pred_id) {
+    case PRED_SEG_ID:
+      xd->mode_info_context->mbmi.seg_id_predicted = pred_flag;
+#if CONFIG_SUPERBLOCKS
+      if (xd->mode_info_context->mbmi.encoded_as_sb) {
+        if (xd->mb_to_right_edge > 0)
+          xd->mode_info_context[1].mbmi.seg_id_predicted = pred_flag;
+        if (xd->mb_to_bottom_edge > 0) {
+          xd->mode_info_context[mis].mbmi.seg_id_predicted = pred_flag;
+          if (xd->mb_to_right_edge > 0)
+            xd->mode_info_context[mis + 1].mbmi.seg_id_predicted = pred_flag;
+        }
+      }
+#endif
+      break;
+
+    case PRED_REF:
+      xd->mode_info_context->mbmi.ref_predicted = pred_flag;
+#if CONFIG_SUPERBLOCKS
+      if (xd->mode_info_context->mbmi.encoded_as_sb) {
+        if (xd->mb_to_right_edge > 0)
+          xd->mode_info_context[1].mbmi.ref_predicted = pred_flag;
+        if (xd->mb_to_bottom_edge > 0) {
+          xd->mode_info_context[mis].mbmi.ref_predicted = pred_flag;
+          if (xd->mb_to_right_edge > 0)
+            xd->mode_info_context[mis + 1].mbmi.ref_predicted = pred_flag;
+        }
+      }
+#endif
+      break;
+
+    case PRED_MBSKIP:
+      xd->mode_info_context->mbmi.mb_skip_coeff = pred_flag;
+#if CONFIG_SUPERBLOCKS
+      if (xd->mode_info_context->mbmi.encoded_as_sb) {
+        if (xd->mb_to_right_edge > 0)
+          xd->mode_info_context[1].mbmi.mb_skip_coeff = pred_flag;
+        if (xd->mb_to_bottom_edge > 0) {
+          xd->mode_info_context[mis].mbmi.mb_skip_coeff = pred_flag;
+          if (xd->mb_to_right_edge > 0)
+            xd->mode_info_context[mis + 1].mbmi.mb_skip_coeff = pred_flag;
+        }
+      }
+#endif
+      break;
+
+    default:
+      // TODO *** add error trap code.
+      break;
+  }
+}
+
+
+// The following contains the guts of the prediction code used to
+// predict various bitstream signals.
+
+// Macroblock segment id prediction function
+unsigned char vp9_get_pred_mb_segid(const VP9_COMMON *const cm,
+                                    const MACROBLOCKD *const xd, int MbIndex) {
+  // Currently the prediction for the macroblock segment ID is
+  // the value stored for this macroblock in the previous frame.
+#if CONFIG_SUPERBLOCKS
+  if (!xd->mode_info_context->mbmi.encoded_as_sb) {
+#endif
+    return cm->last_frame_seg_map[MbIndex];
+#if CONFIG_SUPERBLOCKS
+  } else {
+    int seg_id = cm->last_frame_seg_map[MbIndex];
+    int mb_col = MbIndex % cm->mb_cols;
+    int mb_row = MbIndex / cm->mb_cols;
+    if (mb_col + 1 < cm->mb_cols)
+      seg_id = seg_id && cm->last_frame_seg_map[MbIndex + 1];
+    if (mb_row + 1 < cm->mb_rows) {
+      seg_id = seg_id && cm->last_frame_seg_map[MbIndex + cm->mb_cols];
+      if (mb_col + 1 < cm->mb_cols)
+        seg_id = seg_id && cm->last_frame_seg_map[MbIndex + cm->mb_cols + 1];
+    }
+    return seg_id;
+  }
+#endif
+}
+
+MV_REFERENCE_FRAME vp9_get_pred_ref(const VP9_COMMON *const cm,
+                                    const MACROBLOCKD *const xd) {
+  MODE_INFO *m = xd->mode_info_context;
+
+  MV_REFERENCE_FRAME left;
+  MV_REFERENCE_FRAME above;
+  MV_REFERENCE_FRAME above_left;
+  MV_REFERENCE_FRAME pred_ref = LAST_FRAME;
+
+  int segment_id = xd->mode_info_context->mbmi.segment_id;
+  int seg_ref_active;
+  int i;
+
+  unsigned char frame_allowed[MAX_REF_FRAMES] = {1, 1, 1, 1};
+  unsigned char ref_score[MAX_REF_FRAMES];
+  unsigned char best_score = 0;
+  unsigned char left_in_image;
+  unsigned char above_in_image;
+  unsigned char above_left_in_image;
+
+  // Is segment coding enabled?
+  seg_ref_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME);
+
+  // Special case treatment if segment coding is enabled.
+  // Don't allow prediction of a reference frame that the segment
+  // does not allow
+  if (seg_ref_active) {
+    for (i = 0; i < MAX_REF_FRAMES; i++) {
+      frame_allowed[i] =
+        vp9_check_segref(xd, segment_id, i);
+
+      // Score set to 0 if ref frame not allowed
+      ref_score[i] = cm->ref_scores[i] * frame_allowed[i];
+    }
+  } else
+    vpx_memcpy(ref_score, cm->ref_scores, sizeof(ref_score));
+
+  // Reference frames used by neighbours
+  left = (m - 1)->mbmi.ref_frame;
+  above = (m - cm->mode_info_stride)->mbmi.ref_frame;
+  above_left = (m - 1 - cm->mode_info_stride)->mbmi.ref_frame;
+
+  // Are neighbours in image
+  left_in_image = (m - 1)->mbmi.mb_in_image;
+  above_in_image = (m - cm->mode_info_stride)->mbmi.mb_in_image;
+  above_left_in_image = (m - 1 - cm->mode_info_stride)->mbmi.mb_in_image;
+
+  // Adjust scores for candidate reference frames based on neighbours
+  if (frame_allowed[left] && left_in_image) {
+    ref_score[left] += 16;
+    if (above_left_in_image && (left == above_left))
+      ref_score[left] += 4;
+  }
+  if (frame_allowed[above] && above_in_image) {
+    ref_score[above] += 16;
+    if (above_left_in_image && (above == above_left))
+      ref_score[above] += 4;
+  }
+
+  // Now choose the candidate with the highest score
+  for (i = 0; i < MAX_REF_FRAMES; i++) {
+    if (ref_score[i] > best_score) {
+      pred_ref = i;
+      best_score = ref_score[i];
+    }
+  }
+
+  return pred_ref;
+}
+
+// Functions to compute a set of modified reference frame probabilities
+// to use when prediction of the reference frame value fails.
+void vp9_calc_ref_probs(int *count, vp9_prob *probs) {
+  int tot_count;
+
+  tot_count = count[0] + count[1] + count[2] + count[3];
+  if (tot_count) {
+    probs[0] = (vp9_prob)((count[0] * 255 + (tot_count >> 1)) / tot_count);
+    probs[0] += !probs[0];
+  } else
+    probs[0] = 128;
+
+  tot_count -= count[0];
+  if (tot_count) {
+    probs[1] = (vp9_prob)((count[1] * 255 + (tot_count >> 1)) / tot_count);
+    probs[1] += !probs[1];
+  } else
+    probs[1] = 128;
+
+  tot_count -= count[1];
+  if (tot_count) {
+    probs[2] = (vp9_prob)((count[2] * 255 + (tot_count >> 1)) / tot_count);
+    probs[2] += !probs[2];
+  } else
+    probs[2] = 128;
+
+}
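
Each probs[i] above is the i-th count scaled to 1..255 against the as-yet-unassigned remainder, rounded to nearest and bumped away from zero (a zero probability is illegal for the arithmetic coder). A standalone rerun of the same arithmetic on hypothetical counts:

    #include <stdio.h>

    int main(void) {
      int count[4] = {60, 120, 40, 35};   /* hypothetical frequencies */
      unsigned char probs[3];
      int tot = count[0] + count[1] + count[2] + count[3];
      int i;

      for (i = 0; i < 3; i++) {
        probs[i] = tot ? (count[i] * 255 + (tot >> 1)) / tot : 128;
        probs[i] += !probs[i];   /* probabilities must stay nonzero */
        tot -= count[i];         /* condition the next branch on "not i" */
      }
      printf("%d %d %d\n", probs[0], probs[1], probs[2]);  /* 60 157 136 */
      return 0;
    }
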
+
+// Computes a set of modified conditional probabilities for the reference
+// frame. Values will be set to 0 for reference frame options that are not
+// possible, either because they were predicted and prediction failed, or
+// because they are not allowed for a given segment.
+void vp9_compute_mod_refprobs(VP9_COMMON *const cm) {
+  int norm_cnt[MAX_REF_FRAMES];
+  int intra_count;
+  int inter_count;
+  int last_count;
+  int gfarf_count;
+  int gf_count;
+  int arf_count;
+
+  intra_count = cm->prob_intra_coded;
+  inter_count = (255 - intra_count);
+  last_count = (inter_count * cm->prob_last_coded) / 255;
+  gfarf_count = inter_count - last_count;
+  gf_count = (gfarf_count * cm->prob_gf_coded) / 255;
+  arf_count = gfarf_count - gf_count;
+
+  // Work out modified reference frame probabilities to use where prediction
+  // of the reference frame fails
+  norm_cnt[0] = 0;
+  norm_cnt[1] = last_count;
+  norm_cnt[2] = gf_count;
+  norm_cnt[3] = arf_count;
+  vp9_calc_ref_probs(norm_cnt, cm->mod_refprobs[INTRA_FRAME]);
+  cm->mod_refprobs[INTRA_FRAME][0] = 0;    // This branch implicit
+
+  norm_cnt[0] = intra_count;
+  norm_cnt[1] = 0;
+  norm_cnt[2] = gf_count;
+  norm_cnt[3] = arf_count;
+  vp9_calc_ref_probs(norm_cnt, cm->mod_refprobs[LAST_FRAME]);
+  cm->mod_refprobs[LAST_FRAME][1] = 0;    // This branch implicit
+
+  norm_cnt[0] = intra_count;
+  norm_cnt[1] = last_count;
+  norm_cnt[2] = 0;
+  norm_cnt[3] = arf_count;
+  vp9_calc_ref_probs(norm_cnt, cm->mod_refprobs[GOLDEN_FRAME]);
+  cm->mod_refprobs[GOLDEN_FRAME][2] = 0;  // This branch implicit
+
+  norm_cnt[0] = intra_count;
+  norm_cnt[1] = last_count;
+  norm_cnt[2] = gf_count;
+  norm_cnt[3] = 0;
+  vp9_calc_ref_probs(norm_cnt, cm->mod_refprobs[ALTREF_FRAME]);
+  cm->mod_refprobs[ALTREF_FRAME][2] = 0;  // This branch implicit
+
+  // Score the reference frames based on overall frequency.
+  // These scores contribute to the prediction choices.
+  // Max score 17, min 1.
+  cm->ref_scores[INTRA_FRAME] = 1 + (intra_count * 16 / 255);
+  cm->ref_scores[LAST_FRAME] = 1 + (last_count * 16 / 255);
+  cm->ref_scores[GOLDEN_FRAME] = 1 + (gf_count * 16 / 255);
+  cm->ref_scores[ALTREF_FRAME] = 1 + (arf_count * 16 / 255);
+}
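
Putting the last two functions together: vp9_get_pred_ref() starts each allowed frame at its frequency score (1..17, computed above), adds 16 for each in-image neighbour coded from that frame plus 4 when that neighbour agrees with the above-left one, and picks the highest total. A worked example with hypothetical scores:

    #include <stdio.h>

    int main(void) {
      int score[4] = {3, 9, 6, 2};   /* hypothetical intra/last/golden/altref */
      int left = 1, above = 2, above_left = 2;  /* neighbours' ref frames */
      int best = 0, i;

      score[left]  += 16;      /* left neighbour is in the image */
      score[above] += 16;      /* above neighbour is in the image */
      if (above == above_left)
        score[above] += 4;     /* agreement bonus with above-left */

      for (i = 1; i < 4; i++)
        if (score[i] > score[best])
          best = i;
      printf("predicted ref = %d\n", best);  /* 2 (golden): 6 + 16 + 4 = 26 */
      return 0;
    }
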
--- /dev/null
+++ b/vp9/common/pred_common.h
@@ -1,0 +1,56 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "type_aliases.h"
+#include "onyxc_int.h"
+#include "vp9/common/blockd.h"
+
+#ifndef __INC_PRED_COMMON_H__
+#define __INC_PRED_COMMON_H__ 1
+
+
+// Predicted items
+typedef enum {
+  PRED_SEG_ID = 0,               // Segment identifier
+  PRED_REF = 1,
+  PRED_COMP = 2,
+  PRED_MBSKIP = 3,
+  PRED_SWITCHABLE_INTERP = 4
+} PRED_ID;
+
+extern unsigned char vp9_get_pred_context(const VP9_COMMON *const cm,
+                                          const MACROBLOCKD *const xd,
+                                          PRED_ID pred_id);
+
+extern vp9_prob vp9_get_pred_prob(const VP9_COMMON *const cm,
+                                  const MACROBLOCKD *const xd,
+                                  PRED_ID pred_id);
+
+extern const vp9_prob *vp9_get_pred_probs(const VP9_COMMON *const cm,
+                                          const MACROBLOCKD *const xd,
+                                          PRED_ID pred_id);
+
+extern unsigned char vp9_get_pred_flag(const MACROBLOCKD *const xd,
+                                       PRED_ID pred_id);
+
+extern void vp9_set_pred_flag(MACROBLOCKD *const xd,
+                              PRED_ID pred_id,
+                              unsigned char pred_flag);
+
+
+extern unsigned char vp9_get_pred_mb_segid(const VP9_COMMON *const cm,
+                                           const MACROBLOCKD *const xd,
+                                           int MbIndex);
+
+extern MV_REFERENCE_FRAME vp9_get_pred_ref(const VP9_COMMON *const cm,
+                                           const MACROBLOCKD *const xd);
+extern void vp9_compute_mod_refprobs(VP9_COMMON *const cm);
+
+#endif /* __INC_PRED_COMMON_H__ */
--- /dev/null
+++ b/vp9/common/quant_common.c
@@ -1,0 +1,125 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "quant_common.h"
+
+static int dc_qlookup[QINDEX_RANGE];
+static int ac_qlookup[QINDEX_RANGE];
+
+#define ACDC_MIN 4
+
+void vp9_init_quant_tables(void) {
+  int i;
+  int current_val = 4;
+  int last_val = 4;
+  int ac_val;
+
+  for (i = 0; i < QINDEX_RANGE; i++) {
+    ac_qlookup[i] = current_val;
+    current_val = (int)((double)current_val * 1.02);
+    if (current_val == last_val)
+      current_val++;
+    last_val = current_val;
+
+    ac_val = ac_qlookup[i];
+    dc_qlookup[i] = (0.000000305 * ac_val * ac_val * ac_val) +
+                    (-0.00065 * ac_val * ac_val) +
+                    (0.9 * ac_val) + 0.5;
+    if (dc_qlookup[i] < ACDC_MIN)
+      dc_qlookup[i] = ACDC_MIN;
+  }
+}
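
The AC step size above grows roughly 2% per Q index, with a +1 bump so the table stays strictly increasing even while integer truncation keeps small values flat, and the DC step is a cubic fit of the AC value clamped to ACDC_MIN. A standalone rerun of the construction (QINDEX_RANGE assumed to be 256 here purely for illustration):

    #include <stdio.h>

    #define Q_RANGE  256   /* assumption for this sketch */
    #define ACDC_MIN 4

    int main(void) {
      int ac[Q_RANGE], dc[Q_RANGE];
      int i, cur = 4;

      for (i = 0; i < Q_RANGE; i++) {
        ac[i] = cur;
        cur = (int)((double)cur * 1.02);
        if (cur == ac[i])
          cur++;                       /* force strict growth at small q */
        dc[i] = (int)(0.000000305 * ac[i] * ac[i] * ac[i]
                      - 0.00065 * ac[i] * ac[i] + 0.9 * ac[i] + 0.5);
        if (dc[i] < ACDC_MIN)
          dc[i] = ACDC_MIN;
      }
      printf("q=0:   ac=%d dc=%d\n", ac[0], dc[0]);
      printf("q=%d: ac=%d dc=%d\n", Q_RANGE - 1,
             ac[Q_RANGE - 1], dc[Q_RANGE - 1]);
      return 0;
    }
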
+
+int vp9_dc_quant(int QIndex, int Delta) {
+  int retval;
+
+  QIndex = QIndex + Delta;
+
+  if (QIndex > MAXQ)
+    QIndex = MAXQ;
+  else if (QIndex < 0)
+    QIndex = 0;
+
+  retval = dc_qlookup[ QIndex ];
+  return retval;
+}
+
+int vp9_dc2quant(int QIndex, int Delta) {
+  int retval;
+
+  QIndex = QIndex + Delta;
+
+  if (QIndex > MAXQ)
+    QIndex = MAXQ;
+  else if (QIndex < 0)
+    QIndex = 0;
+
+  retval = dc_qlookup[ QIndex ];
+
+  return retval;
+}
+
+int vp9_dc_uv_quant(int QIndex, int Delta) {
+  int retval;
+
+  QIndex = QIndex + Delta;
+
+  if (QIndex > MAXQ)
+    QIndex = MAXQ;
+  else if (QIndex < 0)
+    QIndex = 0;
+
+  retval = dc_qlookup[ QIndex ];
+
+  return retval;
+}
+
+int vp9_ac_yquant(int QIndex) {
+  int retval;
+
+  if (QIndex > MAXQ)
+    QIndex = MAXQ;
+  else if (QIndex < 0)
+    QIndex = 0;
+
+  retval = ac_qlookup[ QIndex ];
+  return retval;
+}
+
+int vp9_ac2quant(int QIndex, int Delta) {
+  int retval;
+
+  QIndex = QIndex + Delta;
+
+  if (QIndex > MAXQ)
+    QIndex = MAXQ;
+  else if (QIndex < 0)
+    QIndex = 0;
+
+  retval = (ac_qlookup[ QIndex ] * 775) / 1000;
+  if (retval < 4)
+    retval = 4;
+
+  return retval;
+}
+
+int vp9_ac_uv_quant(int QIndex, int Delta) {
+  int retval;
+
+  QIndex = QIndex + Delta;
+
+  if (QIndex > MAXQ)
+    QIndex = MAXQ;
+  else if (QIndex < 0)
+    QIndex = 0;
+
+  retval = ac_qlookup[ QIndex ];
+  return retval;
+}
--- /dev/null
+++ b/vp9/common/quant_common.h
@@ -1,0 +1,22 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_QUANT_COMMON_H__
+#define __INC_QUANT_COMMON_H__
+
+#include <string.h>
+#include "blockd.h"
+#include "onyxc_int.h"
+
+extern void vp9_init_quant_tables(void);
+extern int vp9_ac_yquant(int QIndex);
+extern int vp9_dc_quant(int QIndex, int Delta);
+extern int vp9_dc2quant(int QIndex, int Delta);
+extern int vp9_ac2quant(int QIndex, int Delta);
+extern int vp9_dc_uv_quant(int QIndex, int Delta);
+extern int vp9_ac_uv_quant(int QIndex, int Delta);
--- /dev/null
+++ b/vp9/common/recon.c
@@ -1,0 +1,197 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_rtcd.h"
+#include "blockd.h"
+
+void vp9_recon_b_c(unsigned char *pred_ptr,
+                   short *diff_ptr,
+                   unsigned char *dst_ptr,
+                   int stride) {
+  int r, c;
+
+  for (r = 0; r < 4; r++) {
+    for (c = 0; c < 4; c++) {
+      int a = diff_ptr[c] + pred_ptr[c];
+
+      if (a < 0)
+        a = 0;
+
+      if (a > 255)
+        a = 255;
+
+      dst_ptr[c] = (unsigned char) a;
+    }
+
+    dst_ptr += stride;
+    diff_ptr += 16;
+    pred_ptr += 16;
+  }
+}
+
+void vp9_recon_uv_b_c(unsigned char *pred_ptr,
+                      short *diff_ptr,
+                      unsigned char *dst_ptr,
+                      int stride) {
+  int r, c;
+
+  for (r = 0; r < 4; r++) {
+    for (c = 0; c < 4; c++) {
+      int a = diff_ptr[c] + pred_ptr[c];
+
+      if (a < 0)
+        a = 0;
+
+      if (a > 255)
+        a = 255;
+
+      dst_ptr[c] = (unsigned char) a;
+    }
+
+    dst_ptr += stride;
+    diff_ptr += 8;
+    pred_ptr += 8;
+  }
+}
+
+void vp9_recon4b_c(unsigned char *pred_ptr,
+                   short *diff_ptr,
+                   unsigned char *dst_ptr,
+                   int stride) {
+  int r, c;
+
+  for (r = 0; r < 4; r++) {
+    for (c = 0; c < 16; c++) {
+      int a = diff_ptr[c] + pred_ptr[c];
+
+      if (a < 0)
+        a = 0;
+
+      if (a > 255)
+        a = 255;
+
+      dst_ptr[c] = (unsigned char) a;
+    }
+
+    dst_ptr += stride;
+    diff_ptr += 16;
+    pred_ptr += 16;
+  }
+}
+
+void vp9_recon2b_c(unsigned char *pred_ptr,
+                   short *diff_ptr,
+                   unsigned char *dst_ptr,
+                   int stride) {
+  int r, c;
+
+  for (r = 0; r < 4; r++) {
+    for (c = 0; c < 8; c++) {
+      int a = diff_ptr[c] + pred_ptr[c];
+
+      if (a < 0)
+        a = 0;
+
+      if (a > 255)
+        a = 255;
+
+      dst_ptr[c] = (unsigned char) a;
+    }
+
+    dst_ptr += stride;
+    diff_ptr += 8;
+    pred_ptr += 8;
+  }
+}
+
+#if CONFIG_SUPERBLOCKS
+void vp9_recon_mby_s_c(MACROBLOCKD *xd, uint8_t *dst) {
+  int x, y;
+  BLOCKD *b = &xd->block[0];
+  int stride = b->dst_stride;
+  short *diff = b->diff;
+
+  for (y = 0; y < 16; y++) {
+    for (x = 0; x < 16; x++) {
+      int a = dst[x] + diff[x];
+      if (a < 0)
+        a = 0;
+      else if (a > 255)
+        a = 255;
+      dst[x] = a;
+    }
+    dst += stride;
+    diff += 16;
+  }
+}
+
+void vp9_recon_mbuv_s_c(MACROBLOCKD *xd, uint8_t *udst, uint8_t *vdst) {
+  int x, y, i;
+  uint8_t *dst = udst;
+
+  for (i = 0; i < 2; i++, dst = vdst) {
+    BLOCKD *b = &xd->block[16 + 4 * i];
+    int stride = b->dst_stride;
+    short *diff = b->diff;
+
+    for (y = 0; y < 8; y++) {
+      for (x = 0; x < 8; x++) {
+        int a = dst[x] + diff[x];
+        if (a < 0)
+          a = 0;
+        else if (a > 255)
+          a = 255;
+        dst[x] = a;
+      }
+      dst += stride;
+      diff += 8;
+    }
+  }
+}
+#endif
+
+void vp9_recon_mby_c(MACROBLOCKD *xd) {
+  int i;
+
+  for (i = 0; i < 16; i += 4) {
+    BLOCKD *b = &xd->block[i];
+
+    vp9_recon4b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+  }
+}
+
+void vp9_recon_mb_c(MACROBLOCKD *xd) {
+  int i;
+
+  for (i = 0; i < 16; i += 4) {
+    BLOCKD *b = &xd->block[i];
+
+    vp9_recon4b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+  }
+
+  for (i = 16; i < 24; i += 2) {
+    BLOCKD *b = &xd->block[i];
+
+    vp9_recon2b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+  }
+}
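
Every kernel in this file reduces to dst = clip(pred + diff) over some block shape, with the residual laid out at the predictor's stride. A compact generic sketch of that operation (hypothetical helper, not part of the codec):

    #include <stdio.h>

    static unsigned char clip255(int v) {
      return (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
    }

    static void recon_block(const unsigned char *pred, const short *diff,
                            unsigned char *dst, int width, int height,
                            int pred_stride, int dst_stride) {
      int r, c;
      for (r = 0; r < height; r++) {
        for (c = 0; c < width; c++)
          dst[c] = clip255(pred[c] + diff[c]);
        pred += pred_stride;
        diff += pred_stride;  /* diff shares the predictor's stride here */
        dst  += dst_stride;
      }
    }

    int main(void) {
      unsigned char pred[16] = {250, 10}, dst[16];
      short diff[16] = {20, -30};
      recon_block(pred, diff, dst, 4, 4, 4, 4);
      printf("%d %d\n", dst[0], dst[1]);  /* 255 0: both ends clipped */
      return 0;
    }
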
--- /dev/null
+++ b/vp9/common/reconinter.c
@@ -1,0 +1,1145 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx/vpx_integer.h"
+#include "subpixel.h"
+#include "blockd.h"
+#include "reconinter.h"
+#if CONFIG_RUNTIME_CPU_DETECT
+#include "onyxc_int.h"
+#endif
+
+void vp9_setup_interp_filters(MACROBLOCKD *xd,
+                              INTERPOLATIONFILTERTYPE mcomp_filter_type,
+                              VP9_COMMON *cm) {
+  if (mcomp_filter_type == SIXTAP) {
+    xd->subpixel_predict        = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, sixtap4x4);
+    xd->subpixel_predict8x4     = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, sixtap8x4);
+    xd->subpixel_predict8x8     = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, sixtap8x8);
+    xd->subpixel_predict16x16   = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, sixtap16x16);
+    xd->subpixel_predict_avg    = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, sixtap_avg4x4);
+    xd->subpixel_predict_avg8x8 = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, sixtap_avg8x8);
+    xd->subpixel_predict_avg16x16 = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, sixtap_avg16x16);
+  } else if (mcomp_filter_type == EIGHTTAP || mcomp_filter_type == SWITCHABLE) {
+    xd->subpixel_predict        = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, eighttap4x4);
+    xd->subpixel_predict8x4     = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, eighttap8x4);
+    xd->subpixel_predict8x8     = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, eighttap8x8);
+    xd->subpixel_predict16x16   = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, eighttap16x16);
+    xd->subpixel_predict_avg    = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, eighttap_avg4x4);
+    xd->subpixel_predict_avg8x8 = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, eighttap_avg8x8);
+    xd->subpixel_predict_avg16x16 = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, eighttap_avg16x16);
+  } else if (mcomp_filter_type == EIGHTTAP_SHARP) {
+    xd->subpixel_predict        = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, eighttap4x4_sharp);
+    xd->subpixel_predict8x4     = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, eighttap8x4_sharp);
+    xd->subpixel_predict8x8     = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, eighttap8x8_sharp);
+    xd->subpixel_predict16x16   = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, eighttap16x16_sharp);
+    xd->subpixel_predict_avg    = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, eighttap_avg4x4_sharp);
+    xd->subpixel_predict_avg8x8 = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, eighttap_avg8x8_sharp);
+    xd->subpixel_predict_avg16x16 = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, eighttap_avg16x16_sharp);
+  } else {
+    xd->subpixel_predict        = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, bilinear4x4);
+    xd->subpixel_predict8x4     = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, bilinear8x4);
+    xd->subpixel_predict8x8     = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, bilinear8x8);
+    xd->subpixel_predict16x16   = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, bilinear16x16);
+    xd->subpixel_predict_avg    = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, bilinear_avg4x4);
+    xd->subpixel_predict_avg8x8 = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, bilinear_avg8x8);
+    xd->subpixel_predict_avg16x16 = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, bilinear_avg16x16);
+  }
+}
+
+void vp9_copy_mem16x16_c(unsigned char *src,
+                         int src_stride,
+                         unsigned char *dst,
+                         int dst_stride) {
+  int r;
+
+  for (r = 0; r < 16; r++) {
+#if !(CONFIG_FAST_UNALIGNED)
+    dst[0] = src[0];
+    dst[1] = src[1];
+    dst[2] = src[2];
+    dst[3] = src[3];
+    dst[4] = src[4];
+    dst[5] = src[5];
+    dst[6] = src[6];
+    dst[7] = src[7];
+    dst[8] = src[8];
+    dst[9] = src[9];
+    dst[10] = src[10];
+    dst[11] = src[11];
+    dst[12] = src[12];
+    dst[13] = src[13];
+    dst[14] = src[14];
+    dst[15] = src[15];
+
+#else
+    ((uint32_t *)dst)[0] = ((uint32_t *)src)[0];
+    ((uint32_t *)dst)[1] = ((uint32_t *)src)[1];
+    ((uint32_t *)dst)[2] = ((uint32_t *)src)[2];
+    ((uint32_t *)dst)[3] = ((uint32_t *)src)[3];
+
+#endif
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+void vp9_avg_mem16x16_c(unsigned char *src,
+                        int src_stride,
+                        unsigned char *dst,
+                        int dst_stride) {
+  int r;
+
+  for (r = 0; r < 16; r++) {
+    int n;
+
+    for (n = 0; n < 16; n++) {
+      dst[n] = (dst[n] + src[n] + 1) >> 1;
+    }
+
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+void vp9_copy_mem8x8_c(unsigned char *src,
+                       int src_stride,
+                       unsigned char *dst,
+                       int dst_stride) {
+  int r;
+
+  for (r = 0; r < 8; r++) {
+#if !(CONFIG_FAST_UNALIGNED)
+    dst[0] = src[0];
+    dst[1] = src[1];
+    dst[2] = src[2];
+    dst[3] = src[3];
+    dst[4] = src[4];
+    dst[5] = src[5];
+    dst[6] = src[6];
+    dst[7] = src[7];
+#else
+    ((uint32_t *)dst)[0] = ((uint32_t *)src)[0];
+    ((uint32_t *)dst)[1] = ((uint32_t *)src)[1];
+#endif
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+void vp9_avg_mem8x8_c(unsigned char *src,
+                      int src_stride,
+                      unsigned char *dst,
+                      int dst_stride) {
+  int r;
+
+  for (r = 0; r < 8; r++) {
+    int n;
+
+    for (n = 0; n < 8; n++) {
+      dst[n] = (dst[n] + src[n] + 1) >> 1;
+    }
+
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+void vp9_copy_mem8x4_c(unsigned char *src,
+                       int src_stride,
+                       unsigned char *dst,
+                       int dst_stride) {
+  int r;
+
+  for (r = 0; r < 4; r++) {
+#if !(CONFIG_FAST_UNALIGNED)
+    dst[0] = src[0];
+    dst[1] = src[1];
+    dst[2] = src[2];
+    dst[3] = src[3];
+    dst[4] = src[4];
+    dst[5] = src[5];
+    dst[6] = src[6];
+    dst[7] = src[7];
+#else
+    ((uint32_t *)dst)[0] = ((uint32_t *)src)[0];
+    ((uint32_t *)dst)[1] = ((uint32_t *)src)[1];
+#endif
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+void vp9_build_inter_predictors_b(BLOCKD *d, int pitch, vp9_subpix_fn_t sppf) {
+  int r;
+  unsigned char *ptr_base;
+  unsigned char *ptr;
+  unsigned char *pred_ptr = d->predictor;
+  int_mv mv;
+
+  ptr_base = *(d->base_pre);
+  mv.as_int = d->bmi.as_mv.first.as_int;
+
+  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
+    ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
+          (mv.as_mv.col >> 3);
+    sppf(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, (mv.as_mv.row & 7) << 1,
+         pred_ptr, pitch);
+  } else {
+    ptr_base += d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
+                (mv.as_mv.col >> 3);
+    ptr = ptr_base;
+
+    for (r = 0; r < 4; r++) {
+#if !(CONFIG_FAST_UNALIGNED)
+      pred_ptr[0]  = ptr[0];
+      pred_ptr[1]  = ptr[1];
+      pred_ptr[2]  = ptr[2];
+      pred_ptr[3]  = ptr[3];
+#else
+      *(uint32_t *)pred_ptr = *(uint32_t *)ptr;
+#endif
+      pred_ptr     += pitch;
+      ptr         += d->pre_stride;
+    }
+  }
+}
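
The motion vectors here are in 1/8-pel units: mv >> 3 selects the integer pixel, and (mv & 7) << 1 turns the eighth-pel remainder into the 1/16-pel phase the subpel predictors expect; when both remainders are zero the code falls back to a straight copy. A small illustration (the negative shift relies on arithmetic right shift, as the codec does):

    #include <stdio.h>

    int main(void) {
      int mv_row = 13, mv_col = -6;        /* 1.625 px down, 0.75 px left */
      int int_row   = mv_row >> 3;         /*  1  full-pel rows */
      int int_col   = mv_col >> 3;         /* -1  (arithmetic shift floors) */
      int phase_row = (mv_row & 7) << 1;   /* 10  1/16-pel filter phase */
      int phase_col = (mv_col & 7) << 1;   /*  4  (-1 + 2/8 = -0.75) */

      printf("offset (%d,%d), phase (%d,%d)\n",
             int_row, int_col, phase_row, phase_col);
      return 0;
    }
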
+
+/*
+ * Similar to vp9_build_inter_predictors_b(), but instead of storing the
+ * results in d->predictor, we average the contents of d->predictor (which
+ * come from an earlier call to vp9_build_inter_predictors_b()) with the
+ * predictor of the second reference frame / motion vector.
+ */
+void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch,
+                                      vp9_subpix_fn_t sppf) {
+  int r;
+  unsigned char *ptr_base;
+  unsigned char *ptr;
+  unsigned char *pred_ptr = d->predictor;
+  int_mv mv;
+
+  ptr_base = *(d->base_second_pre);
+  mv.as_int = d->bmi.as_mv.second.as_int;
+
+  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
+    ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
+          (mv.as_mv.col >> 3);
+    sppf(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, (mv.as_mv.row & 7) << 1,
+         pred_ptr, pitch);
+  } else {
+    ptr_base += d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
+                (mv.as_mv.col >> 3);
+    ptr = ptr_base;
+
+    for (r = 0; r < 4; r++) {
+      pred_ptr[0]  = (pred_ptr[0] + ptr[0] + 1) >> 1;
+      pred_ptr[1]  = (pred_ptr[1] + ptr[1] + 1) >> 1;
+      pred_ptr[2]  = (pred_ptr[2] + ptr[2] + 1) >> 1;
+      pred_ptr[3]  = (pred_ptr[3] + ptr[3] + 1) >> 1;
+      pred_ptr    += pitch;
+      ptr         += d->pre_stride;
+    }
+  }
+}
+
+void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {
+  unsigned char *ptr_base;
+  unsigned char *ptr;
+  unsigned char *pred_ptr = d->predictor;
+  int_mv mv;
+
+  ptr_base = *(d->base_pre);
+  mv.as_int = d->bmi.as_mv.first.as_int;
+  ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
+        (mv.as_mv.col >> 3);
+
+  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
+    xd->subpixel_predict8x8(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1,
+                            (mv.as_mv.row & 7) << 1, pred_ptr, pitch);
+  } else {
+    vp9_copy_mem8x8(ptr, d->pre_stride, pred_ptr, pitch);
+  }
+}
+
+/*
+ * Similar to vp9_build_inter_predictors4b(), but instead of storing the
+ * results in d->predictor, we average the contents of d->predictor (which
+ * come from an earlier call to vp9_build_inter_predictors4b()) with the
+ * predictor of the second reference frame / motion vector.
+ */
+void vp9_build_2nd_inter_predictors4b(MACROBLOCKD *xd,
+                                      BLOCKD *d, int pitch) {
+  unsigned char *ptr_base;
+  unsigned char *ptr;
+  unsigned char *pred_ptr = d->predictor;
+  int_mv mv;
+
+  ptr_base = *(d->base_second_pre);
+  mv.as_int = d->bmi.as_mv.second.as_int;
+  ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
+        (mv.as_mv.col >> 3);
+
+  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
+    xd->subpixel_predict_avg8x8(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1,
+                               (mv.as_mv.row & 7) << 1, pred_ptr, pitch);
+  } else {
+    vp9_avg_mem8x8(ptr, d->pre_stride, pred_ptr, pitch);
+  }
+}
+
+static void build_inter_predictors2b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {
+  unsigned char *ptr_base;
+  unsigned char *ptr;
+  unsigned char *pred_ptr = d->predictor;
+  int_mv mv;
+
+  ptr_base = *(d->base_pre);
+  mv.as_int = d->bmi.as_mv.first.as_int;
+  ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
+        (mv.as_mv.col >> 3);
+
+  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
+    xd->subpixel_predict8x4(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1,
+                           (mv.as_mv.row & 7) << 1, pred_ptr, pitch);
+  } else {
+    vp9_copy_mem8x4(ptr, d->pre_stride, pred_ptr, pitch);
+  }
+}
+
+
+/*encoder only*/
+#if CONFIG_PRED_FILTER
+
+// Select the thresholded or non-thresholded filter
+#define USE_THRESH_FILTER 0
+
+#define PRED_FILT_LEN 5
+
+static const int filt_shift = 4;
+static const int pred_filter[PRED_FILT_LEN] = {1, 2, 10, 2, 1};
+// Alternative filter {1, 1, 4, 1, 1}
+
+#if !USE_THRESH_FILTER
+void filter_mb(unsigned char *src, int src_stride,
+               unsigned char *dst, int dst_stride,
+               int width, int height) {
+  int i, j, k;
+  unsigned int Temp[32 * 32];
+  unsigned int  *pTmp = Temp;
+  unsigned char *pSrc = src - (1 + src_stride) * (PRED_FILT_LEN / 2);
+
+  // Horizontal
+  for (i = 0; i < height + PRED_FILT_LEN - 1; i++) {
+    for (j = 0; j < width; j++) {
+      int sum = 0;
+      for (k = 0; k < PRED_FILT_LEN; k++)
+        sum += pSrc[j + k] * pred_filter[k];
+      pTmp[j] = sum;
+    }
+
+    pSrc += src_stride;
+    pTmp += width;
+  }
+
+  // Vertical
+  pTmp = Temp;
+  for (i = 0; i < width; i++) {
+    unsigned char *pDst = dst + i;
+    for (j = 0; j < height; j++) {
+      int sum = 0;
+      for (k = 0; k < PRED_FILT_LEN; k++)
+        sum += pTmp[(j + k) * width] * pred_filter[k];
+      // Round
+      sum = (sum + ((1 << (filt_shift << 1)) >> 1)) >> (filt_shift << 1);
+      pDst[j * dst_stride] = (sum < 0 ? 0 : sum > 255 ? 255 : sum);
+    }
+    ++pTmp;
+  }
+}
+#else
+// Based on vp9_post_proc_down_and_across_c (postproc.c)
+void filter_mb(unsigned char *src, int src_stride,
+               unsigned char *dst, int dst_stride,
+               int width, int height) {
+  unsigned char *pSrc, *pDst;
+  int row;
+  int col;
+  int i;
+  int v;
+  unsigned char d[8];
+
+  /* TODO flimit should be linked to the quantizer value */
+  int flimit = 7;
+
+  for (row = 0; row < height; row++) {
+    /* post_proc_down for one row */
+    pSrc = src;
+    pDst = dst;
+
+    for (col = 0; col < width; col++) {
+      int kernel = (1 << (filt_shift - 1));
+      int v = pSrc[col];
+
+      for (i = -2; i <= 2; i++) {
+        if (abs(v - pSrc[col + i * src_stride]) > flimit)
+          goto down_skip_convolve;
+
+        kernel += pred_filter[2 + i] * pSrc[col + i * src_stride];
+      }
+
+      v = (kernel >> filt_shift);
+    down_skip_convolve:
+      pDst[col] = v;
+    }
+
+    /* now post_proc_across */
+    pSrc = dst;
+    pDst = dst;
+
+    for (i = 0; i < 8; i++)
+      d[i] = pSrc[i];
+
+    for (col = 0; col < width; col++) {
+      int kernel = (1 << (filt_shift - 1));
+      v = pSrc[col];
+
+      d[col & 7] = v;
+
+      for (i = -2; i <= 2; i++) {
+        if (abs(v - pSrc[col + i]) > flimit)
+          goto across_skip_convolve;
+
+        kernel += pred_filter[2 + i] * pSrc[col + i];
+      }
+
+      d[col & 7] = (kernel >> filt_shift);
+    across_skip_convolve:
+
+      if (col >= 2)
+        pDst[col - 2] = d[(col - 2) & 7];
+    }
+
+    /* handle the last two pixels */
+    pDst[col - 2] = d[(col - 2) & 7];
+    pDst[col - 1] = d[(col - 1) & 7];
+
+    /* next row */
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+#endif  // !USE_THRESH_FILTER
+
+#endif  // CONFIG_PRED_FILTER
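
The prediction filter is separable with taps {1, 2, 10, 2, 1} summing to 16 (1 << filt_shift), so after the horizontal and vertical passes the accumulated product is renormalised by >> (2 * filt_shift) with a rounding term. A one-dimensional illustration of a single pass:

    #include <stdio.h>

    int main(void) {
      static const int taps[5] = {1, 2, 10, 2, 1};
      static const int shift = 4;           /* taps sum to 1 << 4 */
      unsigned char src[9] = {100, 100, 100, 200, 100, 100, 100, 100, 100};
      int i, k;

      for (i = 2; i < 7; i++) {
        int sum = (1 << shift) >> 1;        /* rounding term */
        for (k = 0; k < 5; k++)
          sum += src[i + k - 2] * taps[k];
        printf("%d ", sum >> shift);        /* smoothed sample */
      }
      printf("\n");                         /* 113 163 113 106 100 */
      return 0;
    }
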
+
+/*encoder only*/
+void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd) {
+  int i, j;
+  BLOCKD *blockd = xd->block;
+
+  /* build uv mvs */
+  for (i = 0; i < 2; i++) {
+    for (j = 0; j < 2; j++) {
+      int yoffset = i * 8 + j * 2;
+      int uoffset = 16 + i * 2 + j;
+      int voffset = 20 + i * 2 + j;
+      int temp;
+
+      temp = blockd[yoffset  ].bmi.as_mv.first.as_mv.row
+             + blockd[yoffset + 1].bmi.as_mv.first.as_mv.row
+             + blockd[yoffset + 4].bmi.as_mv.first.as_mv.row
+             + blockd[yoffset + 5].bmi.as_mv.first.as_mv.row;
+
+      if (temp < 0) temp -= 4;
+      else temp += 4;
+
+      xd->block[uoffset].bmi.as_mv.first.as_mv.row = (temp / 8) &
+        xd->fullpixel_mask;
+
+      temp = blockd[yoffset  ].bmi.as_mv.first.as_mv.col
+             + blockd[yoffset + 1].bmi.as_mv.first.as_mv.col
+             + blockd[yoffset + 4].bmi.as_mv.first.as_mv.col
+             + blockd[yoffset + 5].bmi.as_mv.first.as_mv.col;
+
+      if (temp < 0) temp -= 4;
+      else temp += 4;
+
+      blockd[uoffset].bmi.as_mv.first.as_mv.col = (temp / 8) &
+        xd->fullpixel_mask;
+
+      blockd[voffset].bmi.as_mv.first.as_mv.row =
+        blockd[uoffset].bmi.as_mv.first.as_mv.row;
+      blockd[voffset].bmi.as_mv.first.as_mv.col =
+        blockd[uoffset].bmi.as_mv.first.as_mv.col;
+
+      if (xd->mode_info_context->mbmi.second_ref_frame) {
+        temp = blockd[yoffset  ].bmi.as_mv.second.as_mv.row
+               + blockd[yoffset + 1].bmi.as_mv.second.as_mv.row
+               + blockd[yoffset + 4].bmi.as_mv.second.as_mv.row
+               + blockd[yoffset + 5].bmi.as_mv.second.as_mv.row;
+
+        if (temp < 0) {
+          temp -= 4;
+        } else {
+          temp += 4;
+        }
+
+        blockd[uoffset].bmi.as_mv.second.as_mv.row = (temp / 8) &
+          xd->fullpixel_mask;
+
+        temp = blockd[yoffset  ].bmi.as_mv.second.as_mv.col
+               + blockd[yoffset + 1].bmi.as_mv.second.as_mv.col
+               + blockd[yoffset + 4].bmi.as_mv.second.as_mv.col
+               + blockd[yoffset + 5].bmi.as_mv.second.as_mv.col;
+
+        if (temp < 0) {
+          temp -= 4;
+        } else {
+          temp += 4;
+        }
+
+        blockd[uoffset].bmi.as_mv.second.as_mv.col = (temp / 8) &
+          xd->fullpixel_mask;
+
+        blockd[voffset].bmi.as_mv.second.as_mv.row =
+          blockd[uoffset].bmi.as_mv.second.as_mv.row;
+        blockd[voffset].bmi.as_mv.second.as_mv.col =
+          blockd[uoffset].bmi.as_mv.second.as_mv.col;
+      }
+    }
+  }
+
+  for (i = 16; i < 24; i += 2) {
+    BLOCKD *d0 = &blockd[i];
+    BLOCKD *d1 = &blockd[i + 1];
+
+    if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
+      build_inter_predictors2b(xd, d0, 8);
+    else {
+      vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict);
+      vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict);
+    }
+
+    if (xd->mode_info_context->mbmi.second_ref_frame) {
+      vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg);
+      vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg);
+    }
+  }
+}
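
Each chroma MV above is the rounded average of its four luma sub-block MVs: sum the four, bias by +/-4 so the truncating division rounds away from zero, then divide by 8 (/4 for the average, /2 for 4:2:0 subsampling); the fullpixel mask applied afterwards is omitted in this isolated check:

    #include <stdio.h>

    static int uv_mv(int a, int b, int c, int d) {
      int t = a + b + c + d;
      t += (t < 0) ? -4 : 4;   /* round away from zero */
      return t / 8;            /* /4 to average, /2 for chroma */
    }

    int main(void) {
      printf("%d %d\n", uv_mv(5, 6, 7, 9), uv_mv(-5, -6, -7, -9));  /* 3 -3 */
      return 0;
    }
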
+
+static void clamp_mv_to_umv_border(MV *mv, const MACROBLOCKD *xd) {
+  /* If the MV points so far into the UMV border that no visible pixels
+   * are used for reconstruction, the subpel part of the MV can be
+   * discarded and the MV limited to 16 pixels with equivalent results.
+   *
+   * This limit kicks in at 19 pixels for the top and left edges, for
+   * the 16 pixels plus 3 taps right of the central pixel when subpel
+   * filtering. The bottom and right edges use 16 pixels plus 2 pixels
+   * left of the central pixel when filtering.
+   */
+  if (mv->col < (xd->mb_to_left_edge - ((16 + INTERP_EXTEND) << 3)))
+    mv->col = xd->mb_to_left_edge - (16 << 3);
+  else if (mv->col > xd->mb_to_right_edge + ((15 + INTERP_EXTEND) << 3))
+    mv->col = xd->mb_to_right_edge + (16 << 3);
+
+  if (mv->row < (xd->mb_to_top_edge - ((16 + INTERP_EXTEND) << 3)))
+    mv->row = xd->mb_to_top_edge - (16 << 3);
+  else if (mv->row > xd->mb_to_bottom_edge + ((15 + INTERP_EXTEND) << 3))
+    mv->row = xd->mb_to_bottom_edge + (16 << 3);
+}
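
A numeric check of the clamp rule described in the comment above, with INTERP_EXTEND assumed to be 4 purely for illustration; edge distances are in the same 1/8-pel units as the MVs:

    #include <stdio.h>

    #define EXTEND 4   /* stand-in for INTERP_EXTEND */

    static int clamp_col(int col, int to_left_edge, int to_right_edge) {
      if (col < to_left_edge - ((16 + EXTEND) << 3))
        return to_left_edge - (16 << 3);
      if (col > to_right_edge + ((15 + EXTEND) << 3))
        return to_right_edge + (16 << 3);
      return col;
    }

    int main(void) {
      int to_left = -64 << 3;  /* MB sits 64 px right of the left edge */

      /* far into the border: snapped to exactly 16 px beyond the edge */
      printf("%d\n", clamp_col(-800 << 3, to_left, 0));  /* -640 */
      /* mild MV: untouched, subpel part preserved */
      printf("%d\n", clamp_col(-83, to_left, 0));        /* -83 */
      return 0;
    }
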
+
+/* A version of the above function for chroma block MVs.*/
+static void clamp_uvmv_to_umv_border(MV *mv, const MACROBLOCKD *xd) {
+  mv->col = (2 * mv->col < (xd->mb_to_left_edge - ((16 + INTERP_EXTEND) << 3))) ?
+            (xd->mb_to_left_edge - (16 << 3)) >> 1 : mv->col;
+  mv->col = (2 * mv->col > xd->mb_to_right_edge + ((15 + INTERP_EXTEND) << 3)) ?
+            (xd->mb_to_right_edge + (16 << 3)) >> 1 : mv->col;
+
+  mv->row = (2 * mv->row < (xd->mb_to_top_edge - ((16 + INTERP_EXTEND) << 3))) ?
+            (xd->mb_to_top_edge - (16 << 3)) >> 1 : mv->row;
+  mv->row = (2 * mv->row > xd->mb_to_bottom_edge + ((15 + INTERP_EXTEND) << 3)) ?
+            (xd->mb_to_bottom_edge + (16 << 3)) >> 1 : mv->row;
+}
+
+/*encoder only*/
+void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd,
+                                             unsigned char *dst_y,
+                                             int dst_ystride,
+                                             int clamp_mvs) {
+  unsigned char *ptr_base = xd->pre.y_buffer;
+  unsigned char *ptr;
+  int pre_stride = xd->block[0].pre_stride;
+  int_mv ymv;
+
+  ymv.as_int = xd->mode_info_context->mbmi.mv[0].as_int;
+
+  if (clamp_mvs)
+    clamp_mv_to_umv_border(&ymv.as_mv, xd);
+
+  ptr = ptr_base + (ymv.as_mv.row >> 3) * pre_stride + (ymv.as_mv.col >> 3);
+
+#if CONFIG_PRED_FILTER
+  if (xd->mode_info_context->mbmi.pred_filter_enabled) {
+    if ((ymv.as_mv.row | ymv.as_mv.col) & 7) {
+      // Sub-pel filter needs extended input
+      int len = 15 + (INTERP_EXTEND << 1);
+      unsigned char Temp[32 * 32]; // Data required by sub-pel filter
+      unsigned char *pTemp = Temp + (INTERP_EXTEND - 1) * (len + 1);
+
+      // Copy extended MB into Temp array, applying the spatial filter
+      filter_mb(ptr - (INTERP_EXTEND - 1) * (pre_stride + 1), pre_stride,
+                Temp, len, len, len);
+
+      // Sub-pel interpolation
+      xd->subpixel_predict16x16(pTemp, len,
+                                (ymv.as_mv.col & 7) << 1,
+                                (ymv.as_mv.row & 7) << 1,
+                                dst_y, dst_ystride);
+    } else {
+      // Apply spatial filter to create the prediction directly
+      filter_mb(ptr, pre_stride, dst_y, dst_ystride, 16, 16);
+    }
+  } else
+#endif
+    if ((ymv.as_mv.row | ymv.as_mv.col) & 7) {
+      xd->subpixel_predict16x16(ptr, pre_stride,
+                                (ymv.as_mv.col & 7) << 1,
+                                (ymv.as_mv.row & 7) << 1,
+                                dst_y, dst_ystride);
+    } else {
+      vp9_copy_mem16x16(ptr, pre_stride, dst_y, dst_ystride);
+    }
+}
+
+void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
+                                              unsigned char *dst_u,
+                                              unsigned char *dst_v,
+                                              int dst_uvstride) {
+  int offset;
+  unsigned char *uptr, *vptr;
+  int pre_stride = xd->block[0].pre_stride;
+  int_mv _o16x16mv;
+  int_mv _16x16mv;
+
+  _16x16mv.as_int = xd->mode_info_context->mbmi.mv[0].as_int;
+
+  if (xd->mode_info_context->mbmi.need_to_clamp_mvs)
+    clamp_mv_to_umv_border(&_16x16mv.as_mv, xd);
+
+  _o16x16mv = _16x16mv;
+  /* calc uv motion vectors */
+  if (_16x16mv.as_mv.row < 0)
+    _16x16mv.as_mv.row -= 1;
+  else
+    _16x16mv.as_mv.row += 1;
+
+  if (_16x16mv.as_mv.col < 0)
+    _16x16mv.as_mv.col -= 1;
+  else
+    _16x16mv.as_mv.col += 1;
+
+  _16x16mv.as_mv.row /= 2;
+  _16x16mv.as_mv.col /= 2;
+
+  _16x16mv.as_mv.row &= xd->fullpixel_mask;
+  _16x16mv.as_mv.col &= xd->fullpixel_mask;
+
+  pre_stride >>= 1;
+  offset = (_16x16mv.as_mv.row >> 3) * pre_stride + (_16x16mv.as_mv.col >> 3);
+  uptr = xd->pre.u_buffer + offset;
+  vptr = xd->pre.v_buffer + offset;
+
+#if CONFIG_PRED_FILTER
+  if (xd->mode_info_context->mbmi.pred_filter_enabled) {
+    int i;
+    unsigned char *pSrc = uptr;
+    unsigned char *pDst = dst_u;
+    int len = 7 + (INTERP_EXTEND << 1);
+    unsigned char Temp[32 * 32]; // Data required by the sub-pel filter
+    unsigned char *pTemp = Temp + (INTERP_EXTEND - 1) * (len + 1);
+
+    // U & V
+    for (i = 0; i < 2; i++) {
+      if (_o16x16mv.as_int & 0x000f000f) {
+        // Copy extended MB into Temp array, applying the spatial filter
+        filter_mb(pSrc - (INTERP_EXTEND - 1) * (pre_stride + 1), pre_stride,
+                  Temp, len, len, len);
+
+        // Sub-pel filter
+        xd->subpixel_predict8x8(pTemp, len,
+                                _o16x16mv.as_mv.col & 15,
+                                _o16x16mv.as_mv.row & 15,
+                                pDst, dst_uvstride);
+      } else {
+        filter_mb(pSrc, pre_stride, pDst, dst_uvstride, 8, 8);
+      }
+
+      // V
+      pSrc = vptr;
+      pDst = dst_v;
+    }
+  } else
+#endif
+    if (_o16x16mv.as_int & 0x000f000f) {
+      xd->subpixel_predict8x8(uptr, pre_stride, _o16x16mv.as_mv.col & 15,
+                              _o16x16mv.as_mv.row & 15, dst_u, dst_uvstride);
+      xd->subpixel_predict8x8(vptr, pre_stride, _o16x16mv.as_mv.col & 15,
+                              _o16x16mv.as_mv.row & 15, dst_v, dst_uvstride);
+    } else {
+      vp9_copy_mem8x8(uptr, pre_stride, dst_u, dst_uvstride);
+      vp9_copy_mem8x8(vptr, pre_stride, dst_v, dst_uvstride);
+    }
+}
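
The chroma MV derivation above halves the luma MV with rounding away from zero: add the MV's sign before dividing by two (the fullpixel mask is then applied for full-pel-only coding). Checked in isolation:

    #include <stdio.h>

    static int halve_mv(int v) {
      return (v < 0 ? v - 1 : v + 1) / 2;  /* C division truncates toward 0 */
    }

    int main(void) {
      printf("%d %d %d %d\n",
             halve_mv(7), halve_mv(-7), halve_mv(8), halve_mv(-8));
      /* 4 -4 4 -4: magnitude rounds up, sign preserved */
      return 0;
    }
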
+
+
+void vp9_build_1st_inter16x16_predictors_mb(MACROBLOCKD *xd,
+                                            unsigned char *dst_y,
+                                            unsigned char *dst_u,
+                                            unsigned char *dst_v,
+                                            int dst_ystride, int dst_uvstride) {
+  vp9_build_1st_inter16x16_predictors_mby(xd, dst_y, dst_ystride,
+      xd->mode_info_context->mbmi.need_to_clamp_mvs);
+  vp9_build_1st_inter16x16_predictors_mbuv(xd, dst_u, dst_v, dst_uvstride);
+}
+
+#if CONFIG_SUPERBLOCKS
+void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x,
+                                        unsigned char *dst_y,
+                                        unsigned char *dst_u,
+                                        unsigned char *dst_v,
+                                        int dst_ystride,
+                                        int dst_uvstride) {
+  uint8_t *y1 = x->pre.y_buffer, *u1 = x->pre.u_buffer, *v1 = x->pre.v_buffer;
+  uint8_t *y2 = x->second_pre.y_buffer, *u2 = x->second_pre.u_buffer,
+          *v2 = x->second_pre.v_buffer;
+  int n;
+
+  for (n = 0; n < 4; n++)
+  {
+    const int x_idx = n & 1, y_idx = n >> 1;
+
+    x->pre.y_buffer = y1 + y_idx * 16 * x->pre.y_stride  + x_idx * 16;
+    x->pre.u_buffer = u1 + y_idx *  8 * x->pre.uv_stride + x_idx *  8;
+    x->pre.v_buffer = v1 + y_idx *  8 * x->pre.uv_stride + x_idx *  8;
+
+    vp9_build_1st_inter16x16_predictors_mb(x,
+      dst_y + y_idx * 16 * dst_ystride  + x_idx * 16,
+      dst_u + y_idx *  8 * dst_uvstride + x_idx *  8,
+      dst_v + y_idx *  8 * dst_uvstride + x_idx *  8,
+      dst_ystride, dst_uvstride);
+    if (x->mode_info_context->mbmi.second_ref_frame) {
+      x->second_pre.y_buffer = y2 + y_idx * 16 * x->pre.y_stride  + x_idx * 16;
+      x->second_pre.u_buffer = u2 + y_idx *  8 * x->pre.uv_stride + x_idx *  8;
+      x->second_pre.v_buffer = v2 + y_idx *  8 * x->pre.uv_stride + x_idx *  8;
+
+      vp9_build_2nd_inter16x16_predictors_mb(x,
+        dst_y + y_idx * 16 * dst_ystride  + x_idx * 16,
+        dst_u + y_idx *  8 * dst_uvstride + x_idx *  8,
+        dst_v + y_idx *  8 * dst_uvstride + x_idx *  8,
+        dst_ystride, dst_uvstride);
+    }
+  }
+
+  x->pre.y_buffer = y1;
+  x->pre.u_buffer = u1;
+  x->pre.v_buffer = v1;
+
+  if (x->mode_info_context->mbmi.second_ref_frame) {
+    x->second_pre.y_buffer = y2;
+    x->second_pre.u_buffer = u2;
+    x->second_pre.v_buffer = v2;
+  }
+}
+#endif
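
The superblock path visits its four 16x16 macroblocks in raster order by unpacking the loop counter: x_idx = n & 1, y_idx = n >> 1, stepping 16 luma and 8 chroma pixels per macroblock (4:2:0). In isolation:

    #include <stdio.h>

    int main(void) {
      int n;
      for (n = 0; n < 4; n++) {
        int x_idx = n & 1, y_idx = n >> 1;
        printf("mb %d -> luma (%2d,%2d), chroma (%d,%d)\n",
               n, x_idx * 16, y_idx * 16, x_idx * 8, y_idx * 8);
      }
      return 0;
    }
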
+
+/*
+ * The following functions should be called after an initial
+ * call to vp9_build_1st_inter16x16_predictors_mb() or _mby()/_mbuv().
+ * They run a second sixtap filter on a (different) ref
+ * frame and average the result with the output of the
+ * first sixtap filter. The second reference frame is stored
+ * in x->second_pre (the reference frame index is in
+ * x->mode_info_context->mbmi.second_ref_frame). The second
+ * motion vector is x->mode_info_context->mbmi.second_mv.
+ *
+ * This allows blending prediction from two reference frames,
+ * which sometimes leads to better prediction than from a
+ * single reference frame.
+ */
+void vp9_build_2nd_inter16x16_predictors_mby(MACROBLOCKD *xd,
+                                             unsigned char *dst_y,
+                                             int dst_ystride) {
+  unsigned char *ptr;
+
+  int_mv _16x16mv;
+  int mv_row;
+  int mv_col;
+
+  unsigned char *ptr_base = xd->second_pre.y_buffer;
+  int pre_stride = xd->block[0].pre_stride;
+
+  _16x16mv.as_int = xd->mode_info_context->mbmi.mv[1].as_int;
+
+  if (xd->mode_info_context->mbmi.need_to_clamp_secondmv)
+    clamp_mv_to_umv_border(&_16x16mv.as_mv, xd);
+
+  mv_row = _16x16mv.as_mv.row;
+  mv_col = _16x16mv.as_mv.col;
+
+  ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
+
+#if CONFIG_PRED_FILTER
+  if (xd->mode_info_context->mbmi.pred_filter_enabled) {
+    if ((mv_row | mv_col) & 7) {
+      // Sub-pel filter needs extended input
+      int len = 15 + (INTERP_EXTEND << 1);
+      unsigned char Temp[32 * 32]; // Data required by sub-pel filter
+      unsigned char *pTemp = Temp + (INTERP_EXTEND - 1) * (len + 1);
+
+      // Copy extended MB into Temp array, applying the spatial filter
+      filter_mb(ptr - (INTERP_EXTEND - 1) * (pre_stride + 1), pre_stride,
+                Temp, len, len, len);
+
+      // Sub-pel filter
+      xd->subpixel_predict_avg16x16(pTemp, len, (mv_col & 7) << 1,
+                                    (mv_row & 7) << 1, dst_y, dst_ystride);
+    } else {
+      // TODO Needs to AVERAGE with the dst_y
+      // For now, do not apply the prediction filter in these cases!
+      vp9_avg_mem16x16(ptr, pre_stride, dst_y, dst_ystride);
+    }
+  } else
+#endif  // CONFIG_PRED_FILTER
+  {
+    if ((mv_row | mv_col) & 7) {
+      xd->subpixel_predict_avg16x16(ptr, pre_stride, (mv_col & 7) << 1,
+                                    (mv_row & 7) << 1, dst_y, dst_ystride);
+    } else {
+      vp9_avg_mem16x16(ptr, pre_stride, dst_y, dst_ystride);
+    }
+  }
+}
+
+void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
+                                              unsigned char *dst_u,
+                                              unsigned char *dst_v,
+                                              int dst_uvstride) {
+  int offset;
+  unsigned char *uptr, *vptr;
+
+  int_mv _16x16mv;
+  int mv_row;
+  int mv_col;
+  int omv_row, omv_col;
+
+  int pre_stride = xd->block[0].pre_stride;
+
+  _16x16mv.as_int = xd->mode_info_context->mbmi.mv[1].as_int;
+
+  if (xd->mode_info_context->mbmi.need_to_clamp_secondmv)
+    clamp_mv_to_umv_border(&_16x16mv.as_mv, xd);
+
+  mv_row = _16x16mv.as_mv.row;
+  mv_col = _16x16mv.as_mv.col;
+
+  /* calc uv motion vectors */
+  omv_row = mv_row;
+  omv_col = mv_col;
+  mv_row = (mv_row + (mv_row > 0)) >> 1;
+  mv_col = (mv_col + (mv_col > 0)) >> 1;
+
+  mv_row &= xd->fullpixel_mask;
+  mv_col &= xd->fullpixel_mask;
+
+  pre_stride >>= 1;
+  offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
+  uptr = xd->second_pre.u_buffer + offset;
+  vptr = xd->second_pre.v_buffer + offset;
+
+#if CONFIG_PRED_FILTER
+  if (xd->mode_info_context->mbmi.pred_filter_enabled) {
+    int i;
+    int len = 7 + (INTERP_EXTEND << 1);
+    unsigned char Temp[32 * 32]; // Data required by sub-pel filter
+    unsigned char *pTemp = Temp + (INTERP_EXTEND - 1) * (len + 1);
+    unsigned char *pSrc = uptr;
+    unsigned char *pDst = dst_u;
+
+    // U & V
+    for (i = 0; i < 2; i++) {
+      if ((omv_row | omv_col) & 15) {
+        // Copy extended MB into Temp array, applying the spatial filter
+        filter_mb(pSrc - (INTERP_EXTEND - 1) * (pre_stride + 1), pre_stride,
+                  Temp, len, len, len);
+
+        // Sub-pel filter
+        xd->subpixel_predict_avg8x8(pTemp, len, omv_col & 15,
+                                    omv_row & 15, pDst, dst_uvstride);
+      } else {
+        // TODO: needs to average with dst_[u|v].
+        // For now, do not apply the prediction filter here!
+        vp9_avg_mem8x8(pSrc, pre_stride, pDst, dst_uvstride);
+      }
+
+      // V
+      pSrc = vptr;
+      pDst = dst_v;
+    }
+  } else
+#endif  // CONFIG_PRED_FILTER
+    if ((omv_row | omv_col) & 15) {
+      xd->subpixel_predict_avg8x8(uptr, pre_stride, omv_col & 15,
+                                  omv_row & 15, dst_u, dst_uvstride);
+      xd->subpixel_predict_avg8x8(vptr, pre_stride, omv_col & 15,
+                                  omv_row & 15, dst_v, dst_uvstride);
+    } else {
+      vp9_avg_mem8x8(uptr, pre_stride, dst_u, dst_uvstride);
+      vp9_avg_mem8x8(vptr, pre_stride, dst_v, dst_uvstride);
+    }
+}
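+
+/* Chroma MV derivation, worked through: the luma MV is halved with rounding
+ * away from zero, e.g. mv_row = 5 -> (5 + 1) >> 1 = 3 and
+ * mv_row = -5 -> (-5 + 0) >> 1 = -3 (arithmetic shift rounds toward
+ * -infinity, so the bias is only added on the positive side).  The
+ * fullpixel_mask then clears the fractional bits for full-pel-only frames.
+ */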
+
+void vp9_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *xd,
+                                            unsigned char *dst_y,
+                                            unsigned char *dst_u,
+                                            unsigned char *dst_v,
+                                            int dst_ystride,
+                                            int dst_uvstride) {
+  vp9_build_2nd_inter16x16_predictors_mby(xd, dst_y, dst_ystride);
+  vp9_build_2nd_inter16x16_predictors_mbuv(xd, dst_u, dst_v, dst_uvstride);
+}
+
+static void build_inter4x4_predictors_mb(MACROBLOCKD *xd) {
+  int i;
+  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+  BLOCKD *blockd = xd->block;
+
+  if (xd->mode_info_context->mbmi.partitioning != PARTITIONING_4X4) {
+    blockd[ 0].bmi = xd->mode_info_context->bmi[ 0];
+    blockd[ 2].bmi = xd->mode_info_context->bmi[ 2];
+    blockd[ 8].bmi = xd->mode_info_context->bmi[ 8];
+    blockd[10].bmi = xd->mode_info_context->bmi[10];
+
+    if (mbmi->need_to_clamp_mvs) {
+      clamp_mv_to_umv_border(&blockd[ 0].bmi.as_mv.first.as_mv, xd);
+      clamp_mv_to_umv_border(&blockd[ 2].bmi.as_mv.first.as_mv, xd);
+      clamp_mv_to_umv_border(&blockd[ 8].bmi.as_mv.first.as_mv, xd);
+      clamp_mv_to_umv_border(&blockd[10].bmi.as_mv.first.as_mv, xd);
+      if (mbmi->second_ref_frame) {
+        clamp_mv_to_umv_border(&blockd[ 0].bmi.as_mv.second.as_mv, xd);
+        clamp_mv_to_umv_border(&blockd[ 2].bmi.as_mv.second.as_mv, xd);
+        clamp_mv_to_umv_border(&blockd[ 8].bmi.as_mv.second.as_mv, xd);
+        clamp_mv_to_umv_border(&blockd[10].bmi.as_mv.second.as_mv, xd);
+      }
+    }
+
+    vp9_build_inter_predictors4b(xd, &blockd[ 0], 16);
+    vp9_build_inter_predictors4b(xd, &blockd[ 2], 16);
+    vp9_build_inter_predictors4b(xd, &blockd[ 8], 16);
+    vp9_build_inter_predictors4b(xd, &blockd[10], 16);
+
+    if (mbmi->second_ref_frame) {
+      vp9_build_2nd_inter_predictors4b(xd, &blockd[ 0], 16);
+      vp9_build_2nd_inter_predictors4b(xd, &blockd[ 2], 16);
+      vp9_build_2nd_inter_predictors4b(xd, &blockd[ 8], 16);
+      vp9_build_2nd_inter_predictors4b(xd, &blockd[10], 16);
+    }
+  } else {
+    for (i = 0; i < 16; i += 2) {
+      BLOCKD *d0 = &blockd[i];
+      BLOCKD *d1 = &blockd[i + 1];
+
+      blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0];
+      blockd[i + 1].bmi = xd->mode_info_context->bmi[i + 1];
+
+      if (mbmi->need_to_clamp_mvs) {
+        clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv.first.as_mv, xd);
+        clamp_mv_to_umv_border(&blockd[i + 1].bmi.as_mv.first.as_mv, xd);
+        if (mbmi->second_ref_frame) {
+          clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv.second.as_mv, xd);
+          clamp_mv_to_umv_border(&blockd[i + 1].bmi.as_mv.second.as_mv, xd);
+        }
+      }
+
+      if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
+        build_inter_predictors2b(xd, d0, 16);
+      else {
+        vp9_build_inter_predictors_b(d0, 16, xd->subpixel_predict);
+        vp9_build_inter_predictors_b(d1, 16, xd->subpixel_predict);
+      }
+
+      if (mbmi->second_ref_frame) {
+        vp9_build_2nd_inter_predictors_b(d0, 16, xd->subpixel_predict_avg);
+        vp9_build_2nd_inter_predictors_b(d1, 16, xd->subpixel_predict_avg);
+      }
+    }
+  }
+
+  for (i = 16; i < 24; i += 2) {
+    BLOCKD *d0 = &blockd[i];
+    BLOCKD *d1 = &blockd[i + 1];
+
+    if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
+      build_inter_predictors2b(xd, d0, 8);
+    else {
+      vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict);
+      vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict);
+    }
+
+    if (mbmi->second_ref_frame) {
+      vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg);
+      vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg);
+    }
+  }
+}
+
+static void build_4x4uvmvs(MACROBLOCKD *xd) {
+  int i, j;
+  BLOCKD *blockd = xd->block;
+
+  for (i = 0; i < 2; i++) {
+    for (j = 0; j < 2; j++) {
+      int yoffset = i * 8 + j * 2;
+      int uoffset = 16 + i * 2 + j;
+      int voffset = 20 + i * 2 + j;
+
+      int temp;
+
+      temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.first.as_mv.row
+             + xd->mode_info_context->bmi[yoffset + 1].as_mv.first.as_mv.row
+             + xd->mode_info_context->bmi[yoffset + 4].as_mv.first.as_mv.row
+             + xd->mode_info_context->bmi[yoffset + 5].as_mv.first.as_mv.row;
+
+      if (temp < 0) temp -= 4;
+      else temp += 4;
+
+      blockd[uoffset].bmi.as_mv.first.as_mv.row = (temp / 8) &
+                                                  xd->fullpixel_mask;
+
+      temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.first.as_mv.col
+             + xd->mode_info_context->bmi[yoffset + 1].as_mv.first.as_mv.col
+             + xd->mode_info_context->bmi[yoffset + 4].as_mv.first.as_mv.col
+             + xd->mode_info_context->bmi[yoffset + 5].as_mv.first.as_mv.col;
+
+      if (temp < 0) temp -= 4;
+      else temp += 4;
+
+      blockd[uoffset].bmi.as_mv.first.as_mv.col = (temp / 8) &
+        xd->fullpixel_mask;
+
+      // if (x->mode_info_context->mbmi.need_to_clamp_mvs)
+      clamp_uvmv_to_umv_border(&blockd[uoffset].bmi.as_mv.first.as_mv, xd);
+
+      blockd[voffset].bmi.as_mv.first.as_mv.row =
+        blockd[uoffset].bmi.as_mv.first.as_mv.row;
+      blockd[voffset].bmi.as_mv.first.as_mv.col =
+        blockd[uoffset].bmi.as_mv.first.as_mv.col;
+
+      if (xd->mode_info_context->mbmi.second_ref_frame) {
+        temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.second.as_mv.row
+               + xd->mode_info_context->bmi[yoffset + 1].as_mv.second.as_mv.row
+               + xd->mode_info_context->bmi[yoffset + 4].as_mv.second.as_mv.row
+               + xd->mode_info_context->bmi[yoffset + 5].as_mv.second.as_mv.row;
+
+        if (temp < 0) temp -= 4;
+        else temp += 4;
+
+        blockd[uoffset].bmi.as_mv.second.as_mv.row = (temp / 8) &
+                                                     xd->fullpixel_mask;
+
+        temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.second.as_mv.col
+               + xd->mode_info_context->bmi[yoffset + 1].as_mv.second.as_mv.col
+               + xd->mode_info_context->bmi[yoffset + 4].as_mv.second.as_mv.col
+               + xd->mode_info_context->bmi[yoffset + 5].as_mv.second.as_mv.col;
+
+        if (temp < 0) temp -= 4;
+        else temp += 4;
+
+        blockd[uoffset].bmi.as_mv.second.as_mv.col = (temp / 8) &
+                                                     xd->fullpixel_mask;
+
+        // if (mbmi->need_to_clamp_mvs)
+        clamp_uvmv_to_umv_border(
+          &blockd[uoffset].bmi.as_mv.second.as_mv, xd);
+
+        blockd[voffset].bmi.as_mv.second.as_mv.row =
+          blockd[uoffset].bmi.as_mv.second.as_mv.row;
+        blockd[voffset].bmi.as_mv.second.as_mv.col =
+          blockd[uoffset].bmi.as_mv.second.as_mv.col;
+      }
+    }
+  }
+}
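+
+/* Averaging sketch: each chroma 4x4 covers four luma 4x4s, so its MV is the
+ * rounded mean of theirs at half resolution (sum / 8).  With rows
+ * {5, 5, 6, 6}: temp = 22, +4 rounding gives 26, and 26 / 8 = 3.  C integer
+ * division truncates toward zero, which is why negative sums subtract the
+ * rounding term instead. */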
+
+void vp9_build_inter_predictors_mb(MACROBLOCKD *xd) {
+  if (xd->mode_info_context->mbmi.mode != SPLITMV) {
+    vp9_build_1st_inter16x16_predictors_mb(xd, xd->predictor,
+                                           &xd->predictor[256],
+                                           &xd->predictor[320], 16, 8);
+
+    if (xd->mode_info_context->mbmi.second_ref_frame) {
+      /* 256 = offset of U plane in Y+U+V buffer;
+       * 320 = offset of V plane in Y+U+V buffer.
+       * (256=16x16, 320=16x16+8x8). */
+      vp9_build_2nd_inter16x16_predictors_mb(xd, xd->predictor,
+                                             &xd->predictor[256],
+                                             &xd->predictor[320], 16, 8);
+    }
+  } else {
+    build_4x4uvmvs(xd);
+    build_inter4x4_predictors_mb(xd);
+  }
+}
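+
+/* Predictor buffer layout implied by the offsets above:
+ *
+ *   xd->predictor[  0..255]  16x16 Y, stride 16
+ *   xd->predictor[256..319]  8x8   U, stride 8
+ *   xd->predictor[320..383]  8x8   V, stride 8
+ */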
--- /dev/null
+++ b/vp9/common/reconinter.h
@@ -1,0 +1,78 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef __INC_RECONINTER_H
+#define __INC_RECONINTER_H
+
+#include "onyxc_int.h"
+
+extern void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd,
+                                                    unsigned char *dst_y,
+                                                    int dst_ystride,
+                                                    int clamp_mvs);
+
+extern void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
+                                                     unsigned char *dst_u,
+                                                     unsigned char *dst_v,
+                                                     int dst_uvstride);
+
+extern void vp9_build_1st_inter16x16_predictors_mb(MACROBLOCKD *xd,
+                                                   unsigned char *dst_y,
+                                                   unsigned char *dst_u,
+                                                   unsigned char *dst_v,
+                                                   int dst_ystride,
+                                                   int dst_uvstride);
+
+extern void vp9_build_2nd_inter16x16_predictors_mby(MACROBLOCKD *xd,
+                                                    unsigned char *dst_y,
+                                                    int dst_ystride);
+
+extern void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
+                                                     unsigned char *dst_u,
+                                                     unsigned char *dst_v,
+                                                     int dst_uvstride);
+
+extern void vp9_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *xd,
+                                                   unsigned char *dst_y,
+                                                   unsigned char *dst_u,
+                                                   unsigned char *dst_v,
+                                                   int dst_ystride,
+                                                   int dst_uvstride);
+
+#if CONFIG_SUPERBLOCKS
+extern void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x,
+                                               unsigned char *dst_y,
+                                               unsigned char *dst_u,
+                                               unsigned char *dst_v,
+                                               int dst_ystride,
+                                               int dst_uvstride);
+#endif
+
+extern void vp9_build_inter_predictors_mb(MACROBLOCKD *xd);
+
+extern void vp9_build_inter_predictors_b(BLOCKD *d, int pitch,
+                                         vp9_subpix_fn_t sppf);
+
+extern void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch,
+                                             vp9_subpix_fn_t sppf);
+
+extern void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d,
+                                         int pitch);
+
+extern void vp9_build_2nd_inter_predictors4b(MACROBLOCKD *xd,
+                                             BLOCKD *d, int pitch);
+
+extern void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd);
+
+extern void vp9_setup_interp_filters(MACROBLOCKD *xd,
+                                     INTERPOLATIONFILTERTYPE filter,
+                                     VP9_COMMON *cm);
+
+#endif  // __INC_RECONINTER_H
--- /dev/null
+++ b/vp9/common/reconintra.c
@@ -1,0 +1,490 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+#include "vpx_ports/config.h"
+#include "vpx_rtcd.h"
+#include "reconintra.h"
+#include "vpx_mem/vpx_mem.h"
+
+/* For skip_recon_mb(), add vp9_build_intra_predictors_mby_s(MACROBLOCKD *xd)
+ * and vp9_build_intra_predictors_mbuv_s(MACROBLOCKD *xd).
+ */
+
+static void d27_predictor(uint8_t *ypred_ptr, int y_stride, int n,
+                          uint8_t *yabove_row, uint8_t *yleft_col) {
+  int r, c, h, w, v;
+  int a, b;
+  r = 0;
+  for (c = 0; c < n - 2; c++) {
+    if (c & 1)
+      a = yleft_col[r + 1];
+    else
+      a = (yleft_col[r] + yleft_col[r + 1] + 1) >> 1;
+    b = yabove_row[c + 2];
+    ypred_ptr[c] = (2 * a + (c + 1) * b + (c + 3) / 2) / (c + 3);
+  }
+  for (r = 1; r < n / 2 - 1; r++) {
+    for (c = 0; c < n - 2 - 2 * r; c++) {
+      if (c & 1)
+        a = yleft_col[r + 1];
+      else
+        a = (yleft_col[r] + yleft_col[r + 1] + 1) >> 1;
+      b = ypred_ptr[(r - 1) * y_stride + c + 2];
+      ypred_ptr[r * y_stride + c] =
+          (2 * a + (c + 1) * b + (c + 3) / 2) / (c + 3);
+    }
+  }
+  for (; r < n - 1; ++r) {
+    for (c = 0; c < n; c++) {
+      v = (c & 1 ? yleft_col[r + 1]
+                 : (yleft_col[r] + yleft_col[r + 1] + 1) >> 1);
+      h = r - c / 2;
+      ypred_ptr[h * y_stride + c] = v;
+    }
+  }
+  c = 0;
+  r = n - 1;
+  ypred_ptr[r * y_stride] = (ypred_ptr[(r - 1) * y_stride] +
+                             yleft_col[r] + 1) >> 1;
+  for (r = n - 2; r >= n / 2; --r) {
+    w = c + (n - 1 - r) * 2;
+    ypred_ptr[r * y_stride + w] = (ypred_ptr[(r - 1) * y_stride + w] +
+                                   ypred_ptr[r * y_stride + w - 1] + 1) >> 1;
+  }
+  for (c = 1; c < n; c++) {
+    for (r = n - 1; r >= n / 2 + c / 2; --r) {
+      w = c + (n - 1 - r) * 2;
+      ypred_ptr[r * y_stride + w] = (ypred_ptr[(r - 1) * y_stride + w] +
+                                     ypred_ptr[r * y_stride + w - 1] + 1) >> 1;
+    }
+  }
+}
+
+static void d63_predictor(uint8_t *ypred_ptr, int y_stride, int n,
+                          uint8_t *yabove_row, uint8_t *yleft_col) {
+  int r, c, h, w, v;
+  int a, b;
+  c = 0;
+  for (r = 0; r < n - 2; r++) {
+    if (r & 1)
+      a = yabove_row[c + 1];
+    else
+      a = (yabove_row[c] + yabove_row[c + 1] + 1) >> 1;
+    b = yleft_col[r + 2];
+    ypred_ptr[r * y_stride] = (2 * a + (r + 1) * b + (r + 3) / 2) / (r + 3);
+  }
+  for (c = 1; c < n / 2 - 1; c++) {
+    for (r = 0; r < n - 2 - 2 * c; r++) {
+      if (r & 1)
+        a = yabove_row[c + 1];
+      else
+        a = (yabove_row[c] + yabove_row[c + 1] + 1) >> 1;
+      b = ypred_ptr[(r + 2) * y_stride + c - 1];
+      ypred_ptr[r * y_stride + c] =
+          (2 * a + (c + 1) * b + (c + 3) / 2) / (c + 3);
+    }
+  }
+  for (; c < n - 1; ++c) {
+    for (r = 0; r < n; r++) {
+      v = (r & 1 ? yabove_row[c + 1]
+                 : (yabove_row[c] + yabove_row[c + 1] + 1) >> 1);
+      w = c - r / 2;
+      ypred_ptr[r * y_stride + w] = v;
+    }
+  }
+  r = 0;
+  c = n - 1;
+  ypred_ptr[c] = (ypred_ptr[(c - 1)] + yabove_row[c] + 1) >> 1;
+  for (c = n - 2; c >= n / 2; --c) {
+    h = r + (n - 1 - c) * 2;
+    ypred_ptr[h * y_stride + c] = (ypred_ptr[h * y_stride + c - 1] +
+                                   ypred_ptr[(h - 1) * y_stride + c] + 1) >> 1;
+  }
+  for (r = 1; r < n; r++) {
+    for (c = n - 1; c >= n / 2 + r / 2; --c) {
+      h = r + (n - 1 - c) * 2;
+      ypred_ptr[h * y_stride + c] = (ypred_ptr[h * y_stride + c - 1] +
+                                     ypred_ptr[(h - 1) * y_stride + c] + 1) >> 1;
+    }
+  }
+}
+
+static void d45_predictor(uint8_t *ypred_ptr, int y_stride, int n,
+                          uint8_t *yabove_row, uint8_t *yleft_col) {
+  int r, c;
+  for (r = 0; r < n - 1; ++r) {
+    for (c = 0; c <= r; ++c) {
+      ypred_ptr[(r - c) * y_stride + c] =
+        (yabove_row[r + 1] * (c + 1) +
+         yleft_col[r + 1] * (r - c + 1) + r / 2 + 1) / (r + 2);
+    }
+  }
+  for (c = 0; c <= r; ++c) {
+    int yabove_ext = yabove_row[r]; // 2*yabove_row[r] - yabove_row[r-1];
+    int yleft_ext = yleft_col[r]; // 2*yleft_col[r] - yleft_col[r-1];
+    yabove_ext = (yabove_ext > 255 ? 255 : (yabove_ext < 0 ? 0 : yabove_ext));
+    yleft_ext = (yleft_ext > 255 ? 255 : (yleft_ext < 0 ? 0 : yleft_ext));
+    ypred_ptr[(r - c) * y_stride + c] =
+      (yabove_ext * (c + 1) +
+       yleft_ext * (r - c + 1) + r / 2 + 1) / (r + 2);
+  }
+  for (r = 1; r < n; ++r) {
+    for (c = n - r; c < n; ++c)
+      ypred_ptr[r * y_stride + c] = (ypred_ptr[(r - 1) * y_stride + c] +
+                                     ypred_ptr[r * y_stride + c - 1] + 1) >> 1;
+  }
+}
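+
+/* D45 weighting, worked through: each pixel on the 45-degree anti-diagonal
+ * blends one above-row and one left-column sample.  For r = 2, c = 1 the
+ * result is (yabove_row[3] * 2 + yleft_col[3] * 2 + 2) / 4, i.e. an even
+ * mix with weights (c + 1) and (r - c + 1) out of (r + 2). */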
+
+static void d117_predictor(uint8_t *ypred_ptr, int y_stride, int n,
+                           uint8_t *yabove_row, uint8_t *yleft_col) {
+  int r, c;
+  for (c = 0; c < n; c++)
+    ypred_ptr[c] = (yabove_row[c - 1] + yabove_row[c] + 1) >> 1;
+  ypred_ptr += y_stride;
+  for (c = 0; c < n; c++)
+    ypred_ptr[c] = yabove_row[c - 1];
+  ypred_ptr += y_stride;
+  for (r = 2; r < n; ++r) {
+    ypred_ptr[0] = yleft_col[r - 2];
+    for (c = 1; c < n; c++)
+      ypred_ptr[c] = ypred_ptr[-2 * y_stride + c - 1];
+    ypred_ptr += y_stride;
+  }
+}
+
+static void d135_predictor(uint8_t *ypred_ptr, int y_stride, int n,
+                           uint8_t *yabove_row, uint8_t *yleft_col) {
+  int r, c;
+  ypred_ptr[0] = yabove_row[-1];
+  for (c = 1; c < n; c++)
+    ypred_ptr[c] = yabove_row[c - 1];
+  for (r = 1; r < n; ++r)
+    ypred_ptr[r * y_stride] = yleft_col[r - 1];
+
+  ypred_ptr += y_stride;
+  for (r = 1; r < n; ++r) {
+    for (c = 1; c < n; c++) {
+      ypred_ptr[c] = ypred_ptr[-y_stride + c - 1];
+    }
+    ypred_ptr += y_stride;
+  }
+}
+
+static void d153_predictor(uint8_t *ypred_ptr, int y_stride, int n,
+                           uint8_t *yabove_row, uint8_t *yleft_col) {
+  int r, c;
+  ypred_ptr[0] = (yabove_row[-1] + yleft_col[0] + 1) >> 1;
+  for (r = 1; r < n; r++)
+    ypred_ptr[r * y_stride] = (yleft_col[r - 1] + yleft_col[r] + 1) >> 1;
+  ypred_ptr++;
+  ypred_ptr[0] = yabove_row[-1];
+  for (r = 1; r < n; r++)
+    ypred_ptr[r * y_stride] = yleft_col[r - 1];
+  ypred_ptr++;
+
+  for (c = 0; c < n - 2; c++)
+    ypred_ptr[c] = yabove_row[c];
+  ypred_ptr += y_stride;
+  for (r = 1; r < n; ++r) {
+    for (c = 0; c < n - 2; c++)
+      ypred_ptr[c] = ypred_ptr[-y_stride + c - 2];
+    ypred_ptr += y_stride;
+  }
+}
+
+void vp9_recon_intra_mbuv(MACROBLOCKD *xd) {
+  int i;
+
+  for (i = 16; i < 24; i += 2) {
+    BLOCKD *b = &xd->block[i];
+    vp9_recon2b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+  }
+}
+
+void vp9_build_intra_predictors_internal(unsigned char *src, int src_stride,
+                                         unsigned char *ypred_ptr,
+                                         int y_stride, int mode, int bsize,
+                                         int up_available, int left_available) {
+  unsigned char *yabove_row = src - src_stride;
+  unsigned char yleft_col[32];
+  unsigned char ytop_left = yabove_row[-1];
+  int r, c, i;
+
+  for (i = 0; i < bsize; i++) {
+    yleft_col[i] = src[i * src_stride - 1];
+  }
+
+  /* for Y */
+  switch (mode) {
+    case DC_PRED: {
+      int expected_dc;
+      int i;
+      int shift;
+      int average = 0;
+      int log2_bsize_minus_1;
+
+      assert(bsize == 4 || bsize == 8 || bsize == 16 || bsize == 32);
+      if (bsize == 4) {
+        log2_bsize_minus_1 = 1;
+      } else if (bsize == 8) {
+        log2_bsize_minus_1 = 2;
+      } else if (bsize == 16) {
+        log2_bsize_minus_1 = 3;
+      } else /* bsize == 32 */ {
+        log2_bsize_minus_1 = 4;
+      }
+
+      if (up_available || left_available) {
+        if (up_available) {
+          for (i = 0; i < bsize; i++) {
+            average += yabove_row[i];
+          }
+        }
+
+        if (left_available) {
+          for (i = 0; i < bsize; i++) {
+            average += yleft_col[i];
+          }
+        }
+        shift = log2_bsize_minus_1 + up_available + left_available;
+        expected_dc = (average + (1 << (shift - 1))) >> shift;
+      } else {
+        expected_dc = 128;
+      }
+
+      for (r = 0; r < bsize; r++) {
+        vpx_memset(ypred_ptr, expected_dc, bsize);
+        ypred_ptr += y_stride;
+      }
+    }
+    break;
+    case V_PRED: {
+      for (r = 0; r < bsize; r++) {
+        vpx_memcpy(ypred_ptr, yabove_row, bsize);
+        ypred_ptr += y_stride;
+      }
+    }
+    break;
+    case H_PRED: {
+      for (r = 0; r < bsize; r++) {
+        vpx_memset(ypred_ptr, yleft_col[r], bsize);
+        ypred_ptr += y_stride;
+      }
+    }
+    break;
+    case TM_PRED: {
+      for (r = 0; r < bsize; r++) {
+        for (c = 0; c < bsize; c++) {
+          int pred = yleft_col[r] + yabove_row[c] - ytop_left;
+
+          if (pred < 0)
+            pred = 0;
+
+          if (pred > 255)
+            pred = 255;
+
+          ypred_ptr[c] = pred;
+        }
+
+        ypred_ptr += y_stride;
+      }
+    }
+    break;
+    case D45_PRED: {
+      d45_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col);
+    }
+    break;
+    case D135_PRED: {
+      d135_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col);
+    }
+    break;
+    case D117_PRED: {
+      d117_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col);
+    }
+    break;
+    case D153_PRED: {
+      d153_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col);
+    }
+    break;
+    case D27_PRED: {
+      d27_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col);
+    }
+    break;
+    case D63_PRED: {
+      d63_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col);
+    }
+    break;
+    case I8X8_PRED:
+    case B_PRED:
+    case NEARESTMV:
+    case NEARMV:
+    case ZEROMV:
+    case NEWMV:
+    case SPLITMV:
+    case MB_MODE_COUNT:
+      break;
+  }
+}
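+
+/* DC_PRED rounding, worked through: with bsize = 16 and both edges
+ * available, shift = log2_bsize_minus_1 + 1 + 1 = 5, so the 32 border
+ * samples average as (sum + 16) >> 5; with a single edge the shift drops to
+ * 4 and the 16 samples still divide out exactly. */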
+
+void vp9_build_intra_predictors_mby(MACROBLOCKD *xd) {
+  vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
+                                      xd->predictor, 16,
+                                      xd->mode_info_context->mbmi.mode, 16,
+                                      xd->up_available, xd->left_available);
+}
+
+void vp9_build_intra_predictors_mby_s(MACROBLOCKD *xd) {
+  vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
+                                      xd->dst.y_buffer, xd->dst.y_stride,
+                                      xd->mode_info_context->mbmi.mode, 16,
+                                      xd->up_available, xd->left_available);
+}
+
+#if CONFIG_SUPERBLOCKS
+void vp9_build_intra_predictors_sby_s(MACROBLOCKD *xd) {
+  vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
+                                      xd->dst.y_buffer, xd->dst.y_stride,
+                                      xd->mode_info_context->mbmi.mode, 32,
+                                      xd->up_available, xd->left_available);
+}
+#endif
+
+#if CONFIG_COMP_INTRA_PRED
+void vp9_build_comp_intra_predictors_mby(MACROBLOCKD *xd) {
+  unsigned char predictor[2][256];
+  int i;
+
+  vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
+                                      predictor[0], 16,
+                                      xd->mode_info_context->mbmi.mode,
+                                      16, xd->up_available,
+                                      xd->left_available);
+  vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
+                                      predictor[1], 16,
+                                      xd->mode_info_context->mbmi.second_mode,
+                                      16, xd->up_available,
+                                      xd->left_available);
+
+  for (i = 0; i < 256; i++) {
+    xd->predictor[i] = (predictor[0][i] + predictor[1][i] + 1) >> 1;
+  }
+}
+#endif
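+
+/* Compound-prediction rounding, worked through: the two predictors are
+ * merged with the rounded average used throughout this file, e.g.
+ * (200 + 101 + 1) >> 1 = 151; the +1 makes ties round up rather than
+ * truncate. */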
+
+void vp9_build_intra_predictors_mbuv_internal(MACROBLOCKD *xd,
+                                              unsigned char *upred_ptr,
+                                              unsigned char *vpred_ptr,
+                                              int uv_stride,
+                                              int mode, int bsize) {
+  vp9_build_intra_predictors_internal(xd->dst.u_buffer, xd->dst.uv_stride,
+                                      upred_ptr, uv_stride, mode, bsize,
+                                      xd->up_available, xd->left_available);
+  vp9_build_intra_predictors_internal(xd->dst.v_buffer, xd->dst.uv_stride,
+                                      vpred_ptr, uv_stride, mode, bsize,
+                                      xd->up_available, xd->left_available);
+}
+
+void vp9_build_intra_predictors_mbuv(MACROBLOCKD *xd) {
+  vp9_build_intra_predictors_mbuv_internal(xd, &xd->predictor[256],
+                                           &xd->predictor[320], 8,
+                                           xd->mode_info_context->mbmi.uv_mode,
+                                           8);
+}
+
+void vp9_build_intra_predictors_mbuv_s(MACROBLOCKD *xd) {
+  vp9_build_intra_predictors_mbuv_internal(xd, xd->dst.u_buffer,
+                                           xd->dst.v_buffer,
+                                           xd->dst.uv_stride,
+                                           xd->mode_info_context->mbmi.uv_mode,
+                                           8);
+}
+
+#if CONFIG_SUPERBLOCKS
+void vp9_build_intra_predictors_sbuv_s(MACROBLOCKD *xd) {
+  vp9_build_intra_predictors_mbuv_internal(xd, xd->dst.u_buffer,
+                                           xd->dst.v_buffer, xd->dst.uv_stride,
+                                           xd->mode_info_context->mbmi.uv_mode,
+                                           16);
+}
+#endif
+
+#if CONFIG_COMP_INTRA_PRED
+void vp9_build_comp_intra_predictors_mbuv(MACROBLOCKD *xd) {
+  unsigned char predictor[2][2][64];
+  int i;
+
+  vp9_build_intra_predictors_mbuv_internal(
+    xd, predictor[0][0], predictor[1][0], 8,
+    xd->mode_info_context->mbmi.uv_mode, 8);
+  vp9_build_intra_predictors_mbuv_internal(
+    xd, predictor[0][1], predictor[1][1], 8,
+    xd->mode_info_context->mbmi.second_uv_mode, 8);
+  for (i = 0; i < 64; i++) {
+    xd->predictor[256 + i] = (predictor[0][0][i] + predictor[0][1][i] + 1) >> 1;
+    xd->predictor[256 + 64 + i] = (predictor[1][0][i] +
+                                   predictor[1][1][i] + 1) >> 1;
+  }
+}
+#endif
+
+void vp9_intra8x8_predict(BLOCKD *xd,
+                          int mode,
+                          unsigned char *predictor) {
+  vp9_build_intra_predictors_internal(*(xd->base_dst) + xd->dst,
+                                      xd->dst_stride, predictor, 16,
+                                      mode, 8, 1, 1);
+}
+
+#if CONFIG_COMP_INTRA_PRED
+void vp9_comp_intra8x8_predict(BLOCKD *xd,
+                               int mode, int second_mode,
+                               unsigned char *out_predictor) {
+  unsigned char predictor[2][8 * 16];
+  int i, j;
+
+  vp9_intra8x8_predict(xd, mode, predictor[0]);
+  vp9_intra8x8_predict(xd, second_mode, predictor[1]);
+
+  for (i = 0; i < 8 * 16; i += 16) {
+    for (j = i; j < i + 8; j++) {
+      out_predictor[j] = (predictor[0][j] + predictor[1][j] + 1) >> 1;
+    }
+  }
+}
+#endif
+
+void vp9_intra_uv4x4_predict(BLOCKD *xd,
+                             int mode,
+                             unsigned char *predictor) {
+  vp9_build_intra_predictors_internal(*(xd->base_dst) + xd->dst,
+                                      xd->dst_stride, predictor, 8,
+                                      mode, 4, 1, 1);
+}
+
+#if CONFIG_COMP_INTRA_PRED
+void vp9_comp_intra_uv4x4_predict(BLOCKD *xd,
+                                  int mode, int mode2,
+                                  unsigned char *out_predictor) {
+  unsigned char predictor[2][8 * 4];
+  int i, j;
+
+  vp9_intra_uv4x4_predict(xd, mode, predictor[0]);
+  vp9_intra_uv4x4_predict(xd, mode2, predictor[1]);
+
+  for (i = 0; i < 4 * 8; i += 8) {
+    for (j = i; j < i + 4; j++) {
+      out_predictor[j] = (predictor[0][j] + predictor[1][j] + 1) >> 1;
+    }
+  }
+}
+#endif
+
+/* TODO: try different ways of using the Y-UV mode correlation.
+ * The current code assumes that a UV 4x4 block uses the same mode
+ * as the corresponding Y 8x8 area.
+ */
--- /dev/null
+++ b/vp9/common/reconintra.h
@@ -1,0 +1,18 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef __INC_RECONINTRA_H
+#define __INC_RECONINTRA_H
+
+#include "blockd.h"
+
+extern void init_intra_left_above_pixels(MACROBLOCKD *xd);
+
+#endif  // __INC_RECONINTRA_H
--- /dev/null
+++ b/vp9/common/reconintra4x4.c
@@ -1,0 +1,321 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_mem/vpx_mem.h"
+#include "reconintra.h"
+#include "vpx_rtcd.h"
+
+void vp9_intra4x4_predict_c(BLOCKD *x, int b_mode,
+                            unsigned char *predictor) {
+  int i, r, c;
+
+  unsigned char *Above = *(x->base_dst) + x->dst - x->dst_stride;
+  unsigned char Left[4];
+  unsigned char top_left = Above[-1];
+
+  Left[0] = (*(x->base_dst))[x->dst - 1];
+  Left[1] = (*(x->base_dst))[x->dst - 1 + x->dst_stride];
+  Left[2] = (*(x->base_dst))[x->dst - 1 + 2 * x->dst_stride];
+  Left[3] = (*(x->base_dst))[x->dst - 1 + 3 * x->dst_stride];
+
+  switch (b_mode) {
+    case B_DC_PRED: {
+      int expected_dc = 0;
+
+      for (i = 0; i < 4; i++) {
+        expected_dc += Above[i];
+        expected_dc += Left[i];
+      }
+
+      expected_dc = (expected_dc + 4) >> 3;
+
+      for (r = 0; r < 4; r++) {
+        for (c = 0; c < 4; c++) {
+          predictor[c] = expected_dc;
+        }
+
+        predictor += 16;
+      }
+    }
+    break;
+    case B_TM_PRED: {
+      /* prediction similar to true_motion prediction */
+      for (r = 0; r < 4; r++) {
+        for (c = 0; c < 4; c++) {
+          int pred = Above[c] - top_left + Left[r];
+
+          if (pred < 0)
+            pred = 0;
+
+          if (pred > 255)
+            pred = 255;
+
+          predictor[c] = pred;
+        }
+
+        predictor += 16;
+      }
+    }
+    break;
+
+    case B_VE_PRED: {
+      unsigned int ap[4];
+
+      ap[0] = Above[0];
+      ap[1] = Above[1];
+      ap[2] = Above[2];
+      ap[3] = Above[3];
+
+      for (r = 0; r < 4; r++) {
+        for (c = 0; c < 4; c++) {
+          predictor[c] = ap[c];
+        }
+
+        predictor += 16;
+      }
+    }
+    break;
+
+    case B_HE_PRED: {
+      unsigned int lp[4];
+      lp[0] = Left[0];
+      lp[1] = Left[1];
+      lp[2] = Left[2];
+      lp[3] = Left[3];
+
+      for (r = 0; r < 4; r++) {
+        for (c = 0; c < 4; c++) {
+          predictor[c] = lp[r];
+        }
+
+        predictor += 16;
+      }
+    }
+    break;
+    case B_LD_PRED: {
+      unsigned char *ptr = Above;
+      predictor[0 * 16 + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2;
+      predictor[0 * 16 + 1] =
+        predictor[1 * 16 + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2;
+      predictor[0 * 16 + 2] =
+        predictor[1 * 16 + 1] =
+          predictor[2 * 16 + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2;
+      predictor[0 * 16 + 3] =
+        predictor[1 * 16 + 2] =
+          predictor[2 * 16 + 1] =
+            predictor[3 * 16 + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2;
+      predictor[1 * 16 + 3] =
+        predictor[2 * 16 + 2] =
+          predictor[3 * 16 + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2;
+      predictor[2 * 16 + 3] =
+        predictor[3 * 16 + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2;
+      predictor[3 * 16 + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2;
+    }
+    break;
+    case B_RD_PRED: {
+
+      unsigned char pp[9];
+
+      pp[0] = Left[3];
+      pp[1] = Left[2];
+      pp[2] = Left[1];
+      pp[3] = Left[0];
+      pp[4] = top_left;
+      pp[5] = Above[0];
+      pp[6] = Above[1];
+      pp[7] = Above[2];
+      pp[8] = Above[3];
+
+      predictor[3 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
+      predictor[3 * 16 + 1] =
+        predictor[2 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+      predictor[3 * 16 + 2] =
+        predictor[2 * 16 + 1] =
+          predictor[1 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
+      predictor[3 * 16 + 3] =
+        predictor[2 * 16 + 2] =
+          predictor[1 * 16 + 1] =
+            predictor[0 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
+      predictor[2 * 16 + 3] =
+        predictor[1 * 16 + 2] =
+          predictor[0 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
+      predictor[1 * 16 + 3] =
+        predictor[0 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
+      predictor[0 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
+    }
+    break;
+    case B_VR_PRED: {
+
+      unsigned char pp[9];
+
+      pp[0] = Left[3];
+      pp[1] = Left[2];
+      pp[2] = Left[1];
+      pp[3] = Left[0];
+      pp[4] = top_left;
+      pp[5] = Above[0];
+      pp[6] = Above[1];
+      pp[7] = Above[2];
+      pp[8] = Above[3];
+
+      predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+      predictor[2 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
+      predictor[3 * 16 + 1] =
+        predictor[1 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
+      predictor[2 * 16 + 1] =
+        predictor[0 * 16 + 0] = (pp[4] + pp[5] + 1) >> 1;
+      predictor[3 * 16 + 2] =
+        predictor[1 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
+      predictor[2 * 16 + 2] =
+        predictor[0 * 16 + 1] = (pp[5] + pp[6] + 1) >> 1;
+      predictor[3 * 16 + 3] =
+        predictor[1 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
+      predictor[2 * 16 + 3] =
+        predictor[0 * 16 + 2] = (pp[6] + pp[7] + 1) >> 1;
+      predictor[1 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
+      predictor[0 * 16 + 3] = (pp[7] + pp[8] + 1) >> 1;
+    }
+    break;
+    case B_VL_PRED: {
+
+      unsigned char *pp = Above;
+
+      predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
+      predictor[1 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
+      predictor[2 * 16 + 0] =
+        predictor[0 * 16 + 1] = (pp[1] + pp[2] + 1) >> 1;
+      predictor[1 * 16 + 1] =
+        predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+      predictor[2 * 16 + 1] =
+        predictor[0 * 16 + 2] = (pp[2] + pp[3] + 1) >> 1;
+      predictor[3 * 16 + 1] =
+        predictor[1 * 16 + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
+      predictor[0 * 16 + 3] =
+        predictor[2 * 16 + 2] = (pp[3] + pp[4] + 1) >> 1;
+      predictor[1 * 16 + 3] =
+        predictor[3 * 16 + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
+      predictor[2 * 16 + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
+      predictor[3 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
+    }
+    break;
+
+    case B_HD_PRED: {
+      unsigned char pp[9];
+      pp[0] = Left[3];
+      pp[1] = Left[2];
+      pp[2] = Left[1];
+      pp[3] = Left[0];
+      pp[4] = top_left;
+      pp[5] = Above[0];
+      pp[6] = Above[1];
+      pp[7] = Above[2];
+      pp[8] = Above[3];
+
+      predictor[3 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
+      predictor[3 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
+      predictor[2 * 16 + 0] =
+        predictor[3 * 16 + 2] = (pp[1] + pp[2] + 1) >> 1;
+      predictor[2 * 16 + 1] =
+        predictor[3 * 16 + 3] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+      predictor[2 * 16 + 2] =
+        predictor[1 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1;
+      predictor[2 * 16 + 3] =
+        predictor[1 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
+      predictor[1 * 16 + 2] =
+        predictor[0 * 16 + 0] = (pp[3] + pp[4] + 1) >> 1;
+      predictor[1 * 16 + 3] =
+        predictor[0 * 16 + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
+      predictor[0 * 16 + 2] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
+      predictor[0 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
+    }
+    break;
+
+    case B_HU_PRED: {
+      unsigned char *pp = Left;
+      predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
+      predictor[0 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
+      predictor[0 * 16 + 2] =
+        predictor[1 * 16 + 0] = (pp[1] + pp[2] + 1) >> 1;
+      predictor[0 * 16 + 3] =
+        predictor[1 * 16 + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+      predictor[1 * 16 + 2] =
+        predictor[2 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1;
+      predictor[1 * 16 + 3] =
+        predictor[2 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2;
+      predictor[2 * 16 + 2] =
+        predictor[2 * 16 + 3] =
+          predictor[3 * 16 + 0] =
+            predictor[3 * 16 + 1] =
+              predictor[3 * 16 + 2] =
+                predictor[3 * 16 + 3] = pp[3];
+    }
+    break;
+  }
+}
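+
+/* 3-tap smoothing, worked through: the directional modes above share the
+ * filter (a + 2 * b + c + 2) >> 2.  With a = 10, b = 20, c = 30 this gives
+ * (10 + 40 + 30 + 2) >> 2 = 20, i.e. a [1 2 1] / 4 blur centred on b. */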
+
+#if CONFIG_COMP_INTRA_PRED
+void vp9_comp_intra4x4_predict_c(BLOCKD *x,
+                                 int b_mode, int b_mode2,
+                                 unsigned char *out_predictor) {
+  unsigned char predictor[2][4 * 16];
+  int i, j;
+
+  vp9_intra4x4_predict(x, b_mode, predictor[0]);
+  vp9_intra4x4_predict(x, b_mode2, predictor[1]);
+
+  for (i = 0; i < 16 * 4; i += 16) {
+    for (j = i; j < i + 4; j++) {
+      out_predictor[j] = (predictor[0][j] + predictor[1][j] + 1) >> 1;
+    }
+  }
+}
+#endif
+
+/* Copy 4 bytes from above-right downward so that the 4x4 prediction modes
+ * that use pixels above and to the right have valid pixels to work from.
+ */
+void vp9_intra_prediction_down_copy(MACROBLOCKD *xd) {
+  int extend_edge = (xd->mb_to_right_edge == 0 && xd->mb_index < 2);
+  unsigned char *above_right = *(xd->block[0].base_dst) + xd->block[0].dst -
+                               xd->block[0].dst_stride + 16;
+  unsigned int *src_ptr = (unsigned int *)
+      (above_right - (xd->mb_index == 3 ? 16 * xd->block[0].dst_stride : 0));
+
+  unsigned int *dst_ptr0 = (unsigned int *)above_right;
+  unsigned int *dst_ptr1 =
+    (unsigned int *)(above_right + 4 * xd->block[0].dst_stride);
+  unsigned int *dst_ptr2 =
+    (unsigned int *)(above_right + 8 * xd->block[0].dst_stride);
+  unsigned int *dst_ptr3 =
+    (unsigned int *)(above_right + 12 * xd->block[0].dst_stride);
+
+  if (extend_edge) {
+    *src_ptr = ((uint8_t *) src_ptr)[-1] * 0x01010101U;
+  }
+
+  *dst_ptr0 = *src_ptr;
+  *dst_ptr1 = *src_ptr;
+  *dst_ptr2 = *src_ptr;
+  *dst_ptr3 = *src_ptr;
+}
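+
+/* Byte-replication trick, worked through: multiplying a byte by 0x01010101U
+ * copies it into all four lanes of a 32-bit word (0xAB * 0x01010101 ==
+ * 0xABABABAB), which is how the edge pixel is splatted when no above-right
+ * macroblock exists. */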
--- /dev/null
+++ b/vp9/common/reconintra4x4.h
@@ -1,0 +1,17 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_RECONINTRA4x4_H
+#define __INC_RECONINTRA4x4_H
+
+extern void vp9_intra_prediction_down_copy(MACROBLOCKD *xd);
+
+#endif  // __INC_RECONINTRA4x4_H
--- /dev/null
+++ b/vp9/common/rtcd.c
@@ -1,0 +1,105 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "vpx_config.h"
+#define RTCD_C
+#include "vpx_rtcd.h"
+
+#if CONFIG_MULTITHREAD && defined(_WIN32)
+#include <windows.h>
+#include <stdlib.h>
+static void once(void (*func)(void))
+{
+    static CRITICAL_SECTION *lock;
+    static LONG waiters;
+    static int done;
+    void *lock_ptr = &lock;
+
+    /* If the initialization is complete, return early. This isn't just an
+     * optimization, it prevents races on the destruction of the global
+     * lock.
+     */
+    if (done)
+        return;
+
+    InterlockedIncrement(&waiters);
+
+    /* Get a lock. We create one and try to make it the one-true-lock,
+     * throwing it away if we lost the race.
+     */
+
+    {
+        /* Scope to protect access to new_lock */
+        CRITICAL_SECTION *new_lock = malloc(sizeof(CRITICAL_SECTION));
+        InitializeCriticalSection(new_lock);
+        if (InterlockedCompareExchangePointer(lock_ptr, new_lock, NULL) != NULL)
+        {
+            DeleteCriticalSection(new_lock);
+            free(new_lock);
+        }
+    }
+
+    /* At this point, we have a lock that can be synchronized on. We don't
+     * care which thread actually performed the allocation.
+     */
+
+    EnterCriticalSection(lock);
+
+    if (!done)
+    {
+        func();
+        done = 1;
+    }
+
+    LeaveCriticalSection(lock);
+
+    /* Last one out should free resources. The destructed objects are
+     * protected by checking if(done) above.
+     */
+    if (!InterlockedDecrement(&waiters))
+    {
+        DeleteCriticalSection(lock);
+        free(lock);
+        lock = NULL;
+    }
+}
+
+
+#elif CONFIG_MULTITHREAD && HAVE_PTHREAD_H
+#include <pthread.h>
+static void once(void (*func)(void))
+{
+    static pthread_once_t lock = PTHREAD_ONCE_INIT;
+    pthread_once(&lock, func);
+}
+
+
+#else
+/* No-op version that performs no synchronization. vpx_rtcd() is idempotent,
+ * so as long as your platform provides atomic loads/stores of pointers
+ * no synchronization is strictly necessary.
+ */
+
+static void once(void (*func)(void))
+{
+    static int done;
+
+    if (!done)
+    {
+        func();
+        done = 1;
+    }
+}
+#endif
+
+
+void vpx_rtcd(void)
+{
+    once(setup_rtcd_internal);
+}
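+
+/* Usage sketch: vpx_rtcd() is idempotent and, with CONFIG_MULTITHREAD, safe
+ * to call from racing threads via once() above, so callers simply invoke it
+ * before the first codec call:
+ *
+ *   vpx_rtcd();   // runs setup_rtcd_internal() exactly once
+ */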
--- /dev/null
+++ b/vp9/common/rtcd_defs.sh
@@ -1,0 +1,482 @@
+common_forward_decls() {
+cat <<EOF
+
+struct loop_filter_info;
+struct blockd;
+struct macroblockd;
+
+/* Encoder forward decls */
+struct block;
+struct macroblock;
+struct variance_vtable;
+union int_mv;
+struct yv12_buffer_config;
+EOF
+}
+forward_decls common_forward_decls
+
+prototype void vp9_filter_block2d_4x4_8 "const unsigned char *src_ptr, const unsigned int src_stride, const short *HFilter_aligned16, const short *VFilter_aligned16, unsigned char *dst_ptr, unsigned int dst_stride"
+prototype void vp9_filter_block2d_8x4_8 "const unsigned char *src_ptr, const unsigned int src_stride, const short *HFilter_aligned16, const short *VFilter_aligned16, unsigned char *dst_ptr, unsigned int dst_stride"
+prototype void vp9_filter_block2d_8x8_8 "const unsigned char *src_ptr, const unsigned int src_stride, const short *HFilter_aligned16, const short *VFilter_aligned16, unsigned char *dst_ptr, unsigned int dst_stride"
+prototype void vp9_filter_block2d_16x16_8 "const unsigned char *src_ptr, const unsigned int src_stride, const short *HFilter_aligned16, const short *VFilter_aligned16, unsigned char *dst_ptr, unsigned int dst_stride"
+
+# At the very least, MSVC 2008 has a compiler bug exhibited by this code;
+# the code compiles warning-free, but a disassembly of the generated code
+# shows bugs. To be on the safe side, the 4x4 SSE versions are only enabled
+# when compiling with gcc.
+if [ "$CONFIG_GCC" = "yes" ]; then
+    specialize vp9_filter_block2d_4x4_8 sse4_1 sse2
+fi
+specialize vp9_filter_block2d_8x4_8 ssse3 #sse4_1 sse2
+specialize vp9_filter_block2d_8x8_8 ssse3 #sse4_1 sse2
+specialize vp9_filter_block2d_16x16_8 ssse3 #sse4_1 sse2
+
+#
+# Dequant
+#
+prototype void vp9_dequantize_b "struct blockd *x"
+specialize vp9_dequantize_b mmx
+
+prototype void vp9_dequantize_b_2x2 "struct blockd *x"
+specialize vp9_dequantize_b_2x2
+
+prototype void vp9_dequant_dc_idct_add_y_block_8x8 "short *q, short *dq, unsigned char *pre, unsigned char *dst, int stride, char *eobs, short *dc, struct macroblockd *xd"
+specialize vp9_dequant_dc_idct_add_y_block_8x8
+
+prototype void vp9_dequant_idct_add_y_block_8x8 "short *q, short *dq, unsigned char *pre, unsigned char *dst, int stride, char *eobs, struct macroblockd *xd"
+specialize vp9_dequant_idct_add_y_block_8x8
+
+prototype void vp9_dequant_idct_add_uv_block_8x8 "short *q, short *dq, unsigned char *pre, unsigned char *dstu, unsigned char *dstv, int stride, char *eobs, struct macroblockd *xd"
+specialize vp9_dequant_idct_add_uv_block_8x8
+
+prototype void vp9_dequant_idct_add_16x16 "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride"
+specialize vp9_dequant_idct_add_16x16
+
+prototype void vp9_dequant_idct_add "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride"
+specialize vp9_dequant_idct_add
+
+prototype void vp9_dequant_dc_idct_add "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc"
+specialize vp9_dequant_dc_idct_add
+
+prototype void vp9_dequant_dc_idct_add_y_block "short *q, short *dq, unsigned char *pre, unsigned char *dst, int stride, char *eobs, short *dc"
+specialize vp9_dequant_dc_idct_add_y_block mmx
+
+prototype void vp9_dequant_idct_add_y_block "short *q, short *dq, unsigned char *pre, unsigned char *dst, int stride, char *eobs"
+specialize vp9_dequant_idct_add_y_block mmx
+
+prototype void vp9_dequant_idct_add_uv_block "short *q, short *dq, unsigned char *pre, unsigned char *dstu, unsigned char *dstv, int stride, char *eobs"
+specialize vp9_dequant_idct_add_uv_block mmx
+
+#
+# RECON
+#
+prototype void vp9_copy_mem16x16 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
+specialize vp9_copy_mem16x16 mmx sse2 media neon dspr2
+vp9_copy_mem16x16_media=vp9_copy_mem16x16_v6
+vp9_copy_mem16x16_dspr2=vp9_copy_mem16x16_dspr2
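+
+# Naming sketch: "specialize FN ext..." expects an implementation named
+# FN_<ext> for each listed extension; the assignments above override that
+# default when the actual symbol differs (the "media" flavor maps to the
+# armv6 routine).  The dspr2 line is the identity mapping, kept for clarity.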
+
+prototype void vp9_copy_mem8x8 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
+specialize vp9_copy_mem8x8 mmx media neon dspr2
+vp9_copy_mem8x8_media=vp9_copy_mem8x8_v6
+vp9_copy_mem8x8_dspr2=vp9_copy_mem8x8_dspr2
+
+prototype void vp9_intra4x4_predict "unsigned char *Above, unsigned char *yleft, int left_stride, B_PREDICTION_MODE b_mode, unsigned char *dst, int dst_stride, unsigned char top_left"
+specialize vp9_intra4x4_predict
+
+prototype void vp9_avg_mem16x16 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
+specialize vp9_avg_mem16x16
+
+prototype void vp9_avg_mem8x8 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
+specialize vp9_avg_mem8x8
+
+prototype void vp9_copy_mem8x4 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
+specialize vp9_copy_mem8x4 mmx media neon dspr2
+vp9_copy_mem8x4_media=vp9_copy_mem8x4_v6
+vp9_copy_mem8x4_dspr2=vp9_copy_mem8x4_dspr2
+
+prototype void vp9_recon_b "unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride"
+specialize vp9_recon_b
+
+prototype void vp9_recon_uv_b "unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride"
+specialize vp9_recon_uv_b
+
+prototype void vp9_recon2b "unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride"
+specialize vp9_recon2b sse2
+
+prototype void vp9_recon4b "unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride"
+specialize vp9_recon4b sse2
+
+prototype void vp9_recon_mb "struct macroblockd *x"
+specialize vp9_recon_mb
+
+prototype void vp9_recon_mby "struct macroblockd *x"
+specialize vp9_recon_mby
+
+prototype void vp9_build_intra_predictors_mby_s "struct macroblockd *x"
+specialize vp9_build_intra_predictors_mby_s
+
+prototype void vp9_build_intra_predictors_sby_s "struct macroblockd *x"
+specialize vp9_build_intra_predictors_sby_s
+
+prototype void vp9_build_intra_predictors_sbuv_s "struct macroblockd *x"
+specialize vp9_build_intra_predictors_sbuv_s
+
+prototype void vp9_build_intra_predictors_mby "struct macroblockd *x"
+specialize vp9_build_intra_predictors_mby
+
+prototype void vp9_build_comp_intra_predictors_mby "struct macroblockd *x"
+specialize vp9_build_comp_intra_predictors_mby
+
+prototype void vp9_build_intra_predictors_mbuv "struct macroblockd *x"
+specialize vp9_build_intra_predictors_mbuv
+
+prototype void vp9_build_intra_predictors_mbuv_s "struct macroblockd *x"
+specialize vp9_build_intra_predictors_mbuv_s
+
+prototype void vp9_build_comp_intra_predictors_mbuv "struct macroblockd *x"
+specialize vp9_build_comp_intra_predictors_mbuv
+
+prototype void vp9_intra4x4_predict "struct blockd *x, int b_mode, unsigned char *predictor"
+specialize vp9_intra4x4_predict
+
+prototype void vp9_comp_intra4x4_predict "struct blockd *x, int b_mode, int second_mode, unsigned char *predictor"
+specialize vp9_comp_intra4x4_predict
+
+prototype void vp9_intra8x8_predict "struct blockd *x, int b_mode, unsigned char *predictor"
+specialize vp9_intra8x8_predict
+
+prototype void vp9_comp_intra8x8_predict "struct blockd *x, int b_mode, int second_mode, unsigned char *predictor"
+specialize vp9_comp_intra8x8_predict
+
+prototype void vp9_intra_uv4x4_predict "struct blockd *x, int b_mode, unsigned char *predictor"
+specialize vp9_intra_uv4x4_predict
+
+prototype void vp9_comp_intra_uv4x4_predict "struct blockd *x, int b_mode, int second_mode, unsigned char *predictor"
+specialize vp9_comp_intra_uv4x4_predict
+
+#
+# Loopfilter
+#
+prototype void vp9_loop_filter_mbv "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
+specialize vp9_loop_filter_mbv sse2
+
+prototype void vp9_loop_filter_bv "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
+specialize vp9_loop_filter_bv sse2
+
+prototype void vp9_loop_filter_bv8x8 "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
+specialize vp9_loop_filter_bv8x8 sse2
+
+prototype void vp9_loop_filter_mbh "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
+specialize vp9_loop_filter_mbh sse2
+
+prototype void vp9_loop_filter_bh "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
+specialize vp9_loop_filter_bh sse2
+
+prototype void vp9_loop_filter_bh8x8 "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
+specialize vp9_loop_filter_bh8x8 sse2
+
+prototype void vp9_loop_filter_simple_mbv "unsigned char *y, int ystride, const unsigned char *blimit"
+specialize vp9_loop_filter_simple_mbv mmx sse2 media neon
+vp9_loop_filter_simple_mbv_c=vp9_loop_filter_simple_vertical_edge_c
+vp9_loop_filter_simple_mbv_mmx=vp9_loop_filter_simple_vertical_edge_mmx
+vp9_loop_filter_simple_mbv_sse2=vp9_loop_filter_simple_vertical_edge_sse2
+vp9_loop_filter_simple_mbv_media=vp9_loop_filter_simple_vertical_edge_armv6
+vp9_loop_filter_simple_mbv_neon=vp9_loop_filter_mbvs_neon
+
+prototype void vp9_loop_filter_simple_mbh "unsigned char *y, int ystride, const unsigned char *blimit"
+specialize vp9_loop_filter_simple_mbh mmx sse2 media neon
+vp9_loop_filter_simple_mbh_c=vp9_loop_filter_simple_horizontal_edge_c
+vp9_loop_filter_simple_mbh_mmx=vp9_loop_filter_simple_horizontal_edge_mmx
+vp9_loop_filter_simple_mbh_sse2=vp9_loop_filter_simple_horizontal_edge_sse2
+vp9_loop_filter_simple_mbh_media=vp9_loop_filter_simple_horizontal_edge_armv6
+vp9_loop_filter_simple_mbh_neon=vp9_loop_filter_mbhs_neon
+
+prototype void vp9_loop_filter_simple_bv "unsigned char *y, int ystride, const unsigned char *blimit"
+specialize vp9_loop_filter_simple_bv mmx sse2 media neon
+vp9_loop_filter_simple_bv_c=vp9_loop_filter_bvs_c
+vp9_loop_filter_simple_bv_mmx=vp9_loop_filter_bvs_mmx
+vp9_loop_filter_simple_bv_sse2=vp9_loop_filter_bvs_sse2
+vp9_loop_filter_simple_bv_media=vp9_loop_filter_bvs_armv6
+vp9_loop_filter_simple_bv_neon=vp9_loop_filter_bvs_neon
+
+prototype void vp9_loop_filter_simple_bh "unsigned char *y, int ystride, const unsigned char *blimit"
+specialize vp9_loop_filter_simple_bh mmx sse2 media neon
+vp9_loop_filter_simple_bh_c=vp9_loop_filter_bhs_c
+vp9_loop_filter_simple_bh_mmx=vp9_loop_filter_bhs_mmx
+vp9_loop_filter_simple_bh_sse2=vp9_loop_filter_bhs_sse2
+vp9_loop_filter_simple_bh_media=vp9_loop_filter_bhs_armv6
+vp9_loop_filter_simple_bh_neon=vp9_loop_filter_bhs_neon
+
+#
+# sad 16x3, 3x16
+#
+if [ "$CONFIG_NEWBESTREFMV" = "yes" ]; then
+prototype unsigned int vp9_sad16x3 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int ref_stride, int max_sad"
+specialize vp9_sad16x3 sse2
+
+prototype unsigned int vp9_sad3x16 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int ref_stride, int max_sad"
+specialize vp9_sad3x16 sse2
+fi
+
+#
+# Encoder functions below this point.
+#
+if [ "$CONFIG_VP9_ENCODER" = "yes" ]; then
+
+
+# variance
+[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2
+
+prototype unsigned int vp9_variance32x32 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance32x32
+
+prototype unsigned int vp9_variance16x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance16x16 mmx sse2
+vp9_variance16x16_sse2=vp9_variance16x16_wmt
+vp9_variance16x16_mmx=vp9_variance16x16_mmx
+
+prototype unsigned int vp9_variance16x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance16x8 mmx sse2
+vp9_variance16x8_sse2=vp9_variance16x8_wmt
+vp9_variance16x8_mmx=vp9_variance16x8_mmx
+
+prototype unsigned int vp9_variance8x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance8x16 mmx sse2
+vp9_variance8x16_sse2=vp9_variance8x16_wmt
+vp9_variance8x16_mmx=vp9_variance8x16_mmx
+
+prototype unsigned int vp9_variance8x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance8x8 mmx sse2
+vp9_variance8x8_sse2=vp9_variance8x8_wmt
+vp9_variance8x8_mmx=vp9_variance8x8_mmx
+
+prototype unsigned int vp9_variance4x4 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance4x4 mmx sse2
+vp9_variance4x4_sse2=vp9_variance4x4_wmt
+vp9_variance4x4_mmx=vp9_variance4x4_mmx
+
+prototype unsigned int vp9_sub_pixel_variance32x32 "const unsigned char *src_ptr, int source_stride, int xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp9_sub_pixel_variance32x32
+
+prototype unsigned int vp9_sub_pixel_variance16x16 "const unsigned char *src_ptr, int source_stride, int xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp9_sub_pixel_variance16x16 sse2 mmx ssse3
+vp9_sub_pixel_variance16x16_sse2=vp9_sub_pixel_variance16x16_wmt
+
+prototype unsigned int vp9_sub_pixel_variance8x16 "const unsigned char *src_ptr, int source_stride, int xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp9_sub_pixel_variance8x16 sse2 mmx
+vp9_sub_pixel_variance8x16_sse2=vp9_sub_pixel_variance8x16_wmt
+
+prototype unsigned int vp9_sub_pixel_variance16x8 "const unsigned char *src_ptr, int source_stride, int xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp9_sub_pixel_variance16x8 sse2 mmx ssse3
+vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_wmt
+
+prototype unsigned int vp9_sub_pixel_variance8x8 "const unsigned char *src_ptr, int source_stride, int xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp9_sub_pixel_variance8x8 sse2 mmx
+vp9_sub_pixel_variance8x8_sse2=vp9_sub_pixel_variance8x8_wmt
+
+prototype unsigned int vp9_sub_pixel_variance4x4 "const unsigned char *src_ptr, int source_stride, int xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp9_sub_pixel_variance4x4 sse2 mmx
+vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
+
+prototype unsigned int vp9_sad32x32 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int max_sad"
+specialize vp9_sad32x32
+
+prototype unsigned int vp9_sad16x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int max_sad"
+specialize vp9_sad16x16 mmx sse2 sse3
+vp9_sad16x16_sse2=vp9_sad16x16_wmt
+
+prototype unsigned int vp9_sad16x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int max_sad"
+specialize vp9_sad16x8 mmx sse2
+vp9_sad16x8_sse2=vp9_sad16x8_wmt
+
+prototype unsigned int vp9_sad8x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int max_sad"
+specialize vp9_sad8x16 mmx sse2
+vp9_sad8x16_sse2=vp9_sad8x16_wmt
+
+prototype unsigned int vp9_sad8x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int max_sad"
+specialize vp9_sad8x8 mmx sse2
+vp9_sad8x8_sse2=vp9_sad8x8_wmt
+
+prototype unsigned int vp9_sad4x4 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int max_sad"
+specialize vp9_sad4x4 mmx sse2
+vp9_sad4x4_sse2=vp9_sad4x4_wmt
+
+prototype unsigned int vp9_variance_halfpixvar16x16_h "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance_halfpixvar16x16_h mmx sse2
+vp9_variance_halfpixvar16x16_h_sse2=vp9_variance_halfpixvar16x16_h_wmt
+
+prototype unsigned int vp9_variance_halfpixvar16x16_v "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance_halfpixvar16x16_v mmx sse2
+vp9_variance_halfpixvar16x16_v_sse2=vp9_variance_halfpixvar16x16_v_wmt
+
+prototype unsigned int vp9_variance_halfpixvar16x16_hv "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance_halfpixvar16x16_hv mmx sse2
+vp9_variance_halfpixvar16x16_hv_sse2=vp9_variance_halfpixvar16x16_hv_wmt
+
+prototype unsigned int vp9_variance_halfpixvar32x32_h "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance_halfpixvar32x32_h
+
+prototype unsigned int vp9_variance_halfpixvar32x32_v "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance_halfpixvar32x32_v
+
+prototype unsigned int vp9_variance_halfpixvar32x32_hv "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance_halfpixvar32x32_hv
+
+prototype void vp9_sad32x32x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
+specialize vp9_sad32x32x3
+
+prototype void vp9_sad16x16x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
+specialize vp9_sad16x16x3 sse3 ssse3
+
+prototype void vp9_sad16x8x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
+specialize vp9_sad16x8x3 sse3 ssse3
+
+prototype void vp9_sad8x16x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
+specialize vp9_sad8x16x3 sse3
+
+prototype void vp9_sad8x8x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
+specialize vp9_sad8x8x3 sse3
+
+prototype void vp9_sad4x4x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
+specialize vp9_sad4x4x3 sse3
+
+prototype void vp9_sad32x32x8 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
+specialize vp9_sad32x32x8
+
+prototype void vp9_sad16x16x8 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
+specialize vp9_sad16x16x8 sse4
+
+prototype void vp9_sad16x8x8 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
+specialize vp9_sad16x8x8 sse4
+
+prototype void vp9_sad8x16x8 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
+specialize vp9_sad8x16x8 sse4
+
+prototype void vp9_sad8x8x8 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
+specialize vp9_sad8x8x8 sse4
+
+prototype void vp9_sad4x4x8 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
+specialize vp9_sad4x4x8 sse4
+
+prototype void vp9_sad32x32x4d "const unsigned char *src_ptr, int  src_stride, unsigned char *ref_ptr[], int  ref_stride, unsigned int *sad_array"
+specialize vp9_sad32x32x4d
+
+prototype void vp9_sad16x16x4d "const unsigned char *src_ptr, int  src_stride, unsigned char *ref_ptr[], int  ref_stride, unsigned int *sad_array"
+specialize vp9_sad16x16x4d sse3
+
+prototype void vp9_sad16x8x4d "const unsigned char *src_ptr, int  src_stride, unsigned char *ref_ptr[], int  ref_stride, unsigned int *sad_array"
+specialize vp9_sad16x8x4d sse3
+
+prototype void vp9_sad8x16x4d "const unsigned char *src_ptr, int  src_stride, unsigned char *ref_ptr[], int  ref_stride, unsigned int *sad_array"
+specialize vp9_sad8x16x4d sse3
+
+prototype void vp9_sad8x8x4d "const unsigned char *src_ptr, int  src_stride, unsigned char *ref_ptr[], int  ref_stride, unsigned int *sad_array"
+specialize vp9_sad8x8x4d sse3
+
+prototype void vp9_sad4x4x4d "const unsigned char *src_ptr, int  src_stride, unsigned char *ref_ptr[], int  ref_stride, unsigned int *sad_array"
+specialize vp9_sad4x4x4d sse3
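
The x3/x8/x4d variants batch several SAD evaluations per call so the motion search can amortize source loads across candidates. As an illustrative sketch (not code from this patch), an x4d entry point compares one source block against four candidate references at once:

    #include <stdlib.h>

    /* Illustrative C semantics of vp9_sad8x8x4d: one 8x8 source block,
     * four reference candidates, four SADs out. */
    static void sad8x8x4d_sketch(const unsigned char *src_ptr, int src_stride,
                                 unsigned char *ref_ptr[4], int ref_stride,
                                 unsigned int *sad_array) {
      int i, r, c;
      for (i = 0; i < 4; i++) {
        const unsigned char *src = src_ptr;
        const unsigned char *ref = ref_ptr[i];
        unsigned int sad = 0;
        for (r = 0; r < 8; r++) {
          for (c = 0; c < 8; c++)
            sad += abs(src[c] - ref[c]);
          src += src_stride;
          ref += ref_stride;
        }
        sad_array[i] = sad;
      }
    }
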
+
+#
+# Block copy
+#
+case $arch in
+    x86*)
+    prototype void vp9_copy32xn "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n"
+    specialize vp9_copy32xn sse2 sse3
+    ;;
+esac
+
+prototype unsigned int vp9_sub_pixel_mse16x16 "const unsigned char  *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, const unsigned char *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
+specialize vp9_sub_pixel_mse16x16 sse2 mmx
+vp9_sub_pixel_mse16x16_sse2=vp9_sub_pixel_mse16x16_wmt
+
+prototype unsigned int vp9_mse16x16 "const unsigned char *src_ptr, int  source_stride, const unsigned char *ref_ptr, int  recon_stride, unsigned int *sse"
+specialize vp9_mse16x16 mmx sse2
+vp9_mse16x16_sse2=vp9_mse16x16_wmt
+
+prototype unsigned int vp9_sub_pixel_mse32x32 "const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp9_sub_pixel_mse32x32
+
+prototype unsigned int vp9_get_mb_ss "const short *"
+specialize vp9_get_mb_ss mmx sse2
+
+# ENCODEMB INVOKE
+prototype int vp9_mbblock_error "struct macroblock *mb, int dc"
+specialize vp9_mbblock_error mmx sse2
+vp9_mbblock_error_sse2=vp9_mbblock_error_xmm
+
+prototype int vp9_block_error "short *coeff, short *dqcoeff, int block_size"
+specialize vp9_block_error mmx sse2
+vp9_block_error_sse2=vp9_block_error_xmm
+
+prototype void vp9_subtract_b "struct block *be, struct blockd *bd, int pitch"
+specialize vp9_subtract_b mmx sse2
+
+prototype int vp9_mbuverror "struct macroblock *mb"
+specialize vp9_mbuverror mmx sse2
+vp9_mbuverror_sse2=vp9_mbuverror_xmm
+
+prototype void vp9_subtract_mby "short *diff, unsigned char *src, unsigned char *pred, int stride"
+specialize vp9_subtract_mby mmx sse2
+
+prototype void vp9_subtract_mbuv "short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride"
+specialize vp9_subtract_mbuv mmx sse2
+
+#
+# Structured Similarity (SSIM)
+#
+if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then
+    [ $arch = "x86_64" ] && sse2_on_x86_64=sse2
+
+    prototype void vp9_ssim_parms_8x8 "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"
+    specialize vp9_ssim_parms_8x8 $sse2_on_x86_64
+
+    prototype void vp9_ssim_parms_16x16 "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"
+    specialize vp9_ssim_parms_16x16 $sse2_on_x86_64
+fi
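
The vp9_ssim_parms_* kernels only accumulate the window sums (sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr); the SSIM score itself is formed from those sums by the internal-stats code using the standard definition, reproduced here for reference:

    \mathrm{SSIM}(s,r) = \frac{(2\mu_s\mu_r + C_1)(2\sigma_{sr} + C_2)}
                              {(\mu_s^2 + \mu_r^2 + C_1)(\sigma_s^2 + \sigma_r^2 + C_2)}

where, over the N pixels of the window, \mu_s = \tfrac{1}{N}\sum s, \sigma_s^2 = \tfrac{1}{N}\sum s^2 - \mu_s^2, and \sigma_{sr} = \tfrac{1}{N}\sum sr - \mu_s\mu_r, which is exactly what the five accumulated sums provide.
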
+
+# fdct functions
+prototype void vp9_fht "const short *input, int pitch, short *output, int tx_type, int tx_dim"
+specialize vp9_fht
+
+prototype void vp9_short_fdct8x8 "short *InputData, short *OutputData, int pitch"
+specialize vp9_short_fdct8x8
+
+prototype void vp9_short_fhaar2x2 "short *InputData, short *OutputData, int pitch"
+specialize vp9_short_fhaar2x2
+
+prototype void vp9_short_fdct4x4 "short *InputData, short *OutputData, int pitch"
+specialize vp9_short_fdct4x4
+
+prototype void vp9_short_fdct8x4 "short *InputData, short *OutputData, int pitch"
+specialize vp9_short_fdct8x4
+
+prototype void vp9_short_walsh4x4 "short *InputData, short *OutputData, int pitch"
+specialize vp9_short_walsh4x4
+
+prototype void vp9_short_fdct16x16 "short *InputData, short *OutputData, int pitch"
+specialize vp9_short_fdct16x16
+
+prototype void vp9_short_walsh4x4_lossless "short *InputData, short *OutputData, int pitch"
+specialize vp9_short_walsh4x4_lossless
+
+prototype void vp9_short_walsh4x4_x8 "short *InputData, short *OutputData, int pitch"
+specialize vp9_short_walsh4x4_x8
+
+prototype void vp9_short_walsh8x4_x8 "short *InputData, short *OutputData, int pitch"
+specialize vp9_short_walsh8x4_x8
+
+fi
+# end encoder functions
--- /dev/null
+++ b/vp9/common/sadmxn.h
@@ -1,0 +1,37 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef __INC_SAD_H
+#define __INC_SAD_H
+
+#include <stdlib.h>  /* for abs() */
+
+static __inline
+unsigned int sad_mx_n_c(
+  const unsigned char *src_ptr,
+  int  src_stride,
+  const unsigned char *ref_ptr,
+  int  ref_stride,
+  int m,
+  int n) {
+  int r, c;
+  unsigned int sad = 0;
+
+  for (r = 0; r < n; r++) {
+    for (c = 0; c < m; c++) {
+      sad += abs(src_ptr[c] - ref_ptr[c]);
+    }
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  }
+
+  return sad;
+}
+
+#endif
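
A minimal usage sketch for the helper above (buffers, strides, and values invented for illustration):

    #include <stdio.h>
    #include "vp9/common/sadmxn.h"

    int main(void) {
      /* Two 4x4 blocks at the origin of 8-byte-stride buffers. */
      unsigned char src[32] = {0};
      unsigned char ref[32] = {0};
      ref[0] = 10;  /* one differing pixel -> expected SAD of 10 */
      printf("%u\n", sad_mx_n_c(src, 8, ref, 8, 4, 4));
      return 0;
    }
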
--- /dev/null
+++ b/vp9/common/seg_common.c
@@ -1,0 +1,103 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/common/seg_common.h"
+
+static const int segfeaturedata_signed[SEG_LVL_MAX] = { 1, 1, 0, 0, 0, 0 };
+static const int seg_feature_data_bits[SEG_LVL_MAX] = { QINDEX_BITS, 6, 4, 4, 6, 2 };
+
+// These functions provide access to new segment level features.
+// Eventually these functions may be "optimized out" but, for the moment,
+// the coding mechanism is still subject to change, so these provide a
+// convenient single point of change.
+
+int vp9_segfeature_active(const MACROBLOCKD *xd,
+                          int segment_id,
+                          SEG_LVL_FEATURES feature_id) {
+  // Return true if mask bit set and segmentation enabled.
+  return (xd->segmentation_enabled &&
+          (xd->segment_feature_mask[segment_id] &
+           (0x01 << feature_id)));
+}
+
+void vp9_clearall_segfeatures(MACROBLOCKD *xd) {
+  vpx_memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data));
+  vpx_memset(xd->segment_feature_mask, 0, sizeof(xd->segment_feature_mask));
+}
+
+void vp9_enable_segfeature(MACROBLOCKD *xd,
+                           int segment_id,
+                           SEG_LVL_FEATURES feature_id) {
+  xd->segment_feature_mask[segment_id] |= (0x01 << feature_id);
+}
+
+void vp9_disable_segfeature(MACROBLOCKD *xd,
+                            int segment_id,
+                            SEG_LVL_FEATURES feature_id) {
+  xd->segment_feature_mask[segment_id] &= ~(1 << feature_id);
+}
+
+int vp9_seg_feature_data_bits(SEG_LVL_FEATURES feature_id) {
+  return seg_feature_data_bits[feature_id];
+}
+
+int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id) {
+  return (segfeaturedata_signed[feature_id]);
+}
+
+void vp9_clear_segdata(MACROBLOCKD *xd,
+                       int segment_id,
+                       SEG_LVL_FEATURES feature_id) {
+  xd->segment_feature_data[segment_id][feature_id] = 0;
+}
+
+void vp9_set_segdata(MACROBLOCKD *xd,
+                     int segment_id,
+                     SEG_LVL_FEATURES feature_id,
+                     int seg_data) {
+  xd->segment_feature_data[segment_id][feature_id] = seg_data;
+}
+
+int vp9_get_segdata(const MACROBLOCKD *xd,
+                    int segment_id,
+                    SEG_LVL_FEATURES feature_id) {
+  return xd->segment_feature_data[segment_id][feature_id];
+}
+
+void vp9_clear_segref(MACROBLOCKD *xd, int segment_id) {
+  xd->segment_feature_data[segment_id][SEG_LVL_REF_FRAME] = 0;
+}
+
+void vp9_set_segref(MACROBLOCKD *xd,
+                    int segment_id,
+                    MV_REFERENCE_FRAME ref_frame) {
+  xd->segment_feature_data[segment_id][SEG_LVL_REF_FRAME] |=
+    (1 << ref_frame);
+}
+
+int vp9_check_segref(const MACROBLOCKD *xd,
+                     int segment_id,
+                     MV_REFERENCE_FRAME ref_frame) {
+  return (xd->segment_feature_data[segment_id][SEG_LVL_REF_FRAME] &
+          (1 << ref_frame)) ? 1 : 0;
+}
+
+int vp9_check_segref_inter(MACROBLOCKD *xd, int segment_id) {
+  return (xd->segment_feature_data[segment_id][SEG_LVL_REF_FRAME] &
+          ~(1 << INTRA_FRAME)) ? 1 : 0;
+}
+
+int vp9_get_seg_tx_type(MACROBLOCKD *xd, int segment_id) {
+  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_TRANSFORM))
+    return vp9_get_segdata(xd, segment_id, SEG_LVL_TRANSFORM);
+  else
+    return TX_4X4;
+}
+// TBD? Functions to read and write segment data with range / validity checking
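
A hedged sketch of how a caller drives these accessors (the SEG_LVL_ALT_Q feature id is assumed to come from the feature enum in blockd.h; the sequence is illustrative, not code from this patch):

    #include "vp9/common/seg_common.h"

    /* Mark segment 1 as carrying an alternate quantizer delta of -16,
     * then read it back only when the feature is active.
     * (SEG_LVL_ALT_Q: assumed feature id from blockd.h.) */
    static void segfeature_usage_sketch(MACROBLOCKD *xd) {
      vp9_clearall_segfeatures(xd);
      vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_Q);
      vp9_set_segdata(xd, 1, SEG_LVL_ALT_Q, -16);
      if (vp9_segfeature_active(xd, 1, SEG_LVL_ALT_Q)) {
        const int qdelta = vp9_get_segdata(xd, 1, SEG_LVL_ALT_Q);
        (void)qdelta;
      }
    }
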
--- /dev/null
+++ b/vp9/common/seg_common.h
@@ -1,0 +1,64 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "type_aliases.h"
+#include "onyxc_int.h"
+#include "vp9/common/blockd.h"
+
+#ifndef __INC_SEG_COMMON_H__
+#define __INC_SEG_COMMON_H__ 1
+
+int vp9_segfeature_active(const MACROBLOCKD *xd,
+                          int segment_id,
+                          SEG_LVL_FEATURES feature_id);
+
+void vp9_clearall_segfeatures(MACROBLOCKD *xd);
+
+void vp9_enable_segfeature(MACROBLOCKD *xd,
+                           int segment_id,
+                           SEG_LVL_FEATURES feature_id);
+
+void vp9_disable_segfeature(MACROBLOCKD *xd,
+                            int segment_id,
+                            SEG_LVL_FEATURES feature_id);
+
+int vp9_seg_feature_data_bits(SEG_LVL_FEATURES feature_id);
+
+int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id);
+
+void vp9_clear_segdata(MACROBLOCKD *xd,
+                       int segment_id,
+                       SEG_LVL_FEATURES feature_id);
+
+void vp9_set_segdata(MACROBLOCKD *xd,
+                     int segment_id,
+                     SEG_LVL_FEATURES feature_id,
+                     int seg_data);
+
+int vp9_get_segdata(const MACROBLOCKD *xd,
+                    int segment_id,
+                    SEG_LVL_FEATURES feature_id);
+
+void vp9_clear_segref(MACROBLOCKD *xd, int segment_id);
+
+void vp9_set_segref(MACROBLOCKD *xd,
+                    int segment_id,
+                    MV_REFERENCE_FRAME ref_frame);
+
+int vp9_check_segref(const MACROBLOCKD *xd,
+                     int segment_id,
+                     MV_REFERENCE_FRAME ref_frame);
+
+int vp9_check_segref_inter(MACROBLOCKD *xd, int segment_id);
+
+int vp9_get_seg_tx_type(MACROBLOCKD *xd, int segment_id);
+
+#endif /* __INC_SEG_COMMON_H__ */
+
--- /dev/null
+++ b/vp9/common/setupintrarecon.c
@@ -1,0 +1,31 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "setupintrarecon.h"
+#include "vpx_mem/vpx_mem.h"
+
+void vp9_setup_intra_recon(YV12_BUFFER_CONFIG *ybf) {
+  int i;
+
+  /* set up the new frame's borders for intra coded blocks */
+  vpx_memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5);
+  for (i = 0; i < ybf->y_height; i++)
+    ybf->y_buffer[ybf->y_stride * i - 1] = (unsigned char) 129;
+
+  vpx_memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
+  for (i = 0; i < ybf->uv_height; i++)
+    ybf->u_buffer[ybf->uv_stride * i - 1] = (unsigned char) 129;
+
+  vpx_memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
+  for (i = 0; i < ybf->uv_height; i++)
+    ybf->v_buffer[ybf->uv_stride * i - 1] = (unsigned char) 129;
+
+}
--- /dev/null
+++ b/vp9/common/setupintrarecon.h
@@ -1,0 +1,13 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_scale/yv12config.h"
+extern void vp9_setup_intra_recon(YV12_BUFFER_CONFIG *ybf);
--- /dev/null
+++ b/vp9/common/subpixel.h
@@ -1,0 +1,204 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef SUBPIXEL_H
+#define SUBPIXEL_H
+
+#define prototype_subpixel_predict(sym) \
+  void sym(unsigned char *src, int src_pitch, int xofst, int yofst, \
+           unsigned char *dst, int dst_pitch)
+
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/subpixel_x86.h"
+#endif
+
+#if ARCH_ARM
+#include "arm/subpixel_arm.h"
+#endif
+
+#ifndef vp9_subpix_sixtap16x16
+#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_sixtap16x16);
+
+#ifndef vp9_subpix_sixtap8x8
+#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_sixtap8x8);
+
+#ifndef vp9_subpix_sixtap_avg16x16
+#define vp9_subpix_sixtap_avg16x16 vp9_sixtap_predict_avg16x16_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_sixtap_avg16x16);
+
+#ifndef vp9_subpix_sixtap_avg8x8
+#define vp9_subpix_sixtap_avg8x8 vp9_sixtap_predict_avg8x8_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_sixtap_avg8x8);
+
+#ifndef vp9_subpix_sixtap8x4
+#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_sixtap8x4);
+
+#ifndef vp9_subpix_sixtap4x4
+#define vp9_subpix_sixtap4x4 vp9_sixtap_predict_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_sixtap4x4);
+
+#ifndef vp9_subpix_sixtap_avg4x4
+#define vp9_subpix_sixtap_avg4x4 vp9_sixtap_predict_avg_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_sixtap_avg4x4);
+
+#ifndef vp9_subpix_eighttap16x16
+#define vp9_subpix_eighttap16x16 vp9_eighttap_predict16x16_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap16x16);
+
+#ifndef vp9_subpix_eighttap8x8
+#define vp9_subpix_eighttap8x8 vp9_eighttap_predict8x8_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap8x8);
+
+#ifndef vp9_subpix_eighttap_avg16x16
+#define vp9_subpix_eighttap_avg16x16 vp9_eighttap_predict_avg16x16_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap_avg16x16);
+
+#ifndef vp9_subpix_eighttap_avg8x8
+#define vp9_subpix_eighttap_avg8x8 vp9_eighttap_predict_avg8x8_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap_avg8x8);
+
+#ifndef vp9_subpix_eighttap8x4
+#define vp9_subpix_eighttap8x4 vp9_eighttap_predict8x4_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap8x4);
+
+#ifndef vp9_subpix_eighttap4x4
+#define vp9_subpix_eighttap4x4 vp9_eighttap_predict_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap4x4);
+
+#ifndef vp9_subpix_eighttap_avg4x4
+#define vp9_subpix_eighttap_avg4x4 vp9_eighttap_predict_avg4x4_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap_avg4x4);
+
+#ifndef vp9_subpix_eighttap16x16_sharp
+#define vp9_subpix_eighttap16x16_sharp vp9_eighttap_predict16x16_sharp_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap16x16_sharp);
+
+#ifndef vp9_subpix_eighttap8x8_sharp
+#define vp9_subpix_eighttap8x8_sharp vp9_eighttap_predict8x8_sharp_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap8x8_sharp);
+
+#ifndef vp9_subpix_eighttap_avg16x16_sharp
+#define vp9_subpix_eighttap_avg16x16_sharp vp9_eighttap_predict_avg16x16_sharp_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap_avg16x16_sharp);
+
+#ifndef vp9_subpix_eighttap_avg8x8_sharp
+#define vp9_subpix_eighttap_avg8x8_sharp vp9_eighttap_predict_avg8x8_sharp_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap_avg8x8_sharp);
+
+#ifndef vp9_subpix_eighttap8x4_sharp
+#define vp9_subpix_eighttap8x4_sharp vp9_eighttap_predict8x4_sharp_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap8x4_sharp);
+
+#ifndef vp9_subpix_eighttap4x4_sharp
+#define vp9_subpix_eighttap4x4_sharp vp9_eighttap_predict_sharp_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap4x4_sharp);
+
+#ifndef vp9_subpix_eighttap_avg4x4_sharp
+#define vp9_subpix_eighttap_avg4x4_sharp vp9_eighttap_predict_avg4x4_sharp_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap_avg4x4_sharp);
+
+#ifndef vp9_subpix_bilinear16x16
+#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_bilinear16x16);
+
+#ifndef vp9_subpix_bilinear8x8
+#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_bilinear8x8);
+
+#ifndef vp9_subpix_bilinear_avg16x16
+#define vp9_subpix_bilinear_avg16x16 vp9_bilinear_predict_avg16x16_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_bilinear_avg16x16);
+
+#ifndef vp9_subpix_bilinear_avg8x8
+#define vp9_subpix_bilinear_avg8x8 vp9_bilinear_predict_avg8x8_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_bilinear_avg8x8);
+
+#ifndef vp9_subpix_bilinear8x4
+#define vp9_subpix_bilinear8x4 vp9_bilinear_predict8x4_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_bilinear8x4);
+
+#ifndef vp9_subpix_bilinear4x4
+#define vp9_subpix_bilinear4x4 vp9_bilinear_predict4x4_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_bilinear4x4);
+
+#ifndef vp9_subpix_bilinear_avg4x4
+#define vp9_subpix_bilinear_avg4x4 vp9_bilinear_predict_avg4x4_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_bilinear_avg4x4);
+
+typedef prototype_subpixel_predict((*vp9_subpix_fn_t));
+typedef struct {
+  vp9_subpix_fn_t  eighttap16x16;
+  vp9_subpix_fn_t  eighttap8x8;
+  vp9_subpix_fn_t  eighttap_avg16x16;
+  vp9_subpix_fn_t  eighttap_avg8x8;
+  vp9_subpix_fn_t  eighttap_avg4x4;
+  vp9_subpix_fn_t  eighttap8x4;
+  vp9_subpix_fn_t  eighttap4x4;
+  vp9_subpix_fn_t  eighttap16x16_sharp;
+  vp9_subpix_fn_t  eighttap8x8_sharp;
+  vp9_subpix_fn_t  eighttap_avg16x16_sharp;
+  vp9_subpix_fn_t  eighttap_avg8x8_sharp;
+  vp9_subpix_fn_t  eighttap_avg4x4_sharp;
+  vp9_subpix_fn_t  eighttap8x4_sharp;
+  vp9_subpix_fn_t  eighttap4x4_sharp;
+  vp9_subpix_fn_t  sixtap16x16;
+  vp9_subpix_fn_t  sixtap8x8;
+  vp9_subpix_fn_t  sixtap_avg16x16;
+  vp9_subpix_fn_t  sixtap_avg8x8;
+  vp9_subpix_fn_t  sixtap8x4;
+  vp9_subpix_fn_t  sixtap4x4;
+  vp9_subpix_fn_t  sixtap_avg4x4;
+  vp9_subpix_fn_t  bilinear16x16;
+  vp9_subpix_fn_t  bilinear8x8;
+  vp9_subpix_fn_t  bilinear_avg16x16;
+  vp9_subpix_fn_t  bilinear_avg8x8;
+  vp9_subpix_fn_t  bilinear8x4;
+  vp9_subpix_fn_t  bilinear4x4;
+  vp9_subpix_fn_t  bilinear_avg4x4;
+} vp9_subpix_rtcd_vtable_t;
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define SUBPIX_INVOKE(ctx,fn) (ctx)->fn
+#else
+#define SUBPIX_INVOKE(ctx,fn) vp9_subpix_##fn
+#endif
+
+#endif
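
With CONFIG_RUNTIME_CPU_DETECT the SUBPIX_INVOKE macro routes calls through the vtable above; without it, the call collapses at compile time to the vp9_subpix_* default bound by the #ifndef ladder. A usage sketch (the sub-pel offsets here are arbitrary, for illustration only):

    #include "vp9/common/subpixel.h"

    static void subpix_usage_sketch(vp9_subpix_rtcd_vtable_t *subpix,
                                    unsigned char *src, int src_pitch,
                                    unsigned char *dst, int dst_pitch) {
      /* Predict a 4x4 block at sub-pel offset (2, 3) with the 6-tap filter.
       * Without runtime CPU detection, subpix is unused. */
      SUBPIX_INVOKE(subpix, sixtap4x4)(src, src_pitch, 2, 3, dst, dst_pitch);
    }
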
--- /dev/null
+++ b/vp9/common/swapyv12buffer.c
@@ -1,0 +1,32 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "swapyv12buffer.h"
+
+void vp9_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame,
+                          YV12_BUFFER_CONFIG *last_frame) {
+  unsigned char *temp;
+
+  temp = last_frame->buffer_alloc;
+  last_frame->buffer_alloc = new_frame->buffer_alloc;
+  new_frame->buffer_alloc = temp;
+
+  temp = last_frame->y_buffer;
+  last_frame->y_buffer = new_frame->y_buffer;
+  new_frame->y_buffer = temp;
+
+  temp = last_frame->u_buffer;
+  last_frame->u_buffer = new_frame->u_buffer;
+  new_frame->u_buffer = temp;
+
+  temp = last_frame->v_buffer;
+  last_frame->v_buffer = new_frame->v_buffer;
+  new_frame->v_buffer = temp;
+}
--- /dev/null
+++ b/vp9/common/swapyv12buffer.h
@@ -1,0 +1,19 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef __SWAPYV12_BUFFER_H
+#define __SWAPYV12_BUFFER_H
+
+#include "vpx_scale/yv12config.h"
+
+void vp9_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame,
+                          YV12_BUFFER_CONFIG *last_frame);
+
+#endif  // __SWAPYV12_BUFFER_H
--- /dev/null
+++ b/vp9/common/systemdependent.h
@@ -1,0 +1,21 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#if ARCH_X86 || ARCH_X86_64
+void vpx_reset_mmx_state(void);
+#define vp9_clear_system_state() vpx_reset_mmx_state()
+#else
+#define vp9_clear_system_state()
+#endif
+
+struct VP9Common;
+void vp9_machine_specific_config(struct VP9Common *);
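
vp9_clear_system_state exists because the MMX registers alias the x87 floating-point stack: after any MMX kernel runs, the FPU must be reset (via EMMS, wrapped by vpx_reset_mmx_state) before C floating-point code executes. On non-x86 targets the macro compiles away. A usage sketch (the rate computation is invented for illustration):

    #include "vp9/common/systemdependent.h"

    /* Compute a floating-point rate after SIMD kernels may have run.
     * The reset is a no-op on builds without x86 MMX. */
    static double bits_per_frame_sketch(double total_bits, double frames) {
      vp9_clear_system_state();
      return total_bits / frames;
    }
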
--- /dev/null
+++ b/vp9/common/tapify.py
@@ -1,0 +1,106 @@
+"""
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+"""
+#!/usr/bin/env python
+import sys,string,os,re,math,numpy
+scale = 2**16
+def dist(p1,p2):
+  x1,y1 = p1
+  x2,y2 = p2
+  if x1==x2 and y1==y2 :
+    return 1.0 
+  return 1/ math.sqrt((x1-x2)*(x1-x2)+(y1-y2)*(y1-y2))
+
+def gettaps(p):
+  def l(b):
+    return int(math.floor(b))
+  def h(b):
+    return int(math.ceil(b))
+  def t(b,p,s):
+    return int((scale*dist(b,p)+s/2)/s)
+  r,c = p
+  ul=[l(r),l(c)]
+  ur=[l(r),h(c)]
+  ll=[h(r),l(c)]
+  lr=[h(r),h(c)]
+  sum = dist(ul,p)+dist(ur,p)+dist(ll,p)+dist(lr,p)
+  t4 = scale - t(ul,p,sum) - t(ur,p,sum) - t(ll,p,sum);
+  return [[ul,t(ul,p,sum)],[ur,t(ur,p,sum)],
+          [ll,t(ll,p,sum)],[lr,t4]]
+
+def print_mb_taps(angle,blocksize):
+  theta = angle / 57.2957795  # degrees to radians
+  affine = [[math.cos(theta),-math.sin(theta)],
+            [math.sin(theta),math.cos(theta)]]
+  radius = (float(blocksize)-1)/2
+  print " // angle of",angle,"degrees"
+  for y in range(blocksize) :
+    for x in range(blocksize) :
+      r,c = numpy.dot(affine,[y-radius, x-radius])
+      tps = gettaps([r+radius,c+radius])
+      for t in tps :
+        p,t = t
+        tr,tc = p
+        print " %2d, %2d, %5d, " % (tr,tc,t,),
+      print " // %2d,%2d " % (y,x)
+
+i=float(sys.argv[1])
+while  i <= float(sys.argv[2]) :
+  print_mb_taps(i,float(sys.argv[4]))
+  i=i+float(sys.argv[3])
+"""
+
+taps = []
+pt=dict()
+ptr=dict()
+for y in range(16) :
+  for x in range(16) :
+    r,c = numpy.dot(affine,[y-7.5, x-7.5])
+    tps = gettaps([r+7.5,c+7.5])
+    j=0
+    for tp in tps : 
+      p,i = tp
+      r,c = p
+      pt[y,x,j]= [p,i]
+      try: 
+        ptr[r,j,c].append([y,x])
+      except:
+        ptr[r,j,c]=[[y,x]]
+      j = j+1 
+
+for key in sorted(pt.keys()) :
+  print key,pt[key]
+
+lr = -99
+lj = -99 
+lc = 0
+
+shuf=""
+mask=""
+for r,j,c in sorted(ptr.keys()) :
+  for y,x in ptr[r,j,c] :
+    if lr != r or lj != j :
+      print "shuf_"+str(lr)+"_"+str(lj)+"_"+shuf.ljust(16,"0"), lc
+      shuf=""
+      lc = 0
+    for i in range(lc,c-1) :
+      shuf = shuf +"0"
+    shuf = shuf + hex(x)[2]
+    lc =c
+    break
+  lr = r
+  lj = j
+#  print r,j,c,ptr[r,j,c]    
+#  print 
+
+for r,j,c in sorted(ptr.keys()) :
+  for y,x in ptr[r,j,c] :
+    print r,j,c,y,x 
+    break
+"""
--- /dev/null
+++ b/vp9/common/textblit.c
@@ -1,0 +1,116 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+
+void vp9_blit_text(const char *msg, unsigned char *address, const int pitch) {
+  int letter_bitmap;
+  unsigned char *output_pos = address;
+  int colpos;
+  const int font[] = {
+    0x0, 0x5C00, 0x8020, 0xAFABEA, 0xD7EC0, 0x1111111, 0x1855740, 0x18000,
+    0x45C0, 0x74400, 0x51140, 0x23880, 0xC4000, 0x21080, 0x80000, 0x111110,
+    0xE9D72E, 0x87E40, 0x12AD732, 0xAAD62A, 0x4F94C4, 0x4D6B7, 0x456AA,
+    0x3E8423, 0xAAD6AA, 0xAAD6A2, 0x2800, 0x2A00, 0x8A880, 0x52940, 0x22A20,
+    0x15422, 0x6AD62E, 0x1E4A53E, 0xAAD6BF, 0x8C62E, 0xE8C63F, 0x118D6BF,
+    0x1094BF, 0xCAC62E, 0x1F2109F, 0x118FE31, 0xF8C628, 0x8A89F, 0x108421F,
+    0x1F1105F, 0x1F4105F, 0xE8C62E, 0x2294BF, 0x164C62E, 0x12694BF, 0x8AD6A2,
+    0x10FC21, 0x1F8421F, 0x744107, 0xF8220F, 0x1151151, 0x117041, 0x119D731,
+    0x47E0, 0x1041041, 0xFC400, 0x10440, 0x1084210, 0x820
+  };
+  colpos = 0;
+
+  while (msg[colpos] != 0) {
+    char letter = msg[colpos];
+    int fontcol, fontrow;
+
+    if (letter <= 'Z' && letter >= ' ')
+      letter_bitmap = font[letter - ' '];
+    else if (letter <= 'z' && letter >= 'a')
+      letter_bitmap = font[letter - 'a' + 'A' - ' '];
+    else
+      letter_bitmap = font[0];
+
+    for (fontcol = 6; fontcol >= 0; fontcol--)
+      for (fontrow = 0; fontrow < 5; fontrow++)
+        output_pos[fontrow * pitch + fontcol] =
+          ((letter_bitmap >> (fontcol * 5)) & (1 << fontrow) ? 255 : 0);
+
+    output_pos += 7;
+    colpos++;
+  }
+}
+
+static void plot(const int x, const int y, unsigned char *image, const int pitch) {
+  image [x + y * pitch] ^= 255;
+}
+
+/* Bresenham line algorithm */
+void vp9_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, const int pitch) {
+  int steep = abs(y1 - y0) > abs(x1 - x0);
+  int deltax, deltay;
+  int error, ystep, y, x;
+
+  if (steep) {
+    int t;
+    t = x0;
+    x0 = y0;
+    y0 = t;
+
+    t = x1;
+    x1 = y1;
+    y1 = t;
+  }
+
+  if (x0 > x1) {
+    int t;
+    t = x0;
+    x0 = x1;
+    x1 = t;
+
+    t = y0;
+    y0 = y1;
+    y1 = t;
+  }
+
+  deltax = x1 - x0;
+  deltay = abs(y1 - y0);
+  error  = deltax / 2;
+
+  y = y0;
+
+  if (y0 < y1)
+    ystep = 1;
+  else
+    ystep = -1;
+
+  if (steep) {
+    for (x = x0; x <= x1; x++) {
+      plot(y, x, image, pitch);
+
+      error = error - deltay;
+      if (error < 0) {
+        y = y + ystep;
+        error = error + deltax;
+      }
+    }
+  } else {
+    for (x = x0; x <= x1; x++) {
+      plot(x, y, image, pitch);
+
+      error = error - deltay;
+      if (error < 0) {
+        y = y + ystep;
+        error = error + deltax;
+      }
+    }
+  }
+}
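
Both helpers are debug-overlay utilities: vp9_blit_text stamps glyphs from a packed integer font table, and vp9_blit_line XOR-plots with Bresenham's algorithm, so drawing the same line twice erases it. Note the argument order is x0, x1, y0, y1. An illustrative caller (not from this patch):

    extern void vp9_blit_line(int x0, int x1, int y0, int y1,
                              unsigned char *image, const int pitch);

    /* Draw a motion vector from a block centre onto a debug luma plane. */
    static void draw_mv_sketch(unsigned char *y_plane, int y_stride,
                               int cx, int cy, int mv_x, int mv_y) {
      vp9_blit_line(cx, cx + mv_x, cy, cy + mv_y, y_plane, y_stride);
    }
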
--- /dev/null
+++ b/vp9/common/treecoder.c
@@ -1,0 +1,138 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+
+#if defined(CONFIG_DEBUG) && CONFIG_DEBUG
+#include <assert.h>
+#endif
+#include <stdio.h>
+
+#include "treecoder.h"
+
+static void tree2tok(
+  struct vp9_token_struct *const p,
+  vp9_tree t,
+  int i,
+  int v,
+  int L
+) {
+  v += v;
+  ++L;
+
+  do {
+    const vp9_tree_index j = t[i++];
+
+    if (j <= 0) {
+      p[-j].value = v;
+      p[-j].Len = L;
+    } else
+      tree2tok(p, t, j, v, L);
+  } while (++v & 1);
+}
+
+void vp9_tokens_from_tree(struct vp9_token_struct *p, vp9_tree t) {
+  tree2tok(p, t, 0, 0, 0);
+}
+
+void vp9_tokens_from_tree_offset(struct vp9_token_struct *p, vp9_tree t,
+                                 int offset) {
+  tree2tok(p - offset, t, 0, 0, 0);
+}
+
+static void branch_counts(
+  int n,                      /* n = size of alphabet */
+  vp9_token tok               [ /* n */ ],
+  vp9_tree tree,
+  unsigned int branch_ct       [ /* n-1 */ ] [2],
+  const unsigned int num_events[ /* n */ ]
+) {
+  const int tree_len = n - 1;
+  int t = 0;
+
+#if CONFIG_DEBUG
+  assert(tree_len);
+#endif
+
+  do {
+    branch_ct[t][0] = branch_ct[t][1] = 0;
+  } while (++t < tree_len);
+
+  t = 0;
+
+  do {
+    int L = tok[t].Len;
+    const int enc = tok[t].value;
+    const unsigned int ct = num_events[t];
+
+    vp9_tree_index i = 0;
+
+    do {
+      const int b = (enc >> --L) & 1;
+      const int j = i >> 1;
+#if CONFIG_DEBUG
+      assert(j < tree_len  &&  0 <= L);
+#endif
+
+      branch_ct [j] [b] += ct;
+      i = tree[ i + b];
+    } while (i > 0);
+
+#if CONFIG_DEBUG
+    assert(!L);
+#endif
+  } while (++t < n);
+
+}
+
+
+void vp9_tree_probs_from_distribution(
+  int n,                      /* n = size of alphabet */
+  vp9_token tok               [ /* n */ ],
+  vp9_tree tree,
+  vp9_prob probs          [ /* n-1 */ ],
+  unsigned int branch_ct       [ /* n-1 */ ] [2],
+  const unsigned int num_events[ /* n */ ],
+  unsigned int Pfac,
+  int rd
+) {
+  const int tree_len = n - 1;
+  int t = 0;
+
+  branch_counts(n, tok, tree, branch_ct, num_events);
+
+  do {
+    const unsigned int *const c = branch_ct[t];
+    const unsigned int tot = c[0] + c[1];
+
+#if CONFIG_DEBUG
+    assert(tot < (1 << 24));        /* no overflow below */
+#endif
+
+    if (tot) {
+      const unsigned int p = ((c[0] * Pfac) + (rd ? tot >> 1 : 0)) / tot;
+      probs[t] = p < 256 ? (p ? p : 1) : 255; /* agree w/old version for now */
+    } else
+      probs[t] = vp9_prob_half;
+  } while (++t < tree_len);
+}
+
+vp9_prob vp9_bin_prob_from_distribution(const unsigned int counts[2]) {
+  int tot_count = counts[0] + counts[1];
+  vp9_prob prob;
+  if (tot_count) {
+    prob = (counts[0] * 255 + (tot_count >> 1)) / tot_count;
+    prob += !prob;
+  } else {
+    prob = 128;
+  }
+  return prob;
+}
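
A worked example of the compact tree representation described in treecoder.h (the three-symbol tree below is invented for illustration): each pair of array entries is a node, non-positive entries are leaves storing the negated symbol value, and positive entries index the next node.

    #include <stdio.h>
    #include "vp9/common/treecoder.h"

    /* Toy alphabet {0, 1, 2}: bit 0 -> symbol 0; bits 10 -> symbol 1;
     * bits 11 -> symbol 2.  Leaves are stored as -value (<= 0). */
    static const vp9_tree_index toy_tree[4] = { 0, 2, -1, -2 };

    int main(void) {
      struct vp9_token_struct tok[3];
      int i;
      vp9_tokens_from_tree(tok, toy_tree);
      for (i = 0; i < 3; i++)
        printf("symbol %d -> value %d, length %d\n",
               i, tok[i].value, tok[i].Len);
      /* Prints (value, length): (0, 1), (2, 2), (3, 2). */
      return 0;
    }
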
--- /dev/null
+++ b/vp9/common/treecoder.h
@@ -1,0 +1,75 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_TREECODER_H
+#define __INC_TREECODER_H
+
+typedef unsigned char vp9_prob;
+
+#define vp9_prob_half ( (vp9_prob) 128)
+
+typedef signed char vp9_tree_index;
+struct bool_coder_spec;
+
+typedef struct bool_coder_spec bool_coder_spec;
+typedef struct bool_writer bool_writer;
+typedef struct bool_reader bool_reader;
+
+typedef const bool_coder_spec c_bool_coder_spec;
+typedef const bool_writer c_bool_writer;
+typedef const bool_reader c_bool_reader;
+
+
+
+#define vp9_complement(x) (255 - (x))
+
+
+/* We build coding trees compactly in arrays.
+   Each node of the tree is a pair of vp9_tree_indices.
+   Array index often references a corresponding probability table.
+   Index <= 0 means done encoding/decoding and value = -Index,
+   Index > 0 means need another bit, specification at index.
+   Nonnegative indices are always even;  processing begins at node 0. */
+
+typedef const vp9_tree_index vp9_tree[], *vp9_tree_p;
+
+
+typedef const struct vp9_token_struct {
+  int value;
+  int Len;
+} vp9_token;
+
+/* Construct encoding array from tree. */
+
+void vp9_tokens_from_tree(struct vp9_token_struct *, vp9_tree);
+void vp9_tokens_from_tree_offset(struct vp9_token_struct *, vp9_tree,
+                                 int offset);
+
+
+/* Convert array of token occurrence counts into a table of probabilities
+   for the associated binary encoding tree.  Also writes count of branches
+   taken for each node on the tree; this facilitates decisions as to
+   probability updates. */
+
+void vp9_tree_probs_from_distribution(
+  int n,                      /* n = size of alphabet */
+  vp9_token tok               [ /* n */ ],
+  vp9_tree tree,
+  vp9_prob probs          [ /* n-1 */ ],
+  unsigned int branch_ct       [ /* n-1 */ ] [2],
+  const unsigned int num_events[ /* n */ ],
+  unsigned int Pfactor,
+  int Round
+);
+
+vp9_prob vp9_bin_prob_from_distribution(const unsigned int counts[2]);
+
+#endif
--- /dev/null
+++ b/vp9/common/type_aliases.h
@@ -1,0 +1,120 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+*   Module Title :     type_aliases.h
+*
+*   Description  :     Standard type aliases
+*
+****************************************************************************/
+#ifndef __INC_TYPE_ALIASES_H
+#define __INC_TYPE_ALIASES_H
+
+/****************************************************************************
+* Macros
+****************************************************************************/
+#define EXPORT
+#define IMPORT          extern      /* Used to declare imported data & routines */
+#define PRIVATE         static      /* Used to declare & define module-local data */
+#define LOCAL           static      /* Used to define all persistent routine-local data */
+#define STD_IN_PATH     0           /* Standard input path */
+#define STD_OUT_PATH    1           /* Standard output path */
+#define STD_ERR_PATH    2           /* Standard error path */
+#define STD_IN_FILE     stdin       /* Standard input file pointer */
+#define STD_OUT_FILE    stdout      /* Standard output file pointer */
+#define STD_ERR_FILE    stderr      /* Standard error file pointer */
+#define max_int         0x7FFFFFFF
+
+#define __export
+#define _export
+
+#define CCONV
+
+#ifndef NULL
+#ifdef __cplusplus
+#define NULL    0
+#else
+#define NULL    ((void *)0)
+#endif
+#endif
+
+#ifndef FALSE
+#define FALSE   0
+#endif
+
+#ifndef TRUE
+#define TRUE    1
+#endif
+
+/****************************************************************************
+* Typedefs
+****************************************************************************/
+#ifndef TYPE_INT8
+#define TYPE_INT8
+typedef signed char     INT8;
+#endif
+
+#ifndef TYPE_INT16
+/*#define TYPE_INT16*/
+typedef signed short    INT16;
+#endif
+
+#ifndef TYPE_INT32
+/*#define TYPE_INT32*/
+typedef signed int      INT32;
+#endif
+
+#ifndef TYPE_UINT8
+/*#define TYPE_UINT8*/
+typedef unsigned char   UINT8;
+#endif
+
+#ifndef TYPE_UINT32
+/*#define TYPE_UINT32*/
+typedef unsigned int    UINT32;
+#endif
+
+#ifndef TYPE_UINT16
+/*#define TYPE_UINT16*/
+typedef unsigned short  UINT16;
+#endif
+
+#ifndef TYPE_BOOL
+/*#define TYPE_BOOL*/
+typedef int             BOOL;
+#endif
+
+typedef unsigned char   BOOLEAN;
+
+#ifdef _MSC_VER
+typedef __int64 INT64;
+#ifndef INT64_MAX
+#define INT64_MAX LLONG_MAX
+#endif
+#else
+
+#ifndef TYPE_INT64
+#ifdef _TMS320C6X
+/* for now we only have 40bits */
+typedef long INT64;
+#else
+typedef long long INT64;
+#endif
+#endif
+
+#endif
+
+/* Floating point */
+typedef  double         FLOAT64;
+typedef  float          FLOAT32;
+
+#endif
--- /dev/null
+++ b/vp9/common/x86/filter_sse2.c
@@ -1,0 +1,289 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h> // for alignment checks
+#include <emmintrin.h> // SSE2
+#include "vp9/common/filter.h"
+#include "vpx_ports/mem.h" // for DECLARE_ALIGNED
+#include "vpx_rtcd.h"
+
+// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is
+//           just a quick partial snapshot so that others can already use some
+//           of the speedup.
+// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap
+//           filtering.
+// TODO(cd): Add some comments, better variable naming.
+// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coefficients (no sum
+//           of positive above 128), or have higher precision filter
+//           coefficients.
+
+DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = {
+  VP9_FILTER_WEIGHT >> 1,
+  VP9_FILTER_WEIGHT >> 1,
+  VP9_FILTER_WEIGHT >> 1,
+  VP9_FILTER_WEIGHT >> 1,
+};
+
+// Creating a macro to do more than four pixels at once to hide instruction
+// latency is actually slower :-(
+#define DO_FOUR_PIXELS(result, src_ptr, offset)                                \
+  {                                                                            \
+  /* Do shifted loads to achieve the required shuffles through unpacking */    \
+  const __m128i src0  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 0)); \
+  const __m128i src1  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 1)); \
+  const __m128i src2  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 2)); \
+  const __m128i src3  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 3)); \
+  const __m128i src01 = _mm_unpacklo_epi8(src0, src1);                         \
+  const __m128i src01_16 = _mm_unpacklo_epi8(src01, zero);                     \
+  const __m128i src23 = _mm_unpacklo_epi8(src2, src3);                         \
+  const __m128i src23_16 = _mm_unpacklo_epi8(src23, zero);                     \
+  /* Shift by 4 bytes through shuffle to get additional shifted loads */       \
+  const __m128i src4  = _mm_shuffle_epi32(src0, _MM_SHUFFLE(3, 3, 2, 1));      \
+  const __m128i src5  = _mm_shuffle_epi32(src1, _MM_SHUFFLE(3, 3, 2, 1));      \
+  const __m128i src6  = _mm_shuffle_epi32(src2, _MM_SHUFFLE(3, 3, 2, 1));      \
+  const __m128i src7  = _mm_shuffle_epi32(src3, _MM_SHUFFLE(3, 3, 2, 1));      \
+  const __m128i src45 = _mm_unpacklo_epi8(src4, src5);                         \
+  const __m128i src45_16 = _mm_unpacklo_epi8(src45, zero);                     \
+  const __m128i src67 = _mm_unpacklo_epi8(src6, src7);                         \
+  const __m128i src67_16 = _mm_unpacklo_epi8(src67, zero);                     \
+  /* multiply accumulate them */                                               \
+  const __m128i mad01 = _mm_madd_epi16(src01_16, fil01);                       \
+  const __m128i mad23 = _mm_madd_epi16(src23_16, fil23);                       \
+  const __m128i mad45 = _mm_madd_epi16(src45_16, fil45);                       \
+  const __m128i mad67 = _mm_madd_epi16(src67_16, fil67);                       \
+  const __m128i mad0123 = _mm_add_epi32(mad01, mad23);                         \
+  const __m128i mad4567 = _mm_add_epi32(mad45, mad67);                         \
+  __m128i mad_all = _mm_add_epi32(mad0123, mad4567);                           \
+  mad_all = _mm_add_epi32(mad_all, rounding);                                  \
+  result = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);                          \
+  }
+
+void vp9_filter_block2d_4x4_8_sse2
+(
+ const unsigned char *src_ptr, const unsigned int src_stride,
+ const short *HFilter_aligned16, const short *VFilter_aligned16,
+ unsigned char *dst_ptr, unsigned int dst_stride
+) {
+  __m128i intermediateA, intermediateB, intermediateC;
+
+  const int kInterp_Extend = 4;
+
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c);
+
+  // check alignment
+  assert(0 == ((long)HFilter_aligned16)%16);
+  assert(0 == ((long)VFilter_aligned16)%16);
+
+  {
+    __m128i transpose3_0;
+    __m128i transpose3_1;
+    __m128i transpose3_2;
+    __m128i transpose3_3;
+
+    // Horizontal pass (src -> intermediate).
+    {
+      const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16);
+      // get first two columns filter coefficients
+      __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0));
+      __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1));
+      __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2));
+      __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3));
+      src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
+
+      {
+        __m128i mad_all0;
+        __m128i mad_all1;
+        __m128i mad_all2;
+        __m128i mad_all3;
+        DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
+        DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
+        DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
+        DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride)
+        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
+        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
+        intermediateA = _mm_packus_epi16(mad_all0, mad_all2);
+        // --
+        src_ptr += src_stride*4;
+        // --
+        DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
+        DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
+        DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
+        DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride)
+        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
+        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
+        intermediateB = _mm_packus_epi16(mad_all0, mad_all2);
+        // --
+        src_ptr += src_stride*4;
+        // --
+        DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
+        DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
+        DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
+        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
+        mad_all2 = _mm_packs_epi32(mad_all2, mad_all2);
+        intermediateC = _mm_packus_epi16(mad_all0, mad_all2);
+      }
+    }
+
+    // Transpose result (intermediate -> transpose3_x)
+    {
+      // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33
+      // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73
+      // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx
+      const __m128i transpose0_0 = _mm_unpacklo_epi8(intermediateA, intermediateB);
+      const __m128i transpose0_1 = _mm_unpackhi_epi8(intermediateA, intermediateB);
+      const __m128i transpose0_2 = _mm_unpacklo_epi8(intermediateC, intermediateC);
+      const __m128i transpose0_3 = _mm_unpackhi_epi8(intermediateC, intermediateC);
+      // 00 40 01 41 02 42 03 43 10 50 11 51 12 52 13 53
+      // 20 60 21 61 22 62 23 63 30 70 31 71 32 72 33 73
+      // 80 xx 81 xx 82 xx 83 xx 90 xx 91 xx 92 xx 93 xx
+      // A0 xx A1 xx A2 xx A3 xx xx xx xx xx xx xx xx xx
+      const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);
+      const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);
+      const __m128i transpose1_2 = _mm_unpacklo_epi8(transpose0_2, transpose0_3);
+      const __m128i transpose1_3 = _mm_unpackhi_epi8(transpose0_2, transpose0_3);
+      // 00 20 40 60 01 21 41 61 02 22 42 62 03 23 43 63
+      // 10 30 50 70 11 31 51 71 12 32 52 72 13 33 53 73
+      // 80 A0 xx xx 81 A1 xx xx 82 A2 xx xx 83 A3 xx xx
+      // 90 xx xx xx 91 xx xx xx 92 xx xx xx 93 xx xx xx
+      const __m128i transpose2_0 = _mm_unpacklo_epi8(transpose1_0, transpose1_1);
+      const __m128i transpose2_1 = _mm_unpackhi_epi8(transpose1_0, transpose1_1);
+      const __m128i transpose2_2 = _mm_unpacklo_epi8(transpose1_2, transpose1_3);
+      const __m128i transpose2_3 = _mm_unpackhi_epi8(transpose1_2, transpose1_3);
+      // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+      // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+      // 80 90 A0 xx xx xx xx xx 81 91 A1 xx xx xx xx xx
+      // 82 92 A2 xx xx xx xx xx 83 93 A3 xx xx xx xx xx
+      transpose3_0 = _mm_castps_si128(
+                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
+                                           _mm_castsi128_ps(transpose2_2),
+                                           _MM_SHUFFLE(1, 0, 1, 0)));
+      transpose3_1 = _mm_castps_si128(
+                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
+                                           _mm_castsi128_ps(transpose2_2),
+                                           _MM_SHUFFLE(3, 2, 3, 2)));
+      transpose3_2 = _mm_castps_si128(
+                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
+                                           _mm_castsi128_ps(transpose2_3),
+                                           _MM_SHUFFLE(1, 0, 1, 0)));
+      transpose3_3 = _mm_castps_si128(
+                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
+                                           _mm_castsi128_ps(transpose2_3),
+                                           _MM_SHUFFLE(3, 2, 3, 2)));
+      // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx
+      // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx
+      // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx
+      // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx
+    }
+
+    // Vertical pass (transpose3_x -> dst).
+    {
+      const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16);
+      // get first two columns filter coefficients
+      __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0));
+      __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1));
+      __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2));
+      __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3));
+      __m128i col0, col1, col2, col3;
+      DECLARE_ALIGNED(16, unsigned char, temp[32]);
+      {
+        _mm_store_si128((__m128i *)temp, transpose3_0);
+        DO_FOUR_PIXELS(col0, temp, 0);
+      }
+      {
+        _mm_store_si128((__m128i *)temp, transpose3_1);
+        DO_FOUR_PIXELS(col1, temp, 0);
+      }
+      {
+        _mm_store_si128((__m128i *)temp, transpose3_2);
+        DO_FOUR_PIXELS(col2, temp, 0);
+      }
+      {
+        _mm_store_si128((__m128i *)temp, transpose3_3);
+        DO_FOUR_PIXELS(col3, temp, 0);
+      }
+      // transpose
+      {
+        __m128i T0 = _mm_unpacklo_epi32(col0, col1);
+        __m128i T1 = _mm_unpacklo_epi32(col2, col3);
+        __m128i T2 = _mm_unpackhi_epi32(col0, col1);
+        __m128i T3 = _mm_unpackhi_epi32(col2, col3);
+        col0 = _mm_unpacklo_epi64(T0, T1);
+        col1 = _mm_unpackhi_epi64(T0, T1);
+        col2 = _mm_unpacklo_epi64(T2, T3);
+        col3 = _mm_unpackhi_epi64(T2, T3);
+      }
+      // saturate to 8 bit
+      {
+        col0 = _mm_packs_epi32(col0, col0);
+        col0 = _mm_packus_epi16(col0, col0);
+        col1 = _mm_packs_epi32(col1, col1);
+        col1 = _mm_packus_epi16(col1, col1);
+        col2 = _mm_packs_epi32 (col2, col2);
+        col2 = _mm_packus_epi16(col2, col2);
+        col3 = _mm_packs_epi32 (col3, col3);
+        col3 = _mm_packus_epi16(col3, col3);
+      }
+      // store
+      {
+        *((unsigned int *)&dst_ptr[dst_stride * 0]) = _mm_cvtsi128_si32(col0);
+        *((unsigned int *)&dst_ptr[dst_stride * 1]) = _mm_cvtsi128_si32(col1);
+        *((unsigned int *)&dst_ptr[dst_stride * 2]) = _mm_cvtsi128_si32(col2);
+        *((unsigned int *)&dst_ptr[dst_stride * 3]) = _mm_cvtsi128_si32(col3);
+      }
+    }
+  }
+}
+
+void vp9_filter_block2d_8x4_8_sse2
+(
+ const unsigned char *src_ptr, const unsigned int src_stride,
+ const short *HFilter_aligned16, const short *VFilter_aligned16,
+ unsigned char *dst_ptr, unsigned int dst_stride
+) {
+  int j;
+  for (j = 0; j < 8; j += 4) {
+    vp9_filter_block2d_4x4_8_sse2(src_ptr + j, src_stride,
+                                  HFilter_aligned16, VFilter_aligned16,
+                                  dst_ptr + j, dst_stride);
+  }
+}
+
+void vp9_filter_block2d_8x8_8_sse2
+(
+ const unsigned char *src_ptr, const unsigned int src_stride,
+ const short *HFilter_aligned16, const short *VFilter_aligned16,
+ unsigned char *dst_ptr, unsigned int dst_stride
+) {
+  int i, j;
+  for (i = 0; i < 8; i += 4) {
+    for (j = 0; j < 8; j += 4) {
+      vp9_filter_block2d_4x4_8_sse2(src_ptr + j + i * src_stride, src_stride,
+                                    HFilter_aligned16, VFilter_aligned16,
+                                    dst_ptr + j + i * dst_stride, dst_stride);
+    }
+  }
+}
+
+void vp9_filter_block2d_16x16_8_sse2
+(
+ const unsigned char *src_ptr, const unsigned int src_stride,
+ const short *HFilter_aligned16, const short *VFilter_aligned16,
+ unsigned char *dst_ptr, unsigned int dst_stride
+) {
+  int i, j;
+  for (i = 0; i < 16; i += 4) {
+    for (j = 0; j < 16; j += 4) {
+      vp9_filter_block2d_4x4_8_sse2(src_ptr + j + i * src_stride, src_stride,
+                                    HFilter_aligned16, VFilter_aligned16,
+                                    dst_ptr + j + i * dst_stride, dst_stride);
+    }
+  }
+}
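+
+// For reference, a minimal scalar sketch of what the 4x4 kernels above
+// compute: a separable 8-tap filter, run horizontally into an 11-row
+// intermediate (4 + 2 * kInterp_Extend - 1 rows) and then vertically.
+// Illustrative only and never called; it assumes VP9_FILTER_WEIGHT and
+// VP9_FILTER_SHIFT from "vp9/common/filter.h" are in scope here, as they
+// are in filter_sse4.c, and the function name is hypothetical.
+static void filter_block2d_4x4_8_c_sketch(
+    const unsigned char *src_ptr, const unsigned int src_stride,
+    const short *HFilter, const short *VFilter,
+    unsigned char *dst_ptr, unsigned int dst_stride) {
+  const int kInterp_Extend = 4;
+  int intermediate[4 + 7][4];  // 11 rows, saturated to 8 bits like packus
+  int i, j, k;
+  // Step back (kInterp_Extend - 1) rows and columns, as the SIMD code does.
+  src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
+  // Horizontal pass.
+  for (i = 0; i < 4 + 7; i++) {
+    for (j = 0; j < 4; j++) {
+      int sum = VP9_FILTER_WEIGHT >> 1;  // rounding
+      for (k = 0; k < 8; k++)
+        sum += src_ptr[i * src_stride + j + k] * HFilter[k];
+      sum >>= VP9_FILTER_SHIFT;
+      intermediate[i][j] = sum < 0 ? 0 : (sum > 255 ? 255 : sum);
+    }
+  }
+  // Vertical pass over the intermediate columns.
+  for (i = 0; i < 4; i++) {
+    for (j = 0; j < 4; j++) {
+      int sum = VP9_FILTER_WEIGHT >> 1;
+      for (k = 0; k < 8; k++)
+        sum += intermediate[i + k][j] * VFilter[k];
+      sum >>= VP9_FILTER_SHIFT;
+      dst_ptr[i * dst_stride + j] =
+          (unsigned char)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
+    }
+  }
+}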
--- /dev/null
+++ b/vp9/common/x86/filter_sse4.c
@@ -1,0 +1,362 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h> // for alignment checks
+#include <smmintrin.h> // SSE4.1
+#include <stdint.h> // uintptr_t, for portable alignment checks below
+#include "vp9/common/filter.h"
+#include "vpx_ports/mem.h" // for DECLARE_ALIGNED
+#include "vpx_rtcd.h"
+
+// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is
+//           just a quick partial snapshot so that others can already use some
+//           of the speedup.
+// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap
+//           filtering.
+// TODO(cd): Reduce source size by using macros instead of current code
+//           duplication.
+// TODO(cd): Add some comments, better variable naming.
+// TODO(cd): Maybe use _mm_maddubs_epi16 if the filter coefficients are small
+//           enough (no sum of positive coefficients above 128), or use higher
+//           precision filter coefficients.
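+// Note on the maddubs idea above: _mm_maddubs_epi16 multiplies unsigned
+// bytes by signed bytes and adds adjacent products with signed saturation,
+// so with 255-valued pixels the positive taps of a pair may sum to at most
+// 128 (255 * 128 = 32640 <= 32767), hence the restriction mentioned above.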
+
+DECLARE_ALIGNED(16, static const unsigned char, mask0123_c[16]) = {
+  0x00, 0x01,
+  0x01, 0x02,
+  0x02, 0x03,
+  0x03, 0x04,
+  0x02, 0x03,
+  0x03, 0x04,
+  0x04, 0x05,
+  0x05, 0x06,
+};
+DECLARE_ALIGNED(16, static const unsigned char, mask4567_c[16]) = {
+  0x04, 0x05,
+  0x05, 0x06,
+  0x06, 0x07,
+  0x07, 0x08,
+  0x06, 0x07,
+  0x07, 0x08,
+  0x08, 0x09,
+  0x09, 0x0A,
+};
+DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = {
+  VP9_FILTER_WEIGHT >> 1,
+  VP9_FILTER_WEIGHT >> 1,
+  VP9_FILTER_WEIGHT >> 1,
+  VP9_FILTER_WEIGHT >> 1,
+};
+DECLARE_ALIGNED(16, static const unsigned char, transpose_c[16]) = {
+  0, 4,  8, 12,
+  1, 5,  9, 13,
+  2, 6, 10, 14,
+  3, 7, 11, 15
+};
+
+// Creating a macro to do more than four pixels at once to hide instruction
+// latency is actually slower :-(
+#define DO_FOUR_PIXELS(result, offset)                                         \
+  {                                                                            \
+  /* load pixels */                                                            \
+  __m128i src  = _mm_loadu_si128((const __m128i *)(src_ptr + offset));         \
+  /* extract the ones used for first column */                                 \
+  __m128i src0123 = _mm_shuffle_epi8(src, mask0123);                           \
+  __m128i src4567 = _mm_shuffle_epi8(src, mask4567);                           \
+  __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);                         \
+  __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);                         \
+  __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);                         \
+  __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);                         \
+  /* multiply accumulate them */                                               \
+  __m128i mad01 = _mm_madd_epi16(src01_16, fil01);                             \
+  __m128i mad23 = _mm_madd_epi16(src23_16, fil23);                             \
+  __m128i mad45 = _mm_madd_epi16(src45_16, fil45);                             \
+  __m128i mad67 = _mm_madd_epi16(src67_16, fil67);                             \
+  __m128i mad0123 = _mm_add_epi32(mad01, mad23);                               \
+  __m128i mad4567 = _mm_add_epi32(mad45, mad67);                               \
+  __m128i mad_all = _mm_add_epi32(mad0123, mad4567);                           \
+  mad_all = _mm_add_epi32(mad_all, rounding);                                  \
+  result = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);                          \
+  }
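+// In DO_FOUR_PIXELS, each filNM register holds one pair of adjacent filter
+// taps broadcast four times, so each _mm_madd_epi16 (pairwise 16x16->32
+// multiply-add) yields, in 32-bit lane j, src[j+N]*fil[N] + src[j+M]*fil[M];
+// summing the four madNM registers gives the full 8-tap dot product for four
+// adjacent output pixels at once.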
+
+void vp9_filter_block2d_4x4_8_sse4_1
+(
+ const unsigned char *src_ptr, const unsigned int src_stride,
+ const short *HFilter_aligned16, const short *VFilter_aligned16,
+ unsigned char *dst_ptr, unsigned int dst_stride
+) {
+  __m128i intermediateA, intermediateB, intermediateC;
+
+  const int kInterp_Extend = 4;
+
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i mask0123 = _mm_load_si128((const __m128i *)mask0123_c);
+  const __m128i mask4567 = _mm_load_si128((const __m128i *)mask4567_c);
+  const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c);
+  const __m128i transpose = _mm_load_si128((const __m128i *)transpose_c);
+
+  // check alignment
+  assert(0 == ((uintptr_t)HFilter_aligned16) % 16);
+  assert(0 == ((uintptr_t)VFilter_aligned16) % 16);
+
+  {
+    __m128i transpose3_0;
+    __m128i transpose3_1;
+    __m128i transpose3_2;
+    __m128i transpose3_3;
+
+    // Horizontal pass (src -> intermediate).
+    {
+      const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16);
+      // broadcast each pair of filter taps across a register
+      __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0));
+      __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1));
+      __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2));
+      __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3));
+      src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
+
+      {
+        __m128i mad_all0;
+        __m128i mad_all1;
+        __m128i mad_all2;
+        __m128i mad_all3;
+        DO_FOUR_PIXELS(mad_all0, 0*src_stride)
+        DO_FOUR_PIXELS(mad_all1, 1*src_stride)
+        DO_FOUR_PIXELS(mad_all2, 2*src_stride)
+        DO_FOUR_PIXELS(mad_all3, 3*src_stride)
+        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
+        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
+        intermediateA = _mm_packus_epi16(mad_all0, mad_all2);
+        // --
+        src_ptr += src_stride * 4;
+        // --
+        DO_FOUR_PIXELS(mad_all0, 0*src_stride)
+        DO_FOUR_PIXELS(mad_all1, 1*src_stride)
+        DO_FOUR_PIXELS(mad_all2, 2*src_stride)
+        DO_FOUR_PIXELS(mad_all3, 3*src_stride)
+        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
+        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
+        intermediateB = _mm_packus_epi16(mad_all0, mad_all2);
+        // --
+        src_ptr += src_stride * 4;
+        // --
+        DO_FOUR_PIXELS(mad_all0, 0*src_stride)
+        DO_FOUR_PIXELS(mad_all1, 1*src_stride)
+        DO_FOUR_PIXELS(mad_all2, 2*src_stride)
+        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
+        mad_all2 = _mm_packs_epi32(mad_all2, mad_all2);
+        intermediateC = _mm_packus_epi16(mad_all0, mad_all2);
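+        // Only 11 intermediate rows are needed for 4 output rows with an
+        // 8-tap filter (4 + 2 * kInterp_Extend - 1), so the last pack
+        // duplicates row A to fill the register.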
+      }
+    }
+
+    // Transpose result (intermediate -> transpose3_x)
+    {
+      // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33
+      // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73
+      // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx
+      const __m128i transpose1_0 = _mm_shuffle_epi8(intermediateA, transpose);
+      const __m128i transpose1_1 = _mm_shuffle_epi8(intermediateB, transpose);
+      const __m128i transpose1_2 = _mm_shuffle_epi8(intermediateC, transpose);
+      // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+      // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+      // 80 90 A0 xx 81 91 A1 xx 82 92 A2 xx 83 93 A3 xx
+      const __m128i transpose2_0 = _mm_unpacklo_epi32(transpose1_0, transpose1_1);
+      const __m128i transpose2_1 = _mm_unpackhi_epi32(transpose1_0, transpose1_1);
+      // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+      // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+      transpose3_0 = _mm_castps_si128(
+                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
+                                           _mm_castsi128_ps(transpose1_2),
+                                           _MM_SHUFFLE(0, 0, 1, 0)));
+      transpose3_1 = _mm_castps_si128(
+                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
+                                           _mm_castsi128_ps(transpose1_2),
+                                           _MM_SHUFFLE(1, 1, 3, 2)));
+      transpose3_2 = _mm_castps_si128(
+                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
+                                           _mm_castsi128_ps(transpose1_2),
+                                           _MM_SHUFFLE(2, 2, 1, 0)));
+      transpose3_3 = _mm_castps_si128(
+                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
+                                           _mm_castsi128_ps(transpose1_2),
+                                           _MM_SHUFFLE(3, 3, 3, 2)));
+      // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx
+      // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx
+      // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx
+      // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx
+    }
+
+    // Vertical pass (transpose3_x -> dst).
+    {
+      const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16);
+      // broadcast each pair of filter taps across a register
+      __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0));
+      __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1));
+      __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2));
+      __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3));
+      __m128i col0, col1, col2, col3;
+      {
+        // load pixels
+        __m128i src  = transpose3_0;
+        // extract the ones used for first column
+        __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
+        __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
+        __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
+        __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
+        __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
+        __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
+        // multiply accumulate them
+        __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
+        __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
+        __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
+        __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
+        __m128i mad0123 = _mm_add_epi32(mad01, mad23);
+        __m128i mad4567 = _mm_add_epi32(mad45, mad67);
+        __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
+        mad_all = _mm_add_epi32(mad_all, rounding);
+        mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
+        mad_all = _mm_packs_epi32(mad_all, mad_all);
+        col0 = _mm_packus_epi16(mad_all, mad_all);
+      }
+      {
+        // load pixels
+        __m128i src  = transpose3_1;
+        // extract the ones used for first column
+        __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
+        __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
+        __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
+        __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
+        __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
+        __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
+        // multiply accumulate them
+        __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
+        __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
+        __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
+        __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
+        __m128i mad0123 = _mm_add_epi32(mad01, mad23);
+        __m128i mad4567 = _mm_add_epi32(mad45, mad67);
+        __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
+        mad_all = _mm_add_epi32(mad_all, rounding);
+        mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
+        mad_all = _mm_packs_epi32(mad_all, mad_all);
+        col1 = _mm_packus_epi16(mad_all, mad_all);
+      }
+      {
+        // load pixels
+        __m128i src  = transpose3_2;
+        // extract the ones used for first column
+        __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
+        __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
+        __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
+        __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
+        __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
+        __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
+        // multiply accumulate them
+        __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
+        __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
+        __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
+        __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
+        __m128i mad0123 = _mm_add_epi32(mad01, mad23);
+        __m128i mad4567 = _mm_add_epi32(mad45, mad67);
+        __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
+        mad_all = _mm_add_epi32(mad_all, rounding);
+        mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
+        mad_all = _mm_packs_epi32(mad_all, mad_all);
+        col2 = _mm_packus_epi16(mad_all, mad_all);
+      }
+      {
+        // load pixels
+        __m128i src  = transpose3_3;
+        // extract the ones used for first column
+        __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
+        __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
+        __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
+        __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
+        __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
+        __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
+        // multiply accumulate them
+        __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
+        __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
+        __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
+        __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
+        __m128i mad0123 = _mm_add_epi32(mad01, mad23);
+        __m128i mad4567 = _mm_add_epi32(mad45, mad67);
+        __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
+        mad_all = _mm_add_epi32(mad_all, rounding);
+        mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
+        mad_all = _mm_packs_epi32(mad_all, mad_all);
+        col3 = _mm_packus_epi16(mad_all, mad_all);
+      }
+      {
+        __m128i col01 = _mm_unpacklo_epi8(col0, col1);
+        __m128i col23 = _mm_unpacklo_epi8(col2, col3);
+        __m128i col0123 = _mm_unpacklo_epi16(col01, col23);
+        // TODO(cd): look into Ronald's comment:
+        //    Future suggestion: I believe here, too, you can merge the
+        //    packs_epi32() and packus_epi16() for the 4 cols above, so that
+        //    you get the data in a single register, and then use pshufb
+        //    (shuffle_epi8()) instead of the unpacks here. Should be
+        //    2+3+2 instructions faster. (See the sketch after this function.)
+        *((unsigned int *)&dst_ptr[dst_stride * 0]) =
+            _mm_extract_epi32(col0123, 0);
+        *((unsigned int *)&dst_ptr[dst_stride * 1]) =
+            _mm_extract_epi32(col0123, 1);
+        *((unsigned int *)&dst_ptr[dst_stride * 2]) =
+            _mm_extract_epi32(col0123, 2);
+        *((unsigned int *)&dst_ptr[dst_stride * 3]) =
+            _mm_extract_epi32(col0123, 3);
+      }
+    }
+  }
+}
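+
+// A minimal sketch of the merge suggested in the TODO above, assuming
+// col0..col3 are the four 32-bit column results *before* any packing: one
+// packs/packus sequence collects all 16 output bytes in a single register,
+// and the transpose_c pshufb mask defined at the top of this file puts
+// them into store order.  Illustrative only, never called; the function
+// name is hypothetical.
+static void store_4x4_merged_sketch(__m128i col0, __m128i col1,
+                                    __m128i col2, __m128i col3,
+                                    unsigned char *dst_ptr,
+                                    unsigned int dst_stride) {
+  const __m128i transpose = _mm_load_si128((const __m128i *)transpose_c);
+  __m128i c01 = _mm_packs_epi32(col0, col1);    // 8 x int16
+  __m128i c23 = _mm_packs_epi32(col2, col3);
+  __m128i all = _mm_packus_epi16(c01, c23);     // 16 x uint8, column-major
+  all = _mm_shuffle_epi8(all, transpose);       // now row-major
+  *((unsigned int *)&dst_ptr[dst_stride * 0]) = _mm_extract_epi32(all, 0);
+  *((unsigned int *)&dst_ptr[dst_stride * 1]) = _mm_extract_epi32(all, 1);
+  *((unsigned int *)&dst_ptr[dst_stride * 2]) = _mm_extract_epi32(all, 2);
+  *((unsigned int *)&dst_ptr[dst_stride * 3]) = _mm_extract_epi32(all, 3);
+}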
+
+void vp9_filter_block2d_8x4_8_sse4_1
+(
+ const unsigned char *src_ptr, const unsigned int src_stride,
+ const short *HFilter_aligned16, const short *VFilter_aligned16,
+ unsigned char *dst_ptr, unsigned int dst_stride
+) {
+  int j;
+  for (j = 0; j < 8; j += 4) {
+    vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j, src_stride,
+                                    HFilter_aligned16, VFilter_aligned16,
+                                    dst_ptr + j, dst_stride);
+  }
+}
+
+void vp9_filter_block2d_8x8_8_sse4_1
+(
+ const unsigned char *src_ptr, const unsigned int src_stride,
+ const short *HFilter_aligned16, const short *VFilter_aligned16,
+ unsigned char *dst_ptr, unsigned int dst_stride
+) {
+  int i, j;
+  for (i = 0; i < 8; i += 4) {
+    for (j = 0; j < 8; j += 4) {
+      vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j + i * src_stride, src_stride,
+                                      HFilter_aligned16, VFilter_aligned16,
+                                      dst_ptr + j + i * dst_stride, dst_stride);
+    }
+  }
+}
+
+void vp9_filter_block2d_16x16_8_sse4_1
+(
+ const unsigned char *src_ptr, const unsigned int src_stride,
+ const short *HFilter_aligned16, const short *VFilter_aligned16,
+ unsigned char *dst_ptr, unsigned int dst_stride
+) {
+  int i, j;
+  for (i = 0; i < 16; i += 4) {
+    for (j = 0; j < 16; j += 4) {
+      vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j + i * src_stride, src_stride,
+                                      HFilter_aligned16, VFilter_aligned16,
+                                      dst_ptr + j + i * dst_stride, dst_stride);
+    }
+  }
+}
--- /dev/null
+++ b/vp9/common/x86/idct_x86.h
@@ -1,0 +1,64 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef IDCT_X86_H
+#define IDCT_X86_H
+
+/* Note:
+ *
+ * This platform is commonly built for runtime CPU detection. If you modify
+ * any of the function mappings present in this file, be sure to also update
+ * them in the function pointer initialization code.
+ */
+
+#if HAVE_MMX
+extern prototype_idct(vp9_short_idct4x4llm_1_mmx);
+extern prototype_idct(vp9_short_idct4x4llm_mmx);
+extern prototype_idct_scalar_add(vp9_dc_only_idct_add_mmx);
+
+extern prototype_second_order(vp9_short_inv_walsh4x4_mmx);
+extern prototype_second_order(vp9_short_inv_walsh4x4_1_mmx);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp9_idct_idct1
+#define vp9_idct_idct1 vp9_short_idct4x4llm_1_mmx
+
+#undef  vp9_idct_idct16
+#define vp9_idct_idct16 vp9_short_idct4x4llm_mmx
+
+#undef  vp9_idct_idct1_scalar_add
+#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_mmx
+
+#undef vp9_idct_iwalsh16
+#define vp9_idct_iwalsh16 vp9_short_inv_walsh4x4_mmx
+
+#undef vp9_idct_iwalsh1
+#define vp9_idct_iwalsh1 vp9_short_inv_walsh4x4_1_mmx
+
+#endif  /* !CONFIG_RUNTIME_CPU_DETECT */
+#endif  /* HAVE_MMX */
+
+#if HAVE_SSE2
+
+extern prototype_second_order(vp9_short_inv_walsh4x4_sse2);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef vp9_idct_iwalsh16
+#define vp9_idct_iwalsh16 vp9_short_inv_walsh4x4_sse2
+
+#endif  /* !CONFIG_RUNTIME_CPU_DETECT */
+
+#endif  /* HAVE_SSE2 */
+
+#endif  /* IDCT_X86_H */
--- /dev/null
+++ b/vp9/common/x86/idctllm_mmx.asm
@@ -1,0 +1,241 @@
+;
+;  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+align 16
+x_s1sqr2:      times 4 dw 0x8A8C
+align 16
+x_c1sqr2less1: times 4 dw 0x4E7B
+align 16
+pw_16:         times 4 dw 16
+
+SECTION .text
+
+
+; /****************************************************************************
+; * Notes:
+; *
+; * This implementation makes use of 16 bit fixed point version of two multiply
+; * constants:
+; *        1.   sqrt(2) * cos (pi/8)
+; *        2.   sqrt(2) * sin (pi/8)
+; * Because the first constant is bigger than 1, to maintain the same 16 bit
+; * fixed point precision as the second one, we use a trick of
+; *        x * a = x + x*(a-1)
+; * so
+; *        x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
+; *
+; * For the second constant, the 16 bit version is 35468, which is bigger
+; * than 32768; in a signed 16 bit multiply it becomes a negative number, so
+; *        (x * (unsigned)35468) >> 16 = ((x * (signed)35468) >> 16) + x.
+; *
+; **************************************************************************/
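+;
+; Worked numbers, for reference (Q16 derivation of x_s1sqr2 and
+; x_c1sqr2less1 above):
+;   sqrt(2) * cos(pi/8) - 1 = 0.30656...,  0.30656 * 65536 ~= 20091 = 0x4E7B
+;   sqrt(2) * sin(pi/8)     = 0.54120...,  0.54120 * 65536 ~= 35468 = 0x8A8C
+; pmulhw keeps the high 16 bits of the product, i.e. (x * c) >> 16.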
+
+INIT_MMX
+
+;void short_idct4x4llm_mmx(short *input, short *output, int pitch)
+cglobal short_idct4x4llm_mmx, 3,3,0, inp, out, pit
+    mova            m0,     [inpq +0]
+    mova            m1,     [inpq +8]
+
+    mova            m2,     [inpq+16]
+    mova            m3,     [inpq+24]
+
+    psubw           m0,      m2             ; b1 = 0-2
+    paddw           m2,      m2             ;
+
+    mova            m5,      m1
+    paddw           m2,      m0             ; a1 = 0+2
+
+    pmulhw          m5,     [x_s1sqr2]       ;
+    paddw           m5,      m1             ; ip1 * sin(pi/8) * sqrt(2)
+
+    mova            m7,      m3             ;
+    pmulhw          m7,     [x_c1sqr2less1]   ;
+
+    paddw           m7,      m3             ; ip3 * cos(pi/8) * sqrt(2)
+    psubw           m7,      m5             ; c1
+
+    mova            m5,      m1
+    mova            m4,      m3
+
+    pmulhw          m5,     [x_c1sqr2less1]
+    paddw           m5,      m1
+
+    pmulhw          m3,     [x_s1sqr2]
+    paddw           m3,      m4
+
+    paddw           m3,      m5             ; d1
+    mova            m6,      m2             ; a1
+
+    mova            m4,      m0             ; b1
+    paddw           m2,      m3             ;0
+
+    paddw           m4,      m7             ;1
+    psubw           m0,      m7             ;2
+
+    psubw           m6,      m3             ;3
+
+    mova            m1,      m2             ; 03 02 01 00
+    mova            m3,      m4             ; 23 22 21 20
+
+    punpcklwd       m1,      m0             ; 11 01 10 00
+    punpckhwd       m2,      m0             ; 13 03 12 02
+
+    punpcklwd       m3,      m6             ; 31 21 30 20
+    punpckhwd       m4,      m6             ; 33 23 32 22
+
+    mova            m0,      m1             ; 11 01 10 00
+    mova            m5,      m2             ; 13 03 12 02
+
+    punpckldq       m0,      m3             ; 30 20 10 00
+    punpckhdq       m1,      m3             ; 31 21 11 01
+
+    punpckldq       m2,      m4             ; 32 22 12 02
+    punpckhdq       m5,      m4             ; 33 23 13 03
+
+    mova            m3,      m5             ; 33 23 13 03
+
+    psubw           m0,      m2             ; b1 = 0-2
+    paddw           m2,      m2             ;
+
+    mova            m5,      m1
+    paddw           m2,      m0             ; a1 = 0+2
+
+    pmulhw          m5,     [x_s1sqr2]        ;
+    paddw           m5,      m1             ; ip1 * sin(pi/8) * sqrt(2)
+
+    mova            m7,      m3             ;
+    pmulhw          m7,     [x_c1sqr2less1]   ;
+
+    paddw           m7,      m3             ; ip3 * cos(pi/8) * sqrt(2)
+    psubw           m7,      m5             ; c1
+
+    mova            m5,      m1
+    mova            m4,      m3
+
+    pmulhw          m5,     [x_c1sqr2less1]
+    paddw           m5,      m1
+
+    pmulhw          m3,     [x_s1sqr2]
+    paddw           m3,      m4
+
+    paddw           m3,      m5             ; d1
+    paddw           m0,     [pw_16]
+
+    paddw           m2,     [pw_16]
+    mova            m6,      m2             ; a1
+
+    mova            m4,      m0             ; b1
+    paddw           m2,      m3             ;0
+
+    paddw           m4,      m7             ;1
+    psubw           m0,      m7             ;2
+
+    psubw           m6,      m3             ;3
+    psraw           m2,      5
+
+    psraw           m0,      5
+    psraw           m4,      5
+
+    psraw           m6,      5
+
+    mova            m1,      m2             ; 03 02 01 00
+    mova            m3,      m4             ; 23 22 21 20
+
+    punpcklwd       m1,      m0             ; 11 01 10 00
+    punpckhwd       m2,      m0             ; 13 03 12 02
+
+    punpcklwd       m3,      m6             ; 31 21 30 20
+    punpckhwd       m4,      m6             ; 33 23 32 22
+
+    mova            m0,      m1             ; 11 01 10 00
+    mova            m5,      m2             ; 13 03 12 02
+
+    punpckldq       m0,      m3             ; 30 20 10 00
+    punpckhdq       m1,      m3             ; 31 21 11 01
+
+    punpckldq       m2,      m4             ; 32 22 12 02
+    punpckhdq       m5,      m4             ; 33 23 13 03
+
+    mova        [outq],      m0
+
+    mova   [outq+pitq],      m1
+    mova [outq+pitq*2],      m2
+
+    add           outq,      pitq
+    mova [outq+pitq*2],      m5
+    RET
+
+;void short_idct4x4llm_1_mmx(short *input, short *output, int pitch)
+cglobal short_idct4x4llm_1_mmx,3,3,0,inp,out,pit
+    movh            m0,     [inpq]
+    paddw           m0,     [pw_16]
+    psraw           m0,      5
+    punpcklwd       m0,      m0
+    punpckldq       m0,      m0
+
+    mova        [outq],      m0
+    mova   [outq+pitq],      m0
+
+    mova [outq+pitq*2],      m0
+    add           outq,      pitq
+
+    mova [outq+pitq*2],      m0
+    RET
+
+
+;void dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride)
+cglobal dc_only_idct_add_mmx, 4,5,0,in_dc,pred,dst,pit,stride
+%if ARCH_X86_64
+    movsxd         strideq,      dword stridem
+%else
+    mov            strideq,      stridem
+%endif
+    pxor                m0,      m0
+
+    movh                m5,      in_dcq ; dc
+    paddw               m5,     [pw_16]
+
+    psraw               m5,      5
+
+    punpcklwd           m5,      m5
+    punpckldq           m5,      m5
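+    ; m5 = (dc + 16) >> 5, broadcast to all four words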
+
+    movh                m1,     [predq]
+    punpcklbw           m1,      m0
+    paddsw              m1,      m5
+    packuswb            m1,      m0              ; pack and unpack to saturate
+    movh            [dstq],      m1
+
+    movh                m2,     [predq+pitq]
+    punpcklbw           m2,      m0
+    paddsw              m2,      m5
+    packuswb            m2,      m0              ; pack and unpack to saturate
+    movh    [dstq+strideq],      m2
+
+    movh                m3,     [predq+2*pitq]
+    punpcklbw           m3,      m0
+    paddsw              m3,      m5
+    packuswb            m3,      m0              ; pack and unpack to saturate
+    movh  [dstq+2*strideq],      m3
+
+    add               dstq,      strideq
+    add              predq,      pitq
+    movh                m4,     [predq+2*pitq]
+    punpcklbw           m4,      m0
+    paddsw              m4,      m5
+    packuswb            m4,      m0              ; pack and unpack to saturate
+    movh  [dstq+2*strideq],      m4
+    RET
+
--- /dev/null
+++ b/vp9/common/x86/idctllm_sse2.asm
@@ -1,0 +1,712 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp9_idct_dequant_0_2x_sse2
+; (
+;   short *qcoeff       - 0
+;   short *dequant      - 1
+;   unsigned char *pre  - 2
+;   unsigned char *dst  - 3
+;   int dst_stride      - 4
+;   int blk_stride      - 5
+; )
+
+global sym(vp9_idct_dequant_0_2x_sse2)
+sym(vp9_idct_dequant_0_2x_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    ; end prolog
+
+        mov         rdx,            arg(1) ; dequant
+        mov         rax,            arg(0) ; qcoeff
+
+        movd        xmm4,           [rax]
+        movd        xmm5,           [rdx]
+
+        pinsrw      xmm4,           [rax+32],   4
+        pinsrw      xmm5,           [rdx],      4
+
+        pmullw      xmm4,           xmm5
+
+    ; Zero out xmm5, for use in unpacking
+        pxor        xmm5,           xmm5
+
+    ; clear coeffs
+        movd        [rax],          xmm5
+        movd        [rax+32],       xmm5
+    ; splat the dc results across each 64-bit half (one block's dc per half)
+        pshuflw     xmm4,           xmm4,       00000000b
+        pshufhw     xmm4,           xmm4,       00000000b
+
+        mov         rax,            arg(2) ; pre
+        paddw       xmm4,           [GLOBAL(fours)]
+
+        movsxd      rcx,            dword ptr arg(5) ; blk_stride
+        psraw       xmm4,           3
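+    ; xmm4 = (dc * dequant + 4) >> 3 in every word: the inverse
+    ;   transform result of a dc-only block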
+
+        movq        xmm0,           [rax]
+        movq        xmm1,           [rax+rcx]
+        movq        xmm2,           [rax+2*rcx]
+        lea         rcx,            [3*rcx]
+        movq        xmm3,           [rax+rcx]
+
+        punpcklbw   xmm0,           xmm5
+        punpcklbw   xmm1,           xmm5
+        punpcklbw   xmm2,           xmm5
+        punpcklbw   xmm3,           xmm5
+
+        mov         rax,            arg(3) ; dst
+        movsxd      rdx,            dword ptr arg(4) ; dst_stride
+
+    ; Add to predict buffer
+        paddw       xmm0,           xmm4
+        paddw       xmm1,           xmm4
+        paddw       xmm2,           xmm4
+        paddw       xmm3,           xmm4
+
+    ; pack up before storing
+        packuswb    xmm0,           xmm5
+        packuswb    xmm1,           xmm5
+        packuswb    xmm2,           xmm5
+        packuswb    xmm3,           xmm5
+
+    ; store blocks back out
+        movq        [rax],          xmm0
+        movq        [rax + rdx],    xmm1
+
+        lea         rax,            [rax + 2*rdx]
+
+        movq        [rax],          xmm2
+        movq        [rax + rdx],    xmm3
+
+    ; begin epilog
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
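+;void vp9_idct_dequant_full_2x_sse2
+; (
+;   short *qcoeff       - 0
+;   short *dequant      - 1
+;   unsigned char *pre  - 2
+;   unsigned char *dst  - 3
+;   int dst_stride      - 4
+;   int blk_stride      - 5
+; )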
+global sym(vp9_idct_dequant_full_2x_sse2)
+sym(vp9_idct_dequant_full_2x_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ; full case: both blocks carry more than one nonzero coefficient,
+    ; so the whole qcoeff buffer is loaded and dequantized below
+        mov         rax,            arg(0) ; qcoeff
+        mov         rsi,            arg(2) ; pre
+        mov         rdi,            arg(3) ; dst
+        movsxd      rcx,            dword ptr arg(5) ; blk_stride
+
+    ; Zero out xmm7, for use in unpacking
+        pxor        xmm7,           xmm7
+
+        mov         rdx,            arg(1)  ; dequant
+
+    ; note the transpose of xmm1 and xmm2, necessary for shuffle
+    ;   to spit out sensible data
+        movdqa      xmm0,           [rax]
+        movdqa      xmm2,           [rax+16]
+        movdqa      xmm1,           [rax+32]
+        movdqa      xmm3,           [rax+48]
+
+    ; Clear out coeffs
+        movdqa      [rax],          xmm7
+        movdqa      [rax+16],       xmm7
+        movdqa      [rax+32],       xmm7
+        movdqa      [rax+48],       xmm7
+
+    ; dequantize qcoeff buffer
+        pmullw      xmm0,           [rdx]
+        pmullw      xmm2,           [rdx+16]
+        pmullw      xmm1,           [rdx]
+        pmullw      xmm3,           [rdx+16]
+
+    ; repack so block 0 row x and block 1 row x are together
+        movdqa      xmm4,           xmm0
+        punpckldq   xmm0,           xmm1
+        punpckhdq   xmm4,           xmm1
+
+        pshufd      xmm0,           xmm0,       11011000b
+        pshufd      xmm1,           xmm4,       11011000b
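+    ; 11011000b selects source dwords 0,2,1,3, swapping the two middle
+    ;   dwords left interleaved by the punpck pair above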
+
+        movdqa      xmm4,           xmm2
+        punpckldq   xmm2,           xmm3
+        punpckhdq   xmm4,           xmm3
+
+        pshufd      xmm2,           xmm2,       11011000b
+        pshufd      xmm3,           xmm4,       11011000b
+
+    ; first pass
+        psubw       xmm0,           xmm2        ; b1 = 0-2
+        paddw       xmm2,           xmm2        ;
+
+        movdqa      xmm5,           xmm1
+        paddw       xmm2,           xmm0        ; a1 = 0+2
+
+        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
+        paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)
+
+        movdqa      xmm7,           xmm3
+        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
+
+        paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
+        psubw       xmm7,           xmm5        ; c1
+
+        movdqa      xmm5,           xmm1
+        movdqa      xmm4,           xmm3
+
+        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
+        paddw       xmm5,           xmm1
+
+        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
+        paddw       xmm3,           xmm4
+
+        paddw       xmm3,           xmm5        ; d1
+        movdqa      xmm6,           xmm2        ; a1
+
+        movdqa      xmm4,           xmm0        ; b1
+        paddw       xmm2,           xmm3        ;0
+
+        paddw       xmm4,           xmm7        ;1
+        psubw       xmm0,           xmm7        ;2
+
+        psubw       xmm6,           xmm3        ;3
+
+    ; transpose for the second pass
+        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
+        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
+        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
+
+        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
+        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
+        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
+
+
+        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
+        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
+        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
+
+        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
+        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
+        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
+
+
+        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
+        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
+        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
+
+        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
+        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
+        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
+
+        pshufd      xmm0,           xmm2,       11011000b
+        pshufd      xmm2,           xmm1,       11011000b
+
+        pshufd      xmm1,           xmm5,       11011000b
+        pshufd      xmm3,           xmm7,       11011000b
+
+    ; second pass
+        psubw       xmm0,           xmm2            ; b1 = 0-2
+        paddw       xmm2,           xmm2
+
+        movdqa      xmm5,           xmm1
+        paddw       xmm2,           xmm0            ; a1 = 0+2
+
+        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
+        paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)
+
+        movdqa      xmm7,           xmm3
+        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
+
+        paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)
+        psubw       xmm7,           xmm5            ; c1
+
+        movdqa      xmm5,           xmm1
+        movdqa      xmm4,           xmm3
+
+        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
+        paddw       xmm5,           xmm1
+
+        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
+        paddw       xmm3,           xmm4
+
+        paddw       xmm3,           xmm5            ; d1
+        paddw       xmm0,           [GLOBAL(fours)]
+
+        paddw       xmm2,           [GLOBAL(fours)]
+        movdqa      xmm6,           xmm2            ; a1
+
+        movdqa      xmm4,           xmm0            ; b1
+        paddw       xmm2,           xmm3            ;0
+
+        paddw       xmm4,           xmm7            ;1
+        psubw       xmm0,           xmm7            ;2
+
+        psubw       xmm6,           xmm3            ;3
+        psraw       xmm2,           3
+
+        psraw       xmm0,           3
+        psraw       xmm4,           3
+
+        psraw       xmm6,           3
+
+    ; transpose to save
+        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
+        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
+        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
+
+        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
+        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
+        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
+
+
+        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
+        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
+        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
+
+        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
+        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
+        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
+
+
+        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
+        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
+        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
+
+        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
+        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
+        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
+
+        pshufd      xmm0,           xmm2,       11011000b
+        pshufd      xmm2,           xmm1,       11011000b
+
+        pshufd      xmm1,           xmm5,       11011000b
+        pshufd      xmm3,           xmm7,       11011000b
+
+        pxor        xmm7,           xmm7
+
+    ; Load up predict blocks
+        movq        xmm4,           [rsi]
+        movq        xmm5,           [rsi+rcx]
+
+        punpcklbw   xmm4,           xmm7
+        punpcklbw   xmm5,           xmm7
+
+        paddw       xmm0,           xmm4
+        paddw       xmm1,           xmm5
+
+        movq        xmm4,           [rsi+2*rcx]
+        lea         rcx,            [3*rcx]
+        movq        xmm5,           [rsi+rcx]
+
+        punpcklbw   xmm4,           xmm7
+        punpcklbw   xmm5,           xmm7
+
+        paddw       xmm2,           xmm4
+        paddw       xmm3,           xmm5
+
+.finish:
+
+    ; pack up before storing
+        packuswb    xmm0,           xmm7
+        packuswb    xmm1,           xmm7
+        packuswb    xmm2,           xmm7
+        packuswb    xmm3,           xmm7
+
+    ; Load destination stride before writing out,
+    ;   doesn't need to persist
+        movsxd      rdx,            dword ptr arg(4) ; dst_stride
+
+    ; store blocks back out
+        movq        [rdi],          xmm0
+        movq        [rdi + rdx],    xmm1
+
+        lea         rdi,            [rdi + 2*rdx]
+
+        movq        [rdi],          xmm2
+        movq        [rdi + rdx],    xmm3
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_idct_dequant_dc_0_2x_sse2
+; (
+;   short *qcoeff       - 0
+;   short *dequant      - 1
+;   unsigned char *pre  - 2
+;   unsigned char *dst  - 3
+;   int dst_stride      - 4
+;   short *dc           - 5
+; )
+global sym(vp9_idct_dequant_dc_0_2x_sse2)
+sym(vp9_idct_dequant_dc_0_2x_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ; special case when 2 blocks have 0 or 1 coeffs
+    ; dc is set as first coeff, so no need to load qcoeff
+        mov         rax,            arg(0) ; qcoeff
+        mov         rsi,            arg(2) ; pre
+        mov         rdi,            arg(3) ; dst
+        mov         rdx,            arg(5) ; dc
+
+    ; Zero out xmm5, for use in unpacking
+        pxor        xmm5,           xmm5
+
+    ; load the two dc words (2 x 16 bits = one doubleword)
+        movd        xmm4,           [rdx]
+
+    ; Load up predict blocks
+        movq        xmm0,           [rsi]
+        movq        xmm1,           [rsi+16]
+        movq        xmm2,           [rsi+32]
+        movq        xmm3,           [rsi+48]
+
+    ; Duplicate and expand dc across
+        punpcklwd   xmm4,           xmm4
+        punpckldq   xmm4,           xmm4
+
+    ; Round the dc values and downshift: (dc + 4) >> 3
+        paddw       xmm4,           [GLOBAL(fours)]
+        psraw       xmm4,           3
+
+    ; Predict buffer needs to be expanded from bytes to words
+        punpcklbw   xmm0,           xmm5
+        punpcklbw   xmm1,           xmm5
+        punpcklbw   xmm2,           xmm5
+        punpcklbw   xmm3,           xmm5
+
+    ; Add to predict buffer
+        paddw       xmm0,           xmm4
+        paddw       xmm1,           xmm4
+        paddw       xmm2,           xmm4
+        paddw       xmm3,           xmm4
+
+    ; pack up before storing
+        packuswb    xmm0,           xmm5
+        packuswb    xmm1,           xmm5
+        packuswb    xmm2,           xmm5
+        packuswb    xmm3,           xmm5
+
+    ; Load destination stride before writing out,
+    ;   doesn't need to persist
+        movsxd      rdx,            dword ptr arg(4) ; dst_stride
+
+    ; store blocks back out
+        movq        [rdi],          xmm0
+        movq        [rdi + rdx],    xmm1
+
+        lea         rdi,            [rdi + 2*rdx]
+
+        movq        [rdi],          xmm2
+        movq        [rdi + rdx],    xmm3
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
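+;void vp9_idct_dequant_dc_full_2x_sse2
+; (
+;   short *qcoeff       - 0
+;   short *dequant      - 1
+;   unsigned char *pre  - 2
+;   unsigned char *dst  - 3
+;   int dst_stride      - 4
+;   short *dc           - 5
+; )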
+global sym(vp9_idct_dequant_dc_full_2x_sse2)
+sym(vp9_idct_dequant_dc_full_2x_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ; full case: the whole qcoeff buffer is loaded and dequantized below;
+    ; the dc values arrive separately in arg(5) and are inserted afterwards
+        mov         rax,            arg(0) ; qcoeff
+        mov         rsi,            arg(2) ; pre
+        mov         rdi,            arg(3) ; dst
+
+    ; Zero out xmm7, for use in unpacking
+        pxor        xmm7,           xmm7
+
+        mov         rdx,            arg(1)  ; dequant
+
+    ; note the transpose of xmm1 and xmm2, necessary for shuffle
+    ;   to spit out sensible data
+        movdqa      xmm0,           [rax]
+        movdqa      xmm2,           [rax+16]
+        movdqa      xmm1,           [rax+32]
+        movdqa      xmm3,           [rax+48]
+
+    ; Clear out coeffs
+        movdqa      [rax],          xmm7
+        movdqa      [rax+16],       xmm7
+        movdqa      [rax+32],       xmm7
+        movdqa      [rax+48],       xmm7
+
+    ; dequantize qcoeff buffer
+        pmullw      xmm0,           [rdx]
+        pmullw      xmm2,           [rdx+16]
+        pmullw      xmm1,           [rdx]
+        pmullw      xmm3,           [rdx+16]
+
+    ; DC component
+        mov         rdx,            arg(5)
+
+    ; repack so block 0 row x and block 1 row x are together
+        movdqa      xmm4,           xmm0
+        punpckldq   xmm0,           xmm1
+        punpckhdq   xmm4,           xmm1
+
+        pshufd      xmm0,           xmm0,       11011000b
+        pshufd      xmm1,           xmm4,       11011000b
+
+        movdqa      xmm4,           xmm2
+        punpckldq   xmm2,           xmm3
+        punpckhdq   xmm4,           xmm3
+
+        pshufd      xmm2,           xmm2,       11011000b
+        pshufd      xmm3,           xmm4,       11011000b
+
+    ; insert DC component
+        pinsrw      xmm0,           [rdx],      0
+        pinsrw      xmm0,           [rdx+2],    4
+
+    ; first pass
+        psubw       xmm0,           xmm2        ; b1 = 0-2
+        paddw       xmm2,           xmm2        ;
+
+        movdqa      xmm5,           xmm1
+        paddw       xmm2,           xmm0        ; a1 = 0+2
+
+        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
+        paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)
+
+        movdqa      xmm7,           xmm3
+        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
+
+        paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
+        psubw       xmm7,           xmm5        ; c1
+
+        movdqa      xmm5,           xmm1
+        movdqa      xmm4,           xmm3
+
+        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
+        paddw       xmm5,           xmm1
+
+        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
+        paddw       xmm3,           xmm4
+
+        paddw       xmm3,           xmm5        ; d1
+        movdqa      xmm6,           xmm2        ; a1
+
+        movdqa      xmm4,           xmm0        ; b1
+        paddw       xmm2,           xmm3        ;0
+
+        paddw       xmm4,           xmm7        ;1
+        psubw       xmm0,           xmm7        ;2
+
+        psubw       xmm6,           xmm3        ;3
+
+    ; transpose for the second pass
+        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
+        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
+        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
+
+        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
+        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
+        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
+
+
+        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
+        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
+        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
+
+        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
+        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
+        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
+
+
+        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
+        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
+        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
+
+        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
+        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
+        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
+
+        pshufd      xmm0,           xmm2,       11011000b
+        pshufd      xmm2,           xmm1,       11011000b
+
+        pshufd      xmm1,           xmm5,       11011000b
+        pshufd      xmm3,           xmm7,       11011000b
+
+    ; second pass
+        psubw       xmm0,           xmm2            ; b1 = 0-2
+        paddw       xmm2,           xmm2
+
+        movdqa      xmm5,           xmm1
+        paddw       xmm2,           xmm0            ; a1 = 0+2
+
+        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
+        paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)
+
+        movdqa      xmm7,           xmm3
+        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
+
+        paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)
+        psubw       xmm7,           xmm5            ; c1
+
+        movdqa      xmm5,           xmm1
+        movdqa      xmm4,           xmm3
+
+        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
+        paddw       xmm5,           xmm1
+
+        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
+        paddw       xmm3,           xmm4
+
+        paddw       xmm3,           xmm5            ; d1
+        paddw       xmm0,           [GLOBAL(fours)]
+
+        paddw       xmm2,           [GLOBAL(fours)]
+        movdqa      xmm6,           xmm2            ; a1
+
+        movdqa      xmm4,           xmm0            ; b1
+        paddw       xmm2,           xmm3            ;0
+
+        paddw       xmm4,           xmm7            ;1
+        psubw       xmm0,           xmm7            ;2
+
+        psubw       xmm6,           xmm3            ;3
+        psraw       xmm2,           3
+
+        psraw       xmm0,           3
+        psraw       xmm4,           3
+
+        psraw       xmm6,           3
+
+    ; transpose to save
+        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
+        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
+        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
+
+        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
+        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
+        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
+
+
+        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
+        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
+        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
+
+        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
+        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
+        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
+
+
+        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
+        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
+        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
+
+        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
+        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
+        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
+
+        pshufd      xmm0,           xmm2,       11011000b
+        pshufd      xmm2,           xmm1,       11011000b
+
+        pshufd      xmm1,           xmm5,       11011000b
+        pshufd      xmm3,           xmm7,       11011000b
+
+        pxor        xmm7,           xmm7
+
+    ; Load up predict blocks
+        movq        xmm4,           [rsi]
+        movq        xmm5,           [rsi+16]
+
+        punpcklbw   xmm4,           xmm7
+        punpcklbw   xmm5,           xmm7
+
+        paddw       xmm0,           xmm4
+        paddw       xmm1,           xmm5
+
+        movq        xmm4,           [rsi+32]
+        movq        xmm5,           [rsi+48]
+
+        punpcklbw   xmm4,           xmm7
+        punpcklbw   xmm5,           xmm7
+
+        paddw       xmm2,           xmm4
+        paddw       xmm3,           xmm5
+
+.finish:
+
+    ; pack up before storing
+        packuswb    xmm0,           xmm7
+        packuswb    xmm1,           xmm7
+        packuswb    xmm2,           xmm7
+        packuswb    xmm3,           xmm7
+
+    ; Load destination stride before writing out,
+    ;   doesn't need to persist
+        movsxd      rdx,            dword ptr arg(4) ; dst_stride
+
+    ; store blocks back out
+        movq        [rdi],          xmm0
+        movq        [rdi + rdx],    xmm1
+
+        lea         rdi,            [rdi + 2*rdx]
+
+        movq        [rdi],          xmm2
+        movq        [rdi + rdx],    xmm3
+
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+fours:
+    times 8 dw 0x0004
+align 16
+x_s1sqr2:
+    times 8 dw 0x8A8C
+align 16
+x_c1sqr2less1:
+    times 8 dw 0x4E7B
--- /dev/null
+++ b/vp9/common/x86/iwalsh_mmx.asm
@@ -1,0 +1,173 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp9_short_inv_walsh4x4_1_mmx(short *input, short *output)
+global sym(vp9_short_inv_walsh4x4_1_mmx)
+sym(vp9_short_inv_walsh4x4_1_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 2
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov     rsi, arg(0)
+    mov     rax, 3
+
+    mov     rdi, arg(1)
+    add     rax, [rsi]          ;input[0] + 3
+
+    movd    mm0, eax
+
+    punpcklwd mm0, mm0          ;x x val val
+
+    punpckldq mm0, mm0          ;val val val val
+
+    psraw   mm0, 3            ;(input[0] + 3) >> 3
+
+    movq  [rdi + 0], mm0
+    movq  [rdi + 8], mm0
+    movq  [rdi + 16], mm0
+    movq  [rdi + 24], mm0
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_short_inv_walsh4x4_mmx(short *input, short *output)
+global sym(vp9_short_inv_walsh4x4_mmx)
+sym(vp9_short_inv_walsh4x4_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 2
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov     rax, 3
+    mov     rsi, arg(0)
+    mov     rdi, arg(1)
+    shl     rax, 16
+
+    movq    mm0, [rsi + 0]        ;ip[0]
+    movq    mm1, [rsi + 8]        ;ip[4]
+    or      rax, 3            ;00030003h
+
+    movq    mm2, [rsi + 16]       ;ip[8]
+    movq    mm3, [rsi + 24]       ;ip[12]
+
+    movq    mm7, rax
+    movq    mm4, mm0
+
+    punpcklwd mm7, mm7          ;0003000300030003h
+    movq    mm5, mm1
+
+    paddw   mm4, mm3          ;ip[0] + ip[12] aka a1
+    paddw   mm5, mm2          ;ip[4] + ip[8] aka b1
+
+    movq    mm6, mm4          ;temp a1
+
+    paddw   mm4, mm5          ;a1 + b1
+    psubw   mm6, mm5          ;a1 - b1
+
+    psubw   mm0, mm3          ;ip[0] - ip[12] aka d1
+    psubw   mm1, mm2          ;ip[4] - ip[8] aka c1
+
+    movq    mm5, mm0          ;temp d1
+
+    paddw   mm0, mm1          ;d1 + c1
+    psubw   mm5, mm1          ;d1 - c1
+
+    ; 03 02 01 00
+    ; 13 12 11 10
+    ; 23 22 21 20
+    ; 33 32 31 30
+
+    movq    mm3, mm4          ; 03 02 01 00
+    punpcklwd mm4, mm0          ; 11 01 10 00
+    punpckhwd mm3, mm0          ; 13 03 12 02
+
+    movq    mm1, mm6          ; 23 22 21 20
+    punpcklwd mm6, mm5          ; 31 21 30 20
+    punpckhwd mm1, mm5          ; 33 23 32 22
+
+    movq    mm0, mm4          ; 11 01 10 00
+    movq    mm2, mm3          ; 13 03 12 02
+
+    punpckldq mm0, mm6          ; 30 20 10 00 aka ip[0]
+    punpckhdq mm4, mm6          ; 31 21 11 01 aka ip[4]
+
+    punpckldq mm2, mm1          ; 32 22 12 02 aka ip[8]
+    punpckhdq mm3, mm1          ; 33 23 13 03 aka ip[12]
+;~~~~~~~~~~~~~~~~~~~~~
+    movq    mm1, mm0
+    movq    mm5, mm4
+
+    paddw   mm1, mm3          ;ip[0] + ip[12] aka a1
+    paddw   mm5, mm2          ;ip[4] + ip[8] aka b1
+
+    movq    mm6, mm1          ;temp a1
+
+    paddw   mm1, mm5          ;a1 + b1
+    psubw   mm6, mm5          ;a1 - b1
+
+    psubw   mm0, mm3          ;ip[0] - ip[12] aka d1
+    psubw   mm4, mm2          ;ip[4] - ip[8] aka c1
+
+    movq    mm5, mm0          ;temp d1
+
+    paddw   mm0, mm4          ;d1 + c1
+    psubw   mm5, mm4          ;d1 - c1
+;~~~~~~~~~~~~~~~~~~~~~
+    movq    mm3, mm1          ; 03 02 01 00
+    punpcklwd mm1, mm0          ; 11 01 10 00
+    punpckhwd mm3, mm0          ; 13 03 12 02
+
+    movq    mm4, mm6          ; 23 22 21 20
+    punpcklwd mm6, mm5          ; 31 21 30 20
+    punpckhwd mm4, mm5          ; 33 23 32 22
+
+    movq    mm0, mm1          ; 11 01 10 00
+    movq    mm2, mm3          ; 13 03 12 02
+
+    punpckldq mm0, mm6          ; 30 20 10 00 aka ip[0]
+    punpckhdq mm1, mm6          ; 31 21 11 01 aka ip[4]
+
+    punpckldq mm2, mm4          ; 32 22 12 02 aka ip[8]
+    punpckhdq mm3, mm4          ; 33 23 13 03 aka ip[12]
+
+    paddw   mm0, mm7
+    paddw   mm1, mm7
+    paddw   mm2, mm7
+    paddw   mm3, mm7
+
+    psraw   mm0, 3
+    psraw   mm1, 3
+    psraw   mm2, 3
+    psraw   mm3, 3
+
+    movq  [rdi + 0], mm0
+    movq  [rdi + 8], mm1
+    movq  [rdi + 16], mm2
+    movq  [rdi + 24], mm3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
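+; Editor's note (hedged reading, not part of the original patch): the
+; routine above runs one 4-point Walsh butterfly per pass, transposing
+; between the two passes so rows and then columns are covered.  Per the
+; inline comments, each butterfly computes:
+;
+;   a1 = ip[0] + ip[12];   b1 = ip[4] + ip[8];
+;   c1 = ip[4] - ip[8];    d1 = ip[0] - ip[12];
+;   op[0] = a1 + b1;   op[4]  = d1 + c1;
+;   op[8] = a1 - b1;   op[12] = d1 - c1;
+;
+; with every lane rounded at the end as op[i] = (op[i] + 3) >> 3.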
--- /dev/null
+++ b/vp9/common/x86/iwalsh_sse2.asm
@@ -1,0 +1,119 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp9_short_inv_walsh4x4_sse2(short *input, short *output)
+global sym(vp9_short_inv_walsh4x4_sse2)
+sym(vp9_short_inv_walsh4x4_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 2
+    SAVE_XMM 6
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov     rsi, arg(0)
+    mov     rdi, arg(1)
+    mov     rax, 3
+
+    movdqa    xmm0, [rsi + 0]       ;ip[4] ip[0]
+    movdqa    xmm1, [rsi + 16]      ;ip[12] ip[8]
+
+    shl     rax, 16
+    or      rax, 3            ;00030003h
+
+    pshufd    xmm2, xmm1, 4eh       ;ip[8] ip[12]
+    movdqa    xmm3, xmm0          ;ip[4] ip[0]
+
+    paddw   xmm0, xmm2          ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
+    psubw   xmm3, xmm2          ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
+
+    movdqa    xmm4, xmm0
+    punpcklqdq  xmm0, xmm3          ;d1 a1
+    punpckhqdq  xmm4, xmm3          ;c1 b1
+    movd    xmm6, eax
+
+    movdqa    xmm1, xmm4          ;c1 b1
+    paddw   xmm4, xmm0          ;d1+c1 a1+b1 aka op[4] op[0]
+    psubw   xmm0, xmm1          ;d1-c1 a1-b1 aka op[12] op[8]
+
+;;;temp output
+;;  movdqu  [rdi + 0], xmm4
+;;  movdqu  [rdi + 16], xmm3
+
+;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    ; 13 12 11 10 03 02 01 00
+    ;
+    ; 33 32 31 30 23 22 21 20
+    ;
+    movdqa    xmm3, xmm4          ; 13 12 11 10 03 02 01 00
+    punpcklwd xmm4, xmm0          ; 23 03 22 02 21 01 20 00
+    punpckhwd xmm3, xmm0          ; 33 13 32 12 31 11 30 10
+    movdqa    xmm1, xmm4          ; 23 03 22 02 21 01 20 00
+    punpcklwd xmm4, xmm3          ; 31 21 11 01 30 20 10 00
+    punpckhwd xmm1, xmm3          ; 33 23 13 03 32 22 12 02
+    ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    pshufd    xmm2, xmm1, 4eh       ;ip[8] ip[12]
+    movdqa    xmm3, xmm4          ;ip[4] ip[0]
+
+    pshufd    xmm6, xmm6, 0       ;03 03 03 03 03 03 03 03
+
+    paddw   xmm4, xmm2          ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
+    psubw   xmm3, xmm2          ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
+
+    movdqa    xmm5, xmm4
+    punpcklqdq  xmm4, xmm3          ;d1 a1
+    punpckhqdq  xmm5, xmm3          ;c1 b1
+
+    movdqa    xmm1, xmm5          ;c1 b1
+    paddw   xmm5, xmm4          ;d1+c1 a1+b1 aka op[4] op[0]
+    psubw   xmm4, xmm1          ;d1-c1 a1-b1 aka op[12] op[8]
+;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    ; 13 12 11 10 03 02 01 00
+    ;
+    ; 33 32 31 30 23 22 21 20
+    ;
+    movdqa    xmm0, xmm5          ; 13 12 11 10 03 02 01 00
+    punpcklwd xmm5, xmm4          ; 23 03 22 02 21 01 20 00
+    punpckhwd xmm0, xmm4          ; 33 13 32 12 31 11 30 10
+    movdqa    xmm1, xmm5          ; 23 03 22 02 21 01 20 00
+    punpcklwd xmm5, xmm0          ; 31 21 11 01 30 20 10 00
+    punpckhwd xmm1, xmm0          ; 33 23 13 03 32 22 12 02
+;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    paddw   xmm5, xmm6
+    paddw   xmm1, xmm6
+
+    psraw   xmm5, 3
+    psraw   xmm1, 3
+
+    movdqa  [rdi + 0], xmm5
+    movdqa  [rdi + 16], xmm1
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+x_s1sqr2:
+    times 4 dw 0x8A8C
+align 16
+x_c1sqr2less1:
+    times 4 dw 0x4E7B
+align 16
+fours:
+    times 4 dw 0x0004
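+; Editor's note (not part of the original patch): the SSE2 routine above
+; is the same two-pass butterfly, but it holds two 4x16-bit rows per xmm
+; register, so each pass needs roughly half the instructions of the MMX
+; version.  The x_s1sqr2 / x_c1sqr2less1 / fours constants do not appear
+; to be referenced in this file; they look like leftovers from the
+; companion iDCT source.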
--- /dev/null
+++ b/vp9/common/x86/loopfilter_mmx.asm
@@ -1,0 +1,969 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;void vp9_loop_filter_horizontal_edge_mmx
+;(
+;    unsigned char *src_ptr,
+;    int src_pixel_step,
+;    const char *blimit,
+;    const char *limit,
+;    const char *thresh,
+;    int  count
+;)
+global sym(vp9_loop_filter_horizontal_edge_mmx)
+sym(vp9_loop_filter_horizontal_edge_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 32                         ; reserve 32 bytes
+    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[8];
+    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[8];
+
+        mov         rsi, arg(0) ;src_ptr
+        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; source pitch
+
+        movsxd      rcx, dword ptr arg(5) ;count
+.next8_h:
+        mov         rdx, arg(3) ;limit
+        movq        mm7, [rdx]
+        mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
+        add         rdi, rax
+
+        ; calculate breakout conditions
+        movq        mm2, [rdi+2*rax]      ; q3
+        movq        mm1, [rsi+2*rax]      ; q2
+        movq        mm6, mm1              ; q2
+        psubusb     mm1, mm2              ; q2-=q3
+        psubusb     mm2, mm6              ; q3-=q2
+        por         mm1, mm2              ; abs(q3-q2)
+        psubusb     mm1, mm7              ;
+
+
+        movq        mm4, [rsi+rax]        ; q1
+        movq        mm3, mm4              ; q1
+        psubusb     mm4, mm6              ; q1-=q2
+        psubusb     mm6, mm3              ; q2-=q1
+        por         mm4, mm6              ; abs(q2-q1)
+
+        psubusb     mm4, mm7
+        por        mm1, mm4
+
+        movq        mm4, [rsi]            ; q0
+        movq        mm0, mm4              ; q0
+        psubusb     mm4, mm3              ; q0-=q1
+        psubusb     mm3, mm0              ; q1-=q0
+        por         mm4, mm3              ; abs(q0-q1)
+        movq        t0, mm4               ; save to t0
+        psubusb     mm4, mm7
+        por        mm1, mm4
+
+
+        neg         rax                   ; negate pitch to deal with above border
+
+        movq        mm2, [rsi+4*rax]      ; p3
+        movq        mm4, [rdi+4*rax]      ; p2
+        movq        mm5, mm4              ; p2
+        psubusb     mm4, mm2              ; p2-=p3
+        psubusb     mm2, mm5              ; p3-=p2
+        por         mm4, mm2              ; abs(p3 - p2)
+        psubusb     mm4, mm7
+        por        mm1, mm4
+
+
+        movq        mm4, [rsi+2*rax]      ; p1
+        movq        mm3, mm4              ; p1
+        psubusb     mm4, mm5              ; p1-=p2
+        psubusb     mm5, mm3              ; p2-=p1
+        por         mm4, mm5              ; abs(p2 - p1)
+        psubusb     mm4, mm7
+        por        mm1, mm4
+
+        movq        mm2, mm3              ; p1
+
+        movq        mm4, [rsi+rax]        ; p0
+        movq        mm5, mm4              ; p0
+        psubusb     mm4, mm3              ; p0-=p1
+        psubusb     mm3, mm5              ; p1-=p0
+        por         mm4, mm3              ; abs(p1 - p0)
+        movq        t1, mm4               ; save to t1
+        psubusb     mm4, mm7
+        por        mm1, mm4
+
+        movq        mm3, [rdi]            ; q1
+        movq        mm4, mm3              ; q1
+        psubusb     mm3, mm2              ; q1-=p1
+        psubusb     mm2, mm4              ; p1-=q1
+        por         mm2, mm3              ; abs(p1-q1)
+        pand        mm2, [GLOBAL(tfe)]    ; set lsb of each byte to zero
+        psrlw       mm2, 1                ; abs(p1-q1)/2
+
+        movq        mm6, mm5              ; p0
+        movq        mm3, [rsi]            ; q0
+        psubusb     mm5, mm3              ; p0-=q0
+        psubusb     mm3, mm6              ; q0-=p0
+        por         mm5, mm3              ; abs(p0 - q0)
+        paddusb     mm5, mm5              ; abs(p0-q0)*2
+        paddusb     mm5, mm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+        mov         rdx, arg(2) ;blimit           ; get blimit
+        movq        mm7, [rdx]            ; blimit
+
+        psubusb     mm5,    mm7           ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
+        por         mm1,    mm5
+        pxor        mm5,    mm5
+        pcmpeqb     mm1,    mm5           ; mask mm1
+
+        ; calculate high edge variance
+        mov         rdx, arg(4) ;thresh           ; get thresh
+        movq        mm7, [rdx]            ;
+        movq        mm4, t0               ; get abs (q1 - q0)
+        psubusb     mm4, mm7
+        movq        mm3, t1               ; get abs (p1 - p0)
+        psubusb     mm3, mm7
+        paddb       mm4, mm3              ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+
+        pcmpeqb     mm4,        mm5
+
+        pcmpeqb     mm5,        mm5
+        pxor        mm4,        mm5
+
+
+        ; start work on filters
+        movq        mm2, [rsi+2*rax]      ; p1
+        movq        mm7, [rdi]            ; q1
+        pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
+        pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
+        psubsb      mm2, mm7              ; p1 - q1
+        pand        mm2, mm4              ; high var mask (hvm)(p1 - q1)
+        pxor        mm6, [GLOBAL(t80)]    ; offset to convert to signed values
+        pxor        mm0, [GLOBAL(t80)]    ; offset to convert to signed values
+        movq        mm3, mm0              ; q0
+        psubsb      mm0, mm6              ; q0 - p0
+        paddsb      mm2, mm0              ; 1 * (q0 - p0) + hvm(p1 - q1)
+        paddsb      mm2, mm0              ; 2 * (q0 - p0) + hvm(p1 - q1)
+        paddsb      mm2, mm0              ; 3 * (q0 - p0) + hvm(p1 - q1)
+        pand        mm1, mm2                  ; mask filter values we don't care about
+        movq        mm2, mm1
+        paddsb      mm1, [GLOBAL(t4)]     ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+        paddsb      mm2, [GLOBAL(t3)]     ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+
+        pxor        mm0, mm0             ;
+        pxor        mm5, mm5
+        punpcklbw   mm0, mm2            ;
+        punpckhbw   mm5, mm2            ;
+        psraw       mm0, 11             ;
+        psraw       mm5, 11
+        packsswb    mm0, mm5
+        movq        mm2, mm0            ;  (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+
+        pxor        mm0, mm0              ; 0
+        movq        mm5, mm1              ; abcdefgh
+        punpcklbw   mm0, mm1              ; e0f0g0h0
+        psraw       mm0, 11               ; sign extended shift right by 3
+        pxor        mm1, mm1              ; 0
+        punpckhbw   mm1, mm5              ; a0b0c0d0
+        psraw       mm1, 11               ; sign extended shift right by 3
+        movq        mm5, mm0              ; save results
+
+        packsswb    mm0, mm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+        paddsw      mm5, [GLOBAL(ones)]
+        paddsw      mm1, [GLOBAL(ones)]
+        psraw       mm5, 1                ; partial shifted one more time for 2nd tap
+        psraw       mm1, 1                ; partial shifted one more time for 2nd tap
+        packsswb    mm5, mm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
+        pandn       mm4, mm5              ; high edge variance additive
+
+        paddsb      mm6, mm2              ; p0+= p0 add
+        pxor        mm6, [GLOBAL(t80)]    ; unoffset
+        movq        [rsi+rax], mm6        ; write back
+
+        movq        mm6, [rsi+2*rax]      ; p1
+        pxor        mm6, [GLOBAL(t80)]    ; reoffset
+        paddsb      mm6, mm4              ; p1+= p1 add
+        pxor        mm6, [GLOBAL(t80)]    ; unoffset
+        movq        [rsi+2*rax], mm6      ; write back
+
+        psubsb      mm3, mm0              ; q0-= q0 add
+        pxor        mm3, [GLOBAL(t80)]    ; unoffset
+        movq        [rsi], mm3            ; write back
+
+        psubsb      mm7, mm4              ; q1-= q1 add
+        pxor        mm7, [GLOBAL(t80)]    ; unoffset
+        movq        [rdi], mm7            ; write back
+
+        add         rsi,8
+        neg         rax
+        dec         rcx
+        jnz         .next8_h
+
+    add rsp, 32
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
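+; Editor's note (hedged sketch, not part of the original patch): the
+; per-pixel math above, reconstructed from the inline comments.  The
+; ps*/qs* values are the pixels made signed by XORing with 0x80 (t80);
+; clamp() is the signed saturation that paddsb/psubsb provide:
+;
+;   mask = (abs(q3-q2) <= limit) & (abs(q2-q1) <= limit)
+;        & (abs(q1-q0) <= limit) & (abs(p3-p2) <= limit)
+;        & (abs(p2-p1) <= limit) & (abs(p1-p0) <= limit)
+;        & (abs(p0-q0)*2 + abs(p1-q1)/2 <= blimit);
+;   hev  = (abs(p1-p0) > thresh) || (abs(q1-q0) > thresh);
+;
+;   f  = clamp((hev ? clamp(ps1 - qs1) : 0) + 3 * (qs0 - ps0)) & mask;
+;   f1 = clamp(f + 4) >> 3;    qs0 = clamp(qs0 - f1);
+;   f2 = clamp(f + 3) >> 3;    ps0 = clamp(ps0 + f2);
+;   u  = hev ? 0 : (f1 + 1) >> 1;
+;   qs1 = clamp(qs1 - u);      ps1 = clamp(ps1 + u);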
+
+;void vp9_loop_filter_vertical_edge_mmx
+;(
+;    unsigned char *src_ptr,
+;    int  src_pixel_step,
+;    const char *blimit,
+;    const char *limit,
+;    const char *thresh,
+;    int count
+;)
+global sym(vp9_loop_filter_vertical_edge_mmx)
+sym(vp9_loop_filter_vertical_edge_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub          rsp, 64      ; reserve 64 bytes
+    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
+    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
+    %define srct [rsp + 32]   ;__declspec(align(16)) char srct[32];
+
+        mov         rsi,        arg(0) ;src_ptr
+        movsxd      rax,        dword ptr arg(1) ;src_pixel_step     ; source pitch
+
+        lea         rsi,        [rsi + rax*4 - 4]
+
+        movsxd      rcx,        dword ptr arg(5) ;count
+.next8_v:
+        mov         rdi,        rsi           ; rdi points to row +1 for indirect addressing
+        add         rdi,        rax
+
+
+        ;transpose
+        movq        mm6,        [rsi+2*rax]                 ; 67 66 65 64 63 62 61 60
+        movq        mm7,        mm6                         ; 77 76 75 74 73 72 71 70
+
+        punpckhbw   mm7,        [rdi+2*rax]                 ; 77 67 76 66 75 65 74 64
+        punpcklbw   mm6,        [rdi+2*rax]                 ; 73 63 72 62 71 61 70 60
+
+        movq        mm4,        [rsi]                       ; 47 46 45 44 43 42 41 40
+        movq        mm5,        mm4                         ; 47 46 45 44 43 42 41 40
+
+        punpckhbw   mm5,        [rsi+rax]                   ; 57 47 56 46 55 45 54 44
+        punpcklbw   mm4,        [rsi+rax]                   ; 53 43 52 42 51 41 50 40
+
+        movq        mm3,        mm5                         ; 57 47 56 46 55 45 54 44
+        punpckhwd   mm5,        mm7                         ; 77 67 57 47 76 66 56 46
+
+        punpcklwd   mm3,        mm7                         ; 75 65 55 45 74 64 54 44
+        movq        mm2,        mm4                         ; 53 43 52 42 51 41 50 40
+
+        punpckhwd   mm4,        mm6                         ; 73 63 53 43 72 62 52 42
+        punpcklwd   mm2,        mm6                         ; 71 61 51 41 70 60 50 40
+
+        neg         rax
+        movq        mm6,        [rsi+rax*2]                 ; 27 26 25 24 23 22 21 20
+
+        movq        mm1,        mm6                         ; 27 26 25 24 23 22 21 20
+        punpckhbw   mm6,        [rsi+rax]                   ; 37 27 36 26 35 25 34 24
+
+        punpcklbw   mm1,        [rsi+rax]                   ; 33 23 32 22 31 21 30 20
+        movq        mm7,        [rsi+rax*4]                 ; 07 06 05 04 03 02 01 00
+
+        punpckhbw   mm7,        [rdi+rax*4]                 ; 17 07 16 06 15 05 14 04
+        movq        mm0,        mm7                         ; 17 07 16 06 15 05 14 04
+
+        punpckhwd   mm7,        mm6                         ; 37 27 17 07 36 26 16 06
+        punpcklwd   mm0,        mm6                         ; 35 25 15 05 34 24 14 04
+
+        movq        mm6,        mm7                         ; 37 27 17 07 36 26 16 06
+        punpckhdq   mm7,        mm5                         ; 77 67 57 47 37 27 17 07  = q3
+
+        punpckldq   mm6,        mm5                         ; 76 66 56 46 36 26 16 06  = q2
+
+        movq        mm5,        mm6                         ; 76 66 56 46 36 26 16 06
+        psubusb     mm5,        mm7                         ; q2-q3
+
+        psubusb     mm7,        mm6                         ; q3-q2
+        por         mm7,        mm5                         ; mm7=abs (q3-q2)
+
+        movq        mm5,        mm0                         ; 35 25 15 05 34 24 14 04
+        punpckhdq   mm5,        mm3                         ; 75 65 55 45 35 25 15 05 = q1
+
+        punpckldq   mm0,        mm3                         ; 74 64 54 44 34 24 14 04 = q0
+        movq        mm3,        mm5                         ; 75 65 55 45 35 25 15 05 = q1
+
+        psubusb     mm3,        mm6                         ; q1-q2
+        psubusb     mm6,        mm5                         ; q2-q1
+
+        por         mm6,        mm3                         ; mm6=abs(q2-q1)
+        lea         rdx,        srct
+
+        movq        [rdx+24],   mm5                         ; save q1
+        movq        [rdx+16],   mm0                         ; save q0
+
+        movq        mm3,        [rsi+rax*4]                 ; 07 06 05 04 03 02 01 00
+        punpcklbw   mm3,        [rdi+rax*4]                 ; 13 03 12 02 11 01 10 00
+
+        movq        mm0,        mm3                         ; 13 03 12 02 11 01 10 00
+        punpcklwd   mm0,        mm1                         ; 31 21 11 01 30 20 10 00
+
+        punpckhwd   mm3,        mm1                         ; 33 23 13 03 32 22 12 02
+        movq        mm1,        mm0                         ; 31 21 11 01 30 20 10 00
+
+        punpckldq   mm0,        mm2                         ; 70 60 50 40 30 20 10 00  =p3
+        punpckhdq   mm1,        mm2                         ; 71 61 51 41 31 21 11 01  =p2
+
+        movq        mm2,        mm1                         ; 71 61 51 41 31 21 11 01  =p2
+        psubusb     mm2,        mm0                         ; p2-p3
+
+        psubusb     mm0,        mm1                         ; p3-p2
+        por         mm0,        mm2                         ; mm0=abs(p3-p2)
+
+        movq        mm2,        mm3                         ; 33 23 13 03 32 22 12 02
+        punpckldq   mm2,        mm4                         ; 72 62 52 42 32 22 12 02 = p1
+
+        punpckhdq   mm3,        mm4                         ; 73 63 53 43 33 23 13 03 = p0
+        movq        [rdx+8],    mm3                         ; save p0
+
+        movq        [rdx],      mm2                         ; save p1
+        movq        mm5,        mm2                         ; mm5 = p1
+
+        psubusb     mm2,        mm1                         ; p1-p2
+        psubusb     mm1,        mm5                         ; p2-p1
+
+        por         mm1,        mm2                         ; mm1=abs(p2-p1)
+        mov         rdx,        arg(3) ;limit
+
+        movq        mm4,        [rdx]                       ; mm4 = limit
+        psubusb     mm7,        mm4
+
+        psubusb     mm0,        mm4
+        psubusb     mm1,        mm4
+
+        psubusb     mm6,        mm4
+        por         mm7,        mm6
+
+        por         mm0,        mm1
+        por         mm0,        mm7                         ; abs(q3-q2) > limit || abs(p3-p2) > limit || abs(p2-p1) > limit || abs(q2-q1) > limit
+
+        movq        mm1,        mm5                         ; p1
+
+        movq        mm7,        mm3                         ; mm3=mm7=p0
+        psubusb     mm7,        mm5                         ; p0 - p1
+
+        psubusb     mm5,        mm3                         ; p1 - p0
+        por         mm5,        mm7                         ; abs(p1-p0)
+
+        movq        t0,         mm5                         ; save abs(p1-p0)
+        lea         rdx,        srct
+
+        psubusb     mm5,        mm4
+        por         mm0,        mm5                         ; mm0=mask
+
+        movq        mm5,        [rdx+16]                    ; mm5=q0
+        movq        mm7,        [rdx+24]                    ; mm7=q1
+
+        movq        mm6,        mm5                         ; mm6=q0
+        movq        mm2,        mm7                         ; q1
+        psubusb     mm5,        mm7                         ; q0-q1
+
+        psubusb     mm7,        mm6                         ; q1-q0
+        por         mm7,        mm5                         ; abs(q1-q0)
+
+        movq        t1,         mm7                         ; save abs(q1-q0)
+        psubusb     mm7,        mm4
+
+        por         mm0,        mm7                         ; mask
+
+        movq        mm5,        mm2                         ; q1
+        psubusb     mm5,        mm1                         ; q1-=p1
+        psubusb     mm1,        mm2                         ; p1-=q1
+        por         mm5,        mm1                         ; abs(p1-q1)
+        pand        mm5,        [GLOBAL(tfe)]               ; set lsb of each byte to zero
+        psrlw       mm5,        1                           ; abs(p1-q1)/2
+
+        mov         rdx,        arg(2) ;blimit                      ;
+
+        movq        mm4,        [rdx]                       ;blimit
+        movq        mm1,        mm3                         ; mm1=mm3=p0
+
+        movq        mm7,        mm6                         ; mm7=mm6=q0
+        psubusb     mm1,        mm7                         ; p0-q0
+
+        psubusb     mm7,        mm3                         ; q0-p0
+        por         mm1,        mm7                         ; abs(q0-p0)
+        paddusb     mm1,        mm1                         ; abs(q0-p0)*2
+        paddusb     mm1,        mm5                         ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+        psubusb     mm1,        mm4                         ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
+        por         mm1,        mm0                         ; mask
+
+        pxor        mm0,        mm0
+        pcmpeqb     mm1,        mm0
+
+        ; calculate high edge variance
+        mov         rdx,        arg(4) ;thresh            ; get thresh
+        movq        mm7,        [rdx]
+        ;
+        movq        mm4,        t0              ; get abs (q1 - q0)
+        psubusb     mm4,        mm7
+
+        movq        mm3,        t1              ; get abs (p1 - p0)
+        psubusb     mm3,        mm7
+
+        por         mm4,        mm3             ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+        pcmpeqb     mm4,        mm0
+
+        pcmpeqb     mm0,        mm0
+        pxor        mm4,        mm0
+
+
+
+        ; start work on filters
+        lea         rdx,        srct
+
+        movq        mm2,        [rdx]           ; p1
+        movq        mm7,        [rdx+24]        ; q1
+
+        movq        mm6,        [rdx+8]         ; p0
+        movq        mm0,        [rdx+16]        ; q0
+
+        pxor        mm2,        [GLOBAL(t80)]   ; p1 offset to convert to signed values
+        pxor        mm7,        [GLOBAL(t80)]   ; q1 offset to convert to signed values
+
+        psubsb      mm2,        mm7             ; p1 - q1
+        pand        mm2,        mm4             ; high var mask (hvm)(p1 - q1)
+
+        pxor        mm6,        [GLOBAL(t80)]   ; offset to convert to signed values
+        pxor        mm0,        [GLOBAL(t80)]   ; offset to convert to signed values
+
+        movq        mm3,        mm0             ; q0
+        psubsb      mm0,        mm6             ; q0 - p0
+
+        paddsb      mm2,        mm0             ; 1 * (q0 - p0) + hvm(p1 - q1)
+        paddsb      mm2,        mm0             ; 2 * (q0 - p0) + hvm(p1 - q1)
+
+        paddsb      mm2,        mm0             ; 3 * (q0 - p0) + hvm(p1 - q1)
+        pand       mm1,        mm2              ; mask filter values we don't care about
+
+        movq        mm2,        mm1
+        paddsb      mm1,        [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+
+        paddsb      mm2,        [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+        pxor        mm0,        mm0          ;
+
+        pxor        mm5,        mm5
+        punpcklbw   mm0,        mm2         ;
+
+        punpckhbw   mm5,        mm2         ;
+        psraw       mm0,        11              ;
+
+        psraw       mm5,        11
+        packsswb    mm0,        mm5
+
+        movq        mm2,        mm0         ;  (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+
+        pxor        mm0,        mm0           ; 0
+        movq        mm5,        mm1           ; abcdefgh
+
+        punpcklbw   mm0,        mm1           ; e0f0g0h0
+        psraw       mm0,        11                ; sign extended shift right by 3
+
+        pxor        mm1,        mm1           ; 0
+        punpckhbw   mm1,        mm5           ; a0b0c0d0
+
+        psraw       mm1,        11                ; sign extended shift right by 3
+        movq        mm5,        mm0              ; save results
+
+        packsswb    mm0,        mm1           ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+        paddsw      mm5,        [GLOBAL(ones)]
+
+        paddsw      mm1,        [GLOBAL(ones)]
+        psraw       mm5,        1                 ; partial shifted one more time for 2nd tap
+
+        psraw       mm1,        1                 ; partial shifted one more time for 2nd tap
+        packsswb    mm5,        mm1           ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
+
+        pandn       mm4,        mm5             ; high edge variance additive
+
+        paddsb      mm6,        mm2             ; p0+= p0 add
+        pxor        mm6,        [GLOBAL(t80)]   ; unoffset
+
+        ; mm6=p0                               ;
+        movq        mm1,        [rdx]           ; p1
+        pxor        mm1,        [GLOBAL(t80)]   ; reoffset
+
+        paddsb      mm1,        mm4                 ; p1+= p1 add
+        pxor        mm1,        [GLOBAL(t80)]       ; unoffset
+        ; mm6 = p0 mm1 = p1
+
+        psubsb      mm3,        mm0                 ; q0-= q0 add
+        pxor        mm3,        [GLOBAL(t80)]       ; unoffset
+
+        ; mm3 = q0
+        psubsb      mm7,        mm4                 ; q1-= q1 add
+        pxor        mm7,        [GLOBAL(t80)]       ; unoffset
+        ; mm7 = q1
+
+        ; transpose and write back
+        ; mm1 =    72 62 52 42 32 22 12 02
+        ; mm6 =    73 63 53 43 33 23 13 03
+        ; mm3 =    74 64 54 44 34 24 14 04
+        ; mm7 =    75 65 55 45 35 25 15 05
+
+        movq        mm2,        mm1             ; 72 62 52 42 32 22 12 02
+        punpcklbw   mm2,        mm6             ; 33 32 23 22 13 12 03 02
+
+        movq        mm4,        mm3             ; 74 64 54 44 34 24 14 04
+        punpckhbw   mm1,        mm6             ; 73 72 63 62 53 52 43 42
+
+        punpcklbw   mm4,        mm7             ; 35 34 25 24 15 14 05 04
+        punpckhbw   mm3,        mm7             ; 75 74 65 64 55 54 45 44
+
+        movq        mm6,        mm2             ; 33 32 23 22 13 12 03 02
+        punpcklwd   mm2,        mm4             ; 15 14 13 12 05 04 03 02
+
+        punpckhwd   mm6,        mm4             ; 35 34 33 32 25 24 23 22
+        movq        mm5,        mm1             ; 73 72 63 62 53 52 43 42
+
+        punpcklwd   mm1,        mm3             ; 55 54 53 52 45 44 43 42
+        punpckhwd   mm5,        mm3             ; 75 74 73 72 65 64 63 62
+
+
+        ; mm2 = 15 14 13 12 05 04 03 02
+        ; mm6 = 35 34 33 32 25 24 23 22
+        ; mm5 = 55 54 53 52 45 44 43 42
+        ; mm1 = 75 74 73 72 65 64 63 62
+
+
+
+        movd        [rsi+rax*4+2], mm2
+        psrlq       mm2,        32
+
+        movd        [rdi+rax*4+2], mm2
+        movd        [rsi+rax*2+2], mm6
+
+        psrlq       mm6,        32
+        movd        [rsi+rax+2],mm6
+
+        movd        [rsi+2],    mm1
+        psrlq       mm1,        32
+
+        movd        [rdi+2],    mm1
+        neg         rax
+
+        movd        [rdi+rax+2],mm5
+        psrlq       mm5,        32
+
+        movd        [rdi+rax*2+2], mm5
+
+        lea         rsi,        [rsi+rax*8]
+        dec         rcx
+        jnz         .next8_v
+
+    add rsp, 64
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
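+; Editor's note (not part of the original patch): the vertical-edge
+; routine above applies exactly the same filter as the horizontal one.
+; The punpck blocks at the top and bottom of the loop form an 8x8 byte
+; transpose: the columns straddling the vertical edge are rotated into
+; registers as rows, filtered, and the four modified rows (p1 p0 q0 q1)
+; are transposed back before the movd stores.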
+
+;void vp9_loop_filter_simple_horizontal_edge_mmx
+;(
+;    unsigned char *src_ptr,
+;    int  src_pixel_step,
+;    const char *blimit
+;)
+global sym(vp9_loop_filter_simple_horizontal_edge_mmx)
+sym(vp9_loop_filter_simple_horizontal_edge_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 3
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rsi, arg(0) ;src_ptr
+        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; source pitch
+
+        mov         rcx, 2                ; count
+.nexts8_h:
+        mov         rdx, arg(2) ;blimit           ; get blimit
+        movq        mm3, [rdx]            ;
+
+        mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
+        add         rdi, rax
+        neg         rax
+
+        ; calculate mask
+        movq        mm1, [rsi+2*rax]      ; p1
+        movq        mm0, [rdi]            ; q1
+        movq        mm2, mm1
+        movq        mm7, mm0
+        movq        mm4, mm0
+        psubusb     mm0, mm1              ; q1-=p1
+        psubusb     mm1, mm4              ; p1-=q1
+        por         mm1, mm0              ; abs(p1-q1)
+        pand        mm1, [GLOBAL(tfe)]    ; set lsb of each byte to zero
+        psrlw       mm1, 1                ; abs(p1-q1)/2
+
+        movq        mm5, [rsi+rax]        ; p0
+        movq        mm4, [rsi]            ; q0
+        movq        mm0, mm4              ; q0
+        movq        mm6, mm5              ; p0
+        psubusb     mm5, mm4              ; p0-=q0
+        psubusb     mm4, mm6              ; q0-=p0
+        por         mm5, mm4              ; abs(p0 - q0)
+        paddusb     mm5, mm5              ; abs(p0-q0)*2
+        paddusb     mm5, mm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+        psubusb     mm5, mm3              ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
+        pxor        mm3, mm3
+        pcmpeqb     mm5, mm3
+
+        ; start work on filters
+        pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
+        pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
+        psubsb      mm2, mm7              ; p1 - q1
+
+        pxor        mm6, [GLOBAL(t80)]    ; offset to convert to signed values
+        pxor        mm0, [GLOBAL(t80)]    ; offset to convert to signed values
+        movq        mm3, mm0              ; q0
+        psubsb      mm0, mm6              ; q0 - p0
+        paddsb      mm2, mm0              ; p1 - q1 + 1 * (q0 - p0)
+        paddsb      mm2, mm0              ; p1 - q1 + 2 * (q0 - p0)
+        paddsb      mm2, mm0              ; p1 - q1 + 3 * (q0 - p0)
+        pand        mm5, mm2              ; mask filter values we don't care about
+
+        ; do + 4 side
+        paddsb      mm5, [GLOBAL(t4)]     ; 3* (q0 - p0) + (p1 - q1) + 4
+
+        movq        mm0, mm5              ; get a copy of filters
+        psllw       mm0, 8                ; shift left 8
+        psraw       mm0, 3                ; arithmetic shift right 3
+        psrlw       mm0, 8
+        movq        mm1, mm5              ; get a copy of filters
+        psraw       mm1, 11               ; arithmetic shift right 11
+        psllw       mm1, 8                ; shift left 8 to put it back
+
+        por         mm0, mm1              ; put the two together to get result
+
+        psubsb      mm3, mm0              ; q0-= q0 add
+        pxor        mm3, [GLOBAL(t80)]    ; unoffset
+        movq        [rsi], mm3            ; write back
+
+
+        ; now do +3 side
+        psubsb      mm5, [GLOBAL(t1s)]     ; +3 instead of +4
+
+        movq        mm0, mm5              ; get a copy of filters
+        psllw       mm0, 8                ; shift left 8
+        psraw       mm0, 3                ; arithmetic shift right 3
+        psrlw       mm0, 8
+        psraw       mm5, 11               ; arithmetic shift right 11
+        psllw       mm5, 8                ; shift left 8 to put it back
+        por         mm0, mm5              ; put the two together to get result
+
+
+        paddsb      mm6, mm0              ; p0+= p0 add
+        pxor        mm6, [GLOBAL(t80)]    ; unoffset
+        movq        [rsi+rax], mm6        ; write back
+
+        add         rsi,8
+        neg         rax
+        dec         rcx
+        jnz         .nexts8_h
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
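+; Editor's note (hedged sketch, not part of the original patch): the
+; "simple" filter above uses only blimit (no limit / thresh, no hev) and
+; adjusts just p0 and q0 (ps*/qs* are the signed, 0x80-offset values):
+;
+;   mask = (abs(p0-q0)*2 + abs(p1-q1)/2 <= blimit);
+;   f    = clamp(clamp(ps1 - qs1) + 3 * (qs0 - ps0)) & mask;
+;   qs0  = clamp(qs0 - (clamp(f + 4) >> 3));
+;   ps0  = clamp(ps0 + (clamp(f + 3) >> 3));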
+
+;void vp9_loop_filter_simple_vertical_edge_mmx
+;(
+;    unsigned char *src_ptr,
+;    int  src_pixel_step,
+;    const char *blimit
+;)
+global sym(vp9_loop_filter_simple_vertical_edge_mmx)
+sym(vp9_loop_filter_simple_vertical_edge_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 3
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub          rsp, 32      ; reserve 32 bytes
+    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
+    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
+
+        mov         rsi, arg(0) ;src_ptr
+        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; source pitch
+
+        lea         rsi, [rsi + rax*4 - 2]
+        mov         rcx, 2                                      ; count
+.nexts8_v:
+
+        lea         rdi,        [rsi + rax];
+        movd        mm0,        [rdi + rax * 2]                 ; xx xx xx xx 73 72 71 70
+
+        movd        mm6,        [rsi + rax * 2]                 ; xx xx xx xx 63 62 61 60
+        punpcklbw   mm6,        mm0                             ; 73 63 72 62 71 61 70 60
+
+        movd        mm0,        [rsi + rax]                     ; xx xx xx xx 53 52 51 50
+        movd        mm4,        [rsi]                           ; xx xx xx xx 43 42 41 40
+
+        punpcklbw   mm4,        mm0                             ; 53 43 52 42 51 41 50 40
+        movq        mm5,        mm4                             ; 53 43 52 42 51 41 50 40
+
+        punpcklwd   mm4,        mm6                             ; 71 61 51 41 70 60 50 40
+        punpckhwd   mm5,        mm6                             ; 73 63 53 43 72 62 52 42
+
+        neg         rax
+
+        movd        mm7,        [rsi + rax]                     ; xx xx xx xx 33 32 31 30
+        movd        mm6,        [rsi + rax * 2]                 ; xx xx xx xx 23 22 21 20
+
+        punpcklbw   mm6,        mm7                             ; 33 23 32 22 31 21 30 20
+        movd        mm1,        [rdi + rax * 4]                 ; xx xx xx xx 13 12 11 10
+
+        movd        mm0,        [rsi + rax * 4]                 ; xx xx xx xx 03 02 01 00
+        punpcklbw   mm0,        mm1                             ; 13 03 12 02 11 01 10 00
+
+        movq        mm2,        mm0                             ; 13 03 12 02 11 01 10 00
+        punpcklwd   mm0,        mm6                             ; 31 21 11 01 30 20 10 00
+
+        punpckhwd   mm2,        mm6                             ; 33 23 13 03 32 22 12 02
+        movq        mm1,        mm0                             ; 13 03 12 02 11 01 10 00
+
+        punpckldq   mm0,        mm4                             ; 70 60 50 40 30 20 10 00       = p1
+        movq        mm3,        mm2                             ; 33 23 13 03 32 22 12 02
+
+        punpckhdq   mm1,        mm4                             ; 71 61 51 41 31 21 11 01       = p0
+        punpckldq   mm2,        mm5                             ; 72 62 52 42 32 22 12 02       = q0
+
+        punpckhdq   mm3,        mm5                             ; 73 63 53 43 33 23 13 03       = q1
+
+
+        ; calculate mask
+        movq        mm6,        mm0                             ; p1
+        movq        mm7,        mm3                             ; q1
+        psubusb     mm7,        mm6                             ; q1-=p1
+        psubusb     mm6,        mm3                             ; p1-=q1
+        por         mm6,        mm7                             ; abs(p1-q1)
+        pand        mm6,        [GLOBAL(tfe)]                   ; set lsb of each byte to zero
+        psrlw       mm6,        1                               ; abs(p1-q1)/2
+
+        movq        mm5,        mm1                             ; p0
+        movq        mm4,        mm2                             ; q0
+
+        psubusb     mm5,        mm2                             ; p0-=q0
+        psubusb     mm4,        mm1                             ; q0-=p0
+
+        por         mm5,        mm4                             ; abs(p0 - q0)
+        paddusb     mm5,        mm5                             ; abs(p0-q0)*2
+        paddusb     mm5,        mm6                             ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+        mov         rdx,        arg(2) ;blimit                          ; get blimit
+        movq        mm7,        [rdx]
+
+        psubusb     mm5,        mm7                             ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
+        pxor        mm7,        mm7
+        pcmpeqb     mm5,        mm7                             ; mm5 = mask
+
+        ; start work on filters
+        movq        t0,         mm0
+        movq        t1,         mm3
+
+        pxor        mm0,        [GLOBAL(t80)]                   ; p1 offset to convert to signed values
+        pxor        mm3,        [GLOBAL(t80)]                   ; q1 offset to convert to signed values
+
+        psubsb      mm0,        mm3                             ; p1 - q1
+        movq        mm6,        mm1                             ; p0
+
+        movq        mm7,        mm2                             ; q0
+        pxor        mm6,        [GLOBAL(t80)]                   ; offset to convert to signed values
+
+        pxor        mm7,        [GLOBAL(t80)]                   ; offset to convert to signed values
+        movq        mm3,        mm7                             ; offset (signed) q0
+
+        psubsb      mm7,        mm6                             ; q0 - p0
+        paddsb      mm0,        mm7                             ; p1 - q1 + 1 * (q0 - p0)
+
+        paddsb      mm0,        mm7                             ; p1 - q1 + 2 * (q0 - p0)
+        paddsb      mm0,        mm7                             ; p1 - q1 + 3 * (q0 - p0)
+
+        pand        mm5,        mm0                             ; mask filter values we don't care about
+
+        paddsb      mm5,        [GLOBAL(t4)]                    ;  3* (q0 - p0) + (p1 - q1) + 4
+
+        movq        mm0,        mm5                             ; get a copy of filters
+        psllw       mm0,        8                               ; shift left 8
+        psraw       mm0,        3                               ; arithmetic shift right 3
+        psrlw       mm0,        8
+
+        movq        mm7,        mm5                             ; get a copy of filters
+        psraw       mm7,        11                              ; arithmetic shift right 11
+        psllw       mm7,        8                               ; shift left 8 to put it back
+
+        por         mm0,        mm7                             ; put the two together to get result
+
+        psubsb      mm3,        mm0                             ; q0-= q0sz add
+        pxor        mm3,        [GLOBAL(t80)]                   ; unoffset
+
+        ; now do +3 side
+        psubsb      mm5, [GLOBAL(t1s)]                          ; +3 instead of +4
+
+        movq        mm0, mm5                                    ; get a copy of filters
+        psllw       mm0, 8                                      ; shift left 8
+        psraw       mm0, 3                                      ; arithmetic shift right 3
+        psrlw       mm0, 8
+
+        psraw       mm5, 11                                     ; arithmetic shift right 11
+        psllw       mm5, 8                                      ; shift left 8 to put it back
+        por         mm0, mm5                                    ; put the two together to get result
+
+        paddsb      mm6, mm0                                    ; p0+= p0 add
+        pxor        mm6, [GLOBAL(t80)]                          ; unoffset
+
+
+        movq        mm0,        t0
+        movq        mm4,        t1
+
+        ; mm0 = 70 60 50 40 30 20 10 00
+        ; mm6 = 71 61 51 41 31 21 11 01
+        ; mm3 = 72 62 52 42 32 22 12 02
+        ; mm4 = 73 63 53 43 33 23 13 03
+        ; transpose back to write out
+
+        movq        mm1,        mm0                         ;
+        punpcklbw   mm0,        mm6                         ; 31 30 21 20 11 10 01 00
+
+        punpckhbw   mm1,        mm6                         ; 71 70 61 60 51 50 41 40
+        movq        mm2,        mm3                         ;
+
+        punpcklbw   mm2,        mm4                         ; 33 32 23 22 13 12 03 02
+        movq        mm5,        mm1                         ; 71 70 61 60 51 50 41 40
+
+        punpckhbw   mm3,        mm4                         ; 73 72 63 62 53 52 43 42
+        movq        mm6,        mm0                         ; 31 30 21 20 11 10 01 00
+
+        punpcklwd   mm0,        mm2                         ; 13 12 11 10 03 02 01 00
+        punpckhwd   mm6,        mm2                         ; 33 32 31 30 23 22 21 20
+
+        movd        [rsi+rax*4], mm0                        ; write 03 02 01 00
+        punpcklwd   mm1,        mm3                         ; 53 52 51 50 43 42 41 40
+
+        psrlq       mm0,        32                          ; xx xx xx xx 13 12 11 10
+        punpckhwd   mm5,        mm3                         ; 73 72 71 70 63 62 61 60
+
+        movd        [rdi+rax*4], mm0                        ; write 13 12 11 10
+        movd        [rsi+rax*2], mm6                        ; write 23 22 21 20
+
+        psrlq       mm6,        32                          ; 33 32 31 30
+        movd        [rsi],      mm1                         ; write 43 42 41 40
+
+        movd        [rsi + rax], mm6                        ; write 33 32 31 30
+        neg         rax
+
+        movd        [rsi + rax*2], mm5                      ; write 63 62 61 60
+        psrlq       mm1,        32                          ; 53 52 51 50
+
+        movd        [rdi],      mm1                         ; write out 53 52 51 50
+        psrlq       mm5,        32                          ; 73 72 71 70
+
+        movd        [rdi + rax*2], mm5                      ; write 73 72 71 70
+
+        lea         rsi,        [rsi+rax*8]                 ; next 8
+
+        dec         rcx
+        jnz         .nexts8_v
+
+    add rsp, 32
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+
+;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr,
+;                  int y_stride,
+;                  loop_filter_info *lfi)
+;{
+;
+;
+;    vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
+;    vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
+;    vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
+;}
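+; Editor's note: the commented-out C above appears to predate the
+; 3-argument prototype of vp9_loop_filter_simple_vertical_edge_mmx in
+; this file (it still passes lfi->lim, lfi->thr and a count), so it
+; documents intent rather than a callable signature.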
+
+SECTION_RODATA
+align 16
+tfe:
+    times 8 db 0xfe
+align 16
+t80:
+    times 8 db 0x80
+align 16
+t1s:
+    times 8 db 0x01
+align 16
+t3:
+    times 8 db 0x03
+align 16
+t4:
+    times 8 db 0x04
+align 16
+ones:
+    times 4 dw 0x0001
+align 16
+s27:
+    times 4 dw 0x1b00
+align 16
+s18:
+    times 4 dw 0x1200
+align 16
+s9:
+    times 4 dw 0x0900
+align 16
+s63:
+    times 4 dw 0x003f
--- /dev/null
+++ b/vp9/common/x86/loopfilter_sse2.asm
@@ -1,0 +1,1238 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; Use of pmaxub instead of psubusb to compute the filter mask follows
+; the approach seen in FFmpeg's VP8 decoder (ffvp8).
+
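+; Editor's note (hedged sketch, not part of the original patch): the
+; pmaxub trick keeps a running byte-wise maximum of all the absolute
+; differences and tests the limit once, instead of the psubusb + por
+; accumulation used in the MMX file:
+;
+;   m    = max(abs(q3-q2), abs(q2-q1), abs(q1-q0),
+;              abs(p3-p2), abs(p2-p1), abs(p1-p0));
+;   mask = (m <= limit) && (abs(p0-q0)*2 + abs(p1-q1)/2 <= blimit);
+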
+%macro LFH_FILTER_AND_HEV_MASK 1
+%if %1
+        movdqa      xmm2,                   [rdi+2*rax]       ; q3
+        movdqa      xmm1,                   [rsi+2*rax]       ; q2
+        movdqa      xmm4,                   [rsi+rax]         ; q1
+        movdqa      xmm5,                   [rsi]             ; q0
+        neg         rax                     ; negate pitch to deal with above border
+%else
+        movlps      xmm2,                   [rsi + rcx*2]     ; q3
+        movlps      xmm1,                   [rsi + rcx]       ; q2
+        movlps      xmm4,                   [rsi]             ; q1
+        movlps      xmm5,                   [rsi + rax]       ; q0
+
+        movhps      xmm2,                   [rdi + rcx*2]
+        movhps      xmm1,                   [rdi + rcx]
+        movhps      xmm4,                   [rdi]
+        movhps      xmm5,                   [rdi + rax]
+
+        lea         rsi,                    [rsi + rax*4]
+        lea         rdi,                    [rdi + rax*4]
+
+        movdqa      XMMWORD PTR [rsp],      xmm1              ; store q2
+        movdqa      XMMWORD PTR [rsp + 16], xmm4              ; store q1
+%endif
+
+        movdqa      xmm6,                   xmm1              ; q2
+        movdqa      xmm3,                   xmm4              ; q1
+
+        psubusb     xmm1,                   xmm2              ; q2-=q3
+        psubusb     xmm2,                   xmm6              ; q3-=q2
+
+        psubusb     xmm4,                   xmm6              ; q1-=q2
+        psubusb     xmm6,                   xmm3              ; q2-=q1
+
+        por         xmm4,                   xmm6              ; abs(q2-q1)
+        por         xmm1,                   xmm2              ; abs(q3-q2)
+
+        movdqa      xmm0,                   xmm5              ; q0
+        pmaxub      xmm1,                   xmm4
+
+        psubusb     xmm5,                   xmm3              ; q0-=q1
+        psubusb     xmm3,                   xmm0              ; q1-=q0
+
+        por         xmm5,                   xmm3              ; abs(q0-q1)
+        movdqa      t0,                     xmm5              ; save to t0
+
+        pmaxub      xmm1,                   xmm5
+
+%if %1
+        movdqa      xmm2,                   [rsi+4*rax]       ; p3
+        movdqa      xmm4,                   [rdi+4*rax]       ; p2
+        movdqa      xmm6,                   [rsi+2*rax]       ; p1
+%else
+        movlps      xmm2,                   [rsi + rax]       ; p3
+        movlps      xmm4,                   [rsi]             ; p2
+        movlps      xmm6,                   [rsi + rcx]       ; p1
+
+        movhps      xmm2,                   [rdi + rax]
+        movhps      xmm4,                   [rdi]
+        movhps      xmm6,                   [rdi + rcx]
+
+        movdqa      XMMWORD PTR [rsp + 32], xmm4              ; store p2
+        movdqa      XMMWORD PTR [rsp + 48], xmm6              ; store p1
+%endif
+
+        movdqa      xmm5,                   xmm4              ; p2
+        movdqa      xmm3,                   xmm6              ; p1
+
+        psubusb     xmm4,                   xmm2              ; p2-=p3
+        psubusb     xmm2,                   xmm5              ; p3-=p2
+
+        psubusb     xmm3,                   xmm5              ; p1-=p2
+        pmaxub      xmm1,                   xmm4              ; abs(p3 - p2)
+
+        psubusb     xmm5,                   xmm6              ; p2-=p1
+        pmaxub      xmm1,                   xmm2              ; abs(p3 - p2)
+
+        pmaxub      xmm1,                   xmm5              ; abs(p2 - p1)
+        movdqa      xmm2,                   xmm6              ; p1
+
+        pmaxub      xmm1,                   xmm3              ; abs(p2 - p1)
+%if %1
+        movdqa      xmm4,                   [rsi+rax]         ; p0
+        movdqa      xmm3,                   [rdi]             ; q1
+%else
+        movlps      xmm4,                   [rsi + rcx*2]     ; p0
+        movhps      xmm4,                   [rdi + rcx*2]
+        movdqa      xmm3,                   q1                ; q1
+%endif
+
+        movdqa      xmm5,                   xmm4              ; p0
+        psubusb     xmm4,                   xmm6              ; p0-=p1
+
+        psubusb     xmm6,                   xmm5              ; p1-=p0
+
+        por         xmm6,                   xmm4              ; abs(p1 - p0)
+        mov         rdx,                    arg(2)            ; get blimit
+
+        movdqa        t1,                   xmm6              ; save to t1
+
+        movdqa      xmm4,                   xmm3              ; q1
+        pmaxub      xmm1,                   xmm6
+
+        psubusb     xmm3,                   xmm2              ; q1-=p1
+        psubusb     xmm2,                   xmm4              ; p1-=q1
+
+        psubusb     xmm1,                   xmm7
+        por         xmm2,                   xmm3              ; abs(p1-q1)
+
+        movdqa      xmm7,                   XMMWORD PTR [rdx] ; blimit
+
+        movdqa      xmm3,                   xmm0              ; q0
+        pand        xmm2,                   [GLOBAL(tfe)]     ; set lsb of each byte to zero
+
+        mov         rdx,                    arg(4)            ; hev get thresh
+
+        movdqa      xmm6,                   xmm5              ; p0
+        psrlw       xmm2,                   1                 ; abs(p1-q1)/2
+
+        psubusb     xmm5,                   xmm3              ; p0-=q0
+
+        psubusb     xmm3,                   xmm6              ; q0-=p0
+        por         xmm5,                   xmm3              ; abs(p0 - q0)
+
+        paddusb     xmm5,                   xmm5              ; abs(p0-q0)*2
+
+        movdqa      xmm4,                   t0                ; hev get abs (q1 - q0)
+
+        movdqa      xmm3,                   t1                ; get abs (p1 - p0)
+
+        paddusb     xmm5,                   xmm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+        movdqa      xmm2,                   XMMWORD PTR [rdx] ; hev
+
+        psubusb     xmm5,                   xmm7              ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
+        psubusb     xmm4,                   xmm2              ; hev
+
+        psubusb     xmm3,                   xmm2              ; hev
+        por         xmm1,                   xmm5
+
+        pxor        xmm7,                   xmm7
+        paddb       xmm4,                   xmm3              ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+
+        pcmpeqb     xmm4,                   xmm5              ; hev
+        pcmpeqb     xmm3,                   xmm3              ; hev
+
+        pcmpeqb     xmm1,                   xmm7              ; mask xmm1
+        pxor        xmm4,                   xmm3              ; hev
+%endmacro
+
+%macro B_FILTER 1
+%if %1 == 0
+        movdqa      xmm2,                   p1                ; p1
+        movdqa      xmm7,                   q1                ; q1
+%elif %1 == 1
+        movdqa      xmm2,                   [rsi+2*rax]       ; p1
+        movdqa      xmm7,                   [rdi]             ; q1
+%elif %1 == 2
+        lea         rdx,                    srct
+
+        movdqa      xmm2,                   [rdx]             ; p1
+        movdqa      xmm7,                   [rdx+48]          ; q1
+        movdqa      xmm6,                   [rdx+16]          ; p0
+        movdqa      xmm0,                   [rdx+32]          ; q0
+%endif
+
+        pxor        xmm2,                   [GLOBAL(t80)]     ; p1 offset to convert to signed values
+        pxor        xmm7,                   [GLOBAL(t80)]     ; q1 offset to convert to signed values
+
+        psubsb      xmm2,                   xmm7              ; p1 - q1
+        pxor        xmm6,                   [GLOBAL(t80)]     ; offset to convert to signed values
+
+        pand        xmm2,                   xmm4              ; high var mask (hvm)(p1 - q1)
+        pxor        xmm0,                   [GLOBAL(t80)]     ; offset to convert to signed values
+
+        movdqa      xmm3,                   xmm0              ; q0
+        psubsb      xmm0,                   xmm6              ; q0 - p0
+
+        paddsb      xmm2,                   xmm0              ; 1 * (q0 - p0) + hvm(p1 - q1)
+
+        paddsb      xmm2,                   xmm0              ; 2 * (q0 - p0) + hvm(p1 - q1)
+
+        paddsb      xmm2,                   xmm0              ; 3 * (q0 - p0) + hvm(p1 - q1)
+
+        pand        xmm1,                   xmm2              ; mask filter values we don't care about
+
+        movdqa      xmm2,                   xmm1
+
+        paddsb      xmm1,                   [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+        paddsb      xmm2,                   [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+
+        punpckhbw   xmm5,                   xmm2              ; axbxcxdx
+        punpcklbw   xmm2,                   xmm2              ; exfxgxhx
+
+        punpcklbw   xmm0,                   xmm1              ; exfxgxhx
+        psraw       xmm5,                   11                ; sign extended shift right by 3
+
+        punpckhbw   xmm1,                   xmm1              ; axbxcxdx
+        psraw       xmm2,                   11                ; sign extended shift right by 3
+
+        packsswb    xmm2,                   xmm5              ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+        psraw       xmm0,                   11                ; sign extended shift right by 3
+
+        psraw       xmm1,                   11                ; sign extended shift right by 3
+        movdqa      xmm5,                   xmm0              ; save results
+
+        packsswb    xmm0,                   xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+        paddsw      xmm5,                   [GLOBAL(ones)]
+
+        paddsw      xmm1,                   [GLOBAL(ones)]
+        psraw       xmm5,                   1                 ; partial shifted one more time for 2nd tap
+
+        psraw       xmm1,                   1                 ; partial shifted one more time for 2nd tap
+
+        paddsb      xmm6,                   xmm2              ; p0+= p0 add
+        packsswb    xmm5,                   xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
+
+%if %1 == 0
+        movdqa      xmm1,                   p1                ; p1
+%elif %1 == 1
+        movdqa      xmm1,                   [rsi+2*rax]       ; p1
+%elif %1 == 2
+        movdqa      xmm1,                   [rdx]             ; p1
+%endif
+        pandn       xmm4,                   xmm5              ; high edge variance additive
+        pxor        xmm6,                   [GLOBAL(t80)]     ; unoffset
+
+        pxor        xmm1,                   [GLOBAL(t80)]     ; reoffset
+        psubsb      xmm3,                   xmm0              ; q0-= q0 add
+
+        paddsb      xmm1,                   xmm4              ; p1+= p1 add
+        pxor        xmm3,                   [GLOBAL(t80)]     ; unoffset
+
+        pxor        xmm1,                   [GLOBAL(t80)]     ; unoffset
+        psubsb      xmm7,                   xmm4              ; q1-= q1 add
+
+        pxor        xmm7,                   [GLOBAL(t80)]     ; unoffset
+%if %1 == 0
+        lea         rsi,                    [rsi + rcx*2]
+        lea         rdi,                    [rdi + rcx*2]
+        movq        MMWORD PTR [rsi],       xmm6              ; p0
+        movhps      MMWORD PTR [rdi],       xmm6
+        movq        MMWORD PTR [rsi + rax], xmm1              ; p1
+        movhps      MMWORD PTR [rdi + rax], xmm1
+        movq        MMWORD PTR [rsi + rcx], xmm3              ; q0
+        movhps      MMWORD PTR [rdi + rcx], xmm3
+        movq        MMWORD PTR [rsi + rcx*2],xmm7             ; q1
+        movhps      MMWORD PTR [rdi + rcx*2],xmm7
+%elif %1 == 1
+        movdqa      [rsi+rax],              xmm6              ; write back
+        movdqa      [rsi+2*rax],            xmm1              ; write back
+        movdqa      [rsi],                  xmm3              ; write back
+        movdqa      [rdi],                  xmm7              ; write back
+%endif
+
+%endmacro
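+
+; Editorial note: a minimal scalar sketch of the per-pixel filter B_FILTER
+; implements, assuming the usual 4-tap loop filter; all arithmetic saturates
+; to signed bytes and runs in the t80-offset (signed) domain:
+;
+;   f  = hev ? clamp(p1 - q1) : 0;           // hvm(p1 - q1)
+;   f  = clamp(f + 3 * (q0 - p0)) & mask;
+;   Filter1 = clamp(f + 4) >> 3;  q0 -= Filter1;
+;   Filter2 = clamp(f + 3) >> 3;  p0 += Filter2;
+;   u  = (Filter1 + 1) >> 1;                 // outer taps, non-hev pixels only
+;   p1 += u & ~hev;  q1 -= u & ~hev;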
+
+
+;void vp9_loop_filter_horizontal_edge_sse2
+;(
+;    unsigned char *src_ptr,
+;    int            src_pixel_step,
+;    const char    *blimit,
+;    const char    *limit,
+;    const char    *thresh,
+;    int            count
+;)
+global sym(vp9_loop_filter_horizontal_edge_sse2)
+sym(vp9_loop_filter_horizontal_edge_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 32     ; reserve 32 bytes
+    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[16];
+    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[16];
+
+        mov         rsi,                    arg(0)           ;src_ptr
+        movsxd      rax,                    dword ptr arg(1) ;src_pixel_step
+
+        mov         rdx,                    arg(3)           ;limit
+        movdqa      xmm7,                   XMMWORD PTR [rdx]
+
+        lea         rdi,                    [rsi+rax]        ; rdi points to row +1 for indirect addressing
+
+        ; calculate breakout conditions and high edge variance
+        LFH_FILTER_AND_HEV_MASK 1
+        ; filter and write back the result
+        B_FILTER 1
+
+    add rsp, 32
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_loop_filter_horizontal_edge_uv_sse2
+;(
+;    unsigned char *src_ptr,
+;    int            src_pixel_step,
+;    const char    *blimit,
+;    const char    *limit,
+;    const char    *thresh,
+;    int            count
+;)
+global sym(vp9_loop_filter_horizontal_edge_uv_sse2)
+sym(vp9_loop_filter_horizontal_edge_uv_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 96       ; reserve 96 bytes
+    %define q2  [rsp + 0]     ;__declspec(align(16)) char q2[16];
+    %define q1  [rsp + 16]    ;__declspec(align(16)) char q1[16];
+    %define p2  [rsp + 32]    ;__declspec(align(16)) char p2[16];
+    %define p1  [rsp + 48]    ;__declspec(align(16)) char p1[16];
+    %define t0  [rsp + 64]    ;__declspec(align(16)) char t0[16];
+    %define t1  [rsp + 80]    ;__declspec(align(16)) char t1[16];
+
+        mov         rsi,                    arg(0)             ; u
+        mov         rdi,                    arg(5)             ; v
+        movsxd      rax,                    dword ptr arg(1)   ; src_pixel_step
+        mov         rcx,                    rax
+        neg         rax                     ; negate pitch to deal with above border
+
+        mov         rdx,                    arg(3)             ;limit
+        movdqa      xmm7,                   XMMWORD PTR [rdx]
+
+        lea         rsi,                    [rsi + rcx]
+        lea         rdi,                    [rdi + rcx]
+
+        ; calculate breakout conditions and high edge variance
+        LFH_FILTER_AND_HEV_MASK 0
+        ; filter and write back the result
+        B_FILTER 0
+
+    add rsp, 96
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+%macro TRANSPOSE_16X8 2
+        movq        xmm4,               QWORD PTR [rsi]        ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
+        movq        xmm1,               QWORD PTR [rdi]        ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
+        movq        xmm0,               QWORD PTR [rsi+2*rax]  ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
+        movq        xmm7,               QWORD PTR [rdi+2*rax]  ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
+        movq        xmm5,               QWORD PTR [rsi+4*rax]  ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
+        movq        xmm2,               QWORD PTR [rdi+4*rax]  ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50
+
+        punpcklbw   xmm4,               xmm1            ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
+
+        movq        xmm1,               QWORD PTR [rdi+2*rcx]  ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70
+
+        movdqa      xmm3,               xmm4            ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
+        punpcklbw   xmm0,               xmm7            ; 37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20
+
+        movq        xmm7,               QWORD PTR [rsi+2*rcx]  ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60
+
+        punpcklbw   xmm5,               xmm2            ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
+%if %1
+        lea         rsi,                [rsi+rax*8]
+%else
+        mov         rsi,                arg(5)          ; v_ptr
+%endif
+
+        movdqa      xmm6,               xmm5            ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
+        punpcklbw   xmm7,               xmm1            ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
+
+        punpcklwd   xmm5,               xmm7            ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
+
+        punpckhwd   xmm6,               xmm7            ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
+%if %1
+        lea         rdi,                [rdi+rax*8]
+%else
+        lea         rsi,                [rsi - 4]
+%endif
+
+        punpcklwd   xmm3,               xmm0            ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+%if %1
+        lea         rdx,                srct
+%else
+        lea         rdi,                [rsi + rax]     ; rdi points to row +1 for indirect addressing
+%endif
+
+        movdqa      xmm2,               xmm3            ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+        punpckhwd   xmm4,               xmm0            ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
+
+        movdqa      xmm7,               xmm4            ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
+        punpckhdq   xmm3,               xmm5            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+
+        punpckhdq   xmm7,               xmm6            ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
+
+        punpckldq   xmm4,               xmm6            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
+
+        punpckldq   xmm2,               xmm5            ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
+
+        movdqa      t0,                 xmm2            ; save to free XMM2
+        movq        xmm2,               QWORD PTR [rsi]       ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
+        movq        xmm6,               QWORD PTR [rdi]       ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
+        movq        xmm0,               QWORD PTR [rsi+2*rax] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
+        movq        xmm5,               QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
+        movq        xmm1,               QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0
+
+        punpcklbw   xmm2,               xmm6            ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
+
+        movq        xmm6,               QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0
+
+        punpcklbw   xmm0,               xmm5                  ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0
+
+        movq        xmm5,               QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0
+
+        punpcklbw   xmm1,               xmm6            ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0
+
+        movq        xmm6,               QWORD PTR [rdi+2*rcx] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0
+
+        punpcklbw   xmm5,               xmm6            ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0
+
+        movdqa      xmm6,               xmm1            ;
+        punpckhwd   xmm6,               xmm5            ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4
+
+        punpcklwd   xmm1,               xmm5            ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
+        movdqa      xmm5,               xmm2            ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
+
+        punpcklwd   xmm5,               xmm0            ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
+
+        punpckhwd   xmm2,               xmm0            ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
+
+        movdqa      xmm0,               xmm5
+        punpckldq   xmm0,               xmm1            ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
+
+        punpckhdq   xmm5,               xmm1            ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
+        movdqa      xmm1,               xmm2            ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
+
+        punpckldq   xmm1,               xmm6            ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84
+
+        punpckhdq   xmm2,               xmm6            ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
+        movdqa      xmm6,               xmm7            ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
+
+        punpcklqdq  xmm6,               xmm2            ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
+
+        punpckhqdq  xmm7,               xmm2            ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07
+%if %2
+        movdqa      xmm2,               xmm3            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+        punpcklqdq  xmm2,               xmm5            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+
+        punpckhqdq  xmm3,               xmm5            ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+
+        movdqa      [rdx],              xmm2            ; save 2
+
+        movdqa      xmm5,               xmm4            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
+        punpcklqdq  xmm4,               xmm1            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+
+        movdqa      [rdx+16],           xmm3            ; save 3
+
+        punpckhqdq  xmm5,               xmm1            ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
+
+        movdqa      [rdx+32],           xmm4            ; save 4
+        movdqa      [rdx+48],           xmm5            ; save 5
+        movdqa      xmm1,               t0              ; reload rows 0-1 saved in t0
+
+        movdqa      xmm2,               xmm1            ;
+        punpckhqdq  xmm1,               xmm0            ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+
+        punpcklqdq  xmm2,               xmm0            ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+%else
+        movdqa      [rdx+112],          xmm7            ; save 7
+
+        movdqa      [rdx+96],           xmm6            ; save 6
+
+        movdqa      xmm2,               xmm3            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+        punpckhqdq  xmm3,               xmm5            ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+
+        punpcklqdq  xmm2,               xmm5            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+
+        movdqa      [rdx+32],           xmm2            ; save 2
+
+        movdqa      xmm5,               xmm4            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
+        punpcklqdq  xmm4,               xmm1            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+
+        movdqa      [rdx+48],           xmm3            ; save 3
+
+        punpckhqdq  xmm5,               xmm1            ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
+
+        movdqa      [rdx+64],           xmm4            ; save 4
+        movdqa      [rdx+80],           xmm5            ; save 5
+        movdqa      xmm1,               t0              ; reload rows 0-1 saved in t0
+
+        movdqa      xmm2,               xmm1
+        punpckhqdq  xmm1,               xmm0            ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+
+        punpcklqdq  xmm2,               xmm0            ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+
+        movdqa      [rdx+16],           xmm1
+
+        movdqa      [rdx],              xmm2
+%endif
+%endmacro
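+
+; Editorial note: conceptually TRANSPOSE_16X8 performs, for a 16-row by
+; 8-column block of bytes (illustrative C):
+;
+;   for (row = 0; row < 16; row++)
+;     for (col = 0; col < 8; col++)
+;       out[col][row] = in[row][col];
+;
+; via the punpck interleave cascade above. %1 picks the addressing variant
+; (advance within the y plane vs. switching rsi to the v plane); %2 == 1
+; stores only the four middle rows (p1, p0, q0, q1) to the srct scratch
+; area, while %2 == 0 stores all eight transposed rows.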
+
+%macro LFV_FILTER_MASK_HEV_MASK 1
+        movdqa      xmm0,               xmm6            ; q2
+        psubusb     xmm0,               xmm7            ; q2-q3
+
+        psubusb     xmm7,               xmm6            ; q3-q2
+        movdqa      xmm4,               xmm5            ; q1
+
+        por         xmm7,               xmm0            ; abs (q3-q2)
+        psubusb     xmm4,               xmm6            ; q1-q2
+
+        movdqa      xmm0,               xmm1
+        psubusb     xmm6,               xmm5            ; q2-q1
+
+        por         xmm6,               xmm4            ; abs (q2-q1)
+        psubusb     xmm0,               xmm2            ; p2 - p3;
+
+        psubusb     xmm2,               xmm1            ; p3 - p2;
+        por         xmm0,               xmm2            ; abs(p2-p3)
+%if %1
+        movdqa      xmm2,               [rdx]           ; p1
+%else
+        movdqa      xmm2,               [rdx+32]        ; p1
+%endif
+        movdqa      xmm5,               xmm2            ; p1
+        pmaxub      xmm0,               xmm7
+
+        psubusb     xmm5,               xmm1            ; p1-p2
+        psubusb     xmm1,               xmm2            ; p2-p1
+
+        movdqa      xmm7,               xmm3            ; p0
+        psubusb     xmm7,               xmm2            ; p0-p1
+
+        por         xmm1,               xmm5            ; abs(p2-p1)
+        pmaxub      xmm0,               xmm6
+
+        pmaxub      xmm0,               xmm1
+        movdqa      xmm1,               xmm2            ; p1
+
+        psubusb     xmm2,               xmm3            ; p1-p0
+        lea         rdx,                srct
+
+        por         xmm2,               xmm7            ; abs(p1-p0)
+
+        movdqa      t0,                 xmm2            ; save abs(p1-p0)
+
+        pmaxub      xmm0,               xmm2
+
+%if %1
+        movdqa      xmm5,               [rdx+32]        ; q0
+        movdqa      xmm7,               [rdx+48]        ; q1
+%else
+        movdqa      xmm5,               [rdx+64]        ; q0
+        movdqa      xmm7,               [rdx+80]        ; q1
+%endif
+        mov         rdx,                arg(3)          ; limit
+
+        movdqa      xmm6,               xmm5            ; q0
+        movdqa      xmm2,               xmm7            ; q1
+
+        psubusb     xmm5,               xmm7            ; q0-q1
+        psubusb     xmm7,               xmm6            ; q1-q0
+
+        por         xmm7,               xmm5            ; abs(q1-q0)
+
+        movdqa      t1,                 xmm7            ; save abs(q1-q0)
+
+        movdqa      xmm4,               XMMWORD PTR [rdx]; limit
+
+        pmaxub      xmm0,               xmm7
+        mov         rdx,                arg(2)          ; blimit
+
+        psubusb     xmm0,               xmm4
+        movdqa      xmm5,               xmm2            ; q1
+
+        psubusb     xmm5,               xmm1            ; q1-=p1
+        psubusb     xmm1,               xmm2            ; p1-=q1
+
+        por         xmm5,               xmm1            ; abs(p1-q1)
+        movdqa      xmm1,               xmm3            ; p0
+
+        pand        xmm5,               [GLOBAL(tfe)]   ; set lsb of each byte to zero
+        psubusb     xmm1,               xmm6            ; p0-q0
+
+        psrlw       xmm5,               1               ; abs(p1-q1)/2
+        psubusb     xmm6,               xmm3            ; q0-p0
+
+        movdqa      xmm4,               XMMWORD PTR [rdx]; blimit
+
+        mov         rdx,                arg(4)          ; get thresh
+
+        por         xmm1,               xmm6            ; abs(q0-p0)
+
+        movdqa      xmm6,               t0              ; get abs (p1 - p0)
+
+        paddusb     xmm1,               xmm1            ; abs(q0-p0)*2
+
+        movdqa      xmm3,               t1              ; get abs (q1 - q0)
+
+        movdqa      xmm7,               XMMWORD PTR [rdx] ; thresh
+
+        paddusb     xmm1,               xmm5            ; abs (p0 - q0) *2 + abs(p1-q1)/2
+        psubusb     xmm6,               xmm7            ; abs(p1 - p0) > thresh
+
+        psubusb     xmm3,               xmm7            ; abs(q1 - q0) > thresh
+
+        psubusb     xmm1,               xmm4            ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
+        por         xmm6,               xmm3            ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+
+        por         xmm1,               xmm0            ; mask
+        pcmpeqb     xmm6,               xmm0
+
+        pxor        xmm0,               xmm0
+        pcmpeqb     xmm4,               xmm4
+
+        pcmpeqb     xmm1,               xmm0
+        pxor        xmm4,               xmm6
+%endmacro
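+
+; Editorial note: in scalar terms the macro above computes, per pixel
+; (illustrative):
+;
+;   mask = max(|p3-p2|, |p2-p1|, |p1-p0|, |q1-q0|, |q2-q1|, |q3-q2|) <= limit
+;       && |p0-q0| * 2 + |p1-q1| / 2 <= blimit;          // all-ones if true
+;   hev  = |p1-p0| > thresh || |q1-q0| > thresh;         // high edge variance
+;
+; leaving mask in xmm1 and hev in xmm4 for B_FILTER.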
+
+%macro BV_TRANSPOSE 0
+        ; xmm1 =    f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+        ; xmm6 =    f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+        ; xmm3 =    f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+        ; xmm7 =    f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
+        movdqa      xmm2,               xmm1            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+        punpcklbw   xmm2,               xmm6            ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+
+        movdqa      xmm4,               xmm3            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+        punpckhbw   xmm1,               xmm6            ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+        punpcklbw   xmm4,               xmm7            ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
+
+        punpckhbw   xmm3,               xmm7            ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
+
+        movdqa      xmm6,               xmm2            ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+        punpcklwd   xmm2,               xmm4            ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
+
+        punpckhwd   xmm6,               xmm4            ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
+        movdqa      xmm5,               xmm1            ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+        punpcklwd   xmm1,               xmm3            ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
+
+        punpckhwd   xmm5,               xmm3            ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
+        ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
+        ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
+        ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
+        ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
+%endmacro
+
+%macro BV_WRITEBACK 2
+        movd        [rsi+2],            %1
+        psrldq      %1,                 4
+
+        movd        [rdi+2],            %1
+        psrldq      %1,                 4
+
+        movd        [rsi+2*rax+2],      %1
+        psrldq      %1,                 4
+
+        movd        [rdi+2*rax+2],      %1
+
+        movd        [rsi+4*rax+2],      %2
+        psrldq      %2,                 4
+
+        movd        [rdi+4*rax+2],      %2
+        psrldq      %2,                 4
+
+        movd        [rsi+2*rcx+2],      %2
+        psrldq      %2,                 4
+
+        movd        [rdi+2*rcx+2],      %2
+%endmacro
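+
+; Editorial note: BV_WRITEBACK scatters the re-transposed filter output back
+; into the image: each register holds four bytes (p1 p0 q0 q1) per row for
+; four rows, stored as one dword per row at x offset +2 (rsi points at x-4,
+; so the filtered columns start at x-2).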
+
+
+;void vp9_loop_filter_vertical_edge_sse2
+;(
+;    unsigned char *src_ptr,
+;    int            src_pixel_step,
+;    const char    *blimit,
+;    const char    *limit,
+;    const char    *thresh,
+;    int            count
+;)
+global sym(vp9_loop_filter_vertical_edge_sse2)
+sym(vp9_loop_filter_vertical_edge_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub             rsp, 96      ; reserve 96 bytes
+    %define t0      [rsp + 0]    ;__declspec(align(16)) char t0[16];
+    %define t1      [rsp + 16]   ;__declspec(align(16)) char t1[16];
+    %define srct    [rsp + 32]   ;__declspec(align(16)) char srct[64];
+
+        mov         rsi,        arg(0)                  ; src_ptr
+        movsxd      rax,        dword ptr arg(1)        ; src_pixel_step
+
+        lea         rsi,        [rsi - 4]
+        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
+        lea         rcx,        [rax*2+rax]
+
+        ;transpose 16x8 to 8x16, and store the 8-line result on stack.
+        TRANSPOSE_16X8 1, 1
+
+        ; calculate filter mask and high edge variance
+        LFV_FILTER_MASK_HEV_MASK 1
+
+        ; start work on filters
+        B_FILTER 2
+
+        ; transpose and write back - only works on q1, q0, p0, p1
+        BV_TRANSPOSE
+        ; store 16-line result
+
+        lea         rdx,        [rax]
+        neg         rdx
+
+        BV_WRITEBACK xmm1, xmm5
+
+        lea         rsi,        [rsi+rdx*8]
+        lea         rdi,        [rdi+rdx*8]
+        BV_WRITEBACK xmm2, xmm6
+
+    add rsp, 96
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_loop_filter_vertical_edge_uv_sse2
+;(
+;    unsigned char *u,
+;    int            src_pixel_step,
+;    const char    *blimit,
+;    const char    *limit,
+;    const char    *thresh,
+;    unsigned char *v
+;)
+global sym(vp9_loop_filter_vertical_edge_uv_sse2)
+sym(vp9_loop_filter_vertical_edge_uv_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub             rsp, 96      ; reserve 96 bytes
+    %define t0      [rsp + 0]    ;__declspec(align(16)) char t0[16];
+    %define t1      [rsp + 16]   ;__declspec(align(16)) char t1[16];
+    %define srct    [rsp + 32]   ;__declspec(align(16)) char srct[64];
+
+        mov         rsi,        arg(0)                  ; u_ptr
+        movsxd      rax,        dword ptr arg(1)        ; src_pixel_step
+
+        lea         rsi,        [rsi - 4]
+        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
+        lea         rcx,        [rax+2*rax]
+
+        lea         rdx,        srct
+
+        ;transpose 16x8 to 8x16, and store the 8-line result on stack.
+        TRANSPOSE_16X8 0, 1
+
+        ; calculate filter mask and high edge variance
+        LFV_FILTER_MASK_HEV_MASK 1
+
+        ; start work on filters
+        B_FILTER 2
+
+        ; transpose and write back - only works on q1, q0, p0, p1
+        BV_TRANSPOSE
+
+        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
+
+        ; store 16-line result
+        BV_WRITEBACK xmm1, xmm5
+
+        mov         rsi,        arg(0)                  ; u_ptr
+        lea         rsi,        [rsi - 4]
+        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
+        BV_WRITEBACK xmm2, xmm6
+
+    add rsp, 96
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_loop_filter_simple_horizontal_edge_sse2
+;(
+;    unsigned char *src_ptr,
+;    int  src_pixel_step,
+;    const char *blimit
+;)
+global sym(vp9_loop_filter_simple_horizontal_edge_sse2)
+sym(vp9_loop_filter_simple_horizontal_edge_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 3
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rsi, arg(0)             ;src_ptr
+        movsxd      rax, dword ptr arg(1)   ;src_pixel_step
+        mov         rdx, arg(2)             ;blimit
+        movdqa      xmm3, XMMWORD PTR [rdx]
+
+        mov         rdi, rsi                ; rdi points to row +1 for indirect addressing
+        add         rdi, rax
+        neg         rax
+
+        ; calculate mask
+        movdqa      xmm1, [rsi+2*rax]       ; p1
+        movdqa      xmm0, [rdi]             ; q1
+        movdqa      xmm2, xmm1
+        movdqa      xmm7, xmm0
+        movdqa      xmm4, xmm0
+        psubusb     xmm0, xmm1              ; q1-=p1
+        psubusb     xmm1, xmm4              ; p1-=q1
+        por         xmm1, xmm0              ; abs(p1-q1)
+        pand        xmm1, [GLOBAL(tfe)]     ; set lsb of each byte to zero
+        psrlw       xmm1, 1                 ; abs(p1-q1)/2
+
+        movdqa      xmm5, [rsi+rax]         ; p0
+        movdqa      xmm4, [rsi]             ; q0
+        movdqa      xmm0, xmm4              ; q0
+        movdqa      xmm6, xmm5              ; p0
+        psubusb     xmm5, xmm4              ; p0-=q0
+        psubusb     xmm4, xmm6              ; q0-=p0
+        por         xmm5, xmm4              ; abs(p0 - q0)
+        paddusb     xmm5, xmm5              ; abs(p0-q0)*2
+        paddusb     xmm5, xmm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+        psubusb     xmm5, xmm3              ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
+        pxor        xmm3, xmm3
+        pcmpeqb     xmm5, xmm3
+
+        ; start work on filters
+        pxor        xmm2, [GLOBAL(t80)]     ; p1 offset to convert to signed values
+        pxor        xmm7, [GLOBAL(t80)]     ; q1 offset to convert to signed values
+        psubsb      xmm2, xmm7              ; p1 - q1
+
+        pxor        xmm6, [GLOBAL(t80)]     ; offset to convert to signed values
+        pxor        xmm0, [GLOBAL(t80)]     ; offset to convert to signed values
+        movdqa      xmm3, xmm0              ; q0
+        psubsb      xmm0, xmm6              ; q0 - p0
+        paddsb      xmm2, xmm0              ; p1 - q1 + 1 * (q0 - p0)
+        paddsb      xmm2, xmm0              ; p1 - q1 + 2 * (q0 - p0)
+        paddsb      xmm2, xmm0              ; p1 - q1 + 3 * (q0 - p0)
+        pand        xmm5, xmm2              ; mask filter values we don't care about
+
+        ; do + 4 side
+        paddsb      xmm5, [GLOBAL(t4)]      ; 3* (q0 - p0) + (p1 - q1) + 4
+
+        movdqa      xmm0, xmm5              ; get a copy of filters
+        psllw       xmm0, 8                 ; shift left 8
+        psraw       xmm0, 3                 ; arithmetic shift right 3
+        psrlw       xmm0, 8
+        movdqa      xmm1, xmm5              ; get a copy of filters
+        psraw       xmm1, 11                ; arithmetic shift right 11
+        psllw       xmm1, 8                 ; shift left 8 to put it back
+
+        por         xmm0, xmm1              ; put the two together to get result
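+        ; Editorial note: SSE2 lacks a per-byte arithmetic shift, so the five
+        ; word ops above emulate one on the two signed bytes of each 16-bit
+        ; lane, roughly (illustrative C):
+        ;   lo_out = (int8_t)lo >> 3;   // psllw 8, psraw 3, psrlw 8
+        ;   hi_out = (int8_t)hi >> 3;   // psraw 11, psllw 8
+        ; so xmm0 now holds Filter1 = (f + 4) >> 3 for all 16 pixels.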
+
+        psubsb      xmm3, xmm0              ; q0-= q0 add
+        pxor        xmm3, [GLOBAL(t80)]     ; unoffset
+        movdqa      [rsi], xmm3             ; write back
+
+        ; now do +3 side
+        psubsb      xmm5, [GLOBAL(t1s)]     ; +3 instead of +4
+
+        movdqa      xmm0, xmm5              ; get a copy of filters
+        psllw       xmm0, 8                 ; shift left 8
+        psraw       xmm0, 3                 ; arithmetic shift right 3
+        psrlw       xmm0, 8
+        psraw       xmm5, 11                ; arithmetic shift right 11
+        psllw       xmm5, 8                 ; shift left 8 to put it back
+        por         xmm0, xmm5              ; put the two together to get result
+
+
+        paddsb      xmm6, xmm0              ; p0+= p0 add
+        pxor        xmm6, [GLOBAL(t80)]     ; unoffset
+        movdqa      [rsi+rax], xmm6         ; write back
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_loop_filter_simple_vertical_edge_sse2
+;(
+;    unsigned char *src_ptr,
+;    int  src_pixel_step,
+;    const char *blimit
+;)
+global sym(vp9_loop_filter_simple_vertical_edge_sse2)
+sym(vp9_loop_filter_simple_vertical_edge_sse2):
+    push        rbp         ; save old base pointer value.
+    mov         rbp, rsp    ; set new base pointer value.
+    SHADOW_ARGS_TO_STACK 3
+    SAVE_XMM 7
+    GET_GOT     rbx         ; save callee-saved reg
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 32                         ; reserve 32 bytes
+    %define t0  [rsp + 0]    ;__declspec(align(16)) char t0[16];
+    %define t1  [rsp + 16]   ;__declspec(align(16)) char t1[16];
+
+        mov         rsi, arg(0) ;src_ptr
+        movsxd      rax, dword ptr arg(1) ;src_pixel_step
+
+        lea         rsi,        [rsi - 2 ]
+        lea         rdi,        [rsi + rax]
+        lea         rdx,        [rsi + rax*4]
+        lea         rcx,        [rdx + rax]
+
+        movd        xmm0,       [rsi]                   ; (high 96 bits unused) 03 02 01 00
+        movd        xmm1,       [rdx]                   ; (high 96 bits unused) 43 42 41 40
+        movd        xmm2,       [rdi]                   ; 13 12 11 10
+        movd        xmm3,       [rcx]                   ; 53 52 51 50
+        punpckldq   xmm0,       xmm1                    ; (high 64 bits unused) 43 42 41 40 03 02 01 00
+        punpckldq   xmm2,       xmm3                    ; 53 52 51 50 13 12 11 10
+
+        movd        xmm4,       [rsi + rax*2]           ; 23 22 21 20
+        movd        xmm5,       [rdx + rax*2]           ; 63 62 61 60
+        movd        xmm6,       [rdi + rax*2]           ; 33 32 31 30
+        movd        xmm7,       [rcx + rax*2]           ; 73 72 71 70
+        punpckldq   xmm4,       xmm5                    ; 63 62 61 60 23 22 21 20
+        punpckldq   xmm6,       xmm7                    ; 73 72 71 70 33 32 31 30
+
+        punpcklbw   xmm0,       xmm2                    ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
+        punpcklbw   xmm4,       xmm6                    ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
+
+        movdqa      xmm1,       xmm0
+        punpcklwd   xmm0,       xmm4                    ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+        punpckhwd   xmm1,       xmm4                    ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
+
+        movdqa      xmm2,       xmm0
+        punpckldq   xmm0,       xmm1                    ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
+        punpckhdq   xmm2,       xmm1                    ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+
+        movdqa      t0,         xmm0                    ; save to t0
+        movdqa      t1,         xmm2                    ; save to t1
+
+        lea         rsi,        [rsi + rax*8]
+        lea         rdi,        [rsi + rax]
+        lea         rdx,        [rsi + rax*4]
+        lea         rcx,        [rdx + rax]
+
+        movd        xmm4,       [rsi]                   ; 83 82 81 80
+        movd        xmm1,       [rdx]                   ; c3 c2 c1 c0
+        movd        xmm6,       [rdi]                   ; 93 92 91 90
+        movd        xmm3,       [rcx]                   ; d3 d2 d1 d0
+        punpckldq   xmm4,       xmm1                    ; c3 c2 c1 c0 83 82 81 80
+        punpckldq   xmm6,       xmm3                    ; d3 d2 d1 d0 93 92 91 90
+
+        movd        xmm0,       [rsi + rax*2]           ; a3 a2 a1 a0
+        movd        xmm5,       [rdx + rax*2]           ; e3 e2 e1 e0
+        movd        xmm2,       [rdi + rax*2]           ; b3 b2 b1 b0
+        movd        xmm7,       [rcx + rax*2]           ; f3 f2 f1 f0
+        punpckldq   xmm0,       xmm5                    ; e3 e2 e1 e0 a3 a2 a1 a0
+        punpckldq   xmm2,       xmm7                    ; f3 f2 f1 f0 b3 b2 b1 b0
+
+        punpcklbw   xmm4,       xmm6                    ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
+        punpcklbw   xmm0,       xmm2                    ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0
+
+        movdqa      xmm1,       xmm4
+        punpcklwd   xmm4,       xmm0                    ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
+        punpckhwd   xmm1,       xmm0                    ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
+
+        movdqa      xmm6,       xmm4
+        punpckldq   xmm4,       xmm1                    ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
+        punpckhdq   xmm6,       xmm1                    ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
+
+        movdqa      xmm0,       t0                      ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
+        movdqa      xmm2,       t1                      ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+        movdqa      xmm1,       xmm0
+        movdqa      xmm3,       xmm2
+
+        punpcklqdq  xmm0,       xmm4                    ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+        punpckhqdq  xmm1,       xmm4                    ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+        punpcklqdq  xmm2,       xmm6                    ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+        punpckhqdq  xmm3,       xmm6                    ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+
+        ; calculate mask
+        movdqa      xmm6,       xmm0                            ; p1
+        movdqa      xmm7,       xmm3                            ; q1
+        psubusb     xmm7,       xmm0                            ; q1-=p1
+        psubusb     xmm6,       xmm3                            ; p1-=q1
+        por         xmm6,       xmm7                            ; abs(p1-q1)
+        pand        xmm6,       [GLOBAL(tfe)]                   ; set lsb of each byte to zero
+        psrlw       xmm6,       1                               ; abs(p1-q1)/2
+
+        movdqa      xmm5,       xmm1                            ; p0
+        movdqa      xmm4,       xmm2                            ; q0
+        psubusb     xmm5,       xmm2                            ; p0-=q0
+        psubusb     xmm4,       xmm1                            ; q0-=p0
+        por         xmm5,       xmm4                            ; abs(p0 - q0)
+        paddusb     xmm5,       xmm5                            ; abs(p0-q0)*2
+        paddusb     xmm5,       xmm6                            ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+        mov         rdx,        arg(2)                          ;blimit
+        movdqa      xmm7, XMMWORD PTR [rdx]
+
+        psubusb     xmm5,        xmm7                           ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
+        pxor        xmm7,        xmm7
+        pcmpeqb     xmm5,        xmm7                           ; xmm5 = mask
+
+        ; start work on filters
+        movdqa        t0,        xmm0
+        movdqa        t1,        xmm3
+
+        pxor        xmm0,        [GLOBAL(t80)]                  ; p1 offset to convert to signed values
+        pxor        xmm3,        [GLOBAL(t80)]                  ; q1 offset to convert to signed values
+
+        psubsb      xmm0,        xmm3                           ; p1 - q1
+        movdqa      xmm6,        xmm1                           ; p0
+
+        movdqa      xmm7,        xmm2                           ; q0
+        pxor        xmm6,        [GLOBAL(t80)]                  ; offset to convert to signed values
+
+        pxor        xmm7,        [GLOBAL(t80)]                  ; offset to convert to signed values
+        movdqa      xmm3,        xmm7                           ; copy of offset q0
+
+        psubsb      xmm7,        xmm6                           ; q0 - p0
+        paddsb      xmm0,        xmm7                           ; p1 - q1 + 1 * (q0 - p0)
+
+        paddsb      xmm0,        xmm7                           ; p1 - q1 + 2 * (q0 - p0)
+        paddsb      xmm0,        xmm7                           ; p1 - q1 + 3 * (q0 - p0)
+
+        pand        xmm5,        xmm0                           ; mask filter values we don't care about
+
+
+        paddsb      xmm5,        [GLOBAL(t4)]                   ;  3* (q0 - p0) + (p1 - q1) + 4
+
+        movdqa      xmm0,        xmm5                           ; get a copy of filters
+        psllw       xmm0,        8                              ; shift left 8
+
+        psraw       xmm0,        3                              ; arithmetic shift right 3
+        psrlw       xmm0,        8
+
+        movdqa      xmm7,        xmm5                           ; get a copy of filters
+        psraw       xmm7,        11                             ; arithmetic shift right 11
+
+        psllw       xmm7,        8                              ; shift left 8 to put it back
+        por         xmm0,        xmm7                           ; put the two together to get result
+
+        psubsb      xmm3,        xmm0                           ; q0-= q0sz add
+        pxor        xmm3,        [GLOBAL(t80)]                  ; unoffset   q0
+
+        ; now do +3 side
+        psubsb      xmm5,        [GLOBAL(t1s)]                  ; +3 instead of +4
+        movdqa      xmm0,        xmm5                           ; get a copy of filters
+
+        psllw       xmm0,        8                              ; shift left 8
+        psraw       xmm0,        3                              ; arithmetic shift right 3
+
+        psrlw       xmm0,        8
+        psraw       xmm5,        11                             ; arithmetic shift right 11
+
+        psllw       xmm5,        8                              ; shift left 8 to put it back
+        por         xmm0,        xmm5                           ; put the two together to get result
+
+        paddsb      xmm6,        xmm0                           ; p0+= p0 add
+        pxor        xmm6,        [GLOBAL(t80)]                  ; unoffset   p0
+
+        movdqa      xmm0,        t0                             ; p1
+        movdqa      xmm4,        t1                             ; q1
+
+        ; transpose back to write out
+        ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+        ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+        ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+        ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+        movdqa      xmm1,       xmm0
+        punpcklbw   xmm0,       xmm6                               ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
+        punpckhbw   xmm1,       xmm6                               ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+
+        movdqa      xmm5,       xmm3
+        punpcklbw   xmm3,       xmm4                               ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+        punpckhbw   xmm5,       xmm4                               ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+        movdqa      xmm2,       xmm0
+        punpcklwd   xmm0,       xmm3                               ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
+        punpckhwd   xmm2,       xmm3                               ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
+
+        movdqa      xmm3,       xmm1
+        punpcklwd   xmm1,       xmm5                               ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
+        punpckhwd   xmm3,       xmm5                               ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
+
+        ; write out order: xmm0 xmm2 xmm1 xmm3
+        lea         rdx,        [rsi + rax*4]
+
+        movd        [rsi],      xmm1                               ; write the second 8-line result
+        psrldq      xmm1,       4
+        movd        [rdi],      xmm1
+        psrldq      xmm1,       4
+        movd        [rsi + rax*2], xmm1
+        psrldq      xmm1,       4
+        movd        [rdi + rax*2], xmm1
+
+        movd        [rdx],      xmm3
+        psrldq      xmm3,       4
+        movd        [rcx],      xmm3
+        psrldq      xmm3,       4
+        movd        [rdx + rax*2], xmm3
+        psrldq      xmm3,       4
+        movd        [rcx + rax*2], xmm3
+
+        neg         rax
+        lea         rsi,        [rsi + rax*8]
+        neg         rax
+        lea         rdi,        [rsi + rax]
+        lea         rdx,        [rsi + rax*4]
+        lea         rcx,        [rdx + rax]
+
+        movd        [rsi],      xmm0                                ; write the first 8-line result
+        psrldq      xmm0,       4
+        movd        [rdi],      xmm0
+        psrldq      xmm0,       4
+        movd        [rsi + rax*2], xmm0
+        psrldq      xmm0,       4
+        movd        [rdi + rax*2], xmm0
+
+        movd        [rdx],      xmm2
+        psrldq      xmm2,       4
+        movd        [rcx],      xmm2
+        psrldq      xmm2,       4
+        movd        [rdx + rax*2], xmm2
+        psrldq      xmm2,       4
+        movd        [rcx + rax*2], xmm2
+
+    add rsp, 32
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+tfe:
+    times 16 db 0xfe
+align 16
+t80:
+    times 16 db 0x80
+align 16
+t1s:
+    times 16 db 0x01
+align 16
+t3:
+    times 16 db 0x03
+align 16
+t4:
+    times 16 db 0x04
+align 16
+ones:
+    times 8 dw 0x0001
+align 16
+s9:
+    times 8 dw 0x0900
+align 16
+s63:
+    times 8 dw 0x003f
--- /dev/null
+++ b/vp9/common/x86/loopfilter_x86.c
@@ -1,0 +1,543 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>  // SSE2
+#include "vpx_config.h"
+#include "vp9/common/loopfilter.h"
+
+prototype_loopfilter(vp9_loop_filter_vertical_edge_mmx);
+prototype_loopfilter(vp9_loop_filter_horizontal_edge_mmx);
+
+prototype_loopfilter(vp9_loop_filter_vertical_edge_sse2);
+prototype_loopfilter(vp9_loop_filter_horizontal_edge_sse2);
+
+extern loop_filter_uvfunction vp9_loop_filter_horizontal_edge_uv_sse2;
+extern loop_filter_uvfunction vp9_loop_filter_vertical_edge_uv_sse2;
+
+#if HAVE_MMX
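+/* Editorial note: the mbh/mbv/bh wrappers below are empty in this revision;
+ * the macroblock-edge MMX paths appear to have been retired in favor of the
+ * SSE2/C implementations, so these remain as no-op stubs. */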
+/* Horizontal MB filtering */
+void vp9_loop_filter_mbh_mmx(unsigned char *y_ptr,
+                             unsigned char *u_ptr, unsigned char *v_ptr,
+                             int y_stride, int uv_stride,
+                             struct loop_filter_info *lfi) {
+}
+
+/* Vertical MB Filtering */
+void vp9_loop_filter_mbv_mmx(unsigned char *y_ptr,
+                             unsigned char *u_ptr, unsigned char *v_ptr,
+                             int y_stride, int uv_stride,
+                             struct loop_filter_info *lfi) {
+}
+
+/* Horizontal B Filtering */
+void vp9_loop_filter_bh_mmx(unsigned char *y_ptr,
+                            unsigned char *u_ptr, unsigned char *v_ptr,
+                            int y_stride, int uv_stride,
+                            struct loop_filter_info *lfi) {
+
+}
+
+void vp9_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride,
+                             const unsigned char *blimit) {
+  vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride,
+                                             y_stride, blimit);
+  vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride,
+                                             y_stride, blimit);
+  vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride,
+                                             y_stride, blimit);
+}
+
+/* Vertical B Filtering */
+void vp9_loop_filter_bv_mmx(unsigned char *y_ptr,
+                            unsigned char *u_ptr, unsigned char *v_ptr,
+                            int y_stride, int uv_stride,
+                            struct loop_filter_info *lfi) {
+  vp9_loop_filter_vertical_edge_mmx(y_ptr + 4, y_stride,
+                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);
+  vp9_loop_filter_vertical_edge_mmx(y_ptr + 8, y_stride,
+                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);
+  vp9_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride,
+                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+  if (u_ptr)
+    vp9_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride,
+                                      lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+  if (v_ptr)
+    vp9_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride,
+                                      lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp9_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride,
+                             const unsigned char *blimit) {
+  vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, blimit);
+  vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, blimit);
+  vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, blimit);
+}
+#endif
+
+#if HAVE_SSE2
+void vp9_mbloop_filter_horizontal_edge_c_sse2(unsigned char *s,
+                                              int p,
+                                              const unsigned char *_blimit,
+                                              const unsigned char *_limit,
+                                              const unsigned char *_thresh,
+                                              int count) {
+  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
+  __m128i mask, hev, flat;
+  __m128i thresh, limit, blimit;
+  const __m128i zero = _mm_set1_epi16(0);
+  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
+
+  thresh = _mm_shuffle_epi32(_mm_cvtsi32_si128(_thresh[0] * 0x01010101), 0);
+  limit = _mm_shuffle_epi32(_mm_cvtsi32_si128(_limit[0] * 0x01010101), 0);
+  blimit = _mm_shuffle_epi32(_mm_cvtsi32_si128(_blimit[0] * 0x01010101), 0);
+
+  p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
+  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
+  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+  q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
+  {
+    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
+                                          _mm_subs_epu8(p0, p1));
+    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
+                                          _mm_subs_epu8(q0, q1));
+    const __m128i one = _mm_set1_epi8(1);
+    const __m128i fe = _mm_set1_epi8(0xfe);
+    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+    __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
+                                    _mm_subs_epu8(q0, p0));
+    __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
+                                    _mm_subs_epu8(q1, p1));
+    __m128i work;
+    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+    hev = _mm_subs_epu8(flat, thresh);
+    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+    mask = _mm_max_epu8(flat, mask);
+    // mask |= (abs(p1 - p0) > limit) * -1;
+    // mask |= (abs(q1 - q0) > limit) * -1;
+    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
+                                     _mm_subs_epu8(p1, p2)),
+                         _mm_or_si128(_mm_subs_epu8(p3, p2),
+                                      _mm_subs_epu8(p2, p3)));
+    mask = _mm_max_epu8(work, mask);
+    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
+                                     _mm_subs_epu8(q1, q2)),
+                         _mm_or_si128(_mm_subs_epu8(q3, q2),
+                                      _mm_subs_epu8(q2, q3)));
+    mask = _mm_max_epu8(work, mask);
+    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_cmpeq_epi8(mask, zero);
+
+    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
+                                     _mm_subs_epu8(p0, p2)),
+                         _mm_or_si128(_mm_subs_epu8(q2, q0),
+                                      _mm_subs_epu8(q0, q2)));
+    flat = _mm_max_epu8(work, flat);
+    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
+                                     _mm_subs_epu8(p0, p3)),
+                         _mm_or_si128(_mm_subs_epu8(q3, q0),
+                                      _mm_subs_epu8(q0, q3)));
+    flat = _mm_max_epu8(work, flat);
+    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
+                                     _mm_subs_epu8(p0, p4)),
+                         _mm_or_si128(_mm_subs_epu8(q4, q0),
+                                      _mm_subs_epu8(q0, q4)));
+    flat = _mm_max_epu8(work, flat);
+    flat = _mm_subs_epu8(flat, one);
+    flat = _mm_cmpeq_epi8(flat, zero);
+    flat = _mm_and_si128(flat, mask);
+  }
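+  // Editorial note: mask is now all-ones where the edge should be filtered
+  // (every neighbouring-tap difference <= limit and
+  // abs(p0-q0)*2 + abs(p1-q1)/2 <= blimit); flat additionally requires all
+  // of p2..p4 and q2..q4 to lie within 1 of p0/q0, selecting the strong
+  // 7-tap smoothing below instead of the 4-tap filter.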
+  {
+    const __m128i four = _mm_set1_epi16(4);
+    unsigned char *src = s;
+    int i = 0;
+    do {
+      __m128i workp_a, workp_b, workp_shft;
+      p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 5 * p)), zero);
+      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
+      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
+      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
+      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
+      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
+      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
+      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
+      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
+      q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 4 * p)), zero);
+
+      workp_a = _mm_add_epi16(_mm_add_epi16(p4, p3), _mm_add_epi16(p2, p1));
+      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
+      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p4);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_op2[i*8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_op1[i*8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p4), q2);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_op0[i*8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_oq0[i*8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q4);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_oq1[i*8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q4);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_oq2[i*8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      src += 8;
+    } while (++i < count);
+  }
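+  // Editorial note: each flat_* row above is a rounded 8-tap average over a
+  // sliding 7-pixel window, e.g.
+  //   flat_op2 = (2*p4 + p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3;
+  // workp_a/workp_b hold running partial sums, so each later output just
+  // subtracts the tap leaving the window and adds the one entering it.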
+  // lp filter
+  {
+    const __m128i t4 = _mm_set1_epi8(4);
+    const __m128i t3 = _mm_set1_epi8(3);
+    const __m128i t80 = _mm_set1_epi8(0x80);
+    const __m128i te0 = _mm_set1_epi8(0xe0);
+    const __m128i t1f = _mm_set1_epi8(0x1f);
+    const __m128i t1 = _mm_set1_epi8(0x1);
+    const __m128i t7f = _mm_set1_epi8(0x7f);
+
+    const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
+                                      t80);
+    const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
+                                      t80);
+    const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
+                                      t80);
+    const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
+                                      t80);
+    __m128i filt;
+    __m128i work_a;
+    __m128i filter1, filter2;
+
+    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+    work_a = _mm_subs_epi8(qs0, ps0);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+    filt = _mm_and_si128(filt, mask);
+
+    filter1 = _mm_adds_epi8(filt, t4);
+    filter2 = _mm_adds_epi8(filt, t3);
+
+    /* Filter1 >> 3 */
+    work_a = _mm_cmpgt_epi8(zero, filter1);
+    filter1 = _mm_srli_epi16(filter1, 3);
+    work_a = _mm_and_si128(work_a, te0);
+    filter1 = _mm_and_si128(filter1, t1f);
+    filter1 = _mm_or_si128(filter1, work_a);
+
+    /* Filter2 >> 3 */
+    work_a = _mm_cmpgt_epi8(zero, filter2);
+    filter2 = _mm_srli_epi16(filter2, 3);
+    work_a = _mm_and_si128(work_a, te0);
+    filter2 = _mm_and_si128(filter2, t1f);
+    filter2 = _mm_or_si128(filter2, work_a);
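+    /* Editorial note: as in the assembly version, SSE2 lacks an 8-bit
+     * arithmetic shift, so Filter1/Filter2 are shifted as 16-bit lanes and
+     * the per-byte sign bits are patched back: t1f clears bits leaked in
+     * from the neighbouring byte and te0 re-inserts the sign extension. */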
+
+    /* filt >> 1 */
+    filt = _mm_adds_epi8(filter1, t1);
+    work_a = _mm_cmpgt_epi8(zero, filt);
+    filt = _mm_srli_epi16(filt, 1);
+    work_a = _mm_and_si128(work_a, t80);
+    filt = _mm_and_si128(filt, t7f);
+    filt = _mm_or_si128(filt, work_a);
+
+    filt = _mm_andnot_si128(hev, filt);
+
+    work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+    q0 = _mm_load_si128((__m128i *)flat_oq0);
+    work_a = _mm_andnot_si128(flat, work_a);
+    q0 = _mm_and_si128(flat, q0);
+    q0 = _mm_or_si128(work_a, q0);
+
+    work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+    q1 = _mm_load_si128((__m128i *)flat_oq1);
+    work_a = _mm_andnot_si128(flat, work_a);
+    q1 = _mm_and_si128(flat, q1);
+    q1 = _mm_or_si128(work_a, q1);
+
+    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
+    q2 = _mm_load_si128((__m128i *)flat_oq2);
+    work_a = _mm_andnot_si128(flat, work_a);
+    q2 = _mm_and_si128(flat, q2);
+    q2 = _mm_or_si128(work_a, q2);
+
+    work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+    p0 = _mm_load_si128((__m128i *)flat_op0);
+    work_a = _mm_andnot_si128(flat, work_a);
+    p0 = _mm_and_si128(flat, p0);
+    p0 = _mm_or_si128(work_a, p0);
+
+    work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+    p1 = _mm_load_si128((__m128i *)flat_op1);
+    work_a = _mm_andnot_si128(flat, work_a);
+    p1 = _mm_and_si128(flat, p1);
+    p1 = _mm_or_si128(work_a, p1);
+
+    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
+    p2 = _mm_load_si128((__m128i *)flat_op2);
+    work_a = _mm_andnot_si128(flat, work_a);
+    p2 = _mm_and_si128(flat, p2);
+    p2 = _mm_or_si128(work_a, p2);
+
+    if (count == 1) {
+      _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
+      _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
+      _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
+      _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
+      _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
+      _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
+    } else {
+      _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
+      _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+      _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+      _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
+      _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+      _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+    }
+  }
+}
+
+static __inline void transpose(unsigned char *src[], int in_p,
+                               unsigned char *dst[], int out_p,
+                               int num_8x8_to_transpose) {
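+  // each 8x8 block is read as eight 8-byte rows and written back with rows
+  // and columns exchanged, via an unpack ladder that widens from bytes to
+  // words to dwords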
+  int idx8x8 = 0;
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+  do {
+    unsigned char *in = src[idx8x8];
+    unsigned char *out = dst[idx8x8];
+
+    x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p));  // 00 01 02 03 04 05 06 07
+    x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p));  // 10 11 12 13 14 15 16 17
+    x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p));  // 20 21 22 23 24 25 26 27
+    x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p));  // 30 31 32 33 34 35 36 37
+    x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p));  // 40 41 42 43 44 45 46 47
+    x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p));  // 50 51 52 53 54 55 56 57
+    x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p));  // 60 61 62 63 64 65 66 67
+    x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p));  // 70 71 72 73 74 75 76 77
+    // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+    x0 = _mm_unpacklo_epi8(x0, x1);
+    // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+    x1 = _mm_unpacklo_epi8(x2, x3);
+    // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+    x2 = _mm_unpacklo_epi8(x4, x5);
+    // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+    x3 = _mm_unpacklo_epi8(x6, x7);
+    // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+    x4 = _mm_unpacklo_epi16(x0, x1);
+    // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+    x5 = _mm_unpacklo_epi16(x2, x3);
+    // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+    x6 = _mm_unpacklo_epi32(x4, x5);
+    // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+    x7 = _mm_unpackhi_epi32(x4, x5);
+
+    _mm_storel_pd((double *)(out + 0*out_p),
+                  _mm_castsi128_pd(x6));  // 00 10 20 30 40 50 60 70
+    _mm_storeh_pd((double *)(out + 1*out_p),
+                  _mm_castsi128_pd(x6));  // 01 11 21 31 41 51 61 71
+    _mm_storel_pd((double *)(out + 2*out_p),
+                  _mm_castsi128_pd(x7));  // 02 12 22 32 42 52 62 72
+    _mm_storeh_pd((double *)(out + 3*out_p),
+                  _mm_castsi128_pd(x7));  // 03 13 23 33 43 53 63 73
+
+    // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+    x4 = _mm_unpackhi_epi16(x0, x1);
+    // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+    x5 = _mm_unpackhi_epi16(x2, x3);
+    // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+    x6 = _mm_unpacklo_epi32(x4, x5);
+    // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+    x7 = _mm_unpackhi_epi32(x4, x5);
+
+    _mm_storel_pd((double *)(out + 4*out_p),
+                  _mm_castsi128_pd(x6));  // 04 14 24 34 44 54 64 74
+    _mm_storeh_pd((double *)(out + 5*out_p),
+                  _mm_castsi128_pd(x6));  // 05 15 25 35 45 55 65 75
+    _mm_storel_pd((double *)(out + 6*out_p),
+                  _mm_castsi128_pd(x7));  // 06 16 26 36 46 56 66 76
+    _mm_storeh_pd((double *)(out + 7*out_p),
+                  _mm_castsi128_pd(x7));  // 07 17 27 37 47 57 67 77
+  } while (++idx8x8 < num_8x8_to_transpose);
+}
+
+void vp9_mbloop_filter_vertical_edge_c_sse2(unsigned char *s,
+                                            int p,
+                                            const unsigned char *blimit,
+                                            const unsigned char *limit,
+                                            const unsigned char *thresh,
+                                            int count) {
+  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 16]);
+  unsigned char *src[4];
+  unsigned char *dst[4];
+
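+  // the vertical edge is filtered by transposing the pixels around it into
+  // t_dst, reusing the horizontal filter there, and transposing back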
+  src[0] = s - 5;
+  src[1] = s - 5 + 8;
+  src[2] = s - 5 + p*8;
+  src[3] = s - 5 + p*8 + 8;
+
+  dst[0] = t_dst;
+  dst[1] = t_dst + 16*8;
+  dst[2] = t_dst + 8;
+  dst[3] = t_dst + 16*8 + 8;
+
+  // 16x16->16x16 or 16x8->8x16
+  transpose(src, p, dst, 16, (1 << count));
+
+  vp9_mbloop_filter_horizontal_edge_c_sse2(t_dst + 5*16, 16, blimit, limit,
+                                           thresh, count);
+
+  dst[0] = s - 5;
+  dst[1] = s - 5 + p*8;
+
+  src[0] = t_dst;
+  src[1] = t_dst + 8;
+
+  // 16x8->8x16 or 8x8->8x8
+  transpose(src, 16, dst, p, (1 << (count - 1)));
+}
+
+/* Horizontal MB filtering */
+void vp9_loop_filter_mbh_sse2(unsigned char *y_ptr,
+                              unsigned char *u_ptr, unsigned char *v_ptr,
+                              int y_stride, int uv_stride,
+                              struct loop_filter_info *lfi) {
+  vp9_mbloop_filter_horizontal_edge_c_sse2(y_ptr, y_stride, lfi->mblim,
+                                           lfi->lim, lfi->hev_thr, 2);
+
+  /* TODO: write sse2 version with u,v interleaved */
+  if (u_ptr)
+    vp9_mbloop_filter_horizontal_edge_c_sse2(u_ptr, uv_stride, lfi->mblim,
+                                             lfi->lim, lfi->hev_thr, 1);
+
+  if (v_ptr)
+    vp9_mbloop_filter_horizontal_edge_c_sse2(v_ptr, uv_stride, lfi->mblim,
+                                             lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp9_loop_filter_bh8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
+                             unsigned char *v_ptr, int y_stride, int uv_stride,
+                             struct loop_filter_info *lfi) {
+  vp9_mbloop_filter_horizontal_edge_c_sse2(
+    y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+}
+
+/* Vertical MB Filtering */
+void vp9_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
+                              unsigned char *v_ptr, int y_stride, int uv_stride,
+                              struct loop_filter_info *lfi) {
+  vp9_mbloop_filter_vertical_edge_c_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim,
+                                         lfi->hev_thr, 2);
+
+  /* TODO: write sse2 version with u,v interleaved */
+  if (u_ptr)
+    vp9_mbloop_filter_vertical_edge_c_sse2(u_ptr, uv_stride, lfi->mblim,
+                                           lfi->lim, lfi->hev_thr, 1);
+
+  if (v_ptr)
+    vp9_mbloop_filter_vertical_edge_c_sse2(v_ptr, uv_stride, lfi->mblim,
+                                           lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp9_loop_filter_bv8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
+                             unsigned char *v_ptr, int y_stride, int uv_stride,
+                             struct loop_filter_info *lfi) {
+  vp9_mbloop_filter_vertical_edge_c_sse2(
+    y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+}
+
+/* Horizontal B Filtering */
+void vp9_loop_filter_bh_sse2(unsigned char *y_ptr,
+                             unsigned char *u_ptr, unsigned char *v_ptr,
+                             int y_stride, int uv_stride,
+                             struct loop_filter_info *lfi) {
+  vp9_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride,
+                                       lfi->blim, lfi->lim, lfi->hev_thr, 2);
+  vp9_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride,
+                                       lfi->blim, lfi->lim, lfi->hev_thr, 2);
+  vp9_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride,
+                                       lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+  if (u_ptr)
+    vp9_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride,
+                                            lfi->blim, lfi->lim, lfi->hev_thr,
+                                            v_ptr + 4 * uv_stride);
+}
+
+void vp9_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride,
+                              const unsigned char *blimit) {
+  vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride,
+                                              y_stride, blimit);
+  vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride,
+                                              y_stride, blimit);
+  vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride,
+                                              y_stride, blimit);
+}
+
+/* Vertical B Filtering */
+void vp9_loop_filter_bv_sse2(unsigned char *y_ptr,
+                             unsigned char *u_ptr, unsigned char *v_ptr,
+                             int y_stride, int uv_stride,
+                             struct loop_filter_info *lfi) {
+  vp9_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride,
+                                     lfi->blim, lfi->lim, lfi->hev_thr, 2);
+  vp9_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride,
+                                     lfi->blim, lfi->lim, lfi->hev_thr, 2);
+  vp9_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride,
+                                     lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+  if (u_ptr)
+    vp9_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride,
+                                          lfi->blim, lfi->lim, lfi->hev_thr,
+                                          v_ptr + 4);
+}
+
+void vp9_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride,
+                              const unsigned char *blimit) {
+  vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, blimit);
+  vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, blimit);
+  vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, blimit);
+}
+
+#endif
--- /dev/null
+++ b/vp9/common/x86/loopfilter_x86.h
@@ -1,0 +1,43 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef LOOPFILTER_X86_H
+#define LOOPFILTER_X86_H
+
+/* Note:
+ *
+ * This platform is commonly built for runtime CPU detection. If you modify
+ * any of the function mappings present in this file, be sure to also update
+ * them in the function pointer initialization code.
+ */
+
+#if HAVE_MMX
+extern prototype_loopfilter_block(vp9_loop_filter_mbv_mmx);
+extern prototype_loopfilter_block(vp9_loop_filter_bv_mmx);
+extern prototype_loopfilter_block(vp9_loop_filter_mbh_mmx);
+extern prototype_loopfilter_block(vp9_loop_filter_bh_mmx);
+extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_mmx);
+extern prototype_simple_loopfilter(vp9_loop_filter_bvs_mmx);
+extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_mmx);
+extern prototype_simple_loopfilter(vp9_loop_filter_bhs_mmx);
+#endif
+
+#if HAVE_SSE2
+extern prototype_loopfilter_block(vp9_loop_filter_mbv_sse2);
+extern prototype_loopfilter_block(vp9_loop_filter_bv_sse2);
+extern prototype_loopfilter_block(vp9_loop_filter_mbh_sse2);
+extern prototype_loopfilter_block(vp9_loop_filter_bh_sse2);
+extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_sse2);
+extern prototype_simple_loopfilter(vp9_loop_filter_bvs_sse2);
+extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_sse2);
+extern prototype_simple_loopfilter(vp9_loop_filter_bhs_sse2);
+#endif
+
+#endif  // LOOPFILTER_X86_H
--- /dev/null
+++ b/vp9/common/x86/mask_sse3.asm
@@ -1,0 +1,484 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_makemask_sse3(
+;    unsigned char *y,
+;    unsigned char *u,
+;    unsigned char *v,
+;    unsigned char *ym,
+;    unsigned char *uvm,
+;    int yp,
+;    int uvp,
+;    int ys,
+;    int us,
+;    int vs,
+;    int yt,
+;    int ut,
+;    int vt)
+global sym(vp8_makemask_sse3)
+sym(vp8_makemask_sse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 14
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;y
+        mov             rdi,        arg(1) ;u
+        mov             rcx,        arg(2) ;v
+        mov             rax,        arg(3) ;ym
+        movsxd          rbx,        dword arg(4) ;yp
+        movsxd          rdx,        dword arg(5) ;uvp
+
+        pxor            xmm0,xmm0
+
+        ;make 16 copies of the center y value
+        movd            xmm1, arg(6)
+        pshufb          xmm1, xmm0
+
+        ; make 16 copies of the center u value
+        movd            xmm2, arg(7)
+        pshufb          xmm2, xmm0
+
+        ; make 16 copies of the center v value
+        movd            xmm3, arg(8)
+        pshufb          xmm3, xmm0
+        unpcklpd        xmm2, xmm3
+
+        ;make 16 copies of the y tolerance
+        movd            xmm3, arg(9)
+        pshufb          xmm3, xmm0
+
+        ;make 16 copies of the u tolerance
+        movd            xmm4, arg(10)
+        pshufb          xmm4, xmm0
+
+        ;make 16 copies of the v tolerance
+        movd            xmm5, arg(11)
+        pshufb          xmm5, xmm0
+        unpckhpd        xmm4, xmm5
+
+        mov             r8,8
+
+NextPairOfRows:
+
+        ;grab the y source values
+        movdqu          xmm0, [rsi]
+
+        ;compute abs difference between source and y target
+        movdqa          xmm6, xmm1
+        movdqa          xmm7, xmm0
+        psubusb         xmm0, xmm1
+        psubusb         xmm6, xmm7
+        por             xmm0, xmm6
+
+        ;check whether the difference is < the y tolerance
+        movdqa          xmm6, xmm3
+        pcmpgtb         xmm6, xmm0
+
+        ;grab the y source values
+        add             rsi, rbx
+        movdqu          xmm0, [rsi]
+
+        ;compute abs difference between source and y target
+        movdqa          xmm11, xmm1
+        movdqa          xmm7, xmm0
+        psubusb         xmm0, xmm1
+        psubusb         xmm11, xmm7
+        por             xmm0, xmm11
+
+        ;check whether the difference is < the y tolerance
+        movdqa          xmm11, xmm3
+        pcmpgtb         xmm11, xmm0
+
+
+        ;grab the u and v source values
+        movdqu          xmm7, [rdi]
+        movdqu          xmm8, [rcx]
+        unpcklpd        xmm7, xmm8
+
+        ;compute abs difference between source and uv targets
+        movdqa          xmm9, xmm2
+        movdqa          xmm10, xmm7
+        psubusb         xmm7, xmm2
+        psubusb         xmm9, xmm10
+        por             xmm7, xmm9
+
+        ;check whether the number is < tolerance
+        movdqa          xmm0, xmm4
+        pcmpgtb         xmm0, xmm7
+
+        ;double  u and v masks
+        movdqa          xmm8, xmm0
+        punpckhbw       xmm0, xmm0
+        punpcklbw       xmm8, xmm8
+
+        ;mask row 0 and output
+        pand            xmm6, xmm8
+        pand            xmm6, xmm0
+        movdqa          [rax],xmm6
+
+        ;mask row 1 and output
+        pand            xmm11, xmm8
+        pand            xmm11, xmm0
+        movdqa          [rax+16],xmm11
+
+
+        ; to the next row or set of rows
+        add             rsi, rbx
+        add             rdi, rdx
+        add             rcx, rdx
+        add             rax,32
+        dec r8
+        jnz NextPairOfRows
+
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;GROW_HORIZ (register for result, source register or mem local)
+; takes source and shifts left and ors with source
+; then shifts right and ors with source
+%macro GROW_HORIZ 2
+    movdqa          %1, %2
+    movdqa          xmm14, %1
+    movdqa          xmm15, %1
+    pslldq          xmm14, 1
+    psrldq          xmm15, 1
+    por             %1,xmm14
+    por             %1,xmm15
+%endmacro
+;GROW_VERT (result, center row, above row, below row)
+%macro GROW_VERT 4
+    movdqa          %1,%2
+    por             %1,%3
+    por             %1,%4
+%endmacro
+
+;GROW_NEXTLINE (new line to grow, new source, line to write)
+%macro GROW_NEXTLINE 3
+    GROW_HORIZ %1, %2
+    GROW_VERT xmm3, xmm0, xmm1, xmm2
+    movdqa %3,xmm3
+%endmacro
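+; together these macros dilate the mask by one pixel in every direction
+; (a 3x3 grow), one 16-byte row at a time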
+
+
+;void vp8_growmaskmb_sse3(
+;    unsigned char *om,
+;    unsigned char *nm)
+global sym(vp8_growmaskmb_sse3)
+sym(vp8_growmaskmb_sse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 2
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov             rsi,        arg(0) ;src
+    mov             rdi,        arg(1) ;dst
+
+    GROW_HORIZ xmm0, [rsi]
+    GROW_HORIZ xmm1, [rsi+16]
+    GROW_HORIZ xmm2, [rsi+32]
+
+    GROW_VERT xmm3, xmm0, xmm1, xmm2
+    por xmm0,xmm1
+    movdqa [rdi], xmm0
+    movdqa [rdi+16],xmm3
+
+    GROW_NEXTLINE xmm0,[rsi+48],[rdi+32]
+    GROW_NEXTLINE xmm1,[rsi+64],[rdi+48]
+    GROW_NEXTLINE xmm2,[rsi+80],[rdi+64]
+    GROW_NEXTLINE xmm0,[rsi+96],[rdi+80]
+    GROW_NEXTLINE xmm1,[rsi+112],[rdi+96]
+    GROW_NEXTLINE xmm2,[rsi+128],[rdi+112]
+    GROW_NEXTLINE xmm0,[rsi+144],[rdi+128]
+    GROW_NEXTLINE xmm1,[rsi+160],[rdi+144]
+    GROW_NEXTLINE xmm2,[rsi+176],[rdi+160]
+    GROW_NEXTLINE xmm0,[rsi+192],[rdi+176]
+    GROW_NEXTLINE xmm1,[rsi+208],[rdi+192]
+    GROW_NEXTLINE xmm2,[rsi+224],[rdi+208]
+    GROW_NEXTLINE xmm0,[rsi+240],[rdi+224]
+
+    por xmm0,xmm2
+    movdqa [rdi+240], xmm0
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+
+;unsigned int vp8_sad16x16_masked_wmt(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    unsigned char *mask)
+global sym(vp8_sad16x16_masked_wmt)
+sym(vp8_sad16x16_masked_wmt):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    ; end prolog
+    mov             rsi,        arg(0) ;src_ptr
+    mov             rdi,        arg(2) ;ref_ptr
+
+    mov             rbx,        arg(4) ;mask
+    movsxd          rax,        dword ptr arg(1) ;src_stride
+    movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+    mov             rcx,        16
+
+    pxor            xmm3,       xmm3
+
+NextSadRow:
+    movdqu          xmm0,       [rsi]
+    movdqu          xmm1,       [rdi]
+    movdqu          xmm2,       [rbx]
+    pand            xmm0,       xmm2
+    pand            xmm1,       xmm2
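+    ; assuming 0x00/0xff mask bytes, this zeroes pixels outside the mask in
+    ; both blocks so psadbw only accumulates differences inside the mask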
+
+    psadbw          xmm0,       xmm1
+    paddw           xmm3,       xmm0
+
+    add             rsi, rax
+    add             rdi, rdx
+    add             rbx,  16
+
+    dec rcx
+    jnz NextSadRow
+
+    movdqa          xmm4,       xmm3
+    psrldq          xmm4,       8
+    paddw           xmm3,       xmm4
+    movq            rax,        xmm3
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;unsigned int vp8_sad16x16_unmasked_wmt(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    unsigned char *mask)
+global sym(vp8_sad16x16_unmasked_wmt)
+sym(vp8_sad16x16_unmasked_wmt):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    ; end prolog
+    mov             rsi,        arg(0) ;src_ptr
+    mov             rdi,        arg(2) ;ref_ptr
+
+    mov             rbx,        arg(4) ;mask
+    movsxd          rax,        dword ptr arg(1) ;src_stride
+    movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+    mov             rcx,        16
+
+    pxor            xmm3,       xmm3
+
+next_vp8_sad16x16_unmasked_wmt:
+    movdqu          xmm0,       [rsi]
+    movdqu          xmm1,       [rdi]
+    movdqu          xmm2,       [rbx]
+    por             xmm0,       xmm2
+    por             xmm1,       xmm2
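+    ; assuming 0x00/0xff mask bytes, this forces masked pixels to 0xff in
+    ; both blocks so they cancel in psadbw and only the unmasked region
+    ; contributes to the SAD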
+
+    psadbw          xmm0,       xmm1
+    paddw           xmm3,       xmm0
+
+    add             rsi, rax
+    add             rdi, rdx
+    add             rbx,  16
+
+    dec rcx
+    jnz next_vp8_sad16x16_unmasked_wmt
+
+    movdqa          xmm4,       xmm3
+    psrldq          xmm4,       8
+    paddw           xmm3,       xmm4
+    movq            rax,        xmm3
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;unsigned int vp8_masked_predictor_wmt(
+;    unsigned char *masked,
+;    unsigned char *unmasked,
+;    int  src_stride,
+;    unsigned char *dst_ptr,
+;    int  dst_stride,
+;    unsigned char *mask)
+global sym(vp8_masked_predictor_wmt)
+sym(vp8_masked_predictor_wmt):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push        rsi
+    push        rdi
+    ; end prolog
+    mov             rsi,        arg(0) ;src_ptr
+    mov             rdi,        arg(1) ;ref_ptr
+
+    mov             rbx,        arg(5) ;mask
+    movsxd          rax,        dword ptr arg(2) ;src_stride
+    mov             r11,        arg(3) ; destination
+    movsxd          rdx,        dword ptr arg(4) ;dst_stride
+
+    mov             rcx,        16
+
+    pxor            xmm3,       xmm3
+
+next_vp8_masked_predictor_wmt:
+    movdqu          xmm0,       [rsi]
+    movdqu          xmm1,       [rdi]
+    movdqu          xmm2,       [rbx]
+
+    pand            xmm0,       xmm2
+    pandn           xmm2,       xmm1
+    por             xmm0,       xmm2
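+    ; pand/pandn/por form a bitwise select: the masked predictor where the
+    ; mask is set, the unmasked predictor elsewhere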
+    movdqu          [r11],      xmm0
+
+    add             r11, rdx
+    add             rsi, rax
+    add             rdi, rdx
+    add             rbx,  16
+
+    dec rcx
+    jnz next_vp8_masked_predictor_wmt
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;unsigned int vp8_masked_predictor_uv_wmt(
+;    unsigned char *masked,
+;    unsigned char *unmasked,
+;    int  src_stride,
+;    unsigned char *dst_ptr,
+;    int  dst_stride,
+;    unsigned char *mask)
+global sym(vp8_masked_predictor_uv_wmt)
+sym(vp8_masked_predictor_uv_wmt):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push        rsi
+    push        rdi
+    ; end prolog
+    mov             rsi,        arg(0) ;src_ptr
+    mov             rdi,        arg(1) ;ref_ptr
+
+    mov             rbx,        arg(5) ;mask
+    movsxd          rax,        dword ptr arg(2) ;src_stride
+    mov             r11,        arg(3) ; destination
+    movsxd          rdx,        dword ptr arg(4) ;dst_stride
+
+    mov             rcx,        8
+
+    pxor            xmm3,       xmm3
+
+next_vp8_masked_predictor_uv_wmt:
+    movq            xmm0,       [rsi]
+    movq            xmm1,       [rdi]
+    movq            xmm2,       [rbx]
+
+    pand            xmm0,       xmm2
+    pandn           xmm2,       xmm1
+    por             xmm0,       xmm2
+    movq            [r11],      xmm0
+
+    add             r11, rdx
+    add             rsi, rax
+    add             rdi, rax
+    add             rbx,  8
+
+    dec rcx
+    jnz next_vp8_masked_predictor_uv_wmt
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;unsigned int vp8_uv_from_y_mask(
+;    unsigned char *ymask,
+;    unsigned char *uvmask)
+global sym(vp8_uv_from_y_mask)
+sym(vp8_uv_from_y_mask):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push        rsi
+    push        rdi
+    ; end prolog
+    mov             rsi,        arg(0) ;src_ptr
+    mov             rdi,        arg(1) ;dst_ptr
+
+
+    mov             rcx,        8
+
+    pxor            xmm3,       xmm3
+
+next_vp8_uv_from_y_mask:
+    movdqu          xmm0,       [rsi]
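+    ; shuf1b keeps every second byte, and rsi advances two 16-byte rows per
+    ; iteration, so the y mask is 2x2-subsampled down to a uv mask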
+    pshufb          xmm0, [shuf1b] ;[GLOBAL(shuf1b)]
+    movq            [rdi],xmm0
+    add             rdi, 8
+    add             rsi,32
+
+    dec rcx
+    jnz next_vp8_uv_from_y_mask
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+shuf1b:
+    db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
+
--- /dev/null
+++ b/vp9/common/x86/postproc_mmx.asm
@@ -1,0 +1,534 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define VP9_FILTER_WEIGHT 128
+%define VP9_FILTER_SHIFT  7
+
+;void vp9_post_proc_down_and_across_mmx
+;(
+;    unsigned char *src_ptr,
+;    unsigned char *dst_ptr,
+;    int src_pixels_per_line,
+;    int dst_pixels_per_line,
+;    int rows,
+;    int cols,
+;    int flimit
+;)
+global sym(vp9_post_proc_down_and_across_mmx)
+sym(vp9_post_proc_down_and_across_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+    ; move the global rd onto the stack, since we don't have enough registers
+    ; to do PIC addressing
+    movq        mm0, [GLOBAL(rd)]
+    sub         rsp, 8
+    movq        [rsp], mm0
+%define RD [rsp]
+%else
+%define RD [GLOBAL(rd)]
+%endif
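+
+; each output pixel is a 5-tap vertical blur with weights 16,16,64,16,16
+; from the Blur table (a (1,1,4,1,1)/8 kernel after the rd rounding add and
+; the VP9_FILTER_SHIFT); wherever a tap differs from the center row by more
+; than flimit, the thresholding below keeps the source pixel instead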
+
+        push        rbx
+        lea         rbx, [GLOBAL(Blur)]
+        movd        mm2, dword ptr arg(6) ;flimit
+        punpcklwd   mm2, mm2
+        punpckldq   mm2, mm2
+
+        mov         rsi,        arg(0) ;src_ptr
+        mov         rdi,        arg(1) ;dst_ptr
+
+        movsxd      rcx, DWORD PTR arg(4) ;rows
+        movsxd      rax, DWORD PTR arg(2) ;src_pixels_per_line ; source pitch
+        pxor        mm0, mm0              ; mm0 = 00000000
+
+.nextrow:
+
+        xor         rdx,        rdx       ; clear out rdx for use as loop counter
+.nextcol:
+
+        pxor        mm7, mm7              ; mm7 = 00000000
+        movq        mm6, [rbx + 32 ]      ; mm6 = kernel 2 taps
+        movq        mm3, [rsi]            ; mm3 = r0 p0..p7
+        punpcklbw   mm3, mm0              ; mm3 = p0..p3
+        movq        mm1, mm3              ; mm1 = p0..p3
+        pmullw      mm3, mm6              ; mm3 *= kernel 2 modifiers
+
+        movq        mm6, [rbx + 48]       ; mm6 = kernel 3 taps
+        movq        mm5, [rsi + rax]      ; mm5 = r1 p0..p7
+        punpcklbw   mm5, mm0              ; mm5 = r1 p0..p3
+        pmullw      mm6, mm5              ; mm6 *= p0..p3 * kernel 3 modifiers
+        paddusw     mm3, mm6              ; mm3 += mm6
+
+        ; thresholding
+        movq        mm7, mm1              ; mm7 = r0 p0..p3
+        psubusw     mm7, mm5              ; mm7 = r0 p0..p3 - r1 p0..p3
+        psubusw     mm5, mm1              ; mm5 = r1 p0..p3 - r0 p0..p3
+        paddusw     mm7, mm5              ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
+        pcmpgtw     mm7, mm2
+
+        movq        mm6, [rbx + 64 ]      ; mm6 = kernel 4 modifiers
+        movq        mm5, [rsi + 2*rax]    ; mm5 = r2 p0..p7
+        punpcklbw   mm5, mm0              ; mm5 = r2 p0..p3
+        pmullw      mm6, mm5              ; mm6 = r2 p0..p3 * kernel 4 modifiers
+        paddusw     mm3, mm6              ; mm3 += mm6
+
+        ; thresholding
+        movq        mm6, mm1              ; mm6 = r0 p0..p3
+        psubusw     mm6, mm5              ; mm6 = r0 p0..p3 - r2 p0..p3
+        psubusw     mm5, mm1              ; mm5 = r2 p0..p3 - r0 p0..p3
+        paddusw     mm6, mm5              ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
+        pcmpgtw     mm6, mm2
+        por         mm7, mm6              ; accumulate thresholds
+
+
+        neg         rax
+        movq        mm6, [rbx ]           ; kernel 0 taps
+        movq        mm5, [rsi+2*rax]      ; mm5 = r-2 p0..p7
+        punpcklbw   mm5, mm0              ; mm5 = r-2 p0..p3
+        pmullw      mm6, mm5              ; mm6 = r-2 p0..p3 * kernel 0 modifiers
+        paddusw     mm3, mm6              ; mm3 += mm6
+
+        ; thresholding
+        movq        mm6, mm1              ; mm6 = r0 p0..p3
+        psubusw     mm6, mm5              ; mm6 = p0..p3 - r-2 p0..p3
+        psubusw     mm5, mm1              ; mm5 = r-2 p0..p3 - p0..p3
+        paddusw     mm6, mm5              ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
+        pcmpgtw     mm6, mm2
+        por         mm7, mm6              ; accumulate thresholds
+
+        movq        mm6, [rbx + 16]       ; kernel 1 taps
+        movq        mm4, [rsi+rax]        ; mm4 = r-1 p0..p7
+        punpcklbw   mm4, mm0              ; mm4 = r-1 p0..p3
+        pmullw      mm6, mm4              ; mm6 = r-1 p0..p3 * kernel 1 modifiers
+        paddusw     mm3, mm6              ; mm3 += mm6
+
+        ; thresholding
+        movq        mm6, mm1              ; mm6 = r0 p0..p3
+        psubusw     mm6, mm4              ; mm6 = r0 p0..p3 - r-1 p0..p3
+        psubusw     mm4, mm1              ; mm4 = r-1 p0..p3 - r0 p0..p3
+        paddusw     mm6, mm4              ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
+        pcmpgtw     mm6, mm2
+        por         mm7, mm6              ; accumulate thresholds
+
+
+        paddusw     mm3, RD               ; mm3 += round value
+        psraw       mm3, VP9_FILTER_SHIFT     ; mm3 /= 128
+
+        pand        mm1, mm7              ; mm1 select vals > thresh from source
+        pandn       mm7, mm3              ; mm7 select vals < thresh from blurred result
+        paddusw     mm1, mm7              ; combination
+
+        packuswb    mm1, mm0              ; pack to bytes
+
+        movd        [rdi], mm1            ;
+        neg         rax                   ; pitch is positive
+
+
+        add         rsi, 4
+        add         rdi, 4
+        add         rdx, 4
+
+        cmp         edx, dword ptr arg(5) ;cols
+        jl          .nextcol
+        ; done with all the cols, start the across filtering in place
+        sub         rsi, rdx
+        sub         rdi, rdx
+
+
+        push        rax
+        xor         rdx,    rdx
+        mov         rax,    [rdi-4];
+
+.acrossnextcol:
+        pxor        mm7, mm7              ; mm7 = 00000000
+        movq        mm6, [rbx + 32 ]      ;
+        movq        mm4, [rdi+rdx]        ; mm4 = p0..p7
+        movq        mm3, mm4              ; mm3 = p0..p7
+        punpcklbw   mm3, mm0              ; mm3 = p0..p3
+        movq        mm1, mm3              ; mm1 = p0..p3
+        pmullw      mm3, mm6              ; mm3 *= kernel 2 modifiers
+
+        movq        mm6, [rbx + 48]
+        psrlq       mm4, 8                ; mm4 = p1..p7
+        movq        mm5, mm4              ; mm5 = p1..p7
+        punpcklbw   mm5, mm0              ; mm5 = p1..p4
+        pmullw      mm6, mm5              ; mm6 *= p1..p4 * kernel 3 modifiers
+        paddusw     mm3, mm6              ; mm3 += mm6
+
+        ; thresholding
+        movq        mm7, mm1              ; mm7 = p0..p3
+        psubusw     mm7, mm5              ; mm7 = p0..p3 - p1..p4
+        psubusw     mm5, mm1              ; mm5 = p1..p4 - p0..p3
+        paddusw     mm7, mm5              ; mm7 = abs(p0..p3 - p1..p4)
+        pcmpgtw     mm7, mm2
+
+        movq        mm6, [rbx + 64 ]
+        psrlq       mm4, 8                ; mm4 = p2..p7
+        movq        mm5, mm4              ; mm5 = p2..p7
+        punpcklbw   mm5, mm0              ; mm5 = p2..p5
+        pmullw      mm6, mm5              ; mm6 = p2..p5 * kernel 4 modifiers
+        paddusw     mm3, mm6              ; mm3 += mm6
+
+        ; thresholding
+        movq        mm6, mm1              ; mm6 = p0..p3
+        psubusw     mm6, mm5              ; mm6 = p0..p3 - p2..p5
+        psubusw     mm5, mm1              ; mm5 = p2..p5 - p0..p3
+        paddusw     mm6, mm5              ; mm6 = abs(p0..p3 - p2..p5)
+        pcmpgtw     mm6, mm2
+        por         mm7, mm6              ; accumulate thresholds
+
+
+        movq        mm6, [rbx ]
+        movq        mm4, [rdi+rdx-2]      ; mm4 = p-2..p5
+        movq        mm5, mm4              ; mm5 = p-2..p5
+        punpcklbw   mm5, mm0              ; mm5 = p-2..p1
+        pmullw      mm6, mm5              ; mm6 = p-2..p1 * kernel 0 modifiers
+        paddusw     mm3, mm6              ; mm3 += mm6
+
+        ; thresholding
+        movq        mm6, mm1              ; mm6 = p0..p3
+        psubusw     mm6, mm5              ; mm6 = p0..p3 - p-2..p1
+        psubusw     mm5, mm1              ; mm5 = p-2..p1 - p0..p3
+        paddusw     mm6, mm5              ; mm6 = abs(p0..p3 - p-2..p1)
+        pcmpgtw     mm6, mm2
+        por         mm7, mm6              ; accumulate thresholds
+
+        movq        mm6, [rbx + 16]
+        psrlq       mm4, 8                ; mm4 = p-1..p5
+        punpcklbw   mm4, mm0              ; mm4 = p-1..p2
+        pmullw      mm6, mm4              ; mm6 = p-1..p2 * kernel 1 modifiers
+        paddusw     mm3, mm6              ; mm3 += mm6
+
+        ; thresholding
+        movq        mm6, mm1              ; mm6 = p0..p3
+        psubusw     mm6, mm4              ; mm6 = p0..p3 - p-1..p2
+        psubusw     mm4, mm1              ; mm4 = p-1..p2 - p0..p3
+        paddusw     mm6, mm4              ; mm6 = abs(p0..p3 - p-1..p2)
+        pcmpgtw     mm6, mm2
+        por         mm7, mm6              ; accumulate thresholds
+
+        paddusw     mm3, RD               ; mm3 += round value
+        psraw       mm3, VP9_FILTER_SHIFT     ; mm3 /= 128
+
+        pand        mm1, mm7              ; mm1 select vals > thresh from source
+        pandn       mm7, mm3              ; mm7 select vals < thresh from blurred result
+        paddusw     mm1, mm7              ; combination
+
+        packuswb    mm1, mm0              ; pack to bytes
+        mov         DWORD PTR [rdi+rdx-4],  eax   ; store previous four bytes
+        movd        eax,    mm1
+
+        add         rdx, 4
+        cmp         edx, dword ptr arg(5) ;cols
+        jl          .acrossnextcol;
+
+        mov         DWORD PTR [rdi+rdx-4],  eax
+        pop         rax
+
+        ; done with this row
+        add         rsi,rax               ; next line
+        movsxd      rax, dword ptr arg(3) ;dst_pixels_per_line ; destination pitch
+        add         rdi,rax               ; next destination
+        movsxd      rax, dword ptr arg(2) ;src_pixels_per_line ; source pitch
+
+        dec         rcx                   ; decrement count
+        jnz         .nextrow               ; next row
+        pop         rbx
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%undef RD
+
+
+;void vp9_mbpost_proc_down_mmx(unsigned char *dst,
+;                             int pitch, int rows, int cols,int flimit)
+extern sym(vp9_rv)
+global sym(vp9_mbpost_proc_down_mmx)
+sym(vp9_mbpost_proc_down_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 136
+
+    ; unsigned char d[16][8] at [rsp]
+    ; create flimit2 at [rsp+128]
+    mov         eax, dword ptr arg(4) ;flimit
+    mov         [rsp+128], eax
+    mov         [rsp+128+4], eax
+%define flimit2 [rsp+128]
+
+%if ABI_IS_32BIT=0
+    lea         r8,       [GLOBAL(sym(vp9_rv))]
+%endif
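+
+    ; per column, a running vertical sum (mm5) and sum of squares (mm6/mm7)
+    ; are kept over a sliding window; where 15*sum_sq - sum^2 (a scaled
+    ; variance) falls below flimit, the pixel is replaced by the
+    ; vp9_rv-dithered window mean ((sum + x + rv) >> 4)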
+
+    ;rows +=8;
+    add         dword ptr arg(2), 8
+
+    ;for(c=0; c<cols; c+=4)
+.loop_col:
+            mov         rsi,        arg(0)  ;s
+            pxor        mm0,        mm0     ;
+
+            movsxd      rax,        dword ptr arg(1) ;pitch       ;
+            neg         rax                                     ; rax = -pitch
+
+            lea         rsi,        [rsi + rax*8];              ; rsi = s[-pitch*8]
+            neg         rax
+
+
+            pxor        mm5,        mm5
+            pxor        mm6,        mm6     ;
+
+            pxor        mm7,        mm7     ;
+            mov         rdi,        rsi
+
+            mov         rcx,        15          ;
+
+.loop_initvar:
+            movd        mm1,        DWORD PTR [rdi];
+            punpcklbw   mm1,        mm0     ;
+
+            paddw       mm5,        mm1     ;
+            pmullw      mm1,        mm1     ;
+
+            movq        mm2,        mm1     ;
+            punpcklwd   mm1,        mm0     ;
+
+            punpckhwd   mm2,        mm0     ;
+            paddd       mm6,        mm1     ;
+
+            paddd       mm7,        mm2     ;
+            lea         rdi,        [rdi+rax]   ;
+
+            dec         rcx
+            jne         .loop_initvar
+            ;save the var and sum
+            xor         rdx,        rdx
+.loop_row:
+            movd        mm1,        DWORD PTR [rsi]     ; [s-pitch*8]
+            movd        mm2,        DWORD PTR [rdi]     ; [s+pitch*7]
+
+            punpcklbw   mm1,        mm0
+            punpcklbw   mm2,        mm0
+
+            paddw       mm5,        mm2
+            psubw       mm5,        mm1
+
+            pmullw      mm2,        mm2
+            movq        mm4,        mm2
+
+            punpcklwd   mm2,        mm0
+            punpckhwd   mm4,        mm0
+
+            paddd       mm6,        mm2
+            paddd       mm7,        mm4
+
+            pmullw      mm1,        mm1
+            movq        mm2,        mm1
+
+            punpcklwd   mm1,        mm0
+            psubd       mm6,        mm1
+
+            punpckhwd   mm2,        mm0
+            psubd       mm7,        mm2
+
+
+            movq        mm3,        mm6
+            pslld       mm3,        4
+
+            psubd       mm3,        mm6
+            movq        mm1,        mm5
+
+            movq        mm4,        mm5
+            pmullw      mm1,        mm1
+
+            pmulhw      mm4,        mm4
+            movq        mm2,        mm1
+
+            punpcklwd   mm1,        mm4
+            punpckhwd   mm2,        mm4
+
+            movq        mm4,        mm7
+            pslld       mm4,        4
+
+            psubd       mm4,        mm7
+
+            psubd       mm3,        mm1
+            psubd       mm4,        mm2
+
+            psubd       mm3,        flimit2
+            psubd       mm4,        flimit2
+
+            psrad       mm3,        31
+            psrad       mm4,        31
+
+            packssdw    mm3,        mm4
+            packsswb    mm3,        mm0
+
+            movd        mm1,        DWORD PTR [rsi+rax*8]
+
+            movq        mm2,        mm1
+            punpcklbw   mm1,        mm0
+
+            paddw       mm1,        mm5
+            mov         rcx,        rdx
+
+            and         rcx,        127
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+            push        rax
+            lea         rax,        [GLOBAL(sym(vp9_rv))]
+            movq        mm4,        [rax + rcx*2] ;vp9_rv[rcx*2]
+            pop         rax
+%elif ABI_IS_32BIT=0
+            movq        mm4,        [r8 + rcx*2] ;vp9_rv[rcx*2]
+%else
+            movq        mm4,        [sym(vp9_rv) + rcx*2]
+%endif
+            paddw       mm1,        mm4
+            ;paddw     xmm1,       eight8s
+            psraw       mm1,        4
+
+            packuswb    mm1,        mm0
+            pand        mm1,        mm3
+
+            pandn       mm3,        mm2
+            por         mm1,        mm3
+
+            and         rcx,        15
+            movd        DWORD PTR   [rsp+rcx*4], mm1 ;d[rcx*4]
+
+            mov         rcx,        rdx
+            sub         rcx,        8
+
+            and         rcx,        15
+            movd        mm1,        DWORD PTR [rsp+rcx*4] ;d[rcx*4]
+
+            movd        [rsi],      mm1
+            lea         rsi,        [rsi+rax]
+
+            lea         rdi,        [rdi+rax]
+            add         rdx,        1
+
+            cmp         edx,        dword arg(2) ;rows
+            jl          .loop_row
+
+
+        add         dword arg(0), 4 ; s += 4
+        sub         dword arg(3), 4 ; cols -= 4
+        cmp         dword arg(3), 0
+        jg          .loop_col
+
+    add         rsp, 136
+    pop         rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%undef flimit2
+
+
+;void vp9_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise,
+;                            unsigned char blackclamp[16],
+;                            unsigned char whiteclamp[16],
+;                            unsigned char bothclamp[16],
+;                            unsigned int Width, unsigned int Height, int Pitch)
+extern sym(rand)
+global sym(vp9_plane_add_noise_mmx)
+sym(vp9_plane_add_noise_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+.addnoise_loop:
+    call sym(rand) WRT_PLT
+    mov     rcx, arg(1) ;noise
+    and     rax, 0xff
+    add     rcx, rax
+
+    ; we rely on the fact that the clamping vectors are stored contiguously
+    ; in black/white/both order. Note that we have to reload this here because
+    ; rdx could be trashed by rand()
+    mov     rdx, arg(2) ; blackclamp
+
+
+            mov     rdi, rcx
+            movsxd  rcx, dword arg(5) ;[Width]
+            mov     rsi, arg(0) ;Pos
+            xor         rax,rax
+
+.addnoise_nextset:
+            movq        mm1,[rsi+rax]         ; get the source
+
+            psubusb     mm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
+            paddusb     mm1, [rdx+32] ;bothclamp
+            psubusb     mm1, [rdx+16] ;whiteclamp
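+            ; mm1 is now confined to [blackclamp, 255 - whiteclamp], leaving
+            ; headroom so the paddb of the noise below does not overflow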
+
+            movq        mm2,[rdi+rax]         ; get the noise for this line
+            paddb       mm1,mm2              ; add it in
+            movq        [rsi+rax],mm1         ; store the result
+
+            add         rax,8                 ; move to the next line
+
+            cmp         rax, rcx
+            jl          .addnoise_nextset
+
+    movsxd  rax, dword arg(7) ; Pitch
+    add     arg(0), rax ; Start += Pitch
+    sub     dword arg(6), 1   ; Height -= 1
+    jg      .addnoise_loop
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+SECTION_RODATA
+align 16
+Blur:
+    times 16 dw 16
+    times  8 dw 64
+    times 16 dw 16
+    times  8 dw  0
+
+rd:
+    times 4 dw 0x40
--- /dev/null
+++ b/vp9/common/x86/postproc_sse2.asm
@@ -1,0 +1,695 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp9_post_proc_down_and_across_xmm
+;(
+;    unsigned char *src_ptr,
+;    unsigned char *dst_ptr,
+;    int src_pixels_per_line,
+;    int dst_pixels_per_line,
+;    int rows,
+;    int cols,
+;    int flimit
+;)
+global sym(vp9_post_proc_down_and_across_xmm)
+sym(vp9_post_proc_down_and_across_xmm):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+    ALIGN_STACK 16, rax
+    ; move the global rd onto the stack, since we don't have enough registers
+    ; to do PIC addressing
+    movdqa      xmm0, [GLOBAL(rd42)]
+    sub         rsp, 16
+    movdqa      [rsp], xmm0
+%define RD42 [rsp]
+%else
+%define RD42 [GLOBAL(rd42)]
+%endif
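+
+; same (1,1,4,1,1)/8 blur kernel as the mmx version, computed without
+; multiplies: psllw by 2 scales the center row by four, the four
+; neighbouring taps are added, RD42 rounds, and psraw by 3 divides by 8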
+
+
+        movd        xmm2,       dword ptr arg(6) ;flimit
+        punpcklwd   xmm2,       xmm2
+        punpckldq   xmm2,       xmm2
+        punpcklqdq  xmm2,       xmm2
+
+        mov         rsi,        arg(0) ;src_ptr
+        mov         rdi,        arg(1) ;dst_ptr
+
+        movsxd      rcx,        DWORD PTR arg(4) ;rows
+        movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line ; source pitch
+        pxor        xmm0,       xmm0              ; mm0 = 00000000
+
+.nextrow:
+
+        xor         rdx,        rdx       ; clear out rdx for use as loop counter
+.nextcol:
+        movq        xmm3,       QWORD PTR [rsi]         ; mm3 = r0 p0..p7
+        punpcklbw   xmm3,       xmm0                    ; mm3 = p0..p3
+        movdqa      xmm1,       xmm3                    ; mm1 = p0..p3
+        psllw       xmm3,       2                       ;
+
+        movq        xmm5,       QWORD PTR [rsi + rax]   ; mm5 = r1 p0..p7
+        punpcklbw   xmm5,       xmm0                    ; mm5 = r1 p0..p3
+        paddusw     xmm3,       xmm5                    ; mm3 += mm5
+
+        ; thresholding
+        movdqa      xmm7,       xmm1                    ; mm7 = r0 p0..p3
+        psubusw     xmm7,       xmm5                    ; mm7 = r0 p0..p3 - r1 p0..p3
+        psubusw     xmm5,       xmm1                    ; mm5 = r1 p0..p3 - r0 p0..p3
+        paddusw     xmm7,       xmm5                    ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
+        pcmpgtw     xmm7,       xmm2
+
+        movq        xmm5,       QWORD PTR [rsi + 2*rax] ; mm5 = r2 p0..p7
+        punpcklbw   xmm5,       xmm0                    ; mm5 = r2 p0..p3
+        paddusw     xmm3,       xmm5                    ; mm3 += mm5
+
+        ; thresholding
+        movdqa      xmm6,       xmm1                    ; mm6 = r0 p0..p3
+        psubusw     xmm6,       xmm5                    ; mm6 = r0 p0..p3 - r2 p0..p3
+        psubusw     xmm5,       xmm1                    ; mm5 = r2 p0..p3 - r0 p0..p3
+        paddusw     xmm6,       xmm5                    ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
+        pcmpgtw     xmm6,       xmm2
+        por         xmm7,       xmm6                    ; accumulate thresholds
+
+
+        neg         rax
+        movq        xmm5,       QWORD PTR [rsi+2*rax]   ; mm5 = r-2 p0..p7
+        punpcklbw   xmm5,       xmm0                    ; mm5 = r-2 p0..p3
+        paddusw     xmm3,       xmm5                    ; mm3 += mm5
+
+        ; thresholding
+        movdqa      xmm6,       xmm1                    ; mm6 = r0 p0..p3
+        psubusw     xmm6,       xmm5                    ; mm6 = p0..p3 - r-2 p0..p3
+        psubusw     xmm5,       xmm1                    ; mm5 = r-2 p0..p3 - p0..p3
+        paddusw     xmm6,       xmm5                    ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
+        pcmpgtw     xmm6,       xmm2
+        por         xmm7,       xmm6                    ; accumulate thresholds
+
+        movq        xmm4,       QWORD PTR [rsi+rax]     ; mm4 = r-1 p0..p7
+        punpcklbw   xmm4,       xmm0                    ; mm4 = r-1 p0..p3
+        paddusw     xmm3,       xmm4                    ; mm3 += mm4
+
+        ; thresholding
+        movdqa      xmm6,       xmm1                    ; mm6 = r0 p0..p3
+        psubusw     xmm6,       xmm4                    ; mm6 = p0..p3 - r-1 p0..p3
+        psubusw     xmm4,       xmm1                    ; mm4 = r-1 p0..p3 - p0..p3
+        paddusw     xmm6,       xmm4                    ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
+        pcmpgtw     xmm6,       xmm2
+        por         xmm7,       xmm6                    ; accumulate thresholds
+
+
+        paddusw     xmm3,       RD42                    ; mm3 += round value
+        psraw       xmm3,       3                       ; mm3 /= 8
+
+        pand        xmm1,       xmm7                    ; mm1 select vals > thresh from source
+        pandn       xmm7,       xmm3                    ; mm7 select vals < thresh from blurred result
+        paddusw     xmm1,       xmm7                    ; combination
+
+        packuswb    xmm1,       xmm0                    ; pack to bytes
+        movq        QWORD PTR [rdi], xmm1             ;
+
+        neg         rax                   ; pitch is positive
+        add         rsi,        8
+        add         rdi,        8
+
+        add         rdx,        8
+        cmp         edx,        dword arg(5) ;cols
+
+        jl          .nextcol
+
+        ; done with all the cols, start the across filtering in place
+        sub         rsi,        rdx
+        sub         rdi,        rdx
+
+        xor         rdx,        rdx
+        movq        mm0,        QWORD PTR [rdi-8];
+
+.acrossnextcol:
+        movq        xmm7,       QWORD PTR [rdi +rdx -2]
+        movd        xmm4,       DWORD PTR [rdi +rdx +6]
+
+        pslldq      xmm4,       8
+        por         xmm4,       xmm7
+
+        movdqa      xmm3,       xmm4
+        psrldq      xmm3,       2
+        punpcklbw   xmm3,       xmm0              ; mm3 = p0..p3
+        movdqa      xmm1,       xmm3              ; mm1 = p0..p3
+        psllw       xmm3,       2
+
+
+        movdqa      xmm5,       xmm4
+        psrldq      xmm5,       3
+        punpcklbw   xmm5,       xmm0              ; mm5 = p1..p4
+        paddusw     xmm3,       xmm5              ; mm3 += mm5
+
+        ; thresholding
+        movdqa      xmm7,       xmm1              ; mm7 = p0..p3
+        psubusw     xmm7,       xmm5              ; mm7 = p0..p3 - p1..p4
+        psubusw     xmm5,       xmm1              ; mm5 = p1..p4 - p0..p3
+        paddusw     xmm7,       xmm5              ; mm7 = abs(p0..p3 - p1..p4)
+        pcmpgtw     xmm7,       xmm2
+
+        movdqa      xmm5,       xmm4
+        psrldq      xmm5,       4
+        punpcklbw   xmm5,       xmm0              ; mm5 = p2..p5
+        paddusw     xmm3,       xmm5              ; mm3 += mm5
+
+        ; thresholding
+        movdqa      xmm6,       xmm1              ; mm6 = p0..p3
+        psubusw     xmm6,       xmm5              ; mm6 = p0..p3 - p2..p5
+        psubusw     xmm5,       xmm1              ; mm5 = p2..p5 - p0..p3
+        paddusw     xmm6,       xmm5              ; mm6 = abs(p0..p3 - p2..p5)
+        pcmpgtw     xmm6,       xmm2
+        por         xmm7,       xmm6              ; accumulate thresholds
+
+
+        movdqa      xmm5,       xmm4              ; mm5 = p-2..p5
+        punpcklbw   xmm5,       xmm0              ; mm5 = p-2..p1
+        paddusw     xmm3,       xmm5              ; mm3 += mm5
+
+        ; thresholding
+        movdqa      xmm6,       xmm1              ; mm6 = p0..p3
+        psubusw     xmm6,       xmm5              ; mm6 = p0..p3 - p-2..p1
+        psubusw     xmm5,       xmm1              ; mm5 = p-2..p1 - p0..p3
+        paddusw     xmm6,       xmm5              ; mm6 = abs(p0..p3 - p-2..p1)
+        pcmpgtw     xmm6,       xmm2
+        por         xmm7,       xmm6              ; accumulate thresholds
+
+        psrldq      xmm4,       1                   ; mm4 = p-1..p5
+        punpcklbw   xmm4,       xmm0              ; mm4 = p-1..p2
+        paddusw     xmm3,       xmm4              ; mm3 += mm4
+
+        ; thresholding
+        movdqa      xmm6,       xmm1              ; mm6 = p0..p3
+        psubusw     xmm6,       xmm4              ; mm6 = p0..p3 - p-1..p2
+        psubusw     xmm4,       xmm1              ; mm4 = p-1..p2 - p0..p3
+        paddusw     xmm6,       xmm4              ; mm6 = abs(p0..p3 - p-1..p2)
+        pcmpgtw     xmm6,       xmm2
+        por         xmm7,       xmm6              ; accumulate thresholds
+
+        paddusw     xmm3,       RD42              ; mm3 += round value
+        psraw       xmm3,       3                 ; mm3 /= 8
+
+        pand        xmm1,       xmm7              ; mm1 select vals > thresh from source
+        pandn       xmm7,       xmm3              ; mm7 select vals < thresh from blurred result
+        paddusw     xmm1,       xmm7              ; combination
+
+        packuswb    xmm1,       xmm0              ; pack to bytes
+        movq        QWORD PTR [rdi+rdx-8],  mm0   ; store previous eight bytes
+        movdq2q     mm0,        xmm1
+
+        add         rdx,        8
+        cmp         edx,        dword arg(5) ;cols
+        jl          .acrossnextcol;
+
+        ; last 8 pixels
+        movq        QWORD PTR [rdi+rdx-8],  mm0
+
+        ; done with this row
+        add         rsi,rax               ; next line
+        mov         eax, dword arg(3) ;dst_pixels_per_line ; destination pitch
+        add         rdi,rax               ; next destination
+        mov         eax, dword arg(2) ;src_pixels_per_line ; source pitch
+
+        dec         rcx                   ; decrement count
+        jnz         .nextrow              ; next row
+
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+    add rsp,16
+    pop rsp
+%endif
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%undef RD42
+
+
+;void vp9_mbpost_proc_down_xmm(unsigned char *dst,
+;                            int pitch, int rows, int cols,int flimit)
+extern sym(vp9_rv)
+global sym(vp9_mbpost_proc_down_xmm)
+sym(vp9_mbpost_proc_down_xmm):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 128+16
+
+    ; unsigned char d[16][8] at [rsp]
+    ; create flimit2 at [rsp+128]
+    mov         eax, dword ptr arg(4) ;flimit
+    mov         [rsp+128], eax
+    mov         [rsp+128+4], eax
+    mov         [rsp+128+8], eax
+    mov         [rsp+128+12], eax
+%define flimit4 [rsp+128]
+
+%if ABI_IS_32BIT=0
+    lea         r8,       [GLOBAL(sym(vp9_rv))]
+%endif
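+
+    ; same sliding-window mean/variance filter as the mmx version above,
+    ; but processing eight columns per pass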
+
+    ;rows +=8;
+    add         dword arg(2), 8
+
+    ;for(c=0; c<cols; c+=8)
+.loop_col:
+            mov         rsi,        arg(0) ; s
+            pxor        xmm0,       xmm0        ;
+
+            movsxd      rax,        dword ptr arg(1) ;pitch       ;
+            neg         rax                                     ; rax = -pitch
+
+            lea         rsi,        [rsi + rax*8];              ; rsi = s[-pitch*8]
+            neg         rax
+
+
+            pxor        xmm5,       xmm5
+            pxor        xmm6,       xmm6        ;
+
+            pxor        xmm7,       xmm7        ;
+            mov         rdi,        rsi
+
+            mov         rcx,        15          ;
+
+.loop_initvar:
+            movq        xmm1,       QWORD PTR [rdi];
+            punpcklbw   xmm1,       xmm0        ;
+
+            paddw       xmm5,       xmm1        ;
+            pmullw      xmm1,       xmm1        ;
+
+            movdqa      xmm2,       xmm1        ;
+            punpcklwd   xmm1,       xmm0        ;
+
+            punpckhwd   xmm2,       xmm0        ;
+            paddd       xmm6,       xmm1        ;
+
+            paddd       xmm7,       xmm2        ;
+            lea         rdi,        [rdi+rax]   ;
+
+            dec         rcx
+            jne         .loop_initvar
+            ;save the var and sum
+            xor         rdx,        rdx
+.loop_row:
+            movq        xmm1,       QWORD PTR [rsi]     ; [s-pitch*8]
+            movq        xmm2,       QWORD PTR [rdi]     ; [s+pitch*7]
+
+            punpcklbw   xmm1,       xmm0
+            punpcklbw   xmm2,       xmm0
+
+            paddw       xmm5,       xmm2
+            psubw       xmm5,       xmm1
+
+            pmullw      xmm2,       xmm2
+            movdqa      xmm4,       xmm2
+
+            punpcklwd   xmm2,       xmm0
+            punpckhwd   xmm4,       xmm0
+
+            paddd       xmm6,       xmm2
+            paddd       xmm7,       xmm4
+
+            pmullw      xmm1,       xmm1
+            movdqa      xmm2,       xmm1
+
+            punpcklwd   xmm1,       xmm0
+            psubd       xmm6,       xmm1
+
+            punpckhwd   xmm2,       xmm0
+            psubd       xmm7,       xmm2
+
+
+            movdqa      xmm3,       xmm6
+            pslld       xmm3,       4
+
+            psubd       xmm3,       xmm6
+            movdqa      xmm1,       xmm5
+
+            movdqa      xmm4,       xmm5
+            pmullw      xmm1,       xmm1
+
+            pmulhw      xmm4,       xmm4
+            movdqa      xmm2,       xmm1
+
+            punpcklwd   xmm1,       xmm4
+            punpckhwd   xmm2,       xmm4
+
+            movdqa      xmm4,       xmm7
+            pslld       xmm4,       4
+
+            psubd       xmm4,       xmm7
+
+            psubd       xmm3,       xmm1
+            psubd       xmm4,       xmm2
+
+            psubd       xmm3,       flimit4
+            psubd       xmm4,       flimit4
+
+            psrad       xmm3,       31
+            psrad       xmm4,       31
+
+            packssdw    xmm3,       xmm4
+            packsswb    xmm3,       xmm0
+
+            movq        xmm1,       QWORD PTR [rsi+rax*8]
+
+            movq        xmm2,       xmm1
+            punpcklbw   xmm1,       xmm0
+
+            paddw       xmm1,       xmm5
+            mov         rcx,        rdx
+
+            and         rcx,        127
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+            push        rax
+            lea         rax,        [GLOBAL(sym(vp9_rv))]
+            movdqu      xmm4,       [rax + rcx*2] ;vp9_rv[rcx]
+            pop         rax
+%elif ABI_IS_32BIT=0
+            movdqu      xmm4,       [r8 + rcx*2] ;vp9_rv[rcx]
+%else
+            movdqu      xmm4,       [sym(vp9_rv) + rcx*2]
+%endif
+
+            paddw       xmm1,       xmm4
+            ;paddw     xmm1,       eight8s
+            psraw       xmm1,       4
+
+            packuswb    xmm1,       xmm0
+            pand        xmm1,       xmm3
+
+            pandn       xmm3,       xmm2
+            por         xmm1,       xmm3
+
+            and         rcx,        15
+            movq        QWORD PTR   [rsp + rcx*8], xmm1 ;d[rcx*8]
+
+            mov         rcx,        rdx
+            sub         rcx,        8
+
+            and         rcx,        15
+            movq        mm0,        [rsp + rcx*8] ;d[rcx*8]
+
+            movq        [rsi],      mm0
+            lea         rsi,        [rsi+rax]
+
+            lea         rdi,        [rdi+rax]
+            add         rdx,        1
+
+            cmp         edx,        dword arg(2) ;rows
+            jl          .loop_row
+
+        add         dword arg(0), 8 ; s += 8
+        sub         dword arg(3), 8 ; cols -= 8
+        cmp         dword arg(3), 0
+        jg          .loop_col
+
+    add         rsp, 128+16
+    pop         rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%undef flimit4
+
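A hedged scalar model of the column filter above (illustrative, not the project's reference C code): a 16-row sliding window of sum and sum-of-squares feeds a variance test, and pixels in low-variance columns are replaced by the dithered window mean. It assumes 8 readable border rows above and below, as the asm does, and that rv is a 128-entry dither table like vp9_rv:

static void mbpost_proc_down_model(unsigned char *dst, int pitch, int rows,
                                   int cols, int flimit, const short *rv) {
  int r, c, i;
  for (c = 0; c < cols; c++) {
    unsigned char *s = dst + c;
    unsigned char d[16];
    int sumsq = 0, sum = 0;

    for (i = -8; i <= 6; i++) {             /* prime the window, 15 rows */
      sumsq += s[i * pitch] * s[i * pitch];
      sum += s[i * pitch];
    }

    for (r = 0; r < rows + 8; r++) {        /* +8 flushes delayed writes */
      sumsq += s[7 * pitch] * s[7 * pitch] - s[-8 * pitch] * s[-8 * pitch];
      sum += s[7 * pitch] - s[-8 * pitch];

      d[r & 15] = s[0];
      if (sumsq * 15 - sum * sum < flimit)  /* low variance: filter */
        d[r & 15] = (unsigned char)((rv[r & 127] + sum + s[0]) >> 4);

      if (r >= 8)                           /* 8-row delayed write-back */
        s[-8 * pitch] = d[(r - 8) & 15];
      s += pitch;
    }
  }
}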
+
+;void vp9_mbpost_proc_across_ip_xmm(unsigned char *src,
+;                                int pitch, int rows, int cols, int flimit)
+global sym(vp9_mbpost_proc_across_ip_xmm)
+sym(vp9_mbpost_proc_across_ip_xmm):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16
+
+    ; create flimit4 at [rsp]
+    mov         eax, dword ptr arg(4) ;flimit
+    mov         [rsp], eax
+    mov         [rsp+4], eax
+    mov         [rsp+8], eax
+    mov         [rsp+12], eax
+%define flimit4 [rsp]
+
+
+    ;for(r=0;r<rows;r++)
+.ip_row_loop:
+
+        xor         rdx,    rdx ;sumsq=0;
+        xor         rcx,    rcx ;sum=0;
+        mov         rsi,    arg(0); s
+        mov         rdi,    -8
+.ip_var_loop:
+        ;for(i=-8;i<=6;i++)
+        ;{
+        ;    sumsq += s[i]*s[i];
+        ;    sum   += s[i];
+        ;}
+        movzx       eax, byte [rsi+rdi]
+        add         ecx, eax
+        mul         al
+        add         edx, eax
+        add         rdi, 1
+        cmp         rdi, 6
+        jle         .ip_var_loop
+
+
+            ;mov         rax,    sumsq
+            ;movd        xmm7,   rax
+            movd        xmm7,   edx
+
+            ;mov         rax,    sum
+            ;movd        xmm6,   rax
+            movd        xmm6,   ecx
+
+            mov         rsi,    arg(0) ;s
+            xor         rcx,    rcx
+
+            movsxd      rdx,    dword arg(3) ;cols
+            add         rdx,    8
+            pxor        mm0,    mm0
+            pxor        mm1,    mm1
+
+            pxor        xmm0,   xmm0
+.nextcol4:
+
+            movd        xmm1,   DWORD PTR [rsi+rcx-8]   ; -8 -7 -6 -5
+            movd        xmm2,   DWORD PTR [rsi+rcx+7]   ; +7 +8 +9 +10
+
+            punpcklbw   xmm1,   xmm0                    ; expanding
+            punpcklbw   xmm2,   xmm0                    ; expanding
+
+            punpcklwd   xmm1,   xmm0                    ; expanding to dwords
+            punpcklwd   xmm2,   xmm0                    ; expanding to dwords
+
+            psubd       xmm2,   xmm1                    ; 7--8   8--7   9--6 10--5
+            paddd       xmm1,   xmm1                    ; -8*2   -7*2   -6*2 -5*2
+
+            paddd       xmm1,   xmm2                    ; 7+-8   8+-7   9+-6 10+-5
+            pmaddwd     xmm1,   xmm2                    ; squared of 7+-8   8+-7   9+-6 10+-5
+
+            paddd       xmm6,   xmm2
+            paddd       xmm7,   xmm1
+
+            pshufd      xmm6,   xmm6,   0               ; duplicate the last ones
+            pshufd      xmm7,   xmm7,   0               ; duplicate the last ones
+
+            psrldq      xmm1,       4                   ; 8--7   9--6 10--5  0000
+            psrldq      xmm2,       4                   ; 8--7   9--6 10--5  0000
+
+            pshufd      xmm3,   xmm1,   3               ; 0000  8--7   8--7   8--7 squared
+            pshufd      xmm4,   xmm2,   3               ; 0000  8--7   8--7   8--7 squared
+
+            paddd       xmm6,   xmm4
+            paddd       xmm7,   xmm3
+
+            pshufd      xmm3,   xmm1,   01011111b       ; 0000  0000   9--6   9--6 squared
+            pshufd      xmm4,   xmm2,   01011111b       ; 0000  0000   9--6   9--6 squared
+
+            paddd       xmm7,   xmm3
+            paddd       xmm6,   xmm4
+
+            pshufd      xmm3,   xmm1,   10111111b       ; 0000  0000   8--7   8--7 squared
+            pshufd      xmm4,   xmm2,   10111111b       ; 0000  0000   8--7   8--7 squared
+
+            paddd       xmm7,   xmm3
+            paddd       xmm6,   xmm4
+
+            movdqa      xmm3,   xmm6
+            pmaddwd     xmm3,   xmm3
+
+            movdqa      xmm5,   xmm7
+            pslld       xmm5,   4
+
+            psubd       xmm5,   xmm7
+            psubd       xmm5,   xmm3
+
+            psubd       xmm5,   flimit4
+            psrad       xmm5,   31
+
+            packssdw    xmm5,   xmm0
+            packsswb    xmm5,   xmm0
+
+            movd        xmm1,   DWORD PTR [rsi+rcx]
+            movq        xmm2,   xmm1
+
+            punpcklbw   xmm1,   xmm0
+            punpcklwd   xmm1,   xmm0
+
+            paddd       xmm1,   xmm6
+            paddd       xmm1,   [GLOBAL(four8s)]
+
+            psrad       xmm1,   4
+            packssdw    xmm1,   xmm0
+
+            packuswb    xmm1,   xmm0
+            pand        xmm1,   xmm5
+
+            pandn       xmm5,   xmm2
+            por         xmm5,   xmm1
+
+            movd        [rsi+rcx-8],  mm0
+            movq        mm0,    mm1
+
+            movdq2q     mm1,    xmm5
+            psrldq      xmm7,   12
+
+            psrldq      xmm6,   12
+            add         rcx,    4
+
+            cmp         rcx,    rdx
+            jl          .nextcol4
+
+        ;s+=pitch;
+        movsxd rax, dword arg(1)
+        add    arg(0), rax
+
+        sub dword arg(2), 1 ;rows-=1
+        cmp dword arg(2), 0
+        jg .ip_row_loop
+
+    add         rsp, 16
+    pop         rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%undef flimit4
+
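The horizontal in-place variant follows the same pattern. A hedged scalar sketch (illustrative names; assumes 8 readable border pixels on each side of the row, as the asm does): a 15-tap sliding window of sum and sum-of-squares drives the variance test, and output is written 8 pixels behind the window so the window reads still see unfiltered data:

static void mbpost_proc_across_ip_model(unsigned char *src, int pitch,
                                        int rows, int cols, int flimit) {
  int r, c, i;
  for (r = 0; r < rows; r++) {
    unsigned char *s = &src[r * pitch];
    unsigned char d[16];
    int sumsq = 0, sum = 0;

    for (i = -8; i <= 6; i++) {             /* prime the window */
      sumsq += s[i] * s[i];
      sum += s[i];
    }

    for (c = 0; c < cols + 8; c++) {        /* +8 flushes delayed writes */
      sumsq += s[c + 7] * s[c + 7] - s[c - 8] * s[c - 8];
      sum += s[c + 7] - s[c - 8];

      d[c & 15] = s[c];
      if (sumsq * 15 - sum * sum < flimit)  /* 8 = rounding, as four8s */
        d[c & 15] = (unsigned char)((8 + sum + s[c]) >> 4);

      if (c >= 8)                           /* delayed write-back */
        s[c - 8] = d[(c - 8) & 15];
    }
  }
}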
+
+;void vp9_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise,
+;                            unsigned char blackclamp[16],
+;                            unsigned char whiteclamp[16],
+;                            unsigned char bothclamp[16],
+;                            unsigned int Width, unsigned int Height, int Pitch)
+extern sym(rand)
+global sym(vp9_plane_add_noise_wmt)
+sym(vp9_plane_add_noise_wmt):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+.addnoise_loop:
+    call sym(rand) WRT_PLT
+    mov     rcx, arg(1) ;noise
+    and     rax, 0xff
+    add     rcx, rax
+
+    ; we rely on the fact that the clamping vectors are stored contiguously
+    ; in black/white/both order. Note that we have to reload this here because
+    ; rdx could be trashed by rand()
+    mov     rdx, arg(2) ; blackclamp
+
+
+            mov     rdi, rcx
+            movsxd  rcx, dword arg(5) ;[Width]
+            mov     rsi, arg(0) ;Pos
+            xor         rax,rax
+
+.addnoise_nextset:
+            movdqu      xmm1,[rsi+rax]         ; get the source
+
+            psubusb     xmm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
+            paddusb     xmm1, [rdx+32] ;bothclamp
+            psubusb     xmm1, [rdx+16] ;whiteclamp
+
+            movdqu      xmm2,[rdi+rax]         ; get the noise for this line
+            paddb       xmm1,xmm2              ; add it in
+            movdqu      [rsi+rax],xmm1         ; store the result
+
+            add         rax,16                 ; move to the next line
+
+            cmp         rax, rcx
+            jl          .addnoise_nextset
+
+    movsxd  rax, dword arg(7) ; Pitch
+    add     arg(0), rax ; Start += Pitch
+    sub     dword arg(6), 1   ; Height -= 1
+    jg      .addnoise_loop
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
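A hedged C sketch of the add-noise loop above: clamp each pixel into the headroom left by the clamp vectors via saturating ops, then add a noise byte selected by rand(). Names are illustrative, and it assumes (as the caller arranges) that the noise values stay within the clamped headroom:

#include <stdlib.h>

static void plane_add_noise_model(unsigned char *start, unsigned char *noise,
                                  unsigned char blackclamp[16],
                                  unsigned char whiteclamp[16],
                                  unsigned char bothclamp[16],
                                  unsigned int width, unsigned int height,
                                  int pitch) {
  unsigned int i, j;
  for (i = 0; i < height; i++) {
    unsigned char *pos = start + i * pitch;
    unsigned char *ref = noise + (rand() & 0xff);  /* per-row noise offset */

    for (j = 0; j < width; j++) {
      int v = pos[j];
      v = (v < blackclamp[0]) ? 0 : v - blackclamp[0];       /* psubusb */
      v = (v + bothclamp[0] > 255) ? 255 : v + bothclamp[0]; /* paddusb */
      v = (v < whiteclamp[0]) ? 0 : v - whiteclamp[0];       /* psubusb */
      pos[j] = (unsigned char)(v + (signed char)ref[j]);     /* paddb */
    }
  }
}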
+SECTION_RODATA
+align 16
+rd42:
+    times 8 dw 0x04
+four8s:
+    times 4 dd 8
--- /dev/null
+++ b/vp9/common/x86/postproc_x86.h
@@ -1,0 +1,64 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef POSTPROC_X86_H
+#define POSTPROC_X86_H
+
+/* Note:
+ *
+ * This platform is commonly built for runtime CPU detection. If you modify
+ * any of the function mappings present in this file, be sure to also update
+ * them in the function pointer initialization code
+ */
+
+#if HAVE_MMX
+extern prototype_postproc_inplace(vp9_mbpost_proc_down_mmx);
+extern prototype_postproc(vp9_post_proc_down_and_across_mmx);
+extern prototype_postproc_addnoise(vp9_plane_add_noise_mmx);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp9_postproc_down
+#define vp9_postproc_down vp9_mbpost_proc_down_mmx
+
+#undef  vp9_postproc_downacross
+#define vp9_postproc_downacross vp9_post_proc_down_and_across_mmx
+
+#undef  vp9_postproc_addnoise
+#define vp9_postproc_addnoise vp9_plane_add_noise_mmx
+
+#endif
+#endif
+
+
+#if HAVE_SSE2
+extern prototype_postproc_inplace(vp9_mbpost_proc_down_xmm);
+extern prototype_postproc_inplace(vp9_mbpost_proc_across_ip_xmm);
+extern prototype_postproc(vp9_post_proc_down_and_across_xmm);
+extern prototype_postproc_addnoise(vp9_plane_add_noise_wmt);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp9_postproc_down
+#define vp9_postproc_down vp9_mbpost_proc_down_xmm
+
+#undef  vp9_postproc_across
+#define vp9_postproc_across vp9_mbpost_proc_across_ip_xmm
+
+#undef  vp9_postproc_downacross
+#define vp9_postproc_downacross vp9_post_proc_down_and_across_xmm
+
+#undef  vp9_postproc_addnoise
+#define vp9_postproc_addnoise vp9_plane_add_noise_wmt
+
+
+#endif
+#endif
+
+#endif
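The note at the top of this header refers to the function-pointer initialization used when CONFIG_RUNTIME_CPU_DETECT is enabled: instead of the compile-time #define mapping above, an init routine patches pointers from detected CPU flags. A hedged, self-contained sketch of that pattern (all names here are illustrative, not the real RTCD symbols):

typedef void (*postproc_down_fn)(unsigned char *dst, int pitch, int rows,
                                 int cols, int flimit);

void mbpost_proc_down_c_stub(unsigned char *d, int p, int r, int c, int f);
void mbpost_proc_down_xmm_stub(unsigned char *d, int p, int r, int c, int f);

/* default to the generic C version */
static postproc_down_fn postproc_down = mbpost_proc_down_c_stub;

static void postproc_rtcd_init(int has_sse2) {
  if (has_sse2)
    postproc_down = mbpost_proc_down_xmm_stub;  /* patch at runtime */
}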
--- /dev/null
+++ b/vp9/common/x86/recon_mmx.asm
@@ -1,0 +1,321 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+;void vp9_recon_b_mmx(unsigned char *s, short *q, unsigned char *d, int stride)
+global sym(vp9_recon_b_mmx)
+sym(vp9_recon_b_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov       rsi, arg(0) ;s
+        mov       rdi, arg(2) ;d
+        mov       rdx, arg(1) ;q
+        movsxd    rax, dword ptr arg(3) ;stride
+        pxor      mm0, mm0
+
+        movd      mm1, [rsi]
+        punpcklbw mm1, mm0
+        paddsw    mm1, [rdx]
+        packuswb  mm1,  mm0              ; pack and unpack to saturate
+        movd      [rdi], mm1
+
+        movd      mm2, [rsi+16]
+        punpcklbw mm2, mm0
+        paddsw    mm2, [rdx+32]
+        packuswb  mm2, mm0              ; pack and unpack to saturate
+        movd      [rdi+rax], mm2
+
+        movd      mm3, [rsi+32]
+        punpcklbw mm3, mm0
+        paddsw    mm3, [rdx+64]
+        packuswb  mm3,  mm0              ; pack and unpack to saturate
+        movd      [rdi+2*rax], mm3
+
+        add       rdi, rax
+        movd      mm4, [rsi+48]
+        punpcklbw mm4, mm0
+        paddsw    mm4, [rdx+96]
+        packuswb  mm4, mm0              ; pack and unpack to saturate
+        movd      [rdi+2*rax], mm4
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
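A scalar equivalent of the reconstruction step above: add the dequantized residual to the predictor and saturate to 8 bits over a 4x4 block. This is a sketch under the same layout assumptions as the asm (predictor rows 16 bytes apart, residual rows 16 shorts apart):

static unsigned char clamp255(int v) {
  return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void recon_b_model(const unsigned char *s, const short *q,
                          unsigned char *d, int stride) {
  int r, c;
  for (r = 0; r < 4; r++) {
    for (c = 0; c < 4; c++)
      d[c] = clamp255(s[c] + q[c]);
    s += 16;   /* predictor pitch, per the [rsi+16] steps above */
    q += 16;   /* residual pitch, per the 32-byte [rdx] steps above */
    d += stride;
  }
}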
+
+;void vp9_copy_mem8x8_mmx(
+;    unsigned char *src,
+;    int src_stride,
+;    unsigned char *dst,
+;    int dst_stride
+;    )
+global sym(vp9_copy_mem8x8_mmx)
+sym(vp9_copy_mem8x8_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rsi,        arg(0) ;src;
+        movq        mm0,        [rsi]
+
+        movsxd      rax,        dword ptr arg(1) ;src_stride;
+        mov         rdi,        arg(2) ;dst;
+
+        movq        mm1,        [rsi+rax]
+        movq        mm2,        [rsi+rax*2]
+
+        movsxd      rcx,        dword ptr arg(3) ;dst_stride
+        lea         rsi,        [rsi+rax*2]
+
+        movq        [rdi],      mm0
+        add         rsi,        rax
+
+        movq        [rdi+rcx],      mm1
+        movq        [rdi+rcx*2],    mm2
+
+
+        lea         rdi,        [rdi+rcx*2]
+        movq        mm3,        [rsi]
+
+        add         rdi,        rcx
+        movq        mm4,        [rsi+rax]
+
+        movq        mm5,        [rsi+rax*2]
+        movq        [rdi],      mm3
+
+        lea         rsi,        [rsi+rax*2]
+        movq        [rdi+rcx],  mm4
+
+        movq        [rdi+rcx*2],    mm5
+        lea         rdi,        [rdi+rcx*2]
+
+        movq        mm0,        [rsi+rax]
+        movq        mm1,        [rsi+rax*2]
+
+        movq        [rdi+rcx],  mm0
+        movq        [rdi+rcx*2],mm1
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_copy_mem8x4_mmx(
+;    unsigned char *src,
+;    int src_stride,
+;    unsigned char *dst,
+;    int dst_stride
+;    )
+global sym(vp9_copy_mem8x4_mmx)
+sym(vp9_copy_mem8x4_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rsi,        arg(0) ;src;
+        movq        mm0,        [rsi]
+
+        movsxd      rax,        dword ptr arg(1) ;src_stride;
+        mov         rdi,        arg(2) ;dst;
+
+        movq        mm1,        [rsi+rax]
+        movq        mm2,        [rsi+rax*2]
+
+        movsxd      rcx,        dword ptr arg(3) ;dst_stride
+        lea         rsi,        [rsi+rax*2]
+
+        movq        [rdi],      mm0
+        movq        [rdi+rcx],      mm1
+
+        movq        [rdi+rcx*2],    mm2
+        lea         rdi,        [rdi+rcx*2]
+
+        movq        mm3,        [rsi+rax]
+        movq        [rdi+rcx],      mm3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_copy_mem16x16_mmx(
+;    unsigned char *src,
+;    int src_stride,
+;    unsigned char *dst,
+;    int dst_stride
+;    )
+global sym(vp9_copy_mem16x16_mmx)
+sym(vp9_copy_mem16x16_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rsi,        arg(0) ;src;
+        movsxd      rax,        dword ptr arg(1) ;src_stride;
+
+        mov         rdi,        arg(2) ;dst;
+        movsxd      rcx,        dword ptr arg(3) ;dst_stride
+
+        movq        mm0,            [rsi]
+        movq        mm3,            [rsi+8];
+
+        movq        mm1,            [rsi+rax]
+        movq        mm4,            [rsi+rax+8]
+
+        movq        mm2,            [rsi+rax*2]
+        movq        mm5,            [rsi+rax*2+8]
+
+        lea         rsi,            [rsi+rax*2]
+        add         rsi,            rax
+
+        movq        [rdi],          mm0
+        movq        [rdi+8],        mm3
+
+        movq        [rdi+rcx],      mm1
+        movq        [rdi+rcx+8],    mm4
+
+        movq        [rdi+rcx*2],    mm2
+        movq        [rdi+rcx*2+8],  mm5
+
+        lea         rdi,            [rdi+rcx*2]
+        add         rdi,            rcx
+
+        movq        mm0,            [rsi]
+        movq        mm3,            [rsi+8];
+
+        movq        mm1,            [rsi+rax]
+        movq        mm4,            [rsi+rax+8]
+
+        movq        mm2,            [rsi+rax*2]
+        movq        mm5,            [rsi+rax*2+8]
+
+        lea         rsi,            [rsi+rax*2]
+        add         rsi,            rax
+
+        movq        [rdi],          mm0
+        movq        [rdi+8],        mm3
+
+        movq        [rdi+rcx],      mm1
+        movq        [rdi+rcx+8],    mm4
+
+        movq        [rdi+rcx*2],    mm2
+        movq        [rdi+rcx*2+8],  mm5
+
+        lea         rdi,            [rdi+rcx*2]
+        add         rdi,            rcx
+
+        movq        mm0,            [rsi]
+        movq        mm3,            [rsi+8];
+
+        movq        mm1,            [rsi+rax]
+        movq        mm4,            [rsi+rax+8]
+
+        movq        mm2,            [rsi+rax*2]
+        movq        mm5,            [rsi+rax*2+8]
+
+        lea         rsi,            [rsi+rax*2]
+        add         rsi,            rax
+
+        movq        [rdi],          mm0
+        movq        [rdi+8],        mm3
+
+        movq        [rdi+rcx],      mm1
+        movq        [rdi+rcx+8],    mm4
+
+        movq        [rdi+rcx*2],    mm2
+        movq        [rdi+rcx*2+8],  mm5
+
+        lea         rdi,            [rdi+rcx*2]
+        add         rdi,            rcx
+
+        movq        mm0,            [rsi]
+        movq        mm3,            [rsi+8];
+
+        movq        mm1,            [rsi+rax]
+        movq        mm4,            [rsi+rax+8]
+
+        movq        mm2,            [rsi+rax*2]
+        movq        mm5,            [rsi+rax*2+8]
+
+        lea         rsi,            [rsi+rax*2]
+        add         rsi,            rax
+
+        movq        [rdi],          mm0
+        movq        [rdi+8],        mm3
+
+        movq        [rdi+rcx],      mm1
+        movq        [rdi+rcx+8],    mm4
+
+        movq        [rdi+rcx*2],    mm2
+        movq        [rdi+rcx*2+8],  mm5
+
+        lea         rdi,            [rdi+rcx*2]
+        add         rdi,            rcx
+
+        movq        mm0,            [rsi]
+        movq        mm3,            [rsi+8];
+
+        movq        mm1,            [rsi+rax]
+        movq        mm4,            [rsi+rax+8]
+
+        movq        mm2,            [rsi+rax*2]
+        movq        mm5,            [rsi+rax*2+8]
+
+        lea         rsi,            [rsi+rax*2]
+        add         rsi,            rax
+
+        movq        [rdi],          mm0
+        movq        [rdi+8],        mm3
+
+        movq        [rdi+rcx],      mm1
+        movq        [rdi+rcx+8],    mm4
+
+        movq        [rdi+rcx*2],    mm2
+        movq        [rdi+rcx*2+8],  mm5
+
+        lea         rdi,            [rdi+rcx*2]
+        add         rdi,            rcx
+
+        movq        mm0,            [rsi]
+        movq        mm3,            [rsi+8];
+
+        movq        [rdi],          mm0
+        movq        [rdi+8],        mm3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
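The copy kernels in this file are plain strided block copies; the unrolled MMX/SSE2 bodies only batch the loads and stores. A scalar equivalent for the 16x16 case:

#include <string.h>

static void copy_mem16x16_model(const unsigned char *src, int src_stride,
                                unsigned char *dst, int dst_stride) {
  int r;
  for (r = 0; r < 16; r++) {
    memcpy(dst, src, 16);  /* one 16-byte row per iteration */
    src += src_stride;
    dst += dst_stride;
  }
}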
--- /dev/null
+++ b/vp9/common/x86/recon_sse2.asm
@@ -1,0 +1,688 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+;void vp9_recon2b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
+global sym(vp9_recon2b_sse2)
+sym(vp9_recon2b_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rsi,        arg(0) ;s
+        mov         rdi,        arg(2) ;d
+        mov         rdx,        arg(1) ;q
+        movsxd      rax,        dword ptr arg(3) ;stride
+        pxor        xmm0,       xmm0
+
+        movq        xmm1,       MMWORD PTR [rsi]
+        punpcklbw   xmm1,       xmm0
+        paddsw      xmm1,       XMMWORD PTR [rdx]
+        packuswb    xmm1,       xmm0              ; pack and unpack to saturate
+        movq        MMWORD PTR [rdi],   xmm1
+
+
+        movq        xmm2,       MMWORD PTR [rsi+8]
+        punpcklbw   xmm2,       xmm0
+        paddsw      xmm2,       XMMWORD PTR [rdx+16]
+        packuswb    xmm2,       xmm0              ; pack and unpack to saturate
+        movq        MMWORD PTR [rdi+rax],   xmm2
+
+
+        movq        xmm3,       MMWORD PTR [rsi+16]
+        punpcklbw   xmm3,       xmm0
+        paddsw      xmm3,       XMMWORD PTR [rdx+32]
+        packuswb    xmm3,       xmm0              ; pack and unpack to saturate
+        movq        MMWORD PTR [rdi+rax*2], xmm3
+
+        add         rdi, rax
+        movq        xmm4,       MMWORD PTR [rsi+24]
+        punpcklbw   xmm4,       xmm0
+        paddsw      xmm4,       XMMWORD PTR [rdx+48]
+        packuswb    xmm4,       xmm0              ; pack and unpack to saturate
+        movq        MMWORD PTR [rdi+rax*2], xmm4
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_recon4b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
+global sym(vp9_recon4b_sse2)
+sym(vp9_recon4b_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rsi,        arg(0) ;s
+        mov         rdi,        arg(2) ;d
+        mov         rdx,        arg(1) ;q
+        movsxd      rax,        dword ptr arg(3) ;stride
+        pxor        xmm0,       xmm0
+
+        movdqa      xmm1,       XMMWORD PTR [rsi]
+        movdqa      xmm5,       xmm1
+        punpcklbw   xmm1,       xmm0
+        punpckhbw   xmm5,       xmm0
+        paddsw      xmm1,       XMMWORD PTR [rdx]
+        paddsw      xmm5,       XMMWORD PTR [rdx+16]
+        packuswb    xmm1,       xmm5              ; pack and unpack to saturate
+        movdqa      XMMWORD PTR [rdi],  xmm1
+
+
+        movdqa      xmm2,       XMMWORD PTR [rsi+16]
+        movdqa      xmm6,       xmm2
+        punpcklbw   xmm2,       xmm0
+        punpckhbw   xmm6,       xmm0
+        paddsw      xmm2,       XMMWORD PTR [rdx+32]
+        paddsw      xmm6,       XMMWORD PTR [rdx+48]
+        packuswb    xmm2,       xmm6              ; pack and unpack to saturate
+        movdqa      XMMWORD PTR [rdi+rax],  xmm2
+
+
+        movdqa      xmm3,       XMMWORD PTR [rsi+32]
+        movdqa      xmm7,       xmm3
+        punpcklbw   xmm3,       xmm0
+        punpckhbw   xmm7,       xmm0
+        paddsw      xmm3,       XMMWORD PTR [rdx+64]
+        paddsw      xmm7,       XMMWORD PTR [rdx+80]
+        packuswb    xmm3,       xmm7              ; pack and unpack to saturate
+        movdqa      XMMWORD PTR [rdi+rax*2],    xmm3
+
+        add       rdi, rax
+        movdqa      xmm4,       XMMWORD PTR [rsi+48]
+        movdqa      xmm5,       xmm4
+        punpcklbw   xmm4,       xmm0
+        punpckhbw   xmm5,       xmm0
+        paddsw      xmm4,       XMMWORD PTR [rdx+96]
+        paddsw      xmm5,       XMMWORD PTR [rdx+112]
+        packuswb    xmm4,       xmm5              ; pack and unpack to saturate
+        movdqa      XMMWORD PTR [rdi+rax*2],    xmm4
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_copy_mem16x16_sse2(
+;    unsigned char *src,
+;    int src_stride,
+;    unsigned char *dst,
+;    int dst_stride
+;    )
+global sym(vp9_copy_mem16x16_sse2)
+sym(vp9_copy_mem16x16_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rsi,        arg(0) ;src;
+        movdqu      xmm0,       [rsi]
+
+        movsxd      rax,        dword ptr arg(1) ;src_stride;
+        mov         rdi,        arg(2) ;dst;
+
+        movdqu      xmm1,       [rsi+rax]
+        movdqu      xmm2,       [rsi+rax*2]
+
+        movsxd      rcx,        dword ptr arg(3) ;dst_stride
+        lea         rsi,        [rsi+rax*2]
+
+        movdqa      [rdi],      xmm0
+        add         rsi,        rax
+
+        movdqa      [rdi+rcx],  xmm1
+        movdqa      [rdi+rcx*2],xmm2
+
+        lea         rdi,        [rdi+rcx*2]
+        movdqu      xmm3,       [rsi]
+
+        add         rdi,        rcx
+        movdqu      xmm4,       [rsi+rax]
+
+        movdqu      xmm5,       [rsi+rax*2]
+        lea         rsi,        [rsi+rax*2]
+
+        movdqa      [rdi],  xmm3
+        add         rsi,        rax
+
+        movdqa      [rdi+rcx],  xmm4
+        movdqa      [rdi+rcx*2],xmm5
+
+        lea         rdi,        [rdi+rcx*2]
+        movdqu      xmm0,       [rsi]
+
+        add         rdi,        rcx
+        movdqu      xmm1,       [rsi+rax]
+
+        movdqu      xmm2,       [rsi+rax*2]
+        lea         rsi,        [rsi+rax*2]
+
+        movdqa      [rdi],      xmm0
+        add         rsi,        rax
+
+        movdqa      [rdi+rcx],  xmm1
+
+        movdqa      [rdi+rcx*2],    xmm2
+        movdqu      xmm3,       [rsi]
+
+        movdqu      xmm4,       [rsi+rax]
+        lea         rdi,        [rdi+rcx*2]
+
+        add         rdi,        rcx
+        movdqu      xmm5,       [rsi+rax*2]
+
+        lea         rsi,        [rsi+rax*2]
+        movdqa      [rdi],  xmm3
+
+        add         rsi,        rax
+        movdqa      [rdi+rcx],  xmm4
+
+        movdqa      [rdi+rcx*2],xmm5
+        movdqu      xmm0,       [rsi]
+
+        lea         rdi,        [rdi+rcx*2]
+        movdqu      xmm1,       [rsi+rax]
+
+        add         rdi,        rcx
+        movdqu      xmm2,       [rsi+rax*2]
+
+        lea         rsi,        [rsi+rax*2]
+        movdqa      [rdi],      xmm0
+
+        movdqa      [rdi+rcx],  xmm1
+        movdqa      [rdi+rcx*2],xmm2
+
+        movdqu      xmm3,       [rsi+rax]
+        lea         rdi,        [rdi+rcx*2]
+
+        movdqa      [rdi+rcx],  xmm3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_intra_pred_uv_dc_mmx2(
+;    unsigned char *dst,
+;    int dst_stride,
+;    unsigned char *src,
+;    int src_stride
+;    )
+global sym(vp9_intra_pred_uv_dc_mmx2)
+sym(vp9_intra_pred_uv_dc_mmx2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ; from top
+    mov         rsi,        arg(2) ;src;
+    movsxd      rax,        dword ptr arg(3) ;src_stride;
+    sub         rsi,        rax
+    pxor        mm0,        mm0
+    movq        mm1,        [rsi]
+    psadbw      mm1,        mm0
+
+    ; from left
+    dec         rsi
+    lea         rdi,        [rax*3]
+    movzx       ecx,        byte [rsi+rax]
+    movzx       edx,        byte [rsi+rax*2]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rdi]
+    add         ecx,        edx
+    lea         rsi,        [rsi+rax*4]
+    movzx       edx,        byte [rsi]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax*2]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rdi]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax*4]
+    add         ecx,        edx
+
+    ; add up
+    pextrw      edx,        mm1, 0x0
+    lea         edx,        [edx+ecx+8]
+    sar         edx,        4
+    movd        mm1,        edx
+    pshufw      mm1,        mm1, 0x0
+    packuswb    mm1,        mm1
+
+    ; write out
+    mov         rdi,        arg(0) ;dst;
+    movsxd      rcx,        dword ptr arg(1) ;dst_stride
+    lea         rax,        [rcx*3]
+
+    movq [rdi      ],       mm1
+    movq [rdi+rcx  ],       mm1
+    movq [rdi+rcx*2],       mm1
+    movq [rdi+rax  ],       mm1
+    lea         rdi,        [rdi+rcx*4]
+    movq [rdi      ],       mm1
+    movq [rdi+rcx  ],       mm1
+    movq [rdi+rcx*2],       mm1
+    movq [rdi+rax  ],       mm1
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
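A hedged scalar sketch of the 8x8 chroma DC predictor above (illustrative names): sum the 8 pixels above and the 8 to the left, round, divide by 16, and fill the block with that value:

static void intra_pred_uv_dc_model(unsigned char *dst, int dst_stride,
                                   const unsigned char *src, int src_stride) {
  const unsigned char *above = src - src_stride;
  int i, r, sum = 0;
  unsigned char dc;

  for (i = 0; i < 8; i++)
    sum += above[i];                     /* 8 pixels from the row above */
  for (i = 0; i < 8; i++)
    sum += src[i * src_stride - 1];      /* 8 pixels from the left column */

  dc = (unsigned char)((sum + 8) >> 4);  /* round and divide by 16 */
  for (r = 0; r < 8; r++)
    for (i = 0; i < 8; i++)
      dst[r * dst_stride + i] = dc;
}

The dctop and dcleft variants below use only one of the two sums with (sum + 4) >> 3, and the dc128 variant skips the sums entirely and writes the constant 128.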
+;void vp9_intra_pred_uv_dctop_mmx2(
+;    unsigned char *dst,
+;    int dst_stride,
+;    unsigned char *src,
+;    int src_stride
+;    )
+global sym(vp9_intra_pred_uv_dctop_mmx2)
+sym(vp9_intra_pred_uv_dctop_mmx2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ; from top
+    mov         rsi,        arg(2) ;src;
+    movsxd      rax,        dword ptr arg(3) ;src_stride;
+    sub         rsi,        rax
+    pxor        mm0,        mm0
+    movq        mm1,        [rsi]
+    psadbw      mm1,        mm0
+
+    ; add up
+    paddw       mm1,        [GLOBAL(dc_4)]
+    psraw       mm1,        3
+    pshufw      mm1,        mm1, 0x0
+    packuswb    mm1,        mm1
+
+    ; write out
+    mov         rdi,        arg(0) ;dst;
+    movsxd      rcx,        dword ptr arg(1) ;dst_stride
+    lea         rax,        [rcx*3]
+
+    movq [rdi      ],       mm1
+    movq [rdi+rcx  ],       mm1
+    movq [rdi+rcx*2],       mm1
+    movq [rdi+rax  ],       mm1
+    lea         rdi,        [rdi+rcx*4]
+    movq [rdi      ],       mm1
+    movq [rdi+rcx  ],       mm1
+    movq [rdi+rcx*2],       mm1
+    movq [rdi+rax  ],       mm1
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_intra_pred_uv_dcleft_mmx2(
+;    unsigned char *dst,
+;    int dst_stride,
+;    unsigned char *src,
+;    int src_stride
+;    )
+global sym(vp9_intra_pred_uv_dcleft_mmx2)
+sym(vp9_intra_pred_uv_dcleft_mmx2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ; from left
+    mov         rsi,        arg(2) ;src;
+    movsxd      rax,        dword ptr arg(3) ;src_stride;
+    dec         rsi
+    lea         rdi,        [rax*3]
+    movzx       ecx,        byte [rsi]
+    movzx       edx,        byte [rsi+rax]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax*2]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rdi]
+    add         ecx,        edx
+    lea         rsi,        [rsi+rax*4]
+    movzx       edx,        byte [rsi]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax*2]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rdi]
+    lea         edx,        [ecx+edx+4]
+
+    ; add up
+    shr         edx,        3
+    movd        mm1,        edx
+    pshufw      mm1,        mm1, 0x0
+    packuswb    mm1,        mm1
+
+    ; write out
+    mov         rdi,        arg(0) ;dst;
+    movsxd      rcx,        dword ptr arg(1) ;dst_stride
+    lea         rax,        [rcx*3]
+
+    movq [rdi      ],       mm1
+    movq [rdi+rcx  ],       mm1
+    movq [rdi+rcx*2],       mm1
+    movq [rdi+rax  ],       mm1
+    lea         rdi,        [rdi+rcx*4]
+    movq [rdi      ],       mm1
+    movq [rdi+rcx  ],       mm1
+    movq [rdi+rcx*2],       mm1
+    movq [rdi+rax  ],       mm1
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_intra_pred_uv_dc128_mmx(
+;    unsigned char *dst,
+;    int dst_stride,
+;    unsigned char *src,
+;    int src_stride
+;    )
+global sym(vp9_intra_pred_uv_dc128_mmx)
+sym(vp9_intra_pred_uv_dc128_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    GET_GOT     rbx
+    ; end prolog
+
+    ; write out
+    movq        mm1,        [GLOBAL(dc_128)]
+    mov         rax,        arg(0) ;dst;
+    movsxd      rdx,        dword ptr arg(1) ;dst_stride
+    lea         rcx,        [rdx*3]
+
+    movq [rax      ],       mm1
+    movq [rax+rdx  ],       mm1
+    movq [rax+rdx*2],       mm1
+    movq [rax+rcx  ],       mm1
+    lea         rax,        [rax+rdx*4]
+    movq [rax      ],       mm1
+    movq [rax+rdx  ],       mm1
+    movq [rax+rdx*2],       mm1
+    movq [rax+rcx  ],       mm1
+
+    ; begin epilog
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_intra_pred_uv_tm_sse2(
+;    unsigned char *dst,
+;    int dst_stride,
+;    unsigned char *src,
+;    int src_stride
+;    )
+%macro vp9_intra_pred_uv_tm 1
+global sym(vp9_intra_pred_uv_tm_%1)
+sym(vp9_intra_pred_uv_tm_%1):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ; read top row
+    mov         edx,        4
+    mov         rsi,        arg(2) ;src;
+    movsxd      rax,        dword ptr arg(3) ;src_stride;
+    sub         rsi,        rax
+    pxor        xmm0,       xmm0
+%ifidn %1, ssse3
+    movdqa      xmm2,       [GLOBAL(dc_1024)]
+%endif
+    movq        xmm1,       [rsi]
+    punpcklbw   xmm1,       xmm0
+
+    ; set up left ptrs and subtract topleft
+    movd        xmm3,       [rsi-1]
+    lea         rsi,        [rsi+rax-1]
+%ifidn %1, sse2
+    punpcklbw   xmm3,       xmm0
+    pshuflw     xmm3,       xmm3, 0x0
+    punpcklqdq  xmm3,       xmm3
+%else
+    pshufb      xmm3,       xmm2
+%endif
+    psubw       xmm1,       xmm3
+
+    ; set up dest ptrs
+    mov         rdi,        arg(0) ;dst;
+    movsxd      rcx,        dword ptr arg(1) ;dst_stride
+
+.vp9_intra_pred_uv_tm_%1_loop:
+    movd        xmm3,       [rsi]
+    movd        xmm5,       [rsi+rax]
+%ifidn %1, sse2
+    punpcklbw   xmm3,       xmm0
+    punpcklbw   xmm5,       xmm0
+    pshuflw     xmm3,       xmm3, 0x0
+    pshuflw     xmm5,       xmm5, 0x0
+    punpcklqdq  xmm3,       xmm3
+    punpcklqdq  xmm5,       xmm5
+%else
+    pshufb      xmm3,       xmm2
+    pshufb      xmm5,       xmm2
+%endif
+    paddw       xmm3,       xmm1
+    paddw       xmm5,       xmm1
+    packuswb    xmm3,       xmm5
+    movq  [rdi    ],        xmm3
+    movhps [rdi+rcx],       xmm3
+    lea         rsi,        [rsi+rax*2]
+    lea         rdi,        [rdi+rcx*2]
+    dec         edx
+    jnz .vp9_intra_pred_uv_tm_%1_loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%endmacro
+
+vp9_intra_pred_uv_tm sse2
+vp9_intra_pred_uv_tm ssse3
+
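A hedged scalar sketch of the TrueMotion (TM) predictor implemented by the macro above: pred[r][c] = clamp(left[r] + top[c] - topleft), which is exactly what the subtract-topleft setup plus per-row add and packuswb saturation compute. Names are illustrative:

static unsigned char clamp_byte(int v) {
  return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void intra_pred_uv_tm_model(unsigned char *dst, int dst_stride,
                                   const unsigned char *src, int src_stride) {
  const unsigned char *above = src - src_stride;
  int topleft = above[-1];
  int r, c;
  for (r = 0; r < 8; r++) {
    int left = src[r * src_stride - 1];
    for (c = 0; c < 8; c++)
      dst[r * dst_stride + c] = clamp_byte(left + above[c] - topleft);
  }
}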
+;void vp9_intra_pred_uv_ve_mmx(
+;    unsigned char *dst,
+;    int dst_stride,
+;    unsigned char *src,
+;    int src_stride
+;    )
+global sym(vp9_intra_pred_uv_ve_mmx)
+sym(vp9_intra_pred_uv_ve_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    ; end prolog
+
+    ; read from top
+    mov         rax,        arg(2) ;src;
+    movsxd      rdx,        dword ptr arg(3) ;src_stride;
+    sub         rax,        rdx
+    movq        mm1,        [rax]
+
+    ; write out
+    mov         rax,        arg(0) ;dst;
+    movsxd      rdx,        dword ptr arg(1) ;dst_stride
+    lea         rcx,        [rdx*3]
+
+    movq [rax      ],       mm1
+    movq [rax+rdx  ],       mm1
+    movq [rax+rdx*2],       mm1
+    movq [rax+rcx  ],       mm1
+    lea         rax,        [rax+rdx*4]
+    movq [rax      ],       mm1
+    movq [rax+rdx  ],       mm1
+    movq [rax+rdx*2],       mm1
+    movq [rax+rcx  ],       mm1
+
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_intra_pred_uv_ho_mmx2(
+;    unsigned char *dst,
+;    int dst_stride,
+;    unsigned char *src,
+;    int src_stride
+;    )
+%macro vp9_intra_pred_uv_ho 1
+global sym(vp9_intra_pred_uv_ho_%1)
+sym(vp9_intra_pred_uv_ho_%1):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+%ifidn %1, ssse3
+%ifndef GET_GOT_SAVE_ARG
+    push        rbx
+%endif
+    GET_GOT     rbx
+%endif
+    ; end prolog
+
+    ; read from left and write out
+%ifidn %1, mmx2
+    mov         edx,        4
+%endif
+    mov         rsi,        arg(2) ;src;
+    movsxd      rax,        dword ptr arg(3) ;src_stride;
+    mov         rdi,        arg(0) ;dst;
+    movsxd      rcx,        dword ptr arg(1) ;dst_stride
+%ifidn %1, ssse3
+    lea         rdx,        [rcx*3]
+    movdqa      xmm2,       [GLOBAL(dc_00001111)]
+    lea         rbx,        [rax*3]
+%endif
+    dec         rsi
+%ifidn %1, mmx2
+.vp9_intra_pred_uv_ho_%1_loop:
+    movd        mm0,        [rsi]
+    movd        mm1,        [rsi+rax]
+    punpcklbw   mm0,        mm0
+    punpcklbw   mm1,        mm1
+    pshufw      mm0,        mm0, 0x0
+    pshufw      mm1,        mm1, 0x0
+    movq  [rdi    ],        mm0
+    movq  [rdi+rcx],        mm1
+    lea         rsi,        [rsi+rax*2]
+    lea         rdi,        [rdi+rcx*2]
+    dec         edx
+    jnz .vp9_intra_pred_uv_ho_%1_loop
+%else
+    movd        xmm0,       [rsi]
+    movd        xmm3,       [rsi+rax]
+    movd        xmm1,       [rsi+rax*2]
+    movd        xmm4,       [rsi+rbx]
+    punpcklbw   xmm0,       xmm3
+    punpcklbw   xmm1,       xmm4
+    pshufb      xmm0,       xmm2
+    pshufb      xmm1,       xmm2
+    movq   [rdi    ],       xmm0
+    movhps [rdi+rcx],       xmm0
+    movq [rdi+rcx*2],       xmm1
+    movhps [rdi+rdx],       xmm1
+    lea         rsi,        [rsi+rax*4]
+    lea         rdi,        [rdi+rcx*4]
+    movd        xmm0,       [rsi]
+    movd        xmm3,       [rsi+rax]
+    movd        xmm1,       [rsi+rax*2]
+    movd        xmm4,       [rsi+rbx]
+    punpcklbw   xmm0,       xmm3
+    punpcklbw   xmm1,       xmm4
+    pshufb      xmm0,       xmm2
+    pshufb      xmm1,       xmm2
+    movq   [rdi    ],       xmm0
+    movhps [rdi+rcx],       xmm0
+    movq [rdi+rcx*2],       xmm1
+    movhps [rdi+rdx],       xmm1
+%endif
+
+    ; begin epilog
+%ifidn %1, ssse3
+    RESTORE_GOT
+%ifndef GET_GOT_SAVE_ARG
+    pop         rbx
+%endif
+%endif
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%endmacro
+
+vp9_intra_pred_uv_ho mmx2
+vp9_intra_pred_uv_ho ssse3
+
+SECTION_RODATA
+dc_128:
+    times 8 db 128
+dc_4:
+    times 4 dw 4
+align 16
+dc_1024:
+    times 8 dw 0x400
+align 16
+dc_00001111:
+    times 8 db 0
+    times 8 db 1
--- /dev/null
+++ b/vp9/common/x86/recon_wrapper_sse2.c
@@ -1,0 +1,101 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/config.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp9/common/blockd.h"
+
+#define build_intra_predictors_mbuv_prototype(sym) \
+  void sym(unsigned char *dst, int dst_stride, \
+           const unsigned char *src, int src_stride)
+typedef build_intra_predictors_mbuv_prototype((*build_intra_pred_mbuv_fn_t));
+
+extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_dc_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_dctop_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_dcleft_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_dc128_mmx);
+extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_ho_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_ho_ssse3);
+extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_ve_mmx);
+extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_tm_sse2);
+extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_tm_ssse3);
+
+static void build_intra_predictors_mbuv_x86(MACROBLOCKD *xd,
+                                            unsigned char *dst_u,
+                                            unsigned char *dst_v,
+                                            int dst_stride,
+                                            build_intra_pred_mbuv_fn_t tm_fn,
+                                            build_intra_pred_mbuv_fn_t ho_fn) {
+  int mode = xd->mode_info_context->mbmi.uv_mode;
+  build_intra_pred_mbuv_fn_t fn;
+  int src_stride = xd->dst.uv_stride;
+
+  switch (mode) {
+    case  V_PRED:
+      fn = vp9_intra_pred_uv_ve_mmx;
+      break;
+    case  H_PRED:
+      fn = ho_fn;
+      break;
+    case TM_PRED:
+      fn = tm_fn;
+      break;
+    case DC_PRED:
+      if (xd->up_available) {
+        if (xd->left_available) {
+          fn = vp9_intra_pred_uv_dc_mmx2;
+          break;
+        } else {
+          fn = vp9_intra_pred_uv_dctop_mmx2;
+          break;
+        }
+      } else if (xd->left_available) {
+        fn = vp9_intra_pred_uv_dcleft_mmx2;
+        break;
+      } else {
+        fn = vp9_intra_pred_uv_dc128_mmx;
+        break;
+      }
+      break;
+    default:
+      return;
+  }
+
+  fn(dst_u, dst_stride, xd->dst.u_buffer, src_stride);
+  fn(dst_v, dst_stride, xd->dst.v_buffer, src_stride);
+}
+
+void vp9_build_intra_predictors_mbuv_sse2(MACROBLOCKD *xd) {
+  build_intra_predictors_mbuv_x86(xd, &xd->predictor[256],
+                                  &xd->predictor[320], 8,
+                                  vp9_intra_pred_uv_tm_sse2,
+                                  vp9_intra_pred_uv_ho_mmx2);
+}
+
+void vp9_build_intra_predictors_mbuv_ssse3(MACROBLOCKD *xd) {
+  build_intra_predictors_mbuv_x86(xd, &xd->predictor[256],
+                                  &xd->predictor[320], 8,
+                                  vp9_intra_pred_uv_tm_ssse3,
+                                  vp9_intra_pred_uv_ho_ssse3);
+}
+
+void vp9_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *xd) {
+  build_intra_predictors_mbuv_x86(xd, xd->dst.u_buffer,
+                                  xd->dst.v_buffer, xd->dst.uv_stride,
+                                  vp9_intra_pred_uv_tm_sse2,
+                                  vp9_intra_pred_uv_ho_mmx2);
+}
+
+void vp9_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *xd) {
+  build_intra_predictors_mbuv_x86(xd, xd->dst.u_buffer,
+                                  xd->dst.v_buffer, xd->dst.uv_stride,
+                                  vp9_intra_pred_uv_tm_ssse3,
+                                  vp9_intra_pred_uv_ho_ssse3);
+}
--- /dev/null
+++ b/vp9/common/x86/sadmxn_x86.c
@@ -1,0 +1,92 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>  // SSE2
+#include "./vpx_config.h"
+#include "./vpx_rtcd.h"
+
+
+#if CONFIG_NEWBESTREFMV
+
+
+#if HAVE_SSE2
+unsigned int vp9_sad16x3_sse2(
+  const unsigned char *src_ptr,
+  int  src_stride,
+  const unsigned char *ref_ptr,
+  int  ref_stride,
+  int max_sad) {
+  __m128i s0, s1, s2;
+  __m128i r0, r1, r2;
+  __m128i sad;
+
+  (void)max_sad;
+
+  s0 = _mm_loadu_si128((const __m128i *)(src_ptr + 0 * src_stride));
+  s1 = _mm_loadu_si128((const __m128i *)(src_ptr + 1 * src_stride));
+  s2 = _mm_loadu_si128((const __m128i *)(src_ptr + 2 * src_stride));
+
+  r0 = _mm_loadu_si128((const __m128i *)(ref_ptr + 0 * ref_stride));
+  r1 = _mm_loadu_si128((const __m128i *)(ref_ptr + 1 * ref_stride));
+  r2 = _mm_loadu_si128((const __m128i *)(ref_ptr + 2 * ref_stride));
+
+  sad = _mm_sad_epu8(s0, r0);
+  sad = _mm_add_epi16(sad,  _mm_sad_epu8(s1, r1));
+  sad = _mm_add_epi16(sad,  _mm_sad_epu8(s2, r2));
+  sad = _mm_add_epi16(sad,  _mm_srli_si128(sad, 8));
+
+  return _mm_cvtsi128_si32(sad);
+}
+
+unsigned int vp9_sad3x16_sse2(
+  const unsigned char *src_ptr,
+  int  src_stride,
+  const unsigned char *ref_ptr,
+  int  ref_stride,
+  int max_sad) {
+  int r;
+  __m128i s0, s1, s2, s3;
+  __m128i r0, r1, r2, r3;
+  __m128i sad = _mm_set1_epi16(0);
+  for (r = 0; r < 16; r += 4) {
+    s0 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 0 * src_stride));
+    s1 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 1 * src_stride));
+    s2 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 2 * src_stride));
+    s3 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 3 * src_stride));
+    r0 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 0 * ref_stride));
+    r1 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 1 * ref_stride));
+    r2 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 2 * ref_stride));
+    r3 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 3 * ref_stride));
+
+    s0 = _mm_unpacklo_epi8(s0, s1);
+    r0 = _mm_unpacklo_epi8(r0, r1);
+    s2 = _mm_unpacklo_epi8(s2, s3);
+    r2 = _mm_unpacklo_epi8(r2, r3);
+    s0 = _mm_unpacklo_epi64(s0, s2);
+    r0 = _mm_unpacklo_epi64(r0, r2);
+
+    // throw out byte 3
+    s0 = _mm_slli_epi64(s0, 16);
+    r0 = _mm_slli_epi64(r0, 16);
+
+    sad = _mm_add_epi16(sad, _mm_sad_epu8(s0, r0));
+
+    src_ptr += src_stride*4;
+    ref_ptr += ref_stride*4;
+  }
+
+  sad = _mm_add_epi16(sad,  _mm_srli_si128(sad, 8));
+  return _mm_cvtsi128_si32(sad);
+}
+
+#endif
+
+
+#endif  // CONFIG_NEWBESTREFMV
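Both intrinsics above compute a plain sum of absolute differences over an odd-shaped block (16 wide by 3 tall, and 3 wide by 16 tall). A hedged scalar reference with illustrative names, stepping src and ref by their own strides:

static unsigned int sad_mxn_model(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  int m, int n) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < n; r++) {
    for (c = 0; c < m; c++) {
      int d = src[c] - ref[c];
      sad += (unsigned int)(d < 0 ? -d : d);
    }
    src += src_stride;
    ref += ref_stride;
  }
  return sad;
}

/* e.g. sad_mxn_model(src, ss, ref, rs, 16, 3) models vp9_sad16x3_sse2 */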
--- /dev/null
+++ b/vp9/common/x86/subpixel_8t_ssse3.asm
@@ -1,0 +1,550 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;/************************************************************************************
+; Notes: the filter_block1d*_h8 routines apply an 8 tap filter horizontally to the
+; input pixels, and the *_v8 routines apply it vertically. The input pixel array has
+; output_height rows. Each routine handles 8 (or 16) pixels per row, calculating one
+; row per iteration to take advantage of the 128 bit operations.
+;
+; This is an implementation of some of the SSE optimizations first seen in ffvp8
+;
+;*************************************************************************************/
+
+;void vp9_filter_block1d8_v8_ssse3
+;(
+;    unsigned char *src_ptr,
+;    unsigned int   src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int   out_pitch,
+;    unsigned int   output_height,
+;    short *filter
+;)
+global sym(vp9_filter_block1d8_v8_ssse3)
+sym(vp9_filter_block1d8_v8_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16*5
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    %define krd [rsp + 16*4]
+
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x0400040
+
+    movdqa      xmm4, [rdx]                 ;load filters
+    movd        xmm5, rcx
+    packsswb    xmm4, xmm4
+    pshuflw     xmm0, xmm4, 0b              ;k0_k1
+    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
+    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
+    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
+
+    punpcklqdq  xmm0, xmm0
+    punpcklqdq  xmm1, xmm1
+    punpcklqdq  xmm2, xmm2
+    punpcklqdq  xmm3, xmm3
+
+    movdqa      k0k1, xmm0
+    movdqa      k2k3, xmm1
+    pshufd      xmm5, xmm5, 0
+    movdqa      k4k5, xmm2
+    movdqa      k6k7, xmm3
+    movdqa      krd, xmm5
+
+    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line
+
+%if ABI_IS_32BIT=0
+    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
+%endif
+    mov         rax, rsi
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+    add         rax, rdx
+
+    lea         rbx, [rdx + rdx*4]
+    add         rbx, rdx                    ;pitch * 6
+
+.vp9_filter_block1d8_v8_ssse3_loop:
+    movq        xmm0, [rsi]                 ;A
+    movq        xmm1, [rsi + rdx]           ;B
+    movq        xmm2, [rsi + rdx * 2]       ;C
+    movq        xmm3, [rax + rdx * 2]       ;D
+    movq        xmm4, [rsi + rdx * 4]       ;E
+    movq        xmm5, [rax + rdx * 4]       ;F
+
+    punpcklbw   xmm0, xmm1                  ;A B
+    punpcklbw   xmm2, xmm3                  ;C D
+    punpcklbw   xmm4, xmm5                  ;E F
+
+    movq        xmm6, [rsi + rbx]           ;G
+    movq        xmm7, [rax + rbx]           ;H
+
+    pmaddubsw   xmm0, k0k1
+    pmaddubsw   xmm2, k2k3
+    punpcklbw   xmm6, xmm7                  ;G H
+    pmaddubsw   xmm4, k4k5
+    pmaddubsw   xmm6, k6k7
+
+    paddsw      xmm0, xmm2
+    paddsw      xmm0, krd
+    paddsw      xmm4, xmm6
+    paddsw      xmm0, xmm4
+
+    psraw       xmm0, 7
+    packuswb    xmm0, xmm0
+
+    add         rsi,  rdx
+    add         rax,  rdx
+
+    movq        [rdi], xmm0
+
+%if ABI_IS_32BIT
+    add         rdi, DWORD PTR arg(3)       ;out_pitch
+%else
+    add         rdi, r8
+%endif
+    dec         rcx
+    jnz         .vp9_filter_block1d8_v8_ssse3_loop
+
+    add rsp, 16*5
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
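A hedged scalar model of the vertical 8-tap kernel above: an 8-tap convolution with round-to-nearest (add 64, as in krd, then shift right by 7) and byte clamping. Names are illustrative; src points at the first of the eight source rows, as the asm's src_ptr does, and the sketch ignores the asm's intermediate 16-bit saturation (paddsw), which can differ at extreme inputs:

static void filter_block1d8_v8_model(const unsigned char *src,
                                     unsigned int pitch, unsigned char *out,
                                     unsigned int out_pitch,
                                     unsigned int height,
                                     const short *filter) {
  unsigned int h;
  int i, k;
  for (h = 0; h < height; h++) {
    for (i = 0; i < 8; i++) {            /* 8 output pixels per row */
      int sum = 64;                      /* rounding constant (krd) */
      for (k = 0; k < 8; k++)
        sum += filter[k] * src[k * pitch + i];
      sum >>= 7;
      out[i] = (unsigned char)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
    }
    src += pitch;
    out += out_pitch;
  }
}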
+;void vp9_filter_block1d16_v8_ssse3
+;(
+;    unsigned char *src_ptr,
+;    unsigned int   src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int   out_pitch,
+;    unsigned int   output_height,
+;    short *filter
+;)
+global sym(vp9_filter_block1d16_v8_ssse3)
+sym(vp9_filter_block1d16_v8_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16*5
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    %define krd [rsp + 16*4]
+
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x0400040
+
+    movdqa      xmm4, [rdx]                 ;load filters
+    movd        xmm5, rcx
+    packsswb    xmm4, xmm4
+    pshuflw     xmm0, xmm4, 0b              ;k0_k1
+    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
+    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
+    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
+
+    punpcklqdq  xmm0, xmm0
+    punpcklqdq  xmm1, xmm1
+    punpcklqdq  xmm2, xmm2
+    punpcklqdq  xmm3, xmm3
+
+    movdqa      k0k1, xmm0
+    movdqa      k2k3, xmm1
+    pshufd      xmm5, xmm5, 0
+    movdqa      k4k5, xmm2
+    movdqa      k6k7, xmm3
+    movdqa      krd, xmm5
+
+    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line
+
+%if ABI_IS_32BIT=0
+    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
+%endif
+    mov         rax, rsi
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+    add         rax, rdx
+
+    lea         rbx, [rdx + rdx*4]
+    add         rbx, rdx                    ;pitch * 6
+
+.vp9_filter_block1d16_v8_ssse3_loop:
+    movq        xmm0, [rsi]                 ;A
+    movq        xmm1, [rsi + rdx]           ;B
+    movq        xmm2, [rsi + rdx * 2]       ;C
+    movq        xmm3, [rax + rdx * 2]       ;D
+    movq        xmm4, [rsi + rdx * 4]       ;E
+    movq        xmm5, [rax + rdx * 4]       ;F
+
+    punpcklbw   xmm0, xmm1                  ;A B
+    punpcklbw   xmm2, xmm3                  ;C D
+    punpcklbw   xmm4, xmm5                  ;E F
+
+    movq        xmm6, [rsi + rbx]           ;G
+    movq        xmm7, [rax + rbx]           ;H
+
+    pmaddubsw   xmm0, k0k1
+    pmaddubsw   xmm2, k2k3
+    punpcklbw   xmm6, xmm7                  ;G H
+    pmaddubsw   xmm4, k4k5
+    pmaddubsw   xmm6, k6k7
+
+    paddsw      xmm0, xmm2
+    paddsw      xmm0, krd
+    paddsw      xmm4, xmm6
+    paddsw      xmm0, xmm4
+
+    psraw       xmm0, 7
+    packuswb    xmm0, xmm0
+
+    movq        [rdi], xmm0
+
+    movq        xmm0, [rsi + 8]             ;A
+    movq        xmm1, [rsi + rdx + 8]       ;B
+    movq        xmm2, [rsi + rdx * 2 + 8]   ;C
+    movq        xmm3, [rax + rdx * 2 + 8]   ;D
+    movq        xmm4, [rsi + rdx * 4 + 8]   ;E
+    movq        xmm5, [rax + rdx * 4 + 8]   ;F
+
+    punpcklbw   xmm0, xmm1                  ;A B
+    punpcklbw   xmm2, xmm3                  ;C D
+    punpcklbw   xmm4, xmm5                  ;E F
+
+
+    movq        xmm6, [rsi + rbx + 8]       ;G
+    movq        xmm7, [rax + rbx + 8]       ;H
+    punpcklbw   xmm6, xmm7                  ;G H
+
+
+    pmaddubsw   xmm0, k0k1
+    pmaddubsw   xmm2, k2k3
+    pmaddubsw   xmm4, k4k5
+    pmaddubsw   xmm6, k6k7
+
+    paddsw      xmm0, xmm2
+    paddsw      xmm4, xmm6
+    paddsw      xmm0, krd
+    paddsw      xmm0, xmm4
+
+    psraw       xmm0, 7
+    packuswb    xmm0, xmm0
+
+    add         rsi,  rdx
+    add         rax,  rdx
+
+    movq        [rdi+8], xmm0
+
+%if ABI_IS_32BIT
+    add         rdi, DWORD PTR arg(3)       ;out_pitch
+%else
+    add         rdi, r8
+%endif
+    dec         rcx
+    jnz         .vp9_filter_block1d16_v8_ssse3_loop
+
+    add rsp, 16*5
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
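+; For reference, a scalar C sketch of what the vertical 8-tap loops above
+; compute per output byte (names here are illustrative, and the plain int
+; accumulator ignores the 16-bit saturation that paddsw applies):
+;    static unsigned char filter8_v(const unsigned char *src, int pitch,
+;                                   const signed char k[8]) {
+;        int i, sum = 64;                   /* the krd rounding constant */
+;        for (i = 0; i < 8; i++)
+;            sum += src[i * pitch] * k[i];  /* pmaddubsw pairs + paddsw  */
+;        sum >>= 7;                         /* psraw xmm0, 7             */
+;        return sum < 0 ? 0 : sum > 255 ? 255 : (unsigned char)sum; /* packuswb */
+;    }
+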
+;void vp9_filter_block1d8_h8_ssse3
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char  *output_ptr,
+;    unsigned int    output_pitch,
+;    unsigned int    output_height,
+;    short *filter
+;)
+global sym(vp9_filter_block1d8_h8_ssse3)
+sym(vp9_filter_block1d8_h8_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16*5
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    %define krd [rsp + 16*4]
+
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x00400040             ;rounding: two words of 0x0040 (64)
+
+    movdqa      xmm4, [rdx]                 ;load filters
+    movd        xmm5, rcx
+    packsswb    xmm4, xmm4
+    pshuflw     xmm0, xmm4, 0b              ;k0_k1
+    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
+    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
+    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
+
+    punpcklqdq  xmm0, xmm0
+    punpcklqdq  xmm1, xmm1
+    punpcklqdq  xmm2, xmm2
+    punpcklqdq  xmm3, xmm3
+
+    movdqa      k0k1, xmm0
+    movdqa      k2k3, xmm1
+    pshufd      xmm5, xmm5, 0
+    movdqa      k4k5, xmm2
+    movdqa      k6k7, xmm3
+;    movdqa      krd, xmm5                  ; not stored here: xmm5 stays live and is used directly as the round value below
+
+    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
+    movsxd      rdx, dword ptr arg(3)       ;output_pitch
+    movsxd      rcx, dword ptr arg(4)       ;output_height
+
+.filter_block1d8_h8_rowloop_ssse3:
+    movq        xmm0,   [rsi - 3]    ; -3 -2 -1  0  1  2  3  4
+
+;    movq        xmm3,   [rsi + 4]    ; 4  5  6  7  8  9 10 11
+    movq        xmm3,   [rsi + 5]    ; 5  6  7  8  9 10 11 12
+;note: if we create a k0_k7 filter, we can save a pshufb
+;    punpcklbw   xmm0,   xmm3         ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11
+    punpcklqdq  xmm0,   xmm3
+
+    movdqa      xmm1,   xmm0
+    pshufb      xmm0,   [GLOBAL(shuf_t0t1)]
+    pmaddubsw   xmm0,   k0k1
+
+    movdqa      xmm2,   xmm1
+    pshufb      xmm1,   [GLOBAL(shuf_t2t3)]
+    pmaddubsw   xmm1,   k2k3
+
+    movdqa      xmm4,   xmm2
+    pshufb      xmm2,   [GLOBAL(shuf_t4t5)]
+    pmaddubsw   xmm2,   k4k5
+
+    pshufb      xmm4,   [GLOBAL(shuf_t6t7)]
+    pmaddubsw   xmm4,   k6k7
+
+    paddsw      xmm0,   xmm1
+    paddsw      xmm0,   xmm2
+    paddsw      xmm0,   xmm5
+    paddsw      xmm0,   xmm4
+    psraw       xmm0,   7
+    packuswb    xmm0,   xmm0
+
+    lea         rsi,    [rsi + rax]
+    movq        [rdi],  xmm0
+
+    lea         rdi,    [rdi + rdx]
+    dec         rcx
+    jnz         .filter_block1d8_h8_rowloop_ssse3
+
+    add rsp, 16*5
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_filter_block1d16_h8_ssse3
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char  *output_ptr,
+;    unsigned int    output_pitch,
+;    unsigned int    output_height,
+;    short *filter
+;)
+global sym(vp9_filter_block1d16_h8_ssse3)
+sym(vp9_filter_block1d16_h8_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16*5
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    %define krd [rsp + 16*4]
+
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x00400040             ;rounding: two words of 0x0040 (64)
+
+    movdqa      xmm4, [rdx]                 ;load filters
+    movd        xmm5, rcx
+    packsswb    xmm4, xmm4
+    pshuflw     xmm0, xmm4, 0b              ;k0_k1
+    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
+    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
+    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
+
+    punpcklqdq  xmm0, xmm0
+    punpcklqdq  xmm1, xmm1
+    punpcklqdq  xmm2, xmm2
+    punpcklqdq  xmm3, xmm3
+
+    movdqa      k0k1, xmm0
+    movdqa      k2k3, xmm1
+    pshufd      xmm5, xmm5, 0
+    movdqa      k4k5, xmm2
+    movdqa      k6k7, xmm3
+    movdqa      krd, xmm5
+
+    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
+    movsxd      rdx, dword ptr arg(3)       ;output_pitch
+    movsxd      rcx, dword ptr arg(4)       ;output_height
+
+.filter_block1d16_h8_rowloop_ssse3:
+    movq        xmm0,   [rsi - 3]    ; -3 -2 -1  0  1  2  3  4
+
+;    movq        xmm3,   [rsi + 4]    ; 4  5  6  7  8  9 10 11
+    movq        xmm3,   [rsi + 5]    ; 5  6  7  8  9 10 11 12
+;note: if we create a k0_k7 filter, we can save a pshufb
+;    punpcklbw   xmm0,   xmm3         ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11
+    punpcklqdq  xmm0,   xmm3
+
+    movdqa      xmm1,   xmm0
+    pshufb      xmm0,   [GLOBAL(shuf_t0t1)]
+    pmaddubsw   xmm0,   k0k1
+
+    movdqa      xmm2,   xmm1
+    pshufb      xmm1,   [GLOBAL(shuf_t2t3)]
+    pmaddubsw   xmm1,   k2k3
+
+    movdqa      xmm4,   xmm2
+    pshufb      xmm2,   [GLOBAL(shuf_t4t5)]
+    pmaddubsw   xmm2,   k4k5
+
+    pshufb      xmm4,   [GLOBAL(shuf_t6t7)]
+    pmaddubsw   xmm4,   k6k7
+
+    paddsw      xmm0,   xmm1
+    paddsw      xmm0,   xmm4
+    paddsw      xmm0,   xmm2
+    paddsw      xmm0,   krd
+    psraw       xmm0,   7
+    packuswb    xmm0,   xmm0
+
+
+    movq        xmm3,   [rsi +  5]
+;    movq        xmm7,   [rsi + 12]
+    movq        xmm7,   [rsi + 13]
+;note: same as above
+;    punpcklbw   xmm3,   xmm7
+    punpcklqdq  xmm3,   xmm7
+
+    movdqa      xmm1,   xmm3
+    pshufb      xmm3,   [GLOBAL(shuf_t0t1)]
+    pmaddubsw   xmm3,   k0k1
+
+    movdqa      xmm2,   xmm1
+    pshufb      xmm1,   [GLOBAL(shuf_t2t3)]
+    pmaddubsw   xmm1,   k2k3
+
+    movdqa      xmm4,   xmm2
+    pshufb      xmm2,   [GLOBAL(shuf_t4t5)]
+    pmaddubsw   xmm2,   k4k5
+
+    pshufb      xmm4,   [GLOBAL(shuf_t6t7)]
+    pmaddubsw   xmm4,   k6k7
+
+    paddsw      xmm3,   xmm1
+    paddsw      xmm3,   xmm2
+    paddsw      xmm3,   krd
+    paddsw      xmm3,   xmm4
+    psraw       xmm3,   7
+    packuswb    xmm3,   xmm3
+    punpcklqdq  xmm0,   xmm3
+
+    lea         rsi,    [rsi + rax]
+    movdqa      [rdi],  xmm0
+
+    lea         rdi,    [rdi + rdx]
+    dec         rcx
+    jnz         .filter_block1d16_h8_rowloop_ssse3
+
+    add rsp, 16*5
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+SECTION_RODATA
+align 16
+shuf_t0t1:
+    db  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+align 16
+shuf_t2t3:
+    db  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
+align 16
+shuf_t4t5:
+    db  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
+align 16
+shuf_t6t7:
+    db  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
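+
+; The four shuffle tables above place each source byte next to its right
+; neighbour so that a single pshufb + pmaddubsw applies two adjacent taps at
+; once: shuf_t0t1 builds the (x-3, x-2) pairs for k0/k1, shuf_t2t3 the
+; (x-1, x) pairs for k2/k3, and so on.  A scalar sketch of the horizontal
+; filter this implements (illustrative names, ignoring 16-bit saturation):
+;    static unsigned char filter8_h(const unsigned char *s,
+;                                   const signed char k[8]) {
+;        int i, sum = 64;                   /* krd */
+;        for (i = 0; i < 8; i++)
+;            sum += s[i - 3] * k[i];
+;        sum >>= 7;
+;        return sum < 0 ? 0 : sum > 255 ? 255 : (unsigned char)sum;
+;    }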
--- /dev/null
+++ b/vp9/common/x86/subpixel_mmx.asm
@@ -1,0 +1,727 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+%define BLOCK_HEIGHT_WIDTH 4
+%define vp9_filter_weight 128
+%define VP9_FILTER_SHIFT  7
+
+
+;void vp9_filter_block1d_h6_mmx
+;(
+;    unsigned char   *src_ptr,
+;    unsigned short  *output_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned int    pixel_step,
+;    unsigned int    output_height,
+;    unsigned int    output_width,
+;    short           * vp9_filter
+;)
+global sym(vp9_filter_block1d_h6_mmx)
+sym(vp9_filter_block1d_h6_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rdx,    arg(6) ;vp9_filter
+
+        movq        mm1,    [rdx + 16]             ; do both the negative taps first!!!
+        movq        mm2,    [rdx + 32]         ;
+        movq        mm6,    [rdx + 48]        ;
+        movq        mm7,    [rdx + 64]        ;
+
+        mov         rdi,    arg(1) ;output_ptr
+        mov         rsi,    arg(0) ;src_ptr
+        movsxd      rcx,    dword ptr arg(4) ;output_height
+        movsxd      rax,    dword ptr arg(5) ;output_width      ; destination pitch?
+        pxor        mm0,    mm0              ; mm0 = 00000000
+
+.nextrow:
+        movq        mm3,    [rsi-2]          ; mm3 = p-2..p5
+        movq        mm4,    mm3              ; mm4 = p-2..p5
+        psrlq       mm3,    8                ; mm3 = p-1..p5
+        punpcklbw   mm3,    mm0              ; mm3 = p-1..p2
+        pmullw      mm3,    mm1              ; mm3 *= kernel 1 modifiers.
+
+        movq        mm5,    mm4              ; mm5 = p-2..p5
+        punpckhbw   mm4,    mm0              ; mm4 = p2..p5
+        pmullw      mm4,    mm7              ; mm4 *= kernel 4 modifiers
+        paddsw      mm3,    mm4              ; mm3 += mm4
+
+        movq        mm4,    mm5              ; mm4 = p-2..p5;
+        psrlq       mm5,    16               ; mm5 = p0..p5;
+        punpcklbw   mm5,    mm0              ; mm5 = p0..p3
+        pmullw      mm5,    mm2              ; mm5 *= kernel 2 modifiers
+        paddsw      mm3,    mm5              ; mm3 += mm5
+
+        movq        mm5,    mm4              ; mm5 = p-2..p5
+        psrlq       mm4,    24               ; mm4 = p1..p5
+        punpcklbw   mm4,    mm0              ; mm4 = p1..p4
+        pmullw      mm4,    mm6              ; mm4 *= kernel 3 modifiers
+        paddsw      mm3,    mm4              ; mm3 += mm4
+
+        ; do outer positive taps
+        movd        mm4,    [rsi+3]
+        punpcklbw   mm4,    mm0              ; mm4 = p3..p6
+        pmullw      mm4,    [rdx+80]         ; mm4 *= kernel 5 modifiers
+        paddsw      mm3,    mm4              ; mm3 += mm4
+
+        punpcklbw   mm5,    mm0              ; mm5 = p-2..p1
+        pmullw      mm5,    [rdx]            ; mm5 *= kernel 0 modifiers
+        paddsw      mm3,    mm5              ; mm3 += mm5
+
+        paddsw      mm3,    [GLOBAL(rd)]              ; mm3 += round value
+        psraw       mm3,    VP9_FILTER_SHIFT     ; mm3 /= 128
+        packuswb    mm3,    mm0              ; pack and unpack to saturate
+        punpcklbw   mm3,    mm0              ;
+
+        movq        [rdi],  mm3              ; store the results in the destination
+
+%if ABI_IS_32BIT
+        add         rsi,    dword ptr arg(2) ;src_pixels_per_line ; next line
+        add         rdi,    rax;
+%else
+        movsxd      r8,     dword ptr arg(2) ;src_pixels_per_line
+        add         rdi,    rax;
+
+        add         rsi,    r8               ; next line
+%endif
+
+        dec         rcx                      ; decrement count
+        jnz         .nextrow                 ; next row
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
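+; A first-pass reference for the routine above (a sketch with illustrative
+; names): the 6-tap kernel runs over src[-2..3], is rounded, clamped to
+; 0..255 by the packuswb/punpcklbw pair, and stored as 16 bits per pixel for
+; the second pass.  Each tap in vp9_six_tap_mmx is replicated 8 times, hence
+; the stride of 8 words between taps:
+;    static unsigned short filter6_h(const unsigned char *s, const short *k) {
+;        int i, sum = 64;                   /* GLOBAL(rd)       */
+;        for (i = 0; i < 6; i++)
+;            sum += s[i - 2] * k[i * 8];
+;        sum >>= 7;                         /* VP9_FILTER_SHIFT */
+;        return sum < 0 ? 0 : sum > 255 ? 255 : (unsigned short)sum;
+;    }
+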
+;void vp9_filter_block1dc_v6_mmx
+;(
+;   short *src_ptr,
+;   unsigned char *output_ptr,
+;    int output_pitch,
+;   unsigned int pixels_per_line,
+;   unsigned int pixel_step,
+;   unsigned int output_height,
+;   unsigned int output_width,
+;   short * vp9_filter
+;)
+global sym(vp9_filter_block1dc_v6_mmx)
+sym(vp9_filter_block1dc_v6_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        movq      mm5, [GLOBAL(rd)]
+        push        rbx
+        mov         rbx, arg(7) ;vp9_filter
+        movq      mm1, [rbx + 16]             ; do both the negative taps first!!!
+        movq      mm2, [rbx + 32]         ;
+        movq      mm6, [rbx + 48]        ;
+        movq      mm7, [rbx + 64]        ;
+
+        movsxd      rdx, dword ptr arg(3) ;pixels_per_line
+        mov         rdi, arg(1) ;output_ptr
+        mov         rsi, arg(0) ;src_ptr
+        sub         rsi, rdx
+        sub         rsi, rdx
+        movsxd      rcx, DWORD PTR arg(5) ;output_height
+        movsxd      rax, DWORD PTR arg(2) ;output_pitch      ; destination pitch?
+        pxor        mm0, mm0              ; mm0 = 00000000
+
+
+.nextrow_cv:
+        movq        mm3, [rsi+rdx]        ; mm3 = p0..p3  = row -1
+        pmullw      mm3, mm1              ; mm3 *= kernel 1 modifiers.
+
+
+        movq        mm4, [rsi + 4*rdx]      ; mm4 = p0..p3  = row 2
+        pmullw      mm4, mm7              ; mm4 *= kernel 4 modifiers.
+        paddsw      mm3, mm4              ; mm3 += mm4
+
+        movq        mm4, [rsi + 2*rdx]           ; mm4 = p0..p3  = row 0
+        pmullw      mm4, mm2              ; mm4 *= kernel 2 modifiers.
+        paddsw      mm3, mm4              ; mm3 += mm4
+
+        movq        mm4, [rsi]            ; mm4 = p0..p3  = row -2
+        pmullw      mm4, [rbx]            ; mm4 *= kernel 0 modifiers.
+        paddsw      mm3, mm4              ; mm3 += mm4
+
+
+        add         rsi, rdx              ; move source forward 1 line to avoid 3 * pitch
+        movq        mm4, [rsi + 2*rdx]     ; mm4 = p0..p3  = row 1
+        pmullw      mm4, mm6              ; mm4 *= kernel 3 modifiers.
+        paddsw      mm3, mm4              ; mm3 += mm4
+
+        movq        mm4, [rsi + 4*rdx]    ; mm4 = p0..p3  = row 3
+        pmullw      mm4, [rbx +80]        ; mm4 *= kernel 5 modifiers.
+        paddsw      mm3, mm4              ; mm3 += mm4
+
+
+        paddsw      mm3, mm5               ; mm3 += round value
+        psraw       mm3, VP9_FILTER_SHIFT     ; mm3 /= 128
+        packuswb    mm3, mm0              ; pack and saturate
+
+        movd        [rdi],mm3             ; store the results in the destination
+        ; the subsequent iterations repeat 3 out of 4 of these reads.  Since the
+        ; recon block should be in cache, this shouldn't cost much.  It's obviously
+        ; avoidable!!!
+        lea         rdi,  [rdi+rax] ;
+        dec         rcx                   ; decrement count
+        jnz         .nextrow_cv           ; next row
+
+        pop         rbx
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
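+; A second-pass reference for the routine above (again a sketch): six rows of
+; the 16-bit first-pass output are combined with the vertical kernel and
+; packed back down to bytes; pitch here is in 16-bit elements:
+;    static unsigned char filter6_v(const short *s, int pitch, const short *k) {
+;        int i, sum = 64;
+;        for (i = 0; i < 6; i++)
+;            sum += s[i * pitch] * k[i * 8];
+;        sum >>= 7;
+;        return sum < 0 ? 0 : sum > 255 ? 255 : (unsigned char)sum;
+;    }
+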
+;void vp9_bilinear_predict8x8_mmx
+;(
+;    unsigned char  *src_ptr,
+;    int   src_pixels_per_line,
+;    int  xoffset,
+;    int  yoffset,
+;    unsigned char *dst_ptr,
+;    int dst_pitch
+;)
+global sym(vp9_bilinear_predict8x8_mmx)
+sym(vp9_bilinear_predict8x8_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ;const short *HFilter = bilinear_filters_mmx[xoffset];
+    ;const short *VFilter = bilinear_filters_mmx[yoffset];
+
+        movsxd      rax,        dword ptr arg(2) ;xoffset
+        mov         rdi,        arg(4) ;dst_ptr           ;
+
+        shl         rax,        5 ; offset * 32
+        lea         rcx,        [GLOBAL(sym(vp9_bilinear_filters_8x_mmx))]
+
+        add         rax,        rcx ; HFilter
+        mov         rsi,        arg(0) ;src_ptr              ;
+
+        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
+        movq        mm1,        [rax]               ;
+
+        movq        mm2,        [rax+16]            ;
+        movsxd      rax,        dword ptr arg(3) ;yoffset
+
+        pxor        mm0,        mm0                 ;
+
+        shl         rax,        5 ; offset*32
+        add         rax,        rcx ; VFilter
+
+        lea         rcx,        [rdi+rdx*8]          ;
+        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line    ;
+
+
+
+        ; get the first horizontal line done       ;
+        movq        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+        movq        mm4,        mm3                 ; make a copy of current line
+
+        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
+        punpckhbw   mm4,        mm0                 ;
+
+        pmullw      mm3,        mm1                 ;
+        pmullw      mm4,        mm1                 ;
+
+        movq        mm5,        [rsi+1]             ;
+        movq        mm6,        mm5                 ;
+
+        punpcklbw   mm5,        mm0                 ;
+        punpckhbw   mm6,        mm0                 ;
+
+        pmullw      mm5,        mm2                 ;
+        pmullw      mm6,        mm2                 ;
+
+        paddw       mm3,        mm5                 ;
+        paddw       mm4,        mm6                 ;
+
+        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
+        psraw       mm3,        VP9_FILTER_SHIFT        ; mm3 /= 128
+
+        paddw       mm4,        [GLOBAL(rd)]                 ;
+        psraw       mm4,        VP9_FILTER_SHIFT        ;
+
+        movq        mm7,        mm3                 ;
+        packuswb    mm7,        mm4                 ;
+
+        add         rsi,        rdx                 ; next line
+.next_row_8x8:
+        movq        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+        movq        mm4,        mm3                 ; make a copy of current line
+
+        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
+        punpckhbw   mm4,        mm0                 ;
+
+        pmullw      mm3,        mm1                 ;
+        pmullw      mm4,        mm1                 ;
+
+        movq        mm5,        [rsi+1]             ;
+        movq        mm6,        mm5                 ;
+
+        punpcklbw   mm5,        mm0                 ;
+        punpckhbw   mm6,        mm0                 ;
+
+        pmullw      mm5,        mm2                 ;
+        pmullw      mm6,        mm2                 ;
+
+        paddw       mm3,        mm5                 ;
+        paddw       mm4,        mm6                 ;
+
+        movq        mm5,        mm7                 ;
+        movq        mm6,        mm7                 ;
+
+        punpcklbw   mm5,        mm0                 ;
+        punpckhbw   mm6,        mm0
+
+        pmullw      mm5,        [rax]               ;
+        pmullw      mm6,        [rax]               ;
+
+        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
+        psraw       mm3,        VP9_FILTER_SHIFT        ; mm3 /= 128
+
+        paddw       mm4,        [GLOBAL(rd)]                 ;
+        psraw       mm4,        VP9_FILTER_SHIFT        ;
+
+        movq        mm7,        mm3                 ;
+        packuswb    mm7,        mm4                 ;
+
+
+        pmullw      mm3,        [rax+16]            ;
+        pmullw      mm4,        [rax+16]            ;
+
+        paddw       mm3,        mm5                 ;
+        paddw       mm4,        mm6                 ;
+
+
+        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
+        psraw       mm3,        VP9_FILTER_SHIFT        ; mm3 /= 128
+
+        paddw       mm4,        [GLOBAL(rd)]                 ;
+        psraw       mm4,        VP9_FILTER_SHIFT        ;
+
+        packuswb    mm3,        mm4
+
+        movq        [rdi],      mm3                 ; store the results in the destination
+
+%if ABI_IS_32BIT
+        add         rsi,        rdx                 ; next line
+        add         rdi,        dword ptr arg(5) ;dst_pitch                   ;
+%else
+        movsxd      r8,         dword ptr arg(5) ;dst_pitch
+        add         rsi,        rdx                 ; next line
+        add         rdi,        r8                  ;dst_pitch
+%endif
+        cmp         rdi,        rcx                 ;
+        jne         .next_row_8x8
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
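+; A bilinear reference for the routine above (a sketch, illustrative names):
+; two 2-tap passes whose weight pairs each sum to 128.  The previous filtered
+; row is carried packed in mm7, so every source row is filtered horizontally
+; only once.  The second weight lives 16 bytes (8 words) after the first:
+;    static unsigned char bilin(unsigned char a, unsigned char b,
+;                               const short *f) {
+;        return (unsigned char)((a * f[0] + b * f[8] + 64) >> 7);
+;    }
+;    /* first pass:  h[x]   = bilin(src[x], src[x + 1], HFilter);
+;       second pass: out[x] = bilin(prev_h[x], h[x], VFilter);   */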
+
+;void vp9_bilinear_predict8x4_mmx
+;(
+;    unsigned char  *src_ptr,
+;    int   src_pixels_per_line,
+;    int  xoffset,
+;    int  yoffset,
+;    unsigned char *dst_ptr,
+;    int dst_pitch
+;)
+global sym(vp9_bilinear_predict8x4_mmx)
+sym(vp9_bilinear_predict8x4_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ;const short *HFilter = bilinear_filters_mmx[xoffset];
+    ;const short *VFilter = bilinear_filters_mmx[yoffset];
+
+        movsxd      rax,        dword ptr arg(2) ;xoffset
+        mov         rdi,        arg(4) ;dst_ptr           ;
+
+        lea         rcx,        [GLOBAL(sym(vp9_bilinear_filters_8x_mmx))]
+        shl         rax,        5
+
+        mov         rsi,        arg(0) ;src_ptr              ;
+        add         rax,        rcx
+
+        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
+        movq        mm1,        [rax]               ;
+
+        movq        mm2,        [rax+16]            ;
+        movsxd      rax,        dword ptr arg(3) ;yoffset
+
+        pxor        mm0,        mm0                 ;
+        shl         rax,        5
+
+        add         rax,        rcx
+        lea         rcx,        [rdi+rdx*4]          ;
+
+        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line    ;
+
+        ; get the first horizontal line done       ;
+        movq        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+        movq        mm4,        mm3                 ; make a copy of current line
+
+        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
+        punpckhbw   mm4,        mm0                 ;
+
+        pmullw      mm3,        mm1                 ;
+        pmullw      mm4,        mm1                 ;
+
+        movq        mm5,        [rsi+1]             ;
+        movq        mm6,        mm5                 ;
+
+        punpcklbw   mm5,        mm0                 ;
+        punpckhbw   mm6,        mm0                 ;
+
+        pmullw      mm5,        mm2                 ;
+        pmullw      mm6,        mm2                 ;
+
+        paddw       mm3,        mm5                 ;
+        paddw       mm4,        mm6                 ;
+
+        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
+        psraw       mm3,        VP9_FILTER_SHIFT        ; mm3 /= 128
+
+        paddw       mm4,        [GLOBAL(rd)]                 ;
+        psraw       mm4,        VP9_FILTER_SHIFT        ;
+
+        movq        mm7,        mm3                 ;
+        packuswb    mm7,        mm4                 ;
+
+        add         rsi,        rdx                 ; next line
+.next_row_8x4:
+        movq        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+        movq        mm4,        mm3                 ; make a copy of current line
+
+        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
+        punpckhbw   mm4,        mm0                 ;
+
+        pmullw      mm3,        mm1                 ;
+        pmullw      mm4,        mm1                 ;
+
+        movq        mm5,        [rsi+1]             ;
+        movq        mm6,        mm5                 ;
+
+        punpcklbw   mm5,        mm0                 ;
+        punpckhbw   mm6,        mm0                 ;
+
+        pmullw      mm5,        mm2                 ;
+        pmullw      mm6,        mm2                 ;
+
+        paddw       mm3,        mm5                 ;
+        paddw       mm4,        mm6                 ;
+
+        movq        mm5,        mm7                 ;
+        movq        mm6,        mm7                 ;
+
+        punpcklbw   mm5,        mm0                 ;
+        punpckhbw   mm6,        mm0
+
+        pmullw      mm5,        [rax]               ;
+        pmullw      mm6,        [rax]               ;
+
+        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
+        psraw       mm3,        VP9_FILTER_SHIFT        ; mm3 /= 128
+
+        paddw       mm4,        [GLOBAL(rd)]                 ;
+        psraw       mm4,        VP9_FILTER_SHIFT        ;
+
+        movq        mm7,        mm3                 ;
+        packuswb    mm7,        mm4                 ;
+
+
+        pmullw      mm3,        [rax+16]            ;
+        pmullw      mm4,        [rax+16]            ;
+
+        paddw       mm3,        mm5                 ;
+        paddw       mm4,        mm6                 ;
+
+
+        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
+        psraw       mm3,        VP9_FILTER_SHIFT        ; mm3 /= 128
+
+        paddw       mm4,        [GLOBAL(rd)]                 ;
+        psraw       mm4,        VP9_FILTER_SHIFT        ;
+
+        packuswb    mm3,        mm4
+
+        movq        [rdi],      mm3                 ; store the results in the destination
+
+%if ABI_IS_32BIT
+        add         rsi,        rdx                 ; next line
+        add         rdi,        dword ptr arg(5) ;dst_pitch                   ;
+%else
+        movsxd      r8,         dword ptr arg(5) ;dst_pitch
+        add         rsi,        rdx                 ; next line
+        add         rdi,        r8
+%endif
+        cmp         rdi,        rcx                 ;
+        jne         .next_row_8x4
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_bilinear_predict4x4_mmx
+;(
+;    unsigned char  *src_ptr,
+;    int   src_pixels_per_line,
+;    int  xoffset,
+;    int  yoffset,
+;    unsigned char *dst_ptr,
+;    int dst_pitch
+;)
+global sym(vp9_bilinear_predict4x4_mmx)
+sym(vp9_bilinear_predict4x4_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ;const short *HFilter = bilinear_filters_mmx[xoffset];
+    ;const short *VFilter = bilinear_filters_mmx[yoffset];
+
+        movsxd      rax,        dword ptr arg(2) ;xoffset
+        mov         rdi,        arg(4) ;dst_ptr           ;
+
+        lea         rcx,        [GLOBAL(sym(vp9_bilinear_filters_8x_mmx))]
+        shl         rax,        5
+
+        add         rax,        rcx ; HFilter
+        mov         rsi,        arg(0) ;src_ptr              ;
+
+        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
+        movq        mm1,        [rax]               ;
+
+        movq        mm2,        [rax+16]            ;
+        movsxd      rax,        dword ptr arg(3) ;yoffset
+
+        pxor        mm0,        mm0                 ;
+        shl         rax,        5
+
+        add         rax,        rcx
+        lea         rcx,        [rdi+rdx*4]          ;
+
+        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line    ;
+
+        ; get the first horizontal line done       ;
+        movd        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
+
+        pmullw      mm3,        mm1                 ;
+        movd        mm5,        [rsi+1]             ;
+
+        punpcklbw   mm5,        mm0                 ;
+        pmullw      mm5,        mm2                 ;
+
+        paddw       mm3,        mm5                 ;
+        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
+
+        psraw       mm3,        VP9_FILTER_SHIFT        ; mm3 /= 128
+
+        movq        mm7,        mm3                 ;
+        packuswb    mm7,        mm0                 ;
+
+        add         rsi,        rdx                 ; next line
+.next_row_4x4:
+        movd        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
+
+        pmullw      mm3,        mm1                 ;
+        movd        mm5,        [rsi+1]             ;
+
+        punpcklbw   mm5,        mm0                 ;
+        pmullw      mm5,        mm2                 ;
+
+        paddw       mm3,        mm5                 ;
+
+        movq        mm5,        mm7                 ;
+        punpcklbw   mm5,        mm0                 ;
+
+        pmullw      mm5,        [rax]               ;
+        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
+
+        psraw       mm3,        VP9_FILTER_SHIFT        ; mm3 /= 128
+        movq        mm7,        mm3                 ;
+
+        packuswb    mm7,        mm0                 ;
+
+        pmullw      mm3,        [rax+16]            ;
+        paddw       mm3,        mm5                 ;
+
+
+        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
+        psraw       mm3,        VP9_FILTER_SHIFT        ; mm3 /= 128
+
+        packuswb    mm3,        mm0
+        movd        [rdi],      mm3                 ; store the results in the destination
+
+%if ABI_IS_32BIT
+        add         rsi,        rdx                 ; next line
+        add         rdi,        dword ptr arg(5) ;dst_pitch                   ;
+%else
+        movsxd      r8,         dword ptr arg(5) ;dst_pitch                   ;
+        add         rsi,        rdx                 ; next line
+        add         rdi,        r8
+%endif
+
+        cmp         rdi,        rcx                 ;
+        jne         .next_row_4x4
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+
+SECTION_RODATA
+align 16
+rd:
+    times 4 dw 0x40
+
+align 16
+global HIDDEN_DATA(sym(vp9_six_tap_mmx))
+sym(vp9_six_tap_mmx):
+    times 8 dw 0
+    times 8 dw 0
+    times 8 dw 128
+    times 8 dw 0
+    times 8 dw 0
+    times 8 dw 0
+
+    times 8 dw 0
+    times 8 dw -6
+    times 8 dw 123
+    times 8 dw 12
+    times 8 dw -1
+    times 8 dw 0
+
+    times 8 dw 2
+    times 8 dw -11
+    times 8 dw 108
+    times 8 dw 36
+    times 8 dw -8
+    times 8 dw 1
+
+    times 8 dw 0
+    times 8 dw -9
+    times 8 dw 93
+    times 8 dw 50
+    times 8 dw -6
+    times 8 dw 0
+
+    times 8 dw 3
+    times 8 dw -16
+    times 8 dw 77
+    times 8 dw 77
+    times 8 dw -16
+    times 8 dw 3
+
+    times 8 dw 0
+    times 8 dw -6
+    times 8 dw 50
+    times 8 dw 93
+    times 8 dw -9
+    times 8 dw 0
+
+    times 8 dw 1
+    times 8 dw -8
+    times 8 dw 36
+    times 8 dw 108
+    times 8 dw -11
+    times 8 dw 2
+
+    times 8 dw 0
+    times 8 dw -1
+    times 8 dw 12
+    times 8 dw 123
+    times 8 dw -6
+    times 8 dw 0
+
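+; Each 6-tap row above sums to 128 (the filter weight), so the filters are
+; unity-gain; e.g. for the half-pel entry: 3 - 16 + 77 + 77 - 16 + 3 = 128.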
+
+align 16
+global HIDDEN_DATA(sym(vp9_bilinear_filters_8x_mmx))
+sym(vp9_bilinear_filters_8x_mmx):
+    times 8 dw 128
+    times 8 dw 0
+
+    times 8 dw 112
+    times 8 dw 16
+
+    times 8 dw 96
+    times 8 dw 32
+
+    times 8 dw 80
+    times 8 dw 48
+
+    times 8 dw 64
+    times 8 dw 64
+
+    times 8 dw 48
+    times 8 dw 80
+
+    times 8 dw 32
+    times 8 dw 96
+
+    times 8 dw 16
+    times 8 dw 112
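+
+; As with the six-tap set, each bilinear pair above sums to 128, so both
+; passes are unity-gain weighted averages; e.g. xoffset 2 selects the
+; (96, 32) pair, giving out = (96 * a + 32 * b + 64) >> 7.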
--- /dev/null
+++ b/vp9/common/x86/subpixel_sse2.asm
@@ -1,0 +1,1372 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define BLOCK_HEIGHT_WIDTH 4
+%define VP9_FILTER_WEIGHT 128
+%define VP9_FILTER_SHIFT  7
+
+
+;/************************************************************************************
+; Notes: filter_block1d_h6 applies a 6-tap filter horizontally to the input pixels. The
+; input pixel array has output_height rows. This routine assumes that output_height is an
+; even number. This function handles 8 pixels in the horizontal direction, calculating ONE
+; row per iteration to take advantage of the 128-bit operations.
+;*************************************************************************************/
+;void vp9_filter_block1d8_h6_sse2
+;(
+;    unsigned char  *src_ptr,
+;    unsigned short *output_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned int    pixel_step,
+;    unsigned int    output_height,
+;    unsigned int    output_width,
+;    short           *vp9_filter
+;)
+global sym(vp9_filter_block1d8_h6_sse2)
+sym(vp9_filter_block1d8_h6_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rdx,        arg(6) ;vp9_filter
+        mov         rsi,        arg(0) ;src_ptr
+
+        mov         rdi,        arg(1) ;output_ptr
+
+        movsxd      rcx,        dword ptr arg(4) ;output_height
+        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
+%if ABI_IS_32BIT=0
+        movsxd      r8,         dword ptr arg(5) ;output_width
+%endif
+        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
+
+.filter_block1d8_h6_rowloop:
+        movq        xmm3,       MMWORD PTR [rsi - 2]
+        movq        xmm1,       MMWORD PTR [rsi + 6]
+
+        prefetcht2  [rsi+rax-2]
+
+        pslldq      xmm1,       8
+        por         xmm1,       xmm3
+
+        movdqa      xmm4,       xmm1
+        movdqa      xmm5,       xmm1
+
+        movdqa      xmm6,       xmm1
+        movdqa      xmm7,       xmm1
+
+        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
+        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
+
+
+        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
+
+        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
+
+        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+
+        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
+
+        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
+
+
+        paddsw      xmm4,       xmm7
+        paddsw      xmm4,       xmm5
+
+        paddsw      xmm4,       xmm3
+        paddsw      xmm4,       xmm6
+
+        paddsw      xmm4,       xmm1
+        paddsw      xmm4,       [GLOBAL(rd)]
+
+        psraw       xmm4,       7
+
+        packuswb    xmm4,       xmm0
+        punpcklbw   xmm4,       xmm0
+
+        movdqa      XMMWORD Ptr [rdi],         xmm4
+        lea         rsi,        [rsi + rax]
+
+%if ABI_IS_32BIT
+        add         rdi,        DWORD Ptr arg(5) ;[output_width]
+%else
+        add         rdi,        r8
+%endif
+        dec         rcx
+
+        jnz         .filter_block1d8_h6_rowloop                ; next row
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
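+; Unlike the SSSE3 path, which gathers tap phases with pshufb, this
+; pre-SSSE3 routine derives the six phases by byte-shifting one wide row.
+; A sketch of the idea with SSE2 intrinsics (illustrative only; src is an
+; assumed unsigned char pointer):
+;    #include <emmintrin.h>
+;    __m128i row  = _mm_loadu_si128((const __m128i *)(src - 2));
+;    __m128i zero = _mm_setzero_si128();
+;    __m128i p0   = _mm_unpacklo_epi8(row, zero);                     /* x-2 */
+;    __m128i p1   = _mm_unpacklo_epi8(_mm_srli_si128(row, 1), zero);  /* x-1 */
+;    /* shifts 2..5 produce the phases for taps x, x+1, x+2 and x+3 */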
+
+;void vp9_filter_block1d16_h6_sse2
+;(
+;    unsigned char  *src_ptr,
+;    unsigned short *output_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned int    pixel_step,
+;    unsigned int    output_height,
+;    unsigned int    output_width,
+;    short           *vp9_filter
+;)
+;/************************************************************************************
+; Notes: filter_block1d_h6 applies a 6-tap filter horizontally to the input pixels. The
+; input pixel array has output_height rows. This routine assumes that output_height is an
+; even number. This function handles 16 pixels in the horizontal direction, calculating ONE
+; row per iteration to take advantage of the 128-bit operations.
+;*************************************************************************************/
+global sym(vp9_filter_block1d16_h6_sse2)
+sym(vp9_filter_block1d16_h6_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rdx,        arg(6) ;vp9_filter
+        mov         rsi,        arg(0) ;src_ptr
+
+        mov         rdi,        arg(1) ;output_ptr
+
+        movsxd      rcx,        dword ptr arg(4) ;output_height
+        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
+%if ABI_IS_32BIT=0
+        movsxd      r8,         dword ptr arg(5) ;output_width
+%endif
+
+        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
+
+.filter_block1d16_h6_sse2_rowloop:
+        movq        xmm3,       MMWORD PTR [rsi - 2]
+        movq        xmm1,       MMWORD PTR [rsi + 6]
+
+        movq        xmm2,       MMWORD PTR [rsi +14]
+        pslldq      xmm2,       8
+
+        por         xmm2,       xmm1
+        prefetcht2  [rsi+rax-2]
+
+        pslldq      xmm1,       8
+        por         xmm1,       xmm3
+
+        movdqa      xmm4,       xmm1
+        movdqa      xmm5,       xmm1
+
+        movdqa      xmm6,       xmm1
+        movdqa      xmm7,       xmm1
+
+        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
+        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
+
+
+        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
+
+        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
+
+        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+
+        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
+
+        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
+
+        paddsw      xmm4,       xmm7
+        paddsw      xmm4,       xmm5
+
+        paddsw      xmm4,       xmm3
+        paddsw      xmm4,       xmm6
+
+        paddsw      xmm4,       xmm1
+        paddsw      xmm4,       [GLOBAL(rd)]
+
+        psraw       xmm4,       7
+
+        packuswb    xmm4,       xmm0
+        punpcklbw   xmm4,       xmm0
+
+        movdqa      XMMWORD Ptr [rdi],         xmm4
+
+        movdqa      xmm3,       xmm2
+        movdqa      xmm4,       xmm2
+
+        movdqa      xmm5,       xmm2
+        movdqa      xmm6,       xmm2
+
+        movdqa      xmm7,       xmm2
+
+        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
+        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
+
+
+        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
+
+        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
+
+        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+        psrldq      xmm2,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
+
+        punpcklbw   xmm2,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+        pmullw      xmm2,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
+
+
+        paddsw      xmm4,       xmm7
+        paddsw      xmm4,       xmm5
+
+        paddsw      xmm4,       xmm3
+        paddsw      xmm4,       xmm6
+
+        paddsw      xmm4,       xmm2
+        paddsw      xmm4,       [GLOBAL(rd)]
+
+        psraw       xmm4,       7
+
+        packuswb    xmm4,       xmm0
+        punpcklbw   xmm4,       xmm0
+
+        movdqa      XMMWORD Ptr [rdi+16],      xmm4
+
+        lea         rsi,        [rsi + rax]
+%if ABI_IS_32BIT
+        add         rdi,        DWORD Ptr arg(5) ;[output_width]
+%else
+        add         rdi,        r8
+%endif
+
+        dec         rcx
+        jnz         .filter_block1d16_h6_sse2_rowloop                ; next row
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_filter_block1d8_v6_sse2
+;(
+;    short *src_ptr,
+;    unsigned char *output_ptr,
+;    int dst_pitch,
+;    unsigned int pixels_per_line,
+;    unsigned int pixel_step,
+;    unsigned int output_height,
+;    unsigned int output_width,
+;    short * vp9_filter
+;)
+;/************************************************************************************
+; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The
+; input pixel array has output_height rows.
+;*************************************************************************************/
+global sym(vp9_filter_block1d8_v6_sse2)
+sym(vp9_filter_block1d8_v6_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rax,        arg(7) ;vp9_filter
+        movsxd      rdx,        dword ptr arg(3) ;pixels_per_line
+
+        mov         rdi,        arg(1) ;output_ptr
+        mov         rsi,        arg(0) ;src_ptr
+
+        sub         rsi,        rdx
+        sub         rsi,        rdx
+
+        movsxd      rcx,        DWORD PTR arg(5) ;[output_height]
+        pxor        xmm0,       xmm0                        ; clear xmm0
+
+        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]
+%if ABI_IS_32BIT=0
+        movsxd      r8,         dword ptr arg(2) ; dst_pitch
+%endif
+
+.vp9_filter_block1d8_v6_sse2_loop:
+        movdqa      xmm1,       XMMWORD PTR [rsi]
+        pmullw      xmm1,       [rax]
+
+        movdqa      xmm2,       XMMWORD PTR [rsi + rdx]
+        pmullw      xmm2,       [rax + 16]
+
+        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2]
+        pmullw      xmm3,       [rax + 32]
+
+        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4]
+        pmullw      xmm5,       [rax + 64]
+
+        add         rsi,        rdx
+        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2]
+
+        pmullw      xmm4,       [rax + 48]
+        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4]
+
+        pmullw      xmm6,       [rax + 80]
+
+        paddsw      xmm2,       xmm5
+        paddsw      xmm2,       xmm3
+
+        paddsw      xmm2,       xmm1
+        paddsw      xmm2,       xmm4
+
+        paddsw      xmm2,       xmm6
+        paddsw      xmm2,       xmm7
+
+        psraw       xmm2,       7
+        packuswb    xmm2,       xmm0              ; pack and saturate
+
+        movq        QWORD PTR [rdi], xmm2         ; store the results in the destination
+%if ABI_IS_32BIT
+        add         rdi,        DWORD PTR arg(2) ;[dst_pitch]
+%else
+        add         rdi,        r8
+%endif
+        dec         rcx         ; decrement count
+        jnz         .vp9_filter_block1d8_v6_sse2_loop               ; next row
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_filter_block1d16_v6_sse2
+;(
+;    unsigned short *src_ptr,
+;    unsigned char *output_ptr,
+;    int dst_pitch,
+;    unsigned int pixels_per_line,
+;    unsigned int pixel_step,
+;    unsigned int output_height,
+;    unsigned int output_width,
+;    const short    *vp9_filter
+;)
+;/************************************************************************************
+; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The
+; input pixel array has output_height rows.
+;*************************************************************************************/
+global sym(vp9_filter_block1d16_v6_sse2)
+sym(vp9_filter_block1d16_v6_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rax,        arg(7) ;vp9_filter
+        movsxd      rdx,        dword ptr arg(3) ;pixels_per_line
+
+        mov         rdi,        arg(1) ;output_ptr
+        mov         rsi,        arg(0) ;src_ptr
+
+        sub         rsi,        rdx
+        sub         rsi,        rdx
+
+        movsxd      rcx,        DWORD PTR arg(5) ;[output_height]
+%if ABI_IS_32BIT=0
+        movsxd      r8,         dword ptr arg(2) ; dst_pitch
+%endif
+
+.vp9_filter_block1d16_v6_sse2_loop:
+; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.
+        movdqa      xmm1,       XMMWORD PTR [rsi + rdx]       ; line 2
+        movdqa      xmm2,       XMMWORD PTR [rsi + rdx + 16]
+        pmullw      xmm1,       [rax + 16]
+        pmullw      xmm2,       [rax + 16]
+
+        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 4]       ; line 5
+        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 4 + 16]
+        pmullw      xmm3,       [rax + 64]
+        pmullw      xmm4,       [rax + 64]
+
+        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 2]       ; line 3
+        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 2 + 16]
+        pmullw      xmm5,       [rax + 32]
+        pmullw      xmm6,       [rax + 32]
+
+        movdqa      xmm7,       XMMWORD PTR [rsi]       ; line 1
+        movdqa      xmm0,       XMMWORD PTR [rsi + 16]
+        pmullw      xmm7,       [rax]
+        pmullw      xmm0,       [rax]
+
+        paddsw      xmm1,       xmm3
+        paddsw      xmm2,       xmm4
+        paddsw      xmm1,       xmm5
+        paddsw      xmm2,       xmm6
+        paddsw      xmm1,       xmm7
+        paddsw      xmm2,       xmm0
+
+        add         rsi,        rdx
+
+        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2]       ; line 4
+        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2 + 16]
+        pmullw      xmm3,       [rax + 48]
+        pmullw      xmm4,       [rax + 48]
+
+        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4]       ; line 6
+        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4 + 16]
+        pmullw      xmm5,       [rax + 80]
+        pmullw      xmm6,       [rax + 80]
+
+        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]
+        pxor        xmm0,       xmm0                        ; clear xmm0
+
+        paddsw      xmm1,       xmm3
+        paddsw      xmm2,       xmm4
+        paddsw      xmm1,       xmm5
+        paddsw      xmm2,       xmm6
+
+        paddsw      xmm1,       xmm7
+        paddsw      xmm2,       xmm7
+
+        psraw       xmm1,       7
+        psraw       xmm2,       7
+
+        packuswb    xmm1,       xmm2              ; pack and saturate
+        movdqa      XMMWORD PTR [rdi], xmm1       ; store the results in the destination
+%if ABI_IS_32BIT
+        add         rdi,        DWORD PTR arg(2) ;[dst_pitch]
+%else
+        add         rdi,        r8
+%endif
+        dec         rcx         ; decrement count
+        jnz         .vp9_filter_block1d16_v6_sse2_loop              ; next row
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_filter_block1d8_h6_only_sse2
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char  *output_ptr,
+;    int dst_pitch,
+;    unsigned int    output_height,
+;    const short    *vp9_filter
+;)
+; First-pass filter only when yoffset==0
+global sym(vp9_filter_block1d8_h6_only_sse2)
+sym(vp9_filter_block1d8_h6_only_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rdx,        arg(5) ;vp9_filter
+        mov         rsi,        arg(0) ;src_ptr
+
+        mov         rdi,        arg(2) ;output_ptr
+
+        movsxd      rcx,        dword ptr arg(4) ;output_height
+        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line            ; Pitch for Source
+%if ABI_IS_32BIT=0
+        movsxd      r8,         dword ptr arg(3) ;dst_pitch
+%endif
+        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
+
+.filter_block1d8_h6_only_rowloop:
+        movq        xmm3,       MMWORD PTR [rsi - 2]
+        movq        xmm1,       MMWORD PTR [rsi + 6]
+
+        prefetcht2  [rsi+rax-2]
+
+        pslldq      xmm1,       8
+        por         xmm1,       xmm3
+
+        movdqa      xmm4,       xmm1
+        movdqa      xmm5,       xmm1
+
+        movdqa      xmm6,       xmm1
+        movdqa      xmm7,       xmm1
+
+        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
+        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
+
+
+        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
+
+        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
+
+        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+
+        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
+
+        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
+
+
+        paddsw      xmm4,       xmm7
+        paddsw      xmm4,       xmm5
+
+        paddsw      xmm4,       xmm3
+        paddsw      xmm4,       xmm6
+
+        paddsw      xmm4,       xmm1
+        paddsw      xmm4,       [GLOBAL(rd)]
+
+        psraw       xmm4,       7
+
+        packuswb    xmm4,       xmm0
+
+        movq        QWORD PTR [rdi],   xmm4       ; store the results in the destination
+        lea         rsi,        [rsi + rax]
+
+%if ABI_IS_32BIT
+        add         rdi,        DWORD Ptr arg(3) ;dst_pitch
+%else
+        add         rdi,        r8
+%endif
+        dec         rcx
+
+        jnz         .filter_block1d8_h6_only_rowloop               ; next row
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_filter_block1d16_h6_only_sse2
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char  *output_ptr,
+;    int dst_pitch,
+;    unsigned int    output_height,
+;    const short    *vp9_filter
+;)
+; First-pass filter only when yoffset==0
+global sym(vp9_filter_block1d16_h6_only_sse2)
+sym(vp9_filter_block1d16_h6_only_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rdx,        arg(5) ;vp9_filter
+        mov         rsi,        arg(0) ;src_ptr
+
+        mov         rdi,        arg(2) ;output_ptr
+
+        movsxd      rcx,        dword ptr arg(4) ;output_height
+        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line            ; Pitch for Source
+%if ABI_IS_32BIT=0
+        movsxd      r8,         dword ptr arg(3) ;dst_pitch
+%endif
+
+        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
+
+.filter_block1d16_h6_only_sse2_rowloop:
+        movq        xmm3,       MMWORD PTR [rsi - 2]
+        movq        xmm1,       MMWORD PTR [rsi + 6]
+
+        movq        xmm2,       MMWORD PTR [rsi +14]
+        pslldq      xmm2,       8
+
+        por         xmm2,       xmm1
+        prefetcht2  [rsi+rax-2]
+
+        pslldq      xmm1,       8
+        por         xmm1,       xmm3
+
+        movdqa      xmm4,       xmm1
+        movdqa      xmm5,       xmm1
+
+        movdqa      xmm6,       xmm1
+        movdqa      xmm7,       xmm1
+
+        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
+        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
+
+        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
+
+        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
+
+        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
+
+        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
+
+        paddsw      xmm4,       xmm7
+        paddsw      xmm4,       xmm5
+
+        paddsw      xmm4,       xmm3
+        paddsw      xmm4,       xmm6
+
+        paddsw      xmm4,       xmm1
+        paddsw      xmm4,       [GLOBAL(rd)]
+
+        psraw       xmm4,       7
+
+        packuswb    xmm4,       xmm0                        ; lower 8 bytes
+
+        movq        QWORD PTR [rdi],         xmm4           ; store the results in the destination
+
+        movdqa      xmm3,       xmm2
+        movdqa      xmm4,       xmm2
+
+        movdqa      xmm5,       xmm2
+        movdqa      xmm6,       xmm2
+
+        movdqa      xmm7,       xmm2
+
+        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
+        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
+
+        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
+
+        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
+
+        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+        psrldq      xmm2,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
+
+        punpcklbw   xmm2,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+        pmullw      xmm2,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
+
+        paddsw      xmm4,       xmm7
+        paddsw      xmm4,       xmm5
+
+        paddsw      xmm4,       xmm3
+        paddsw      xmm4,       xmm6
+
+        paddsw      xmm4,       xmm2
+        paddsw      xmm4,       [GLOBAL(rd)]
+
+        psraw       xmm4,       7
+
+        packuswb    xmm4,       xmm0                        ; higher 8 bytes
+
+        movq        QWORD PTR [rdi+8],      xmm4            ; store the results in the destination
+
+        lea         rsi,        [rsi + rax]
+%if ABI_IS_32BIT
+        add         rdi,        DWORD PTR arg(3) ;dst_pitch
+%else
+        add         rdi,        r8
+%endif
+
+        dec         rcx
+        jnz         .filter_block1d16_h6_only_sse2_rowloop               ; next row
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_filter_block1d8_v6_only_sse2
+;(
+;    unsigned char *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char *output_ptr,
+;    int dst_pitch,
+;    unsigned int output_height,
+;    const short    *vp9_filter
+;)
+; Second-pass filter only when xoffset==0
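+;
+; A scalar sketch of this pass (p = src_pixels_per_line; tap k's 8-word row
+; sits at [rax + k*16]):
+;     out[i] = clip8((sum of src[i + k*p] * tap[k] for k = 0..5, + 64) >> 7)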
+global sym(vp9_filter_block1d8_v6_only_sse2)
+sym(vp9_filter_block1d8_v6_only_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rsi,        arg(0) ;src_ptr
+        mov         rdi,        arg(2) ;output_ptr
+
+        movsxd      rcx,        dword ptr arg(4) ;output_height
+        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
+
+        mov         rax,        arg(5) ;vp9_filter
+
+        pxor        xmm0,       xmm0                        ; clear xmm0
+
+        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]
+%if ABI_IS_32BIT=0
+        movsxd      r8,         dword ptr arg(3) ; dst_pitch
+%endif
+
+.vp9_filter_block1d8_v6_only_sse2_loop:
+        movq        xmm1,       MMWORD PTR [rsi]
+        movq        xmm2,       MMWORD PTR [rsi + rdx]
+        movq        xmm3,       MMWORD PTR [rsi + rdx * 2]
+        movq        xmm5,       MMWORD PTR [rsi + rdx * 4]
+        add         rsi,        rdx
+        movq        xmm4,       MMWORD PTR [rsi + rdx * 2]
+        movq        xmm6,       MMWORD PTR [rsi + rdx * 4]
+
+        punpcklbw   xmm1,       xmm0
+        pmullw      xmm1,       [rax]
+
+        punpcklbw   xmm2,       xmm0
+        pmullw      xmm2,       [rax + 16]
+
+        punpcklbw   xmm3,       xmm0
+        pmullw      xmm3,       [rax + 32]
+
+        punpcklbw   xmm5,       xmm0
+        pmullw      xmm5,       [rax + 64]
+
+        punpcklbw   xmm4,       xmm0
+        pmullw      xmm4,       [rax + 48]
+
+        punpcklbw   xmm6,       xmm0
+        pmullw      xmm6,       [rax + 80]
+
+        paddsw      xmm2,       xmm5
+        paddsw      xmm2,       xmm3
+
+        paddsw      xmm2,       xmm1
+        paddsw      xmm2,       xmm4
+
+        paddsw      xmm2,       xmm6
+        paddsw      xmm2,       xmm7
+
+        psraw       xmm2,       7
+        packuswb    xmm2,       xmm0              ; pack and saturate
+
+        movq        QWORD PTR [rdi], xmm2         ; store the results in the destination
+%if ABI_IS_32BIT
+        add         rdi,        DWORD PTR arg(3) ;[dst_pitch]
+%else
+        add         rdi,        r8
+%endif
+        dec         rcx         ; decrement count
+        jnz         .vp9_filter_block1d8_v6_only_sse2_loop              ; next row
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_unpack_block1d16_h6_sse2
+;(
+;    unsigned char  *src_ptr,
+;    unsigned short *output_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned int    output_height,
+;    unsigned int    output_width
+;)
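+; No filtering happens here: each row of 16 source bytes is widened to 16
+; unsigned 16-bit words (punpcklbw against zero), roughly
+;     out[i] = (unsigned short)src[i]   for i = 0..15 per row,
+; so a later pass can work on words directly.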
+global sym(vp9_unpack_block1d16_h6_sse2)
+sym(vp9_unpack_block1d16_h6_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rsi,        arg(0) ;src_ptr
+        mov         rdi,        arg(1) ;output_ptr
+
+        movsxd      rcx,        dword ptr arg(3) ;output_height
+        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
+
+        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
+%if ABI_IS_32BIT=0
+        movsxd      r8,         dword ptr arg(4) ;output_width             ; pitch for destination
+%endif
+
+.unpack_block1d16_h6_sse2_rowloop:
+        movq        xmm1,       MMWORD PTR [rsi]            ; 07 06 05 04 03 02 01 00
+        movq        xmm3,       MMWORD PTR [rsi+8]          ; 0f 0e 0d 0c 0b 0a 09 08
+
+        punpcklbw   xmm3,       xmm0                        ; xx0f xx0e xx0d xx0c xx0b xx0a xx09 xx08
+        punpcklbw   xmm1,       xmm0
+
+        movdqa      XMMWORD PTR [rdi],         xmm1
+        movdqa      XMMWORD PTR [rdi + 16],    xmm3
+
+        lea         rsi,        [rsi + rax]
+%if ABI_IS_32BIT
+        add         rdi,        DWORD PTR arg(4) ;[output_width]
+%else
+        add         rdi,        r8
+%endif
+        dec         rcx
+        jnz         .unpack_block1d16_h6_sse2_rowloop               ; next row
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_bilinear_predict16x16_sse2
+;(
+;    unsigned char  *src_ptr,
+;    int   src_pixels_per_line,
+;    int  xoffset,
+;    int  yoffset,
+;    unsigned char *dst_ptr,
+;    int dst_pitch
+;)
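+; Bilinear prediction is a separable 2-tap filter. A scalar sketch, using the
+; same rounding as the code below (rd == 64, VP9_FILTER_SHIFT == 7):
+;     first pass:  t[r][c]   = (src[r][c]*HF[0] + src[r][c+1]*HF[1] + 64) >> 7
+;     second pass: out[r][c] = (t[r][c]*VF[0]   + t[r+1][c]*VF[1]   + 64) >> 7
+; with HFilter/VFilter selected from vp9_bilinear_filters_mmx by xoffset and
+; yoffset; either pass is skipped when its offset is zero.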
+extern sym(vp9_bilinear_filters_mmx)
+global sym(vp9_bilinear_predict16x16_sse2)
+sym(vp9_bilinear_predict16x16_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ;const short *HFilter = bilinear_filters_mmx[xoffset]
+    ;const short *VFilter = bilinear_filters_mmx[yoffset]
+
+        lea         rcx,        [GLOBAL(sym(vp9_bilinear_filters_mmx))]
+        movsxd      rax,        dword ptr arg(2) ;xoffset
+
+        cmp         rax,        0      ;skip first_pass filter if xoffset=0
+        je          .b16x16_sp_only
+
+        shl         rax,        5
+        add         rax,        rcx    ;HFilter
+
+        mov         rdi,        arg(4) ;dst_ptr
+        mov         rsi,        arg(0) ;src_ptr
+        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
+
+        movdqa      xmm1,       [rax]
+        movdqa      xmm2,       [rax+16]
+
+        movsxd      rax,        dword ptr arg(3) ;yoffset
+
+        cmp         rax,        0      ;skip second_pass filter if yoffset=0
+        je          .b16x16_fp_only
+
+        shl         rax,        5
+        add         rax,        rcx    ;VFilter
+
+        lea         rcx,        [rdi+rdx*8]
+        lea         rcx,        [rcx+rdx*8]
+        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
+
+        pxor        xmm0,       xmm0
+
+%if ABI_IS_32BIT=0
+        movsxd      r8,         dword ptr arg(5) ;dst_pitch
+%endif
+        ; get the first horizontal line done
+        movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+        movdqa      xmm4,       xmm3                 ; make a copy of current line
+
+        punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06
+        punpckhbw   xmm4,       xmm0
+
+        pmullw      xmm3,       xmm1
+        pmullw      xmm4,       xmm1
+
+        movdqu      xmm5,       [rsi+1]
+        movdqa      xmm6,       xmm5
+
+        punpcklbw   xmm5,       xmm0
+        punpckhbw   xmm6,       xmm0
+
+        pmullw      xmm5,       xmm2
+        pmullw      xmm6,       xmm2
+
+        paddw       xmm3,       xmm5
+        paddw       xmm4,       xmm6
+
+        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
+        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
+
+        paddw       xmm4,       [GLOBAL(rd)]
+        psraw       xmm4,       VP9_FILTER_SHIFT
+
+        movdqa      xmm7,       xmm3
+        packuswb    xmm7,       xmm4
+
+        add         rsi,        rdx                 ; next line
+.next_row:
+        movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+        movdqa      xmm4,       xmm3                 ; make a copy of current line
+
+        punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06
+        punpckhbw   xmm4,       xmm0
+
+        pmullw      xmm3,       xmm1
+        pmullw      xmm4,       xmm1
+
+        movdqu      xmm5,       [rsi+1]
+        movdqa      xmm6,       xmm5
+
+        punpcklbw   xmm5,       xmm0
+        punpckhbw   xmm6,       xmm0
+
+        pmullw      xmm5,       xmm2
+        pmullw      xmm6,       xmm2
+
+        paddw       xmm3,       xmm5
+        paddw       xmm4,       xmm6
+
+        movdqa      xmm5,       xmm7
+        movdqa      xmm6,       xmm7
+
+        punpcklbw   xmm5,       xmm0
+        punpckhbw   xmm6,       xmm0
+
+        pmullw      xmm5,       [rax]
+        pmullw      xmm6,       [rax]
+
+        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
+        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
+
+        paddw       xmm4,       [GLOBAL(rd)]
+        psraw       xmm4,       VP9_FILTER_SHIFT
+
+        movdqa      xmm7,       xmm3
+        packuswb    xmm7,       xmm4
+
+        pmullw      xmm3,       [rax+16]
+        pmullw      xmm4,       [rax+16]
+
+        paddw       xmm3,       xmm5
+        paddw       xmm4,       xmm6
+
+        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
+        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
+
+        paddw       xmm4,       [GLOBAL(rd)]
+        psraw       xmm4,       VP9_FILTER_SHIFT
+
+        packuswb    xmm3,       xmm4
+        movdqa      [rdi],      xmm3                 ; store the results in the destination
+
+        add         rsi,        rdx                 ; next line
+%if ABI_IS_32BIT
+        add         rdi,        DWORD PTR arg(5) ;dst_pitch
+%else
+        add         rdi,        r8
+%endif
+
+        cmp         rdi,        rcx
+        jne         .next_row
+
+        jmp         .done
+
+.b16x16_sp_only:
+        movsxd      rax,        dword ptr arg(3) ;yoffset
+        shl         rax,        5
+        add         rax,        rcx    ;VFilter
+
+        mov         rdi,        arg(4) ;dst_ptr
+        mov         rsi,        arg(0) ;src_ptr
+        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
+
+        movdqa      xmm1,       [rax]
+        movdqa      xmm2,       [rax+16]
+
+        lea         rcx,        [rdi+rdx*8]
+        lea         rcx,        [rcx+rdx*8]
+        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line
+
+        pxor        xmm0,       xmm0
+
+        ; get the first horizontal line done
+        movdqu      xmm7,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+
+        add         rsi,        rax                 ; next line
+.next_row_spo:
+        movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+
+        movdqa      xmm5,       xmm7
+        movdqa      xmm6,       xmm7
+
+        movdqa      xmm4,       xmm3                 ; make a copy of current line
+        movdqa      xmm7,       xmm3
+
+        punpcklbw   xmm5,       xmm0
+        punpckhbw   xmm6,       xmm0
+        punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06
+        punpckhbw   xmm4,       xmm0
+
+        pmullw      xmm5,       xmm1
+        pmullw      xmm6,       xmm1
+        pmullw      xmm3,       xmm2
+        pmullw      xmm4,       xmm2
+
+        paddw       xmm3,       xmm5
+        paddw       xmm4,       xmm6
+
+        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
+        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
+
+        paddw       xmm4,       [GLOBAL(rd)]
+        psraw       xmm4,       VP9_FILTER_SHIFT
+
+        packuswb    xmm3,       xmm4
+        movdqa      [rdi],      xmm3                 ; store the results in the destination
+
+        add         rsi,        rax                 ; next line
+        add         rdi,        rdx                 ;dst_pitch
+        cmp         rdi,        rcx
+        jne         .next_row_spo
+
+        jmp         .done
+
+.b16x16_fp_only:
+        lea         rcx,        [rdi+rdx*8]
+        lea         rcx,        [rcx+rdx*8]
+        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line
+        pxor        xmm0,       xmm0
+
+.next_row_fpo:
+        movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+        movdqa      xmm4,       xmm3                 ; make a copy of current line
+
+        punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06
+        punpckhbw   xmm4,       xmm0
+
+        pmullw      xmm3,       xmm1
+        pmullw      xmm4,       xmm1
+
+        movdqu      xmm5,       [rsi+1]
+        movdqa      xmm6,       xmm5
+
+        punpcklbw   xmm5,       xmm0
+        punpckhbw   xmm6,       xmm0
+
+        pmullw      xmm5,       xmm2
+        pmullw      xmm6,       xmm2
+
+        paddw       xmm3,       xmm5
+        paddw       xmm4,       xmm6
+
+        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
+        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
+
+        paddw       xmm4,       [GLOBAL(rd)]
+        psraw       xmm4,       VP9_FILTER_SHIFT
+
+        packuswb    xmm3,       xmm4
+        movdqa      [rdi],      xmm3                 ; store the results in the destination
+
+        add         rsi,        rax                 ; next line
+        add         rdi,        rdx                 ; dst_pitch
+        cmp         rdi,        rcx
+        jne         .next_row_fpo
+
+.done:
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_bilinear_predict8x8_sse2
+;(
+;    unsigned char  *src_ptr,
+;    int   src_pixels_per_line,
+;    int  xoffset,
+;    int  yoffset,
+;    unsigned char *dst_ptr,
+;    int dst_pitch
+;)
+extern sym(vp9_bilinear_filters_mmx)
+global sym(vp9_bilinear_predict8x8_sse2)
+sym(vp9_bilinear_predict8x8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 144                         ; reserve 144 bytes
+
+    ;const short *HFilter = bilinear_filters_mmx[xoffset]
+    ;const short *VFilter = bilinear_filters_mmx[yoffset]
+        lea         rcx,        [GLOBAL(sym(vp9_bilinear_filters_mmx))]
+
+        mov         rsi,        arg(0) ;src_ptr
+        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
+
+    ;Read in 9 lines of unaligned data and put them on the stack. This gives a
+    ;big performance boost.
+        movdqu      xmm0,       [rsi]
+        lea         rax,        [rdx + rdx*2]
+        movdqu      xmm1,       [rsi+rdx]
+        movdqu      xmm2,       [rsi+rdx*2]
+        add         rsi,        rax
+        movdqu      xmm3,       [rsi]
+        movdqu      xmm4,       [rsi+rdx]
+        movdqu      xmm5,       [rsi+rdx*2]
+        add         rsi,        rax
+        movdqu      xmm6,       [rsi]
+        movdqu      xmm7,       [rsi+rdx]
+
+        movdqa      XMMWORD PTR [rsp],            xmm0
+
+        movdqu      xmm0,       [rsi+rdx*2]
+
+        movdqa      XMMWORD PTR [rsp+16],         xmm1
+        movdqa      XMMWORD PTR [rsp+32],         xmm2
+        movdqa      XMMWORD PTR [rsp+48],         xmm3
+        movdqa      XMMWORD PTR [rsp+64],         xmm4
+        movdqa      XMMWORD PTR [rsp+80],         xmm5
+        movdqa      XMMWORD PTR [rsp+96],         xmm6
+        movdqa      XMMWORD PTR [rsp+112],        xmm7
+        movdqa      XMMWORD PTR [rsp+128],        xmm0
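+
+    ;Each of the 9 rows now sits in an aligned 16-byte slot, so the filter
+    ;loop below can load with movdqa and step to the next line via
+    ;"add rsp, 16"; the saved stack pointer is restored by "pop rsp" before
+    ;the epilog.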
+
+        movsxd      rax,        dword ptr arg(2) ;xoffset
+        shl         rax,        5
+        add         rax,        rcx    ;HFilter
+
+        mov         rdi,        arg(4) ;dst_ptr
+        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
+
+        movdqa      xmm1,       [rax]
+        movdqa      xmm2,       [rax+16]
+
+        movsxd      rax,        dword ptr arg(3) ;yoffset
+        shl         rax,        5
+        add         rax,        rcx    ;VFilter
+
+        lea         rcx,        [rdi+rdx*8]
+
+        movdqa      xmm5,       [rax]
+        movdqa      xmm6,       [rax+16]
+
+        pxor        xmm0,       xmm0
+
+        ; get the first horizontal line done
+        movdqa      xmm3,       XMMWORD PTR [rsp]
+        movdqa      xmm4,       xmm3                 ; make a copy of current line
+        psrldq      xmm4,       1
+
+        punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
+        punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08
+
+        pmullw      xmm3,       xmm1
+        pmullw      xmm4,       xmm2
+
+        paddw       xmm3,       xmm4
+
+        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
+        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
+
+        movdqa      xmm7,       xmm3
+        add         rsp,        16                 ; next line
+.next_row8x8:
+        movdqa      xmm3,       XMMWORD PTR [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+        movdqa      xmm4,       xmm3                 ; make a copy of current line
+        psrldq      xmm4,       1
+
+        punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
+        punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08
+
+        pmullw      xmm3,       xmm1
+        pmullw      xmm4,       xmm2
+
+        paddw       xmm3,       xmm4
+        pmullw      xmm7,       xmm5
+
+        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
+        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
+
+        movdqa      xmm4,       xmm3
+
+        pmullw      xmm3,       xmm6
+        paddw       xmm3,       xmm7
+
+        movdqa      xmm7,       xmm4
+
+        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
+        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
+
+        packuswb    xmm3,       xmm0
+        movq        [rdi],      xmm3                 ; store the results in the destination
+
+        add         rsp,        16                 ; next line
+        add         rdi,        rdx
+
+        cmp         rdi,        rcx
+        jne         .next_row8x8
+
+    ;add rsp, 144
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+SECTION_RODATA
+align 16
+rd:
+    times 8 dw 0x40
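+
+; rd holds eight copies of 0x40 = 64 = 1 << (VP9_FILTER_SHIFT - 1); adding it
+; before each "psraw ..., 7" above implements round-to-nearest division by 128.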
--- /dev/null
+++ b/vp9/common/x86/subpixel_ssse3.asm
@@ -1,0 +1,1515 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define BLOCK_HEIGHT_WIDTH 4
+%define VP9_FILTER_WEIGHT 128
+%define VP9_FILTER_SHIFT  7
+
+
+;/************************************************************************************
+; Notes: filter_block1d_h6 applies a 6-tap filter horizontally to the input pixels. The
+; input pixel array has output_height rows. This routine assumes that output_height is an
+; even number. This function handles 8 pixels in the horizontal direction, calculating one
+; row per iteration to take advantage of the 128-bit operations.
+;
+; This is an implementation of some of the SSE optimizations first seen in ffvp8
+;
+;*************************************************************************************/
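+
+;/************************************************************************************
+; Unlike the SSE2 code above, which widens bytes (punpcklbw) and multiplies with
+; pmullw, the routines here lean on pmaddubsw: source bytes are interleaved so a
+; single pmaddubsw multiplies two taps and sums each pair at once, roughly
+;     tmp[i] = src[i-2]*k0 + src[i+3]*k5      ; for the k0_k5 table row
+; The k0_k5 / k1_k3 / k2_k4 tables are assumed to hold the signed 6-tap
+; coefficients pre-paired for this layout; a zero first entry in k0_k5 routes
+; execution to the cheaper 4-tap path.
+;*************************************************************************************/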
+;void vp9_filter_block1d8_h6_ssse3
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char *output_ptr,
+;    unsigned int    output_pitch,
+;    unsigned int    output_height,
+;    unsigned int    vp9_filter_index
+;)
+global sym(vp9_filter_block1d8_h6_ssse3)
+sym(vp9_filter_block1d8_h6_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    movsxd      rdx, DWORD PTR arg(5)   ;table index
+    xor         rsi, rsi
+    shl         rdx, 4
+
+    movdqa      xmm7, [GLOBAL(rd)]
+
+    lea         rax, [GLOBAL(k0_k5)]
+    add         rax, rdx
+    mov         rdi, arg(2)             ;output_ptr
+
+    cmp         esi, DWORD PTR [rax]
+    je          vp9_filter_block1d8_h4_ssse3
+
+    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
+    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
+    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
+
+    mov         rsi, arg(0)             ;src_ptr
+    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
+    movsxd      rcx, dword ptr arg(4)   ;output_height
+
+    movsxd      rdx, dword ptr arg(3)   ;output_pitch
+
+    sub         rdi, rdx
+;xmm3 free
+.filter_block1d8_h6_rowloop_ssse3:
+    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
+
+    movq        xmm2,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
+
+    punpcklbw   xmm0,   xmm2                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
+
+    movdqa      xmm1,   xmm0
+    pmaddubsw   xmm0,   xmm4
+
+    movdqa      xmm2,   xmm1
+    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
+
+    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
+    pmaddubsw   xmm1,   xmm5
+
+    lea         rdi,    [rdi + rdx]
+    pmaddubsw   xmm2,   xmm6
+
+    lea         rsi,    [rsi + rax]
+    dec         rcx
+
+    paddsw      xmm0,   xmm1
+    paddsw      xmm2,   xmm7
+
+    paddsw      xmm0,   xmm2
+
+    psraw       xmm0,   7
+
+    packuswb    xmm0,   xmm0
+
+    movq        MMWORD PTR [rdi], xmm0
+    jnz         .filter_block1d8_h6_rowloop_ssse3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+vp9_filter_block1d8_h4_ssse3:
+    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
+    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
+
+    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]
+    movdqa      xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]
+
+    mov         rsi, arg(0)             ;src_ptr
+
+    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
+    movsxd      rcx, dword ptr arg(4)   ;output_height
+
+    movsxd      rdx, dword ptr arg(3)   ;output_pitch
+
+    sub         rdi, rdx
+
+.filter_block1d8_h4_rowloop_ssse3:
+    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
+
+    movq        xmm1,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
+
+    punpcklbw   xmm0,   xmm1                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
+
+    movdqa      xmm2,   xmm0
+    pshufb      xmm0,   xmm3
+
+    pshufb      xmm2,   xmm4
+    pmaddubsw   xmm0,   xmm5
+
+    lea         rdi,    [rdi + rdx]
+    pmaddubsw   xmm2,   xmm6
+
+    lea         rsi,    [rsi + rax]
+    dec         rcx
+
+    paddsw      xmm0,   xmm7
+
+    paddsw      xmm0,   xmm2
+
+    psraw       xmm0,   7
+
+    packuswb    xmm0,   xmm0
+
+    movq        MMWORD PTR [rdi], xmm0
+
+    jnz         .filter_block1d8_h4_rowloop_ssse3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+;void vp9_filter_block1d16_h6_ssse3
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char  *output_ptr,
+;    unsigned int    output_pitch,
+;    unsigned int    output_height,
+;    unsigned int    vp9_filter_index
+;)
+global sym(vp9_filter_block1d16_h6_ssse3)
+sym(vp9_filter_block1d16_h6_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    movsxd      rdx, DWORD PTR arg(5)           ;table index
+    xor         rsi, rsi
+    shl         rdx, 4      ;
+
+    lea         rax, [GLOBAL(k0_k5)]
+    add         rax, rdx
+
+    mov         rdi, arg(2)                     ;output_ptr
+
+    mov         rsi, arg(0)                     ;src_ptr
+
+    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
+    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
+    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
+
+    movsxd      rax, dword ptr arg(1)           ;src_pixels_per_line
+    movsxd      rcx, dword ptr arg(4)           ;output_height
+    movsxd      rdx, dword ptr arg(3)           ;output_pitch
+
+.filter_block1d16_h6_rowloop_ssse3:
+    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
+
+    movq        xmm3,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
+
+    punpcklbw   xmm0,   xmm3                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
+
+    movdqa      xmm1,   xmm0
+    pmaddubsw   xmm0,   xmm4
+
+    movdqa      xmm2,   xmm1
+    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
+
+    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
+    movq        xmm3,   MMWORD PTR [rsi +  6]
+
+    pmaddubsw   xmm1,   xmm5
+    movq        xmm7,   MMWORD PTR [rsi + 11]
+
+    pmaddubsw   xmm2,   xmm6
+    punpcklbw   xmm3,   xmm7
+
+    paddsw      xmm0,   xmm1
+    movdqa      xmm1,   xmm3
+
+    pmaddubsw   xmm3,   xmm4
+    paddsw      xmm0,   xmm2
+
+    movdqa      xmm2,   xmm1
+    paddsw      xmm0,   [GLOBAL(rd)]
+
+    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
+    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
+
+    psraw       xmm0,   7
+    pmaddubsw   xmm1,   xmm5
+
+    pmaddubsw   xmm2,   xmm6
+    packuswb    xmm0,   xmm0
+
+    lea         rsi,    [rsi + rax]
+    paddsw      xmm3,   xmm1
+
+    paddsw      xmm3,   xmm2
+
+    paddsw      xmm3,   [GLOBAL(rd)]
+
+    psraw       xmm3,   7
+
+    packuswb    xmm3,   xmm3
+
+    punpcklqdq  xmm0,   xmm3
+
+    movdqa      XMMWORD PTR [rdi], xmm0
+
+    lea         rdi,    [rdi + rdx]
+    dec         rcx
+    jnz         .filter_block1d16_h6_rowloop_ssse3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_filter_block1d4_h6_ssse3
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char  *output_ptr,
+;    unsigned int    output_pitch,
+;    unsigned int    output_height,
+;    unsigned int    vp9_filter_index
+;)
+global sym(vp9_filter_block1d4_h6_ssse3)
+sym(vp9_filter_block1d4_h6_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    movsxd      rdx, DWORD PTR arg(5)   ;table index
+    xor         rsi, rsi
+    shl         rdx, 4      ;
+
+    lea         rax, [GLOBAL(k0_k5)]
+    add         rax, rdx
+    movdqa      xmm7, [GLOBAL(rd)]
+
+    cmp         esi, DWORD PTR [rax]
+    je          .vp9_filter_block1d4_h4_ssse3
+
+    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
+    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
+    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
+
+    mov         rsi, arg(0)             ;src_ptr
+    mov         rdi, arg(2)             ;output_ptr
+    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
+    movsxd      rcx, dword ptr arg(4)   ;output_height
+
+    movsxd      rdx, dword ptr arg(3)   ;output_pitch
+
+;xmm3 free
+.filter_block1d4_h6_rowloop_ssse3:
+    movdqu      xmm0,   XMMWORD PTR [rsi - 2]
+
+    movdqa      xmm1, xmm0
+    pshufb      xmm0, [GLOBAL(shuf1b)]
+
+    movdqa      xmm2, xmm1
+    pshufb      xmm1, [GLOBAL(shuf2b)]
+    pmaddubsw   xmm0, xmm4
+    pshufb      xmm2, [GLOBAL(shuf3b)]
+    pmaddubsw   xmm1, xmm5
+
+;--
+    pmaddubsw   xmm2, xmm6
+
+    lea         rsi,    [rsi + rax]
+;--
+    paddsw      xmm0, xmm1
+    paddsw      xmm0, xmm7
+    pxor        xmm1, xmm1
+    paddsw      xmm0, xmm2
+    psraw       xmm0, 7
+    packuswb    xmm0, xmm0
+
+    movd        DWORD PTR [rdi], xmm0
+
+    add         rdi, rdx
+    dec         rcx
+    jnz         .filter_block1d4_h6_rowloop_ssse3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+.vp9_filter_block1d4_h4_ssse3:
+    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
+    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
+    movdqa      xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
+    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf3b)]
+
+    mov         rsi, arg(0)             ;src_ptr
+    mov         rdi, arg(2)             ;output_ptr
+    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
+    movsxd      rcx, dword ptr arg(4)   ;output_height
+
+    movsxd      rdx, dword ptr arg(3)   ;output_pitch
+
+.filter_block1d4_h4_rowloop_ssse3:
+    movdqu      xmm1,   XMMWORD PTR [rsi - 2]
+
+    movdqa      xmm2, xmm1
+    pshufb      xmm1, xmm0 ;;[GLOBAL(shuf2b)]
+    pshufb      xmm2, xmm3 ;;[GLOBAL(shuf3b)]
+    pmaddubsw   xmm1, xmm5
+
+;--
+    pmaddubsw   xmm2, xmm6
+
+    lea         rsi,    [rsi + rax]
+;--
+    paddsw      xmm1, xmm7
+    paddsw      xmm1, xmm2
+    psraw       xmm1, 7
+    packuswb    xmm1, xmm1
+
+    movd        DWORD PTR [rdi], xmm1
+
+    add         rdi, rdx
+    dec         rcx
+    jnz         .filter_block1d4_h4_rowloop_ssse3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+
+;void vp9_filter_block1d16_v6_ssse3
+;(
+;    unsigned char *src_ptr,
+;    unsigned int   src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int   out_pitch,
+;    unsigned int   output_height,
+;    unsigned int   vp9_filter_index
+;)
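+; Vertical 6-tap via pmaddubsw: source rows are paired (A,F), (B,D), (C,E)
+; with punpcklbw to match the k0_k5 / k1_k3 / k2_k4 coefficient rows, so three
+; multiply-adds yield the full 6-tap sum for 8 pixels at a time.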
+global sym(vp9_filter_block1d16_v6_ssse3)
+sym(vp9_filter_block1d16_v6_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    movsxd      rdx, DWORD PTR arg(5)   ;table index
+    xor         rsi, rsi
+    shl         rdx, 4      ;
+
+    lea         rax, [GLOBAL(k0_k5)]
+    add         rax, rdx
+
+    cmp         esi, DWORD PTR [rax]
+    je          .vp9_filter_block1d16_v4_ssse3
+
+    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
+    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
+    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
+
+    mov         rsi, arg(0)             ;src_ptr
+    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
+    mov         rdi, arg(2)             ;output_ptr
+
+%if ABI_IS_32BIT=0
+    movsxd      r8, DWORD PTR arg(3)    ;out_pitch
+%endif
+    mov         rax, rsi
+    movsxd      rcx, DWORD PTR arg(4)   ;output_height
+    add         rax, rdx
+
+
+.vp9_filter_block1d16_v6_ssse3_loop:
+    movq        xmm1, MMWORD PTR [rsi]                  ;A
+    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
+    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
+    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
+    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
+
+    punpcklbw   xmm2, xmm4                  ;B D
+    punpcklbw   xmm3, xmm0                  ;C E
+
+    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F
+
+    pmaddubsw   xmm3, xmm6
+    punpcklbw   xmm1, xmm0                  ;A F
+    pmaddubsw   xmm2, xmm7
+    pmaddubsw   xmm1, xmm5
+
+    paddsw      xmm2, xmm3
+    paddsw      xmm2, xmm1
+    paddsw      xmm2, [GLOBAL(rd)]
+    psraw       xmm2, 7
+    packuswb    xmm2, xmm2
+
+    movq        MMWORD PTR [rdi], xmm2          ;store the results
+
+    movq        xmm1, MMWORD PTR [rsi + 8]                  ;A
+    movq        xmm2, MMWORD PTR [rsi + rdx + 8]            ;B
+    movq        xmm3, MMWORD PTR [rsi + rdx * 2 + 8]        ;C
+    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D
+    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E
+
+    punpcklbw   xmm2, xmm4                  ;B D
+    punpcklbw   xmm3, xmm0                  ;C E
+
+    movq        xmm0, MMWORD PTR [rax + rdx * 4 + 8]        ;F
+    pmaddubsw   xmm3, xmm6
+    punpcklbw   xmm1, xmm0                  ;A F
+    pmaddubsw   xmm2, xmm7
+    pmaddubsw   xmm1, xmm5
+
+    add         rsi,  rdx
+    add         rax,  rdx
+;--
+;--
+    paddsw      xmm2, xmm3
+    paddsw      xmm2, xmm1
+    paddsw      xmm2, [GLOBAL(rd)]
+    psraw       xmm2, 7
+    packuswb    xmm2, xmm2
+
+    movq        MMWORD PTR [rdi+8], xmm2
+
+%if ABI_IS_32BIT
+    add         rdi,        DWORD PTR arg(3) ;out_pitch
+%else
+    add         rdi,        r8
+%endif
+    dec         rcx
+    jnz         .vp9_filter_block1d16_v6_ssse3_loop
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+.vp9_filter_block1d16_v4_ssse3:
+    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
+    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
+
+    mov         rsi, arg(0)             ;src_ptr
+    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
+    mov         rdi, arg(2)             ;output_ptr
+
+%if ABI_IS_32BIT=0
+    movsxd      r8, DWORD PTR arg(3)    ;out_pitch
+%endif
+    mov         rax, rsi
+    movsxd      rcx, DWORD PTR arg(4)   ;output_height
+    add         rax, rdx
+
+.vp9_filter_block1d16_v4_ssse3_loop:
+    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
+    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
+    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
+    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
+
+    punpcklbw   xmm2, xmm4                  ;B D
+    punpcklbw   xmm3, xmm0                  ;C E
+
+    pmaddubsw   xmm3, xmm6
+    pmaddubsw   xmm2, xmm7
+    movq        xmm5, MMWORD PTR [rsi + rdx + 8]            ;B
+    movq        xmm1, MMWORD PTR [rsi + rdx * 2 + 8]        ;C
+    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D
+    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E
+
+    paddsw      xmm2, [GLOBAL(rd)]
+    paddsw      xmm2, xmm3
+    psraw       xmm2, 7
+    packuswb    xmm2, xmm2
+
+    punpcklbw   xmm5, xmm4                  ;B D
+    punpcklbw   xmm1, xmm0                  ;C E
+
+    pmaddubsw   xmm1, xmm6
+    pmaddubsw   xmm5, xmm7
+
+    movdqa      xmm4, [GLOBAL(rd)]
+    add         rsi,  rdx
+    add         rax,  rdx
+;--
+;--
+    paddsw      xmm5, xmm1
+    paddsw      xmm5, xmm4
+    psraw       xmm5, 7
+    packuswb    xmm5, xmm5
+
+    punpcklqdq  xmm2, xmm5
+
+    movdqa       XMMWORD PTR [rdi], xmm2
+
+%if ABI_IS_32BIT
+    add         rdi,        DWORD PTR arg(3) ;out_pitch
+%else
+    add         rdi,        r8
+%endif
+    dec         rcx
+    jnz         .vp9_filter_block1d16_v4_ssse3_loop
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_filter_block1d8_v6_ssse3
+;(
+;    unsigned char *src_ptr,
+;    unsigned int   src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int   out_pitch,
+;    unsigned int   output_height,
+;    unsigned int   vp9_filter_index
+;)
+global sym(vp9_filter_block1d8_v6_ssse3)
+sym(vp9_filter_block1d8_v6_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    movsxd      rdx, DWORD PTR arg(5)   ;table index
+    xor         rsi, rsi
+    shl         rdx, 4      ;
+
+    lea         rax, [GLOBAL(k0_k5)]
+    add         rax, rdx
+
+    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
+    mov         rdi, arg(2)             ;output_ptr
+%if ABI_IS_32BIT=0
+    movsxd      r8, DWORD PTR arg(3)    ; out_pitch
+%endif
+    movsxd      rcx, DWORD PTR arg(4)   ;[output_height]
+
+    cmp         esi, DWORD PTR [rax]
+    je          .vp9_filter_block1d8_v4_ssse3
+
+    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
+    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
+    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
+
+    mov         rsi, arg(0)             ;src_ptr
+
+    mov         rax, rsi
+    add         rax, rdx
+
+.vp9_filter_block1d8_v6_ssse3_loop:
+    movq        xmm1, MMWORD PTR [rsi]                  ;A
+    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
+    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
+    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
+    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
+
+    punpcklbw   xmm2, xmm4                  ;B D
+    punpcklbw   xmm3, xmm0                  ;C E
+
+    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F
+    movdqa      xmm4, [GLOBAL(rd)]
+
+    pmaddubsw   xmm3, xmm6
+    punpcklbw   xmm1, xmm0                  ;A F
+    pmaddubsw   xmm2, xmm7
+    pmaddubsw   xmm1, xmm5
+    add         rsi,  rdx
+    add         rax,  rdx
+;--
+;--
+    paddsw      xmm2, xmm3
+    paddsw      xmm2, xmm1
+    paddsw      xmm2, xmm4
+    psraw       xmm2, 7
+    packuswb    xmm2, xmm2
+
+    movq        MMWORD PTR [rdi], xmm2
+
+%if ABI_IS_32BIT
+    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
+%else
+    add         rdi,        r8
+%endif
+    dec         rcx
+    jnz         .vp9_filter_block1d8_v6_ssse3_loop
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+.vp9_filter_block1d8_v4_ssse3:
+    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
+    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
+    movdqa      xmm5, [GLOBAL(rd)]
+
+    mov         rsi, arg(0)             ;src_ptr
+
+    mov         rax, rsi
+    add         rax, rdx
+
+.vp9_filter_block1d8_v4_ssse3_loop:
+    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
+    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
+    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
+    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
+
+    punpcklbw   xmm2, xmm4                  ;B D
+    punpcklbw   xmm3, xmm0                  ;C E
+
+    pmaddubsw   xmm3, xmm6
+    pmaddubsw   xmm2, xmm7
+    add         rsi,  rdx
+    add         rax,  rdx
+;--
+;--
+    paddsw      xmm2, xmm3
+    paddsw      xmm2, xmm5
+    psraw       xmm2, 7
+    packuswb    xmm2, xmm2
+
+    movq        MMWORD PTR [rdi], xmm2
+
+%if ABI_IS_32BIT
+    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
+%else
+    add         rdi,        r8
+%endif
+    dec         rcx
+    jnz         .vp9_filter_block1d8_v4_ssse3_loop
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+;void vp9_filter_block1d4_v6_ssse3
+;(
+;    unsigned char *src_ptr,
+;    unsigned int   src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int   out_pitch,
+;    unsigned int   output_height,
+;    unsigned int   vp9_filter_index
+;)
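+; The 4-pixel-wide path below uses the MMX forms (movd/movq on mm0-mm7):
+; 64-bit lanes are enough for four outputs, and the same (A,F)/(B,D)/(C,E)
+; pmaddubsw pairing is applied at half width.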
+global sym(vp9_filter_block1d4_v6_ssse3)
+sym(vp9_filter_block1d4_v6_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    movsxd      rdx, DWORD PTR arg(5)   ;table index
+    xor         rsi, rsi
+    shl         rdx, 4      ;
+
+    lea         rax, [GLOBAL(k0_k5)]
+    add         rax, rdx
+
+    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
+    mov         rdi, arg(2)             ;output_ptr
+%if ABI_IS_32BIT=0
+    movsxd      r8, DWORD PTR arg(3)    ; out_pitch
+%endif
+    movsxd      rcx, DWORD PTR arg(4)   ;[output_height]
+
+    cmp         esi, DWORD PTR [rax]
+    je          .vp9_filter_block1d4_v4_ssse3
+
+    movq        mm5, MMWORD PTR [rax]         ;k0_k5
+    movq        mm6, MMWORD PTR [rax+256]     ;k2_k4
+    movq        mm7, MMWORD PTR [rax+128]     ;k1_k3
+
+    mov         rsi, arg(0)             ;src_ptr
+
+    mov         rax, rsi
+    add         rax, rdx
+
+.vp9_filter_block1d4_v6_ssse3_loop:
+    movd        mm1, DWORD PTR [rsi]                  ;A
+    movd        mm2, DWORD PTR [rsi + rdx]            ;B
+    movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C
+    movd        mm4, DWORD PTR [rax + rdx * 2]        ;D
+    movd        mm0, DWORD PTR [rsi + rdx * 4]        ;E
+
+    punpcklbw   mm2, mm4                  ;B D
+    punpcklbw   mm3, mm0                  ;C E
+
+    movd        mm0, DWORD PTR [rax + rdx * 4]        ;F
+
+    movq        mm4, [GLOBAL(rd)]
+
+    pmaddubsw   mm3, mm6
+    punpcklbw   mm1, mm0                  ;A F
+    pmaddubsw   mm2, mm7
+    pmaddubsw   mm1, mm5
+    add         rsi,  rdx
+    add         rax,  rdx
+;--
+;--
+    paddsw      mm2, mm3
+    paddsw      mm2, mm1
+    paddsw      mm2, mm4
+    psraw       mm2, 7
+    packuswb    mm2, mm2
+
+    movd        DWORD PTR [rdi], mm2
+
+%if ABI_IS_32BIT
+    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
+%else
+    add         rdi,        r8
+%endif
+    dec         rcx
+    jnz         .vp9_filter_block1d4_v6_ssse3_loop
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+.vp9_filter_block1d4_v4_ssse3:
+    movq        mm6, MMWORD PTR [rax+256]     ;k2_k4
+    movq        mm7, MMWORD PTR [rax+128]     ;k1_k3
+    movq        mm5, MMWORD PTR [GLOBAL(rd)]
+
+    mov         rsi, arg(0)             ;src_ptr
+
+    mov         rax, rsi
+    add         rax, rdx
+
+.vp9_filter_block1d4_v4_ssse3_loop:
+    movd        mm2, DWORD PTR [rsi + rdx]            ;B
+    movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C
+    movd        mm4, DWORD PTR [rax + rdx * 2]        ;D
+    movd        mm0, DWORD PTR [rsi + rdx * 4]        ;E
+
+    punpcklbw   mm2, mm4                  ;B D
+    punpcklbw   mm3, mm0                  ;C E
+
+    pmaddubsw   mm3, mm6
+    pmaddubsw   mm2, mm7
+    add         rsi,  rdx
+    add         rax,  rdx
+;--
+;--
+    paddsw      mm2, mm3
+    paddsw      mm2, mm5
+    psraw       mm2, 7
+    packuswb    mm2, mm2
+
+    movd        DWORD PTR [rdi], mm2
+
+%if ABI_IS_32BIT
+    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
+%else
+    add         rdi,        r8
+%endif
+    dec         rcx
+    jnz         .vp9_filter_block1d4_v4_ssse3_loop
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_bilinear_predict16x16_ssse3
+;(
+;    unsigned char  *src_ptr,
+;    int   src_pixels_per_line,
+;    int  xoffset,
+;    int  yoffset,
+;    unsigned char *dst_ptr,
+;    int dst_pitch
+;)
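+; Same separable 2-tap scheme as the SSE2 version, but adjacent source bytes
+; are interleaved (punpcklbw of [rsi] with [rsi+1]) so a single pmaddubsw
+; yields src[c]*F[0] + src[c+1]*F[1] directly; bilinear_filters_ssse3 is
+; assumed to repeat each (F[0], F[1]) byte pair across a 16-byte row.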
+global sym(vp9_bilinear_predict16x16_ssse3)
+sym(vp9_bilinear_predict16x16_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        lea         rcx,        [GLOBAL(bilinear_filters_ssse3)]
+        movsxd      rax,        dword ptr arg(2)    ; xoffset
+
+        cmp         rax,        0                   ; skip first_pass filter if xoffset=0
+        je          .b16x16_sp_only
+
+        shl         rax,        4
+        lea         rax,        [rax + rcx]         ; HFilter
+
+        mov         rdi,        arg(4)              ; dst_ptr
+        mov         rsi,        arg(0)              ; src_ptr
+        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
+
+        movdqa      xmm1,       [rax]
+
+        movsxd      rax,        dword ptr arg(3)    ; yoffset
+
+        cmp         rax,        0                   ; skip second_pass filter if yoffset=0
+        je          .b16x16_fp_only
+
+        shl         rax,        4
+        lea         rax,        [rax + rcx]         ; VFilter
+
+        lea         rcx,        [rdi+rdx*8]
+        lea         rcx,        [rcx+rdx*8]
+        movsxd      rdx,        dword ptr arg(1)    ; src_pixels_per_line
+
+        movdqa      xmm2,       [rax]
+
+%if ABI_IS_32BIT=0
+        movsxd      r8,         dword ptr arg(5)    ; dst_pitch
+%endif
+        movq        xmm3,       [rsi]               ; 00 01 02 03 04 05 06 07
+        movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08
+
+        punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
+        movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15
+
+        movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16
+
+        lea         rsi,        [rsi + rdx]         ; next line
+
+        pmaddubsw   xmm3,       xmm1                ; 00 02 04 06 08 10 12 14
+
+        punpcklbw   xmm4,       xmm5                ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
+        pmaddubsw   xmm4,       xmm1                ; 01 03 05 07 09 11 13 15
+
+        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
+        psraw       xmm3,       VP9_FILTER_SHIFT    ; xmm3 /= 128
+
+        paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value
+        psraw       xmm4,       VP9_FILTER_SHIFT    ; xmm4 /= 128
+
+        movdqa      xmm7,       xmm3
+        packuswb    xmm7,       xmm4                ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+
+.next_row:
+        movq        xmm6,       [rsi]               ; 00 01 02 03 04 05 06 07
+        movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08
+
+        punpcklbw   xmm6,       xmm5
+        movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15
+
+        movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16
+        lea         rsi,        [rsi + rdx]         ; next line
+
+        pmaddubsw   xmm6,       xmm1
+
+        punpcklbw   xmm4,       xmm5
+        pmaddubsw   xmm4,       xmm1
+
+        paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value
+        psraw       xmm6,       VP9_FILTER_SHIFT    ; xmm6 /= 128
+
+        paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value
+        psraw       xmm4,       VP9_FILTER_SHIFT    ; xmm4 /= 128
+
+        packuswb    xmm6,       xmm4
+        movdqa      xmm5,       xmm7
+
+        punpcklbw   xmm5,       xmm6
+        pmaddubsw   xmm5,       xmm2
+
+        punpckhbw   xmm7,       xmm6
+        pmaddubsw   xmm7,       xmm2
+
+        paddw       xmm5,       [GLOBAL(rd)]        ; xmm5 += round value
+        psraw       xmm5,       VP9_FILTER_SHIFT    ; xmm5 /= 128
+
+        paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value
+        psraw       xmm7,       VP9_FILTER_SHIFT    ; xmm7 /= 128
+
+        packuswb    xmm5,       xmm7
+        movdqa      xmm7,       xmm6
+
+        movdqa      [rdi],      xmm5                ; store the results in the destination
+%if ABI_IS_32BIT
+        add         rdi,        DWORD PTR arg(5)    ; dst_pitch
+%else
+        add         rdi,        r8
+%endif
+
+        cmp         rdi,        rcx
+        jne         .next_row
+
+        jmp         .done
+
+.b16x16_sp_only:
+        movsxd      rax,        dword ptr arg(3)    ; yoffset
+        shl         rax,        4
+        lea         rax,        [rax + rcx]         ; VFilter
+
+        mov         rdi,        arg(4)              ; dst_ptr
+        mov         rsi,        arg(0)              ; src_ptr
+        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
+
+        movdqa      xmm1,       [rax]               ; VFilter
+
+        lea         rcx,        [rdi+rdx*8]
+        lea         rcx,        [rcx+rdx*8]
+        movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line
+
+        ; get the first horizontal line done
+        movq        xmm4,       [rsi]               ; load row 0
+        movq        xmm2,       [rsi + 8]           ; load row 0
+
+        lea         rsi,        [rsi + rax]         ; next line
+.next_row_sp:
+        movq        xmm3,       [rsi]               ; load row + 1
+        movq        xmm5,       [rsi + 8]           ; load row + 1
+
+        punpcklbw   xmm4,       xmm3
+        punpcklbw   xmm2,       xmm5
+
+        pmaddubsw   xmm4,       xmm1
+        movq        xmm7,       [rsi + rax]         ; load row + 2
+
+        pmaddubsw   xmm2,       xmm1
+        movq        xmm6,       [rsi + rax + 8]     ; load row + 2
+
+        punpcklbw   xmm3,       xmm7
+        punpcklbw   xmm5,       xmm6
+
+        pmaddubsw   xmm3,       xmm1
+        paddw       xmm4,       [GLOBAL(rd)]
+
+        pmaddubsw   xmm5,       xmm1
+        paddw       xmm2,       [GLOBAL(rd)]
+
+        psraw       xmm4,       VP9_FILTER_SHIFT
+        psraw       xmm2,       VP9_FILTER_SHIFT
+
+        packuswb    xmm4,       xmm2
+        paddw       xmm3,       [GLOBAL(rd)]
+
+        movdqa      [rdi],      xmm4                ; store row 0
+        paddw       xmm5,       [GLOBAL(rd)]
+
+        psraw       xmm3,       VP9_FILTER_SHIFT
+        psraw       xmm5,       VP9_FILTER_SHIFT
+
+        packuswb    xmm3,       xmm5
+        movdqa      xmm4,       xmm7
+
+        movdqa      [rdi + rdx],xmm3                ; store row 1
+        lea         rsi,        [rsi + 2*rax]
+
+        movdqa      xmm2,       xmm6
+        lea         rdi,        [rdi + 2*rdx]
+
+        cmp         rdi,        rcx
+        jne         .next_row_sp
+
+        jmp         .done
+
+.b16x16_fp_only:
+        lea         rcx,        [rdi+rdx*8]
+        lea         rcx,        [rcx+rdx*8]
+        movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line
+
+.next_row_fp:
+        movq        xmm2,       [rsi]               ; 00 01 02 03 04 05 06 07
+        movq        xmm4,       [rsi+1]             ; 01 02 03 04 05 06 07 08
+
+        punpcklbw   xmm2,       xmm4
+        movq        xmm3,       [rsi+8]             ; 08 09 10 11 12 13 14 15
+
+        pmaddubsw   xmm2,       xmm1
+        movq        xmm4,       [rsi+9]             ; 09 10 11 12 13 14 15 16
+
+        lea         rsi,        [rsi + rax]         ; next line
+        punpcklbw   xmm3,       xmm4
+
+        pmaddubsw   xmm3,       xmm1
+        movq        xmm5,       [rsi]
+
+        paddw       xmm2,       [GLOBAL(rd)]
+        movq        xmm7,       [rsi+1]
+
+        movq        xmm6,       [rsi+8]
+        psraw       xmm2,       VP9_FILTER_SHIFT
+
+        punpcklbw   xmm5,       xmm7
+        movq        xmm7,       [rsi+9]
+
+        paddw       xmm3,       [GLOBAL(rd)]
+        pmaddubsw   xmm5,       xmm1
+
+        psraw       xmm3,       VP9_FILTER_SHIFT
+        punpcklbw   xmm6,       xmm7
+
+        packuswb    xmm2,       xmm3
+        pmaddubsw   xmm6,       xmm1
+
+        movdqa      [rdi],      xmm2                ; store the results in the destination
+        paddw       xmm5,       [GLOBAL(rd)]
+
+        lea         rdi,        [rdi + rdx]         ; dst_pitch
+        psraw       xmm5,       VP9_FILTER_SHIFT
+
+        paddw       xmm6,       [GLOBAL(rd)]
+        psraw       xmm6,       VP9_FILTER_SHIFT
+
+        packuswb    xmm5,       xmm6
+        lea         rsi,        [rsi + rax]         ; next line
+
+        movdqa      [rdi],      xmm5                ; store the results in the destination
+        lea         rdi,        [rdi + rdx]         ; dst_pitch
+
+        cmp         rdi,        rcx
+
+        jne         .next_row_fp
+
+.done:
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_bilinear_predict8x8_ssse3
+;(
+;    unsigned char  *src_ptr,
+;    int   src_pixels_per_line,
+;    int  xoffset,
+;    int  yoffset,
+;    unsigned char *dst_ptr,
+;    int dst_pitch
+;)
+global sym(vp9_bilinear_predict8x8_ssse3)
+sym(vp9_bilinear_predict8x8_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 144                         ; reserve 144 bytes
+
+        lea         rcx,        [GLOBAL(bilinear_filters_ssse3)]
+
+        mov         rsi,        arg(0) ;src_ptr
+        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
+
+    ;Read in 9 lines of unaligned data and put them on the stack. This gives a
+    ;big performance boost.
+        movdqu      xmm0,       [rsi]
+        lea         rax,        [rdx + rdx*2]
+        movdqu      xmm1,       [rsi+rdx]
+        movdqu      xmm2,       [rsi+rdx*2]
+        add         rsi,        rax
+        movdqu      xmm3,       [rsi]
+        movdqu      xmm4,       [rsi+rdx]
+        movdqu      xmm5,       [rsi+rdx*2]
+        add         rsi,        rax
+        movdqu      xmm6,       [rsi]
+        movdqu      xmm7,       [rsi+rdx]
+
+        movdqa      XMMWORD PTR [rsp],            xmm0
+
+        movdqu      xmm0,       [rsi+rdx*2]
+
+        movdqa      XMMWORD PTR [rsp+16],         xmm1
+        movdqa      XMMWORD PTR [rsp+32],         xmm2
+        movdqa      XMMWORD PTR [rsp+48],         xmm3
+        movdqa      XMMWORD PTR [rsp+64],         xmm4
+        movdqa      XMMWORD PTR [rsp+80],         xmm5
+        movdqa      XMMWORD PTR [rsp+96],         xmm6
+        movdqa      XMMWORD PTR [rsp+112],        xmm7
+        movdqa      XMMWORD PTR [rsp+128],        xmm0
+
+        movsxd      rax,        dword ptr arg(2)    ; xoffset
+        cmp         rax,        0                   ; skip first_pass filter if xoffset=0
+        je          .b8x8_sp_only
+
+        shl         rax,        4
+        add         rax,        rcx                 ; HFilter
+
+        mov         rdi,        arg(4)              ; dst_ptr
+        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
+
+        movdqa      xmm0,       [rax]
+
+        movsxd      rax,        dword ptr arg(3)    ; yoffset
+        cmp         rax,        0                   ; skip second_pass filter if yoffset=0
+        je          .b8x8_fp_only
+
+        shl         rax,        4
+        lea         rax,        [rax + rcx]         ; VFilter
+
+        lea         rcx,        [rdi+rdx*8]
+
+        movdqa      xmm1,       [rax]
+
+        ; get the first horizontal line done
+        movdqa      xmm3,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+        movdqa      xmm5,       xmm3                ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx
+
+        psrldq      xmm5,       1
+        lea         rsp,        [rsp + 16]          ; next line
+
+        punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
+        pmaddubsw   xmm3,       xmm0                ; 00 02 04 06 08 10 12 14
+
+        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
+        psraw       xmm3,       VP9_FILTER_SHIFT    ; xmm3 /= 128
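+        ; each output pixel is (p0*f0 + p1*f1 + 64) >> 7; the tap pairs in
+        ; bilinear_filters_ssse3 sum to 128, so this is a weighted average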
+
+        movdqa      xmm7,       xmm3
+        packuswb    xmm7,       xmm7                ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+
+.next_row:
+        movdqa      xmm6,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+        lea         rsp,        [rsp + 16]          ; next line
+
+        movdqa      xmm5,       xmm6
+
+        psrldq      xmm5,       1
+
+        punpcklbw   xmm6,       xmm5
+        pmaddubsw   xmm6,       xmm0
+
+        paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value
+        psraw       xmm6,       VP9_FILTER_SHIFT    ; xmm6 /= 128
+
+        packuswb    xmm6,       xmm6
+
+        punpcklbw   xmm7,       xmm6
+        pmaddubsw   xmm7,       xmm1
+
+        paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value
+        psraw       xmm7,       VP9_FILTER_SHIFT    ; xmm7 /= 128
+
+        packuswb    xmm7,       xmm7
+
+        movq        [rdi],      xmm7                ; store the results in the destination
+        lea         rdi,        [rdi + rdx]
+
+        movdqa      xmm7,       xmm6
+
+        cmp         rdi,        rcx
+        jne         .next_row
+
+        jmp         .done8x8
+
+.b8x8_sp_only:
+        movsxd      rax,        dword ptr arg(3)    ; yoffset
+        shl         rax,        4
+        lea         rax,        [rax + rcx]         ; VFilter
+
+        mov         rdi,        arg(4) ;dst_ptr
+        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
+
+        movdqa      xmm0,       [rax]               ; VFilter
+
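+        ; second-pass only: vertically filter the nine stacked rows in one
+        ; fully unrolled sequence (no loop is needed for the 8 output rows)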
+        movq        xmm1,       XMMWORD PTR [rsp]
+        movq        xmm2,       XMMWORD PTR [rsp+16]
+
+        movq        xmm3,       XMMWORD PTR [rsp+32]
+        punpcklbw   xmm1,       xmm2
+
+        movq        xmm4,       XMMWORD PTR [rsp+48]
+        punpcklbw   xmm2,       xmm3
+
+        movq        xmm5,       XMMWORD PTR [rsp+64]
+        punpcklbw   xmm3,       xmm4
+
+        movq        xmm6,       XMMWORD PTR [rsp+80]
+        punpcklbw   xmm4,       xmm5
+
+        movq        xmm7,       XMMWORD PTR [rsp+96]
+        punpcklbw   xmm5,       xmm6
+
+        pmaddubsw   xmm1,       xmm0
+        pmaddubsw   xmm2,       xmm0
+
+        pmaddubsw   xmm3,       xmm0
+        pmaddubsw   xmm4,       xmm0
+
+        pmaddubsw   xmm5,       xmm0
+        punpcklbw   xmm6,       xmm7
+
+        pmaddubsw   xmm6,       xmm0
+        paddw       xmm1,       [GLOBAL(rd)]
+
+        paddw       xmm2,       [GLOBAL(rd)]
+        psraw       xmm1,       VP9_FILTER_SHIFT
+
+        paddw       xmm3,       [GLOBAL(rd)]
+        psraw       xmm2,       VP9_FILTER_SHIFT
+
+        paddw       xmm4,       [GLOBAL(rd)]
+        psraw       xmm3,       VP9_FILTER_SHIFT
+
+        paddw       xmm5,       [GLOBAL(rd)]
+        psraw       xmm4,       VP9_FILTER_SHIFT
+
+        paddw       xmm6,       [GLOBAL(rd)]
+        psraw       xmm5,       VP9_FILTER_SHIFT
+
+        psraw       xmm6,       VP9_FILTER_SHIFT
+        packuswb    xmm1,       xmm1
+
+        packuswb    xmm2,       xmm2
+        movq        [rdi],      xmm1
+
+        packuswb    xmm3,       xmm3
+        movq        [rdi+rdx],  xmm2
+
+        packuswb    xmm4,       xmm4
+        movq        xmm1,       XMMWORD PTR [rsp+112]
+
+        lea         rdi,        [rdi + 2*rdx]
+        movq        xmm2,       XMMWORD PTR [rsp+128]
+
+        packuswb    xmm5,       xmm5
+        movq        [rdi],      xmm3
+
+        packuswb    xmm6,       xmm6
+        movq        [rdi+rdx],  xmm4
+
+        lea         rdi,        [rdi + 2*rdx]
+        punpcklbw   xmm7,       xmm1
+
+        movq        [rdi],      xmm5
+        pmaddubsw   xmm7,       xmm0
+
+        movq        [rdi+rdx],  xmm6
+        punpcklbw   xmm1,       xmm2
+
+        pmaddubsw   xmm1,       xmm0
+        paddw       xmm7,       [GLOBAL(rd)]
+
+        psraw       xmm7,       VP9_FILTER_SHIFT
+        paddw       xmm1,       [GLOBAL(rd)]
+
+        psraw       xmm1,       VP9_FILTER_SHIFT
+        packuswb    xmm7,       xmm7
+
+        packuswb    xmm1,       xmm1
+        lea         rdi,        [rdi + 2*rdx]
+
+        movq        [rdi],      xmm7
+
+        movq        [rdi+rdx],  xmm1
+        lea         rsp,        [rsp + 144]
+
+        jmp         .done8x8
+
+.b8x8_fp_only:
+        lea         rcx,        [rdi+rdx*8]
+
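+        ; first-pass only: horizontally filter the stacked rows, four rows
+        ; per trip through the loop (two trips for the 8-row block)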
+.next_row_fp:
+        movdqa      xmm1,       XMMWORD PTR [rsp]
+        movdqa      xmm3,       XMMWORD PTR [rsp+16]
+
+        movdqa      xmm2,       xmm1
+        movdqa      xmm5,       XMMWORD PTR [rsp+32]
+
+        psrldq      xmm2,       1
+        movdqa      xmm7,       XMMWORD PTR [rsp+48]
+
+        movdqa      xmm4,       xmm3
+        psrldq      xmm4,       1
+
+        movdqa      xmm6,       xmm5
+        psrldq      xmm6,       1
+
+        punpcklbw   xmm1,       xmm2
+        pmaddubsw   xmm1,       xmm0
+
+        punpcklbw   xmm3,       xmm4
+        pmaddubsw   xmm3,       xmm0
+
+        punpcklbw   xmm5,       xmm6
+        pmaddubsw   xmm5,       xmm0
+
+        movdqa      xmm2,       xmm7
+        psrldq      xmm2,       1
+
+        punpcklbw   xmm7,       xmm2
+        pmaddubsw   xmm7,       xmm0
+
+        paddw       xmm1,       [GLOBAL(rd)]
+        psraw       xmm1,       VP9_FILTER_SHIFT
+
+        paddw       xmm3,       [GLOBAL(rd)]
+        psraw       xmm3,       VP9_FILTER_SHIFT
+
+        paddw       xmm5,       [GLOBAL(rd)]
+        psraw       xmm5,       VP9_FILTER_SHIFT
+
+        paddw       xmm7,       [GLOBAL(rd)]
+        psraw       xmm7,       VP9_FILTER_SHIFT
+
+        packuswb    xmm1,       xmm1
+        packuswb    xmm3,       xmm3
+
+        packuswb    xmm5,       xmm5
+        movq        [rdi],      xmm1
+
+        packuswb    xmm7,       xmm7
+        movq        [rdi+rdx],  xmm3
+
+        lea         rdi,        [rdi + 2*rdx]
+        movq        [rdi],      xmm5
+
+        lea         rsp,        [rsp + 4*16]
+        movq        [rdi+rdx],  xmm7
+
+        lea         rdi,        [rdi + 2*rdx]
+        cmp         rdi,        rcx
+
+        jne         .next_row_fp
+
+        lea         rsp,        [rsp + 16]
+
+.done8x8:
+    ;add rsp, 144
+    pop         rsp
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+shuf1b:
+    db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
+shuf2b:
+    db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11
+shuf3b:
+    db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10
+
+align 16
+shuf2bfrom1:
+    db  4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13
+align 16
+shuf3bfrom1:
+    db  2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11
+
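+; rounding constant: 64 adds one half in the 7-bit fixed-point domain
+; before the arithmetic shift right by VP9_FILTER_SHIFT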
+align 16
+rd:
+    times 8 dw 0x40
+
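+; six-tap filter taps paired for pmaddubsw: k0_k5 holds taps 0 and 5,
+; k1_k3 taps 1 and 3, k2_k4 taps 2 and 4; one 16-byte row per subpel position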
+align 16
+k0_k5:
+    times 8 db 0, 0             ;placeholder
+    times 8 db 0, 0
+    times 8 db 2, 1
+    times 8 db 0, 0
+    times 8 db 3, 3
+    times 8 db 0, 0
+    times 8 db 1, 2
+    times 8 db 0, 0
+k1_k3:
+    times 8 db  0,    0         ;placeholder
+    times 8 db  -6,  12
+    times 8 db -11,  36
+    times 8 db  -9,  50
+    times 8 db -16,  77
+    times 8 db  -6,  93
+    times 8 db  -8, 108
+    times 8 db  -1, 123
+k2_k4:
+    times 8 db 128,    0        ;placeholder
+    times 8 db 123,   -1
+    times 8 db 108,   -8
+    times 8 db  93,   -6
+    times 8 db  77,  -16
+    times 8 db  50,   -9
+    times 8 db  36,  -11
+    times 8 db  12,   -6
+align 16
+bilinear_filters_ssse3:
+    times 8 db 128, 0
+    times 8 db 120, 8
+    times 8 db 112, 16
+    times 8 db 104, 24
+    times 8 db 96,  32
+    times 8 db 88,  40
+    times 8 db 80,  48
+    times 8 db 72,  56
+    times 8 db 64,  64
+    times 8 db 56,  72
+    times 8 db 48,  80
+    times 8 db 40,  88
+    times 8 db 32,  96
+    times 8 db 24,  104
+    times 8 db 16,  112
+    times 8 db 8,   120
+
--- /dev/null
+++ b/vp9/common/x86/subpixel_x86.h
@@ -1,0 +1,122 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef SUBPIXEL_X86_H
+#define SUBPIXEL_X86_H
+
+/* Note:
+ *
+ * This platform is commonly built for runtime CPU detection. If you modify
+ * any of the function mappings present in this file, be sure to also update
+ * them in the function pointer initialization code in x86_systemdependent.c.
+ */
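+
+/* For example, when CONFIG_RUNTIME_CPU_DETECT is disabled the #define blocks
+ * below rebind names such as vp9_subpix_bilinear8x8 statically; later blocks
+ * (SSE2, SSSE3) override earlier ones, so the fastest compiled-in version
+ * wins. */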
+
+#if HAVE_MMX
+extern prototype_subpixel_predict(vp9_sixtap_predict16x16_mmx);
+extern prototype_subpixel_predict(vp9_sixtap_predict8x8_mmx);
+extern prototype_subpixel_predict(vp9_sixtap_predict8x4_mmx);
+extern prototype_subpixel_predict(vp9_sixtap_predict4x4_mmx);
+extern prototype_subpixel_predict(vp9_bilinear_predict16x16_mmx);
+extern prototype_subpixel_predict(vp9_bilinear_predict8x8_mmx);
+extern prototype_subpixel_predict(vp9_bilinear_predict8x4_mmx);
+extern prototype_subpixel_predict(vp9_bilinear_predict4x4_mmx);
+
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp9_subpix_sixtap16x16
+#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_mmx
+
+#undef  vp9_subpix_sixtap8x8
+#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_mmx
+
+#undef  vp9_subpix_sixtap8x4
+#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_mmx
+
+#undef  vp9_subpix_sixtap4x4
+#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_mmx
+
+#undef  vp9_subpix_bilinear16x16
+#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_mmx
+
+#undef  vp9_subpix_bilinear8x8
+#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_mmx
+
+#undef  vp9_subpix_bilinear8x4
+#define vp9_subpix_bilinear8x4 vp9_bilinear_predict8x4_mmx
+
+#undef  vp9_subpix_bilinear4x4
+#define vp9_subpix_bilinear4x4 vp9_bilinear_predict4x4_mmx
+
+#endif
+#endif
+
+
+#if HAVE_SSE2
+extern prototype_subpixel_predict(vp9_sixtap_predict16x16_sse2);
+extern prototype_subpixel_predict(vp9_sixtap_predict8x8_sse2);
+extern prototype_subpixel_predict(vp9_sixtap_predict8x4_sse2);
+extern prototype_subpixel_predict(vp9_bilinear_predict16x16_sse2);
+extern prototype_subpixel_predict(vp9_bilinear_predict8x8_sse2);
+
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp9_subpix_sixtap16x16
+#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_sse2
+
+#undef  vp9_subpix_sixtap8x8
+#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_sse2
+
+#undef  vp9_subpix_sixtap8x4
+#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_sse2
+
+#undef  vp9_subpix_bilinear16x16
+#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_sse2
+
+#undef  vp9_subpix_bilinear8x8
+#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_sse2
+
+#endif
+#endif
+
+#if HAVE_SSSE3
+extern prototype_subpixel_predict(vp9_sixtap_predict16x16_ssse3);
+extern prototype_subpixel_predict(vp9_sixtap_predict8x8_ssse3);
+extern prototype_subpixel_predict(vp9_sixtap_predict8x4_ssse3);
+extern prototype_subpixel_predict(vp9_sixtap_predict4x4_ssse3);
+extern prototype_subpixel_predict(vp9_bilinear_predict16x16_ssse3);
+extern prototype_subpixel_predict(vp9_bilinear_predict8x8_ssse3);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp9_subpix_sixtap16x16
+#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_ssse3
+
+#undef  vp9_subpix_sixtap8x8
+#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_ssse3
+
+#undef  vp9_subpix_sixtap8x4
+#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_ssse3
+
+#undef  vp9_subpix_sixtap4x4
+#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_ssse3
+
+
+#undef  vp9_subpix_bilinear16x16
+#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_ssse3
+
+#undef  vp9_subpix_bilinear8x8
+#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_ssse3
+
+#endif
+#endif
+
+#endif
--- /dev/null
+++ b/vp9/common/x86/vp8_asm_stubs.c
@@ -1,0 +1,602 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/subpixel.h"
+
+extern const short vp9_six_tap_mmx[16][6 * 8];
+
+extern const short vp9_bilinear_filters_8x_mmx[16][2 * 8];
+
+extern void vp9_filter_block1d_h6_mmx(unsigned char   *src_ptr,
+                                      unsigned short  *output_ptr,
+                                      unsigned int     src_pixels_per_line,
+                                      unsigned int     pixel_step,
+                                      unsigned int     output_height,
+                                      unsigned int     output_width,
+                                      const short     *vp9_filter);
+
+extern void vp9_filter_block1dc_v6_mmx(unsigned short *src_ptr,
+                                       unsigned char  *output_ptr,
+                                       int             output_pitch,
+                                       unsigned int    pixels_per_line,
+                                       unsigned int    pixel_step,
+                                       unsigned int    output_height,
+                                       unsigned int    output_width,
+                                       const short    *vp9_filter);
+
+extern void vp9_filter_block1d8_h6_sse2(unsigned char  *src_ptr,
+                                        unsigned short *output_ptr,
+                                        unsigned int    src_pixels_per_line,
+                                        unsigned int    pixel_step,
+                                        unsigned int    output_height,
+                                        unsigned int    output_width,
+                                        const short    *vp9_filter);
+
+extern void vp9_filter_block1d16_h6_sse2(unsigned char  *src_ptr,
+                                         unsigned short *output_ptr,
+                                         unsigned int    src_pixels_per_line,
+                                         unsigned int    pixel_step,
+                                         unsigned int    output_height,
+                                         unsigned int    output_width,
+                                         const short    *vp9_filter);
+
+extern void vp9_filter_block1d8_v6_sse2(unsigned short *src_ptr,
+                                        unsigned char *output_ptr,
+                                        int dst_pitch,
+                                        unsigned int pixels_per_line,
+                                        unsigned int pixel_step,
+                                        unsigned int output_height,
+                                        unsigned int output_width,
+                                        const short    *vp9_filter);
+
+extern void vp9_filter_block1d16_v6_sse2(unsigned short *src_ptr,
+                                         unsigned char *output_ptr,
+                                         int dst_pitch,
+                                         unsigned int pixels_per_line,
+                                         unsigned int pixel_step,
+                                         unsigned int output_height,
+                                         unsigned int output_width,
+                                         const short    *vp9_filter);
+
+extern void vp9_unpack_block1d16_h6_sse2(unsigned char  *src_ptr,
+                                         unsigned short *output_ptr,
+                                         unsigned int    src_pixels_per_line,
+                                         unsigned int    output_height,
+                                         unsigned int    output_width);
+
+extern void vp9_filter_block1d8_h6_only_sse2(unsigned char *src_ptr,
+                                             unsigned int   src_pixels_per_line,
+                                             unsigned char *output_ptr,
+                                             int            dst_pitch,
+                                             unsigned int   output_height,
+                                             const short   *vp9_filter);
+
+extern void vp9_filter_block1d16_h6_only_sse2(unsigned char *src_ptr,
+                                              unsigned int   src_pixels_per_line,
+                                              unsigned char *output_ptr,
+                                              int            dst_pitch,
+                                              unsigned int   output_height,
+                                              const short   *vp9_filter);
+
+extern void vp9_filter_block1d8_v6_only_sse2(unsigned char *src_ptr,
+                                             unsigned int   src_pixels_per_line,
+                                             unsigned char *output_ptr,
+                                             int            dst_pitch,
+                                             unsigned int   output_height,
+                                             const short   *vp9_filter);
+
+extern prototype_subpixel_predict(vp9_bilinear_predict8x8_mmx);
+
+#if HAVE_MMX
+void vp9_sixtap_predict4x4_mmx(unsigned char  *src_ptr,
+                               int  src_pixels_per_line,
+                               int  xoffset,
+                               int  yoffset,
+                               unsigned char *dst_ptr,
+                               int  dst_pitch) {
+  /* Temp data buffer used in filtering */
+  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 16 * 16);
+  const short *hfilter, *vfilter;
+#ifdef ANNOUNCE_FUNCTION
+  printf("vp9_sixtap_predict4x4_mmx\n");
+#endif
+  hfilter = vp9_six_tap_mmx[xoffset];
+  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), fdata2,
+                            src_pixels_per_line, 1, 9, 8, hfilter);
+  vfilter = vp9_six_tap_mmx[yoffset];
+  vp9_filter_block1dc_v6_mmx(fdata2 + 8, dst_ptr, dst_pitch,
+                             8, 4, 4, 4, vfilter);
+}
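+
+/* All six-tap stubs in this file share the same two-pass scheme: the
+ * horizontal pass starts two rows above the block and filters height+5 rows
+ * into the intermediate buffer (9 rows for a 4-row block, 13 for 8, 21 for
+ * 16), since the six-tap vertical pass needs two rows of context above and
+ * three below each output row. */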
+
+void vp9_sixtap_predict16x16_mmx(unsigned char  *src_ptr,
+                                 int  src_pixels_per_line,
+                                 int  xoffset,
+                                 int  yoffset,
+                                 unsigned char *dst_ptr,
+                                 int dst_pitch) {
+  /* Temp data buffer used in filtering */
+  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24);
+  const short *hfilter, *vfilter;
+#ifdef ANNOUNCE_FUNCTION
+  printf("vp9_sixtap_predict16x16_mmx\n");
+#endif
+
+  hfilter = vp9_six_tap_mmx[xoffset];
+  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
+                            fdata2,   src_pixels_per_line, 1, 21, 32,
+                            hfilter);
+  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
+                            fdata2 + 4, src_pixels_per_line, 1, 21, 32,
+                            hfilter);
+  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8,
+                            fdata2 + 8, src_pixels_per_line, 1, 21, 32,
+                            hfilter);
+  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12,
+                            fdata2 + 12, src_pixels_per_line, 1, 21, 32,
+                            hfilter);
+
+  vfilter = vp9_six_tap_mmx[yoffset];
+  vp9_filter_block1dc_v6_mmx(fdata2 + 32, dst_ptr,      dst_pitch,
+                             32, 16, 16, 16, vfilter);
+  vp9_filter_block1dc_v6_mmx(fdata2 + 36, dst_ptr + 4,  dst_pitch,
+                             32, 16, 16, 16, vfilter);
+  vp9_filter_block1dc_v6_mmx(fdata2 + 40, dst_ptr + 8,  dst_pitch,
+                             32, 16, 16, 16, vfilter);
+  vp9_filter_block1dc_v6_mmx(fdata2 + 44, dst_ptr + 12, dst_pitch,
+                             32, 16, 16, 16, vfilter);
+}
+
+void vp9_sixtap_predict8x8_mmx(unsigned char  *src_ptr,
+                               int  src_pixels_per_line,
+                               int  xoffset,
+                               int  yoffset,
+                               unsigned char *dst_ptr,
+                               int  dst_pitch) {
+  /* Temp data buffer used in filtering */
+  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
+  const short *hfilter, *vfilter;
+#ifdef ANNOUNCE_FUNCTION
+  printf("vp9_sixtap_predict8x8_mmx\n");
+#endif
+
+  hfilter = vp9_six_tap_mmx[xoffset];
+  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
+                            fdata2,   src_pixels_per_line, 1, 13, 16,
+                            hfilter);
+  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
+                            fdata2 + 4, src_pixels_per_line, 1, 13, 16,
+                            hfilter);
+
+  vfilter = vp9_six_tap_mmx[yoffset];
+  vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr,     dst_pitch,
+                             16, 8, 8, 8, vfilter);
+  vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch,
+                             16, 8, 8, 8, vfilter);
+}
+
+void vp9_sixtap_predict8x4_mmx(unsigned char  *src_ptr,
+                               int  src_pixels_per_line,
+                               int  xoffset,
+                               int  yoffset,
+                               unsigned char *dst_ptr,
+                               int  dst_pitch) {
+  /* Temp data buffer used in filtering */
+  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
+  const short *hfilter, *vfilter;
+#ifdef ANNOUNCE_FUNCTION
+  printf("vp9_sixtap_predict8x4_mmx\n");
+#endif
+
+  hfilter = vp9_six_tap_mmx[xoffset];
+  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
+                            fdata2,   src_pixels_per_line, 1, 9, 16, hfilter);
+  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
+                            fdata2 + 4, src_pixels_per_line, 1, 9, 16, hfilter);
+
+  vfilter = vp9_six_tap_mmx[yoffset];
+  vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr,     dst_pitch,
+                             16, 8, 4, 8, vfilter);
+  vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch,
+                             16, 8, 4, 8, vfilter);
+}
+
+void vp9_bilinear_predict16x16_mmx(unsigned char  *src_ptr,
+                                   int  src_pixels_per_line,
+                                   int  xoffset,
+                                   int  yoffset,
+                                   unsigned char *dst_ptr,
+                                   int  dst_pitch) {
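+  /* composed of four 8x8 bilinear calls, one per quadrant, offsetting the
+   * source and destination by 8 pixels and 8 rows */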
+  vp9_bilinear_predict8x8_mmx(src_ptr,
+                              src_pixels_per_line, xoffset, yoffset,
+                              dst_ptr, dst_pitch);
+  vp9_bilinear_predict8x8_mmx(src_ptr + 8,
+                              src_pixels_per_line, xoffset, yoffset,
+                              dst_ptr + 8, dst_pitch);
+  vp9_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line,
+                              src_pixels_per_line, xoffset, yoffset,
+                              dst_ptr + dst_pitch * 8, dst_pitch);
+  vp9_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line + 8,
+                              src_pixels_per_line, xoffset, yoffset,
+                              dst_ptr + dst_pitch * 8 + 8, dst_pitch);
+}
+#endif
+
+#if HAVE_SSE2
+void vp9_sixtap_predict16x16_sse2(unsigned char  *src_ptr,
+                                  int  src_pixels_per_line,
+                                  int  xoffset,
+                                  int  yoffset,
+                                  unsigned char *dst_ptr,
+                                  int  dst_pitch) {
+  /* Temp data buffer used in filtering */
+  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24);
+  const short *hfilter, *vfilter;
+#ifdef ANNOUNCE_FUNCTION
+  printf("vp9_sixtap_predict16x16_sse2\n");
+#endif
+
+  if (xoffset) {
+    if (yoffset) {
+      hfilter = vp9_six_tap_mmx[xoffset];
+      vp9_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
+                                   src_pixels_per_line, 1, 21, 32, hfilter);
+      vfilter = vp9_six_tap_mmx[yoffset];
+      vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch,
+                                   32, 16, 16, dst_pitch, vfilter);
+    } else {
+      /* First-pass only */
+      hfilter = vp9_six_tap_mmx[xoffset];
+      vp9_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line,
+                                        dst_ptr, dst_pitch, 16, hfilter);
+    }
+  } else {
+    /* Second-pass only */
+    vfilter = vp9_six_tap_mmx[yoffset];
+    vp9_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
+                                 src_pixels_per_line, 21, 32);
+    vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch,
+                                 32, 16, 16, dst_pitch, vfilter);
+  }
+}
+
+void vp9_sixtap_predict8x8_sse2(unsigned char  *src_ptr,
+                                int  src_pixels_per_line,
+                                int  xoffset,
+                                int  yoffset,
+                                unsigned char *dst_ptr,
+                                int  dst_pitch) {
+  /* Temp data buffer used in filtering */
+  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
+  const short *hfilter, *vfilter;
+#ifdef ANNOUNCE_FUNCTION
+  printf("vp9_sixtap_predict8x8_sse2\n");
+#endif
+
+  if (xoffset) {
+    if (yoffset) {
+      hfilter = vp9_six_tap_mmx[xoffset];
+      vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
+                                  src_pixels_per_line, 1, 13, 16, hfilter);
+      vfilter = vp9_six_tap_mmx[yoffset];
+      vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch,
+                                  16, 8, 8, dst_pitch, vfilter);
+    } else {
+      /* First-pass only */
+      hfilter = vp9_six_tap_mmx[xoffset];
+      vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line,
+                                       dst_ptr, dst_pitch, 8, hfilter);
+    }
+  } else {
+    /* Second-pass only */
+    vfilter = vp9_six_tap_mmx[yoffset];
+    vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
+                                     src_pixels_per_line,
+                                     dst_ptr, dst_pitch, 8, vfilter);
+  }
+}
+
+void vp9_sixtap_predict8x4_sse2(unsigned char  *src_ptr,
+                                int  src_pixels_per_line,
+                                int  xoffset,
+                                int  yoffset,
+                                unsigned char *dst_ptr,
+                                int  dst_pitch) {
+  /* Temp data buffer used in filtering */
+  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
+  const short *hfilter, *vfilter;
+#ifdef ANNOUNCE_FUNCTION
+  printf("vp9_sixtap_predict8x4_sse2\n");
+#endif
+
+  if (xoffset) {
+    if (yoffset) {
+      hfilter = vp9_six_tap_mmx[xoffset];
+      vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
+                                  src_pixels_per_line, 1, 9, 16, hfilter);
+      vfilter = vp9_six_tap_mmx[yoffset];
+      vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch,
+                                  16, 8, 4, dst_pitch, vfilter);
+    } else {
+      /* First-pass only */
+      hfilter = vp9_six_tap_mmx[xoffset];
+      vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line,
+                                       dst_ptr, dst_pitch, 4, hfilter);
+    }
+  } else {
+    /* Second-pass only */
+    vfilter = vp9_six_tap_mmx[yoffset];
+    vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
+                                     src_pixels_per_line,
+                                     dst_ptr, dst_pitch, 4, vfilter);
+  }
+}
+#endif
+
+#if HAVE_SSSE3
+extern void vp9_filter_block1d8_h6_ssse3(unsigned char  *src_ptr,
+                                         unsigned int    src_pixels_per_line,
+                                         unsigned char  *output_ptr,
+                                         unsigned int    output_pitch,
+                                         unsigned int    output_height,
+                                         unsigned int    vp9_filter_index);
+
+extern void vp9_filter_block1d16_h6_ssse3(unsigned char  *src_ptr,
+                                          unsigned int    src_pixels_per_line,
+                                          unsigned char  *output_ptr,
+                                          unsigned int    output_pitch,
+                                          unsigned int    output_height,
+                                          unsigned int    vp9_filter_index);
+
+extern void vp9_filter_block1d16_v6_ssse3(unsigned char *src_ptr,
+                                          unsigned int   src_pitch,
+                                          unsigned char *output_ptr,
+                                          unsigned int   out_pitch,
+                                          unsigned int   output_height,
+                                          unsigned int   vp9_filter_index);
+
+extern void vp9_filter_block1d8_v6_ssse3(unsigned char *src_ptr,
+                                         unsigned int   src_pitch,
+                                         unsigned char *output_ptr,
+                                         unsigned int   out_pitch,
+                                         unsigned int   output_height,
+                                         unsigned int   vp9_filter_index);
+
+extern void vp9_filter_block1d4_h6_ssse3(unsigned char  *src_ptr,
+                                         unsigned int    src_pixels_per_line,
+                                         unsigned char  *output_ptr,
+                                         unsigned int    output_pitch,
+                                         unsigned int    output_height,
+                                         unsigned int    vp9_filter_index);
+
+extern void vp9_filter_block1d4_v6_ssse3(unsigned char *src_ptr,
+                                         unsigned int   src_pitch,
+                                         unsigned char *output_ptr,
+                                         unsigned int   out_pitch,
+                                         unsigned int   output_height,
+                                         unsigned int   vp9_filter_index);
+
+void vp9_sixtap_predict16x16_ssse3(unsigned char  *src_ptr,
+                                   int  src_pixels_per_line,
+                                   int  xoffset,
+                                   int  yoffset,
+                                   unsigned char *dst_ptr,
+                                   int  dst_pitch) {
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 24 * 24);
+#ifdef ANNOUNCE_FUNCTION
+  printf("vp9_sixtap_predict16x16_ssse3\n");
+#endif
+
+  if (xoffset) {
+    if (yoffset) {
+      vp9_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                    src_pixels_per_line,
+                                    fdata2, 16, 21, xoffset);
+      vp9_filter_block1d16_v6_ssse3(fdata2, 16, dst_ptr, dst_pitch,
+                                    16, yoffset);
+    } else {
+      /* First-pass only */
+      vp9_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line,
+                                    dst_ptr, dst_pitch, 16, xoffset);
+    }
+  } else {
+    /* Second-pass only */
+    vp9_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                  src_pixels_per_line,
+                                  dst_ptr, dst_pitch, 16, yoffset);
+  }
+}
+
+void vp9_sixtap_predict8x8_ssse3(unsigned char  *src_ptr,
+                                 int  src_pixels_per_line,
+                                 int  xoffset,
+                                 int  yoffset,
+                                 unsigned char *dst_ptr,
+                                 int  dst_pitch) {
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256);
+#ifdef ANNOUNCE_FUNCTION
+  printf("vp9_sixtap_predict8x8_ssse3\n");
+#endif
+
+  if (xoffset) {
+    if (yoffset) {
+      vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                   src_pixels_per_line, fdata2, 8, 13, xoffset);
+      vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 8, yoffset);
+    } else {
+      vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
+                                   dst_ptr, dst_pitch, 8, xoffset);
+    }
+  } else {
+    /* Second-pass only */
+    vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                 src_pixels_per_line,
+                                 dst_ptr, dst_pitch, 8, yoffset);
+  }
+}
+
+void vp9_sixtap_predict8x4_ssse3(unsigned char  *src_ptr,
+                                 int  src_pixels_per_line,
+                                 int  xoffset,
+                                 int  yoffset,
+                                 unsigned char *dst_ptr,
+                                 int  dst_pitch) {
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256);
+#ifdef ANNOUNCE_FUNCTION
+  printf("vp9_sixtap_predict8x4_ssse3\n");
+#endif
+
+  if (xoffset) {
+    if (yoffset) {
+      vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                   src_pixels_per_line, fdata2, 8, 9, xoffset);
+      vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 4, yoffset);
+    } else {
+      /* First-pass only */
+      vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
+                                   dst_ptr, dst_pitch, 4, xoffset);
+    }
+  } else {
+    /* Second-pass only */
+    vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                 src_pixels_per_line,
+                                 dst_ptr, dst_pitch, 4, yoffset);
+  }
+}
+
+void vp9_sixtap_predict4x4_ssse3(unsigned char  *src_ptr,
+                                 int   src_pixels_per_line,
+                                 int  xoffset,
+                                 int  yoffset,
+                                 unsigned char *dst_ptr,
+                                 int dst_pitch) {
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 4 * 9);
+#ifdef ANNOUNCE_FUNCTION
+  printf("vp9_sixtap_predict4x4_ssse3\n");
+#endif
+
+  if (xoffset) {
+    if (yoffset) {
+      vp9_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                   src_pixels_per_line, fdata2, 4, 9, xoffset);
+      vp9_filter_block1d4_v6_ssse3(fdata2, 4, dst_ptr, dst_pitch, 4, yoffset);
+    } else {
+      vp9_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line,
+                                   dst_ptr, dst_pitch, 4, xoffset);
+    }
+  } else {
+    vp9_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                 src_pixels_per_line,
+                                 dst_ptr, dst_pitch, 4, yoffset);
+  }
+}
+
+void vp9_filter_block1d16_v8_ssse3(const unsigned char *src_ptr,
+                                   const unsigned int src_pitch,
+                                   unsigned char *output_ptr,
+                                   unsigned int out_pitch,
+                                   unsigned int output_height,
+                                   const short *filter);
+
+void vp9_filter_block1d16_h8_ssse3(const unsigned char *src_ptr,
+                                   const unsigned int src_pitch,
+                                   unsigned char *output_ptr,
+                                   unsigned int out_pitch,
+                                   unsigned int output_height,
+                                   const short *filter);
+
+void vp9_filter_block2d_16x16_8_ssse3(const unsigned char *src_ptr,
+                                      const unsigned int src_stride,
+                                      const short *hfilter_aligned16,
+                                      const short *vfilter_aligned16,
+                                      unsigned char *dst_ptr,
+                                      unsigned int dst_stride) {
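+  /* a tap-3 value of 128 marks the full-pel (pass-through) filter, so the
+   * corresponding pass can be skipped */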
+  if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) {
+    DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
+
+    vp9_filter_block1d16_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
+                                  fdata2, 16, 23, hfilter_aligned16);
+    vp9_filter_block1d16_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 16,
+                                  vfilter_aligned16);
+  } else {
+    if (hfilter_aligned16[3] != 128) {
+      vp9_filter_block1d16_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride,
+                                    16, hfilter_aligned16);
+    } else {
+      vp9_filter_block1d16_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
+                                    dst_ptr, dst_stride, 16, vfilter_aligned16);
+    }
+  }
+}
+
+void vp9_filter_block1d8_v8_ssse3(const unsigned char *src_ptr,
+                                  const unsigned int src_pitch,
+                                  unsigned char *output_ptr,
+                                  unsigned int out_pitch,
+                                  unsigned int output_height,
+                                  const short *filter);
+
+void vp9_filter_block1d8_h8_ssse3(const unsigned char *src_ptr,
+                                  const unsigned int src_pitch,
+                                  unsigned char *output_ptr,
+                                  unsigned int out_pitch,
+                                  unsigned int output_height,
+                                  const short *filter);
+
+void vp9_filter_block2d_8x8_8_ssse3(const unsigned char *src_ptr,
+                                    const unsigned int src_stride,
+                                    const short *hfilter_aligned16,
+                                    const short *vfilter_aligned16,
+                                    unsigned char *dst_ptr,
+                                    unsigned int dst_stride) {
+  if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) {
+    DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
+
+    vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
+                                 fdata2, 16, 15, hfilter_aligned16);
+    vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 8,
+                                 vfilter_aligned16);
+  } else {
+    if (hfilter_aligned16[3] != 128) {
+      vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 8,
+                                   hfilter_aligned16);
+    } else {
+      vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
+                                   dst_ptr, dst_stride, 8, vfilter_aligned16);
+    }
+  }
+}
+
+void vp9_filter_block2d_8x4_8_ssse3(const unsigned char *src_ptr,
+                                    const unsigned int src_stride,
+                                    const short *hfilter_aligned16,
+                                    const short *vfilter_aligned16,
+                                    unsigned char *dst_ptr,
+                                    unsigned int dst_stride) {
+  if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) {
+    DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
+
+    vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
+                                 fdata2, 16, 11, hfilter_aligned16);
+    vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 4,
+                                 vfilter_aligned16);
+  } else {
+    if (hfilter_aligned16[3] != 128) {
+      vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 4,
+                                   hfilter_aligned16);
+    } else {
+      vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
+                                   dst_ptr, dst_stride, 4, vfilter_aligned16);
+    }
+  }
+}
+#endif
--- /dev/null
+++ b/vp9/common/x86/x86_systemdependent.c
@@ -1,0 +1,108 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vpx_ports/x86.h"
+#include "vp9/common/subpixel.h"
+#include "vp9/common/loopfilter.h"
+#include "vp9/common/idct.h"
+#include "vp9/common/pragmas.h"
+#include "vp9/common/onyxc_int.h"
+
+void vp9_arch_x86_common_init(VP9_COMMON *ctx) {
+#if CONFIG_RUNTIME_CPU_DETECT
+  VP9_COMMON_RTCD *rtcd = &ctx->rtcd;
+  int flags = x86_simd_caps();
+
+  /* Note:
+   *
+   * This platform can be built without runtime CPU detection as well. If
+   * you modify any of the function mappings present in this file, be sure
+   * to also update them in the static mappings (<arch>/filename_<arch>.h).
+   */
+
+  /* Override default functions with fastest ones for this CPU. */
+#if HAVE_MMX
+// The commented-out functions below need to be rewritten for vpx.
+  if (flags & HAS_MMX) {
+    rtcd->idct.idct1        = vp9_short_idct4x4llm_1_mmx;
+    rtcd->idct.idct16       = vp9_short_idct4x4llm_mmx;
+    rtcd->idct.idct1_scalar_add = vp9_dc_only_idct_add_mmx;
+    // rtcd->idct.iwalsh16     = vp9_short_inv_walsh4x4_mmx;
+    // rtcd->idct.iwalsh1     = vp9_short_inv_walsh4x4_1_mmx;
+
+    /* Disabled due to unsupported enhanced interpolation/high_prec mv
+    rtcd->subpix.sixtap16x16   = vp9_sixtap_predict16x16_mmx;
+    rtcd->subpix.sixtap8x8     = vp9_sixtap_predict8x8_mmx;
+    rtcd->subpix.sixtap8x4     = vp9_sixtap_predict8x4_mmx;
+    rtcd->subpix.sixtap4x4     = vp9_sixtap_predict4x4_mmx;
+    */
+    rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_mmx;
+    rtcd->subpix.bilinear8x8   = vp9_bilinear_predict8x8_mmx;
+    rtcd->subpix.bilinear8x4   = vp9_bilinear_predict8x4_mmx;
+    rtcd->subpix.bilinear4x4   = vp9_bilinear_predict4x4_mmx;
+
+#if CONFIG_POSTPROC
+    rtcd->postproc.down        = vp9_mbpost_proc_down_mmx;
+    /*rtcd->postproc.across      = vp9_mbpost_proc_across_ip_c;*/
+    rtcd->postproc.downacross  = vp9_post_proc_down_and_across_mmx;
+    rtcd->postproc.addnoise    = vp9_plane_add_noise_mmx;
+#endif
+  }
+
+#endif
+#if HAVE_SSE2
+
+  if (flags & HAS_SSE2) {
+    // rtcd->idct.iwalsh16     = vp9_short_inv_walsh4x4_sse2;
+
+    /* Disabled due to unsupported enhanced interpolation/high_prec mv
+    rtcd->subpix.sixtap16x16   = vp9_sixtap_predict16x16_sse2;
+    rtcd->subpix.sixtap8x8     = vp9_sixtap_predict8x8_sse2;
+    rtcd->subpix.sixtap8x4     = vp9_sixtap_predict8x4_sse2;
+    */
+    rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_sse2;
+    rtcd->subpix.bilinear8x8   = vp9_bilinear_predict8x8_sse2;
+
+#if CONFIG_POSTPROC
+    rtcd->postproc.down        = vp9_mbpost_proc_down_xmm;
+    rtcd->postproc.across      = vp9_mbpost_proc_across_ip_xmm;
+    rtcd->postproc.downacross  = vp9_post_proc_down_and_across_xmm;
+    rtcd->postproc.addnoise    = vp9_plane_add_noise_wmt;
+#endif
+  }
+
+#endif
+
+#if HAVE_SSSE3
+
+  if (flags & HAS_SSSE3) {
+    /* Disabled due to unsupported enhanced interpolation/high_prec mv
+    rtcd->subpix.sixtap16x16   = vp9_sixtap_predict16x16_ssse3;
+    rtcd->subpix.sixtap8x8     = vp9_sixtap_predict8x8_ssse3;
+    rtcd->subpix.sixtap8x4     = vp9_sixtap_predict8x4_ssse3;
+    rtcd->subpix.sixtap4x4     = vp9_sixtap_predict4x4_ssse3;
+    rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_ssse3;
+    rtcd->subpix.bilinear8x8   = vp9_bilinear_predict8x8_ssse3;
+    */
+
+    /* these are disabled because of unsupported diagonal pred modes
+    rtcd->recon.build_intra_predictors_mbuv =
+      vp9_build_intra_predictors_mbuv_ssse3;
+    rtcd->recon.build_intra_predictors_mbuv_s =
+      vp9_build_intra_predictors_mbuv_s_ssse3;
+    */
+  }
+#endif
+
+#endif
+}
--- /dev/null
+++ b/vp9/decoder/arm/armv6/dequant_dc_idct_v6.asm
@@ -1,0 +1,218 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT |vp8_dequant_dc_idct_add_v6|
+
+    AREA |.text|, CODE, READONLY
+
+;void vp8_dequant_dc_idct_add_v6(short *input, short *dq, unsigned char *pred,
+;                                unsigned char *dest, int pitch, int stride, int Dc)
+; r0 = input
+; r1 = dq
+; r2 = pred
+; r3 = dest
+; sp + 36 = pitch   (sp + 40 after the extra "sub sp, sp, #4" below)
+; sp + 40 = stride  (sp + 44 after)
+; sp + 44 = Dc      (sp + 48 after)
+
+
+|vp8_dequant_dc_idct_add_v6| PROC
+    stmdb   sp!, {r4-r11, lr}
+
+    ldr     r6, [sp, #44]
+
+    ldr     r4, [r0]                ;input
+    ldr     r5, [r1], #4            ;dq
+
+    sub     sp, sp, #4
+    str     r3, [sp]
+
+    smultt  r7, r4, r5
+
+    ldr     r4, [r0, #4]            ;input
+    ldr     r5, [r1], #4            ;dq
+
+    strh    r6, [r0], #2
+    strh    r7, [r0], #2
+
+    smulbb  r6, r4, r5
+    smultt  r7, r4, r5
+
+    ldr     r4, [r0, #4]            ;input
+    ldr     r5, [r1], #4            ;dq
+
+    strh    r6, [r0], #2
+    strh    r7, [r0], #2
+
+    mov     r12, #3
+
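+; four coefficients are handled above (the DC term is replaced by the Dc
+; argument rather than dequantized); each trip through the loop below
+; dequantizes four more, covering all 16 coefficients of the 4x4 block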
+vp8_dequant_dc_add_loop
+    smulbb  r6, r4, r5
+    smultt  r7, r4, r5
+
+    ldr     r4, [r0, #4]            ;input
+    ldr     r5, [r1], #4            ;dq
+
+    strh    r6, [r0], #2
+    strh    r7, [r0], #2
+
+    smulbb  r6, r4, r5
+    smultt  r7, r4, r5
+
+    subs    r12, r12, #1
+
+    ldrne   r4, [r0, #4]
+    ldrne   r5, [r1], #4
+
+    strh    r6, [r0], #2
+    strh    r7, [r0], #2
+
+    bne     vp8_dequant_dc_add_loop
+
+    sub     r0, r0, #32
+    mov     r1, r0
+
+; short_idct4x4llm_v6_dual
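+; dual 4x4 inverse DCT: loop1 transforms two columns per pass, loop2
+; transforms two rows per pass, adds the prediction and stores to dest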
+    ldr     r3, cospi8sqrt2minus1
+    ldr     r4, sinpi8sqrt2
+    ldr     r6, [r0, #8]
+    mov     r5, #2
+vp8_dequant_dc_idct_loop1_v6
+    ldr     r12, [r0, #24]
+    ldr     r14, [r0, #16]
+    smulwt  r9, r3, r6
+    smulwb  r7, r3, r6
+    smulwt  r10, r4, r6
+    smulwb  r8, r4, r6
+    pkhbt   r7, r7, r9, lsl #16
+    smulwt  r11, r3, r12
+    pkhbt   r8, r8, r10, lsl #16
+    uadd16  r6, r6, r7
+    smulwt  r7, r4, r12
+    smulwb  r9, r3, r12
+    smulwb  r10, r4, r12
+    subs    r5, r5, #1
+    pkhbt   r9, r9, r11, lsl #16
+    ldr     r11, [r0], #4
+    pkhbt   r10, r10, r7, lsl #16
+    uadd16  r7, r12, r9
+    usub16  r7, r8, r7
+    uadd16  r6, r6, r10
+    uadd16  r10, r11, r14
+    usub16  r8, r11, r14
+    uadd16  r9, r10, r6
+    usub16  r10, r10, r6
+    uadd16  r6, r8, r7
+    usub16  r7, r8, r7
+    str     r6, [r1, #8]
+    ldrne   r6, [r0, #8]
+    str     r7, [r1, #16]
+    str     r10, [r1, #24]
+    str     r9, [r1], #4
+    bne     vp8_dequant_dc_idct_loop1_v6
+
+    mov     r5, #2
+    sub     r0, r1, #8
+vp8_dequant_dc_idct_loop2_v6
+    ldr     r6, [r0], #4
+    ldr     r7, [r0], #4
+    ldr     r8, [r0], #4
+    ldr     r9, [r0], #4
+    smulwt  r1, r3, r6
+    smulwt  r12, r4, r6
+    smulwt  lr, r3, r8
+    smulwt  r10, r4, r8
+    pkhbt   r11, r8, r6, lsl #16
+    pkhbt   r1, lr, r1, lsl #16
+    pkhbt   r12, r10, r12, lsl #16
+    pkhtb   r6, r6, r8, asr #16
+    uadd16  r6, r1, r6
+    pkhbt   lr, r9, r7, lsl #16
+    uadd16  r10, r11, lr
+    usub16  lr, r11, lr
+    pkhtb   r8, r7, r9, asr #16
+    subs    r5, r5, #1
+    smulwt  r1, r3, r8
+    smulwb  r7, r3, r8
+    smulwt  r11, r4, r8
+    smulwb  r9, r4, r8
+    pkhbt   r1, r7, r1, lsl #16
+    uadd16  r8, r1, r8
+    pkhbt   r11, r9, r11, lsl #16
+    usub16  r1, r12, r8
+    uadd16  r8, r11, r6
+    ldr     r9, c0x00040004
+    ldr     r12, [sp, #40]
+    uadd16  r6, r10, r8
+    usub16  r7, r10, r8
+    uadd16  r7, r7, r9
+    uadd16  r6, r6, r9
+    uadd16  r10, r14, r1
+    usub16  r1, r14, r1
+    uadd16  r10, r10, r9
+    uadd16  r1, r1, r9
+    ldr     r11, [r2], r12
+    mov     r8, r7, asr #3
+    pkhtb   r9, r8, r10, asr #19
+    mov     r8, r1, asr #3
+    pkhtb   r8, r8, r6, asr #19
+    uxtb16  lr, r11, ror #8
+    qadd16  r9, r9, lr
+    uxtb16  lr, r11
+    qadd16  r8, r8, lr
+    usat16  r9, #8, r9
+    usat16  r8, #8, r8
+    orr     r9, r8, r9, lsl #8
+    ldr     r11, [r2], r12
+    ldr     lr, [sp]
+    ldr     r12, [sp, #44]
+    mov     r7, r7, lsl #16
+    mov     r1, r1, lsl #16
+    mov     r10, r10, lsl #16
+    mov     r6, r6, lsl #16
+    mov     r7, r7, asr #3
+    pkhtb   r7, r7, r10, asr #19
+    mov     r1, r1, asr #3
+    pkhtb   r1, r1, r6, asr #19
+    uxtb16  r8, r11, ror #8
+    qadd16  r7, r7, r8
+    uxtb16  r8, r11
+    qadd16  r1, r1, r8
+    usat16  r7, #8, r7
+    usat16  r1, #8, r1
+    orr     r1, r1, r7, lsl #8
+    str     r9, [lr], r12
+    str     r1, [lr], r12
+    str     lr, [sp]
+    bne     vp8_dequant_dc_idct_loop2_v6
+
+; vpx_memset
+    sub     r0, r0, #32
+    add     sp, sp, #4
+
+    mov     r12, #0
+    str     r12, [r0]
+    str     r12, [r0, #4]
+    str     r12, [r0, #8]
+    str     r12, [r0, #12]
+    str     r12, [r0, #16]
+    str     r12, [r0, #20]
+    str     r12, [r0, #24]
+    str     r12, [r0, #28]
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP    ; |vp8_dequant_dc_idct_add_v6|
+
+; Constant Pool
+cospi8sqrt2minus1 DCD 0x00004E7B
+sinpi8sqrt2       DCD 0x00008A8C
+c0x00040004       DCD 0x00040004
+
+    END
--- /dev/null
+++ b/vp9/decoder/arm/armv6/dequant_idct_v6.asm
@@ -1,0 +1,196 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+    EXPORT |vp8_dequant_idct_add_v6|
+
+    AREA |.text|, CODE, READONLY
+;void vp8_dequant_idct_add_v6(short *input, short *dq, unsigned char *pred,
+;                             unsigned char *dest, int pitch, int stride)
+; r0 = input
+; r1 = dq
+; r2 = pred
+; r3 = dest
+; sp + 36 = pitch   (sp + 40 after the extra "sub sp, sp, #4" below)
+; sp + 40 = stride  (sp + 44 after)
+
+
+|vp8_dequant_idct_add_v6| PROC
+    stmdb   sp!, {r4-r11, lr}
+
+    ldr     r4, [r0]                ;input
+    ldr     r5, [r1], #4            ;dq
+
+    sub     sp, sp, #4
+    str     r3, [sp]
+
+    mov     r12, #4
+
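+; each trip through the loop dequantizes four coefficients; four trips
+; cover the 16 coefficients of the 4x4 block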
+vp8_dequant_add_loop
+    smulbb  r6, r4, r5
+    smultt  r7, r4, r5
+
+    ldr     r4, [r0, #4]            ;input
+    ldr     r5, [r1], #4            ;dq
+
+    strh    r6, [r0], #2
+    strh    r7, [r0], #2
+
+    smulbb  r6, r4, r5
+    smultt  r7, r4, r5
+
+    subs    r12, r12, #1
+
+    ldrne   r4, [r0, #4]
+    ldrne   r5, [r1], #4
+
+    strh    r6, [r0], #2
+    strh    r7, [r0], #2
+
+    bne     vp8_dequant_add_loop
+
+    sub     r0, r0, #32
+    mov     r1, r0
+
+; short_idct4x4llm_v6_dual
+    ldr     r3, cospi8sqrt2minus1
+    ldr     r4, sinpi8sqrt2
+    ldr     r6, [r0, #8]
+    mov     r5, #2
+vp8_dequant_idct_loop1_v6
+    ldr     r12, [r0, #24]
+    ldr     r14, [r0, #16]
+    smulwt  r9, r3, r6
+    smulwb  r7, r3, r6
+    smulwt  r10, r4, r6
+    smulwb  r8, r4, r6
+    pkhbt   r7, r7, r9, lsl #16
+    smulwt  r11, r3, r12
+    pkhbt   r8, r8, r10, lsl #16
+    uadd16  r6, r6, r7
+    smulwt  r7, r4, r12
+    smulwb  r9, r3, r12
+    smulwb  r10, r4, r12
+    subs    r5, r5, #1
+    pkhbt   r9, r9, r11, lsl #16
+    ldr     r11, [r0], #4
+    pkhbt   r10, r10, r7, lsl #16
+    uadd16  r7, r12, r9
+    usub16  r7, r8, r7
+    uadd16  r6, r6, r10
+    uadd16  r10, r11, r14
+    usub16  r8, r11, r14
+    uadd16  r9, r10, r6
+    usub16  r10, r10, r6
+    uadd16  r6, r8, r7
+    usub16  r7, r8, r7
+    str     r6, [r1, #8]
+    ldrne   r6, [r0, #8]
+    str     r7, [r1, #16]
+    str     r10, [r1, #24]
+    str     r9, [r1], #4
+    bne     vp8_dequant_idct_loop1_v6
+
+    mov     r5, #2
+    sub     r0, r1, #8
+vp8_dequant_idct_loop2_v6
+    ldr     r6, [r0], #4
+    ldr     r7, [r0], #4
+    ldr     r8, [r0], #4
+    ldr     r9, [r0], #4
+    smulwt  r1, r3, r6
+    smulwt  r12, r4, r6
+    smulwt  lr, r3, r8
+    smulwt  r10, r4, r8
+    pkhbt   r11, r8, r6, lsl #16
+    pkhbt   r1, lr, r1, lsl #16
+    pkhbt   r12, r10, r12, lsl #16
+    pkhtb   r6, r6, r8, asr #16
+    uadd16  r6, r1, r6
+    pkhbt   lr, r9, r7, lsl #16
+    uadd16  r10, r11, lr
+    usub16  lr, r11, lr
+    pkhtb   r8, r7, r9, asr #16
+    subs    r5, r5, #1
+    smulwt  r1, r3, r8
+    smulwb  r7, r3, r8
+    smulwt  r11, r4, r8
+    smulwb  r9, r4, r8
+    pkhbt   r1, r7, r1, lsl #16
+    uadd16  r8, r1, r8
+    pkhbt   r11, r9, r11, lsl #16
+    usub16  r1, r12, r8
+    uadd16  r8, r11, r6
+    ldr     r9, c0x00040004
+    ldr     r12, [sp, #40]
+    uadd16  r6, r10, r8
+    usub16  r7, r10, r8
+    uadd16  r7, r7, r9
+    uadd16  r6, r6, r9
+    uadd16  r10, r14, r1
+    usub16  r1, r14, r1
+    uadd16  r10, r10, r9
+    uadd16  r1, r1, r9
+    ldr     r11, [r2], r12
+    mov     r8, r7, asr #3
+    pkhtb   r9, r8, r10, asr #19
+    mov     r8, r1, asr #3
+    pkhtb   r8, r8, r6, asr #19
+    uxtb16  lr, r11, ror #8
+    qadd16  r9, r9, lr
+    uxtb16  lr, r11
+    qadd16  r8, r8, lr
+    usat16  r9, #8, r9
+    usat16  r8, #8, r8
+    orr     r9, r8, r9, lsl #8
+    ldr     r11, [r2], r12
+    ldr     lr, [sp]
+    ldr     r12, [sp, #44]
+    mov     r7, r7, lsl #16
+    mov     r1, r1, lsl #16
+    mov     r10, r10, lsl #16
+    mov     r6, r6, lsl #16
+    mov     r7, r7, asr #3
+    pkhtb   r7, r7, r10, asr #19
+    mov     r1, r1, asr #3
+    pkhtb   r1, r1, r6, asr #19
+    uxtb16  r8, r11, ror #8
+    qadd16  r7, r7, r8
+    uxtb16  r8, r11
+    qadd16  r1, r1, r8
+    usat16  r7, #8, r7
+    usat16  r1, #8, r1
+    orr     r1, r1, r7, lsl #8
+    str     r9, [lr], r12
+    str     r1, [lr], r12
+    str     lr, [sp]
+    bne     vp8_dequant_idct_loop2_v6
+
+; vpx_memset(input, 0, 32) -- clear the coefficient block
+    sub     r0, r0, #32
+    add     sp, sp, #4
+
+    mov     r12, #0
+    str     r12, [r0]
+    str     r12, [r0, #4]
+    str     r12, [r0, #8]
+    str     r12, [r0, #12]
+    str     r12, [r0, #16]
+    str     r12, [r0, #20]
+    str     r12, [r0, #24]
+    str     r12, [r0, #28]
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP    ; |vp8_dequant_idct_add_v6|
+
+; Constant Pool
+cospi8sqrt2minus1 DCD 0x00004E7B
+sinpi8sqrt2       DCD 0x00008A8C
+c0x00040004       DCD 0x00040004
+
+    END
--- /dev/null
+++ b/vp9/decoder/arm/armv6/dequantize_v6.asm
@@ -1,0 +1,69 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_dequantize_b_loop_v6|
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+;-------------------------------
+;void   vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
+; r0    short *Q,
+; r1    short *DQC
+; r2    short *DQ
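+;
+; computes DQ[i] = Q[i] * DQC[i] for all 16 coefficients, eight per loop
+; iteration.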
+|vp8_dequantize_b_loop_v6| PROC
+    stmdb   sp!, {r4-r9, lr}
+
+    ldr     r3, [r0]                ;load Q
+    ldr     r4, [r1]                ;load DQC
+    ldr     r5, [r0, #4]
+    ldr     r6, [r1, #4]
+
+    mov     r12, #2                 ;loop counter
+
+dequant_loop
+    smulbb  r7, r3, r4              ;multiply
+    smultt  r8, r3, r4
+    smulbb  r9, r5, r6
+    smultt  lr, r5, r6
+
+    ldr     r3, [r0, #8]
+    ldr     r4, [r1, #8]
+    ldr     r5, [r0, #12]
+    ldr     r6, [r1, #12]
+
+    strh    r7, [r2], #2            ;store result
+    smulbb  r7, r3, r4              ;multiply
+    strh    r8, [r2], #2
+    smultt  r8, r3, r4
+    strh    r9, [r2], #2
+    smulbb  r9, r5, r6
+    strh    lr, [r2], #2
+    smultt  lr, r5, r6
+
+    subs    r12, r12, #1
+
+    add     r0, r0, #16
+    add     r1, r1, #16
+
+    ldrne       r3, [r0]
+    strh    r7, [r2], #2            ;store result
+    ldrne       r4, [r1]
+    strh    r8, [r2], #2
+    ldrne       r5, [r0, #4]
+    strh    r9, [r2], #2
+    ldrne       r6, [r1, #4]
+    strh    lr, [r2], #2
+
+    bne     dequant_loop
+
+    ldmia   sp!, {r4-r9, pc}
+    ENDP    ;|vp8_dequantize_b_loop_v6|
+
+    END
--- /dev/null
+++ b/vp9/decoder/arm/armv6/idct_blk_v6.c
@@ -1,0 +1,136 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/config.h"
+#include "vp9/common/idct.h"
+#include "vp9/decoder/dequantize.h"
+
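+/* An eob (end of block) value of 0 or 1 means that at most the DC
+ * coefficient of the 4x4 block is non-zero, so the cheaper DC-only
+ * IDCT path can be taken.
+ */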
+void vp8_dequant_dc_idct_add_y_block_v6
+(short *q, short *dq, unsigned char *pre,
+ unsigned char *dst, int stride, char *eobs, short *dc) {
+  int i;
+
+  for (i = 0; i < 4; i++) {
+    if (eobs[0] > 1)
+      vp8_dequant_dc_idct_add_v6(q, dq, pre, dst, 16, stride, dc[0]);
+    else
+      vp8_dc_only_idct_add_v6(dc[0], pre, dst, 16, stride);
+
+    if (eobs[1] > 1)
+      vp8_dequant_dc_idct_add_v6(q + 16, dq, pre + 4, dst + 4, 16, stride, dc[1]);
+    else
+      vp8_dc_only_idct_add_v6(dc[1], pre + 4, dst + 4, 16, stride);
+
+    if (eobs[2] > 1)
+      vp8_dequant_dc_idct_add_v6(q + 32, dq, pre + 8, dst + 8, 16, stride, dc[2]);
+    else
+      vp8_dc_only_idct_add_v6(dc[2], pre + 8, dst + 8, 16, stride);
+
+    if (eobs[3] > 1)
+      vp8_dequant_dc_idct_add_v6(q + 48, dq, pre + 12, dst + 12, 16, stride, dc[3]);
+    else
+      vp8_dc_only_idct_add_v6(dc[3], pre + 12, dst + 12, 16, stride);
+
+    q    += 64;
+    dc   += 4;
+    pre  += 64;
+    dst  += 4 * stride;
+    eobs += 4;
+  }
+}
+
+void vp8_dequant_idct_add_y_block_v6
+(short *q, short *dq, unsigned char *pre,
+ unsigned char *dst, int stride, char *eobs) {
+  int i;
+
+  for (i = 0; i < 4; i++) {
+    if (eobs[0] > 1)
+      vp8_dequant_idct_add_v6(q, dq, pre, dst, 16, stride);
+    else {
+      vp8_dc_only_idct_add_v6(q[0]*dq[0], pre, dst, 16, stride);
+      ((int *)q)[0] = 0;
+    }
+
+    if (eobs[1] > 1)
+      vp8_dequant_idct_add_v6(q + 16, dq, pre + 4, dst + 4, 16, stride);
+    else {
+      vp8_dc_only_idct_add_v6(q[16]*dq[0], pre + 4, dst + 4, 16, stride);
+      ((int *)(q + 16))[0] = 0;
+    }
+
+    if (eobs[2] > 1)
+      vp8_dequant_idct_add_v6(q + 32, dq, pre + 8, dst + 8, 16, stride);
+    else {
+      vp8_dc_only_idct_add_v6(q[32]*dq[0], pre + 8, dst + 8, 16, stride);
+      ((int *)(q + 32))[0] = 0;
+    }
+
+    if (eobs[3] > 1)
+      vp8_dequant_idct_add_v6(q + 48, dq, pre + 12, dst + 12, 16, stride);
+    else {
+      vp8_dc_only_idct_add_v6(q[48]*dq[0], pre + 12, dst + 12, 16, stride);
+      ((int *)(q + 48))[0] = 0;
+    }
+
+    q    += 64;
+    pre  += 64;
+    dst  += 4 * stride;
+    eobs += 4;
+  }
+}
+
+void vp8_dequant_idct_add_uv_block_v6
+(short *q, short *dq, unsigned char *pre,
+ unsigned char *dstu, unsigned char *dstv, int stride, char *eobs) {
+  int i;
+
+  for (i = 0; i < 2; i++) {
+    if (eobs[0] > 1)
+      vp8_dequant_idct_add_v6(q, dq, pre, dstu, 8, stride);
+    else {
+      vp8_dc_only_idct_add_v6(q[0]*dq[0], pre, dstu, 8, stride);
+      ((int *)q)[0] = 0;
+    }
+
+    if (eobs[1] > 1)
+      vp8_dequant_idct_add_v6(q + 16, dq, pre + 4, dstu + 4, 8, stride);
+    else {
+      vp8_dc_only_idct_add_v6(q[16]*dq[0], pre + 4, dstu + 4, 8, stride);
+      ((int *)(q + 16))[0] = 0;
+    }
+
+    q    += 32;
+    pre  += 32;
+    dstu += 4 * stride;
+    eobs += 2;
+  }
+
+  for (i = 0; i < 2; i++) {
+    if (eobs[0] > 1)
+      vp8_dequant_idct_add_v6(q, dq, pre, dstv, 8, stride);
+    else {
+      vp8_dc_only_idct_add_v6(q[0]*dq[0], pre, dstv, 8, stride);
+      ((int *)q)[0] = 0;
+    }
+
+    if (eobs[1] > 1)
+      vp8_dequant_idct_add_v6(q + 16, dq, pre + 4, dstv + 4, 8, stride);
+    else {
+      vp8_dc_only_idct_add_v6(q[16]*dq[0], pre + 4, dstv + 4, 8, stride);
+      ((int *)(q + 16))[0] = 0;
+    }
+
+    q    += 32;
+    pre  += 32;
+    dstv += 4 * stride;
+    eobs += 2;
+  }
+}
--- /dev/null
+++ b/vp9/decoder/arm/dequantize_arm.c
@@ -1,0 +1,44 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vp9/decoder/dequantize.h"
+#include "vp9/common/idct.h"
+#include "vpx_mem/vpx_mem.h"
+
+#if HAVE_ARMV7
+extern void vp9_dequantize_b_loop_neon(short *Q, short *DQC, short *DQ);
+#endif
+
+#if HAVE_ARMV6
+extern void vp9_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
+#endif
+
+#if HAVE_ARMV7
+
+void vp9_dequantize_b_neon(BLOCKD *d) {
+  short *DQ  = d->dqcoeff;
+  short *Q   = d->qcoeff;
+  short *DQC = d->dequant;
+
+  vp9_dequantize_b_loop_neon(Q, DQC, DQ);
+}
+#endif
+
+#if HAVE_ARMV6
+void vp9_dequantize_b_v6(BLOCKD *d) {
+  short *DQ  = d->dqcoeff;
+  short *Q   = d->qcoeff;
+  short *DQC = d->dequant;
+
+  vp9_dequantize_b_loop_v6(Q, DQC, DQ);
+}
+#endif
--- /dev/null
+++ b/vp9/decoder/arm/neon/dequant_idct_neon.asm
@@ -1,0 +1,129 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_dequant_idct_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;void vp8_dequant_idct_add_neon(short *input, short *dq, unsigned char *pred,
+;                               unsigned char *dest, int pitch, int stride)
+; r0    short *input,
+; r1    short *dq,
+; r2    unsigned char *pred
+; r3    unsigned char *dest
+; sp    int pitch
+; sp+4  int stride
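+;
+; dequantizes the 4x4 coefficient block, performs the inverse DCT, adds
+; the 4x4 prediction (read with pitch) and writes the result to dest
+; (written with stride); the coefficient block is zeroed along the way.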
+
+|vp8_dequant_idct_add_neon| PROC
+    vld1.16         {q3, q4}, [r0]
+    vld1.16         {q5, q6}, [r1]
+    ldr             r1, [sp]                ; pitch
+    vld1.32         {d14[0]}, [r2], r1
+    vld1.32         {d14[1]}, [r2], r1
+    vld1.32         {d15[0]}, [r2], r1
+    vld1.32         {d15[1]}, [r2]
+
+    ldr             r1, [sp, #4]            ; stride
+
+    adr             r12, cospi8sqrt2minus1  ; pointer to the first constant
+
+    vmul.i16        q1, q3, q5              ;input for short_idct4x4llm_neon
+    vmul.i16        q2, q4, q6
+
+;|short_idct4x4llm_neon| PROC
+    vld1.16         {d0}, [r12]
+    vswp            d3, d4                  ;q2(vp[4] vp[12])
+
+    vqdmulh.s16     q3, q2, d0[2]
+    vqdmulh.s16     q4, q2, d0[0]
+
+    vqadd.s16       d12, d2, d3             ;a1
+    vqsub.s16       d13, d2, d3             ;b1
+
+    vshr.s16        q3, q3, #1
+    vshr.s16        q4, q4, #1
+
+    vqadd.s16       q3, q3, q2
+    vqadd.s16       q4, q4, q2
+
+    vqsub.s16       d10, d6, d9             ;c1
+    vqadd.s16       d11, d7, d8             ;d1
+
+    vqadd.s16       d2, d12, d11
+    vqadd.s16       d3, d13, d10
+    vqsub.s16       d4, d13, d10
+    vqsub.s16       d5, d12, d11
+
+    vtrn.32         d2, d4
+    vtrn.32         d3, d5
+    vtrn.16         d2, d3
+    vtrn.16         d4, d5
+
+; memset(input, 0, 32) -- 32 bytes
+    vmov.i16        q14, #0
+
+    vswp            d3, d4
+    vqdmulh.s16     q3, q2, d0[2]
+    vqdmulh.s16     q4, q2, d0[0]
+
+    vqadd.s16       d12, d2, d3             ;a1
+    vqsub.s16       d13, d2, d3             ;b1
+
+    vmov            q15, q14
+
+    vshr.s16        q3, q3, #1
+    vshr.s16        q4, q4, #1
+
+    vqadd.s16       q3, q3, q2
+    vqadd.s16       q4, q4, q2
+
+    vqsub.s16       d10, d6, d9             ;c1
+    vqadd.s16       d11, d7, d8             ;d1
+
+    vqadd.s16       d2, d12, d11
+    vqadd.s16       d3, d13, d10
+    vqsub.s16       d4, d13, d10
+    vqsub.s16       d5, d12, d11
+
+    vst1.16         {q14, q15}, [r0]
+
+    vrshr.s16       d2, d2, #3
+    vrshr.s16       d3, d3, #3
+    vrshr.s16       d4, d4, #3
+    vrshr.s16       d5, d5, #3
+
+    vtrn.32         d2, d4
+    vtrn.32         d3, d5
+    vtrn.16         d2, d3
+    vtrn.16         d4, d5
+
+    vaddw.u8        q1, q1, d14
+    vaddw.u8        q2, q2, d15
+
+    vqmovun.s16     d0, q1
+    vqmovun.s16     d1, q2
+
+    vst1.32         {d0[0]}, [r3], r1
+    vst1.32         {d0[1]}, [r3], r1
+    vst1.32         {d1[0]}, [r3], r1
+    vst1.32         {d1[1]}, [r3]
+
+    bx             lr
+
+    ENDP           ; |vp8_dequant_idct_add_neon|
+
+; Constant Pool
+cospi8sqrt2minus1 DCD 0x4e7b4e7b
+sinpi8sqrt2       DCD 0x8a8c8a8c
+
+    END
--- /dev/null
+++ b/vp9/decoder/arm/neon/dequantizeb_neon.asm
@@ -1,0 +1,34 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_dequantize_b_loop_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0    short *Q,
+; r1    short *DQC
+; r2    short *DQ
+|vp8_dequantize_b_loop_neon| PROC
+    vld1.16         {q0, q1}, [r0]
+    vld1.16         {q2, q3}, [r1]
+
+    vmul.i16        q4, q0, q2
+    vmul.i16        q5, q1, q3
+
+    vst1.16         {q4, q5}, [r2]
+
+    bx             lr
+
+    ENDP
+
+    END
--- /dev/null
+++ b/vp9/decoder/arm/neon/idct_blk_neon.c
@@ -1,0 +1,110 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/config.h"
+#include "vp9/common/idct.h"
+#include "vp9/decoder/dequantize.h"
+
+/* place these declarations here because we don't want to maintain them
+ * outside of this scope
+ */
+void idct_dequant_dc_full_2x_neon
+(short *input, short *dq, unsigned char *pre, unsigned char *dst,
+ int stride, short *dc);
+void idct_dequant_dc_0_2x_neon
+(short *dc, unsigned char *pre, unsigned char *dst, int stride);
+void idct_dequant_full_2x_neon
+(short *q, short *dq, unsigned char *pre, unsigned char *dst,
+ int pitch, int stride);
+void idct_dequant_0_2x_neon
+(short *q, short dq, unsigned char *pre, int pitch,
+ unsigned char *dst, int stride);
+
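+/* Each short read from eobs covers two horizontally adjacent 4x4 blocks.
+ * eob values of 0 and 1 only set bit 0 of their byte, so a non-zero
+ * (eobs & 0xfefe) means at least one block of the pair needs the full
+ * dequant+IDCT; otherwise both take the DC-only path.
+ */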
+void vp8_dequant_dc_idct_add_y_block_neon
+(short *q, short *dq, unsigned char *pre,
+ unsigned char *dst, int stride, char *eobs, short *dc) {
+  int i;
+
+  for (i = 0; i < 4; i++) {
+    if (((short *)eobs)[0] & 0xfefe)
+      idct_dequant_dc_full_2x_neon(q, dq, pre, dst, stride, dc);
+    else
+      idct_dequant_dc_0_2x_neon(dc, pre, dst, stride);
+
+    if (((short *)eobs)[1] & 0xfefe)
+      idct_dequant_dc_full_2x_neon(q + 32, dq, pre + 8, dst + 8, stride, dc + 2);
+    else
+      idct_dequant_dc_0_2x_neon(dc + 2, pre + 8, dst + 8, stride);
+
+    q    += 64;
+    dc   += 4;
+    pre  += 64;
+    dst  += 4 * stride;
+    eobs += 4;
+  }
+}
+
+void vp8_dequant_idct_add_y_block_neon
+(short *q, short *dq, unsigned char *pre,
+ unsigned char *dst, int stride, char *eobs) {
+  int i;
+
+  for (i = 0; i < 4; i++) {
+    if (((short *)eobs)[0] & 0xfefe)
+      idct_dequant_full_2x_neon(q, dq, pre, dst, 16, stride);
+    else
+      idct_dequant_0_2x_neon(q, dq[0], pre, 16, dst, stride);
+
+    if (((short *)eobs)[1] & 0xfefe)
+      idct_dequant_full_2x_neon(q + 32, dq, pre + 8, dst + 8, 16, stride);
+    else
+      idct_dequant_0_2x_neon(q + 32, dq[0], pre + 8, 16, dst + 8, stride);
+
+    q    += 64;
+    pre  += 64;
+    dst  += 4 * stride;
+    eobs += 4;
+  }
+}
+
+void vp8_dequant_idct_add_uv_block_neon
+(short *q, short *dq, unsigned char *pre,
+ unsigned char *dstu, unsigned char *dstv, int stride, char *eobs) {
+  if (((short *)eobs)[0] & 0xfefe)
+    idct_dequant_full_2x_neon(q, dq, pre, dstu, 8, stride);
+  else
+    idct_dequant_0_2x_neon(q, dq[0], pre, 8, dstu, stride);
+
+  q    += 32;
+  pre  += 32;
+  dstu += 4 * stride;
+
+  if (((short *)eobs)[1] & 0xfefe)
+    idct_dequant_full_2x_neon(q, dq, pre, dstu, 8, stride);
+  else
+    idct_dequant_0_2x_neon(q, dq[0], pre, 8, dstu, stride);
+
+  q += 32;
+  pre += 32;
+
+  if (((short *)eobs)[2] & 0xfefe)
+    idct_dequant_full_2x_neon(q, dq, pre, dstv, 8, stride);
+  else
+    idct_dequant_0_2x_neon(q, dq[0], pre, 8, dstv, stride);
+
+  q    += 32;
+  pre  += 32;
+  dstv += 4 * stride;
+
+  if (((short *)eobs)[3] & 0xfefe)
+    idct_dequant_full_2x_neon(q, dq, pre, dstv, 8, stride);
+  else
+    idct_dequant_0_2x_neon(q, dq[0], pre, 8, dstv, stride);
+}
--- /dev/null
+++ b/vp9/decoder/arm/neon/idct_dequant_0_2x_neon.asm
@@ -1,0 +1,79 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |idct_dequant_0_2x_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;void idct_dequant_0_2x_neon(short *q, short dq, unsigned char *pre,
+;                            int pitch, unsigned char *dst, int stride);
+; r0   *q
+; r1   dq
+; r2   *pre
+; r3   pitch
+; sp   *dst
+; sp+4 stride
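+;
+; handles two adjacent 4x4 blocks whose only non-zero coefficient is the
+; DC: each block adds ((q[0] * dq + 4) >> 3) to its 4x4 prediction, and
+; the two coefficients are cleared afterwards.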
+|idct_dequant_0_2x_neon| PROC
+    add             r12, r2, #4
+    vld1.32         {d2[0]}, [r2], r3
+    vld1.32         {d2[1]}, [r2], r3
+    vld1.32         {d4[0]}, [r2], r3
+    vld1.32         {d4[1]}, [r2]
+    vld1.32         {d8[0]}, [r12], r3
+    vld1.32         {d8[1]}, [r12], r3
+    vld1.32         {d10[0]}, [r12], r3
+    vld1.32         {d10[1]}, [r12]
+
+    ldrh            r12, [r0]               ; lo q
+    ldrh            r2, [r0, #32]           ; hi q
+    mov             r3, #0
+    strh            r3, [r0]
+    strh            r3, [r0, #32]
+
+    sxth            r12, r12                ; lo
+    mul             r0, r12, r1
+    add             r0, r0, #4
+    asr             r0, r0, #3
+    vdup.16         q0, r0
+    sxth            r2, r2                  ; hi
+    mul             r0, r2, r1
+    add             r0, r0, #4
+    asr             r0, r0, #3
+    vdup.16         q3, r0
+
+    vaddw.u8        q1, q0, d2              ; lo
+    vaddw.u8        q2, q0, d4
+    vaddw.u8        q4, q3, d8              ; hi
+    vaddw.u8        q5, q3, d10
+
+    ldr             r2, [sp]                ; dst
+    ldr             r3, [sp, #4]            ; stride
+
+    vqmovun.s16     d2, q1                  ; lo
+    vqmovun.s16     d4, q2
+    vqmovun.s16     d8, q4                  ; hi
+    vqmovun.s16     d10, q5
+
+    add             r0, r2, #4
+    vst1.32         {d2[0]}, [r2], r3       ; lo
+    vst1.32         {d2[1]}, [r2], r3
+    vst1.32         {d4[0]}, [r2], r3
+    vst1.32         {d4[1]}, [r2]
+    vst1.32         {d8[0]}, [r0], r3       ; hi
+    vst1.32         {d8[1]}, [r0], r3
+    vst1.32         {d10[0]}, [r0], r3
+    vst1.32         {d10[1]}, [r0]
+
+    bx             lr
+
+    ENDP           ; |idct_dequant_0_2x_neon|
+    END
--- /dev/null
+++ b/vp9/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm
@@ -1,0 +1,69 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |idct_dequant_dc_0_2x_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;void idct_dequant_dc_0_2x_neon(short *dc, unsigned char *pre,
+;                               unsigned char *dst, int stride);
+; r0  *dc
+; r1  *pre
+; r2  *dst
+; r3  stride
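+;
+; like idct_dequant_0_2x_neon, but the two DC values arrive already
+; dequantized in *dc; each is rounded ((dc + 4) >> 3) and added to the
+; prediction.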
+|idct_dequant_dc_0_2x_neon| PROC
+    ldr             r0, [r0]                ; *dc
+    mov             r12, #16
+
+    vld1.32         {d2[0]}, [r1], r12      ; lo
+    vld1.32         {d2[1]}, [r1], r12
+    vld1.32         {d4[0]}, [r1], r12
+    vld1.32         {d4[1]}, [r1]
+    sub             r1, r1, #44
+    vld1.32         {d8[0]}, [r1], r12      ; hi
+    vld1.32         {d8[1]}, [r1], r12
+    vld1.32         {d10[0]}, [r1], r12
+    vld1.32         {d10[1]}, [r1]
+
+    sxth            r1, r0                  ; lo *dc
+    add             r1, r1, #4
+    asr             r1, r1, #3
+    vdup.16         q0, r1
+    sxth            r0, r0, ror #16         ; hi *dc
+    add             r0, r0, #4
+    asr             r0, r0, #3
+    vdup.16         q3, r0
+
+    vaddw.u8        q1, q0, d2              ; lo
+    vaddw.u8        q2, q0, d4
+    vaddw.u8        q4, q3, d8              ; hi
+    vaddw.u8        q5, q3, d10
+
+    vqmovun.s16     d2, q1                  ; lo
+    vqmovun.s16     d4, q2
+    vqmovun.s16     d8, q4                  ; hi
+    vqmovun.s16     d10, q5
+
+    add             r0, r2, #4
+    vst1.32         {d2[0]}, [r2], r3       ; lo
+    vst1.32         {d2[1]}, [r2], r3
+    vst1.32         {d4[0]}, [r2], r3
+    vst1.32         {d4[1]}, [r2]
+    vst1.32         {d8[0]}, [r0], r3       ; hi
+    vst1.32         {d8[1]}, [r0], r3
+    vst1.32         {d10[0]}, [r0], r3
+    vst1.32         {d10[1]}, [r0]
+
+    bx             lr
+
+    ENDP           ;|idct_dequant_dc_0_2x_neon|
+    END
--- /dev/null
+++ b/vp9/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm
@@ -1,0 +1,205 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |idct_dequant_dc_full_2x_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;void idct_dequant_dc_full_2x_neon(short *q, short *dq, unsigned char *pre,
+;                                  unsigned char *dst, int stride, short *dc);
+; r0    *q,
+; r1    *dq,
+; r2    *pre
+; r3    *dst
+; sp    stride
+; sp+4  *dc
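+;
+; processes two horizontally adjacent 4x4 blocks ("l" and "r") at once,
+; one in each half of the q registers; *dc supplies the two already
+; dequantized DC coefficients, which overwrite element 0 of each block.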
+|idct_dequant_dc_full_2x_neon| PROC
+    vld1.16         {q0, q1}, [r1]          ; dq (same l/r)
+    vld1.16         {q2, q3}, [r0]          ; l q
+    mov             r1, #16                 ; pitch
+    add             r0, r0, #32
+    vld1.16         {q4, q5}, [r0]          ; r q
+    add             r12, r2, #4
+    ; interleave the predictors
+    vld1.32         {d28[0]}, [r2], r1      ; l pre
+    vld1.32         {d28[1]}, [r12], r1     ; r pre
+    vld1.32         {d29[0]}, [r2], r1
+    vld1.32         {d29[1]}, [r12], r1
+    vld1.32         {d30[0]}, [r2], r1
+    vld1.32         {d30[1]}, [r12], r1
+    vld1.32         {d31[0]}, [r2]
+    ldr             r1, [sp, #4]
+    vld1.32         {d31[1]}, [r12]
+
+    adr             r2, cospi8sqrt2minus1   ; pointer to the first constant
+
+    ldrh            r12, [r1], #2           ; lo *dc
+    ldrh            r1, [r1]                ; hi *dc
+
+    ; dequant: q[i] = q[i] * dq[i]
+    vmul.i16        q2, q2, q0
+    vmul.i16        q3, q3, q1
+    vmul.i16        q4, q4, q0
+    vmul.i16        q5, q5, q1
+
+    ; move dc up to neon and overwrite first element
+    vmov.16         d4[0], r12
+    vmov.16         d8[0], r1
+
+    vld1.16         {d0}, [r2]
+
+    ; q2: l0r0  q3: l8r8
+    ; q4: l4r4  q5: l12r12
+    vswp            d5, d8
+    vswp            d7, d10
+
+    ; _CONSTANTS_ * 4,12 >> 16
+    ; q6:  4 * sinpi : c1/temp1
+    ; q7: 12 * sinpi : d1/temp2
+    ; q8:  4 * cospi
+    ; q9: 12 * cospi
+    vqdmulh.s16     q6, q4, d0[2]           ; sinpi8sqrt2
+    vqdmulh.s16     q7, q5, d0[2]
+    vqdmulh.s16     q8, q4, d0[0]           ; cospi8sqrt2minus1
+    vqdmulh.s16     q9, q5, d0[0]
+
+    vqadd.s16       q10, q2, q3             ; a1 = 0 + 8
+    vqsub.s16       q11, q2, q3             ; b1 = 0 - 8
+
+    ; vqdmulh only accepts signed values. this was a problem because
+    ; our constant had the high bit set, and was treated as a negative value.
+    ; vqdmulh also doubles the value before it shifts by 16. we need to
+    ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,
+    ; so we can pre-shift the constant right by 1 without losing precision.
+    ; this avoids having to shift again afterward, and also avoids the sign
+    ; issue. win win! for cospi8sqrt2minus1 the lowest bit is 1, so we would
+    ; lose precision if we pre-shifted it
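+    ; (with x a coefficient: sinpi8sqrt2 is stored pre-shifted as 0x4546,
+    ; i.e. 0x8a8c >> 1, so vqdmulh yields (2 * x * 0x4546) >> 16, which is
+    ; exactly the desired (x * 0x8a8c) >> 16 with no extra shift needed)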
+    vshr.s16        q8, q8, #1
+    vshr.s16        q9, q9, #1
+
+    ; q4:  4 +  4 * cospi : d1/temp1
+    ; q5: 12 + 12 * cospi : c1/temp2
+    vqadd.s16       q4, q4, q8
+    vqadd.s16       q5, q5, q9
+
+    ; c1 = temp1 - temp2
+    ; d1 = temp1 + temp2
+    vqsub.s16       q2, q6, q5
+    vqadd.s16       q3, q4, q7
+
+    ; [0]: a1+d1
+    ; [1]: b1+c1
+    ; [2]: b1-c1
+    ; [3]: a1-d1
+    vqadd.s16       q4, q10, q3
+    vqadd.s16       q5, q11, q2
+    vqsub.s16       q6, q11, q2
+    vqsub.s16       q7, q10, q3
+
+    ; rotate
+    vtrn.32         q4, q6
+    vtrn.32         q5, q7
+    vtrn.16         q4, q5
+    vtrn.16         q6, q7
+    ; idct loop 2
+    ; q4: l 0, 4, 8,12 r 0, 4, 8,12
+    ; q5: l 1, 5, 9,13 r 1, 5, 9,13
+    ; q6: l 2, 6,10,14 r 2, 6,10,14
+    ; q7: l 3, 7,11,15 r 3, 7,11,15
+
+    ; q8:  1 * sinpi : c1/temp1
+    ; q9:  3 * sinpi : d1/temp2
+    ; q10: 1 * cospi
+    ; q11: 3 * cospi
+    vqdmulh.s16     q8, q5, d0[2]           ; sinpi8sqrt2
+    vqdmulh.s16     q9, q7, d0[2]
+    vqdmulh.s16     q10, q5, d0[0]          ; cospi8sqrt2minus1
+    vqdmulh.s16     q11, q7, d0[0]
+
+    vqadd.s16       q2, q4, q6             ; a1 = 0 + 2
+    vqsub.s16       q3, q4, q6             ; b1 = 0 - 2
+
+    ; see note on shifting above
+    vshr.s16        q10, q10, #1
+    vshr.s16        q11, q11, #1
+
+    ; q10: 1 + 1 * cospi : d1/temp1
+    ; q11: 3 + 3 * cospi : c1/temp2
+    vqadd.s16       q10, q5, q10
+    vqadd.s16       q11, q7, q11
+
+    ; q8: c1 = temp1 - temp2
+    ; q9: d1 = temp1 + temp2
+    vqsub.s16       q8, q8, q11
+    vqadd.s16       q9, q10, q9
+
+    ; a1+d1
+    ; b1+c1
+    ; b1-c1
+    ; a1-d1
+    vqadd.s16       q4, q2, q9
+    vqadd.s16       q5, q3, q8
+    vqsub.s16       q6, q3, q8
+    vqsub.s16       q7, q2, q9
+
+    ; +4 >> 3 (rounding)
+    vrshr.s16       q4, q4, #3              ; lo
+    vrshr.s16       q5, q5, #3
+    vrshr.s16       q6, q6, #3              ; hi
+    vrshr.s16       q7, q7, #3
+
+    vtrn.32         q4, q6
+    vtrn.32         q5, q7
+    vtrn.16         q4, q5
+    vtrn.16         q6, q7
+
+    ; adding pre
+    ; input is still packed. pre was read interleaved
+    vaddw.u8        q4, q4, d28
+    vaddw.u8        q5, q5, d29
+    vaddw.u8        q6, q6, d30
+    vaddw.u8        q7, q7, d31
+
+    vmov.i16        q14, #0
+    vmov            q15, q14
+    vst1.16         {q14, q15}, [r0]        ; write over high input
+    sub             r0, r0, #32
+    vst1.16         {q14, q15}, [r0]        ; write over low input
+
+    ;saturate and narrow
+    vqmovun.s16     d0, q4                  ; lo
+    vqmovun.s16     d1, q5
+    vqmovun.s16     d2, q6                  ; hi
+    vqmovun.s16     d3, q7
+
+    ldr             r1, [sp]                ; stride
+    add             r2, r3, #4              ; hi
+    vst1.32         {d0[0]}, [r3], r1       ; lo
+    vst1.32         {d0[1]}, [r2], r1       ; hi
+    vst1.32         {d1[0]}, [r3], r1
+    vst1.32         {d1[1]}, [r2], r1
+    vst1.32         {d2[0]}, [r3], r1
+    vst1.32         {d2[1]}, [r2], r1
+    vst1.32         {d3[0]}, [r3]
+    vst1.32         {d3[1]}, [r2]
+
+    bx             lr
+
+    ENDP           ; |idct_dequant_dc_full_2x_neon|
+
+; Constant Pool
+cospi8sqrt2minus1 DCD 0x4e7b
+; because the lowest bit in 0x8a8c is 0, we can pre-shift this
+sinpi8sqrt2       DCD 0x4546
+
+    END
--- /dev/null
+++ b/vp9/decoder/arm/neon/idct_dequant_full_2x_neon.asm
@@ -1,0 +1,197 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |idct_dequant_full_2x_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;void idct_dequant_full_2x_neon(short *q, short *dq, unsigned char *pre,
+;                               unsigned char *dst, int pitch, int stride);
+; r0    *q,
+; r1    *dq,
+; r2    *pre
+; r3    *dst
+; sp    pitch
+; sp+4  stride
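+;
+; processes two horizontally adjacent 4x4 blocks ("l" and "r") at once,
+; one in each half of the q registers, sharing a single dq table.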
+|idct_dequant_full_2x_neon| PROC
+    vld1.16         {q0, q1}, [r1]          ; dq (same l/r)
+    vld1.16         {q2, q3}, [r0]          ; l q
+    ldr             r1, [sp]                ; pitch
+    add             r0, r0, #32
+    vld1.16         {q4, q5}, [r0]          ; r q
+    add             r12, r2, #4
+    ; interleave the predictors
+    vld1.32         {d28[0]}, [r2], r1      ; l pre
+    vld1.32         {d28[1]}, [r12], r1     ; r pre
+    vld1.32         {d29[0]}, [r2], r1
+    vld1.32         {d29[1]}, [r12], r1
+    vld1.32         {d30[0]}, [r2], r1
+    vld1.32         {d30[1]}, [r12], r1
+    vld1.32         {d31[0]}, [r2]
+    vld1.32         {d31[1]}, [r12]
+
+    adr             r2, cospi8sqrt2minus1   ; pointer to the first constant
+
+    ; dequant: q[i] = q[i] * dq[i]
+    vmul.i16        q2, q2, q0
+    vmul.i16        q3, q3, q1
+    vmul.i16        q4, q4, q0
+    vmul.i16        q5, q5, q1
+
+    vld1.16         {d0}, [r2]
+
+    ; q2: l0r0  q3: l8r8
+    ; q4: l4r4  q5: l12r12
+    vswp            d5, d8
+    vswp            d7, d10
+
+    ; _CONSTANTS_ * 4,12 >> 16
+    ; q6:  4 * sinpi : c1/temp1
+    ; q7: 12 * sinpi : d1/temp2
+    ; q8:  4 * cospi
+    ; q9: 12 * cospi
+    vqdmulh.s16     q6, q4, d0[2]           ; sinpi8sqrt2
+    vqdmulh.s16     q7, q5, d0[2]
+    vqdmulh.s16     q8, q4, d0[0]           ; cospi8sqrt2minus1
+    vqdmulh.s16     q9, q5, d0[0]
+
+    vqadd.s16       q10, q2, q3             ; a1 = 0 + 8
+    vqsub.s16       q11, q2, q3             ; b1 = 0 - 8
+
+    ; vqdmulh only accepts signed values. this was a problem because
+    ; our constant had the high bit set, and was treated as a negative value.
+    ; vqdmulh also doubles the value before it shifts by 16. we need to
+    ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,
+    ; so we can pre-shift the constant right by 1 without losing precision.
+    ; this avoids having to shift again afterward, and also avoids the sign
+    ; issue. win win! for cospi8sqrt2minus1 the lowest bit is 1, so we would
+    ; lose precision if we pre-shifted it
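+    ; (with x a coefficient: sinpi8sqrt2 is stored pre-shifted as 0x4546,
+    ; i.e. 0x8a8c >> 1, so vqdmulh yields (2 * x * 0x4546) >> 16, which is
+    ; exactly the desired (x * 0x8a8c) >> 16 with no extra shift needed)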
+    vshr.s16        q8, q8, #1
+    vshr.s16        q9, q9, #1
+
+    ; q4:  4 +  4 * cospi : d1/temp1
+    ; q5: 12 + 12 * cospi : c1/temp2
+    vqadd.s16       q4, q4, q8
+    vqadd.s16       q5, q5, q9
+
+    ; c1 = temp1 - temp2
+    ; d1 = temp1 + temp2
+    vqsub.s16       q2, q6, q5
+    vqadd.s16       q3, q4, q7
+
+    ; [0]: a1+d1
+    ; [1]: b1+c1
+    ; [2]: b1-c1
+    ; [3]: a1-d1
+    vqadd.s16       q4, q10, q3
+    vqadd.s16       q5, q11, q2
+    vqsub.s16       q6, q11, q2
+    vqsub.s16       q7, q10, q3
+
+    ; rotate
+    vtrn.32         q4, q6
+    vtrn.32         q5, q7
+    vtrn.16         q4, q5
+    vtrn.16         q6, q7
+    ; idct loop 2
+    ; q4: l 0, 4, 8,12 r 0, 4, 8,12
+    ; q5: l 1, 5, 9,13 r 1, 5, 9,13
+    ; q6: l 2, 6,10,14 r 2, 6,10,14
+    ; q7: l 3, 7,11,15 r 3, 7,11,15
+
+    ; q8:  1 * sinpi : c1/temp1
+    ; q9:  3 * sinpi : d1/temp2
+    ; q10: 1 * cospi
+    ; q11: 3 * cospi
+    vqdmulh.s16     q8, q5, d0[2]           ; sinpi8sqrt2
+    vqdmulh.s16     q9, q7, d0[2]
+    vqdmulh.s16     q10, q5, d0[0]          ; cospi8sqrt2minus1
+    vqdmulh.s16     q11, q7, d0[0]
+
+    vqadd.s16       q2, q4, q6             ; a1 = 0 + 2
+    vqsub.s16       q3, q4, q6             ; b1 = 0 - 2
+
+    ; see note on shifting above
+    vshr.s16        q10, q10, #1
+    vshr.s16        q11, q11, #1
+
+    ; q10: 1 + 1 * cospi : d1/temp1
+    ; q11: 3 + 3 * cospi : c1/temp2
+    vqadd.s16       q10, q5, q10
+    vqadd.s16       q11, q7, q11
+
+    ; q8: c1 = temp1 - temp2
+    ; q9: d1 = temp1 + temp2
+    vqsub.s16       q8, q8, q11
+    vqadd.s16       q9, q10, q9
+
+    ; a1+d1
+    ; b1+c1
+    ; b1-c1
+    ; a1-d1
+    vqadd.s16       q4, q2, q9
+    vqadd.s16       q5, q3, q8
+    vqsub.s16       q6, q3, q8
+    vqsub.s16       q7, q2, q9
+
+    ; +4 >> 3 (rounding)
+    vrshr.s16       q4, q4, #3              ; lo
+    vrshr.s16       q5, q5, #3
+    vrshr.s16       q6, q6, #3              ; hi
+    vrshr.s16       q7, q7, #3
+
+    vtrn.32         q4, q6
+    vtrn.32         q5, q7
+    vtrn.16         q4, q5
+    vtrn.16         q6, q7
+
+    ; adding pre
+    ; input is still packed. pre was read interleaved
+    vaddw.u8        q4, q4, d28
+    vaddw.u8        q5, q5, d29
+    vaddw.u8        q6, q6, d30
+    vaddw.u8        q7, q7, d31
+
+    vmov.i16        q14, #0
+    vmov            q15, q14
+    vst1.16         {q14, q15}, [r0]        ; write over high input
+    sub             r0, r0, #32
+    vst1.16         {q14, q15}, [r0]        ; write over low input
+
+    ;saturate and narrow
+    vqmovun.s16     d0, q4                  ; lo
+    vqmovun.s16     d1, q5
+    vqmovun.s16     d2, q6                  ; hi
+    vqmovun.s16     d3, q7
+
+    ldr             r1, [sp, #4]            ; stride
+    add             r2, r3, #4              ; hi
+    vst1.32         {d0[0]}, [r3], r1       ; lo
+    vst1.32         {d0[1]}, [r2], r1       ; hi
+    vst1.32         {d1[0]}, [r3], r1
+    vst1.32         {d1[1]}, [r2], r1
+    vst1.32         {d2[0]}, [r3], r1
+    vst1.32         {d2[1]}, [r2], r1
+    vst1.32         {d3[0]}, [r3]
+    vst1.32         {d3[1]}, [r2]
+
+    bx             lr
+
+    ENDP           ; |idct_dequant_full_2x_neon|
+
+; Constant Pool
+cospi8sqrt2minus1 DCD 0x4e7b
+; because the lowest bit in 0x8a8c is 0, we can pre-shift this
+sinpi8sqrt2       DCD 0x4546
+
+    END
--- /dev/null
+++ b/vp9/decoder/asm_dec_offsets.c
@@ -1,0 +1,39 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/asm_offsets.h"
+#include "onyxd_int.h"
+
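+/* Emits the offsets of selected structure members as assembler-visible
+ * constants, so the ARM assembly can address DETOK and BOOL_DECODER
+ * fields by name.
+ */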
+BEGIN
+
+DEFINE(detok_scan,                              offsetof(DETOK, scan));
+DEFINE(detok_ptr_block2leftabove,               offsetof(DETOK, ptr_block2leftabove));
+DEFINE(detok_coef_tree_ptr,                     offsetof(DETOK, vp9_coef_tree_ptr));
+DEFINE(detok_norm_ptr,                          offsetof(DETOK, norm_ptr));
+DEFINE(detok_ptr_coef_bands_x,                  offsetof(DETOK, ptr_coef_bands_x));
+
+DEFINE(detok_A,                                 offsetof(DETOK, A));
+DEFINE(detok_L,                                 offsetof(DETOK, L));
+
+DEFINE(detok_qcoeff_start_ptr,                  offsetof(DETOK, qcoeff_start_ptr));
+DEFINE(detok_coef_probs,                        offsetof(DETOK, coef_probs));
+DEFINE(detok_eob,                               offsetof(DETOK, eob));
+
+DEFINE(bool_decoder_user_buffer_end,            offsetof(BOOL_DECODER, user_buffer_end));
+DEFINE(bool_decoder_user_buffer,                offsetof(BOOL_DECODER, user_buffer));
+DEFINE(bool_decoder_value,                      offsetof(BOOL_DECODER, value));
+DEFINE(bool_decoder_count,                      offsetof(BOOL_DECODER, count));
+DEFINE(bool_decoder_range,                      offsetof(BOOL_DECODER, range));
+
+END
+
+/* add asserts for any offset that is not supported by assembly code */
+/* add asserts for any size that is not supported by assembly code */
--- /dev/null
+++ b/vp9/decoder/dboolhuff.c
@@ -1,0 +1,100 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "dboolhuff.h"
+#include "vpx_ports/mem.h"
+#include "vpx_mem/vpx_mem.h"
+
+int vp9_start_decode(BOOL_DECODER *br,
+                     const unsigned char *source,
+                     unsigned int source_sz) {
+  br->user_buffer_end = source + source_sz;
+  br->user_buffer     = source;
+  br->value    = 0;
+  br->count    = -8;
+  br->range    = 255;
+
+  if (source_sz && !source)
+    return 1;
+
+  /* Populate the buffer */
+  vp9_bool_decoder_fill(br);
+
+  return 0;
+}
+
+
+void vp9_bool_decoder_fill(BOOL_DECODER *br) {
+  const unsigned char *bufptr;
+  const unsigned char *bufend;
+  VP9_BD_VALUE         value;
+  int                  count;
+  bufend = br->user_buffer_end;
+  bufptr = br->user_buffer;
+  value = br->value;
+  count = br->count;
+
+  VP9DX_BOOL_DECODER_FILL(count, value, bufptr, bufend);
+
+  br->user_buffer = bufptr;
+  br->value = value;
+  br->count = count;
+}
+
+
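+/* Returns the number of bits needed to code a value in
+ * [0, num_values - 1]. */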
+static int get_unsigned_bits(unsigned num_values) {
+  int cat = 0;
+  if ((num_values--) <= 1) return 0;
+  while (num_values > 0) {
+    cat++;
+    num_values >>= 1;
+  }
+  return cat;
+}
+
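+/* Inverse of the encoder's recentering around m: small v codes a value
+ * close to m (even v lands above m, odd v below); v > 2 * m is passed
+ * through unchanged. */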
+int vp9_inv_recenter_nonneg(int v, int m) {
+  if (v > (m << 1)) return v;
+  else if ((v & 1) == 0) return (v >> 1) + m;
+  else return m - ((v + 1) >> 1);
+}
+
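+/* Decodes a value in [0, n - 1] with a near-uniform code: the first m
+ * symbols use l - 1 bits, the remaining n - m symbols use l bits. */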
+int vp9_decode_uniform(BOOL_DECODER *br, int n) {
+  int v;
+  int l = get_unsigned_bits(n);
+  int m = (1 << l) - n;
+  if (!l) return 0;
+  v = decode_value(br, l - 1);
+  if (v < m)
+    return v;
+  else
+    return (v << 1) - m + decode_value(br, 1);
+}
+
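+/* Decodes a symbol coded with a terminated sub-exponential code with
+ * parameter k; the final interval falls back to the uniform code above
+ * so that exactly num_syms values are codable. */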
+int vp9_decode_term_subexp(BOOL_DECODER *br, int k, int num_syms) {
+  int i = 0, mk = 0, word;
+  while (1) {
+    int b = (i ? k + i - 1 : k);
+    int a = (1 << b);
+    if (num_syms <= mk + 3 * a) {
+      word = vp9_decode_uniform(br, num_syms - mk) + mk;
+      break;
+    } else {
+      if (decode_value(br, 1)) {
+        i++;
+        mk += a;
+      } else {
+        word = decode_value(br, b) + mk;
+        break;
+      }
+    }
+  }
+  return word;
+}
--- /dev/null
+++ b/vp9/decoder/dboolhuff.h
@@ -1,0 +1,153 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef DBOOLHUFF_H
+#define DBOOLHUFF_H
+#include <stddef.h>
+#include <limits.h>
+#include "vpx_ports/config.h"
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_integer.h"
+
+typedef size_t VP9_BD_VALUE;
+
+# define VP9_BD_VALUE_SIZE ((int)sizeof(VP9_BD_VALUE)*CHAR_BIT)
+/*This is meant to be a large, positive constant that can still be efficiently
+   loaded as an immediate (on platforms like ARM, for example).
+  Even relatively modest values like 100 would work fine.*/
+# define VP9_LOTS_OF_BITS (0x40000000)
+
+typedef struct {
+  const unsigned char *user_buffer_end;
+  const unsigned char *user_buffer;
+  VP9_BD_VALUE         value;
+  int                  count;
+  unsigned int         range;
+} BOOL_DECODER;
+
+DECLARE_ALIGNED(16, extern const unsigned char, vp9_norm[256]);
+
+int vp9_start_decode(BOOL_DECODER *br,
+                     const unsigned char *source,
+                     unsigned int source_sz);
+
+void vp9_bool_decoder_fill(BOOL_DECODER *br);
+
+int vp9_decode_uniform(BOOL_DECODER *br, int n);
+int vp9_decode_term_subexp(BOOL_DECODER *br, int k, int num_syms);
+int vp9_inv_recenter_nonneg(int v, int m);
+
+/*The refill loop is used in several places, so define it in a macro to make
+   sure they're all consistent.
+  An inline function would be cleaner, but has a significant penalty, because
+   multiple BOOL_DECODER fields must be modified, and the compiler is not smart
+   enough to eliminate the stores to those fields and the subsequent reloads
+   from them when inlining the function.*/
+#define VP9DX_BOOL_DECODER_FILL(_count,_value,_bufptr,_bufend) \
+  do \
+  { \
+    int shift = VP9_BD_VALUE_SIZE - 8 - ((_count) + 8); \
+    int loop_end, x; \
+    size_t bits_left = ((_bufend)-(_bufptr))*CHAR_BIT; \
+    \
+    x = shift + CHAR_BIT - bits_left; \
+    loop_end = 0; \
+    if(x >= 0) \
+    { \
+      (_count) += VP9_LOTS_OF_BITS; \
+      loop_end = x; \
+      if(!bits_left) break; \
+    } \
+    while(shift >= loop_end) \
+    { \
+      (_count) += CHAR_BIT; \
+      (_value) |= (VP9_BD_VALUE)*(_bufptr)++ << shift; \
+      shift -= CHAR_BIT; \
+    } \
+  } \
+  while(0) \
+
+
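+/* Decodes a single bool: the current range is split in proportion to
+ * probability / 256, and the bit is determined by which sub-range the
+ * decoder's value falls in. */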
+static int decode_bool(BOOL_DECODER *br, int probability) {
+  unsigned int bit = 0;
+  VP9_BD_VALUE value;
+  unsigned int split;
+  VP9_BD_VALUE bigsplit;
+  int count;
+  unsigned int range;
+
+  split = 1 + (((br->range - 1) * probability) >> 8);
+
+  if (br->count < 0)
+    vp9_bool_decoder_fill(br);
+
+  value = br->value;
+  count = br->count;
+
+  bigsplit = (VP9_BD_VALUE)split << (VP9_BD_VALUE_SIZE - 8);
+
+  range = split;
+
+  if (value >= bigsplit) {
+    range = br->range - split;
+    value = value - bigsplit;
+    bit = 1;
+  }
+
+  {
+    register unsigned int shift = vp9_norm[range];
+    range <<= shift;
+    value <<= shift;
+    count -= shift;
+  }
+  br->value = value;
+  br->count = count;
+  br->range = range;
+
+  return bit;
+}
+
+static int decode_value(BOOL_DECODER *br, int bits) {
+  int z = 0;
+  int bit;
+
+  for (bit = bits - 1; bit >= 0; bit--) {
+    z |= (decode_bool(br, 0x80) << bit);
+  }
+
+  return z;
+}
+
+static int bool_error(BOOL_DECODER *br) {
+  /* Check if we have reached the end of the buffer.
+   *
+   * Variable 'count' stores the number of bits in the 'value' buffer, minus
+   * 8. The top byte is part of the algorithm, and the remainder is buffered
+   * to be shifted into it. So if count == 8, the top 16 bits of 'value' are
+   * occupied, 8 for the algorithm and 8 in the buffer.
+   *
+   * When reading a byte from the user's buffer, count is filled with 8 and
+   * one byte is filled into the value buffer. When we reach the end of the
+   * data, count is additionally filled with VP9_LOTS_OF_BITS. So when
+   * count == VP9_LOTS_OF_BITS - 1, the user's data has been exhausted.
+   */
+  if ((br->count > VP9_BD_VALUE_SIZE) && (br->count < VP9_LOTS_OF_BITS)) {
+    /* We have tried to decode bits after the end of
+     * stream was encountered.
+     */
+    return 1;
+  }
+
+  /* No error. */
+  return 0;
+}
+
+#endif
--- /dev/null
+++ b/vp9/decoder/decodemv.c
@@ -1,0 +1,1199 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "treereader.h"
+#include "vp9/common/entropymv.h"
+#include "vp9/common/entropymode.h"
+#include "onyxd_int.h"
+#include "vp9/common/findnearmv.h"
+
+#include "vp9/common/seg_common.h"
+#include "vp9/common/pred_common.h"
+#include "vp9/common/entropy.h"
+#include "vp9/decoder/decodemv.h"
+#if CONFIG_DEBUG
+#include <assert.h>
+#endif
+
+// #define DEBUG_DEC_MV
+#ifdef DEBUG_DEC_MV
+int dec_mvcount = 0;
+#endif
+
+static int read_bmode(vp9_reader *bc, const vp9_prob *p) {
+  return treed_read(bc, vp9_bmode_tree, p);
+}
+
+static int read_ymode(vp9_reader *bc, const vp9_prob *p) {
+  return treed_read(bc, vp9_ymode_tree, p);
+}
+
+#if CONFIG_SUPERBLOCKS
+static int read_kf_sb_ymode(vp9_reader *bc, const vp9_prob *p) {
+  return treed_read(bc, vp9_uv_mode_tree, p);
+}
+#endif
+
+static int read_kf_mb_ymode(vp9_reader *bc, const vp9_prob *p) {
+  return treed_read(bc, vp9_kf_ymode_tree, p);
+}
+
+static int read_i8x8_mode(vp9_reader *bc, const vp9_prob *p) {
+  return treed_read(bc, vp9_i8x8_mode_tree, p);
+}
+
+static int read_uv_mode(vp9_reader *bc, const vp9_prob *p) {
+  return treed_read(bc, vp9_uv_mode_tree, p);
+}
+
+// This function reads the current macroblock's segment id from the bitstream.
+// It should only be called if a segment map update is indicated.
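+// The segment id is coded as a two-level binary tree: probs[0] selects
+// between {0, 1} and {2, 3}, while probs[1] and probs[2] pick within
+// each pair.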
+static void read_mb_segid(vp9_reader *r, MB_MODE_INFO *mi,
+                          MACROBLOCKD *xd) {
+  /* Is segmentation enabled */
+  if (xd->segmentation_enabled && xd->update_mb_segmentation_map) {
+    /* If so then read the segment id. */
+    if (vp9_read(r, xd->mb_segment_tree_probs[0]))
+      mi->segment_id =
+        (unsigned char)(2 + vp9_read(r, xd->mb_segment_tree_probs[2]));
+    else
+      mi->segment_id =
+        (unsigned char)(vp9_read(r, xd->mb_segment_tree_probs[1]));
+  }
+}
+
+#if CONFIG_NEW_MVREF
+int vp9_read_mv_ref_id(vp9_reader *r,
+                       vp9_prob * ref_id_probs) {
+  int ref_index = 0;
+
+  if (vp9_read(r, ref_id_probs[0])) {
+    ref_index++;
+    if (vp9_read(r, ref_id_probs[1])) {
+      ref_index++;
+      if (vp9_read(r, ref_id_probs[2]))
+        ref_index++;
+    }
+  }
+  return ref_index;
+}
+#endif
+
+extern const int vp9_i8x8_block[4];
+static void kfread_modes(VP9D_COMP *pbi,
+                         MODE_INFO *m,
+                         int mb_row,
+                         int mb_col,
+                         BOOL_DECODER* const bc) {
+  VP9_COMMON *const cm = &pbi->common;
+  const int mis = pbi->common.mode_info_stride;
+  int map_index = mb_row * pbi->common.mb_cols + mb_col;
+  MB_PREDICTION_MODE y_mode;
+
+  // Read the Macroblock segmentation map if it is being updated explicitly
+  // this frame (reset to 0 by default).
+  m->mbmi.segment_id = 0;
+  if (pbi->mb.update_mb_segmentation_map) {
+    read_mb_segid(bc, &m->mbmi, &pbi->mb);
+    pbi->common.last_frame_seg_map[map_index] = m->mbmi.segment_id;
+  }
+
+  m->mbmi.mb_skip_coeff = 0;
+  if (pbi->common.mb_no_coeff_skip &&
+      (!vp9_segfeature_active(&pbi->mb,
+                              m->mbmi.segment_id, SEG_LVL_EOB) ||
+       (vp9_get_segdata(&pbi->mb,
+                        m->mbmi.segment_id, SEG_LVL_EOB) != 0))) {
+    MACROBLOCKD *const xd  = &pbi->mb;
+    m->mbmi.mb_skip_coeff =
+      vp9_read(bc, vp9_get_pred_prob(cm, xd, PRED_MBSKIP));
+  } else {
+    if (vp9_segfeature_active(&pbi->mb,
+                              m->mbmi.segment_id, SEG_LVL_EOB) &&
+        (vp9_get_segdata(&pbi->mb,
+                         m->mbmi.segment_id, SEG_LVL_EOB) == 0)) {
+      m->mbmi.mb_skip_coeff = 1;
+    } else
+      m->mbmi.mb_skip_coeff = 0;
+  }
+
+#if CONFIG_SUPERBLOCKS
+  if (m->mbmi.encoded_as_sb) {
+    y_mode = (MB_PREDICTION_MODE) read_kf_sb_ymode(bc,
+      pbi->common.sb_kf_ymode_prob[pbi->common.kf_ymode_probs_index]);
+  } else
+#endif
+  y_mode = (MB_PREDICTION_MODE) read_kf_mb_ymode(bc,
+    pbi->common.kf_ymode_prob[pbi->common.kf_ymode_probs_index]);
+#if CONFIG_COMP_INTRA_PRED
+  m->mbmi.second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
+#endif
+
+  m->mbmi.ref_frame = INTRA_FRAME;
+
+  if ((m->mbmi.mode = y_mode) == B_PRED) {
+    int i = 0;
+#if CONFIG_COMP_INTRA_PRED
+    int use_comp_pred = vp9_read(bc, 128);
+#endif
+    do {
+      const B_PREDICTION_MODE A = above_block_mode(m, i, mis);
+      const B_PREDICTION_MODE L = left_block_mode(m, i);
+
+      m->bmi[i].as_mode.first =
+        (B_PREDICTION_MODE) read_bmode(
+          bc, pbi->common.kf_bmode_prob [A] [L]);
+#if CONFIG_COMP_INTRA_PRED
+      if (use_comp_pred) {
+        m->bmi[i].as_mode.second =
+          (B_PREDICTION_MODE) read_bmode(
+            bc, pbi->common.kf_bmode_prob [A] [L]);
+      } else {
+        m->bmi[i].as_mode.second = (B_PREDICTION_MODE)(B_DC_PRED - 1);
+      }
+#endif
+    } while (++i < 16);
+  }
+  if ((m->mbmi.mode = y_mode) == I8X8_PRED) {
+    int i;
+    int mode8x8;
+    for (i = 0; i < 4; i++) {
+      int ib = vp9_i8x8_block[i];
+      mode8x8 = read_i8x8_mode(bc, pbi->common.fc.i8x8_mode_prob);
+      m->bmi[ib + 0].as_mode.first = mode8x8;
+      m->bmi[ib + 1].as_mode.first = mode8x8;
+      m->bmi[ib + 4].as_mode.first = mode8x8;
+      m->bmi[ib + 5].as_mode.first = mode8x8;
+#if CONFIG_COMP_INTRA_PRED
+      m->bmi[ib + 0].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
+      m->bmi[ib + 1].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
+      m->bmi[ib + 4].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
+      m->bmi[ib + 5].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
+#endif
+    }
+  } else
+    m->mbmi.uv_mode = (MB_PREDICTION_MODE)read_uv_mode(bc,
+                                                       pbi->common.kf_uv_mode_prob[m->mbmi.mode]);
+#if CONFIG_COMP_INTRA_PRED
+  m->mbmi.second_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
+#endif
+
+#if CONFIG_SUPERBLOCKS
+  if (m->mbmi.encoded_as_sb)
+    m->mbmi.txfm_size = TX_8X8;
+  else
+#endif
+  if (cm->txfm_mode == TX_MODE_SELECT && m->mbmi.mb_skip_coeff == 0 &&
+      m->mbmi.mode <= I8X8_PRED) {
+    // FIXME(rbultje) code ternary symbol once all experiments are merged
+    m->mbmi.txfm_size = vp9_read(bc, cm->prob_tx[0]);
+    if (m->mbmi.txfm_size != TX_4X4 && m->mbmi.mode != I8X8_PRED)
+      m->mbmi.txfm_size += vp9_read(bc, cm->prob_tx[1]);
+  } else if (cm->txfm_mode >= ALLOW_16X16 && m->mbmi.mode <= TM_PRED) {
+    m->mbmi.txfm_size = TX_16X16;
+  } else if (cm->txfm_mode >= ALLOW_8X8 && m->mbmi.mode != B_PRED) {
+    m->mbmi.txfm_size = TX_8X8;
+  } else {
+    m->mbmi.txfm_size = TX_4X4;
+  }
+}
+
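+// A motion vector component is coded as a sign, a magnitude class and an
+// integer offset within the class; the fractional and high-precision bits
+// are filled in later by read_nmv_component_fp().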
+static int read_nmv_component(vp9_reader *r,
+                              int rv,
+                              const nmv_component *mvcomp) {
+  int v, s, z, c, o, d;
+  s = vp9_read(r, mvcomp->sign);
+  c = treed_read(r, vp9_mv_class_tree, mvcomp->classes);
+  if (c == MV_CLASS_0) {
+    d = treed_read(r, vp9_mv_class0_tree, mvcomp->class0);
+  } else {
+    int i, b;
+    d = 0;
+    b = c + CLASS0_BITS - 1;  /* number of bits */
+    for (i = 0; i < b; ++i)
+      d |= (vp9_read(r, mvcomp->bits[i]) << i);
+  }
+  o = d << 3;
+
+  z = vp9_get_mv_mag(c, o);
+  v = (s ? -(z + 8) : (z + 8));
+  return v;
+}
+
+static int read_nmv_component_fp(vp9_reader *r,
+                                 int v,
+                                 int rv,
+                                 const nmv_component *mvcomp,
+                                 int usehp) {
+  int s, z, c, o, d, e, f;
+  s = v < 0;
+  z = (s ? -v : v) - 1;       /* magnitude - 1 */
+  z &= ~7;
+
+  c = vp9_get_mv_class(z, &o);
+  d = o >> 3;
+
+  if (c == MV_CLASS_0) {
+    f = treed_read(r, vp9_mv_fp_tree, mvcomp->class0_fp[d]);
+  } else {
+    f = treed_read(r, vp9_mv_fp_tree, mvcomp->fp);
+  }
+  o += (f << 1);
+
+  if (usehp) {
+    if (c == MV_CLASS_0) {
+      e = vp9_read(r, mvcomp->class0_hp);
+    } else {
+      e = vp9_read(r, mvcomp->hp);
+    }
+    o += e;
+  } else {
+    ++o;  /* Note if hp is not used, the default value of the hp bit is 1 */
+  }
+  z = vp9_get_mv_mag(c, o);
+  v = (s ? -(z + 1) : (z + 1));
+  return v;
+}
+
+static void read_nmv(vp9_reader *r, MV *mv, const MV *ref,
+                     const nmv_context *mvctx) {
+  MV_JOINT_TYPE j = treed_read(r, vp9_mv_joint_tree, mvctx->joints);
+  mv->row = mv->col = 0;
+  if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
+    mv->row = read_nmv_component(r, ref->row, &mvctx->comps[0]);
+  }
+  if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {
+    mv->col = read_nmv_component(r, ref->col, &mvctx->comps[1]);
+  }
+}
+
+static void read_nmv_fp(vp9_reader *r, MV *mv, const MV *ref,
+                        const nmv_context *mvctx, int usehp) {
+  MV_JOINT_TYPE j = vp9_get_mv_joint(*mv);
+  usehp = usehp && vp9_use_nmv_hp(ref);
+  if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
+    mv->row = read_nmv_component_fp(r, mv->row, ref->row, &mvctx->comps[0],
+                                    usehp);
+  }
+  if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {
+    mv->col = read_nmv_component_fp(r, mv->col, ref->col, &mvctx->comps[1],
+                                    usehp);
+  }
+  //printf("  %d: %d %d ref: %d %d\n", usehp, mv->row, mv-> col, ref->row, ref->col);
+}
+
+static void update_nmv(vp9_reader *bc, vp9_prob *const p,
+                       const vp9_prob upd_p) {
+  if (vp9_read(bc, upd_p)) {
+#ifdef LOW_PRECISION_MV_UPDATE
+    *p = (vp9_read_literal(bc, 7) << 1) | 1;
+#else
+    *p = (vp9_read_literal(bc, 8));
+#endif
+  }
+}
+
+static void read_nmvprobs(vp9_reader *bc, nmv_context *mvctx,
+                          int usehp) {
+  int i, j, k;
+#ifdef MV_GROUP_UPDATE
+  if (!vp9_read_bit(bc)) return;
+#endif
+  for (j = 0; j < MV_JOINTS - 1; ++j) {
+    update_nmv(bc, &mvctx->joints[j],
+               VP9_NMV_UPDATE_PROB);
+  }
+  for (i = 0; i < 2; ++i) {
+    update_nmv(bc, &mvctx->comps[i].sign,
+               VP9_NMV_UPDATE_PROB);
+    for (j = 0; j < MV_CLASSES - 1; ++j) {
+      update_nmv(bc, &mvctx->comps[i].classes[j],
+                 VP9_NMV_UPDATE_PROB);
+    }
+    for (j = 0; j < CLASS0_SIZE - 1; ++j) {
+      update_nmv(bc, &mvctx->comps[i].class0[j],
+                 VP9_NMV_UPDATE_PROB);
+    }
+    for (j = 0; j < MV_OFFSET_BITS; ++j) {
+      update_nmv(bc, &mvctx->comps[i].bits[j],
+                 VP9_NMV_UPDATE_PROB);
+    }
+  }
+
+  for (i = 0; i < 2; ++i) {
+    for (j = 0; j < CLASS0_SIZE; ++j) {
+      for (k = 0; k < 3; ++k)
+        update_nmv(bc, &mvctx->comps[i].class0_fp[j][k],
+                   VP9_NMV_UPDATE_PROB);
+    }
+    for (j = 0; j < 3; ++j) {
+      update_nmv(bc, &mvctx->comps[i].fp[j],
+                 VP9_NMV_UPDATE_PROB);
+    }
+  }
+
+  if (usehp) {
+    for (i = 0; i < 2; ++i) {
+      update_nmv(bc, &mvctx->comps[i].class0_hp,
+                 VP9_NMV_UPDATE_PROB);
+      update_nmv(bc, &mvctx->comps[i].hp,
+                 VP9_NMV_UPDATE_PROB);
+    }
+  }
+}
+
+// Read the reference frame
+static MV_REFERENCE_FRAME read_ref_frame(VP9D_COMP *pbi,
+                                         vp9_reader *const bc,
+                                         unsigned char segment_id) {
+  MV_REFERENCE_FRAME ref_frame;
+  int seg_ref_active;
+  int seg_ref_count = 0;
+
+  VP9_COMMON *const cm = &pbi->common;
+  MACROBLOCKD *const xd = &pbi->mb;
+
+  seg_ref_active = vp9_segfeature_active(xd,
+                                         segment_id,
+                                         SEG_LVL_REF_FRAME);
+
+  // If segment coding is enabled, does the segment allow for more than
+  // one possible reference frame?
+  if (seg_ref_active) {
+    seg_ref_count = vp9_check_segref(xd, segment_id, INTRA_FRAME) +
+                    vp9_check_segref(xd, segment_id, LAST_FRAME) +
+                    vp9_check_segref(xd, segment_id, GOLDEN_FRAME) +
+                    vp9_check_segref(xd, segment_id, ALTREF_FRAME);
+  }
+
+  // Segment reference frame features are not available, or the segment
+  // allows multiple reference frame options
+  if (!seg_ref_active || (seg_ref_count > 1)) {
+    // Values used in prediction model coding
+    unsigned char prediction_flag;
+    vp9_prob pred_prob;
+    MV_REFERENCE_FRAME pred_ref;
+
+    // Get the context probability for the prediction flag
+    pred_prob = vp9_get_pred_prob(cm, xd, PRED_REF);
+
+    // Read the prediction status flag
+    prediction_flag = (unsigned char)vp9_read(bc, pred_prob);
+
+    // Store the prediction flag.
+    vp9_set_pred_flag(xd, PRED_REF, prediction_flag);
+
+    // Get the predicted reference frame.
+    pred_ref = vp9_get_pred_ref(cm, xd);
+
+    // If correctly predicted then use the predicted value
+    if (prediction_flag) {
+      ref_frame = pred_ref;
+    }
+    // else decode the explicitly coded value
+    else {
+      vp9_prob mod_refprobs[PREDICTION_PROBS];
+      vpx_memcpy(mod_refprobs,
+                 cm->mod_refprobs[pred_ref], sizeof(mod_refprobs));
+
+      // If segment coding is enabled, blank out options that can't occur
+      // by setting the branch probability to 0.
+      if (seg_ref_active) {
+        mod_refprobs[INTRA_FRAME] *=
+          vp9_check_segref(xd, segment_id, INTRA_FRAME);
+        mod_refprobs[LAST_FRAME] *=
+          vp9_check_segref(xd, segment_id, LAST_FRAME);
+        mod_refprobs[GOLDEN_FRAME] *=
+          (vp9_check_segref(xd, segment_id, GOLDEN_FRAME) *
+           vp9_check_segref(xd, segment_id, ALTREF_FRAME));
+      }
+
+      // Default to INTRA_FRAME (value 0)
+      ref_frame = INTRA_FRAME;
+
+      // Do we need to decode the Intra/Inter branch
+      if (mod_refprobs[0])
+        ref_frame = (MV_REFERENCE_FRAME) vp9_read(bc, mod_refprobs[0]);
+      else
+        ref_frame++;
+
+      if (ref_frame) {
+        // Do we need to decode the Last/Gf_Arf branch
+        if (mod_refprobs[1])
+          ref_frame += vp9_read(bc, mod_refprobs[1]);
+        else
+          ref_frame++;
+
+        if (ref_frame > 1) {
+          // Do we need to decode the GF/Arf branch
+          if (mod_refprobs[2])
+            ref_frame += vp9_read(bc, mod_refprobs[2]);
+          else {
+            if (seg_ref_active) {
+              if ((pred_ref == GOLDEN_FRAME) ||
+                  !vp9_check_segref(xd, segment_id, GOLDEN_FRAME)) {
+                ref_frame = ALTREF_FRAME;
+              } else
+                ref_frame = GOLDEN_FRAME;
+            } else
+              ref_frame = (pred_ref == GOLDEN_FRAME)
+                          ? ALTREF_FRAME : GOLDEN_FRAME;
+          }
+        }
+      }
+    }
+  }
+
+  // Segment reference frame features are enabled
+  else {
+    // The reference frame for the MB is considered correctly predicted
+    // if it is signalled at the segment level, for the purposes of the
+    // common prediction model
+    vp9_set_pred_flag(xd, PRED_REF, 1);
+    ref_frame = vp9_get_pred_ref(cm, xd);
+  }
+
+  return (MV_REFERENCE_FRAME)ref_frame;
+}
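+
+/* Worked example of the explicit branch decode above (illustrative
+ * sketch, assuming all three modified probabilities are non-zero):
+ *
+ *   vp9_read(bc, mod_refprobs[0]) == 0             -> INTRA_FRAME
+ *   == 1, vp9_read(bc, mod_refprobs[1]) == 0       -> LAST_FRAME
+ *   == 1, == 1, vp9_read(bc, mod_refprobs[2]) == 0 -> GOLDEN_FRAME
+ *   == 1, == 1, == 1                               -> ALTREF_FRAME
+ *
+ * A branch probability of 0 means the encoder never coded that branch,
+ * so the decoder skips the read and advances down the implied path. */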
+
+#if CONFIG_SUPERBLOCKS
+static MB_PREDICTION_MODE read_sb_mv_ref(vp9_reader *bc, const vp9_prob *p) {
+  return (MB_PREDICTION_MODE) treed_read(bc, vp9_sb_mv_ref_tree, p);
+}
+#endif
+
+static MB_PREDICTION_MODE read_mv_ref(vp9_reader *bc, const vp9_prob *p) {
+  return (MB_PREDICTION_MODE) treed_read(bc, vp9_mv_ref_tree, p);
+}
+
+static B_PREDICTION_MODE sub_mv_ref(vp9_reader *bc, const vp9_prob *p) {
+  return (B_PREDICTION_MODE) treed_read(bc, vp9_sub_mv_ref_tree, p);
+}
+
+#ifdef VPX_MODE_COUNT
+unsigned int vp9_mv_cont_count[5][4] = {
+  { 0, 0, 0, 0 },
+  { 0, 0, 0, 0 },
+  { 0, 0, 0, 0 },
+  { 0, 0, 0, 0 },
+  { 0, 0, 0, 0 }
+};
+#endif
+
+static const unsigned char mbsplit_fill_count[4] = {8, 8, 4, 1};
+static const unsigned char mbsplit_fill_offset[4][16] = {
+  { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15},
+  { 0,  1,  4,  5,  8,  9, 12, 13,  2,  3,   6,  7, 10, 11, 14, 15},
+  { 0,  1,  4,  5,  2,  3,  6,  7,  8,  9,  12, 13, 10, 11, 14, 15},
+  { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15}
+};
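+
+/* The two tables above drive the SPLITMV motion fill: for partitioning
+ * s, each coded subset owns mbsplit_fill_count[s] of the 16 4x4 blocks,
+ * listed consecutively in mbsplit_fill_offset[s]. For example, s == 2
+ * (8x8 quadrants) fills four blocks per subset: {0, 1, 4, 5} top-left,
+ * {2, 3, 6, 7} top-right, {8, 9, 12, 13} bottom-left and
+ * {10, 11, 14, 15} bottom-right, in raster order of 4x4 blocks. */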
+
+static void read_switchable_interp_probs(VP9D_COMP* const pbi,
+                                         BOOL_DECODER* const bc) {
+  VP9_COMMON *const cm = &pbi->common;
+  int i, j;
+  for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) {
+    for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) {
+      cm->fc.switchable_interp_prob[j][i] = vp9_read_literal(bc, 8);
+    }
+  }
+  //printf("DECODER: %d %d\n", cm->fc.switchable_interp_prob[0],
+  //cm->fc.switchable_interp_prob[1]);
+}
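+
+/* Note: the loop above reads (VP9_SWITCHABLE_FILTERS + 1) contexts of
+ * (VP9_SWITCHABLE_FILTERS - 1) tree probabilities each, every one coded
+ * as a raw 8-bit literal rather than as a diff update. */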
+
+static void mb_mode_mv_init(VP9D_COMP *pbi, vp9_reader *bc) {
+  VP9_COMMON *const cm = &pbi->common;
+  nmv_context *const nmvc = &pbi->common.fc.nmvc;
+  MACROBLOCKD *const xd  = &pbi->mb;
+
+  if (cm->frame_type == KEY_FRAME) {
+    if (!cm->kf_ymode_probs_update)
+      cm->kf_ymode_probs_index = vp9_read_literal(bc, 3);
+  } else {
+#if CONFIG_PRED_FILTER
+    cm->pred_filter_mode = (vp9_prob)vp9_read_literal(bc, 2);
+
+    if (cm->pred_filter_mode == 2)
+      cm->prob_pred_filter_off = (vp9_prob)vp9_read_literal(bc, 8);
+#endif
+    if (cm->mcomp_filter_type == SWITCHABLE)
+      read_switchable_interp_probs(pbi, bc);
+    // Decode the baseline probabilities for decoding reference frame
+    cm->prob_intra_coded = (vp9_prob)vp9_read_literal(bc, 8);
+    cm->prob_last_coded  = (vp9_prob)vp9_read_literal(bc, 8);
+    cm->prob_gf_coded    = (vp9_prob)vp9_read_literal(bc, 8);
+
+    // Computes a modified set of probabilities for use when reference
+    // frame prediction fails.
+    vp9_compute_mod_refprobs(cm);
+
+    pbi->common.comp_pred_mode = vp9_read(bc, 128);
+    if (cm->comp_pred_mode)
+      cm->comp_pred_mode += vp9_read(bc, 128);
+    if (cm->comp_pred_mode == HYBRID_PREDICTION) {
+      int i;
+      for (i = 0; i < COMP_PRED_CONTEXTS; i++)
+        cm->prob_comppred[i] = (vp9_prob)vp9_read_literal(bc, 8);
+    }
+
+    if (vp9_read_bit(bc)) {
+      int i = 0;
+
+      do {
+        cm->fc.ymode_prob[i] = (vp9_prob) vp9_read_literal(bc, 8);
+      } while (++i < VP9_YMODES - 1);
+    }
+
+#if CONFIG_NEW_MVREF
+    // Temporary default probabilities for decoding the MV ref id signal
+    vpx_memset(xd->mb_mv_ref_id_probs, 192, sizeof(xd->mb_mv_ref_id_probs));
+#endif
+
+    read_nmvprobs(bc, nmvc, xd->allow_high_precision_mv);
+  }
+}
+
+// This function either reads the segment id for the current macroblock
+// from the bitstream, or, if the value is temporally predicted, uses the
+// predicted value.
+static void read_mb_segment_id(VP9D_COMP *pbi,
+                               int mb_row, int mb_col,
+                               BOOL_DECODER* const bc) {
+  VP9_COMMON *const cm = &pbi->common;
+  MACROBLOCKD *const xd  = &pbi->mb;
+  MODE_INFO *mi = xd->mode_info_context;
+  MB_MODE_INFO *mbmi = &mi->mbmi;
+  int index = mb_row * pbi->common.mb_cols + mb_col;
+
+  if (xd->segmentation_enabled) {
+    if (xd->update_mb_segmentation_map) {
+      // Is temporal coding of the segment id for this mb enabled.
+      if (cm->temporal_update) {
+        // Get the context based probability for reading the
+        // prediction status flag
+        vp9_prob pred_prob =
+          vp9_get_pred_prob(cm, xd, PRED_SEG_ID);
+
+        // Read the prediction status flag
+        unsigned char seg_pred_flag =
+          (unsigned char)vp9_read(bc, pred_prob);
+
+        // Store the prediction flag.
+        vp9_set_pred_flag(xd, PRED_SEG_ID, seg_pred_flag);
+
+        // If the value is flagged as correctly predicted
+        // then use the predicted value
+        if (seg_pred_flag) {
+          mbmi->segment_id = vp9_get_pred_mb_segid(cm, xd, index);
+        }
+        // Else decode it explicitly
+        else {
+          read_mb_segid(bc, mbmi, xd);
+        }
+      }
+      // Normal unpredicted coding mode
+      else {
+        read_mb_segid(bc, mbmi, xd);
+      }
+#if CONFIG_SUPERBLOCKS
+      if (mbmi->encoded_as_sb) {
+        cm->last_frame_seg_map[index] = mbmi->segment_id;
+        if (mb_col + 1 < cm->mb_cols)
+          cm->last_frame_seg_map[index + 1] = mbmi->segment_id;
+        if (mb_row + 1 < cm->mb_rows) {
+          cm->last_frame_seg_map[index + cm->mb_cols] = mbmi->segment_id;
+          if (mb_col + 1 < cm->mb_cols)
+            cm->last_frame_seg_map[index + cm->mb_cols + 1] = mbmi->segment_id;
+        }
+      } else
+#endif
+      {
+        cm->last_frame_seg_map[index] = mbmi->segment_id;
+      }
+    } else {
+#if CONFIG_SUPERBLOCKS
+      if (mbmi->encoded_as_sb) {
+        mbmi->segment_id = cm->last_frame_seg_map[index];
+        if (mb_col < cm->mb_cols - 1)
+          mbmi->segment_id = mbmi->segment_id &&
+                             cm->last_frame_seg_map[index + 1];
+        if (mb_row < cm->mb_rows - 1) {
+          mbmi->segment_id = mbmi->segment_id &&
+                             cm->last_frame_seg_map[index + cm->mb_cols];
+          if (mb_col < cm->mb_cols - 1)
+            mbmi->segment_id = mbmi->segment_id &&
+                               cm->last_frame_seg_map[index + cm->mb_cols + 1];
+        }
+      } else
+#endif
+      {
+        mbmi->segment_id = cm->last_frame_seg_map[index];
+      }
+    }
+  } else {
+    // The encoder explicitly sets the segment_id to 0
+    // when segmentation is disabled
+    mbmi->segment_id = 0;
+  }
+}
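+
+/* Cost intuition for the path above (illustrative): with temporal
+ * updates enabled, a correctly predicted MB spends a single bool read
+ * on the prediction flag; only a mispredicted MB pays for an explicit
+ * read_mb_segid() decode of the segment id itself. */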
+
+static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
+                             MODE_INFO *prev_mi,
+                             int mb_row, int mb_col,
+                             BOOL_DECODER* const bc) {
+  VP9_COMMON *const cm = &pbi->common;
+  nmv_context *const nmvc = &pbi->common.fc.nmvc;
+  const int mis = pbi->common.mode_info_stride;
+  MACROBLOCKD *const xd  = &pbi->mb;
+
+  int_mv *const mv = &mbmi->mv;
+  int mb_to_left_edge;
+  int mb_to_right_edge;
+  int mb_to_top_edge;
+  int mb_to_bottom_edge;
+
+  mb_to_top_edge = xd->mb_to_top_edge;
+  mb_to_bottom_edge = xd->mb_to_bottom_edge;
+  mb_to_top_edge -= LEFT_TOP_MARGIN;
+  mb_to_bottom_edge += RIGHT_BOTTOM_MARGIN;
+  mbmi->need_to_clamp_mvs = 0;
+  mbmi->need_to_clamp_secondmv = 0;
+  mbmi->second_ref_frame = 0;
+  /* Distance of MB to the various image edges. These are specified to
+   * 1/8th pel as they are always compared to MV values in 1/8th pel
+   * units.
+   */
+  xd->mb_to_left_edge =
+    mb_to_left_edge = -((mb_col * 16) << 3);
+  mb_to_left_edge -= LEFT_TOP_MARGIN;
+
+  xd->mb_to_right_edge =
+    mb_to_right_edge = ((pbi->common.mb_cols - 1 - mb_col) * 16) << 3;
+  mb_to_right_edge += RIGHT_BOTTOM_MARGIN;
+
+  // Make sure the MACROBLOCKD mode info pointer is pointed at the
+  // correct entry for the current macroblock.
+  xd->mode_info_context = mi;
+  xd->prev_mode_info_context = prev_mi;
+
+  // Read the macroblock segment id.
+  read_mb_segment_id(pbi, mb_row, mb_col, bc);
+
+  if (pbi->common.mb_no_coeff_skip &&
+      (!vp9_segfeature_active(xd,
+                              mbmi->segment_id, SEG_LVL_EOB) ||
+       (vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_EOB) != 0))) {
+    // Read the macroblock coeff skip flag if this feature is in use,
+    // else default to 0
+    mbmi->mb_skip_coeff = vp9_read(bc, vp9_get_pred_prob(cm, xd, PRED_MBSKIP));
+  } else {
+    if (vp9_segfeature_active(xd,
+                              mbmi->segment_id, SEG_LVL_EOB) &&
+        (vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_EOB) == 0)) {
+      mbmi->mb_skip_coeff = 1;
+    } else
+      mbmi->mb_skip_coeff = 0;
+  }
+
+  // Read the reference frame
+  mbmi->ref_frame = read_ref_frame(pbi, bc, mbmi->segment_id);
+
+  // If the reference frame is an inter frame
+  if (mbmi->ref_frame) {
+    int rct[4];
+    int_mv nearest, nearby, best_mv;
+    int_mv nearest_second, nearby_second, best_mv_second;
+    vp9_prob mv_ref_p [VP9_MVREFS - 1];
+
+#if CONFIG_NEWBESTREFMV
+    int recon_y_stride, recon_yoffset;
+    int recon_uv_stride, recon_uvoffset;
+#endif
+
+    vp9_find_near_mvs(xd, mi,
+                      prev_mi,
+                      &nearest, &nearby, &best_mv, rct,
+                      mbmi->ref_frame, cm->ref_frame_sign_bias);
+
+#if CONFIG_NEWBESTREFMV
+    {
+      int ref_fb_idx;
+      MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame;
+
+      /* Select the appropriate reference frame for this MB */
+      if (ref_frame == LAST_FRAME)
+        ref_fb_idx = cm->lst_fb_idx;
+      else if (ref_frame == GOLDEN_FRAME)
+        ref_fb_idx = cm->gld_fb_idx;
+      else
+        ref_fb_idx = cm->alt_fb_idx;
+
+      recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
+      recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
+
+      recon_yoffset = (mb_row * recon_y_stride * 16) + (mb_col * 16);
+      recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col * 8);
+
+      xd->pre.y_buffer = cm->yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
+      xd->pre.u_buffer = cm->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
+      xd->pre.v_buffer = cm->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
+
+      vp9_find_mv_refs(xd, mi, prev_mi,
+                       ref_frame, mbmi->ref_mvs[ref_frame],
+                       cm->ref_frame_sign_bias);
+
+      vp9_find_best_ref_mvs(xd,
+                            xd->pre.y_buffer,
+                            recon_y_stride,
+                            mbmi->ref_mvs[ref_frame],
+                            &best_mv, &nearest, &nearby);
+    }
+#endif
+
+    vp9_mv_ref_probs(&pbi->common, mv_ref_p, rct);
+
+    // Is the segment level mode feature enabled for this segment
+    if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_MODE)) {
+      mbmi->mode =
+        vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_MODE);
+    } else {
+#if CONFIG_SUPERBLOCKS
+      if (mbmi->encoded_as_sb) {
+        mbmi->mode = read_sb_mv_ref(bc, mv_ref_p);
+      } else
+#endif
+      mbmi->mode = read_mv_ref(bc, mv_ref_p);
+
+      vp9_accum_mv_refs(&pbi->common, mbmi->mode, rct);
+    }
+
+#if CONFIG_PRED_FILTER
+    if (mbmi->mode >= NEARESTMV && mbmi->mode < SPLITMV) {
+      // Is the prediction filter enabled
+      if (cm->pred_filter_mode == 2)
+        mbmi->pred_filter_enabled =
+          vp9_read(bc, cm->prob_pred_filter_off);
+      else
+        mbmi->pred_filter_enabled = cm->pred_filter_mode;
+    }
+#endif
+    if (mbmi->mode >= NEARESTMV && mbmi->mode <= SPLITMV) {
+      if (cm->mcomp_filter_type == SWITCHABLE) {
+        mbmi->interp_filter = vp9_switchable_interp[
+            treed_read(bc, vp9_switchable_interp_tree,
+                       vp9_get_pred_probs(cm, xd, PRED_SWITCHABLE_INTERP))];
+      } else {
+        mbmi->interp_filter = cm->mcomp_filter_type;
+      }
+    }
+
+    if (cm->comp_pred_mode == COMP_PREDICTION_ONLY ||
+        (cm->comp_pred_mode == HYBRID_PREDICTION &&
+         vp9_read(bc, vp9_get_pred_prob(cm, xd, PRED_COMP)))) {
+      /* Since we have 3 reference frames, we can only have 3 unique
+       * combinations of 2 different reference frames (A-G, G-L or
+       * A-L). In the bitstream, we use this to simply derive the second
+       * reference frame from the first reference frame, by saying it's
+       * the next one in the enumerator, and if that's > n_refs, then
+       * the second reference frame is the first one in the enumerator. */
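+      /* Example (illustrative): LAST (1) pairs with GOLDEN (2),
+       * GOLDEN (2) with ALTREF (3), and ALTREF (3) wraps back around
+       * to LAST (1). */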
+      mbmi->second_ref_frame = mbmi->ref_frame + 1;
+      if (mbmi->second_ref_frame == 4)
+        mbmi->second_ref_frame = 1;
+#if CONFIG_NEWBESTREFMV
+      if (mbmi->second_ref_frame) {
+        int second_ref_fb_idx;
+        /* Select the appropriate reference frame for this MB */
+        if (mbmi->second_ref_frame == LAST_FRAME)
+          second_ref_fb_idx = cm->lst_fb_idx;
+        else if (mbmi->second_ref_frame ==
+          GOLDEN_FRAME)
+          second_ref_fb_idx = cm->gld_fb_idx;
+        else
+          second_ref_fb_idx = cm->alt_fb_idx;
+
+        xd->second_pre.y_buffer =
+          cm->yv12_fb[second_ref_fb_idx].y_buffer + recon_yoffset;
+        xd->second_pre.u_buffer =
+          cm->yv12_fb[second_ref_fb_idx].u_buffer + recon_uvoffset;
+        xd->second_pre.v_buffer =
+          cm->yv12_fb[second_ref_fb_idx].v_buffer + recon_uvoffset;
+        vp9_find_near_mvs(xd, mi, prev_mi,
+                          &nearest_second, &nearby_second, &best_mv_second,
+                          rct,
+                          mbmi->second_ref_frame,
+                          cm->ref_frame_sign_bias);
+
+        vp9_find_mv_refs(xd, mi, prev_mi,
+                         mbmi->second_ref_frame,
+                         mbmi->ref_mvs[mbmi->second_ref_frame],
+                         cm->ref_frame_sign_bias);
+
+        vp9_find_best_ref_mvs(xd,
+                              xd->second_pre.y_buffer,
+                              recon_y_stride,
+                              mbmi->ref_mvs[mbmi->second_ref_frame],
+                              &best_mv_second,
+                              &nearest_second,
+                              &nearby_second);
+      }
+#else
+      vp9_find_near_mvs(xd, mi, prev_mi,
+                        &nearest_second, &nearby_second, &best_mv_second,
+                        rct,
+                        mbmi->second_ref_frame,
+                        pbi->common.ref_frame_sign_bias);
+#endif
+    } else {
+      mbmi->second_ref_frame = 0;
+    }
+
+    mbmi->uv_mode = DC_PRED;
+    switch (mbmi->mode) {
+      case SPLITMV: {
+        const int s = mbmi->partitioning =
+                        treed_read(bc, vp9_mbsplit_tree, cm->fc.mbsplit_prob);
+        const int num_p = vp9_mbsplit_count[s];
+        int j = 0;
+        cm->fc.mbsplit_counts[s]++;
+
+        mbmi->need_to_clamp_mvs = 0;
+        do { /* for each subset j */
+          int_mv leftmv, abovemv, second_leftmv, second_abovemv;
+          int_mv blockmv, secondmv;
+          int k;  /* first block in subset j */
+          int mv_contz;
+          int blockmode;
+
+          k = vp9_mbsplit_offset[s][j];
+
+          leftmv.as_int = left_block_mv(mi, k);
+          abovemv.as_int = above_block_mv(mi, k, mis);
+          if (mbmi->second_ref_frame) {
+            second_leftmv.as_int = left_block_second_mv(mi, k);
+            second_abovemv.as_int = above_block_second_mv(mi, k, mis);
+          }
+          mv_contz = vp9_mv_cont(&leftmv, &abovemv);
+          blockmode = sub_mv_ref(bc, cm->fc.sub_mv_ref_prob[mv_contz]);
+          cm->fc.sub_mv_ref_counts[mv_contz][blockmode - LEFT4X4]++;
+
+          switch (blockmode) {
+            case NEW4X4:
+              read_nmv(bc, &blockmv.as_mv, &best_mv.as_mv, nmvc);
+              read_nmv_fp(bc, &blockmv.as_mv, &best_mv.as_mv, nmvc,
+                          xd->allow_high_precision_mv);
+              vp9_increment_nmv(&blockmv.as_mv, &best_mv.as_mv,
+                                &cm->fc.NMVcount, xd->allow_high_precision_mv);
+              blockmv.as_mv.row += best_mv.as_mv.row;
+              blockmv.as_mv.col += best_mv.as_mv.col;
+
+              if (mbmi->second_ref_frame) {
+                read_nmv(bc, &secondmv.as_mv, &best_mv_second.as_mv, nmvc);
+                read_nmv_fp(bc, &secondmv.as_mv, &best_mv_second.as_mv, nmvc,
+                            xd->allow_high_precision_mv);
+                vp9_increment_nmv(&secondmv.as_mv, &best_mv_second.as_mv,
+                                  &cm->fc.NMVcount, xd->allow_high_precision_mv);
+                secondmv.as_mv.row += best_mv_second.as_mv.row;
+                secondmv.as_mv.col += best_mv_second.as_mv.col;
+              }
+#ifdef VPX_MODE_COUNT
+              vp9_mv_cont_count[mv_contz][3]++;
+#endif
+              break;
+            case LEFT4X4:
+              blockmv.as_int = leftmv.as_int;
+              if (mbmi->second_ref_frame)
+                secondmv.as_int = second_leftmv.as_int;
+#ifdef VPX_MODE_COUNT
+              vp9_mv_cont_count[mv_contz][0]++;
+#endif
+              break;
+            case ABOVE4X4:
+              blockmv.as_int = abovemv.as_int;
+              if (mbmi->second_ref_frame)
+                secondmv.as_int = second_abovemv.as_int;
+#ifdef VPX_MODE_COUNT
+              vp9_mv_cont_count[mv_contz][1]++;
+#endif
+              break;
+            case ZERO4X4:
+              blockmv.as_int = 0;
+              if (mbmi->second_ref_frame)
+                secondmv.as_int = 0;
+#ifdef VPX_MODE_COUNT
+              vp9_mv_cont_count[mv_contz][2]++;
+#endif
+              break;
+            default:
+              break;
+          }
+
+          mbmi->need_to_clamp_mvs |= check_mv_bounds(&blockmv,
+                                                     mb_to_left_edge,
+                                                     mb_to_right_edge,
+                                                     mb_to_top_edge,
+                                                     mb_to_bottom_edge);
+          if (mbmi->second_ref_frame) {
+            mbmi->need_to_clamp_mvs |= check_mv_bounds(&secondmv,
+                                                       mb_to_left_edge,
+                                                       mb_to_right_edge,
+                                                       mb_to_top_edge,
+                                                       mb_to_bottom_edge);
+          }
+
+          {
+            /* Fill (uniform) modes, mvs of jth subset.
+             * Must do it here because ensuing subsets can
+             * refer back to us via "left" or "above". */
+            const unsigned char *fill_offset;
+            unsigned int fill_count = mbsplit_fill_count[s];
+
+            fill_offset = &mbsplit_fill_offset[s][(unsigned char)j * mbsplit_fill_count[s]];
+
+            do {
+              mi->bmi[*fill_offset].as_mv.first.as_int = blockmv.as_int;
+              if (mbmi->second_ref_frame)
+                mi->bmi[*fill_offset].as_mv.second.as_int = secondmv.as_int;
+              fill_offset++;
+            } while (--fill_count);
+          }
+
+        } while (++j < num_p);
+      }
+
+      mv->as_int = mi->bmi[15].as_mv.first.as_int;
+      mbmi->mv[1].as_int = mi->bmi[15].as_mv.second.as_int;
+
+      break;  /* done with SPLITMV */
+
+      case NEARMV:
+        mv->as_int = nearby.as_int;
+        /* Clip "next_nearest" so that it does not extend to far out of image */
+        clamp_mv(mv, mb_to_left_edge, mb_to_right_edge,
+                 mb_to_top_edge, mb_to_bottom_edge);
+        if (mbmi->second_ref_frame) {
+          mbmi->mv[1].as_int = nearby_second.as_int;
+          clamp_mv(&mbmi->mv[1], mb_to_left_edge, mb_to_right_edge,
+                   mb_to_top_edge, mb_to_bottom_edge);
+        }
+        break;
+
+      case NEARESTMV:
+        mv->as_int = nearest.as_int;
+        /* Clip "next_nearest" so that it does not extend to far out of image */
+        clamp_mv(mv, mb_to_left_edge, mb_to_right_edge,
+                 mb_to_top_edge, mb_to_bottom_edge);
+        if (mbmi->second_ref_frame) {
+          mbmi->mv[1].as_int = nearest_second.as_int;
+          clamp_mv(&mbmi->mv[1], mb_to_left_edge, mb_to_right_edge,
+                   mb_to_top_edge, mb_to_bottom_edge);
+        }
+        break;
+
+      case ZEROMV:
+        mv->as_int = 0;
+        if (mbmi->second_ref_frame)
+          mbmi->mv[1].as_int = 0;
+        break;
+
+      case NEWMV:
+
+#if CONFIG_NEW_MVREF
+        {
+          int best_index;
+          MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame;
+
+          // Decode the index of the choice.
+          best_index =
+            vp9_read_mv_ref_id(bc, xd->mb_mv_ref_id_probs[ref_frame]);
+
+          best_mv.as_int = mbmi->ref_mvs[ref_frame][best_index].as_int;
+        }
+#endif
+
+        read_nmv(bc, &mv->as_mv, &best_mv.as_mv, nmvc);
+        read_nmv_fp(bc, &mv->as_mv, &best_mv.as_mv, nmvc,
+                    xd->allow_high_precision_mv);
+        vp9_increment_nmv(&mv->as_mv, &best_mv.as_mv, &cm->fc.NMVcount,
+                          xd->allow_high_precision_mv);
+
+        mv->as_mv.row += best_mv.as_mv.row;
+        mv->as_mv.col += best_mv.as_mv.col;
+
+        /* Don't need to check this on NEARMV and NEARESTMV modes
+         * since those modes clamp the MV. The NEWMV mode does not,
+         * so signal to the prediction stage whether special
+         * handling may be required.
+         */
+        mbmi->need_to_clamp_mvs = check_mv_bounds(mv,
+                                                  mb_to_left_edge,
+                                                  mb_to_right_edge,
+                                                  mb_to_top_edge,
+                                                  mb_to_bottom_edge);
+
+        if (mbmi->second_ref_frame) {
+#if CONFIG_NEW_MVREF
+          {
+            int best_index;
+            MV_REFERENCE_FRAME ref_frame = mbmi->second_ref_frame;
+
+            // Decode the index of the choice.
+            best_index =
+              vp9_read_mv_ref_id(bc, xd->mb_mv_ref_id_probs[ref_frame]);
+            best_mv_second.as_int =
+              mbmi->ref_mvs[ref_frame][best_index].as_int;
+          }
+#endif
+
+          read_nmv(bc, &mbmi->mv[1].as_mv, &best_mv_second.as_mv, nmvc);
+          read_nmv_fp(bc, &mbmi->mv[1].as_mv, &best_mv_second.as_mv, nmvc,
+                      xd->allow_high_precision_mv);
+          vp9_increment_nmv(&mbmi->mv[1].as_mv, &best_mv_second.as_mv,
+                            &cm->fc.NMVcount, xd->allow_high_precision_mv);
+          mbmi->mv[1].as_mv.row += best_mv_second.as_mv.row;
+          mbmi->mv[1].as_mv.col += best_mv_second.as_mv.col;
+          mbmi->need_to_clamp_secondmv |=
+            check_mv_bounds(&mbmi->mv[1],
+                            mb_to_left_edge, mb_to_right_edge,
+                            mb_to_top_edge, mb_to_bottom_edge);
+        }
+        break;
+      default:
+        ;
+#if CONFIG_DEBUG
+        assert(0);
+#endif
+    }
+  } else {
+    /* required for left and above block mv */
+    mbmi->mv[0].as_int = 0;
+
+    if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_MODE))
+      mbmi->mode = (MB_PREDICTION_MODE)
+                   vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_MODE);
+    else {
+      // FIXME write using SB mode tree
+      mbmi->mode = (MB_PREDICTION_MODE)
+                   read_ymode(bc, pbi->common.fc.ymode_prob);
+      pbi->common.fc.ymode_counts[mbmi->mode]++;
+    }
+#if CONFIG_COMP_INTRA_PRED
+    mbmi->second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
+#endif
+
+    // If MB mode is BPRED read the block modes
+    if (mbmi->mode == B_PRED) {
+      int j = 0;
+#if CONFIG_COMP_INTRA_PRED
+      int use_comp_pred = vp9_read(bc, 128);
+#endif
+      do {
+        mi->bmi[j].as_mode.first =
+          (B_PREDICTION_MODE)read_bmode(bc, pbi->common.fc.bmode_prob);
+        /*
+        {
+          int p;
+          for (p = 0; p < VP9_BINTRAMODES - 1; ++p)
+            printf(" %d", pbi->common.fc.bmode_prob[p]);
+          printf("\nbmode[%d][%d]: %d\n", pbi->common.current_video_frame, j, mi->bmi[j].as_mode.first);
+        }
+        */
+        pbi->common.fc.bmode_counts[mi->bmi[j].as_mode.first]++;
+#if CONFIG_COMP_INTRA_PRED
+        if (use_comp_pred) {
+          mi->bmi[j].as_mode.second =
+            (B_PREDICTION_MODE)read_bmode(bc, pbi->common.fc.bmode_prob);
+        } else {
+          mi->bmi[j].as_mode.second = (B_PREDICTION_MODE)(B_DC_PRED - 1);
+        }
+#endif
+      } while (++j < 16);
+    }
+
+    if (mbmi->mode == I8X8_PRED) {
+      int i;
+      int mode8x8;
+      for (i = 0; i < 4; i++) {
+        int ib = vp9_i8x8_block[i];
+        mode8x8 = read_i8x8_mode(bc, pbi->common.fc.i8x8_mode_prob);
+        mi->bmi[ib + 0].as_mode.first = mode8x8;
+        mi->bmi[ib + 1].as_mode.first = mode8x8;
+        mi->bmi[ib + 4].as_mode.first = mode8x8;
+        mi->bmi[ib + 5].as_mode.first = mode8x8;
+        pbi->common.fc.i8x8_mode_counts[mode8x8]++;
+#if CONFIG_COMP_INTRA_PRED
+        mi->bmi[ib + 0].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
+        mi->bmi[ib + 1].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
+        mi->bmi[ib + 4].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
+        mi->bmi[ib + 5].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
+#endif
+      }
+    } else {
+      mbmi->uv_mode = (MB_PREDICTION_MODE)read_uv_mode(
+        bc, pbi->common.fc.uv_mode_prob[mbmi->mode]);
+      pbi->common.fc.uv_mode_counts[mbmi->mode][mbmi->uv_mode]++;
+    }
+
+#if CONFIG_COMP_INTRA_PRED
+    mbmi->second_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
+#endif
+  }
+
+#if CONFIG_SUPERBLOCKS
+  if (mbmi->encoded_as_sb)
+    mbmi->txfm_size = TX_8X8;
+  else
+#endif
+  if (cm->txfm_mode == TX_MODE_SELECT && mbmi->mb_skip_coeff == 0 &&
+      ((mbmi->ref_frame == INTRA_FRAME && mbmi->mode <= I8X8_PRED) ||
+       (mbmi->ref_frame != INTRA_FRAME && !(mbmi->mode == SPLITMV &&
+                           mbmi->partitioning == PARTITIONING_4X4)))) {
+    // FIXME(rbultje) code ternary symbol once all experiments are merged
+    mbmi->txfm_size = vp9_read(bc, cm->prob_tx[0]);
+    if (mbmi->txfm_size != TX_4X4 && mbmi->mode != I8X8_PRED &&
+        mbmi->mode != SPLITMV)
+      mbmi->txfm_size += vp9_read(bc, cm->prob_tx[1]);
+  } else if (cm->txfm_mode >= ALLOW_16X16 &&
+      ((mbmi->ref_frame == INTRA_FRAME && mbmi->mode <= TM_PRED) ||
+       (mbmi->ref_frame != INTRA_FRAME && mbmi->mode != SPLITMV))) {
+    mbmi->txfm_size = TX_16X16;
+  } else if (cm->txfm_mode >= ALLOW_8X8 &&
+      (!(mbmi->ref_frame == INTRA_FRAME && mbmi->mode == B_PRED) &&
+       !(mbmi->ref_frame != INTRA_FRAME && mbmi->mode == SPLITMV &&
+         mbmi->partitioning == PARTITIONING_4X4))) {
+    mbmi->txfm_size = TX_8X8;
+  } else {
+    mbmi->txfm_size = TX_4X4;
+  }
+}
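+
+/* Note on the transform-size selection at the end of read_mb_modes_mv:
+ * under TX_MODE_SELECT a first bool read separates TX_4X4 from larger
+ * sizes and, for modes that permit 16x16, a second read separates
+ * TX_8X8 from TX_16X16; all other txfm_modes derive the size from the
+ * mode and partitioning alone, with no extra bits. */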
+
+void vp9_decode_mode_mvs_init(VP9D_COMP *pbi, BOOL_DECODER* const bc) {
+  VP9_COMMON *cm = &pbi->common;
+
+  vpx_memset(cm->mbskip_pred_probs, 0, sizeof(cm->mbskip_pred_probs));
+  if (pbi->common.mb_no_coeff_skip) {
+    int k;
+    for (k = 0; k < MBSKIP_CONTEXTS; ++k)
+      cm->mbskip_pred_probs[k] = (vp9_prob)vp9_read_literal(bc, 8);
+  }
+
+  mb_mode_mv_init(pbi, bc);
+}
+
+void vp9_decode_mb_mode_mv(VP9D_COMP *pbi,
+                           MACROBLOCKD *xd,
+                           int mb_row,
+                           int mb_col,
+                           BOOL_DECODER* const bc) {
+  MODE_INFO *mi = xd->mode_info_context;
+  MODE_INFO *prev_mi = xd->prev_mode_info_context;
+
+  if (pbi->common.frame_type == KEY_FRAME)
+    kfread_modes(pbi, mi, mb_row, mb_col, bc);
+  else
+    read_mb_modes_mv(pbi, mi, &mi->mbmi, prev_mi, mb_row, mb_col, bc);
+}
--- /dev/null
+++ b/vp9/decoder/decodemv.h
@@ -1,0 +1,19 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "onyxd_int.h"
+
+void vp9_decode_mb_mode_mv(VP9D_COMP* const pbi,
+                           MACROBLOCKD* const xd,
+                           int mb_row,
+                           int mb_col,
+                           BOOL_DECODER* const bc);
+void vp9_decode_mode_mvs_init(VP9D_COMP* const pbi, BOOL_DECODER* const bc);
--- /dev/null
+++ b/vp9/decoder/decodframe.c
@@ -1,0 +1,1337 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "onyxd_int.h"
+#include "vp9/common/header.h"
+#include "vp9/common/reconintra.h"
+#include "vp9/common/reconintra4x4.h"
+#include "vp9/common/reconinter.h"
+#include "detokenize.h"
+#include "vp9/common/invtrans.h"
+#include "vp9/common/alloccommon.h"
+#include "vp9/common/entropymode.h"
+#include "vp9/common/quant_common.h"
+#include "vpx_scale/vpxscale.h"
+#include "vpx_scale/yv12extend.h"
+#include "vp9/common/setupintrarecon.h"
+
+#include "decodemv.h"
+#include "vp9/common/extend.h"
+#include "vp9/common/modecont.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp9/common/idct.h"
+#include "dboolhuff.h"
+
+#include "vp9/common/seg_common.h"
+#include "vp9/common/entropy.h"
+#include "vpx_rtcd.h"
+
+#include <assert.h>
+#include <stdio.h>
+
+
+#define COEFCOUNT_TESTING
+
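+/* merge_index() and inv_remap_prob() below invert the encoder-side
+ * remapping used by the probability diff update (a descriptive note,
+ * not normative): merge_index() reconstructs the raw index from its
+ * coded form, and inv_remap_prob() then recenters it around the
+ * previous probability value. */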
+static int merge_index(int v, int n, int modulus) {
+  int max1 = (n - 1 - modulus / 2) / modulus + 1;
+  if (v < max1) v = v * modulus + modulus / 2;
+  else {
+    int w;
+    v -= max1;
+    w = v;
+    v += (v + modulus - modulus / 2) / modulus;
+    while (v % modulus == modulus / 2 ||
+           w != v - (v + modulus - modulus / 2) / modulus) v++;
+  }
+  return v;
+}
+
+static int inv_remap_prob(int v, int m) {
+  const int n = 256;
+  const int modulus = MODULUS_PARAM;
+  int i;
+  v = merge_index(v, n - 1, modulus);
+  if ((m << 1) <= n) {
+    i = vp9_inv_recenter_nonneg(v + 1, m);
+  } else {
+    i = n - 1 - vp9_inv_recenter_nonneg(v + 1, n - 1 - m);
+  }
+  return i;
+}
+
+static vp9_prob read_prob_diff_update(vp9_reader *const bc, int oldp) {
+  int delp = vp9_decode_term_subexp(bc, SUBEXP_PARAM, 255);
+  return (vp9_prob)inv_remap_prob(delp, oldp);
+}
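+
+/* Usage sketch (illustrative only; mirrors the coefficient-probability
+ * loops later in this file): each probability node carries a per-node
+ * update flag, and an updated node replaces its old value with one
+ * derived from a recentered, subexponentially coded difference. */
+#if 0
+static void example_update_prob(vp9_reader *const bc, vp9_prob *const p) {
+  if (vp9_read(bc, COEF_UPDATE_PROB))    /* per-node update flag */
+    *p = read_prob_diff_update(bc, *p);  /* delta around the old value */
+}
+#endif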
+
+void vp9_init_de_quantizer(VP9D_COMP *pbi) {
+  int i;
+  int Q;
+  VP9_COMMON *const pc = &pbi->common;
+
+  for (Q = 0; Q < QINDEX_RANGE; Q++) {
+    pc->Y1dequant[Q][0] = (short)vp9_dc_quant(Q, pc->y1dc_delta_q);
+    pc->Y2dequant[Q][0] = (short)vp9_dc2quant(Q, pc->y2dc_delta_q);
+    pc->UVdequant[Q][0] = (short)vp9_dc_uv_quant(Q, pc->uvdc_delta_q);
+
+    /* All the AC values use a common quantizer for this Q */
+    for (i = 1; i < 16; i++) {
+      int rc = vp9_default_zig_zag1d[i];
+
+      pc->Y1dequant[Q][rc] = (short)vp9_ac_yquant(Q);
+      pc->Y2dequant[Q][rc] = (short)vp9_ac2quant(Q, pc->y2ac_delta_q);
+      pc->UVdequant[Q][rc] = (short)vp9_ac_uv_quant(Q, pc->uvac_delta_q);
+    }
+  }
+}
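+
+/* Resulting table layout (per Q index): position 0 holds the DC
+ * dequantizer; the remaining zig-zag positions 1..15 all receive the
+ * corresponding AC dequantizer for that plane type. */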
+
+static void mb_init_dequantizer(VP9D_COMP *pbi, MACROBLOCKD *xd) {
+  int i;
+  int QIndex;
+  VP9_COMMON *const pc = &pbi->common;
+  int segment_id = xd->mode_info_context->mbmi.segment_id;
+
+  // Set the Q baseline allowing for any segment level adjustment
+  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_ALT_Q)) {
+    /* Abs Value */
+    if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA)
+      QIndex = vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q);
+
+    /* Delta Value */
+    else {
+      QIndex = pc->base_qindex +
+               vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q);
+      /* Clamp to valid range */
+      QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0;
+    }
+  } else
+    QIndex = pc->base_qindex;
+  xd->q_index = QIndex;
+
+  /* Set up the block level dequant pointers */
+  for (i = 0; i < 16; i++) {
+    xd->block[i].dequant = pc->Y1dequant[QIndex];
+  }
+
+#if CONFIG_LOSSLESS
+  if (!QIndex) {
+    pbi->common.rtcd.idct.idct1        = vp9_short_inv_walsh4x4_1_x8_c;
+    pbi->common.rtcd.idct.idct16       = vp9_short_inv_walsh4x4_x8_c;
+    pbi->common.rtcd.idct.idct1_scalar_add  = vp9_dc_only_inv_walsh_add_c;
+    pbi->common.rtcd.idct.iwalsh1      = vp9_short_inv_walsh4x4_1_lossless_c;
+    pbi->common.rtcd.idct.iwalsh16     = vp9_short_inv_walsh4x4_lossless_c;
+    pbi->idct_add            = vp9_dequant_idct_add_lossless_c;
+    pbi->dc_idct_add         = vp9_dequant_dc_idct_add_lossless_c;
+    pbi->dc_idct_add_y_block = vp9_dequant_dc_idct_add_y_block_lossless_c;
+    pbi->idct_add_y_block    = vp9_dequant_idct_add_y_block_lossless_c;
+    pbi->idct_add_uv_block   = vp9_dequant_idct_add_uv_block_lossless_c;
+  } else {
+    pbi->common.rtcd.idct.idct1        = vp9_short_idct4x4llm_1_c;
+    pbi->common.rtcd.idct.idct16       = vp9_short_idct4x4llm_c;
+    pbi->common.rtcd.idct.idct1_scalar_add  = vp9_dc_only_idct_add_c;
+    pbi->common.rtcd.idct.iwalsh1      = vp9_short_inv_walsh4x4_1_c;
+    pbi->common.rtcd.idct.iwalsh16     = vp9_short_inv_walsh4x4_c;
+    pbi->idct_add            = vp9_dequant_idct_add;
+    pbi->dc_idct_add         = vp9_dequant_dc_idct_add;
+    pbi->dc_idct_add_y_block = vp9_dequant_dc_idct_add_y_block;
+    pbi->idct_add_y_block    = vp9_dequant_idct_add_y_block;
+    pbi->idct_add_uv_block   = vp9_dequant_idct_add_uv_block;
+  }
+#else
+  pbi->idct_add            = vp9_dequant_idct_add;
+  pbi->dc_idct_add         = vp9_dequant_dc_idct_add;
+  pbi->dc_idct_add_y_block = vp9_dequant_dc_idct_add_y_block;
+  pbi->idct_add_y_block    = vp9_dequant_idct_add_y_block;
+  pbi->idct_add_uv_block   = vp9_dequant_idct_add_uv_block;
+#endif
+
+  for (i = 16; i < 24; i++) {
+    xd->block[i].dequant = pc->UVdequant[QIndex];
+  }
+
+  xd->block[24].dequant = pc->Y2dequant[QIndex];
+}
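+
+/* Per-block wiring established above: blocks 0..15 (luma) point at
+ * Y1dequant, blocks 16..23 (chroma) at UVdequant, and block 24 (the
+ * second-order DC block) at Y2dequant, all for the segment's QIndex. */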
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define RTCD_VTABLE(x) (&(pbi)->common.rtcd.x)
+#else
+#define RTCD_VTABLE(x) NULL
+#endif
+
+/* skip_recon_mb() is modified: instead of writing the result to the
+ * predictor buffer and then copying it to the dst buffer, we write it
+ * directly to the dst buffer, eliminating an unnecessary copy.
+ */
+static void skip_recon_mb(VP9D_COMP *pbi, MACROBLOCKD *xd) {
+  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
+#if CONFIG_SUPERBLOCKS
+    if (xd->mode_info_context->mbmi.encoded_as_sb) {
+      vp9_build_intra_predictors_sbuv_s(xd);
+      vp9_build_intra_predictors_sby_s(xd);
+    } else {
+#endif
+    vp9_build_intra_predictors_mbuv_s(xd);
+    vp9_build_intra_predictors_mby_s(xd);
+#if CONFIG_SUPERBLOCKS
+    }
+#endif
+  } else {
+#if CONFIG_SUPERBLOCKS
+    if (xd->mode_info_context->mbmi.encoded_as_sb) {
+      vp9_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,
+                                         xd->dst.u_buffer, xd->dst.v_buffer,
+                                         xd->dst.y_stride, xd->dst.uv_stride);
+    } else {
+#endif
+    vp9_build_1st_inter16x16_predictors_mb(xd, xd->dst.y_buffer,
+                                           xd->dst.u_buffer, xd->dst.v_buffer,
+                                           xd->dst.y_stride, xd->dst.uv_stride);
+
+    if (xd->mode_info_context->mbmi.second_ref_frame) {
+      vp9_build_2nd_inter16x16_predictors_mb(xd, xd->dst.y_buffer,
+                                             xd->dst.u_buffer, xd->dst.v_buffer,
+                                             xd->dst.y_stride, xd->dst.uv_stride);
+    }
+#if CONFIG_SUPERBLOCKS
+    }
+#endif
+  }
+}
+
+static void decode_macroblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
+                              int mb_row, unsigned int mb_col,
+                              BOOL_DECODER* const bc) {
+  int eobtotal = 0;
+  MB_PREDICTION_MODE mode;
+  int i;
+  int tx_size;
+  TX_TYPE tx_type;
+  VP9_COMMON *pc = &pbi->common;
+#if CONFIG_SUPERBLOCKS
+  int orig_skip_flag = xd->mode_info_context->mbmi.mb_skip_coeff;
+#endif
+
+  // re-initialize macroblock dequantizer before detokenization
+  if (xd->segmentation_enabled)
+    mb_init_dequantizer(pbi, xd);
+
+  tx_size = xd->mode_info_context->mbmi.txfm_size;
+  mode = xd->mode_info_context->mbmi.mode;
+
+  if (xd->mode_info_context->mbmi.mb_skip_coeff) {
+    vp9_reset_mb_tokens_context(xd);
+#if CONFIG_SUPERBLOCKS
+    if (xd->mode_info_context->mbmi.encoded_as_sb &&
+        (mb_col < pc->mb_cols - 1 || mb_row < pc->mb_rows - 1)) {
+      if (mb_col < pc->mb_cols - 1)
+        xd->above_context++;
+      if (mb_row < pc->mb_rows - 1)
+        xd->left_context++;
+      vp9_reset_mb_tokens_context(xd);
+      if (mb_col < pc->mb_cols - 1)
+        xd->above_context--;
+      if (mb_row < pc->mb_rows - 1)
+        xd->left_context--;
+    }
+#endif
+  } else if (!bool_error(bc)) {
+    for (i = 0; i < 25; i++) {
+      xd->block[i].eob = 0;
+      xd->eobs[i] = 0;
+    }
+    if (tx_size == TX_16X16) {
+      eobtotal = vp9_decode_mb_tokens_16x16(pbi, xd, bc);
+    } else if (tx_size == TX_8X8) {
+      eobtotal = vp9_decode_mb_tokens_8x8(pbi, xd, bc);
+    } else {
+      eobtotal = vp9_decode_mb_tokens(pbi, xd, bc);
+    }
+  }
+
+  //mode = xd->mode_info_context->mbmi.mode;
+  if (pbi->common.frame_type != KEY_FRAME)
+    vp9_setup_interp_filters(xd, xd->mode_info_context->mbmi.interp_filter,
+                             &pbi->common);
+
+  if (eobtotal == 0 && mode != B_PRED && mode != SPLITMV
+      && mode != I8X8_PRED
+      && !bool_error(bc)) {
+    /* Special case: Force the loopfilter to skip when eobtotal and
+     * mb_skip_coeff are zero.
+     */
+    xd->mode_info_context->mbmi.mb_skip_coeff = 1;
+
+#if CONFIG_SUPERBLOCKS
+    if (!xd->mode_info_context->mbmi.encoded_as_sb || orig_skip_flag)
+#endif
+    {
+      skip_recon_mb(pbi, xd);
+      return;
+    }
+  }
+
+  // moved to be performed before detokenization
+//  if (xd->segmentation_enabled)
+//    mb_init_dequantizer(pbi, xd);
+
+  /* do prediction */
+  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
+#if CONFIG_SUPERBLOCKS
+    if (xd->mode_info_context->mbmi.encoded_as_sb) {
+      vp9_build_intra_predictors_sby_s(xd);
+      vp9_build_intra_predictors_sbuv_s(xd);
+    } else
+#endif
+    if (mode != I8X8_PRED) {
+      vp9_build_intra_predictors_mbuv(xd);
+      if (mode != B_PRED) {
+        vp9_build_intra_predictors_mby(xd);
+      }
+    }
+  } else {
+#if CONFIG_SUPERBLOCKS
+    if (xd->mode_info_context->mbmi.encoded_as_sb) {
+      vp9_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,
+                                         xd->dst.u_buffer, xd->dst.v_buffer,
+                                         xd->dst.y_stride, xd->dst.uv_stride);
+    } else
+#endif
+    vp9_build_inter_predictors_mb(xd);
+  }
+
+  /* dequantization and idct */
+  if (mode == I8X8_PRED) {
+    for (i = 0; i < 4; i++) {
+      int ib = vp9_i8x8_block[i];
+      const int iblock[4] = {0, 1, 4, 5};
+      int j;
+      int i8x8mode;
+      BLOCKD *b;
+
+      int idx = (ib & 0x02) ? (ib + 2) : ib;
+
+      short *q  = xd->block[idx].qcoeff;
+      short *dq = xd->block[0].dequant;
+      unsigned char *pre = xd->block[ib].predictor;
+      unsigned char *dst = *(xd->block[ib].base_dst) + xd->block[ib].dst;
+      int stride = xd->dst.y_stride;
+
+      b = &xd->block[ib];
+      i8x8mode = b->bmi.as_mode.first;
+      vp9_intra8x8_predict(b, i8x8mode, b->predictor);
+
+      if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
+        tx_type = get_tx_type(xd, &xd->block[idx]);
+        if (tx_type != DCT_DCT) {
+          vp9_ht_dequant_idct_add_8x8_c(tx_type,
+                                        q, dq, pre, dst, 16, stride);
+        } else {
+          vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride);
+        }
+        q += 64;
+      } else {
+        for (j = 0; j < 4; j++) {
+          b = &xd->block[ib + iblock[j]];
+          vp9_dequant_idct_add(b->qcoeff, b->dequant, b->predictor,
+                                 *(b->base_dst) + b->dst, 16, b->dst_stride);
+        }
+      }
+      b = &xd->block[16 + i];
+      vp9_intra_uv4x4_predict(b, i8x8mode, b->predictor);
+      pbi->idct_add(b->qcoeff, b->dequant, b->predictor,
+                    *(b->base_dst) + b->dst, 8, b->dst_stride);
+      b = &xd->block[20 + i];
+      vp9_intra_uv4x4_predict(b, i8x8mode, b->predictor);
+      pbi->idct_add(b->qcoeff, b->dequant, b->predictor,
+                    *(b->base_dst) + b->dst, 8, b->dst_stride);
+    }
+  } else if (mode == B_PRED) {
+    for (i = 0; i < 16; i++) {
+      BLOCKD *b = &xd->block[i];
+      int b_mode = xd->mode_info_context->bmi[i].as_mode.first;
+#if CONFIG_COMP_INTRA_PRED
+      int b_mode2 = xd->mode_info_context->bmi[i].as_mode.second;
+
+      if (b_mode2 == (B_PREDICTION_MODE)(B_DC_PRED - 1)) {
+#endif
+        vp9_intra4x4_predict(b, b_mode, b->predictor);
+#if CONFIG_COMP_INTRA_PRED
+      } else {
+        vp9_comp_intra4x4_predict(b, b_mode, b_mode2, b->predictor);
+      }
+#endif
+
+      tx_type = get_tx_type(xd, b);
+      if (tx_type != DCT_DCT) {
+        vp9_ht_dequant_idct_add_c(tx_type, b->qcoeff,
+                                  b->dequant, b->predictor,
+                                  *(b->base_dst) + b->dst, 16, b->dst_stride);
+      } else {
+        vp9_dequant_idct_add(b->qcoeff, b->dequant, b->predictor,
+                               *(b->base_dst) + b->dst, 16, b->dst_stride);
+      }
+    }
+  } else if (mode == SPLITMV) {
+    if (tx_size == TX_8X8) {
+      vp9_dequant_idct_add_y_block_8x8(xd->qcoeff, xd->block[0].dequant,
+                                         xd->predictor, xd->dst.y_buffer,
+                                         xd->dst.y_stride, xd->eobs, xd);
+    } else {
+      pbi->idct_add_y_block(xd->qcoeff, xd->block[0].dequant,
+                                       xd->predictor, xd->dst.y_buffer,
+                                       xd->dst.y_stride, xd->eobs);
+    }
+  } else {
+    BLOCKD *b = &xd->block[24];
+
+    if (tx_size == TX_16X16) {
+      BLOCKD *bd = &xd->block[0];
+      tx_type = get_tx_type(xd, bd);
+      if (tx_type != DCT_DCT) {
+        vp9_ht_dequant_idct_add_16x16_c(tx_type, xd->qcoeff,
+                                        xd->block[0].dequant, xd->predictor,
+                                        xd->dst.y_buffer, 16, xd->dst.y_stride);
+      } else {
+        vp9_dequant_idct_add_16x16(xd->qcoeff, xd->block[0].dequant,
+                                     xd->predictor, xd->dst.y_buffer,
+                                     16, xd->dst.y_stride);
+      }
+    } else if (tx_size == TX_8X8) {
+#if CONFIG_SUPERBLOCKS
+      void *orig = xd->mode_info_context;
+      int n, num = xd->mode_info_context->mbmi.encoded_as_sb ? 4 : 1;
+      for (n = 0; n < num; n++) {
+        int x_idx = n & 1, y_idx = n >> 1;
+        if (num == 4 && (mb_col + x_idx >= pc->mb_cols ||
+                         mb_row + y_idx >= pc->mb_rows))
+          continue;
+
+        if (n != 0) {
+          for (i = 0; i < 25; i++) {
+            xd->block[i].eob = 0;
+            xd->eobs[i] = 0;
+          }
+          xd->above_context = pc->above_context + mb_col + (n & 1);
+          xd->left_context = pc->left_context + (n >> 1);
+          xd->mode_info_context = orig;
+          xd->mode_info_context += (n & 1);
+          xd->mode_info_context += (n >> 1) * pc->mode_info_stride;
+          if (!orig_skip_flag) {
+            eobtotal = vp9_decode_mb_tokens_8x8(pbi, xd, bc);
+            if (eobtotal == 0) // skip loopfilter
+              xd->mode_info_context->mbmi.mb_skip_coeff = 1;
+          } else {
+            vp9_reset_mb_tokens_context(xd);
+          }
+        }
+
+        if (xd->mode_info_context->mbmi.mb_skip_coeff)
+          continue; // only happens for SBs, which are already in dest buffer
+#endif
+      vp9_dequantize_b_2x2(b);
+      IDCT_INVOKE(RTCD_VTABLE(idct), ihaar2)(&b->dqcoeff[0], b->diff, 8);
+      // The 2nd order block is zeroed after the inverse transform.
+      ((int *)b->qcoeff)[0] = 0;
+      ((int *)b->qcoeff)[1] = 0;
+      ((int *)b->qcoeff)[2] = 0;
+      ((int *)b->qcoeff)[3] = 0;
+      ((int *)b->qcoeff)[4] = 0;
+      ((int *)b->qcoeff)[5] = 0;
+      ((int *)b->qcoeff)[6] = 0;
+      ((int *)b->qcoeff)[7] = 0;
+#if CONFIG_SUPERBLOCKS
+      if (xd->mode_info_context->mbmi.encoded_as_sb) {
+        vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(xd->qcoeff,
+          xd->block[0].dequant,
+          xd->dst.y_buffer + (n >> 1) * 16 * xd->dst.y_stride + (n & 1) * 16,
+          xd->dst.y_stride, xd->eobs, xd->block[24].diff, xd);
+        // do UV inline also
+        vp9_dequant_idct_add_uv_block_8x8_inplace_c(xd->qcoeff + 16 * 16,
+          xd->block[16].dequant,
+          xd->dst.u_buffer + (n >> 1) * 8 * xd->dst.uv_stride + (n & 1) * 8,
+          xd->dst.v_buffer + (n >> 1) * 8 * xd->dst.uv_stride + (n & 1) * 8,
+          xd->dst.uv_stride, xd->eobs + 16, xd);
+      } else
+#endif
+        vp9_dequant_dc_idct_add_y_block_8x8(xd->qcoeff,
+          xd->block[0].dequant, xd->predictor, xd->dst.y_buffer,
+          xd->dst.y_stride, xd->eobs, xd->block[24].diff, xd);
+#if CONFIG_SUPERBLOCKS
+      }
+      xd->mode_info_context = orig;
+#endif
+    } else {
+      vp9_dequantize_b(b);
+      if (xd->eobs[24] > 1) {
+        IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff);
+        ((int *)b->qcoeff)[0] = 0;
+        ((int *)b->qcoeff)[1] = 0;
+        ((int *)b->qcoeff)[2] = 0;
+        ((int *)b->qcoeff)[3] = 0;
+        ((int *)b->qcoeff)[4] = 0;
+        ((int *)b->qcoeff)[5] = 0;
+        ((int *)b->qcoeff)[6] = 0;
+        ((int *)b->qcoeff)[7] = 0;
+      } else {
+        IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff);
+        ((int *)b->qcoeff)[0] = 0;
+      }
+
+      pbi->dc_idct_add_y_block(xd->qcoeff, xd->block[0].dequant, xd->predictor,
+                               xd->dst.y_buffer, xd->dst.y_stride, xd->eobs,
+                               xd->block[24].diff);
+    }
+  }
+
+#if CONFIG_SUPERBLOCKS
+  if (!xd->mode_info_context->mbmi.encoded_as_sb) {
+#endif
+    if ((tx_size == TX_8X8 &&
+         xd->mode_info_context->mbmi.mode != I8X8_PRED &&
+         xd->mode_info_context->mbmi.mode != SPLITMV)
+        || tx_size == TX_16X16
+       )
+      vp9_dequant_idct_add_uv_block_8x8
+          (xd->qcoeff + 16 * 16, xd->block[16].dequant,
+           xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer,
+           xd->dst.uv_stride, xd->eobs + 16, xd);
+    else if (xd->mode_info_context->mbmi.mode != I8X8_PRED)
+      pbi->idct_add_uv_block(xd->qcoeff + 16 * 16, xd->block[16].dequant,
+           xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer,
+           xd->dst.uv_stride, xd->eobs + 16);
+#if CONFIG_SUPERBLOCKS
+  }
+#endif
+}
+
+
+static int get_delta_q(vp9_reader *bc, int prev, int *q_update) {
+  int ret_val = 0;
+
+  if (vp9_read_bit(bc)) {
+    ret_val = vp9_read_literal(bc, 4);
+
+    if (vp9_read_bit(bc))
+      ret_val = -ret_val;
+  }
+
+  /* Trigger a quantizer update if the delta-q value has changed */
+  if (ret_val != prev)
+    *q_update = 1;
+
+  return ret_val;
+}
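+
+/* Wire-format example (illustrative): one flag bit, then, when set, a
+ * 4-bit magnitude and a sign bit, giving deltas in [-15, 15]; e.g. the
+ * bits 1, 0101, 1 decode to -5. */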
+
+#ifdef PACKET_TESTING
+#include <stdio.h>
+FILE *vpxlog = 0;
+#endif
+
+/* Decode a row of Superblocks (2x2 region of MBs) */
+static void
+decode_sb_row(VP9D_COMP *pbi, VP9_COMMON *pc, int mbrow, MACROBLOCKD *xd,
+              BOOL_DECODER* const bc) {
+  int i;
+  int sb_col;
+  int mb_row, mb_col;
+  int recon_yoffset, recon_uvoffset;
+  int ref_fb_idx = pc->lst_fb_idx;
+  int dst_fb_idx = pc->new_fb_idx;
+  int recon_y_stride = pc->yv12_fb[ref_fb_idx].y_stride;
+  int recon_uv_stride = pc->yv12_fb[ref_fb_idx].uv_stride;
+  int row_delta[4] = {  0, +1,  0, -1 };
+  int col_delta[4] = { +1, -1, +1, +1 };
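+  /* The delta tables implement the 2x2 scan used below: from the SB's
+   * top-left MB, step (dx, dy) = (+1, 0) to the top-right, (-1, +1) to
+   * the bottom-left, (+1, 0) to the bottom-right, and (+1, -1) up into
+   * the next SB's top-left column. */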
+  int sb_cols = (pc->mb_cols + 1) >> 1;
+
+  // For a SB there are 2 left contexts, each pertaining to one MB row
+  // within the SB
+  vpx_memset(pc->left_context, 0, sizeof(pc->left_context));
+
+  mb_row = mbrow;
+  mb_col = 0;
+
+  for (sb_col = 0; sb_col < sb_cols; sb_col++) {
+    MODE_INFO *mi = xd->mode_info_context;
+
+#if CONFIG_SUPERBLOCKS
+    mi->mbmi.encoded_as_sb = vp9_read(bc, pc->sb_coded);
+#endif
+
+    // Process the 4 MBs within the SB in the order:
+    // top-left, top-right, bottom-left, bottom-right
+    for (i = 0; i < 4; i++) {
+      int dy = row_delta[i];
+      int dx = col_delta[i];
+      int offset_extended = dy * xd->mode_info_stride + dx;
+
+      xd->mb_index = i;
+
+      mi = xd->mode_info_context;
+      if ((mb_row >= pc->mb_rows) || (mb_col >= pc->mb_cols)) {
+        // MB lies outside frame, skip on to next
+        mb_row += dy;
+        mb_col += dx;
+        xd->mode_info_context += offset_extended;
+        xd->prev_mode_info_context += offset_extended;
+        continue;
+      }
+
+      // Set above context pointer
+      xd->above_context = pc->above_context + mb_col;
+      xd->left_context = pc->left_context + (i >> 1);
+
+      /* Distance of Mb to the various image edges.
+       * These are specified to 8th pel as they are always compared to
+       * values that are in 1/8th pel units
+       */
+      xd->mb_to_top_edge = -((mb_row * 16) << 3);
+      xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
+
+      xd->mb_to_left_edge = -((mb_col * 16) << 3);
+      xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
+
+      xd->up_available = (mb_row != 0);
+      xd->left_available = (mb_col != 0);
+
+      recon_yoffset = (mb_row * recon_y_stride * 16) + (mb_col * 16);
+      recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col * 8);
+
+      xd->dst.y_buffer = pc->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
+      xd->dst.u_buffer = pc->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
+      xd->dst.v_buffer = pc->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
+
+#if CONFIG_SUPERBLOCKS
+      if (i)
+        mi->mbmi.encoded_as_sb = 0;
+#endif
+      vp9_decode_mb_mode_mv(pbi, xd, mb_row, mb_col, bc);
+
+      update_blockd_bmi(xd);
+
+      /* Select the appropriate reference frame for this MB */
+      if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
+        ref_fb_idx = pc->lst_fb_idx;
+      else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
+        ref_fb_idx = pc->gld_fb_idx;
+      else
+        ref_fb_idx = pc->alt_fb_idx;
+
+      xd->pre.y_buffer = pc->yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
+      xd->pre.u_buffer = pc->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
+      xd->pre.v_buffer = pc->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
+
+      if (xd->mode_info_context->mbmi.second_ref_frame) {
+        int second_ref_fb_idx;
+
+        /* Select the appropriate reference frame for this MB */
+        if (xd->mode_info_context->mbmi.second_ref_frame == LAST_FRAME)
+          second_ref_fb_idx = pc->lst_fb_idx;
+        else if (xd->mode_info_context->mbmi.second_ref_frame ==
+                 GOLDEN_FRAME)
+          second_ref_fb_idx = pc->gld_fb_idx;
+        else
+          second_ref_fb_idx = pc->alt_fb_idx;
+
+        xd->second_pre.y_buffer =
+          pc->yv12_fb[second_ref_fb_idx].y_buffer + recon_yoffset;
+        xd->second_pre.u_buffer =
+          pc->yv12_fb[second_ref_fb_idx].u_buffer + recon_uvoffset;
+        xd->second_pre.v_buffer =
+          pc->yv12_fb[second_ref_fb_idx].v_buffer + recon_uvoffset;
+      }
+
+      if (xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME) {
+        /* propagate errors from reference frames */
+        xd->corrupted |= pc->yv12_fb[ref_fb_idx].corrupted;
+      }
+
+#if CONFIG_SUPERBLOCKS
+      if (xd->mode_info_context->mbmi.encoded_as_sb) {
+        if (mb_col < pc->mb_cols - 1)
+          mi[1] = mi[0];
+        if (mb_row < pc->mb_rows - 1) {
+          mi[pc->mode_info_stride] = mi[0];
+          if (mb_col < pc->mb_cols - 1)
+            mi[pc->mode_info_stride + 1] = mi[0];
+        }
+      }
+#endif
+      vp9_intra_prediction_down_copy(xd);
+      decode_macroblock(pbi, xd, mb_row, mb_col, bc);
+
+      /* check if the boolean decoder has suffered an error */
+      xd->corrupted |= bool_error(bc);
+
+#if CONFIG_SUPERBLOCKS
+      if (mi->mbmi.encoded_as_sb) {
+        assert(!i);
+        mb_col += 2;
+        xd->mode_info_context += 2;
+        xd->prev_mode_info_context += 2;
+        break;
+      }
+#endif
+
+      // skip to next MB
+      xd->mode_info_context += offset_extended;
+      xd->prev_mode_info_context += offset_extended;
+      mb_row += dy;
+      mb_col += dx;
+    }
+  }
+
+  /* skip prediction column */
+  xd->mode_info_context += 1 - (pc->mb_cols & 0x1) + xd->mode_info_stride;
+  xd->prev_mode_info_context += 1 - (pc->mb_cols & 0x1) + xd->mode_info_stride;
+}
+
+static unsigned int read_partition_size(const unsigned char *cx_size) {
+  const unsigned int size =
+    cx_size[0] + (cx_size[1] << 8) + (cx_size[2] << 16);
+  return size;
+}
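+
+/* The partition size is stored as a 24-bit little-endian value; e.g.
+ * the bytes {0x34, 0x12, 0x00} decode to 0x001234 (4660). */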
+
+static int read_is_valid(const unsigned char *start,
+                         size_t               len,
+                         const unsigned char *end) {
+  return (start + len > start && start + len <= end);
+}
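+
+/* Note: the 'start + len > start' term rejects zero-length reads (and,
+ * in practice, pointer wrap-around) before the range check against
+ * 'end'. */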
+
+
+static void setup_token_decoder(VP9D_COMP *pbi,
+                                const unsigned char *cx_data,
+                                BOOL_DECODER* const bool_decoder) {
+  VP9_COMMON          *pc = &pbi->common;
+  const unsigned char *user_data_end = pbi->Source + pbi->source_sz;
+  const unsigned char *partition;
+
+  ptrdiff_t            partition_size;
+  ptrdiff_t            bytes_left;
+
+  // Set up pointers to token partition
+  partition = cx_data;
+  bytes_left = user_data_end - partition;
+  partition_size = bytes_left;
+
+  /* Validate the calculated partition length. If the buffer
+   * described by the partition can't be fully read, then restrict
+   * it to the portion that can be (for EC mode) or throw an error.
+   */
+  if (!read_is_valid(partition, partition_size, user_data_end)) {
+    vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Truncated packet or corrupt partition "
+                       "%d length", 1);
+  }
+
+  if (vp9_start_decode(bool_decoder, partition, partition_size))
+    vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate bool decoder %d", 1);
+}
+
+static void init_frame(VP9D_COMP *pbi) {
+  VP9_COMMON *const pc = &pbi->common;
+  MACROBLOCKD *const xd  = &pbi->mb;
+
+  if (pc->frame_type == KEY_FRAME) {
+    /* Various keyframe initializations */
+    vp9_init_mv_probs(pc);
+
+    vp9_init_mbmode_probs(pc);
+    vp9_default_bmode_probs(pc->fc.bmode_prob);
+
+    vp9_default_coef_probs(pc);
+    vp9_kf_default_bmode_probs(pc->kf_bmode_prob);
+
+    // Reset the segment feature data to the default stats:
+    // Features disabled, 0, with delta coding (Default state).
+    vp9_clearall_segfeatures(xd);
+
+    xd->mb_segment_abs_delta = SEGMENT_DELTADATA;
+
+    /* Reset the mode/ref deltas for the loop filter */
+    vpx_memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas));
+    vpx_memset(xd->mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas));
+
+    /* All buffers are implicitly updated on key frames. */
+    pc->refresh_golden_frame = 1;
+    pc->refresh_alt_ref_frame = 1;
+    pc->copy_buffer_to_gf = 0;
+    pc->copy_buffer_to_arf = 0;
+
+    /* Note that Golden and Altref modes cannot be used on a key frame so
+     * ref_frame_sign_bias[] is undefined and meaningless
+     */
+    pc->ref_frame_sign_bias[GOLDEN_FRAME] = 0;
+    pc->ref_frame_sign_bias[ALTREF_FRAME] = 0;
+
+    vp9_init_mode_contexts(&pbi->common);
+    vpx_memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc));
+    vpx_memcpy(&pc->lfc_a, &pc->fc, sizeof(pc->fc));
+
+    vpx_memcpy(pbi->common.fc.vp8_mode_contexts,
+               pbi->common.fc.mode_context,
+               sizeof(pbi->common.fc.mode_context));
+    vpx_memset(pc->prev_mip, 0,
+               (pc->mb_cols + 1) * (pc->mb_rows + 1)* sizeof(MODE_INFO));
+    vpx_memset(pc->mip, 0,
+               (pc->mb_cols + 1) * (pc->mb_rows + 1)* sizeof(MODE_INFO));
+
+    vp9_update_mode_info_border(pc, pc->mip);
+    vp9_update_mode_info_in_image(pc, pc->mi);
+
+  } else {
+
+    if (!pc->use_bilinear_mc_filter)
+      pc->mcomp_filter_type = EIGHTTAP;
+    else
+      pc->mcomp_filter_type = BILINEAR;
+
+    /* To enable choice of different interpolation filters */
+    vp9_setup_interp_filters(xd, pc->mcomp_filter_type, pc);
+  }
+
+  xd->mode_info_context = pc->mi;
+  xd->prev_mode_info_context = pc->prev_mi;
+  xd->frame_type = pc->frame_type;
+  xd->mode_info_context->mbmi.mode = DC_PRED;
+  xd->mode_info_stride = pc->mode_info_stride;
+  xd->corrupted = 0; /* init without corruption */
+
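+  /* In full-pixel-only mode the three fractional (eighth-pel) bits of each
+   * motion vector component are masked off.
+   */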
+  xd->fullpixel_mask = 0xffffffff;
+  if (pc->full_pixel)
+    xd->fullpixel_mask = 0xfffffff8;
+
+}
+
+#if 0
+static void read_coef_probs2(VP9D_COMP *pbi) {
+  const vp9_prob grpupd = 192;
+  int i, j, k, l;
+  vp9_reader *const bc = &pbi->bc;
+  VP9_COMMON *const pc = &pbi->common;
+  for (l = 0; l < ENTROPY_NODES; l++) {
+    if (vp9_read(bc, grpupd)) {
+      // printf("Decoding %d\n", l);
+      for (i = 0; i < BLOCK_TYPES; i++)
+        for (j = !i; j < COEF_BANDS; j++)
+          for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
+            if (k >= 3 && ((i == 0 && j == 1) ||
+                           (i > 0 && j == 0)))
+              continue;
+            {
+              vp9_prob *const p = pc->fc.coef_probs [i][j][k] + l;
+              int u = vp9_read(bc, COEF_UPDATE_PROB);
+              if (u) *p = read_prob_diff_update(bc, *p);
+            }
+          }
+    }
+  }
+  if (pbi->common.txfm_mode == ALLOW_8X8) {
+    for (l = 0; l < ENTROPY_NODES; l++) {
+      if (vp9_read(bc, grpupd)) {
+        for (i = 0; i < BLOCK_TYPES_8X8; i++)
+          for (j = !i; j < COEF_BANDS; j++)
+            for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
+              if (k >= 3 && ((i == 0 && j == 1) ||
+                             (i > 0 && j == 0)))
+                continue;
+              {
+                vp9_prob *const p = pc->fc.coef_probs_8x8 [i][j][k] + l;
+
+                int u = vp9_read(bc, COEF_UPDATE_PROB_8X8);
+                if (u) *p = read_prob_diff_update(bc, *p);
+              }
+            }
+      }
+    }
+  }
+}
+#endif
+
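+/* If the per-table update flag is set, each node probability may be
+ * individually replaced by a differentially coded update, signalled per
+ * probability with COEF_UPDATE_PROB.
+ */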
+static void read_coef_probs_common(
+    BOOL_DECODER* const bc,
+    vp9_prob coef_probs[BLOCK_TYPES][COEF_BANDS]
+                       [PREV_COEF_CONTEXTS][ENTROPY_NODES]) {
+  int i, j, k, l;
+
+  if (vp9_read_bit(bc)) {
+    for (i = 0; i < BLOCK_TYPES; i++) {
+      for (j = !i; j < COEF_BANDS; j++) {
+        /* NB: This j loop starts from 1 on block type i == 0 */
+        for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
+          if (k >= 3 && ((i == 0 && j == 1) ||
+                         (i > 0 && j == 0)))
+            continue;
+          for (l = 0; l < ENTROPY_NODES; l++) {
+            vp9_prob *const p = coef_probs[i][j][k] + l;
+
+            if (vp9_read(bc, COEF_UPDATE_PROB)) {
+              *p = read_prob_diff_update(bc, *p);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+static void read_coef_probs(VP9D_COMP *pbi, BOOL_DECODER* const bc) {
+  VP9_COMMON *const pc = &pbi->common;
+
+  read_coef_probs_common(bc, pc->fc.coef_probs);
+  read_coef_probs_common(bc, pc->fc.hybrid_coef_probs);
+
+  if (pbi->common.txfm_mode != ONLY_4X4) {
+    read_coef_probs_common(bc, pc->fc.coef_probs_8x8);
+    read_coef_probs_common(bc, pc->fc.hybrid_coef_probs_8x8);
+  }
+  if (pbi->common.txfm_mode > ALLOW_8X8) {
+    read_coef_probs_common(bc, pc->fc.coef_probs_16x16);
+    read_coef_probs_common(bc, pc->fc.hybrid_coef_probs_16x16);
+  }
+}
+
+int vp9_decode_frame(VP9D_COMP *pbi) {
+  BOOL_DECODER header_bc, residual_bc;
+  VP9_COMMON *const pc = &pbi->common;
+  MACROBLOCKD *const xd  = &pbi->mb;
+  const unsigned char *data = (const unsigned char *)pbi->Source;
+  const unsigned char *data_end = data + pbi->source_sz;
+  ptrdiff_t first_partition_length_in_bytes = 0;
+
+  int mb_row;
+  int i, j;
+  int corrupt_tokens = 0;
+
+  /* start with no corruption of current frame */
+  xd->corrupted = 0;
+  pc->yv12_fb[pc->new_fb_idx].corrupted = 0;
+
+  if (data_end - data < 3) {
+    vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Truncated packet");
+  } else {
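+    /* The first three bytes form an uncompressed header: bit 0 is the
+     * key-frame flag, bits 1-3 the version, bit 4 show_frame, and the
+     * remaining 19 bits the size of the first partition.
+     */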
+    pc->last_frame_type = pc->frame_type;
+    pc->frame_type = (FRAME_TYPE)(data[0] & 1);
+    pc->version = (data[0] >> 1) & 7;
+    pc->show_frame = (data[0] >> 4) & 1;
+    first_partition_length_in_bytes =
+      (data[0] | (data[1] << 8) | (data[2] << 16)) >> 5;
+
+    if (data + first_partition_length_in_bytes > data_end ||
+        data + first_partition_length_in_bytes < data)
+      vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                         "Truncated packet or corrupt partition 0 length");
+
+    data += 3;
+
+    vp9_setup_version(pc);
+
+    if (pc->frame_type == KEY_FRAME) {
+      const int Width = pc->Width;
+      const int Height = pc->Height;
+
+      /* vet via sync code */
+      /* When error concealment is enabled we should only check the sync
+       * code if we have enough bits available
+       */
+      if (data + 3 < data_end) {
+        if (data[0] != 0x9d || data[1] != 0x01 || data[2] != 0x2a)
+          vpx_internal_error(&pc->error, VPX_CODEC_UNSUP_BITSTREAM,
+                             "Invalid frame sync code");
+      }
+
+      /* If error concealment is enabled we should only parse the new size
+       * if we have enough data. Otherwise we will end up with the wrong
+       * size.
+       */
+      if (data + 6 < data_end) {
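+        /* Each dimension is a 14-bit value; the top two bits of its high
+         * byte carry the corresponding scale factor.
+         */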
+        pc->Width = (data[3] | (data[4] << 8)) & 0x3fff;
+        pc->horiz_scale = data[4] >> 6;
+        pc->Height = (data[5] | (data[6] << 8)) & 0x3fff;
+        pc->vert_scale = data[6] >> 6;
+      }
+      data += 7;
+
+      if (Width != pc->Width  ||  Height != pc->Height) {
+        if (pc->Width <= 0) {
+          pc->Width = Width;
+          vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                             "Invalid frame width");
+        }
+
+        if (pc->Height <= 0) {
+          pc->Height = Height;
+          vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                             "Invalid frame height");
+        }
+
+        if (vp9_alloc_frame_buffers(pc, pc->Width, pc->Height))
+          vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
+                             "Failed to allocate frame buffers");
+      }
+    }
+  }
+
+  if ((!pbi->decoded_key_frame && pc->frame_type != KEY_FRAME) ||
+      pc->Width == 0 || pc->Height == 0) {
+    return -1;
+  }
+
+  init_frame(pbi);
+
+  if (vp9_start_decode(&header_bc, data, first_partition_length_in_bytes))
+    vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate bool decoder 0");
+  if (pc->frame_type == KEY_FRAME) {
+    pc->clr_type    = (YUV_TYPE)vp9_read_bit(&header_bc);
+    pc->clamp_type  = (CLAMP_TYPE)vp9_read_bit(&header_bc);
+  }
+
+  /* Is segmentation enabled */
+  xd->segmentation_enabled = (unsigned char)vp9_read_bit(&header_bc);
+
+  if (xd->segmentation_enabled) {
+    // Read whether or not the segmentation map is being explicitly
+    // updated this frame.
+    xd->update_mb_segmentation_map = (unsigned char)vp9_read_bit(&header_bc);
+
+    // If so what method will be used.
+    if (xd->update_mb_segmentation_map) {
+      // Which macro block level features are enabled
+
+      // Read the probs used to decode the segment id for each macro
+      // block.
+      for (i = 0; i < MB_FEATURE_TREE_PROBS; i++) {
+        xd->mb_segment_tree_probs[i] = vp9_read_bit(&header_bc) ?
+            (vp9_prob)vp9_read_literal(&header_bc, 8) : 255;
+      }
+
+      // Read the prediction probs needed to decode the segment id
+      pc->temporal_update = (unsigned char)vp9_read_bit(&header_bc);
+      for (i = 0; i < PREDICTION_PROBS; i++) {
+        if (pc->temporal_update) {
+          pc->segment_pred_probs[i] = vp9_read_bit(&header_bc) ?
+              (vp9_prob)vp9_read_literal(&header_bc, 8) : 255;
+        } else {
+          pc->segment_pred_probs[i] = 255;
+        }
+      }
+    }
+    // Is the segment data being updated
+    xd->update_mb_segmentation_data = (unsigned char)vp9_read_bit(&header_bc);
+
+    if (xd->update_mb_segmentation_data) {
+      int data;
+
+      xd->mb_segment_abs_delta = (unsigned char)vp9_read_bit(&header_bc);
+
+      vp9_clearall_segfeatures(xd);
+
+      // For each segment...
+      for (i = 0; i < MAX_MB_SEGMENTS; i++) {
+        // For each of the segment's features...
+        for (j = 0; j < SEG_LVL_MAX; j++) {
+          // Is the feature enabled
+          if (vp9_read_bit(&header_bc)) {
+            // Update the feature data and mask
+            vp9_enable_segfeature(xd, i, j);
+
+            data = (signed char)vp9_read_literal(
+                     &header_bc, vp9_seg_feature_data_bits(j));
+
+            // Is the segment data signed?
+            if (vp9_is_segfeature_signed(j)) {
+              if (vp9_read_bit(&header_bc))
+                data = -data;
+            }
+          } else
+            data = 0;
+
+          vp9_set_segdata(xd, i, j, data);
+        }
+      }
+    }
+  }
+
+  // Read common prediction model status flag probability updates for the
+  // reference frame
+  if (pc->frame_type == KEY_FRAME) {
+    // Set the prediction probabilities to defaults
+    pc->ref_pred_probs[0] = 120;
+    pc->ref_pred_probs[1] = 80;
+    pc->ref_pred_probs[2] = 40;
+  } else {
+    for (i = 0; i < PREDICTION_PROBS; i++) {
+      if (vp9_read_bit(&header_bc))
+        pc->ref_pred_probs[i] = (vp9_prob)vp9_read_literal(&header_bc, 8);
+    }
+  }
+
+#if CONFIG_SUPERBLOCKS
+  pc->sb_coded = vp9_read_literal(&header_bc, 8);
+#endif
+
+  /* Read the loop filter level and type */
+  pc->txfm_mode = vp9_read_literal(&header_bc, 2);
+  if (pc->txfm_mode == TX_MODE_SELECT) {
+    pc->prob_tx[0] = vp9_read_literal(&header_bc, 8);
+    pc->prob_tx[1] = vp9_read_literal(&header_bc, 8);
+  }
+
+  pc->filter_type = (LOOPFILTERTYPE) vp9_read_bit(&header_bc);
+  pc->filter_level = vp9_read_literal(&header_bc, 6);
+  pc->sharpness_level = vp9_read_literal(&header_bc, 3);
+
+  /* Read in loop filter deltas applied at the MB level based on mode or ref frame. */
+  xd->mode_ref_lf_delta_update = 0;
+  xd->mode_ref_lf_delta_enabled = (unsigned char)vp9_read_bit(&header_bc);
+
+  if (xd->mode_ref_lf_delta_enabled) {
+    /* Do the deltas need to be updated */
+    xd->mode_ref_lf_delta_update = (unsigned char)vp9_read_bit(&header_bc);
+
+    if (xd->mode_ref_lf_delta_update) {
+      /* Send update */
+      for (i = 0; i < MAX_REF_LF_DELTAS; i++) {
+        if (vp9_read_bit(&header_bc)) {
+          /*sign = vp9_read_bit( &header_bc );*/
+          xd->ref_lf_deltas[i] = (signed char)vp9_read_literal(&header_bc, 6);
+
+          if (vp9_read_bit(&header_bc))        /* Apply sign */
+            xd->ref_lf_deltas[i] = -xd->ref_lf_deltas[i];
+        }
+      }
+
+      /* Send update */
+      for (i = 0; i < MAX_MODE_LF_DELTAS; i++) {
+        if (vp9_read_bit(&header_bc)) {
+          /*sign = vp9_read_bit( &header_bc );*/
+          xd->mode_lf_deltas[i] = (signed char)vp9_read_literal(&header_bc, 6);
+
+          if (vp9_read_bit(&header_bc))        /* Apply sign */
+            xd->mode_lf_deltas[i] = -xd->mode_lf_deltas[i];
+        }
+      }
+    }
+  }
+
+  // Dummy read for now
+  vp9_read_literal(&header_bc, 2);
+
+  setup_token_decoder(pbi, data + first_partition_length_in_bytes,
+                      &residual_bc);
+
+  /* Read the default quantizers. */
+  {
+    int Q, q_update;
+
+    Q = vp9_read_literal(&header_bc, QINDEX_BITS);
+    pc->base_qindex = Q;
+    q_update = 0;
+    /* Luma AC uses the base index directly; read deltas for the rest */
+    pc->y1dc_delta_q = get_delta_q(&header_bc, pc->y1dc_delta_q, &q_update);
+    pc->y2dc_delta_q = get_delta_q(&header_bc, pc->y2dc_delta_q, &q_update);
+    pc->y2ac_delta_q = get_delta_q(&header_bc, pc->y2ac_delta_q, &q_update);
+    pc->uvdc_delta_q = get_delta_q(&header_bc, pc->uvdc_delta_q, &q_update);
+    pc->uvac_delta_q = get_delta_q(&header_bc, pc->uvac_delta_q, &q_update);
+
+    if (q_update)
+      vp9_init_de_quantizer(pbi);
+
+    /* MB level dequantizer setup */
+    mb_init_dequantizer(pbi, &pbi->mb);
+  }
+
+  /* Determine if the golden frame or ARF buffer should be updated and how.
+   * For all non key frames the GF and ARF refresh flags and sign bias
+   * flags must be set explicitly.
+   */
+  if (pc->frame_type != KEY_FRAME) {
+    /* Should the GF or ARF be updated from the current frame */
+    pc->refresh_golden_frame = vp9_read_bit(&header_bc);
+    pc->refresh_alt_ref_frame = vp9_read_bit(&header_bc);
+
+    if (pc->refresh_alt_ref_frame) {
+      vpx_memcpy(&pc->fc, &pc->lfc_a, sizeof(pc->fc));
+      vpx_memcpy(pc->fc.vp8_mode_contexts,
+                 pc->fc.mode_context_a,
+                 sizeof(pc->fc.vp8_mode_contexts));
+    } else {
+      vpx_memcpy(&pc->fc, &pc->lfc, sizeof(pc->fc));
+      vpx_memcpy(pc->fc.vp8_mode_contexts,
+                 pc->fc.mode_context,
+                 sizeof(pc->fc.vp8_mode_contexts));
+    }
+
+    /* Buffer to buffer copy flags. */
+    pc->copy_buffer_to_gf = 0;
+
+    if (!pc->refresh_golden_frame)
+      pc->copy_buffer_to_gf = vp9_read_literal(&header_bc, 2);
+
+    pc->copy_buffer_to_arf = 0;
+
+    if (!pc->refresh_alt_ref_frame)
+      pc->copy_buffer_to_arf = vp9_read_literal(&header_bc, 2);
+
+    pc->ref_frame_sign_bias[GOLDEN_FRAME] = vp9_read_bit(&header_bc);
+    pc->ref_frame_sign_bias[ALTREF_FRAME] = vp9_read_bit(&header_bc);
+
+    /* Is high precision mv allowed */
+    xd->allow_high_precision_mv = (unsigned char)vp9_read_bit(&header_bc);
+    // Read the type of subpel filter to use
+    if (vp9_read_bit(&header_bc)) {
+      pc->mcomp_filter_type = SWITCHABLE;
+    } else {
+      pc->mcomp_filter_type = vp9_read_literal(&header_bc, 2);
+    }
+    /* To enable choice of different interpolation filters */
+    vp9_setup_interp_filters(xd, pc->mcomp_filter_type, pc);
+  }
+
+  pc->refresh_entropy_probs = vp9_read_bit(&header_bc);
+  if (pc->refresh_entropy_probs == 0) {
+    vpx_memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc));
+  }
+
+  pc->refresh_last_frame = (pc->frame_type == KEY_FRAME)
+                           || vp9_read_bit(&header_bc);
+
+  if (0) {
+    FILE *z = fopen("decodestats.stt", "a");
+    fprintf(z, "%6d F:%d,G:%d,A:%d,L:%d,Q:%d\n",
+            pc->current_video_frame,
+            pc->frame_type,
+            pc->refresh_golden_frame,
+            pc->refresh_alt_ref_frame,
+            pc->refresh_last_frame,
+            pc->base_qindex);
+    fclose(z);
+  }
+
+  vp9_copy(pbi->common.fc.pre_coef_probs,
+           pbi->common.fc.coef_probs);
+  vp9_copy(pbi->common.fc.pre_hybrid_coef_probs,
+           pbi->common.fc.hybrid_coef_probs);
+  vp9_copy(pbi->common.fc.pre_coef_probs_8x8,
+           pbi->common.fc.coef_probs_8x8);
+  vp9_copy(pbi->common.fc.pre_hybrid_coef_probs_8x8,
+           pbi->common.fc.hybrid_coef_probs_8x8);
+  vp9_copy(pbi->common.fc.pre_coef_probs_16x16,
+           pbi->common.fc.coef_probs_16x16);
+  vp9_copy(pbi->common.fc.pre_hybrid_coef_probs_16x16,
+           pbi->common.fc.hybrid_coef_probs_16x16);
+  vp9_copy(pbi->common.fc.pre_ymode_prob, pbi->common.fc.ymode_prob);
+  vp9_copy(pbi->common.fc.pre_uv_mode_prob, pbi->common.fc.uv_mode_prob);
+  vp9_copy(pbi->common.fc.pre_bmode_prob, pbi->common.fc.bmode_prob);
+  vp9_copy(pbi->common.fc.pre_i8x8_mode_prob, pbi->common.fc.i8x8_mode_prob);
+  vp9_copy(pbi->common.fc.pre_sub_mv_ref_prob, pbi->common.fc.sub_mv_ref_prob);
+  vp9_copy(pbi->common.fc.pre_mbsplit_prob, pbi->common.fc.mbsplit_prob);
+  pbi->common.fc.pre_nmvc = pbi->common.fc.nmvc;
+  vp9_zero(pbi->common.fc.coef_counts);
+  vp9_zero(pbi->common.fc.hybrid_coef_counts);
+  vp9_zero(pbi->common.fc.coef_counts_8x8);
+  vp9_zero(pbi->common.fc.hybrid_coef_counts_8x8);
+  vp9_zero(pbi->common.fc.coef_counts_16x16);
+  vp9_zero(pbi->common.fc.hybrid_coef_counts_16x16);
+  vp9_zero(pbi->common.fc.ymode_counts);
+  vp9_zero(pbi->common.fc.uv_mode_counts);
+  vp9_zero(pbi->common.fc.bmode_counts);
+  vp9_zero(pbi->common.fc.i8x8_mode_counts);
+  vp9_zero(pbi->common.fc.sub_mv_ref_counts);
+  vp9_zero(pbi->common.fc.mbsplit_counts);
+  vp9_zero(pbi->common.fc.NMVcount);
+  vp9_zero(pbi->common.fc.mv_ref_ct);
+  vp9_zero(pbi->common.fc.mv_ref_ct_a);
+
+  read_coef_probs(pbi, &header_bc);
+
+  vpx_memcpy(&xd->pre, &pc->yv12_fb[pc->lst_fb_idx], sizeof(YV12_BUFFER_CONFIG));
+  vpx_memcpy(&xd->dst, &pc->yv12_fb[pc->new_fb_idx], sizeof(YV12_BUFFER_CONFIG));
+
+  // Create the segmentation map structure and set to 0
+  if (!pc->last_frame_seg_map)
+    CHECK_MEM_ERROR(pc->last_frame_seg_map,
+                    vpx_calloc((pc->mb_rows * pc->mb_cols), 1));
+
+  /* set up the new frame for intra coded blocks */
+  vp9_setup_intra_recon(&pc->yv12_fb[pc->new_fb_idx]);
+
+  vp9_setup_block_dptrs(xd);
+
+  vp9_build_block_doffsets(xd);
+
+  /* clear out the coeff buffer */
+  vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
+
+  /* Read the mb_no_coeff_skip flag */
+  pc->mb_no_coeff_skip = (int)vp9_read_bit(&header_bc);
+
+  vp9_decode_mode_mvs_init(pbi, &header_bc);
+
+  vpx_memset(pc->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * pc->mb_cols);
+
+  // Reset the macroblock mode info context to the start of the list
+  xd->mode_info_context = pc->mi;
+  xd->prev_mode_info_context = pc->prev_mi;
+
+  /* Decode a row of superblocks */
+  for (mb_row = 0; mb_row < pc->mb_rows; mb_row += 2) {
+    decode_sb_row(pbi, pc, mb_row, xd, &residual_bc);
+  }
+  corrupt_tokens |= xd->corrupted;
+
+  /* Collect information about decoder corruption. */
+  /* 1. Check first boolean decoder for errors. */
+  pc->yv12_fb[pc->new_fb_idx].corrupted = bool_error(&header_bc);
+  /* 2. Check the macroblock information */
+  pc->yv12_fb[pc->new_fb_idx].corrupted |= corrupt_tokens;
+
+  if (!pbi->decoded_key_frame) {
+    if (pc->frame_type == KEY_FRAME &&
+        !pc->yv12_fb[pc->new_fb_idx].corrupted)
+      pbi->decoded_key_frame = 1;
+    else
+      vpx_internal_error(&pbi->common.error, VPX_CODEC_CORRUPT_FRAME,
+                         "A stream must start with a complete key frame");
+  }
+
+  vp9_adapt_coef_probs(pc);
+  if (pc->frame_type != KEY_FRAME) {
+    vp9_adapt_mode_probs(pc);
+    vp9_adapt_nmv_probs(pc, xd->allow_high_precision_mv);
+    vp9_update_mode_context(&pbi->common);
+  }
+
+  /* If this was a kf or Gf note the Q used */
+  if ((pc->frame_type == KEY_FRAME) ||
+      pc->refresh_golden_frame || pc->refresh_alt_ref_frame) {
+    pc->last_kf_gf_q = pc->base_qindex;
+  }
+  if (pc->refresh_entropy_probs) {
+    if (pc->refresh_alt_ref_frame)
+      vpx_memcpy(&pc->lfc_a, &pc->fc, sizeof(pc->fc));
+    else
+      vpx_memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc));
+  }
+
+#ifdef PACKET_TESTING
+  {
+    FILE *f = fopen("decompressor.VP8", "ab");
+    unsigned int size = residual_bc.pos + header_bc.pos + 8;
+    fwrite((void *) &size, 4, 1, f);
+    fwrite((void *) pbi->Source, size, 1, f);
+    fclose(f);
+  }
+#endif
+  // printf("Frame %d Done\n", frame_count++);
+
+  return 0;
+}
--- /dev/null
+++ b/vp9/decoder/dequantize.c
@@ -1,0 +1,543 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "dequantize.h"
+#include "vp9/common/idct.h"
+#include "vpx_mem/vpx_mem.h"
+#include "onyxd_int.h"
+
+extern void vp9_short_idct4x4llm_c(short *input, short *output, int pitch);
+extern void vp9_short_idct4x4llm_1_c(short *input, short *output, int pitch);
+extern void vp9_short_idct8x8_c(short *input, short *output, int pitch);
+extern void vp9_short_idct8x8_1_c(short *input, short *output, int pitch);
+
+#if CONFIG_LOSSLESS
+extern void vp9_short_inv_walsh4x4_x8_c(short *input, short *output,
+                                        int pitch);
+extern void vp9_short_inv_walsh4x4_1_x8_c(short *input, short *output,
+                                          int pitch);
+#endif
+
+#ifdef DEC_DEBUG
+extern int dec_debug;
+#endif
+
+void vp9_dequantize_b_c(BLOCKD *d) {
+  int i;
+  short *DQ  = d->dqcoeff;
+  short *Q   = d->qcoeff;
+  short *DQC = d->dequant;
+
+  for (i = 0; i < 16; i++) {
+    DQ[i] = Q[i] * DQC[i];
+  }
+}
+
+
+void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, short *input, short *dq,
+                               unsigned char *pred, unsigned char *dest,
+                               int pitch, int stride) {
+  short output[16];
+  short *diff_ptr = output;
+  int r, c;
+  int i;
+
+  for (i = 0; i < 16; i++) {
+    input[i] = dq[i] * input[i];
+  }
+
+  vp9_ihtllm_c(input, output, 4 << 1, tx_type, 4);
+
+  vpx_memset(input, 0, 32);
+
+  for (r = 0; r < 4; r++) {
+    for (c = 0; c < 4; c++) {
+      int a = diff_ptr[c] + pred[c];
+
+      if (a < 0)
+        a = 0;
+
+      if (a > 255)
+        a = 255;
+
+      dest[c] = (unsigned char) a;
+    }
+
+    dest += stride;
+    diff_ptr += 4;
+    pred += pitch;
+  }
+}
+
+void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, short *input, short *dq,
+                                   unsigned char *pred, unsigned char *dest,
+                                   int pitch, int stride) {
+  short output[64];
+  short *diff_ptr = output;
+  int b, r, c;
+  int i;
+  unsigned char *origdest = dest;
+  unsigned char *origpred = pred;
+
+  input[0] = dq[0] * input[0];
+  for (i = 1; i < 64; i++) {
+    input[i] = dq[1] * input[i];
+  }
+
+  vp9_ihtllm_c(input, output, 16, tx_type, 8);
+
+  vpx_memset(input, 0, 128);
+
+  for (b = 0; b < 4; b++) {
+    for (r = 0; r < 4; r++) {
+      for (c = 0; c < 4; c++) {
+        int a = diff_ptr[c] + pred[c];
+
+        if (a < 0)
+          a = 0;
+
+        if (a > 255)
+          a = 255;
+
+        dest[c] = (unsigned char) a;
+      }
+
+      dest += stride;
+      diff_ptr += 8;
+      pred += pitch;
+    }
+    // shift buffer pointers to next 4x4 block in the submacroblock
+    diff_ptr = output + (b + 1) / 2 * 4 * 8 + ((b + 1) % 2) * 4;
+    dest = origdest + (b + 1) / 2 * 4 * stride + ((b + 1) % 2) * 4;
+    pred = origpred + (b + 1) / 2 * 4 * pitch + ((b + 1) % 2) * 4;
+  }
+}
+
+void vp9_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,
+                            unsigned char *dest, int pitch, int stride) {
+  short output[16];
+  short *diff_ptr = output;
+  int r, c;
+  int i;
+
+  for (i = 0; i < 16; i++) {
+    input[i] = dq[i] * input[i];
+  }
+
+  /* the idct halves ( >> 1) the pitch */
+  vp9_short_idct4x4llm_c(input, output, 4 << 1);
+
+  vpx_memset(input, 0, 32);
+
+  for (r = 0; r < 4; r++) {
+    for (c = 0; c < 4; c++) {
+      int a = diff_ptr[c] + pred[c];
+
+      if (a < 0)
+        a = 0;
+
+      if (a > 255)
+        a = 255;
+
+      dest[c] = (unsigned char) a;
+    }
+
+    dest += stride;
+    diff_ptr += 4;
+    pred += pitch;
+  }
+}
+
+void vp9_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,
+                               unsigned char *dest, int pitch, int stride,
+                               int Dc) {
+  int i;
+  short output[16];
+  short *diff_ptr = output;
+  int r, c;
+
+  input[0] = (short)Dc;
+
+  for (i = 1; i < 16; i++) {
+    input[i] = dq[i] * input[i];
+  }
+
+  /* the idct halves ( >> 1) the pitch */
+  vp9_short_idct4x4llm_c(input, output, 4 << 1);
+
+  vpx_memset(input, 0, 32);
+
+  for (r = 0; r < 4; r++) {
+    for (c = 0; c < 4; c++) {
+      int a = diff_ptr[c] + pred[c];
+
+      if (a < 0)
+        a = 0;
+
+      if (a > 255)
+        a = 255;
+
+      dest[c] = (unsigned char) a;
+    }
+
+    dest += stride;
+    diff_ptr += 4;
+    pred += pitch;
+  }
+}
+
+#if CONFIG_LOSSLESS
+void vp9_dequant_idct_add_lossless_c(short *input, short *dq,
+                                     unsigned char *pred, unsigned char *dest,
+                                     int pitch, int stride) {
+  short output[16];
+  short *diff_ptr = output;
+  int r, c;
+  int i;
+
+  for (i = 0; i < 16; i++) {
+    input[i] = dq[i] * input[i];
+  }
+
+  vp9_short_inv_walsh4x4_x8_c(input, output, 4 << 1);
+
+  vpx_memset(input, 0, 32);
+
+  for (r = 0; r < 4; r++) {
+    for (c = 0; c < 4; c++) {
+      int a = diff_ptr[c] + pred[c];
+
+      if (a < 0)
+        a = 0;
+
+      if (a > 255)
+        a = 255;
+
+      dest[c] = (unsigned char) a;
+    }
+
+    dest += stride;
+    diff_ptr += 4;
+    pred += pitch;
+  }
+}
+
+void vp9_dequant_dc_idct_add_lossless_c(short *input, short *dq,
+                                        unsigned char *pred,
+                                        unsigned char *dest,
+                                        int pitch, int stride, int dc) {
+  int i;
+  short output[16];
+  short *diff_ptr = output;
+  int r, c;
+
+  input[0] = (short)dc;
+
+  for (i = 1; i < 16; i++) {
+    input[i] = dq[i] * input[i];
+  }
+
+  vp9_short_inv_walsh4x4_x8_c(input, output, 4 << 1);
+  vpx_memset(input, 0, 32);
+
+  for (r = 0; r < 4; r++) {
+    for (c = 0; c < 4; c++) {
+      int a = diff_ptr[c] + pred[c];
+
+      if (a < 0)
+        a = 0;
+
+      if (a > 255)
+        a = 255;
+
+      dest[c] = (unsigned char) a;
+    }
+
+    dest += stride;
+    diff_ptr += 4;
+    pred += pitch;
+  }
+}
+#endif
+
+void vp9_dequantize_b_2x2_c(BLOCKD *d) {
+  int i;
+  short *DQ  = d->dqcoeff;
+  short *Q   = d->qcoeff;
+  short *DQC = d->dequant;
+
+  for (i = 0; i < 16; i++) {
+    DQ[i] = (short)(Q[i] * DQC[i]);
+  }
+#ifdef DEC_DEBUG
+  if (dec_debug) {
+    int j;
+    printf("Dequantize 2x2\n");
+    for (j = 0; j < 16; j++) printf("%d ", Q[j]);
+    printf("\n");
+    for (j = 0; j < 16; j++) printf("%d ", DQ[j]);
+    printf("\n");
+  }
+#endif
+}
+
+void vp9_dequant_idct_add_8x8_c(short *input, short *dq, unsigned char *pred,
+                                unsigned char *dest, int pitch, int stride) {
+  short output[64];
+  short *diff_ptr = output;
+  int r, c, b;
+  int i;
+  unsigned char *origdest = dest;
+  unsigned char *origpred = pred;
+
+#ifdef DEC_DEBUG
+  if (dec_debug) {
+    int j;
+    printf("Input 8x8\n");
+    for (j = 0; j < 64; j++) {
+      printf("%d ", input[j]);
+      if (j % 8 == 7) printf("\n");
+    }
+  }
+#endif
+
+  input[0] = input[0] * dq[0];
+
+  // dequantize the AC coefficients (dq[1] applies to all but the DC term)
+  for (i = 1; i < 64; i++) {
+    input[i] = input[i] * dq[1];
+  }
+#ifdef DEC_DEBUG
+  if (dec_debug) {
+    int j;
+    printf("Input DQ 8x8\n");
+    for (j = 0; j < 64; j++) {
+      printf("%d ", input[j]);
+      if (j % 8 == 7) printf("\n");
+    }
+  }
+#endif
+
+  // the idct halves ( >> 1) the pitch
+  vp9_short_idct8x8_c(input, output, 16);
+#ifdef DEC_DEBUG
+  if (dec_debug) {
+    int j;
+    printf("Output 8x8\n");
+    for (j = 0; j < 64; j++) {
+      printf("%d ", output[j]);
+      if (j % 8 == 7) printf("\n");
+    }
+  }
+#endif
+
+  vpx_memset(input, 0, 128);  /* clear all 64 16-bit coefficients */
+
+  for (b = 0; b < 4; b++) {
+    for (r = 0; r < 4; r++) {
+      for (c = 0; c < 4; c++) {
+        int a = diff_ptr[c] + pred[c];
+
+        if (a < 0)
+          a = 0;
+
+        if (a > 255)
+          a = 255;
+
+        dest[c] = (unsigned char) a;
+      }
+
+      dest += stride;
+      diff_ptr += 8;
+      pred += pitch;
+    }
+    diff_ptr = output + (b + 1) / 2 * 4 * 8 + (b + 1) % 2 * 4;
+    dest = origdest + (b + 1) / 2 * 4 * stride + (b + 1) % 2 * 4;
+    pred = origpred + (b + 1) / 2 * 4 * pitch + (b + 1) % 2 * 4;
+  }
+#ifdef DEC_DEBUG
+  if (dec_debug) {
+    int k, j;
+    printf("Final 8x8\n");
+    for (j = 0; j < 8; j++) {
+      for (k = 0; k < 8; k++) {
+        printf("%d ", origdest[k]);
+      }
+      printf("\n");
+      origdest += stride;
+    }
+  }
+#endif
+}
+
+void vp9_dequant_dc_idct_add_8x8_c(short *input, short *dq, unsigned char *pred,
+                                   unsigned char *dest, int pitch, int stride,
+                                   int Dc) { // Dc for the 1st order transform in some rare cases
+  short output[64];
+  short *diff_ptr = output;
+  int r, c, b;
+  int i;
+  unsigned char *origdest = dest;
+  unsigned char *origpred = pred;
+
+  input[0] = (short)Dc;  /* Dc is already the reconstructed value and needs
+                          * no dequantization here */
+#ifdef DEC_DEBUG
+  if (dec_debug) {
+    int j;
+    printf("Input 8x8\n");
+    for (j = 0; j < 64; j++) {
+      printf("%d ", input[j]);
+      if (j % 8 == 7) printf("\n");
+    }
+  }
+#endif
+  for (i = 1; i < 64; i++) {
+    input[i] = input[i] * dq[1];
+  }
+
+#ifdef DEC_DEBUG
+  if (dec_debug) {
+    int j;
+    printf("Input DQ 8x8\n");
+    for (j = 0; j < 64; j++) {
+      printf("%d ", input[j]);
+      if (j % 8 == 7) printf("\n");
+    }
+  }
+#endif
+
+  // the idct halves ( >> 1) the pitch
+  vp9_short_idct8x8_c(input, output, 16);
+#ifdef DEC_DEBUG
+  if (dec_debug) {
+    int j;
+    printf("Output 8x8\n");
+    for (j = 0; j < 64; j++) {
+      printf("%d ", output[j]);
+      if (j % 8 == 7) printf("\n");
+    }
+  }
+#endif
+  vpx_memset(input, 0, 128);
+
+  for (b = 0; b < 4; b++) {
+    for (r = 0; r < 4; r++) {
+      for (c = 0; c < 4; c++) {
+        int a = diff_ptr[c] + pred[c];
+
+        if (a < 0)
+          a = 0;
+
+        if (a > 255)
+          a = 255;
+
+        dest[c] = (unsigned char) a;
+      }
+
+      dest += stride;
+      diff_ptr += 8;
+      pred += pitch;
+    }
+    diff_ptr = output + (b + 1) / 2 * 4 * 8 + (b + 1) % 2 * 4;
+    dest = origdest + (b + 1) / 2 * 4 * stride + (b + 1) % 2 * 4;
+    pred = origpred + (b + 1) / 2 * 4 * pitch + (b + 1) % 2 * 4;
+  }
+#ifdef DEC_DEBUG
+  if (dec_debug) {
+    int k, j;
+    printf("Final 8x8\n");
+    for (j = 0; j < 8; j++) {
+      for (k = 0; k < 8; k++) {
+        printf("%d ", origdest[k]);
+      }
+      printf("\n");
+      origdest += stride;
+    }
+  }
+#endif
+}
+
+void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, short *input, short *dq,
+                                     unsigned char *pred, unsigned char *dest,
+                                     int pitch, int stride) {
+  short output[256];
+  short *diff_ptr = output;
+  int r, c, i;
+
+  input[0] = input[0] * dq[0];
+
+  // dequantize the AC coefficients (dq[1] applies to all but the DC term)
+  for (i = 1; i < 256; i++)
+    input[i] = input[i] * dq[1];
+
+  // inverse hybrid transform
+  vp9_ihtllm_c(input, output, 32, tx_type, 16);
+
+  // the idct halves ( >> 1) the pitch
+  // vp9_short_idct16x16_c(input, output, 32);
+
+  vpx_memset(input, 0, 512);
+
+  for (r = 0; r < 16; r++) {
+    for (c = 0; c < 16; c++) {
+      int a = diff_ptr[c] + pred[c];
+
+      if (a < 0)
+        a = 0;
+      else if (a > 255)
+        a = 255;
+
+      dest[c] = (unsigned char) a;
+    }
+
+    dest += stride;
+    diff_ptr += 16;
+    pred += pitch;
+  }
+}
+
+void vp9_dequant_idct_add_16x16_c(short *input, short *dq, unsigned char *pred,
+                                  unsigned char *dest, int pitch, int stride) {
+  short output[256];
+  short *diff_ptr = output;
+  int r, c, i;
+
+  input[0] = input[0] * dq[0];
+
+  // dequantize the AC coefficients (dq[1] applies to all but the DC term)
+  for (i = 1; i < 256; i++)
+    input[i] = input[i] * dq[1];
+
+  // the idct halves ( >> 1) the pitch
+  vp9_short_idct16x16_c(input, output, 32);
+
+  vpx_memset(input, 0, 512);
+
+  for (r = 0; r < 16; r++) {
+    for (c = 0; c < 16; c++) {
+      int a = diff_ptr[c] + pred[c];
+
+      if (a < 0)
+        a = 0;
+      else if (a > 255)
+        a = 255;
+
+      dest[c] = (unsigned char) a;
+    }
+
+    dest += stride;
+    diff_ptr += 16;
+    pred += pitch;
+  }
+}
--- /dev/null
+++ b/vp9/decoder/dequantize.h
@@ -1,0 +1,78 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef DEQUANTIZE_H
+#define DEQUANTIZE_H
+#include "vp9/common/blockd.h"
+
+#if CONFIG_LOSSLESS
+extern void vp9_dequant_idct_add_lossless_c(short *input, short *dq,
+                                            unsigned char *pred,
+                                            unsigned char *output,
+                                            int pitch, int stride);
+extern void vp9_dequant_dc_idct_add_lossless_c(short *input, short *dq,
+                                               unsigned char *pred,
+                                               unsigned char *output,
+                                               int pitch, int stride, int dc);
+extern void vp9_dequant_dc_idct_add_y_block_lossless_c(short *q, short *dq,
+                                                       unsigned char *pre,
+                                                       unsigned char *dst,
+                                                       int stride, char *eobs,
+                                                       short *dc);
+extern void vp9_dequant_idct_add_y_block_lossless_c(short *q, short *dq,
+                                                    unsigned char *pre,
+                                                    unsigned char *dst,
+                                                    int stride, char *eobs);
+extern void vp9_dequant_idct_add_uv_block_lossless_c(short *q, short *dq,
+                                                     unsigned char *pre,
+                                                     unsigned char *dst_u,
+                                                     unsigned char *dst_v,
+                                                     int stride, char *eobs);
+#endif
+
+typedef void (*vp9_dequant_idct_add_fn_t)(short *input, short *dq,
+    unsigned char *pred, unsigned char *output, int pitch, int stride);
+typedef void(*vp9_dequant_dc_idct_add_fn_t)(short *input, short *dq,
+    unsigned char *pred, unsigned char *output, int pitch, int stride, int dc);
+
+typedef void(*vp9_dequant_dc_idct_add_y_block_fn_t)(short *q, short *dq,
+    unsigned char *pre, unsigned char *dst, int stride, char *eobs, short *dc);
+typedef void(*vp9_dequant_idct_add_y_block_fn_t)(short *q, short *dq,
+    unsigned char *pre, unsigned char *dst, int stride, char *eobs);
+typedef void(*vp9_dequant_idct_add_uv_block_fn_t)(short *q, short *dq,
+    unsigned char *pre, unsigned char *dst_u, unsigned char *dst_v, int stride,
+    char *eobs);
+
+void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, short *input, short *dq,
+                                    unsigned char *pred, unsigned char *dest,
+                                    int pitch, int stride);
+
+void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, short *input, short *dq,
+                                   unsigned char *pred, unsigned char *dest,
+                                   int pitch, int stride);
+
+void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, short *input, short *dq,
+                                     unsigned char *pred, unsigned char *dest,
+                                     int pitch, int stride);
+
+#if CONFIG_SUPERBLOCKS
+void vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(short *q, short *dq,
+                                                   unsigned char *dst,
+                                                   int stride, char *eobs,
+                                                   short *dc, MACROBLOCKD *xd);
+void vp9_dequant_idct_add_uv_block_8x8_inplace_c(short *q, short *dq,
+                                                 unsigned char *dstu,
+                                                 unsigned char *dstv,
+                                                 int stride, char *eobs,
+                                                 MACROBLOCKD *xd);
+#endif
+
+#endif
--- /dev/null
+++ b/vp9/decoder/detokenize.c
@@ -1,0 +1,640 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp9/common/type_aliases.h"
+#include "vp9/common/blockd.h"
+#include "onyxd_int.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+#include "detokenize.h"
+
+#include "vp9/common/seg_common.h"
+
+#define BOOL_DATA UINT8
+
+#define OCB_X PREV_COEF_CONTEXTS * ENTROPY_NODES
+
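+/* The coef_bands_x tables below store each coefficient band pre-multiplied
+ * by OCB_X, so a band value can be added directly to a pointer into the
+ * flattened probability tables (one band spans PREV_COEF_CONTEXTS *
+ * ENTROPY_NODES entries).
+ */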
+DECLARE_ALIGNED(16, static const int, coef_bands_x[16]) = {
+  0 * OCB_X, 1 * OCB_X, 2 * OCB_X, 3 * OCB_X,
+  6 * OCB_X, 4 * OCB_X, 5 * OCB_X, 6 * OCB_X,
+  6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X,
+  6 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X
+};
+DECLARE_ALIGNED(16, static const int, coef_bands_x_8x8[64]) = {
+  0 * OCB_X, 1 * OCB_X, 2 * OCB_X, 3 * OCB_X, 5 * OCB_X, 4 * OCB_X, 4 * OCB_X, 5 * OCB_X,
+  5 * OCB_X, 3 * OCB_X, 6 * OCB_X, 3 * OCB_X, 5 * OCB_X, 4 * OCB_X, 6 * OCB_X, 6 * OCB_X,
+  6 * OCB_X, 5 * OCB_X, 5 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X,
+  6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X,
+  6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+};
+
+DECLARE_ALIGNED(16, static const int, coef_bands_x_16x16[256]) = {
+  0 * OCB_X, 1 * OCB_X, 2 * OCB_X, 3 * OCB_X, 5 * OCB_X, 4 * OCB_X, 4 * OCB_X, 5 * OCB_X, 5 * OCB_X, 3 * OCB_X, 6 * OCB_X, 3 * OCB_X, 5 * OCB_X, 4 * OCB_X, 6 * OCB_X, 6 * OCB_X,
+  6 * OCB_X, 5 * OCB_X, 5 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X,
+  6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X
+};
+
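+/* Positions of the nodes in the coefficient token tree, used to index the
+ * per-band probability arrays in decode_coefs().
+ */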
+#define EOB_CONTEXT_NODE            0
+#define ZERO_CONTEXT_NODE           1
+#define ONE_CONTEXT_NODE            2
+#define LOW_VAL_CONTEXT_NODE        3
+#define TWO_CONTEXT_NODE            4
+#define THREE_CONTEXT_NODE          5
+#define HIGH_LOW_CONTEXT_NODE       6
+#define CAT_ONE_CONTEXT_NODE        7
+#define CAT_THREEFOUR_CONTEXT_NODE  8
+#define CAT_THREE_CONTEXT_NODE      9
+#define CAT_FIVE_CONTEXT_NODE       10
+
+#define CAT1_MIN_VAL    5
+#define CAT2_MIN_VAL    7
+#define CAT3_MIN_VAL   11
+#define CAT4_MIN_VAL   19
+#define CAT5_MIN_VAL   35
+#define CAT6_MIN_VAL   67
+#define CAT1_PROB0    159
+#define CAT2_PROB0    145
+#define CAT2_PROB1    165
+
+#define CAT3_PROB0 140
+#define CAT3_PROB1 148
+#define CAT3_PROB2 173
+
+#define CAT4_PROB0 135
+#define CAT4_PROB1 140
+#define CAT4_PROB2 155
+#define CAT4_PROB3 176
+
+#define CAT5_PROB0 130
+#define CAT5_PROB1 134
+#define CAT5_PROB2 141
+#define CAT5_PROB3 157
+#define CAT5_PROB4 180
+
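+/* Extra-bit probabilities for value category 6; the trailing zero
+ * terminates the bit-reading loop in decode_coefs().
+ */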
+static const unsigned char cat6_prob[14] =
+{ 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0 };
+
+void vp9_reset_mb_tokens_context(MACROBLOCKD *xd) {
+  /* Clear the per-MB entropy contexts. When the mode has no Y2 block
+   * (B_PRED, I8X8_PRED or SPLITMV, without a 16x16 transform), the else
+   * branch leaves the Y2 context untouched. */
+  if ((xd->mode_info_context->mbmi.mode != B_PRED &&
+       xd->mode_info_context->mbmi.mode != I8X8_PRED &&
+       xd->mode_info_context->mbmi.mode != SPLITMV) ||
+      xd->mode_info_context->mbmi.txfm_size == TX_16X16) {
+    vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+    vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+  } else {
+    vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) - 1);
+    vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) - 1);
+  }
+}
+
+DECLARE_ALIGNED(16, extern const unsigned char, vp9_norm[256]);
+
+// #define PREV_CONTEXT_INC(val) (2+((val)>2))
+// #define PREV_CONTEXT_INC(val) (vp9_prev_token_class[(val)])
+#define PREV_CONTEXT_INC(val) (vp9_prev_token_class[(val)>10?10:(val)])
+
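+/* Map an absolute coefficient magnitude to its token: literal tokens for
+ * 0..4, then value categories with minimum magnitudes 5, 7, 11, 19, 35
+ * and 67.
+ */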
+static int get_token(int v) {
+  if (v < 0) v = -v;
+  if (v == 0) return ZERO_TOKEN;
+  else if (v == 1) return ONE_TOKEN;
+  else if (v == 2) return TWO_TOKEN;
+  else if (v == 3) return THREE_TOKEN;
+  else if (v == 4) return FOUR_TOKEN;
+  else if (v <= 6) return DCT_VAL_CATEGORY1;
+  else if (v <= 10) return DCT_VAL_CATEGORY2;
+  else if (v <= 18) return DCT_VAL_CATEGORY3;
+  else if (v <= 34) return DCT_VAL_CATEGORY4;
+  else if (v <= 66) return DCT_VAL_CATEGORY5;
+  else return DCT_VAL_CATEGORY6;
+}
+
+static void count_tokens_adaptive_scan(const MACROBLOCKD *xd, INT16 *qcoeff_ptr,
+                                       int block, PLANE_TYPE type,
+                                       TX_TYPE tx_type,
+                                       ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
+                                       int eob, int seg_eob,
+                                       FRAME_CONTEXT *fc) {
+  int c, pt, token, band;
+  const int *scan;
+
+  switch (tx_type) {
+    case ADST_DCT:
+      scan = vp9_row_scan;
+      break;
+
+    case DCT_ADST:
+      scan = vp9_col_scan;
+      break;
+
+    default:
+      scan = vp9_default_zig_zag1d;
+      break;
+  }
+
+  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+  for (c = !type; c < eob; ++c) {
+    int rc = scan[c];
+    int v = qcoeff_ptr[rc];
+    band = vp9_coef_bands[c];
+    token = get_token(v);
+    if (tx_type != DCT_DCT)
+      fc->hybrid_coef_counts[type][band][pt][token]++;
+    else
+      fc->coef_counts[type][band][pt][token]++;
+    pt = vp9_prev_token_class[token];
+  }
+
+  if (eob < seg_eob) {
+    band = vp9_coef_bands[c];
+    if (tx_type != DCT_DCT)
+      fc->hybrid_coef_counts[type][band][pt][DCT_EOB_TOKEN]++;
+    else
+      fc->coef_counts[type][band][pt][DCT_EOB_TOKEN]++;
+  }
+}
+
+static void count_tokens(INT16 *qcoeff_ptr, int block, PLANE_TYPE type,
+                         ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
+                         int eob, int seg_eob, FRAME_CONTEXT *const fc) {
+  int c, pt, token, band;
+  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+  for (c = !type; c < eob; ++c) {
+    int rc = vp9_default_zig_zag1d[c];
+    int v = qcoeff_ptr[rc];
+    band = vp9_coef_bands[c];
+    token = get_token(v);
+    fc->coef_counts[type][band][pt][token]++;
+    pt = vp9_prev_token_class[token];
+  }
+  if (eob < seg_eob) {
+    band = vp9_coef_bands[c];
+    fc->coef_counts[type][band][pt][DCT_EOB_TOKEN]++;
+  }
+}
+
+static void count_tokens_8x8(INT16 *qcoeff_ptr, int block, PLANE_TYPE type,
+                             TX_TYPE tx_type,
+                             ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
+                             int eob, int seg_eob, FRAME_CONTEXT *fc) {
+  int c, pt, token, band;
+  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+  for (c = !type; c < eob; ++c) {
+    int rc = (type == 1 ? vp9_default_zig_zag1d[c] : vp9_default_zig_zag1d_8x8[c]);
+    int v = qcoeff_ptr[rc];
+    band = (type == 1 ? vp9_coef_bands[c] : vp9_coef_bands_8x8[c]);
+    token = get_token(v);
+    if (tx_type != DCT_DCT)
+      fc->hybrid_coef_counts_8x8[type][band][pt][token]++;
+    else
+      fc->coef_counts_8x8[type][band][pt][token]++;
+    pt = vp9_prev_token_class[token];
+  }
+  if (eob < seg_eob) {
+    band = (type == 1 ? vp9_coef_bands[c] : vp9_coef_bands_8x8[c]);
+    if (tx_type != DCT_DCT)
+      fc->hybrid_coef_counts_8x8[type][band][pt][DCT_EOB_TOKEN]++;
+    else
+      fc->coef_counts_8x8[type][band][pt][DCT_EOB_TOKEN]++;
+  }
+}
+
+static void count_tokens_16x16(INT16 *qcoeff_ptr, int block, PLANE_TYPE type,
+                               TX_TYPE tx_type,
+                               ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
+                               int eob, int seg_eob, FRAME_CONTEXT *fc) {
+  int c, pt, token;
+  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+  for (c = !type; c < eob; ++c) {
+    int rc = vp9_default_zig_zag1d_16x16[c];
+    int v = qcoeff_ptr[rc];
+    int band = vp9_coef_bands_16x16[c];
+    token = get_token(v);
+    if (tx_type != DCT_DCT)
+      fc->hybrid_coef_counts_16x16[type][band][pt][token]++;
+    else
+      fc->coef_counts_16x16[type][band][pt][token]++;
+    pt = vp9_prev_token_class[token];
+  }
+  if (eob < seg_eob) {
+    int band = vp9_coef_bands_16x16[c];
+    if (tx_type != DCT_DCT)
+      fc->hybrid_coef_counts_16x16[type][band][pt][DCT_EOB_TOKEN]++;
+    else
+      fc->coef_counts_16x16[type][band][pt][DCT_EOB_TOKEN]++;
+  }
+}
+
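+/* Read a sign bit with probability one half (the range splits evenly) and
+ * return value_to_sign with that sign applied.
+ */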
+static int get_signed(BOOL_DECODER *br, int value_to_sign) {
+  const int split = (br->range + 1) >> 1;
+  const VP9_BD_VALUE bigsplit = (VP9_BD_VALUE)split << (VP9_BD_VALUE_SIZE - 8);
+  int v;
+
+  if (br->count < 0)
+    vp9_bool_decoder_fill(br);
+
+  if (br->value < bigsplit) {
+    br->range = split;
+    v = value_to_sign;
+  } else {
+    br->range = br->range - split;
+    br->value = br->value - bigsplit;
+    v = -value_to_sign;
+  }
+  br->range += br->range;
+  br->value += br->value;
+  --br->count;
+
+  return v;
+}
+
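+/* Store a decoded coefficient: read its sign, write it at the current scan
+ * position, select the probability set implied by its magnitude for the
+ * next token, and advance to the next coefficient.
+ */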
+#define WRITE_COEF_CONTINUE(val)                              \
+  {                                                           \
+    prob = coef_probs + (ENTROPY_NODES*PREV_CONTEXT_INC(val));\
+    qcoeff_ptr[scan[c]] = (INT16) get_signed(br, val);        \
+    c++;                                                      \
+    continue;                                                 \
+  }
+
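+/* Read one extra magnitude bit with the given probability and fold it into
+ * val.
+ */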
+#define ADJUST_COEF(prob, bits_count)  \
+  do {                                 \
+    if (vp9_read(br, prob))            \
+      val += (UINT16)(1 << bits_count);\
+  } while (0);
+
+static int decode_coefs(VP9D_COMP *dx, const MACROBLOCKD *xd,
+                        BOOL_DECODER* const br,
+                        ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
+                        PLANE_TYPE type,
+                        TX_TYPE tx_type,
+                        int seg_eob, INT16 *qcoeff_ptr, int i,
+                        const int *const scan, int block_type,
+                        const int *coef_bands) {
+  FRAME_CONTEXT *const fc = &dx->common.fc;
+  int tmp, c = (type == PLANE_TYPE_Y_NO_DC);
+  const vp9_prob *prob, *coef_probs;
+
+  switch (block_type) {
+    default:
+    case TX_4X4:
+      coef_probs =
+        tx_type != DCT_DCT ? fc->hybrid_coef_probs[type][0][0] :
+        fc->coef_probs[type][0][0];
+      break;
+    case TX_8X8:
+      coef_probs =
+        tx_type != DCT_DCT ? fc->hybrid_coef_probs_8x8[type][0][0] :
+        fc->coef_probs_8x8[type][0][0];
+      break;
+    case TX_16X16:
+      coef_probs =
+        tx_type != DCT_DCT ? fc->hybrid_coef_probs_16x16[type][0][0] :
+        fc->coef_probs_16x16[type][0][0];
+      break;
+  }
+
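+  /* Combine the above and left entropy contexts into the initial token
+   * context.
+   */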
+  VP9_COMBINEENTROPYCONTEXTS(tmp, *a, *l);
+  prob = coef_probs + tmp * ENTROPY_NODES;
+
+  while (1) {
+    int val;
+    const uint8_t *cat6 = cat6_prob;
+    if (c == seg_eob) break;
+    prob += coef_bands[c];
+    if (!vp9_read(br, prob[EOB_CONTEXT_NODE]))
+      break;
+SKIP_START:
+    if (c == seg_eob) break;
+    if (!vp9_read(br, prob[ZERO_CONTEXT_NODE])) {
+      ++c;
+      prob = coef_probs + coef_bands[c];
+      goto SKIP_START;
+    }
+    // ONE_CONTEXT_NODE_0_
+    if (!vp9_read(br, prob[ONE_CONTEXT_NODE])) {
+      prob = coef_probs + ENTROPY_NODES;
+      qcoeff_ptr[scan[c]] = (INT16) get_signed(br, 1);
+      ++c;
+      continue;
+    }
+    // LOW_VAL_CONTEXT_NODE_0_
+    if (!vp9_read(br, prob[LOW_VAL_CONTEXT_NODE])) {
+      if (!vp9_read(br, prob[TWO_CONTEXT_NODE])) {
+        WRITE_COEF_CONTINUE(2);
+      }
+      if (!vp9_read(br, prob[THREE_CONTEXT_NODE])) {
+        WRITE_COEF_CONTINUE(3);
+      }
+      WRITE_COEF_CONTINUE(4);
+    }
+    // HIGH_LOW_CONTEXT_NODE_0_
+    if (!vp9_read(br, prob[HIGH_LOW_CONTEXT_NODE])) {
+      if (!vp9_read(br, prob[CAT_ONE_CONTEXT_NODE])) {
+        val = CAT1_MIN_VAL;
+        ADJUST_COEF(CAT1_PROB0, 0);
+        WRITE_COEF_CONTINUE(val);
+      }
+      val = CAT2_MIN_VAL;
+      ADJUST_COEF(CAT2_PROB1, 1);
+      ADJUST_COEF(CAT2_PROB0, 0);
+      WRITE_COEF_CONTINUE(val);
+    }
+    // CAT_THREEFOUR_CONTEXT_NODE_0_
+    if (!vp9_read(br, prob[CAT_THREEFOUR_CONTEXT_NODE])) {
+      if (!vp9_read(br, prob[CAT_THREE_CONTEXT_NODE])) {
+        val = CAT3_MIN_VAL;
+        ADJUST_COEF(CAT3_PROB2, 2);
+        ADJUST_COEF(CAT3_PROB1, 1);
+        ADJUST_COEF(CAT3_PROB0, 0);
+        WRITE_COEF_CONTINUE(val);
+      }
+      val = CAT4_MIN_VAL;
+      ADJUST_COEF(CAT4_PROB3, 3);
+      ADJUST_COEF(CAT4_PROB2, 2);
+      ADJUST_COEF(CAT4_PROB1, 1);
+      ADJUST_COEF(CAT4_PROB0, 0);
+      WRITE_COEF_CONTINUE(val);
+    }
+    // CAT_FIVE_CONTEXT_NODE_0_:
+    if (!vp9_read(br, prob[CAT_FIVE_CONTEXT_NODE])) {
+      val = CAT5_MIN_VAL;
+      ADJUST_COEF(CAT5_PROB4, 4);
+      ADJUST_COEF(CAT5_PROB3, 3);
+      ADJUST_COEF(CAT5_PROB2, 2);
+      ADJUST_COEF(CAT5_PROB1, 1);
+      ADJUST_COEF(CAT5_PROB0, 0);
+      WRITE_COEF_CONTINUE(val);
+    }
+    val = 0;
+    while (*cat6) {
+      val = (val << 1) | vp9_read(br, *cat6++);
+    }
+    val += CAT6_MIN_VAL;
+    WRITE_COEF_CONTINUE(val);
+  }
+
+  if (block_type == TX_4X4) {
+    count_tokens_adaptive_scan(xd, qcoeff_ptr, i, type, tx_type,
+                               a, l, c, seg_eob, fc);
+  } else if (block_type == TX_8X8) {
+    count_tokens_8x8(qcoeff_ptr, i, type, tx_type,
+                     a, l, c, seg_eob, fc);
+  } else {
+    count_tokens_16x16(qcoeff_ptr, i, type, tx_type,
+                       a, l, c, seg_eob, fc);
+  }
+  return c;
+}
+
+int vp9_decode_mb_tokens_16x16(VP9D_COMP *pbi, MACROBLOCKD *xd,
+                               BOOL_DECODER* const bc) {
+  ENTROPY_CONTEXT* const A = (ENTROPY_CONTEXT *)xd->above_context;
+  ENTROPY_CONTEXT* const L = (ENTROPY_CONTEXT *)xd->left_context;
+
+  char* const eobs = xd->eobs;
+  PLANE_TYPE type;
+  int c, i, eobtotal = 0, seg_eob;
+  const int segment_id = xd->mode_info_context->mbmi.segment_id;
+  const int seg_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB);
+  INT16 *qcoeff_ptr = &xd->qcoeff[0];
+  TX_TYPE tx_type = get_tx_type(xd, &xd->block[0]);
+
+  type = PLANE_TYPE_Y_WITH_DC;
+
+  if (seg_active)
+    seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+  else
+    seg_eob = 256;
+
+  // Luma block
+  {
+    const int* const scan = vp9_default_zig_zag1d_16x16;
+    c = decode_coefs(pbi, xd, bc, A, L, type,
+                     tx_type,
+                     seg_eob, qcoeff_ptr,
+                     0, scan, TX_16X16, coef_bands_x_16x16);
+    eobs[0] = c;
+    A[0] = L[0] = (c != !type);
+    A[1] = A[2] = A[3] = A[0];
+    L[1] = L[2] = L[3] = L[0];
+    eobtotal += c;
+  }
+
+  // 8x8 chroma blocks
+  qcoeff_ptr += 256;
+  type = PLANE_TYPE_UV;
+  tx_type = DCT_DCT;
+  if (seg_active)
+    seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+  else
+    seg_eob = 64;
+  for (i = 16; i < 24; i += 4) {
+    ENTROPY_CONTEXT* const a = A + vp9_block2above_8x8[i];
+    ENTROPY_CONTEXT* const l = L + vp9_block2left_8x8[i];
+    const int* const scan = vp9_default_zig_zag1d_8x8;
+
+    c = decode_coefs(pbi, xd, bc, a, l, type,
+                     tx_type,
+                     seg_eob, qcoeff_ptr,
+                     i, scan, TX_8X8, coef_bands_x_8x8);
+    a[0] = l[0] = ((eobs[i] = c) != !type);
+    a[1] = a[0];
+    l[1] = l[0];
+
+    eobtotal += c;
+    qcoeff_ptr += 64;
+  }
+  vpx_memset(&A[8], 0, sizeof(A[8]));
+  vpx_memset(&L[8], 0, sizeof(L[8]));
+  return eobtotal;
+}
+
+int vp9_decode_mb_tokens_8x8(VP9D_COMP *pbi, MACROBLOCKD *xd,
+                             BOOL_DECODER* const bc) {
+  ENTROPY_CONTEXT *const A = (ENTROPY_CONTEXT *)xd->above_context;
+  ENTROPY_CONTEXT *const L = (ENTROPY_CONTEXT *)xd->left_context;
+
+  char *const eobs = xd->eobs;
+  PLANE_TYPE type;
+  int c, i, eobtotal = 0, seg_eob;
+  const int segment_id = xd->mode_info_context->mbmi.segment_id;
+  const int seg_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB);
+  INT16 *qcoeff_ptr = &xd->qcoeff[0];
+  TX_TYPE tx_type = DCT_DCT;
+
+  int bufthred = (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
+                  xd->mode_info_context->mbmi.mode == SPLITMV) ? 16 : 24;
+  if (xd->mode_info_context->mbmi.mode != B_PRED &&
+      xd->mode_info_context->mbmi.mode != SPLITMV &&
+      xd->mode_info_context->mbmi.mode != I8X8_PRED) {
+    ENTROPY_CONTEXT *const a = A + vp9_block2above_8x8[24];
+    ENTROPY_CONTEXT *const l = L + vp9_block2left_8x8[24];
+    const int *const scan = vp9_default_zig_zag1d;
+    type = PLANE_TYPE_Y2;
+
+    if (seg_active)
+      seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+    else
+      seg_eob = 4;
+    c = decode_coefs(pbi, xd, bc, a, l, type,
+                     tx_type,
+                     seg_eob, qcoeff_ptr + 24 * 16,
+                     24, scan, TX_8X8, coef_bands_x);
+    a[0] = l[0] = ((eobs[24] = c) != !type);
+
+    eobtotal += c - 4;
+
+    type = PLANE_TYPE_Y_NO_DC;
+  } else
+    type = PLANE_TYPE_Y_WITH_DC;
+
+  if (seg_active)
+    seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+  else
+    seg_eob = 64;
+
+  for (i = 0; i < bufthred; i += 4) {
+    ENTROPY_CONTEXT *const a = A + vp9_block2above_8x8[i];
+    ENTROPY_CONTEXT *const l = L + vp9_block2left_8x8[i];
+    const int *const scan = vp9_default_zig_zag1d_8x8;
+    tx_type = DCT_DCT;
+
+    if (i == 16)
+      type = PLANE_TYPE_UV;
+    if (type == PLANE_TYPE_Y_WITH_DC) {
+      tx_type = get_tx_type(xd, xd->block + i);
+    }
+
+    c = decode_coefs(pbi, xd, bc, a, l, type,
+                     tx_type,
+                     seg_eob, qcoeff_ptr,
+                     i, scan, TX_8X8, coef_bands_x_8x8);
+    a[0] = l[0] = ((eobs[i] = c) != !type);
+    a[1] = a[0];
+    l[1] = l[0];
+
+    eobtotal += c;
+    qcoeff_ptr += 64;
+  }
+
+  if (bufthred == 16) {
+    type = PLANE_TYPE_UV;
+    tx_type = DCT_DCT;
+    seg_eob = 16;
+
+    // use 4x4 transform for U, V components in I8X8 prediction mode
+    for (i = 16; i < 24; i++) {
+      ENTROPY_CONTEXT *const a = A + vp9_block2above[i];
+      ENTROPY_CONTEXT *const l = L + vp9_block2left[i];
+      const int *scan = vp9_default_zig_zag1d;
+
+      c = decode_coefs(pbi, xd, bc, a, l, type,
+                       tx_type,
+                       seg_eob, qcoeff_ptr,
+                       i, scan, TX_4X4, coef_bands_x);
+      a[0] = l[0] = ((eobs[i] = c) != !type);
+
+      eobtotal += c;
+      qcoeff_ptr += 16;
+    }
+  }
+
+  return eobtotal;
+}
+
+
+int vp9_decode_mb_tokens(VP9D_COMP *dx, MACROBLOCKD *xd,
+                         BOOL_DECODER* const bc) {
+  ENTROPY_CONTEXT *const A = (ENTROPY_CONTEXT *)xd->above_context;
+  ENTROPY_CONTEXT *const L = (ENTROPY_CONTEXT *)xd->left_context;
+
+  char *const eobs = xd->eobs;
+  const int *scan = vp9_default_zig_zag1d;
+  PLANE_TYPE type;
+  int c, i, eobtotal = 0, seg_eob = 16;
+  INT16 *qcoeff_ptr = &xd->qcoeff[0];
+
+  int segment_id = xd->mode_info_context->mbmi.segment_id;
+  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB))
+    seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+
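+  // B_PRED, I8X8_PRED and SPLITMV have no second-order (Y2) block; all
+  // other modes gather the sixteen luma DCs into the 4x4 WHT block 24
+  // and decode the luma blocks as PLANE_TYPE_Y_NO_DC.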
+  if (xd->mode_info_context->mbmi.mode != B_PRED &&
+      xd->mode_info_context->mbmi.mode != I8X8_PRED &&
+      xd->mode_info_context->mbmi.mode != SPLITMV) {
+    ENTROPY_CONTEXT *const a = A + vp9_block2above[24];
+    ENTROPY_CONTEXT *const l = L + vp9_block2left[24];
+    type = PLANE_TYPE_Y2;
+
+    c = decode_coefs(dx, xd, bc, a, l, type,
+                     DCT_DCT,
+                     seg_eob, qcoeff_ptr + 24 * 16, 24,
+                     scan, TX_4X4, coef_bands_x);
+    a[0] = l[0] = ((eobs[24] = c) != !type);
+    eobtotal += c - 16;
+
+    type = PLANE_TYPE_Y_NO_DC;
+  } else {
+    type = PLANE_TYPE_Y_WITH_DC;
+  }
+
+  for (i = 0; i < 24; ++i) {
+    ENTROPY_CONTEXT *const a = A + vp9_block2above[i];
+    ENTROPY_CONTEXT *const l = L + vp9_block2left[i];
+    TX_TYPE tx_type = DCT_DCT;
+    if (i == 16)
+      type = PLANE_TYPE_UV;
+
+    tx_type = get_tx_type(xd, &xd->block[i]);
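+    // Hybrid 1-D ADST transforms use a row or column scan in place of
+    // the default zig-zag.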
+    switch (tx_type) {
+      case ADST_DCT:
+        scan = vp9_row_scan;
+        break;
+      case DCT_ADST:
+        scan = vp9_col_scan;
+        break;
+      default:
+        scan = vp9_default_zig_zag1d;
+        break;
+    }
+
+    c = decode_coefs(dx, xd, bc, a, l, type, tx_type,
+                     seg_eob, qcoeff_ptr,
+                     i, scan, TX_4X4, coef_bands_x);
+    a[0] = l[0] = ((eobs[i] = c) != !type);
+
+    eobtotal += c;
+    qcoeff_ptr += 16;
+  }
+
+  return eobtotal;
+}
--- /dev/null
+++ b/vp9/decoder/detokenize.h
@@ -1,0 +1,25 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef DETOKENIZE_H
+#define DETOKENIZE_H
+
+#include "onyxd_int.h"
+
+void vp9_reset_mb_tokens_context(MACROBLOCKD* const);
+int vp9_decode_mb_tokens(VP9D_COMP* const, MACROBLOCKD* const,
+                         BOOL_DECODER* const);
+int vp9_decode_mb_tokens_8x8(VP9D_COMP* const, MACROBLOCKD* const,
+                             BOOL_DECODER* const);
+int vp9_decode_mb_tokens_16x16(VP9D_COMP* const, MACROBLOCKD* const,
+                               BOOL_DECODER* const);
+
+#endif /* DETOKENIZE_H */
--- /dev/null
+++ b/vp9/decoder/idct_blk.c
@@ -1,0 +1,292 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/config.h"
+#include "vp9/common/idct.h"
+#include "dequantize.h"
+
+void vp9_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,
+                               unsigned char *dest, int pitch, int stride,
+                               int Dc);
+void vp9_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,
+                            unsigned char *dest, int pitch, int stride);
+void vp9_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
+                            unsigned char *dst_ptr, int pitch, int stride);
+#if CONFIG_LOSSLESS
+void vp9_dequant_idct_add_lossless_c(short *input, short *dq,
+                                     unsigned char *pred, unsigned char *dest,
+                                     int pitch, int stride);
+void vp9_dc_only_idct_add_lossless_c(short input_dc, unsigned char *pred_ptr,
+                                     unsigned char *dst_ptr,
+                                     int pitch, int stride);
+#endif
+
+void vp9_dequant_dc_idct_add_y_block_c(short *q, short *dq,
+                                       unsigned char *pre,
+                                       unsigned char *dst,
+                                       int stride, char *eobs,
+                                       short *dc) {
+  int i, j;
+
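+  // The sixteen 4x4 luma blocks sit in raster order in a 16x16
+  // macroblock; `pre` has a fixed pitch of 16. After each row of four
+  // blocks, rewind the 16 pixels advanced and step down four rows.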
+  for (i = 0; i < 4; i++) {
+    for (j = 0; j < 4; j++) {
+      if (*eobs++ > 1)
+        vp9_dequant_dc_idct_add_c(q, dq, pre, dst, 16, stride, dc[0]);
+      else
+        vp9_dc_only_idct_add_c(dc[0], pre, dst, 16, stride);
+
+      q   += 16;
+      pre += 4;
+      dst += 4;
+      dc++;
+    }
+
+    pre += 64 - 16;
+    dst += 4 * stride - 16;
+  }
+}
+
+void vp9_dequant_idct_add_y_block_c(short *q, short *dq,
+                                    unsigned char *pre,
+                                    unsigned char *dst,
+                                    int stride, char *eobs) {
+  int i, j;
+
+  for (i = 0; i < 4; i++) {
+    for (j = 0; j < 4; j++) {
+      if (*eobs++ > 1)
+        vp9_dequant_idct_add_c(q, dq, pre, dst, 16, stride);
+      else {
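+        // An eob of 0 or 1 means at most the DC coefficient is present:
+        // use the cheap DC-only idct, then clear q[0] and q[1] with a
+        // single 32-bit store.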
+        vp9_dc_only_idct_add_c(q[0]*dq[0], pre, dst, 16, stride);
+        ((int *)q)[0] = 0;
+      }
+
+      q   += 16;
+      pre += 4;
+      dst += 4;
+    }
+
+    pre += 64 - 16;
+    dst += 4 * stride - 16;
+  }
+}
+
+void vp9_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *pre,
+                                     unsigned char *dstu, unsigned char *dstv,
+                                     int stride, char *eobs) {
+  int i, j;
+
+  for (i = 0; i < 2; i++) {
+    for (j = 0; j < 2; j++) {
+      if (*eobs++ > 1)
+        vp9_dequant_idct_add_c(q, dq, pre, dstu, 8, stride);
+      else {
+        vp9_dc_only_idct_add_c(q[0]*dq[0], pre, dstu, 8, stride);
+        ((int *)q)[0] = 0;
+      }
+
+      q    += 16;
+      pre  += 4;
+      dstu += 4;
+    }
+
+    pre  += 32 - 8;
+    dstu += 4 * stride - 8;
+  }
+
+  for (i = 0; i < 2; i++) {
+    for (j = 0; j < 2; j++) {
+      if (*eobs++ > 1)
+        vp9_dequant_idct_add_c(q, dq, pre, dstv, 8, stride);
+      else {
+        vp9_dc_only_idct_add_c(q[0]*dq[0], pre, dstv, 8, stride);
+        ((int *)q)[0] = 0;
+      }
+
+      q    += 16;
+      pre  += 4;
+      dstv += 4;
+    }
+
+    pre  += 32 - 8;
+    dstv += 4 * stride - 8;
+  }
+}
+
+
+void vp9_dequant_dc_idct_add_y_block_8x8_c(short *q, short *dq,
+                                           unsigned char *pre,
+                                           unsigned char *dst,
+                                           int stride, char *eobs, short *dc,
+                                           MACROBLOCKD *xd) {
+  vp9_dequant_dc_idct_add_8x8_c(q, dq, pre, dst, 16, stride, dc[0]);
+  vp9_dequant_dc_idct_add_8x8_c(&q[64], dq, pre + 8,
+                                dst + 8, 16, stride, dc[1]);
+  vp9_dequant_dc_idct_add_8x8_c(&q[128], dq, pre + 8 * 16,
+                                dst + 8 * stride, 16, stride, dc[4]);
+  vp9_dequant_dc_idct_add_8x8_c(&q[192], dq, pre + 8 * 16 + 8,
+                                dst + 8 * stride + 8, 16, stride, dc[8]);
+}
+
+#if CONFIG_SUPERBLOCKS
+void vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(short *q, short *dq,
+                                                   unsigned char *dst,
+                                                   int stride, char *eobs,
+                                                   short *dc, MACROBLOCKD *xd) {
+  vp9_dequant_dc_idct_add_8x8_c(q, dq, dst, dst, stride, stride, dc[0]);
+  vp9_dequant_dc_idct_add_8x8_c(&q[64], dq, dst + 8,
+                                dst + 8, stride, stride, dc[1]);
+  vp9_dequant_dc_idct_add_8x8_c(&q[128], dq, dst + 8 * stride,
+                                dst + 8 * stride, stride, stride, dc[4]);
+  vp9_dequant_dc_idct_add_8x8_c(&q[192], dq, dst + 8 * stride + 8,
+                                dst + 8 * stride + 8, stride, stride, dc[8]);
+}
+#endif
+
+void vp9_dequant_idct_add_y_block_8x8_c(short *q, short *dq,
+                                        unsigned char *pre,
+                                        unsigned char *dst,
+                                        int stride, char *eobs,
+                                        MACROBLOCKD *xd) {
+  unsigned char *origdest = dst;
+  unsigned char *origpred = pre;
+
+  vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride);
+  vp9_dequant_idct_add_8x8_c(&q[64], dq, origpred + 8,
+                             origdest + 8, 16, stride);
+  vp9_dequant_idct_add_8x8_c(&q[128], dq, origpred + 8 * 16,
+                             origdest + 8 * stride, 16, stride);
+  vp9_dequant_idct_add_8x8_c(&q[192], dq, origpred + 8 * 16 + 8,
+                             origdest + 8 * stride + 8, 16, stride);
+}
+
+void vp9_dequant_idct_add_uv_block_8x8_c(short *q, short *dq,
+                                         unsigned char *pre,
+                                         unsigned char *dstu,
+                                         unsigned char *dstv,
+                                         int stride, char *eobs,
+                                         MACROBLOCKD *xd) {
+  vp9_dequant_idct_add_8x8_c(q, dq, pre, dstu, 8, stride);
+
+  q    += 64;
+  pre  += 64;
+
+  vp9_dequant_idct_add_8x8_c(q, dq, pre, dstv, 8, stride);
+}
+
+#if CONFIG_SUPERBLOCKS
+void vp9_dequant_idct_add_uv_block_8x8_inplace_c(short *q, short *dq,
+                                                 unsigned char *dstu,
+                                                 unsigned char *dstv,
+                                                 int stride, char *eobs,
+                                                 MACROBLOCKD *xd) {
+  vp9_dequant_idct_add_8x8_c(q, dq, dstu, dstu, stride, stride);
+
+  q    += 64;
+
+  vp9_dequant_idct_add_8x8_c(q, dq, dstv, dstv, stride, stride);
+}
+#endif
+
+#if CONFIG_LOSSLESS
+void vp9_dequant_dc_idct_add_y_block_lossless_c(short *q, short *dq,
+                                                unsigned char *pre,
+                                                unsigned char *dst,
+                                                int stride, char *eobs,
+                                                short *dc) {
+  int i, j;
+
+  for (i = 0; i < 4; i++) {
+    for (j = 0; j < 4; j++) {
+      if (*eobs++ > 1)
+        vp9_dequant_dc_idct_add_lossless_c(q, dq, pre, dst, 16, stride, dc[0]);
+      else
+        vp9_dc_only_inv_walsh_add_c(dc[0], pre, dst, 16, stride);
+
+      q   += 16;
+      pre += 4;
+      dst += 4;
+      dc++;
+    }
+
+    pre += 64 - 16;
+    dst += 4 * stride - 16;
+  }
+}
+
+void vp9_dequant_idct_add_y_block_lossless_c(short *q, short *dq,
+                                             unsigned char *pre,
+                                             unsigned char *dst,
+                                             int stride, char *eobs) {
+  int i, j;
+
+  for (i = 0; i < 4; i++) {
+    for (j = 0; j < 4; j++) {
+      if (*eobs++ > 1)
+        vp9_dequant_idct_add_lossless_c(q, dq, pre, dst, 16, stride);
+      else {
+        vp9_dc_only_inv_walsh_add_c(q[0]*dq[0], pre, dst, 16, stride);
+        ((int *)q)[0] = 0;
+      }
+
+      q   += 16;
+      pre += 4;
+      dst += 4;
+    }
+
+    pre += 64 - 16;
+    dst += 4 * stride - 16;
+  }
+}
+
+void vp9_dequant_idct_add_uv_block_lossless_c(short *q, short *dq,
+                                              unsigned char *pre,
+                                              unsigned char *dstu,
+                                              unsigned char *dstv,
+                                              int stride, char *eobs) {
+  int i, j;
+
+  for (i = 0; i < 2; i++) {
+    for (j = 0; j < 2; j++) {
+      if (*eobs++ > 1)
+        vp9_dequant_idct_add_lossless_c(q, dq, pre, dstu, 8, stride);
+      else {
+        vp9_dc_only_inv_walsh_add_c(q[0]*dq[0], pre, dstu, 8, stride);
+        ((int *)q)[0] = 0;
+      }
+
+      q    += 16;
+      pre  += 4;
+      dstu += 4;
+    }
+
+    pre  += 32 - 8;
+    dstu += 4 * stride - 8;
+  }
+
+  for (i = 0; i < 2; i++) {
+    for (j = 0; j < 2; j++) {
+      if (*eobs++ > 1)
+        vp9_dequant_idct_add_lossless_c(q, dq, pre, dstv, 8, stride);
+      else {
+        vp9_dc_only_inv_walsh_add_c(q[0]*dq[0], pre, dstv, 8, stride);
+        ((int *)q)[0] = 0;
+      }
+
+      q    += 16;
+      pre  += 4;
+      dstv += 4;
+    }
+
+    pre  += 32 - 8;
+    dstv += 4 * stride - 8;
+  }
+}
+#endif
+
--- /dev/null
+++ b/vp9/decoder/onyxd_if.c
@@ -1,0 +1,506 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp9/common/onyxc_int.h"
+#if CONFIG_POSTPROC
+#include "vp9/common/postproc.h"
+#endif
+#include "vp9/common/onyxd.h"
+#include "onyxd_int.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp9/common/alloccommon.h"
+#include "vpx_scale/yv12extend.h"
+#include "vp9/common/loopfilter.h"
+#include "vp9/common/swapyv12buffer.h"
+#include <stdio.h>
+#include <assert.h>
+
+#include "vp9/common/quant_common.h"
+#include "vpx_scale/vpxscale.h"
+#include "vp9/common/systemdependent.h"
+#include "vpx_ports/vpx_timer.h"
+#include "detokenize.h"
+#if ARCH_ARM
+#include "vpx_ports/arm.h"
+#endif
+
+extern void vp9_init_de_quantizer(VP9D_COMP *pbi);
+static int get_free_fb(VP9_COMMON *cm);
+static void ref_cnt_fb(int *buf, int *idx, int new_idx);
+
+#if CONFIG_DEBUG
+static void recon_write_yuv_frame(char *name, YV12_BUFFER_CONFIG *s) {
+  FILE *yuv_file = fopen(name, "ab");
+  unsigned char *src = s->y_buffer;
+  int h = s->y_height;
+
+  do {
+    fwrite(src, s->y_width, 1,  yuv_file);
+    src += s->y_stride;
+  } while (--h);
+
+  src = s->u_buffer;
+  h = s->uv_height;
+
+  do {
+    fwrite(src, s->uv_width, 1,  yuv_file);
+    src += s->uv_stride;
+  } while (--h);
+
+  src = s->v_buffer;
+  h = s->uv_height;
+
+  do {
+    fwrite(src, s->uv_width, 1, yuv_file);
+    src += s->uv_stride;
+  } while (--h);
+
+  fclose(yuv_file);
+}
+#endif
+#define WRITE_RECON_BUFFER 0
+#if WRITE_RECON_BUFFER
+void write_dx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
+  // Dump the decoded frame as raw y/u/v plane files.
+  FILE *yframe;
+  int i;
+  char filename[255];
+
+  sprintf(filename, "dx\\y%04d.raw", this_frame);
+  yframe = fopen(filename, "wb");
+
+  for (i = 0; i < frame->y_height; i++)
+    fwrite(frame->y_buffer + i * frame->y_stride,
+           frame->y_width, 1, yframe);
+
+  fclose(yframe);
+  sprintf(filename, "dx\\u%04d.raw", this_frame);
+  yframe = fopen(filename, "wb");
+
+  for (i = 0; i < frame->uv_height; i++)
+    fwrite(frame->u_buffer + i * frame->uv_stride,
+           frame->uv_width, 1, yframe);
+
+  fclose(yframe);
+  sprintf(filename, "dx\\v%04d.raw", this_frame);
+  yframe = fopen(filename, "wb");
+
+  for (i = 0; i < frame->uv_height; i++)
+    fwrite(frame->v_buffer + i * frame->uv_stride,
+           frame->uv_width, 1, yframe);
+
+  fclose(yframe);
+}
+#endif
+
+void vp9_initialize_dec(void) {
+  static int init_done = 0;
+
+  if (!init_done) {
+    vp9_initialize_common();
+    vp9_init_quant_tables();
+    vp8_scale_machine_specific_config();
+    init_done = 1;
+  }
+}
+
+VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf) {
+  VP9D_COMP *pbi = vpx_memalign(32, sizeof(VP9D_COMP));
+
+  if (!pbi)
+    return NULL;
+
+  vpx_memset(pbi, 0, sizeof(VP9D_COMP));
+
+  if (setjmp(pbi->common.error.jmp)) {
+    pbi->common.error.setjmp = 0;
+    vp9_remove_decompressor(pbi);
+    return 0;
+  }
+
+  pbi->common.error.setjmp = 1;
+  vp9_initialize_dec();
+
+  vp9_create_common(&pbi->common);
+
+  pbi->common.current_video_frame = 0;
+  pbi->ready_for_new_data = 1;
+
+  /* vp9_init_de_quantizer() is first called here. Add a check in
+   * frame_init_dequantizer() to avoid calling vp9_init_de_quantizer()
+   * unnecessarily for every frame.
+   */
+  vp9_init_de_quantizer(pbi);
+
+  vp9_loop_filter_init(&pbi->common);
+
+  pbi->common.error.setjmp = 0;
+
+  pbi->decoded_key_frame = 0;
+
+  return (VP9D_PTR) pbi;
+}
+
+void vp9_remove_decompressor(VP9D_PTR ptr) {
+  VP9D_COMP *pbi = (VP9D_COMP *) ptr;
+
+  if (!pbi)
+    return;
+
+  // Delete segmentation map
+  if (pbi->common.last_frame_seg_map != 0)
+    vpx_free(pbi->common.last_frame_seg_map);
+
+  vp9_remove_common(&pbi->common);
+  vpx_free(pbi->mbc);
+  vpx_free(pbi);
+}
+
+
+vpx_codec_err_t vp9_get_reference_dec(VP9D_PTR ptr, VP9_REFFRAME ref_frame_flag,
+                                      YV12_BUFFER_CONFIG *sd) {
+  VP9D_COMP *pbi = (VP9D_COMP *) ptr;
+  VP9_COMMON *cm = &pbi->common;
+  int ref_fb_idx;
+
+  if (ref_frame_flag == VP9_LAST_FLAG)
+    ref_fb_idx = cm->lst_fb_idx;
+  else if (ref_frame_flag == VP9_GOLD_FLAG)
+    ref_fb_idx = cm->gld_fb_idx;
+  else if (ref_frame_flag == VP9_ALT_FLAG)
+    ref_fb_idx = cm->alt_fb_idx;
+  else {
+    vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
+                       "Invalid reference frame");
+    return pbi->common.error.error_code;
+  }
+
+  if (cm->yv12_fb[ref_fb_idx].y_height != sd->y_height ||
+      cm->yv12_fb[ref_fb_idx].y_width != sd->y_width ||
+      cm->yv12_fb[ref_fb_idx].uv_height != sd->uv_height ||
+      cm->yv12_fb[ref_fb_idx].uv_width != sd->uv_width) {
+    vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
+                       "Incorrect buffer dimensions");
+  } else
+    vp8_yv12_copy_frame_ptr(&cm->yv12_fb[ref_fb_idx], sd);
+
+  return pbi->common.error.error_code;
+}
+
+
+vpx_codec_err_t vp9_set_reference_dec(VP9D_PTR ptr, VP9_REFFRAME ref_frame_flag,
+                                      YV12_BUFFER_CONFIG *sd) {
+  VP9D_COMP *pbi = (VP9D_COMP *) ptr;
+  VP9_COMMON *cm = &pbi->common;
+  int *ref_fb_ptr = NULL;
+  int free_fb;
+
+  if (ref_frame_flag == VP9_LAST_FLAG)
+    ref_fb_ptr = &cm->lst_fb_idx;
+  else if (ref_frame_flag == VP9_GOLD_FLAG)
+    ref_fb_ptr = &cm->gld_fb_idx;
+  else if (ref_frame_flag == VP9_ALT_FLAG)
+    ref_fb_ptr = &cm->alt_fb_idx;
+  else {
+    vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
+                       "Invalid reference frame");
+    return pbi->common.error.error_code;
+  }
+
+  if (cm->yv12_fb[*ref_fb_ptr].y_height != sd->y_height ||
+      cm->yv12_fb[*ref_fb_ptr].y_width != sd->y_width ||
+      cm->yv12_fb[*ref_fb_ptr].uv_height != sd->uv_height ||
+      cm->yv12_fb[*ref_fb_ptr].uv_width != sd->uv_width) {
+    vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
+                       "Incorrect buffer dimensions");
+  } else {
+    /* Find an empty frame buffer. */
+    free_fb = get_free_fb(cm);
+    /* Decrease fb_idx_ref_cnt since it will be increased again in
+     * ref_cnt_fb() below. */
+    cm->fb_idx_ref_cnt[free_fb]--;
+
+    /* Manage the reference counters and copy image. */
+    ref_cnt_fb(cm->fb_idx_ref_cnt, ref_fb_ptr, free_fb);
+    vp8_yv12_copy_frame_ptr(sd, &cm->yv12_fb[*ref_fb_ptr]);
+  }
+
+  return pbi->common.error.error_code;
+}
+
+/* For ARM NEON, d8-d15 are callee-saved registers and must be saved/restored by us. */
+#if HAVE_ARMV7
+extern void vp9_push_neon(int64_t *store);
+extern void vp9_pop_neon(int64_t *store);
+#endif
+
+static int get_free_fb(VP9_COMMON *cm) {
+  int i;
+  for (i = 0; i < NUM_YV12_BUFFERS; i++)
+    if (cm->fb_idx_ref_cnt[i] == 0)
+      break;
+
+  assert(i < NUM_YV12_BUFFERS);
+  cm->fb_idx_ref_cnt[i] = 1;
+  return i;
+}
+
+static void ref_cnt_fb(int *buf, int *idx, int new_idx) {
+  if (buf[*idx] > 0)
+    buf[*idx]--;
+
+  *idx = new_idx;
+
+  buf[new_idx]++;
+}
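+
+/* An illustrative sketch (assuming an initialized VP9_COMMON *cm) of how
+ * the two helpers above cooperate when a reference index is retargeted
+ * to a fresh buffer, as vp9_set_reference_dec() does:
+ *
+ *   int free_fb = get_free_fb(cm);         // ref count becomes 1
+ *   cm->fb_idx_ref_cnt[free_fb]--;         // undo; ref_cnt_fb() re-adds it
+ *   ref_cnt_fb(cm->fb_idx_ref_cnt,         // release the old golden
+ *              &cm->gld_fb_idx, free_fb);  // buffer, pin the new one
+ */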
+
+/* If any buffer copy / swapping is signalled it should be done here. */
+static int swap_frame_buffers(VP9_COMMON *cm) {
+  int err = 0;
+
+  /* The alternate reference frame or golden frame can be updated
+   *  using the new, last, or golden/alt ref frame.  If it
+   *  is updated using the newly decoded frame it is a refresh.
+   *  An update using the last or golden/alt ref frame is a copy.
+   */
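+  /* copy_buffer_to_arf: 0 = no copy, 1 = copy from last, 2 = copy from
+   * golden. copy_buffer_to_gf: 0 = no copy, 1 = copy from last, 2 = copy
+   * from alt-ref. */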
+  if (cm->copy_buffer_to_arf) {
+    int new_fb = 0;
+
+    if (cm->copy_buffer_to_arf == 1)
+      new_fb = cm->lst_fb_idx;
+    else if (cm->copy_buffer_to_arf == 2)
+      new_fb = cm->gld_fb_idx;
+    else
+      err = -1;
+
+    ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->alt_fb_idx, new_fb);
+  }
+
+  if (cm->copy_buffer_to_gf) {
+    int new_fb = 0;
+
+    if (cm->copy_buffer_to_gf == 1)
+      new_fb = cm->lst_fb_idx;
+    else if (cm->copy_buffer_to_gf == 2)
+      new_fb = cm->alt_fb_idx;
+    else
+      err = -1;
+
+    ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->gld_fb_idx, new_fb);
+  }
+
+  if (cm->refresh_golden_frame)
+    ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->gld_fb_idx, cm->new_fb_idx);
+
+  if (cm->refresh_alt_ref_frame)
+    ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->alt_fb_idx, cm->new_fb_idx);
+
+  if (cm->refresh_last_frame) {
+    ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->lst_fb_idx, cm->new_fb_idx);
+
+    cm->frame_to_show = &cm->yv12_fb[cm->lst_fb_idx];
+  } else
+    cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];
+
+  cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
+
+  return err;
+}
+
+int vp9_receive_compressed_data(VP9D_PTR ptr, unsigned long size,
+                                const unsigned char *source,
+                                int64_t time_stamp) {
+#if HAVE_ARMV7
+  int64_t dx_store_reg[8];
+#endif
+  VP9D_COMP *pbi = (VP9D_COMP *) ptr;
+  VP9_COMMON *cm = &pbi->common;
+  int retcode = 0;
+
+  /*if(pbi->ready_for_new_data == 0)
+      return -1;*/
+
+  if (ptr == 0) {
+    return -1;
+  }
+
+  pbi->common.error.error_code = VPX_CODEC_OK;
+
+  pbi->Source = source;
+  pbi->source_sz = size;
+
+  if (pbi->source_sz == 0) {
+    /* This is used to signal that we are missing frames.
+     * We do not know if the missing frame(s) were supposed to update
+     * any of the reference buffers, but we act conservatively and
+     * mark only the last buffer as corrupted.
+     */
+    cm->yv12_fb[cm->lst_fb_idx].corrupted = 1;
+  }
+
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+  if (cm->rtcd.flags & HAS_NEON)
+#endif
+  {
+    vp9_push_neon(dx_store_reg);
+  }
+#endif
+
+  cm->new_fb_idx = get_free_fb(cm);
+
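+  /* vpx_internal_error() longjmp()s back here on any decode error while
+   * error.setjmp is set; unwind by releasing the new frame buffer. */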
+  if (setjmp(pbi->common.error.jmp)) {
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+    if (cm->rtcd.flags & HAS_NEON)
+#endif
+    {
+      vp9_pop_neon(dx_store_reg);
+    }
+#endif
+    pbi->common.error.setjmp = 0;
+
+    /* We do not know if the missing frame(s) were supposed to update
+     * any of the reference buffers, but we act conservatively and
+     * mark only the last buffer as corrupted.
+     */
+    cm->yv12_fb[cm->lst_fb_idx].corrupted = 1;
+
+    if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0)
+      cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
+    return -1;
+  }
+
+  pbi->common.error.setjmp = 1;
+
+  retcode = vp9_decode_frame(pbi);
+
+  if (retcode < 0) {
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+    if (cm->rtcd.flags & HAS_NEON)
+#endif
+    {
+      vp9_pop_neon(dx_store_reg);
+    }
+#endif
+    pbi->common.error.error_code = VPX_CODEC_ERROR;
+    pbi->common.error.setjmp = 0;
+    if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0)
+      cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
+    return retcode;
+  }
+
+  {
+    if (swap_frame_buffers(cm)) {
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+      if (cm->rtcd.flags & HAS_NEON)
+#endif
+      {
+        vp9_pop_neon(dx_store_reg);
+      }
+#endif
+      pbi->common.error.error_code = VPX_CODEC_ERROR;
+      pbi->common.error.setjmp = 0;
+      return -1;
+    }
+
+#if WRITE_RECON_BUFFER
+    if (cm->show_frame)
+      write_dx_frame_to_file(cm->frame_to_show,
+                             cm->current_video_frame);
+    else
+      write_dx_frame_to_file(cm->frame_to_show,
+                             cm->current_video_frame + 1000);
+#endif
+
+    if (cm->filter_level) {
+      /* Apply the loop filter if appropriate. */
+      vp9_loop_filter_frame(cm, &pbi->mb);
+    }
+    vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);
+  }
+
+#if CONFIG_DEBUG
+  if (cm->show_frame)
+    recon_write_yuv_frame("recon.yuv", cm->frame_to_show);
+#endif
+
+  vp9_clear_system_state();
+
+  if (cm->show_frame) {
+    vpx_memcpy(cm->prev_mip, cm->mip,
+               (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
+  } else {
+    vpx_memset(cm->prev_mip, 0,
+               (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
+  }
+
+  /*vp9_print_modes_and_motion_vectors(cm->mi, cm->mb_rows,cm->mb_cols,
+                                       cm->current_video_frame);*/
+
+  if (cm->show_frame)
+    cm->current_video_frame++;
+
+  pbi->ready_for_new_data = 0;
+  pbi->last_time_stamp = time_stamp;
+  pbi->source_sz = 0;
+
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+  if (cm->rtcd.flags & HAS_NEON)
+#endif
+  {
+    vp9_pop_neon(dx_store_reg);
+  }
+#endif
+  pbi->common.error.setjmp = 0;
+  return retcode;
+}
+
+int vp9_get_raw_frame(VP9D_PTR ptr, YV12_BUFFER_CONFIG *sd,
+                      int64_t *time_stamp, int64_t *time_end_stamp,
+                      vp9_ppflags_t *flags) {
+  int ret = -1;
+  VP9D_COMP *pbi = (VP9D_COMP *) ptr;
+
+  if (pbi->ready_for_new_data == 1)
+    return ret;
+
+  /* i.e. no raw frame to show */
+  if (pbi->common.show_frame == 0)
+    return ret;
+
+  pbi->ready_for_new_data = 1;
+  *time_stamp = pbi->last_time_stamp;
+  *time_end_stamp = 0;
+
+  sd->clrtype = pbi->common.clr_type;
+#if CONFIG_POSTPROC
+  ret = vp9_post_proc_frame(&pbi->common, sd, flags);
+#else
+
+  if (pbi->common.frame_to_show) {
+    *sd = *pbi->common.frame_to_show;
+    sd->y_width = pbi->common.Width;
+    sd->y_height = pbi->common.Height;
+    sd->uv_height = pbi->common.Height / 2;
+    ret = 0;
+  } else {
+    ret = -1;
+  }
+
+#endif /*!CONFIG_POSTPROC*/
+  vp9_clear_system_state();
+  return ret;
+}
--- /dev/null
+++ b/vp9/decoder/onyxd_int.h
@@ -1,0 +1,106 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ONYXD_INT_H
+#define __INC_ONYXD_INT_H
+#include "vpx_ports/config.h"
+#include "vp9/common/onyxd.h"
+#include "treereader.h"
+#include "vp9/common/onyxc_int.h"
+#include "dequantize.h"
+
+// #define DEC_DEBUG
+
+typedef struct {
+  int ithread;
+  void *ptr1;
+  void *ptr2;
+} DECODETHREAD_DATA;
+
+typedef struct {
+  MACROBLOCKD  mbd;
+  int mb_row;
+  int current_mb_col;
+  short *coef_ptr;
+} MB_ROW_DEC;
+
+typedef struct {
+  int const *scan;
+  int const *scan_8x8;
+  UINT8 const *ptr_block2leftabove;
+  vp9_tree_index const *vp9_coef_tree_ptr;
+  unsigned char *norm_ptr;
+  UINT8 *ptr_coef_bands_x;
+  UINT8 *ptr_coef_bands_x_8x8;
+
+  ENTROPY_CONTEXT_PLANES *A;
+  ENTROPY_CONTEXT_PLANES *L;
+
+  INT16 *qcoeff_start_ptr;
+
+  vp9_prob const *coef_probs[BLOCK_TYPES];
+  vp9_prob const *coef_probs_8x8[BLOCK_TYPES_8X8];
+  vp9_prob const *coef_probs_16X16[BLOCK_TYPES_16X16];
+
+  UINT8 eob[25];
+
+} DETOK;
+
+typedef struct VP9Decompressor {
+  DECLARE_ALIGNED(16, MACROBLOCKD, mb);
+
+  DECLARE_ALIGNED(16, VP9_COMMON, common);
+
+  VP9D_CONFIG oxcf;
+
+
+  const unsigned char *Source;
+  unsigned int   source_sz;
+
+  vp9_reader *mbc;
+  int64_t last_time_stamp;
+  int   ready_for_new_data;
+
+  DETOK detoken;
+
+  vp9_dequant_idct_add_fn_t            idct_add;
+  vp9_dequant_dc_idct_add_fn_t         dc_idct_add;
+  vp9_dequant_dc_idct_add_y_block_fn_t dc_idct_add_y_block;
+  vp9_dequant_idct_add_y_block_fn_t    idct_add_y_block;
+  vp9_dequant_idct_add_uv_block_fn_t   idct_add_uv_block;
+
+  vp9_prob prob_skip_false;
+
+  int decoded_key_frame;
+
+} VP9D_COMP;
+
+int vp9_decode_frame(VP9D_COMP *cpi);
+
+
+#if CONFIG_DEBUG
+#define CHECK_MEM_ERROR(lval,expr) do {\
+    lval = (expr); \
+    if(!lval) \
+      vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,\
+                         "Failed to allocate "#lval" at %s:%d", \
+                         __FILE__,__LINE__);\
+  } while(0)
+#else
+#define CHECK_MEM_ERROR(lval,expr) do {\
+    lval = (expr); \
+    if(!lval) \
+      vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,\
+                         "Failed to allocate "#lval);\
+  } while(0)
+#endif
+
+#endif  // __INC_ONYXD_INT_H
--- /dev/null
+++ b/vp9/decoder/reconintra_mt.h
@@ -1,0 +1,15 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_RECONINTRA_MT_H
+#define __INC_RECONINTRA_MT_H
+
+#endif
--- /dev/null
+++ b/vp9/decoder/treereader.h
@@ -1,0 +1,37 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef tree_reader_h
+#define tree_reader_h 1
+
+#include "vp9/common/treecoder.h"
+
+#include "dboolhuff.h"
+
+typedef BOOL_DECODER vp9_reader;
+
+#define vp9_read decode_bool
+#define vp9_read_literal decode_value
+#define vp9_read_bit(R) vp9_read(R, vp9_prob_half)
+
+/* The intent of the tree data structure is to make decoding trivial. */
+
+static int treed_read(vp9_reader *const r, /* !!! must return a 0 or 1 !!! */
+                      vp9_tree t,
+                      const vp9_prob *const p) {
+  register vp9_tree_index i = 0;
+
+  while ((i = t[ i + vp9_read(r, p[i >> 1])]) > 0);
+
+  return -i;
+}
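+
+/* Usage sketch with a hypothetical two-leaf tree (token names are
+ * illustrative only): non-positive entries are negated token values,
+ * positive entries index the next node pair, and p[i >> 1] is the
+ * probability of taking the left branch at node i.
+ *
+ *   static const vp9_tree_index two_leaf[2] = { -TOKEN_A, -TOKEN_B };
+ *   const vp9_prob p[1] = { 128 };         // p(left branch) = 0.5
+ *   int tok = treed_read(r, two_leaf, p);  // one bool read
+ */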
+
+#endif /* tree_reader_h */
--- /dev/null
+++ b/vp9/decoder/x86/dequantize_mmx.asm
@@ -1,0 +1,406 @@
+;
+;  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+align 16
+x_s1sqr2:      times 4 dw 0x8A8C
+align 16
+x_c1sqr2less1: times 4 dw 0x4E7B
+align 16
+pw_16:         times 4 dw 16
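+; x_s1sqr2      = round(sin(pi/8) * sqrt(2) * 65536)       = 0x8A8C (35468)
+; x_c1sqr2less1 = round((cos(pi/8) * sqrt(2) - 1) * 65536) = 0x4E7B (20091)
+; pw_16 supplies the rounding bias added before the final >> 5.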
+
+SECTION .text
+
+INIT_MMX
+
+
+;void dequantize_b_impl_mmx(short *sq, short *dq, short *q)
+cglobal dequantize_b_impl_mmx, 3,3,0,sq,dq,arg3
+    mova       m1, [sqq]
+    pmullw     m1, [arg3q+0]            ; m1 *= dequant factors (words 0..3)
+    mova [dqq+ 0], m1
+
+    mova       m1, [sqq+8]
+    pmullw     m1, [arg3q+8]            ; m1 *= dequant factors (words 4..7)
+    mova [dqq+ 8], m1
+
+    mova       m1, [sqq+16]
+    pmullw     m1, [arg3q+16]            ; m1 *= dequant factors (words 8..11)
+    mova [dqq+16], m1
+
+    mova       m1, [sqq+24]
+    pmullw     m1, [arg3q+24]            ; m1 *= dequant factors (words 12..15)
+    mova [dqq+24], m1
+    RET
+
+
+;void dequant_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride)
+cglobal dequant_idct_add_mmx, 4,6,0,inp,dq,pred,dest,pit,stride
+
+%if ARCH_X86_64
+    movsxd              strideq,  dword stridem
+    movsxd              pitq,     dword pitm
+%else
+    mov                 strideq,  stridem
+    mov                 pitq,     pitm
+%endif
+
+    mova                m0,       [inpq+ 0]
+    pmullw              m0,       [dqq]
+
+    mova                m1,       [inpq+ 8]
+    pmullw              m1,       [dqq+ 8]
+
+    mova                m2,       [inpq+16]
+    pmullw              m2,       [dqq+16]
+
+    mova                m3,       [inpq+24]
+    pmullw              m3,       [dqq+24]
+
+    pxor                m7,        m7
+    mova            [inpq],        m7
+    mova          [inpq+8],        m7
+    mova         [inpq+16],        m7
+    mova         [inpq+24],        m7
+
+
+    psubw               m0,        m2             ; b1= 0-2
+    paddw               m2,        m2             ;
+
+    mova                m5,        m1
+    paddw               m2,        m0             ; a1 =0+2
+
+    pmulhw              m5,       [x_s1sqr2];
+    paddw               m5,        m1             ; ip1 * sin(pi/8) * sqrt(2)
+
+    mova                m7,        m3             ;
+    pmulhw              m7,       [x_c1sqr2less1];
+
+    paddw               m7,        m3             ; ip3 * cos(pi/8) * sqrt(2)
+    psubw               m7,        m5             ; c1
+
+    mova                m5,        m1
+    mova                m4,        m3
+
+    pmulhw              m5,       [x_c1sqr2less1]
+    paddw               m5,        m1
+
+    pmulhw              m3,       [x_s1sqr2]
+    paddw               m3,        m4
+
+    paddw               m3,        m5             ; d1
+    mova                m6,        m2             ; a1
+
+    mova                m4,        m0             ; b1
+    paddw               m2,        m3             ;0
+
+    paddw               m4,        m7             ;1
+    psubw               m0,        m7             ;2
+
+    psubw               m6,        m3             ;3
+
+    mova                m1,        m2             ; 03 02 01 00
+    mova                m3,        m4             ; 23 22 21 20
+
+    punpcklwd           m1,        m0             ; 11 01 10 00
+    punpckhwd           m2,        m0             ; 13 03 12 02
+
+    punpcklwd           m3,        m6             ; 31 21 30 20
+    punpckhwd           m4,        m6             ; 33 23 32 22
+
+    mova                m0,        m1             ; 11 01 10 00
+    mova                m5,        m2             ; 13 03 12 02
+
+    punpckldq           m0,        m3             ; 30 20 10 00
+    punpckhdq           m1,        m3             ; 31 21 11 01
+
+    punpckldq           m2,        m4             ; 32 22 12 02
+    punpckhdq           m5,        m4             ; 33 23 13 03
+
+    mova                m3,        m5             ; 33 23 13 03
+
+    psubw               m0,        m2             ; b1= 0-2
+    paddw               m2,        m2             ;
+
+    mova                m5,        m1
+    paddw               m2,        m0             ; a1 =0+2
+
+    pmulhw              m5,       [x_s1sqr2];
+    paddw               m5,        m1             ; ip1 * sin(pi/8) * sqrt(2)
+
+    mova                m7,        m3             ;
+    pmulhw              m7,       [x_c1sqr2less1];
+
+    paddw               m7,        m3             ; ip3 * cos(pi/8) * sqrt(2)
+    psubw               m7,        m5             ; c1
+
+    mova                m5,        m1
+    mova                m4,        m3
+
+    pmulhw              m5,       [x_c1sqr2less1]
+    paddw               m5,        m1
+
+    pmulhw              m3,       [x_s1sqr2]
+    paddw               m3,        m4
+
+    paddw               m3,        m5             ; d1
+    paddw               m0,       [pw_16]
+
+    paddw               m2,       [pw_16]
+    mova                m6,        m2             ; a1
+
+    mova                m4,        m0             ; b1
+    paddw               m2,        m3             ;0
+
+    paddw               m4,        m7             ;1
+    psubw               m0,        m7             ;2
+
+    psubw               m6,        m3             ;3
+    psraw               m2,        5
+
+    psraw               m0,        5
+    psraw               m4,        5
+
+    psraw               m6,        5
+
+    mova                m1,        m2             ; 03 02 01 00
+    mova                m3,        m4             ; 23 22 21 20
+
+    punpcklwd           m1,        m0             ; 11 01 10 00
+    punpckhwd           m2,        m0             ; 13 03 12 02
+
+    punpcklwd           m3,        m6             ; 31 21 30 20
+    punpckhwd           m4,        m6             ; 33 23 32 22
+
+    mova                m0,        m1             ; 11 01 10 00
+    mova                m5,        m2             ; 13 03 12 02
+
+    punpckldq           m0,        m3             ; 30 20 10 00
+    punpckhdq           m1,        m3             ; 31 21 11 01
+
+    punpckldq           m2,        m4             ; 32 22 12 02
+    punpckhdq           m5,        m4             ; 33 23 13 03
+
+    pxor                m7,        m7
+
+    movh                m4,       [predq]
+    punpcklbw           m4,        m7
+    paddsw              m0,        m4
+    packuswb            m0,        m7
+    movh           [destq],      m0
+
+    movh                m4,       [predq+pitq]
+    punpcklbw           m4,        m7
+    paddsw              m1,        m4
+    packuswb            m1,        m7
+    movh   [destq+strideq],        m1
+
+    movh                m4,       [predq+2*pitq]
+    punpcklbw           m4,        m7
+    paddsw              m2,        m4
+    packuswb            m2,        m7
+    movh [destq+strideq*2],        m2
+
+    add              destq,        strideq
+    add              predq,        pitq
+
+    movh                m4,       [predq+2*pitq]
+    punpcklbw           m4,        m7
+    paddsw              m5,        m4
+    packuswb            m5,        m7
+    movh [destq+strideq*2],        m5
+    RET
+
+
+;void dequant_dc_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc)
+cglobal dequant_dc_idct_add_mmx, 4,7,0,inp,dq,pred,dest,pit,stride,Dc
+
+%if ARCH_X86_64
+    movsxd              strideq,   dword stridem
+    movsxd              pitq,      dword pitm
+%else
+    mov                 strideq,   stridem
+    mov                 pitq,      pitm
+%endif
+
+    mov                 Dcq, Dcm
+    mova                m0,       [inpq+ 0]
+    pmullw              m0,       [dqq+ 0]
+
+    mova                m1,       [inpq+ 8]
+    pmullw              m1,       [dqq+ 8]
+
+    mova                m2,       [inpq+16]
+    pmullw              m2,       [dqq+16]
+
+    mova                m3,       [inpq+24]
+    pmullw              m3,       [dqq+24]
+
+    pxor                m7,        m7
+    mova         [inpq+ 0],        m7
+    mova         [inpq+ 8],        m7
+    mova         [inpq+16],        m7
+    mova         [inpq+24],        m7
+
+    ; move lower word of Dc to lower word of m0
+    psrlq               m0,        16
+    psllq               m0,        16
+    and                Dcq,        0xFFFF         ; If Dc < 0, we don't want the full dword precision.
+    movh                m7,        Dcq
+    por                 m0,        m7
+    psubw               m0,        m2             ; b1= 0-2
+    paddw               m2,        m2             ;
+
+    mova                m5,        m1
+    paddw               m2,        m0             ; a1 =0+2
+
+    pmulhw              m5,       [x_s1sqr2];
+    paddw               m5,        m1             ; ip1 * sin(pi/8) * sqrt(2)
+
+    mova                m7,        m3             ;
+    pmulhw              m7,       [x_c1sqr2less1];
+
+    paddw               m7,        m3             ; ip3 * cos(pi/8) * sqrt(2)
+    psubw               m7,        m5             ; c1
+
+    mova                m5,        m1
+    mova                m4,        m3
+
+    pmulhw              m5,       [x_c1sqr2less1]
+    paddw               m5,        m1
+
+    pmulhw              m3,       [x_s1sqr2]
+    paddw               m3,        m4
+
+    paddw               m3,        m5             ; d1
+    mova                m6,        m2             ; a1
+
+    mova                m4,        m0             ; b1
+    paddw               m2,        m3             ;0
+
+    paddw               m4,        m7             ;1
+    psubw               m0,        m7             ;2
+
+    psubw               m6,        m3             ;3
+
+    mova                m1,        m2             ; 03 02 01 00
+    mova                m3,        m4             ; 23 22 21 20
+
+    punpcklwd           m1,        m0             ; 11 01 10 00
+    punpckhwd           m2,        m0             ; 13 03 12 02
+
+    punpcklwd           m3,        m6             ; 31 21 30 20
+    punpckhwd           m4,        m6             ; 33 23 32 22
+
+    mova                m0,        m1             ; 11 01 10 00
+    mova                m5,        m2             ; 13 03 12 02
+
+    punpckldq           m0,        m3             ; 30 20 10 00
+    punpckhdq           m1,        m3             ; 31 21 11 01
+
+    punpckldq           m2,        m4             ; 32 22 12 02
+    punpckhdq           m5,        m4             ; 33 23 13 03
+
+    mova                m3,        m5             ; 33 23 13 03
+
+    psubw               m0,        m2             ; b1= 0-2
+    paddw               m2,        m2             ;
+
+    mova                m5,        m1
+    paddw               m2,        m0             ; a1 =0+2
+
+    pmulhw              m5,       [x_s1sqr2];
+    paddw               m5,        m1             ; ip1 * sin(pi/8) * sqrt(2)
+
+    mova                m7,        m3             ;
+    pmulhw              m7,       [x_c1sqr2less1];
+
+    paddw               m7,        m3             ; ip3 * cos(pi/8) * sqrt(2)
+    psubw               m7,        m5             ; c1
+
+    mova                m5,        m1
+    mova                m4,        m3
+
+    pmulhw              m5,       [x_c1sqr2less1]
+    paddw               m5,        m1
+
+    pmulhw              m3,       [x_s1sqr2]
+    paddw               m3,        m4
+
+    paddw               m3,        m5             ; d1
+    paddw               m0,       [pw_16]
+
+    paddw               m2,       [pw_16]
+    mova                m6,        m2             ; a1
+
+    mova                m4,        m0             ; b1
+    paddw               m2,        m3             ;0
+
+    paddw               m4,        m7             ;1
+    psubw               m0,        m7             ;2
+
+    psubw               m6,        m3             ;3
+    psraw               m2,        5
+
+    psraw               m0,        5
+    psraw               m4,        5
+
+    psraw               m6,        5
+
+    mova                m1,        m2             ; 03 02 01 00
+    mova                m3,        m4             ; 23 22 21 20
+
+    punpcklwd           m1,        m0             ; 11 01 10 00
+    punpckhwd           m2,        m0             ; 13 03 12 02
+
+    punpcklwd           m3,        m6             ; 31 21 30 20
+    punpckhwd           m4,        m6             ; 33 23 32 22
+
+    mova                m0,        m1             ; 11 01 10 00
+    mova                m5,        m2             ; 13 03 12 02
+
+    punpckldq           m0,        m3             ; 30 20 10 00
+    punpckhdq           m1,        m3             ; 31 21 11 01
+
+    punpckldq           m2,        m4             ; 32 22 12 02
+    punpckhdq           m5,        m4             ; 33 23 13 03
+
+    pxor                m7,        m7
+
+    movh                m4,       [predq]
+    punpcklbw           m4,        m7
+    paddsw              m0,        m4
+    packuswb            m0,        m7
+    movh           [destq],        m0
+
+    movh                m4,       [predq+pitq]
+    punpcklbw           m4,        m7
+    paddsw              m1,        m4
+    packuswb            m1,        m7
+    movh   [destq+strideq],        m1
+
+    movh                m4,       [predq+2*pitq]
+    punpcklbw           m4,        m7
+    paddsw              m2,        m4
+    packuswb            m2,        m7
+    movh [destq+strideq*2],        m2
+
+    add              destq,        strideq
+    add              predq,        pitq
+
+    movh                m4,       [predq+2*pitq]
+    punpcklbw           m4,        m7
+    paddsw              m5,        m4
+    packuswb            m5,        m7
+    movh [destq+strideq*2],        m5
+    RET
+
--- /dev/null
+++ b/vp9/decoder/x86/idct_blk_mmx.c
@@ -1,0 +1,143 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/config.h"
+#include "vp9/common/idct.h"
+#include "vp9/decoder/dequantize.h"
+
+void vp9_dequant_dc_idct_add_y_block_mmx(short *q, short *dq,
+                                         unsigned char *pre,
+                                         unsigned char *dst,
+                                         int stride, char *eobs, short *dc) {
+  int i;
+
+  for (i = 0; i < 4; i++) {
+    if (eobs[0] > 1)
+      vp9_dequant_dc_idct_add_mmx(q, dq, pre, dst, 16, stride, dc[0]);
+    else
+      vp9_dc_only_idct_add_mmx(dc[0], pre, dst, 16, stride);
+
+    if (eobs[1] > 1)
+      vp9_dequant_dc_idct_add_mmx(q + 16, dq, pre + 4,
+                                  dst + 4, 16, stride, dc[1]);
+    else
+      vp9_dc_only_idct_add_mmx(dc[1], pre + 4, dst + 4, 16, stride);
+
+    if (eobs[2] > 1)
+      vp9_dequant_dc_idct_add_mmx(q + 32, dq, pre + 8,
+                                  dst + 8, 16, stride, dc[2]);
+    else
+      vp9_dc_only_idct_add_mmx(dc[2], pre + 8, dst + 8, 16, stride);
+
+    if (eobs[3] > 1)
+      vp9_dequant_dc_idct_add_mmx(q + 48, dq, pre + 12,
+                                  dst + 12, 16, stride, dc[3]);
+    else
+      vp9_dc_only_idct_add_mmx(dc[3], pre + 12, dst + 12, 16, stride);
+
+    q    += 64;
+    dc   += 4;
+    pre  += 64;
+    dst  += 4 * stride;
+    eobs += 4;
+  }
+}
+
+void vp9_dequant_idct_add_y_block_mmx(short *q, short *dq,
+                                      unsigned char *pre,
+                                      unsigned char *dst,
+                                      int stride, char *eobs) {
+  int i;
+
+  for (i = 0; i < 4; i++) {
+    if (eobs[0] > 1)
+      vp9_dequant_idct_add_mmx(q, dq, pre, dst, 16, stride);
+    else {
+      vp9_dc_only_idct_add_mmx(q[0]*dq[0], pre, dst, 16, stride);
+      ((int *)q)[0] = 0;
+    }
+
+    if (eobs[1] > 1)
+      vp9_dequant_idct_add_mmx(q + 16, dq, pre + 4, dst + 4, 16, stride);
+    else {
+      vp9_dc_only_idct_add_mmx(q[16]*dq[0], pre + 4, dst + 4, 16, stride);
+      ((int *)(q + 16))[0] = 0;
+    }
+
+    if (eobs[2] > 1)
+      vp9_dequant_idct_add_mmx(q + 32, dq, pre + 8, dst + 8, 16, stride);
+    else {
+      vp9_dc_only_idct_add_mmx(q[32]*dq[0], pre + 8, dst + 8, 16, stride);
+      ((int *)(q + 32))[0] = 0;
+    }
+
+    if (eobs[3] > 1)
+      vp9_dequant_idct_add_mmx(q + 48, dq, pre + 12, dst + 12, 16, stride);
+    else {
+      vp9_dc_only_idct_add_mmx(q[48]*dq[0], pre + 12, dst + 12, 16, stride);
+      ((int *)(q + 48))[0] = 0;
+    }
+
+    q    += 64;
+    pre  += 64;
+    dst  += 4 * stride;
+    eobs += 4;
+  }
+}
+
+void vp9_dequant_idct_add_uv_block_mmx(short *q, short *dq,
+                                       unsigned char *pre,
+                                       unsigned char *dstu,
+                                       unsigned char *dstv,
+                                       int stride, char *eobs) {
+  int i;
+
+  for (i = 0; i < 2; i++) {
+    if (eobs[0] > 1)
+      vp9_dequant_idct_add_mmx(q, dq, pre, dstu, 8, stride);
+    else {
+      vp9_dc_only_idct_add_mmx(q[0]*dq[0], pre, dstu, 8, stride);
+      ((int *)q)[0] = 0;
+    }
+
+    if (eobs[1] > 1)
+      vp9_dequant_idct_add_mmx(q + 16, dq, pre + 4, dstu + 4, 8, stride);
+    else {
+      vp9_dc_only_idct_add_mmx(q[16]*dq[0], pre + 4, dstu + 4, 8, stride);
+      ((int *)(q + 16))[0] = 0;
+    }
+
+    q    += 32;
+    pre  += 32;
+    dstu += 4 * stride;
+    eobs += 2;
+  }
+
+  for (i = 0; i < 2; i++) {
+    if (eobs[0] > 1)
+      vp9_dequant_idct_add_mmx(q, dq, pre, dstv, 8, stride);
+    else {
+      vp9_dc_only_idct_add_mmx(q[0]*dq[0], pre, dstv, 8, stride);
+      ((int *)q)[0] = 0;
+    }
+
+    if (eobs[1] > 1)
+      vp9_dequant_idct_add_mmx(q + 16, dq, pre + 4, dstv + 4, 8, stride);
+    else {
+      vp9_dc_only_idct_add_mmx(q[16]*dq[0], pre + 4, dstv + 4, 8, stride);
+      ((int *)(q + 16))[0] = 0;
+    }
+
+    q    += 32;
+    pre  += 32;
+    dstv += 4 * stride;
+    eobs += 2;
+  }
+}
--- /dev/null
+++ b/vp9/decoder/x86/idct_blk_sse2.c
@@ -1,0 +1,116 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/config.h"
+#include "vp9/common/idct.h"
+#include "vp9/decoder/dequantize.h"
+
+void vp9_idct_dequant_dc_0_2x_sse2(short *q, short *dq,
+                                   unsigned char *pre, unsigned char *dst,
+                                   int dst_stride, short *dc);
+
+void vp9_idct_dequant_dc_full_2x_sse2(short *q, short *dq,
+                                      unsigned char *pre, unsigned char *dst,
+                                      int dst_stride, short *dc);
+
+void vp9_idct_dequant_0_2x_sse2(short *q, short *dq,
+                                unsigned char *pre, unsigned char *dst,
+                                int dst_stride, int blk_stride);
+
+void vp9_idct_dequant_full_2x_sse2(short *q, short *dq,
+                                   unsigned char *pre, unsigned char *dst,
+                                   int dst_stride, int blk_stride);
+
+void vp9_dequant_dc_idct_add_y_block_sse2(short *q, short *dq,
+                                          unsigned char *pre,
+                                          unsigned char *dst,
+                                          int stride, char *eobs, short *dc) {
+  int i;
+
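+  // Each eob entry is a char; reading two neighbours as a short and
+  // masking with 0xfefe tests whether either of the paired 4x4 blocks
+  // has an eob greater than 1 (i.e. any non-DC coefficient), selecting
+  // the full idct path for that pair.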
+  for (i = 0; i < 4; i++) {
+    if (((short *)(eobs))[0] & 0xfefe)
+      vp9_idct_dequant_dc_full_2x_sse2(q, dq, pre, dst, stride, dc);
+    else
+      vp9_idct_dequant_dc_0_2x_sse2(q, dq, pre, dst, stride, dc);
+
+    if (((short *)(eobs))[1] & 0xfefe)
+      vp9_idct_dequant_dc_full_2x_sse2(q + 32, dq, pre + 8, dst + 8,
+                                       stride, dc + 2);
+    else
+      vp9_idct_dequant_dc_0_2x_sse2(q + 32, dq, pre + 8, dst + 8,
+                                    stride, dc + 2);
+
+    q    += 64;
+    dc   += 4;
+    pre  += 64;
+    dst  += stride * 4;
+    eobs += 4;
+  }
+}
+
+void vp9_dequant_idct_add_y_block_sse2(short *q, short *dq,
+                                       unsigned char *pre, unsigned char *dst,
+                                       int stride, char *eobs) {
+  int i;
+
+  for (i = 0; i < 4; i++) {
+    if (((short *)(eobs))[0] & 0xfefe)
+      vp9_idct_dequant_full_2x_sse2(q, dq, pre, dst, stride, 16);
+    else
+      vp9_idct_dequant_0_2x_sse2(q, dq, pre, dst, stride, 16);
+
+    if (((short *)(eobs))[1] & 0xfefe)
+      vp9_idct_dequant_full_2x_sse2(q + 32, dq, pre + 8, dst + 8, stride, 16);
+    else
+      vp9_idct_dequant_0_2x_sse2(q + 32, dq, pre + 8, dst + 8, stride, 16);
+
+    q    += 64;
+    pre  += 64;
+    dst  += stride * 4;
+    eobs += 4;
+  }
+}
+
+void vp9_dequant_idct_add_uv_block_sse2(short *q, short *dq,
+                                        unsigned char *pre,
+                                        unsigned char *dstu,
+                                        unsigned char *dstv,
+                                        int stride, char *eobs) {
+  if (((short *)(eobs))[0] & 0xfefe)
+    vp9_idct_dequant_full_2x_sse2(q, dq, pre, dstu, stride, 8);
+  else
+    vp9_idct_dequant_0_2x_sse2(q, dq, pre, dstu, stride, 8);
+
+  q    += 32;
+  pre  += 32;
+  dstu += stride * 4;
+
+  if (((short *)(eobs))[1] & 0xfefe)
+    vp9_idct_dequant_full_2x_sse2(q, dq, pre, dstu, stride, 8);
+  else
+    vp9_idct_dequant_0_2x_sse2(q, dq, pre, dstu, stride, 8);
+
+  q    += 32;
+  pre  += 32;
+
+  if (((short *)(eobs))[2] & 0xfefe)
+    vp9_idct_dequant_full_2x_sse2(q, dq, pre, dstv, stride, 8);
+  else
+    vp9_idct_dequant_0_2x_sse2(q, dq, pre, dstv, stride, 8);
+
+  q    += 32;
+  pre  += 32;
+  dstv += stride * 4;
+
+  if (((short *)(eobs))[3] & 0xfefe)
+    vp9_idct_dequant_full_2x_sse2(q, dq, pre, dstv, stride, 8);
+  else
+    vp9_idct_dequant_0_2x_sse2(q, dq, pre, dstv, stride, 8);
+}
--- /dev/null
+++ b/vp9/decoder/x86/x86_dsystemdependent.c
@@ -1,0 +1,26 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/config.h"
+#include "vpx_ports/x86.h"
+#include "vp9/decoder/onyxd_int.h"
+
+#if HAVE_MMX
+void vp9_dequantize_b_impl_mmx(short *sq, short *dq, short *q);
+
+void vp9_dequantize_b_mmx(BLOCKD *d) {
+  short *sq = (short *) d->qcoeff;
+  short *dq = (short *) d->dqcoeff;
+  short *q = (short *) d->dequant;
+  vp9_dequantize_b_impl_mmx(sq, dq, q);
+}
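+
+/* Scalar equivalent of the MMX routine, as a reference sketch: each of
+ * the block's 16 coefficients is scaled by its dequantization factor.
+ *
+ *   int i;
+ *   for (i = 0; i < 16; i++)
+ *     dq[i] = sq[i] * q[i];
+ */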
+#endif
+
--- /dev/null
+++ b/vp9/encoder/arm/arm_csystemdependent.c
@@ -1,0 +1,129 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_ports/arm.h"
+#include "vp9/encoder/variance.h"
+#include "vp9/encoder/onyx_int.h"
+
+extern void (*vp9_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
+extern void vp9_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
+extern void vpxyv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
+
+void vp9_arch_arm_encoder_init(VP9_COMP *cpi) {
+#if CONFIG_RUNTIME_CPU_DETECT
+  int flags = cpi->common.rtcd.flags;
+
+#if HAVE_ARMV5TE
+  if (flags & HAS_EDSP) {
+  }
+#endif
+
+#if HAVE_ARMV6
+  if (flags & HAS_MEDIA) {
+    cpi->rtcd.variance.sad16x16              = vp9_sad16x16_armv6;
+    /*cpi->rtcd.variance.sad16x8               = vp9_sad16x8_c;
+    cpi->rtcd.variance.sad8x16               = vp9_sad8x16_c;
+    cpi->rtcd.variance.sad8x8                = vp9_sad8x8_c;
+    cpi->rtcd.variance.sad4x4                = vp9_sad4x4_c;*/
+
+    /*cpi->rtcd.variance.var4x4                = vp9_variance4x4_c;*/
+    cpi->rtcd.variance.var8x8                = vp9_variance8x8_armv6;
+    /*cpi->rtcd.variance.var8x16               = vp9_variance8x16_c;
+    cpi->rtcd.variance.var16x8               = vp9_variance16x8_c;*/
+    cpi->rtcd.variance.var16x16              = vp9_variance16x16_armv6;
+
+    /*cpi->rtcd.variance.subpixvar4x4          = vp9_sub_pixel_variance4x4_c;*/
+    cpi->rtcd.variance.subpixvar8x8          = vp9_sub_pixel_variance8x8_armv6;
+    /*cpi->rtcd.variance.subpixvar8x16         = vp9_sub_pixel_variance8x16_c;
+    cpi->rtcd.variance.subpixvar16x8         = vp9_sub_pixel_variance16x8_c;*/
+    cpi->rtcd.variance.subpixvar16x16        = vp9_sub_pixel_variance16x16_armv6;
+    cpi->rtcd.variance.halfpixvar16x16_h     = vp9_variance_halfpixvar16x16_h_armv6;
+    cpi->rtcd.variance.halfpixvar16x16_v     = vp9_variance_halfpixvar16x16_v_armv6;
+    cpi->rtcd.variance.halfpixvar16x16_hv    = vp9_variance_halfpixvar16x16_hv_armv6;
+
+    cpi->rtcd.variance.mse16x16              = vp9_mse16x16_armv6;
+    /*cpi->rtcd.variance.getmbss               = vp9_get_mb_ss_c;*/
+
+    cpi->rtcd.fdct.short4x4                  = vp9_short_fdct4x4_armv6;
+    cpi->rtcd.fdct.short8x4                  = vp9_short_fdct8x4_armv6;
+    cpi->rtcd.fdct.fast4x4                   = vp9_short_fdct4x4_armv6;
+    cpi->rtcd.fdct.fast8x4                   = vp9_short_fdct8x4_armv6;
+    cpi->rtcd.fdct.walsh_short4x4            = vp9_short_walsh4x4_armv6;
+
+    /*cpi->rtcd.encodemb.berr                  = vp9_block_error_c;
+    cpi->rtcd.encodemb.mberr                 = vp9_mbblock_error_c;
+    cpi->rtcd.encodemb.mbuverr               = vp9_mbuverror_c;*/
+    cpi->rtcd.encodemb.subb                  = vp9_subtract_b_armv6;
+    cpi->rtcd.encodemb.submby                = vp9_subtract_mby_armv6;
+    cpi->rtcd.encodemb.submbuv               = vp9_subtract_mbuv_armv6;
+
+    /*cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b;*/
+    cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_armv6;
+  }
+#endif
+
+#if HAVE_ARMV7
+  if (flags & HAS_NEON) {
+    cpi->rtcd.variance.sad16x16              = vp9_sad16x16_neon;
+    cpi->rtcd.variance.sad16x8               = vp9_sad16x8_neon;
+    cpi->rtcd.variance.sad8x16               = vp9_sad8x16_neon;
+    cpi->rtcd.variance.sad8x8                = vp9_sad8x8_neon;
+    cpi->rtcd.variance.sad4x4                = vp9_sad4x4_neon;
+
+    /*cpi->rtcd.variance.var4x4                = vp9_variance4x4_c;*/
+    cpi->rtcd.variance.var8x8                = vp9_variance8x8_neon;
+    cpi->rtcd.variance.var8x16               = vp9_variance8x16_neon;
+    cpi->rtcd.variance.var16x8               = vp9_variance16x8_neon;
+    cpi->rtcd.variance.var16x16              = vp9_variance16x16_neon;
+
+    /*cpi->rtcd.variance.subpixvar4x4          = vp9_sub_pixel_variance4x4_c;*/
+    cpi->rtcd.variance.subpixvar8x8          = vp9_sub_pixel_variance8x8_neon;
+    /*cpi->rtcd.variance.subpixvar8x16         = vp9_sub_pixel_variance8x16_c;
+    cpi->rtcd.variance.subpixvar16x8         = vp9_sub_pixel_variance16x8_c;*/
+    cpi->rtcd.variance.subpixvar16x16        = vp9_sub_pixel_variance16x16_neon;
+    cpi->rtcd.variance.halfpixvar16x16_h     = vp9_variance_halfpixvar16x16_h_neon;
+    cpi->rtcd.variance.halfpixvar16x16_v     = vp9_variance_halfpixvar16x16_v_neon;
+    cpi->rtcd.variance.halfpixvar16x16_hv    = vp9_variance_halfpixvar16x16_hv_neon;
+
+    cpi->rtcd.variance.mse16x16              = vp9_mse16x16_neon;
+    /*cpi->rtcd.variance.getmbss               = vp9_get_mb_ss_c;*/
+
+    cpi->rtcd.fdct.short4x4                  = vp9_short_fdct4x4_neon;
+    cpi->rtcd.fdct.short8x4                  = vp9_short_fdct8x4_neon;
+    cpi->rtcd.fdct.fast4x4                   = vp9_short_fdct4x4_neon;
+    cpi->rtcd.fdct.fast8x4                   = vp9_short_fdct8x4_neon;
+    cpi->rtcd.fdct.walsh_short4x4            = vp9_short_walsh4x4_neon;
+
+    /*cpi->rtcd.encodemb.berr                  = vp9_block_error_c;
+    cpi->rtcd.encodemb.mberr                 = vp9_mbblock_error_c;
+    cpi->rtcd.encodemb.mbuverr               = vp9_mbuverror_c;*/
+    cpi->rtcd.encodemb.subb                  = vp9_subtract_b_neon;
+    cpi->rtcd.encodemb.submby                = vp9_subtract_mby_neon;
+    cpi->rtcd.encodemb.submbuv               = vp9_subtract_mbuv_neon;
+
+    /*cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b;
+    cpi->rtcd.quantize.quantb_pair           = vp8_regular_quantize_b_pair;*/
+    cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_neon;
+    cpi->rtcd.quantize.fastquantb_pair       = vp8_fast_quantize_b_pair_neon;
+  }
+#endif
+
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+  if (flags & HAS_NEON)
+#endif
+  {
+    vp9_yv12_copy_partial_frame_ptr = vpxyv12_copy_partial_frame_neon;
+  }
+#endif
+#endif
+}
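vp9_arch_arm_encoder_init follows the runtime-CPU-detection (RTCD) pattern: the rtcd tables start out pointing at portable C versions, and each feature block above swaps in the fastest specialization the detected CPU supports. A stripped-down sketch of the pattern, with hypothetical names and illustrative flag values:

    /* Hypothetical sketch of RTCD dispatch: default to C, then
     * upgrade per detected CPU feature. */
    typedef unsigned (*sad_fn)(const unsigned char *src, int src_stride,
                               const unsigned char *ref, int ref_stride);

    #define HAS_MEDIA_SKETCH (1 << 0)
    #define HAS_NEON_SKETCH  (1 << 1)

    static sad_fn pick_sad16x16(int flags, sad_fn c_ver,
                                sad_fn media_ver, sad_fn neon_ver) {
      sad_fn fn = c_ver;                            /* always-safe default */
      if (flags & HAS_MEDIA_SKETCH) fn = media_ver;
      if (flags & HAS_NEON_SKETCH)  fn = neon_ver;  /* NEON wins if present */
      return fn;
    }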
--- /dev/null
+++ b/vp9/encoder/arm/armv5te/boolhuff_armv5te.asm
@@ -1,0 +1,286 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT |vp8_start_encode|
+    EXPORT |vp9_encode_bool|
+    EXPORT |vp8_stop_encode|
+    EXPORT |vp8_encode_value|
+
+    INCLUDE asm_enc_offsets.asm
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA    |.text|, CODE, READONLY
+
+; r0 BOOL_CODER *br
+; r1 unsigned char *source
+
+|vp8_start_encode| PROC
+    mov     r12, #0
+    mov     r3,  #255
+    mvn     r2,  #23
+    str     r12, [r0, #vp9_writer_lowvalue]
+    str     r3,  [r0, #vp9_writer_range]
+    str     r12, [r0, #vp9_writer_value]
+    str     r2,  [r0, #vp9_writer_count]
+    str     r12, [r0, #vp9_writer_pos]
+    str     r1,  [r0, #vp9_writer_buffer]
+    bx      lr
+    ENDP
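In C terms, vp8_start_encode only seeds the writer state; note that mvn r2, #23 materializes -24, the coder's initial bit count, so the first byte is flushed only after 24 bits have accumulated. A sketch assuming the fields named by the vp9_writer_* offsets:

    /* Minimal stand-in for the writer state the assembly manipulates;
     * field names follow the vp9_writer_* offsets above. */
    typedef struct {
      unsigned int lowvalue;
      unsigned int range;
      unsigned int value;
      int count;
      int pos;
      unsigned char *buffer;
    } BOOL_CODER_SKETCH;

    void start_encode_sketch(BOOL_CODER_SKETCH *br, unsigned char *source) {
      br->lowvalue = 0;
      br->range    = 255;   /* full coder range                 */
      br->value    = 0;
      br->count    = -24;   /* bits owed before the first flush */
      br->pos      = 0;
      br->buffer   = source;
    }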
+
+; r0 BOOL_CODER *br
+; r1 int bit
+; r2 int probability
+|vp9_encode_bool| PROC
+    push    {r4-r9, lr}
+
+    mov     r4, r2
+
+    ldr     r2, [r0, #vp9_writer_lowvalue]
+    ldr     r5, [r0, #vp9_writer_range]
+    ldr     r3, [r0, #vp9_writer_count]
+
+    sub     r7, r5, #1                  ; range-1
+
+    cmp     r1, #0
+    mul     r6, r4, r7                  ; ((range-1) * probability)
+
+    mov     r7, #1
+    add     r4, r7, r6, lsr #8          ; 1 + (((range-1) * probability) >> 8)
+
+    addne   r2, r2, r4                  ; if  (bit) lowvalue += split
+    subne   r4, r5, r4                  ; if  (bit) range = range-split
+
+    ; Counting the leading zeros is used to normalize range.
+    clz     r6, r4
+    sub     r6, r6, #24                 ; shift
+
+    ; Flag is set on the sum of count.  This flag is used later
+    ; to determine if count >= 0
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi     token_count_lt_zero         ; if(count >= 0)
+
+    sub     r6, r6, r3                  ; offset = shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     token_high_bit_not_set
+
+    ldr     r4, [r0, #vp9_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos-1
+    b       token_zero_while_start
+token_zero_while_loop
+    mov     r9, #0
+    strb    r9, [r7, r4]                ; w->buffer[x] =(unsigned char)0
+    sub     r4, r4, #1                  ; x--
+token_zero_while_start
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp9_writer_buffer]
+    ldrb    r1, [r7, r4]
+    cmpge   r1, #0xff
+    beq     token_zero_while_loop
+
+    ldr     r7, [r0, #vp9_writer_buffer]
+    ldrb    r9, [r7, r4]                ; w->buffer[x]
+    add     r9, r9, #1
+    strb    r9, [r7, r4]                ; w->buffer[x] + 1
+token_high_bit_not_set
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r9, [r0, #vp9_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp9_writer_pos]   ; w->pos
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r1, r4, #1                  ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r1, [r0, #vp9_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r9, r4]                ; w->buffer[w->pos++]
+
+token_count_lt_zero
+    lsl     r2, r2, r6                  ; lowvalue <<= shift
+
+    str     r2, [r0, #vp9_writer_lowvalue]
+    str     r5, [r0, #vp9_writer_range]
+    str     r3, [r0, #vp9_writer_count]
+    pop     {r4-r9, pc}
+    ENDP
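vp9_encode_bool is a textbook binary arithmetic-coder step: the current range is split in proportion to the probability, the bit selects the upper or lower part, and the range is renormalized back above 128 by the shift that clz computes. A C rendering of the same flow, using the BOOL_CODER_SKETCH defined above:

    void encode_bool_sketch(BOOL_CODER_SKETCH *br, int bit, int probability) {
      unsigned int lowvalue = br->lowvalue;
      unsigned int range = br->range;
      int count = br->count;
      unsigned int split = 1 + (((range - 1) * probability) >> 8);
      int shift = 0;

      if (bit) {
        lowvalue += split;
        range -= split;
      } else {
        range = split;
      }

      /* Renormalize; the assembly derives this shift with clz. */
      while (range < 128) {
        range <<= 1;
        shift++;
      }
      count += shift;

      if (count >= 0) {                    /* a full output byte is ready */
        int offset = shift - count;
        if ((lowvalue << (offset - 1)) & 0x80000000) {
          int x = br->pos - 1;             /* carry out: fix up 0xff run */
          while (x >= 0 && br->buffer[x] == 0xff)
            br->buffer[x--] = 0;
          br->buffer[x] += 1;
        }
        br->buffer[br->pos++] = (unsigned char)(lowvalue >> (24 - offset));
        lowvalue = (lowvalue << offset) & 0xffffff;
        shift = count;
        count -= 8;
      }
      lowvalue <<= shift;

      br->lowvalue = lowvalue;
      br->range = range;
      br->count = count;
    }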
+
+; r0 BOOL_CODER *br
+|vp8_stop_encode| PROC
+    push    {r4-r10, lr}
+
+    ldr     r2, [r0, #vp9_writer_lowvalue]
+    ldr     r5, [r0, #vp9_writer_range]
+    ldr     r3, [r0, #vp9_writer_count]
+
+    mov     r10, #32
+
+stop_encode_loop
+    sub     r7, r5, #1                  ; range-1
+
+    mov     r4, r7, lsl #7              ; ((range-1) * 128)
+
+    mov     r7, #1
+    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * 128) >> 8)
+
+    ; Counting the leading zeros is used to normalize range.
+    clz     r6, r4
+    sub     r6, r6, #24                 ; shift
+
+    ; Flag is set on the sum of count.  This flag is used later
+    ; to determine if count >= 0
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi     token_count_lt_zero_se      ; if(count >= 0)
+
+    sub     r6, r6, r3                  ; offset = shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     token_high_bit_not_set_se
+
+    ldr     r4, [r0, #vp9_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos-1
+    b       token_zero_while_start_se
+token_zero_while_loop_se
+    mov     r9, #0
+    strb    r9, [r7, r4]                ; w->buffer[x] =(unsigned char)0
+    sub     r4, r4, #1                  ; x--
+token_zero_while_start_se
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp9_writer_buffer]
+    ldrb    r1, [r7, r4]
+    cmpge   r1, #0xff
+    beq     token_zero_while_loop_se
+
+    ldr     r7, [r0, #vp9_writer_buffer]
+    ldrb    r9, [r7, r4]                ; w->buffer[x]
+    add     r9, r9, #1
+    strb    r9, [r7, r4]                ; w->buffer[x] + 1
+token_high_bit_not_set_se
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r9, [r0, #vp9_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp9_writer_pos]   ; w->pos
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r1, r4, #1                  ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r1, [r0, #vp9_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r9, r4]                ; w->buffer[w->pos++]
+
+token_count_lt_zero_se
+    lsl     r2, r2, r6                  ; lowvalue <<= shift
+
+    subs    r10, r10, #1
+    bne     stop_encode_loop
+
+    str     r2, [r0, #vp9_writer_lowvalue]
+    str     r5, [r0, #vp9_writer_range]
+    str     r3, [r0, #vp9_writer_count]
+    pop     {r4-r10, pc}
+
+    ENDP
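vp8_stop_encode drains the coder: it runs the same split/renormalize sequence 32 times with an even split (probability 128) and bit 0, which forces every bit still pending in lowvalue out through the byte writer. Reusing the encode_bool sketch above:

    void stop_encode_sketch(BOOL_CODER_SKETCH *br) {
      int i;
      for (i = 0; i < 32; i++)
        encode_bool_sketch(br, 0, 128);  /* flush with even splits */
    }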
+
+; r0 BOOL_CODER *br
+; r1 int data
+; r2 int bits
+|vp8_encode_value| PROC
+    push    {r4-r11, lr}
+
+    mov     r10, r2
+
+    ldr     r2, [r0, #vp9_writer_lowvalue]
+    ldr     r5, [r0, #vp9_writer_range]
+    ldr     r3, [r0, #vp9_writer_count]
+
+    rsb     r4, r10, #32                 ; 32-n
+
+    ; v is kept in r1 during the token pack loop
+    lsl     r1, r1, r4                  ; r1 = v << 32 - n
+
+encode_value_loop
+    sub     r7, r5, #1                  ; range-1
+
+    ; Decisions are made based on the bit value shifted
+    ; off of v, so set a flag here based on this.
+    ; This value is referred to as "bb"
+    lsls    r1, r1, #1                  ; bit = v >> n
+    mov     r4, r7, lsl #7              ; ((range-1) * 128)
+
+    mov     r7, #1
+    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * 128) >> 8)
+
+    addcs   r2, r2, r4                  ; if  (bit) lowvalue += split
+    subcs   r4, r5, r4                  ; if  (bit) range = range-split
+
+    ; Counting the leading zeros is used to normalize range.
+    clz     r6, r4
+    sub     r6, r6, #24                 ; shift
+
+    ; Flag is set on the sum of count.  This flag is used later
+    ; to determine if count >= 0
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi     token_count_lt_zero_ev      ; if(count >= 0)
+
+    sub     r6, r6, r3                  ; offset = shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     token_high_bit_not_set_ev
+
+    ldr     r4, [r0, #vp9_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos-1
+    b       token_zero_while_start_ev
+token_zero_while_loop_ev
+    mov     r9, #0
+    strb    r9, [r7, r4]                ; w->buffer[x] =(unsigned char)0
+    sub     r4, r4, #1                  ; x--
+token_zero_while_start_ev
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp9_writer_buffer]
+    ldrb    r11, [r7, r4]
+    cmpge   r11, #0xff
+    beq     token_zero_while_loop_ev
+
+    ldr     r7, [r0, #vp9_writer_buffer]
+    ldrb    r9, [r7, r4]                ; w->buffer[x]
+    add     r9, r9, #1
+    strb    r9, [r7, r4]                ; w->buffer[x] + 1
+token_high_bit_not_set_ev
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r9, [r0, #vp9_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp9_writer_pos]   ; w->pos
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r11, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r11, [r0, #vp9_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r9, r4]                ; w->buffer[w->pos++]
+
+token_count_lt_zero_ev
+    lsl     r2, r2, r6                  ; lowvalue <<= shift
+
+    subs    r10, r10, #1
+    bne     encode_value_loop
+
+    str     r2, [r0, #vp9_writer_lowvalue]
+    str     r5, [r0, #vp9_writer_range]
+    str     r3, [r0, #vp9_writer_count]
+    pop     {r4-r11, pc}
+    ENDP
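vp8_encode_value writes an n-bit literal most-significant-bit first, each bit at the flat probability 128; the assembly pre-shifts the value to the top of r1 so each lsls drops the next bit into the carry flag. The equivalent in terms of the encode_bool sketch:

    void encode_value_sketch(BOOL_CODER_SKETCH *br, int data, int bits) {
      int bit;
      for (bit = bits - 1; bit >= 0; bit--)      /* msb first */
        encode_bool_sketch(br, (data >> bit) & 1, 128);
    }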
+
+    END
--- /dev/null
+++ b/vp9/encoder/arm/armv5te/vp8_packtokens_armv5.asm
@@ -1,0 +1,291 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT |vp8cx_pack_tokens_armv5|
+
+    INCLUDE asm_enc_offsets.asm
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA    |.text|, CODE, READONLY
+
+; r0 vp9_writer *w
+; r1 const TOKENEXTRA *p
+; r2 int xcount
+; r3 vp8_coef_encodings
+; s0 vp8_extra_bits
+; s1 vp8_coef_tree
+|vp8cx_pack_tokens_armv5| PROC
+    push    {r4-r11, lr}
+
+    ; Add xcount * sizeof (TOKENEXTRA) to p to get stop
+    ;  sizeof (TOKENEXTRA) is 8
+    sub     sp, sp, #12
+    add     r2, r1, r2, lsl #3          ; stop = p + xcount*sizeof(TOKENEXTRA)
+    str     r2, [sp, #0]
+    str     r3, [sp, #8]                ; save vp8_coef_encodings
+    ldr     r2, [r0, #vp9_writer_lowvalue]
+    ldr     r5, [r0, #vp9_writer_range]
+    ldr     r3, [r0, #vp9_writer_count]
+    b       check_p_lt_stop
+
+while_p_lt_stop
+    ldrb    r6, [r1, #tokenextra_token] ; t
+    ldr     r4, [sp, #8]                ; vp8_coef_encodings
+    mov     lr, #0
+    add     r4, r4, r6, lsl #3          ; a = vp8_coef_encodings + t
+    ldr     r9, [r1, #tokenextra_context_tree]   ; pp
+
+    ldrb    r7, [r1, #tokenextra_skip_eob_node]
+
+    ldr     r6, [r4, #vp9_token_value]  ; v
+    ldr     r8, [r4, #vp9_token_len]    ; n
+
+    ; vp8 specific skip_eob_node
+    cmp     r7, #0
+    movne   lr, #2                      ; i = 2
+    subne   r8, r8, #1                  ; --n
+
+    rsb     r4, r8, #32                 ; 32-n
+    ldr     r10, [sp, #52]              ; vp8_coef_tree
+
+    ; v is kept in r12 during the token pack loop
+    lsl     r12, r6, r4                ; r12 = v << 32 - n
+
+; loop start
+token_loop
+    ldrb    r4, [r9, lr, asr #1]        ; pp [i>>1]
+    sub     r7, r5, #1                  ; range-1
+
+    ; Decisions are made based on the bit value shifted
+    ; off of v, so set a flag here based on this.
+    ; This value is referred to as "bb"
+    lsls    r12, r12, #1                ; bb = v >> n
+    mul     r6, r4, r7                  ; ((range-1) * pp[i>>1]))
+
+    ; bb can only be 0 or 1.  So only execute this statement
+    ; if bb == 1, otherwise it will act like i + 0
+    addcs   lr, lr, #1                  ; i + bb
+
+    mov     r7, #1
+    ldrsb   lr, [r10, lr]               ; i = vp8_coef_tree[i+bb]
+    add     r4, r7, r6, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)
+
+    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
+    subcs   r4, r5, r4                  ; if  (bb) range = range-split
+
+    ; Counting the leading zeros is used to normalize range.
+    clz     r6, r4
+    sub     r6, r6, #24                 ; shift
+
+    ; Flag is set on the sum of count.  This flag is used later
+    ; to determine if count >= 0
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi     token_count_lt_zero         ; if(count >= 0)
+
+    sub     r6, r6, r3                  ; offset = shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     token_high_bit_not_set
+
+    ldr     r4, [r0, #vp9_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos-1
+    b       token_zero_while_start
+token_zero_while_loop
+    mov     r10, #0
+    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
+    sub     r4, r4, #1                  ; x--
+token_zero_while_start
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp9_writer_buffer]
+    ldrb    r11, [r7, r4]
+    cmpge   r11, #0xff
+    beq     token_zero_while_loop
+
+    ldr     r7, [r0, #vp9_writer_buffer]
+    ldrb    r10, [r7, r4]               ; w->buffer[x]
+    add     r10, r10, #1
+    strb    r10, [r7, r4]               ; w->buffer[x] + 1
+token_high_bit_not_set
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r10, [r0, #vp9_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp9_writer_pos]   ; w->pos
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r11, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r11, [r0, #vp9_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r10, r4]               ; w->buffer[w->pos++]
+
+    ; r10 holds vp8_coef_tree earlier in the loop, but is used as a
+    ; temp variable here.  So once it is free again, reload
+    ; vp8_coef_tree into r10
+    ldr     r10, [sp, #52]              ; vp8_coef_tree
+
+token_count_lt_zero
+    lsl     r2, r2, r6                  ; lowvalue <<= shift
+
+    subs    r8, r8, #1                  ; --n
+    bne     token_loop
+
+    ldrb    r6, [r1, #tokenextra_token] ; t
+    ldr     r7, [sp, #48]               ; vp8_extra_bits
+    ; Add t * sizeof (vp9_extra_bit_struct) to get the desired
+    ;  element.  sizeof (vp9_extra_bit_struct) is 16
+    add     r12, r7, r6, lsl #4         ; b = vp8_extra_bits + t
+
+    ldr     r4, [r12, #vp9_extra_bit_struct_base_val]
+    cmp     r4, #0
+    beq     skip_extra_bits
+
+;   if( b->base_val)
+    ldr     r8, [r12, #vp9_extra_bit_struct_len] ; L
+    ldrsh   lr, [r1, #tokenextra_extra] ; e = p->Extra
+    cmp     r8, #0                      ; if( L)
+    beq     no_extra_bits
+
+    ldr     r9, [r12, #vp9_extra_bit_struct_prob]
+    asr     r7, lr, #1                  ; v=e>>1
+
+    ldr     r10, [r12, #vp9_extra_bit_struct_tree]
+    str     r10, [sp, #4]               ; b->tree
+
+    rsb     r4, r8, #32
+    lsl     r12, r7, r4
+
+    mov     lr, #0                      ; i = 0
+
+extra_bits_loop
+    ldrb    r4, [r9, lr, asr #1]            ; pp[i>>1]
+    sub     r7, r5, #1                  ; range-1
+    lsls    r12, r12, #1                ; v >> n
+    mul     r6, r4, r7                  ; (range-1) * pp[i>>1]
+    addcs   lr, lr, #1                  ; i + bb
+
+    mov     r7, #1
+    ldrsb   lr, [r10, lr]               ; i = b->tree[i+bb]
+    add     r4, r7, r6, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)
+
+    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
+    subcs   r4, r5, r4                  ; if  (bb) range = range-split
+
+    clz     r6, r4
+    sub     r6, r6, #24
+
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi     extra_count_lt_zero         ; if(count >= 0)
+
+    sub     r6, r6, r3                  ; offset= shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     extra_high_bit_not_set
+
+    ldr     r4, [r0, #vp9_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos - 1
+    b       extra_zero_while_start
+extra_zero_while_loop
+    mov     r10, #0
+    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
+    sub     r4, r4, #1                  ; x--
+extra_zero_while_start
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp9_writer_buffer]
+    ldrb    r11, [r7, r4]
+    cmpge   r11, #0xff
+    beq     extra_zero_while_loop
+
+    ldr     r7, [r0, #vp9_writer_buffer]
+    ldrb    r10, [r7, r4]
+    add     r10, r10, #1
+    strb    r10, [r7, r4]
+extra_high_bit_not_set
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r10, [r0, #vp9_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp9_writer_pos]
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r11, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r11, [r0, #vp9_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r10, r4]               ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
+    ldr     r10, [sp, #4]               ; b->tree
+extra_count_lt_zero
+    lsl     r2, r2, r6
+
+    subs    r8, r8, #1                  ; --n
+    bne     extra_bits_loop             ; while (n)
+
+no_extra_bits
+    ldr     lr, [r1, #4]                ; e = p->Extra
+    add     r4, r5, #1                  ; range + 1
+    tst     lr, #1
+    lsr     r4, r4, #1                  ; split = (range + 1) >> 1
+    addne   r2, r2, r4                  ; lowvalue += split
+    subne   r4, r5, r4                  ; range = range-split
+    tst     r2, #0x80000000             ; lowvalue & 0x80000000
+    lsl     r5, r4, #1                  ; range <<= 1
+    beq     end_high_bit_not_set
+
+    ldr     r4, [r0, #vp9_writer_pos]
+    mov     r7, #0
+    sub     r4, r4, #1
+    b       end_zero_while_start
+end_zero_while_loop
+    strb    r7, [r6, r4]
+    sub     r4, r4, #1                  ; x--
+end_zero_while_start
+    cmp     r4, #0
+    ldrge   r6, [r0, #vp9_writer_buffer]
+    ldrb    r12, [r6, r4]
+    cmpge   r12, #0xff
+    beq     end_zero_while_loop
+
+    ldr     r6, [r0, #vp9_writer_buffer]
+    ldrb    r7, [r6, r4]
+    add     r7, r7, #1
+    strb    r7, [r6, r4]
+end_high_bit_not_set
+    adds    r3, r3, #1                  ; ++count
+    lsl     r2, r2, #1                  ; lowvalue  <<= 1
+    bne     end_count_zero
+
+    ldr     r4, [r0, #vp9_writer_pos]
+    mvn     r3, #7
+    ldr     r7, [r0, #vp9_writer_buffer]
+    lsr     r6, r2, #24                 ; lowvalue >> 24
+    add     r12, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r12, [r0, #0x10]
+    strb    r6, [r7, r4]
+end_count_zero
+skip_extra_bits
+    add     r1, r1, #TOKENEXTRA_SZ      ; ++p
+check_p_lt_stop
+    ldr     r4, [sp, #0]                ; stop
+    cmp     r1, r4                      ; while( p < stop)
+    bcc     while_p_lt_stop
+
+    str     r2, [r0, #vp9_writer_lowvalue]
+    str     r5, [r0, #vp9_writer_range]
+    str     r3, [r0, #vp9_writer_count]
+    add     sp, sp, #12
+    pop     {r4-r11, pc}
+    ENDP
+
+    END
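The token_zero_while loops that recur throughout this file implement the coder's carry propagation: when renormalization carries out of lowvalue's top bit, the run of 0xff bytes already written must wrap to 0x00, and the byte just before the run is incremented. As a standalone C sketch:

    /* Carry out of the arithmetic coder: 0xff bytes become 0x00 and
     * the first non-0xff byte below them absorbs the +1.  Assumes
     * such a byte exists, which the coder's invariants guarantee. */
    static void propagate_carry_sketch(unsigned char *buffer, int pos) {
      int x = pos - 1;
      while (x >= 0 && buffer[x] == 0xff)
        buffer[x--] = 0;
      buffer[x] += 1;
    }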
--- /dev/null
+++ b/vp9/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
@@ -1,0 +1,327 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT |vp8cx_pack_mb_row_tokens_armv5|
+
+    INCLUDE asm_enc_offsets.asm
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA    |.text|, CODE, READONLY
+
+; r0 VP8_COMP *cpi
+; r1 vp9_writer *w
+; r2 vp8_coef_encodings
+; r3 vp8_extra_bits
+; s0 vp8_coef_tree
+
+|vp8cx_pack_mb_row_tokens_armv5| PROC
+    push    {r4-r11, lr}
+    sub     sp, sp, #24
+
+    ; Compute address of cpi->common.mb_rows
+    ldr     r4, _VP8_COMP_common_
+    ldr     r6, _VP8_COMMON_MBrows_
+    add     r4, r0, r4
+
+    ldr     r5, [r4, r6]                ; load up mb_rows
+
+    str     r2, [sp, #20]               ; save vp8_coef_encodings
+    str     r5, [sp, #12]               ; save mb_rows
+    str     r3, [sp, #8]                ; save vp8_extra_bits
+
+    ldr     r4, _VP8_COMP_tplist_
+    add     r4, r0, r4
+    ldr     r7, [r4, #0]                ; dereference cpi->tp_list
+
+    mov     r0, r1                      ; keep same as other loops
+
+    ldr     r2, [r0, #vp9_writer_lowvalue]
+    ldr     r5, [r0, #vp9_writer_range]
+    ldr     r3, [r0, #vp9_writer_count]
+
+mb_row_loop
+
+    ldr     r1, [r7, #tokenlist_start]
+    ldr     r9, [r7, #tokenlist_stop]
+    str     r9, [sp, #0]                ; save stop for later comparison
+    str     r7, [sp, #16]               ; tokenlist address for next time
+
+    b       check_p_lt_stop
+
+    ; actual work gets done here!
+
+while_p_lt_stop
+    ldrb    r6, [r1, #tokenextra_token] ; t
+    ldr     r4, [sp, #20]               ; vp8_coef_encodings
+    mov     lr, #0
+    add     r4, r4, r6, lsl #3          ; a = vp8_coef_encodings + t
+    ldr     r9, [r1, #tokenextra_context_tree]   ; pp
+
+    ldrb    r7, [r1, #tokenextra_skip_eob_node]
+
+    ldr     r6, [r4, #vp9_token_value]  ; v
+    ldr     r8, [r4, #vp9_token_len]    ; n
+
+    ; vp8 specific skip_eob_node
+    cmp     r7, #0
+    movne   lr, #2                      ; i = 2
+    subne   r8, r8, #1                  ; --n
+
+    rsb     r4, r8, #32                 ; 32-n
+    ldr     r10, [sp, #60]              ; vp8_coef_tree
+
+    ; v is kept in r12 during the token pack loop
+    lsl     r12, r6, r4                 ; r12 = v << 32 - n
+
+; loop start
+token_loop
+    ldrb    r4, [r9, lr, asr #1]        ; pp [i>>1]
+    sub     r7, r5, #1                  ; range-1
+
+    ; Decisions are made based on the bit value shifted
+    ; off of v, so set a flag here based on this.
+    ; This value is referred to as "bb"
+    lsls    r12, r12, #1                ; bb = v >> n
+    mul     r6, r4, r7                  ; ((range-1) * pp[i>>1]))
+
+    ; bb can only be 0 or 1.  So only execute this statement
+    ; if bb == 1, otherwise it will act like i + 0
+    addcs   lr, lr, #1                  ; i + bb
+
+    mov     r7, #1
+    ldrsb   lr, [r10, lr]               ; i = vp8_coef_tree[i+bb]
+    add     r4, r7, r6, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)
+
+    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
+    subcs   r4, r5, r4                  ; if  (bb) range = range-split
+
+    ; Counting the leading zeros is used to normalize range.
+    clz     r6, r4
+    sub     r6, r6, #24                 ; shift
+
+    ; Flag is set on the sum of count.  This flag is used later
+    ; to determine if count >= 0
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi     token_count_lt_zero         ; if(count >= 0)
+
+    sub     r6, r6, r3                  ; offset = shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     token_high_bit_not_set
+
+    ldr     r4, [r0, #vp9_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos-1
+    b       token_zero_while_start
+token_zero_while_loop
+    mov     r10, #0
+    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
+    sub     r4, r4, #1                  ; x--
+token_zero_while_start
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp9_writer_buffer]
+    ldrb    r11, [r7, r4]
+    cmpge   r11, #0xff
+    beq     token_zero_while_loop
+
+    ldr     r7, [r0, #vp9_writer_buffer]
+    ldrb    r10, [r7, r4]               ; w->buffer[x]
+    add     r10, r10, #1
+    strb    r10, [r7, r4]               ; w->buffer[x] + 1
+token_high_bit_not_set
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r10, [r0, #vp9_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp9_writer_pos]   ; w->pos
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r11, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r11, [r0, #vp9_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r10, r4]               ; w->buffer[w->pos++]
+
+    ; r10 holds vp8_coef_tree earlier in the loop, but is used as a
+    ; temp variable here.  So once it is free again, reload
+    ; vp8_coef_tree into r10
+    ldr     r10, [sp, #60]              ; vp8_coef_tree
+
+token_count_lt_zero
+    lsl     r2, r2, r6                  ; lowvalue <<= shift
+
+    subs    r8, r8, #1                  ; --n
+    bne     token_loop
+
+    ldrb    r6, [r1, #tokenextra_token] ; t
+    ldr     r7, [sp, #8]                ; vp8_extra_bits
+    ; Add t * sizeof (vp9_extra_bit_struct) to get the desired
+    ;  element.  sizeof (vp9_extra_bit_struct) is 16
+    add     r12, r7, r6, lsl #4         ; b = vp8_extra_bits + t
+
+    ldr     r4, [r12, #vp9_extra_bit_struct_base_val]
+    cmp     r4, #0
+    beq     skip_extra_bits
+
+;   if( b->base_val)
+    ldr     r8, [r12, #vp9_extra_bit_struct_len] ; L
+    ldrsh   lr, [r1, #tokenextra_extra] ; e = p->Extra
+    cmp     r8, #0                      ; if( L)
+    beq     no_extra_bits
+
+    ldr     r9, [r12, #vp9_extra_bit_struct_prob]
+    asr     r7, lr, #1                  ; v=e>>1
+
+    ldr     r10, [r12, #vp9_extra_bit_struct_tree]
+    str     r10, [sp, #4]               ; b->tree
+
+    rsb     r4, r8, #32
+    lsl     r12, r7, r4
+
+    mov     lr, #0                      ; i = 0
+
+extra_bits_loop
+    ldrb    r4, [r9, lr, asr #1]            ; pp[i>>1]
+    sub     r7, r5, #1                  ; range-1
+    lsls    r12, r12, #1                ; v >> n
+    mul     r6, r4, r7                  ; (range-1) * pp[i>>1]
+    addcs   lr, lr, #1                  ; i + bb
+
+    mov     r7, #1
+    ldrsb   lr, [r10, lr]               ; i = b->tree[i+bb]
+    add     r4, r7, r6, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)
+
+    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
+    subcs   r4, r5, r4                  ; if  (bb) range = range-split
+
+    clz     r6, r4
+    sub     r6, r6, #24
+
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi     extra_count_lt_zero         ; if(count >= 0)
+
+    sub     r6, r6, r3                  ; offset= shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     extra_high_bit_not_set
+
+    ldr     r4, [r0, #vp9_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos - 1
+    b       extra_zero_while_start
+extra_zero_while_loop
+    mov     r10, #0
+    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
+    sub     r4, r4, #1                  ; x--
+extra_zero_while_start
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp9_writer_buffer]
+    ldrb    r11, [r7, r4]
+    cmpge   r11, #0xff
+    beq     extra_zero_while_loop
+
+    ldr     r7, [r0, #vp9_writer_buffer]
+    ldrb    r10, [r7, r4]
+    add     r10, r10, #1
+    strb    r10, [r7, r4]
+extra_high_bit_not_set
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r10, [r0, #vp9_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp9_writer_pos]
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r11, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r11, [r0, #vp9_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r10, r4]               ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
+    ldr     r10, [sp, #4]               ; b->tree
+extra_count_lt_zero
+    lsl     r2, r2, r6
+
+    subs    r8, r8, #1                  ; --n
+    bne     extra_bits_loop             ; while (n)
+
+no_extra_bits
+    ldr     lr, [r1, #4]                ; e = p->Extra
+    add     r4, r5, #1                  ; range + 1
+    tst     lr, #1
+    lsr     r4, r4, #1                  ; split = (range + 1) >> 1
+    addne   r2, r2, r4                  ; lowvalue += split
+    subne   r4, r5, r4                  ; range = range-split
+    tst     r2, #0x80000000             ; lowvalue & 0x80000000
+    lsl     r5, r4, #1                  ; range <<= 1
+    beq     end_high_bit_not_set
+
+    ldr     r4, [r0, #vp9_writer_pos]
+    mov     r7, #0
+    sub     r4, r4, #1
+    b       end_zero_while_start
+end_zero_while_loop
+    strb    r7, [r6, r4]
+    sub     r4, r4, #1                  ; x--
+end_zero_while_start
+    cmp     r4, #0
+    ldrge   r6, [r0, #vp9_writer_buffer]
+    ldrb    r12, [r6, r4]
+    cmpge   r12, #0xff
+    beq     end_zero_while_loop
+
+    ldr     r6, [r0, #vp9_writer_buffer]
+    ldrb    r7, [r6, r4]
+    add     r7, r7, #1
+    strb    r7, [r6, r4]
+end_high_bit_not_set
+    adds    r3, r3, #1                  ; ++count
+    lsl     r2, r2, #1                  ; lowvalue  <<= 1
+    bne     end_count_zero
+
+    ldr     r4, [r0, #vp9_writer_pos]
+    mvn     r3, #7
+    ldr     r7, [r0, #vp9_writer_buffer]
+    lsr     r6, r2, #24                 ; lowvalue >> 24
+    add     r12, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r12, [r0, #0x10]
+    strb    r6, [r7, r4]
+end_count_zero
+skip_extra_bits
+    add     r1, r1, #TOKENEXTRA_SZ      ; ++p
+check_p_lt_stop
+    ldr     r4, [sp, #0]                ; stop
+    cmp     r1, r4                      ; while( p < stop)
+    bcc     while_p_lt_stop
+
+    ldr     r6, [sp, #12]               ; mb_rows
+    ldr     r7, [sp, #16]               ; tokenlist address
+    subs    r6, r6, #1
+    add     r7, r7, #TOKENLIST_SZ       ; next element in the array
+    str     r6, [sp, #12]
+    bne     mb_row_loop
+
+    str     r2, [r0, #vp9_writer_lowvalue]
+    str     r5, [r0, #vp9_writer_range]
+    str     r3, [r0, #vp9_writer_count]
+    add     sp, sp, #24
+    pop     {r4-r11, pc}
+    ENDP
+
+_VP8_COMP_common_
+    DCD     vp8_comp_common
+_VP8_COMMON_MBrows_
+    DCD     vp8_common_mb_rows
+_VP8_COMP_tplist_
+    DCD     vp8_comp_tplist
+
+    END
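Structurally, the mb-row packer is the single-list packer wrapped in a loop over cpi->tp_list, one token list per macroblock row, each packed over its [start, stop) span. A sketch with hypothetical types and helper:

    typedef struct TOKENEXTRA TOKENEXTRA;
    typedef struct { TOKENEXTRA *start, *stop; } TOKENLIST;
    typedef struct vp9_writer vp9_writer;

    /* pack_one_token stands in for the token_loop body above. */
    void pack_one_token(vp9_writer *w, const TOKENEXTRA *p);

    void pack_mb_rows_sketch(vp9_writer *w, const TOKENLIST *tp_list,
                             int mb_rows) {
      int row;
      for (row = 0; row < mb_rows; row++) {
        const TOKENEXTRA *p = tp_list[row].start;
        while (p < tp_list[row].stop)            /* while (p < stop) */
          pack_one_token(w, p++);
      }
    }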
--- /dev/null
+++ b/vp9/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
@@ -1,0 +1,465 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT |vp8cx_pack_tokens_into_partitions_armv5|
+
+    INCLUDE asm_enc_offsets.asm
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA    |.text|, CODE, READONLY
+
+; r0 VP8_COMP *cpi
+; r1 unsigned char *cx_data
+; r2 int num_part
+; r3 *size
+; s0 vp8_coef_encodings
+; s1 vp8_extra_bits,
+; s2 const vp9_tree_index *,
+
+|vp8cx_pack_tokens_into_partitions_armv5| PROC
+    push    {r4-r11, lr}
+    sub     sp, sp, #44
+
+    ; Compute address of cpi->common.mb_rows
+    ldr     r4, _VP8_COMP_common_
+    ldr     r6, _VP8_COMMON_MBrows_
+    add     r4, r0, r4
+
+    ldr     r5, [r4, r6]                ; load up mb_rows
+
+    str     r5, [sp, #36]               ; save mb_rows
+    str     r1, [sp, #24]               ; save cx_data
+    str     r2, [sp, #20]               ; save num_part
+    str     r3, [sp, #8]                ; save *size
+
+    ; *size = 3*(num_part -1 );
+    sub     r2, r2, #1                  ; num_part - 1
+    add     r2, r2, r2, lsl #1          ; 3*(num_part - 1)
+    str     r2, [r3]
+
+    add     r2, r2, r1                  ; cx_data + *size
+    str     r2, [sp, #40]               ; ptr
+
+    ldr     r4, _VP8_COMP_tplist_
+    add     r4, r0, r4
+    ldr     r7, [r4, #0]                ; dereference cpi->tp_list
+    str     r7, [sp, #32]               ; store start of cpi->tp_list
+
+    ldr     r11, _VP8_COMP_bc2_         ; load up vp9_writer out of cpi
+    add     r0, r0, r11
+
+    mov     r11, #0
+    str     r11, [sp, #28]              ; i
+
+numparts_loop
+    ldr     r10, [sp, #40]              ; ptr
+    ldr     r5,  [sp, #36]              ; move mb_rows to the counting section
+    sub     r5, r5, r11                 ; move start point with each partition
+                                        ; mb_rows starts at i
+    str     r5,  [sp, #12]
+
+    ; Reset all of the VP8 Writer data for each partition that
+    ; is processed.
+    ; start_encode
+    mov     r2, #0                      ; vp9_writer_lowvalue
+    mov     r5, #255                    ; vp9_writer_range
+    mvn     r3, #23                     ; vp9_writer_count
+
+    str     r2,  [r0, #vp9_writer_value]
+    str     r2,  [r0, #vp9_writer_pos]
+    str     r10, [r0, #vp9_writer_buffer]
+
+mb_row_loop
+
+    ldr     r1, [r7, #tokenlist_start]
+    ldr     r9, [r7, #tokenlist_stop]
+    str     r9, [sp, #0]                ; save stop for later comparison
+    str     r7, [sp, #16]               ; tokenlist address for next time
+
+    b       check_p_lt_stop
+
+    ; actual work gets done here!
+
+while_p_lt_stop
+    ldrb    r6, [r1, #tokenextra_token] ; t
+    ldr     r4, [sp, #80]               ; vp8_coef_encodings
+    mov     lr, #0
+    add     r4, r4, r6, lsl #3          ; a = vp8_coef_encodings + t
+    ldr     r9, [r1, #tokenextra_context_tree]   ; pp
+
+    ldrb    r7, [r1, #tokenextra_skip_eob_node]
+
+    ldr     r6, [r4, #vp9_token_value]  ; v
+    ldr     r8, [r4, #vp9_token_len]    ; n
+
+    ; vp8 specific skip_eob_node
+    cmp     r7, #0
+    movne   lr, #2                      ; i = 2
+    subne   r8, r8, #1                  ; --n
+
+    rsb     r4, r8, #32                 ; 32-n
+    ldr     r10, [sp, #88]              ; vp8_coef_tree
+
+    ; v is kept in r12 during the token pack loop
+    lsl     r12, r6, r4                ; r12 = v << 32 - n
+
+; loop start
+token_loop
+    ldrb    r4, [r9, lr, asr #1]        ; pp [i>>1]
+    sub     r7, r5, #1                  ; range-1
+
+    ; Decisions are made based on the bit value shifted
+    ; off of v, so set a flag here based on this.
+    ; This value is referred to as "bb"
+    lsls    r12, r12, #1                ; bb = v >> n
+    mul     r6, r4, r7                  ; ((range-1) * pp[i>>1]))
+
+    ; bb can only be 0 or 1.  So only execute this statement
+    ; if bb == 1, otherwise it will act like i + 0
+    addcs   lr, lr, #1                  ; i + bb
+
+    mov     r7, #1
+    ldrsb   lr, [r10, lr]               ; i = vp8_coef_tree[i+bb]
+    add     r4, r7, r6, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)
+
+    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
+    subcs   r4, r5, r4                  ; if  (bb) range = range-split
+
+    ; Counting the leading zeros is used to normalize range.
+    clz     r6, r4
+    sub     r6, r6, #24                 ; shift
+
+    ; Flag is set on the sum of count.  This flag is used later
+    ; to determine if count >= 0
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi     token_count_lt_zero         ; if(count >= 0)
+
+    sub     r6, r6, r3                  ; offset = shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     token_high_bit_not_set
+
+    ldr     r4, [r0, #vp9_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos-1
+    b       token_zero_while_start
+token_zero_while_loop
+    mov     r10, #0
+    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
+    sub     r4, r4, #1                  ; x--
+token_zero_while_start
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp9_writer_buffer]
+    ldrb    r11, [r7, r4]
+    cmpge   r11, #0xff
+    beq     token_zero_while_loop
+
+    ldr     r7, [r0, #vp9_writer_buffer]
+    ldrb    r10, [r7, r4]               ; w->buffer[x]
+    add     r10, r10, #1
+    strb    r10, [r7, r4]               ; w->buffer[x] + 1
+token_high_bit_not_set
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r10, [r0, #vp9_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp9_writer_pos]   ; w->pos
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r11, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r11, [r0, #vp9_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r10, r4]               ; w->buffer[w->pos++]
+
+    ; r10 holds vp8_coef_tree earlier in the loop, but is used as a
+    ; temp variable here.  So once it is free again, reload
+    ; vp8_coef_tree into r10
+    ldr     r10, [sp, #88]              ; vp8_coef_tree
+
+token_count_lt_zero
+    lsl     r2, r2, r6                  ; lowvalue <<= shift
+
+    subs    r8, r8, #1                  ; --n
+    bne     token_loop
+
+    ldrb    r6, [r1, #tokenextra_token] ; t
+    ldr     r7, [sp, #84]                ; vp8_extra_bits
+    ; Add t * sizeof (vp9_extra_bit_struct) to get the desired
+    ;  element.  sizeof (vp9_extra_bit_struct) is 16
+    add     r12, r7, r6, lsl #4         ; b = vp8_extra_bits + t
+
+    ldr     r4, [r12, #vp9_extra_bit_struct_base_val]
+    cmp     r4, #0
+    beq     skip_extra_bits
+
+;   if( b->base_val)
+    ldr     r8, [r12, #vp9_extra_bit_struct_len] ; L
+    ldrsh   lr, [r1, #tokenextra_extra] ; e = p->Extra
+    cmp     r8, #0                      ; if( L)
+    beq     no_extra_bits
+
+    ldr     r9, [r12, #vp9_extra_bit_struct_prob]
+    asr     r7, lr, #1                  ; v=e>>1
+
+    ldr     r10, [r12, #vp9_extra_bit_struct_tree]
+    str     r10, [sp, #4]               ; b->tree
+
+    rsb     r4, r8, #32
+    lsl     r12, r7, r4
+
+    mov     lr, #0                      ; i = 0
+
+extra_bits_loop
+    ldrb    r4, [r9, lr, asr #1]        ; pp[i>>1]
+    sub     r7, r5, #1                  ; range-1
+    lsls    r12, r12, #1                ; v >> n
+    mul     r6, r4, r7                  ; (range-1) * pp[i>>1]
+    addcs   lr, lr, #1                  ; i + bb
+
+    mov     r7, #1
+    ldrsb   lr, [r10, lr]               ; i = b->tree[i+bb]
+    add     r4, r7, r6, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)
+
+    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
+    subcs   r4, r5, r4                  ; if  (bb) range = range-split
+
+    clz     r6, r4
+    sub     r6, r6, #24
+
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi     extra_count_lt_zero         ; if(count >= 0)
+
+    sub     r6, r6, r3                  ; offset= shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     extra_high_bit_not_set
+
+    ldr     r4, [r0, #vp9_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos - 1
+    b       extra_zero_while_start
+extra_zero_while_loop
+    mov     r10, #0
+    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
+    sub     r4, r4, #1                  ; x--
+extra_zero_while_start
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp9_writer_buffer]
+    ldrb    r11, [r7, r4]
+    cmpge   r11, #0xff
+    beq     extra_zero_while_loop
+
+    ldr     r7, [r0, #vp9_writer_buffer]
+    ldrb    r10, [r7, r4]
+    add     r10, r10, #1
+    strb    r10, [r7, r4]
+extra_high_bit_not_set
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r10, [r0, #vp9_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp9_writer_pos]
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r11, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r11, [r0, #vp9_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r10, r4]               ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
+    ldr     r10, [sp, #4]               ; b->tree
+extra_count_lt_zero
+    lsl     r2, r2, r6
+
+    subs    r8, r8, #1                  ; --n
+    bne     extra_bits_loop             ; while (n)
+
+no_extra_bits
+    ldr     lr, [r1, #4]                ; e = p->Extra
+    add     r4, r5, #1                  ; range + 1
+    tst     lr, #1
+    lsr     r4, r4, #1                  ; split = (range + 1) >> 1
+    addne   r2, r2, r4                  ; lowvalue += split
+    subne   r4, r5, r4                  ; range = range-split
+    tst     r2, #0x80000000             ; lowvalue & 0x80000000
+    lsl     r5, r4, #1                  ; range <<= 1
+    beq     end_high_bit_not_set
+
+    ldr     r4, [r0, #vp9_writer_pos]
+    mov     r7, #0
+    sub     r4, r4, #1
+    b       end_zero_while_start
+end_zero_while_loop
+    strb    r7, [r6, r4]
+    sub     r4, r4, #1                  ; x--
+end_zero_while_start
+    cmp     r4, #0
+    ldrge   r6, [r0, #vp9_writer_buffer]
+    ldrb    r12, [r6, r4]
+    cmpge   r12, #0xff
+    beq     end_zero_while_loop
+
+    ldr     r6, [r0, #vp9_writer_buffer]
+    ldrb    r7, [r6, r4]
+    add     r7, r7, #1
+    strb    r7, [r6, r4]
+end_high_bit_not_set
+    adds    r3, r3, #1                  ; ++count
+    lsl     r2, r2, #1                  ; lowvalue  <<= 1
+    bne     end_count_zero
+
+    ldr     r4, [r0, #vp9_writer_pos]
+    mvn     r3, #7
+    ldr     r7, [r0, #vp9_writer_buffer]
+    lsr     r6, r2, #24                 ; lowvalue >> 24
+    add     r12, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r12, [r0, #0x10]
+    strb    r6, [r7, r4]
+end_count_zero
+skip_extra_bits
+    add     r1, r1, #TOKENEXTRA_SZ      ; ++p
+check_p_lt_stop
+    ldr     r4, [sp, #0]                ; stop
+    cmp     r1, r4                      ; while( p < stop)
+    bcc     while_p_lt_stop
+
+    ldr     r10, [sp, #20]              ; num_part
+    mov     r1, #TOKENLIST_SZ
+    mul     r1, r10, r1
+
+    ldr     r6, [sp, #12]               ; mb_rows
+    ldr     r7, [sp, #16]               ; tokenlist address
+    subs    r6, r6, r10
+    add     r7, r7, r1                  ; next element in the array
+    str     r6, [sp, #12]
+    bgt     mb_row_loop
+
+    mov     r12, #32
+
+stop_encode_loop
+    sub     r7, r5, #1                  ; range-1
+
+    mov     r4, r7, lsl #7              ; ((range-1) * 128)
+
+    mov     r7, #1
+    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * 128) >> 8)
+
+    ; Counting the leading zeros is used to normalize range.
+    clz     r6, r4
+    sub     r6, r6, #24                 ; shift
+
+    ; Flag is set on the sum of count.  This flag is used later
+    ; to determine if count >= 0
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi     token_count_lt_zero_se      ; if(count >= 0)
+
+    sub     r6, r6, r3                  ; offset = shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     token_high_bit_not_set_se
+
+    ldr     r4, [r0, #vp9_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos-1
+    b       token_zero_while_start_se
+token_zero_while_loop_se
+    mov     r10, #0
+    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
+    sub     r4, r4, #1                  ; x--
+token_zero_while_start_se
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp9_writer_buffer]
+    ldrb    r11, [r7, r4]
+    cmpge   r11, #0xff
+    beq     token_zero_while_loop_se
+
+    ldr     r7, [r0, #vp9_writer_buffer]
+    ldrb    r10, [r7, r4]               ; w->buffer[x]
+    add     r10, r10, #1
+    strb    r10, [r7, r4]               ; w->buffer[x] + 1
+token_high_bit_not_set_se
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r10, [r0, #vp9_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp9_writer_pos]   ; w->pos
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r11, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r11, [r0, #vp9_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r10, r4]               ; w->buffer[w->pos++]
+
+token_count_lt_zero_se
+    lsl     r2, r2, r6                  ; lowvalue <<= shift
+
+    subs    r12, r12, #1
+    bne     stop_encode_loop
+
+    ldr     r10, [sp, #8]               ; *size
+    ldr     r11, [r10]
+    ldr     r4,  [r0, #vp9_writer_pos]  ; w->pos
+    add     r11, r11, r4                ; *size += w->pos
+    str     r11, [r10]
+
+    ldr     r9, [sp, #20]               ; num_part
+    sub     r9, r9, #1
+    ldr     r10, [sp, #28]              ; i
+    cmp     r10, r9                     ; if(i<(num_part - 1))
+    bge     skip_write_partition
+
+    ldr     r12, [sp, #40]              ; ptr
+    add     r12, r12, r4                ; ptr += w->pos
+    str     r12, [sp, #40]
+
+    ldr     r9, [sp, #24]               ; cx_data
+    mov     r8, r4, asr #8
+    strb    r4, [r9, #0]
+    strb    r8, [r9, #1]
+    mov     r4, r4, asr #16
+    strb    r4, [r9, #2]
+
+    add     r9, r9, #3                  ; cx_data += 3
+    str     r9, [sp, #24]
+
+skip_write_partition
+
+    ldr     r11, [sp, #28]              ; i
+    ldr     r10, [sp, #20]              ; num_part
+
+    add     r11, r11, #1                ; i++
+    str     r11, [sp, #28]
+
+    ldr     r7, [sp, #32]               ; cpi->tp_list[i]
+    mov     r1, #TOKENLIST_SZ
+    add     r7, r7, r1                  ; next element in cpi->tp_list
+    str     r7, [sp, #32]               ; cpi->tp_list[i+1]
+
+    cmp     r10, r11
+    bgt     numparts_loop
+
+
+    add     sp, sp, #44
+    pop     {r4-r11, pc}
+    ENDP
+
+_VP8_COMP_common_
+    DCD     vp8_comp_common
+_VP8_COMMON_MBrows_
+    DCD     vp8_common_mb_rows
+_VP8_COMP_tplist_
+    DCD     vp8_comp_tplist
+_VP8_COMP_bc2_
+    DCD     vp8_comp_bc2
+
+    END
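The partition writer reserves 3*(num_part-1) bytes at the head of cx_data, and after finishing each non-final partition stores that partition's byte count there as a 3-byte little-endian value (the strb/asr sequence near skip_write_partition). A sketch of that size write:

    /* Store one partition's length as 3 little-endian bytes, matching
     * the strb/asr sequence above. */
    static void write_partition_size_sketch(unsigned char *cx_data,
                                            unsigned int size) {
      cx_data[0] = (unsigned char)(size & 0xff);
      cx_data[1] = (unsigned char)((size >> 8) & 0xff);
      cx_data[2] = (unsigned char)((size >> 16) & 0xff);
    }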
--- /dev/null
+++ b/vp9/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm
@@ -1,0 +1,224 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_fast_quantize_b_armv6|
+
+    INCLUDE asm_enc_offsets.asm
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    BLOCK *b
+; r1    BLOCKD *d
+|vp8_fast_quantize_b_armv6| PROC
+    stmfd   sp!, {r1, r4-r11, lr}
+
+    ldr     r3, [r0, #vp8_block_coeff]      ; coeff
+    ldr     r4, [r0, #vp8_block_quant_fast] ; quant_fast
+    ldr     r5, [r0, #vp8_block_round]      ; round
+    ldr     r6, [r1, #vp8_blockd_qcoeff]    ; qcoeff
+    ldr     r7, [r1, #vp8_blockd_dqcoeff]   ; dqcoeff
+    ldr     r8, [r1, #vp8_blockd_dequant]   ; dequant
+
+    ldr     r2, loop_count          ; loop_count=0x1000000. 'lsls' instruction
+                                    ; is used to update the counter so that
+                                    ; it can be used to mark nonzero
+                                    ; quantized coefficient pairs.
+
+    mov     r1, #0                  ; flags for quantized coeffs
+
+    ; PART 1: quantization and dequantization loop
+loop
+    ldr     r9, [r3], #4            ; [z1 | z0]
+    ldr     r10, [r5], #4           ; [r1 | r0]
+    ldr     r11, [r4], #4           ; [q1 | q0]
+
+    ssat16  lr, #1, r9              ; [sz1 | sz0]
+    eor     r9, r9, lr              ; [z1 ^ sz1 | z0 ^ sz0]
+    ssub16  r9, r9, lr              ; x = (z ^ sz) - sz
+    sadd16  r9, r9, r10             ; [x1+r1 | x0+r0]
+
+    ldr     r12, [r3], #4           ; [z3 | z2]
+
+    smulbb  r0, r9, r11             ; [(x0+r0)*q0]
+    smultt  r9, r9, r11             ; [(x1+r1)*q1]
+
+    ldr     r10, [r5], #4           ; [r3 | r2]
+
+    ssat16  r11, #1, r12            ; [sz3 | sz2]
+    eor     r12, r12, r11           ; [z3 ^ sz3 | z2 ^ sz2]
+    pkhtb   r0, r9, r0, asr #16     ; [y1 | y0]
+    ldr     r9, [r4], #4            ; [q3 | q2]
+    ssub16  r12, r12, r11           ; x = (z ^ sz) - sz
+
+    sadd16  r12, r12, r10           ; [x3+r3 | x2+r2]
+
+    eor     r0, r0, lr              ; [(y1 ^ sz1) | (y0 ^ sz0)]
+
+    smulbb  r10, r12, r9            ; [(x2+r2)*q2]
+    smultt  r12, r12, r9            ; [(x3+r3)*q3]
+
+    ssub16  r0, r0, lr              ; x = (y ^ sz) - sz
+
+    cmp     r0, #0                  ; check if zero
+    orrne   r1, r1, r2, lsr #24     ; add flag for nonzero coeffs
+
+    str     r0, [r6], #4            ; *qcoeff++ = x
+    ldr     r9, [r8], #4            ; [dq1 | dq0]
+
+    pkhtb   r10, r12, r10, asr #16  ; [y3 | y2]
+    eor     r10, r10, r11           ; [(y3 ^ sz3) | (y2 ^ sz2)]
+    ssub16  r10, r10, r11           ; x = (y ^ sz) - sz
+
+    cmp     r10, #0                 ; check if zero
+    orrne   r1, r1, r2, lsr #23     ; add flag for nonzero coeffs
+
+    str     r10, [r6], #4           ; *qcoeff++ = x
+    ldr     r11, [r8], #4           ; [dq3 | dq2]
+
+    smulbb  r12, r0, r9             ; [x0*dq0]
+    smultt  r0, r0, r9              ; [x1*dq1]
+
+    smulbb  r9, r10, r11            ; [x2*dq2]
+    smultt  r10, r10, r11           ; [x3*dq3]
+
+    lsls    r2, r2, #2              ; update loop counter
+    strh    r12, [r7, #0]           ; dqcoeff[0] = [x0*dq0]
+    strh    r0, [r7, #2]            ; dqcoeff[1] = [x1*dq1]
+    strh    r9, [r7, #4]            ; dqcoeff[2] = [x2*dq2]
+    strh    r10, [r7, #6]           ; dqcoeff[3] = [x3*dq3]
+    add     r7, r7, #8              ; dqcoeff += 8
+    bne     loop
+
+    ; PART 2: check position for eob...
+    mov     lr, #0                  ; init eob
+    cmp     r1, #0                  ; coeffs after quantization?
+    ldr     r11, [sp, #0]           ; restore BLOCKD pointer
+    beq     end                     ; skip eob calculations if all zero
+
+    ldr     r0, [r11, #vp8_blockd_qcoeff]
+
+    ; check shortcut for nonzero qcoeffs
+    tst    r1, #0x80
+    bne    quant_coeff_15_14
+    tst    r1, #0x20
+    bne    quant_coeff_13_11
+    tst    r1, #0x8
+    bne    quant_coeff_12_7
+    tst    r1, #0x40
+    bne    quant_coeff_10_9
+    tst    r1, #0x10
+    bne    quant_coeff_8_3
+    tst    r1, #0x2
+    bne    quant_coeff_6_5
+    tst    r1, #0x4
+    bne    quant_coeff_4_2
+    b      quant_coeff_1_0
+
+quant_coeff_15_14
+    ldrh    r2, [r0, #30]       ; rc=15, i=15
+    mov     lr, #16
+    cmp     r2, #0
+    bne     end
+
+    ldrh    r3, [r0, #28]       ; rc=14, i=14
+    mov     lr, #15
+    cmp     r3, #0
+    bne     end
+
+quant_coeff_13_11
+    ldrh    r2, [r0, #22]       ; rc=11, i=13
+    mov     lr, #14
+    cmp     r2, #0
+    bne     end
+
+quant_coeff_12_7
+    ldrh    r3, [r0, #14]       ; rc=7,  i=12
+    mov     lr, #13
+    cmp     r3, #0
+    bne     end
+
+    ldrh    r2, [r0, #20]       ; rc=10, i=11
+    mov     lr, #12
+    cmp     r2, #0
+    bne     end
+
+quant_coeff_10_9
+    ldrh    r3, [r0, #26]       ; rc=13, i=10
+    mov     lr, #11
+    cmp     r3, #0
+    bne     end
+
+    ldrh    r2, [r0, #24]       ; rc=12, i=9
+    mov     lr, #10
+    cmp     r2, #0
+    bne     end
+
+quant_coeff_8_3
+    ldrh    r3, [r0, #18]       ; rc=9,  i=8
+    mov     lr, #9
+    cmp     r3, #0
+    bne     end
+
+    ldrh    r2, [r0, #12]       ; rc=6,  i=7
+    mov     lr, #8
+    cmp     r2, #0
+    bne     end
+
+quant_coeff_6_5
+    ldrh    r3, [r0, #6]        ; rc=3,  i=6
+    mov     lr, #7
+    cmp     r3, #0
+    bne     end
+
+    ldrh    r2, [r0, #4]        ; rc=2,  i=5
+    mov     lr, #6
+    cmp     r2, #0
+    bne     end
+
+quant_coeff_4_2
+    ldrh    r3, [r0, #10]       ; rc=5,  i=4
+    mov     lr, #5
+    cmp     r3, #0
+    bne     end
+
+    ldrh    r2, [r0, #16]       ; rc=8,  i=3
+    mov     lr, #4
+    cmp     r2, #0
+    bne     end
+
+    ldrh    r3, [r0, #8]        ; rc=4,  i=2
+    mov     lr, #3
+    cmp     r3, #0
+    bne     end
+
+quant_coeff_1_0
+    ldrh    r2, [r0, #2]        ; rc=1,  i=1
+    mov     lr, #2
+    cmp     r2, #0
+    bne     end
+
+    mov     lr, #1              ; rc=0,  i=0
+
+end
+    str     lr, [r11, #vp8_blockd_eob]
+    ldmfd   sp!, {r1, r4-r11, pc}
+
+    ENDP
+
+loop_count
+    DCD     0x1000000
+
+    END
+
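
PART 1 above computes, per coefficient, sz = sign(z), x = abs(z), y = ((x + round) * quant_fast) >> 16, re-applies the sign, and writes qcoeff and dqcoeff = x * dequant; the flag bits collected in r1 then let PART 2 locate the end of block without scanning every coefficient. A self-contained C sketch of the same arithmetic (array-based prototype and function name are illustrative; the zig-zag order is the one hard-coded into the shortcut labels above):

    /* Sketch: fast quantize of one 4x4 block; zigzag maps scan
     * position i to raster position rc, eob is last nonzero + 1. */
    static void fast_quantize_b_sketch(const short *coeff,
                                       const short *round,
                                       const short *quant_fast,
                                       const short *dequant,
                                       short *qcoeff, short *dqcoeff,
                                       const int *zigzag, int *eob_out)
    {
        int i, eob = 0;
        for (i = 0; i < 16; i++) {
            int rc = zigzag[i];
            int z  = coeff[rc];
            int sz = z >> 31;                 /* 0 or -1: sign of z */
            int x  = (z ^ sz) - sz;           /* abs(z)             */
            int y  = ((x + round[rc]) * quant_fast[rc]) >> 16;
            x = (y ^ sz) - sz;                /* re-apply the sign  */
            qcoeff[rc]  = (short)x;
            dqcoeff[rc] = (short)(x * dequant[rc]);
            if (y) eob = i + 1;               /* last nonzero + 1   */
        }
        *eob_out = eob;
    }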
--- /dev/null
+++ b/vp9/encoder/arm/armv6/vp8_mse16x16_armv6.asm
@@ -1,0 +1,138 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_mse16x16_armv6|
+
+    ARM
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+;
+; note: Based on vp9_variance16x16_armv6. In this function the sum is never
+;       used, so that part of the calculation has been removed.
+
+|vp8_mse16x16_armv6| PROC
+
+    push    {r4-r9, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     r4, #0              ; initialize sse = 0
+
+loop
+    ; 1st 4 pixels
+    ldr     r5, [r0, #0x0]      ; load 4 src pixels
+    ldr     r6, [r2, #0x0]      ; load 4 ref pixels
+
+    mov     lr, #0              ; constant zero
+
+    usub8   r8, r5, r6          ; calculate difference
+    pld     [r0, r1, lsl #1]
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r5, r7, lr          ; calculate sum of positive differences
+    usad8   r6, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r7          ; differences of all 4 pixels
+
+    ldr     r5, [r0, #0x4]      ; load 4 src pixels
+
+    ; calculate sse
+    uxtb16  r6, r8              ; byte (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r6, [r2, #0x4]      ; load 4 ref pixels
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r5, r6          ; calculate difference
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r5, r7, lr          ; calculate sum of positive differences
+    usad8   r6, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r7          ; differences of all 4 pixels
+    ldr     r5, [r0, #0x8]      ; load 4 src pixels
+    ; calculate sse
+    uxtb16  r6, r8              ; byte (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r6, [r2, #0x8]      ; load 4 ref pixels
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r5, r6          ; calculate difference
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r5, r7, lr          ; calculate sum of positive differences
+    usad8   r6, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r7          ; differences of all 4 pixels
+
+    ldr     r5, [r0, #0xc]      ; load 4 src pixels
+
+    ; calculate sse
+    uxtb16  r6, r8              ; byte (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r6, [r2, #0xc]      ; load 4 ref pixels
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r5, r6          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set ref_ptr to next row
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r5, r7, lr          ; calculate sum of positive differences
+    usad8   r6, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r7          ; differences of all 4 pixels
+
+    subs    r12, r12, #1        ; next row
+
+    ; calculate sse
+    uxtb16  r6, r8              ; byte (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    bne     loop
+
+    ; return stuff
+    ldr     r1, [sp, #28]       ; get address of sse
+    mov     r0, r4              ; return sse
+    str     r4, [r1]            ; store sse
+
+    pop     {r4-r9, pc}
+
+    ENDP
+
+    END
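
As the note above says, this is the variance kernel with the unused sum term stripped out. A scalar sketch of what it computes (function name and prototype are illustrative, mirroring the register comments):

    /* Sketch: sum of squared differences over a 16x16 block. */
    static unsigned int mse16x16_sketch(const unsigned char *src_ptr,
                                        int source_stride,
                                        const unsigned char *ref_ptr,
                                        int recon_stride,
                                        unsigned int *sse)
    {
        unsigned int acc = 0;
        int r, c;
        for (r = 0; r < 16; r++) {
            for (c = 0; c < 16; c++) {
                int d = src_ptr[c] - ref_ptr[c];
                acc += d * d;
            }
            src_ptr += source_stride;
            ref_ptr += recon_stride;
        }
        *sse = acc;   /* stored through the stack argument */
        return acc;   /* also returned in r0               */
    }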
--- /dev/null
+++ b/vp9/encoder/arm/armv6/vp8_sad16x16_armv6.asm
@@ -1,0 +1,96 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_sad16x16_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    const unsigned char *src_ptr
+; r1    int  src_stride
+; r2    const unsigned char *ref_ptr
+; r3    int  ref_stride
+; stack max_sad (not used)
+|vp8_sad16x16_armv6| PROC
+    stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+    pld     [r0, r1, lsl #1]
+    pld     [r2, r3, lsl #1]
+
+    mov     r4, #0              ; sad = 0;
+    mov     r5, #8              ; loop count
+
+loop
+    ; 1st row
+    ldr     r6, [r0, #0x0]      ; load 4 src pixels (1A)
+    ldr     r8, [r2, #0x0]      ; load 4 ref pixels (1A)
+    ldr     r7, [r0, #0x4]      ; load 4 src pixels (1A)
+    ldr     r9, [r2, #0x4]      ; load 4 ref pixels (1A)
+    ldr     r10, [r0, #0x8]     ; load 4 src pixels (1B)
+    ldr     r11, [r0, #0xC]     ; load 4 src pixels (1B)
+
+    usada8  r4, r8, r6, r4      ; calculate sad for 4 pixels
+    usad8   r8, r7, r9          ; calculate sad for 4 pixels
+
+    ldr     r12, [r2, #0x8]     ; load 4 ref pixels (1B)
+    ldr     lr, [r2, #0xC]      ; load 4 ref pixels (1B)
+
+    add     r0, r0, r1          ; set src pointer to next row
+    add     r2, r2, r3          ; set ref pointer to next row
+
+    pld     [r0, r1, lsl #1]
+    pld     [r2, r3, lsl #1]
+
+    usada8  r4, r10, r12, r4    ; calculate sad for 4 pixels
+    usada8  r8, r11, lr, r8     ; calculate sad for 4 pixels
+
+    ldr     r6, [r0, #0x0]      ; load 4 src pixels (2A)
+    ldr     r7, [r0, #0x4]      ; load 4 src pixels (2A)
+    add     r4, r4, r8          ; add partial sad values
+
+    ; 2nd row
+    ldr     r8, [r2, #0x0]      ; load 4 ref pixels (2A)
+    ldr     r9, [r2, #0x4]      ; load 4 ref pixels (2A)
+    ldr     r10, [r0, #0x8]     ; load 4 src pixels (2B)
+    ldr     r11, [r0, #0xC]     ; load 4 src pixels (2B)
+
+    usada8  r4, r6, r8, r4      ; calculate sad for 4 pixels
+    usad8   r8, r7, r9          ; calculate sad for 4 pixels
+
+    ldr     r12, [r2, #0x8]     ; load 4 ref pixels (2B)
+    ldr     lr, [r2, #0xC]      ; load 4 ref pixels (2B)
+
+    add     r0, r0, r1          ; set src pointer to next row
+    add     r2, r2, r3          ; set ref pointer to next row
+
+    usada8  r4, r10, r12, r4    ; calculate sad for 4 pixels
+    usada8  r8, r11, lr, r8     ; calculate sad for 4 pixels
+
+    pld     [r0, r1, lsl #1]
+    pld     [r2, r3, lsl #1]
+
+    subs    r5, r5, #1          ; decrement loop counter
+    add     r4, r4, r8          ; add partial sad values
+
+    bne     loop
+
+    mov     r0, r4              ; return sad
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+    END
+
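
Each usada8 above folds four byte-wise absolute differences into an accumulator in one instruction, so the loop body covers two 16-pixel rows per iteration. The scalar equivalent, as a sketch (name and prototype illustrative):

    /* Sketch: 16x16 sum of absolute differences. */
    static unsigned int sad16x16_sketch(const unsigned char *src_ptr,
                                        int src_stride,
                                        const unsigned char *ref_ptr,
                                        int ref_stride)
    {
        unsigned int sad = 0;
        int r, c;
        for (r = 0; r < 16; r++) {
            for (c = 0; c < 16; c++) {
                int d = src_ptr[c] - ref_ptr[c];
                sad += (d < 0) ? -d : d;
            }
            src_ptr += src_stride;
            ref_ptr += ref_stride;
        }
        return sad;
    }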
--- /dev/null
+++ b/vp9/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm
@@ -1,0 +1,262 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT |vp8_short_fdct4x4_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA    |.text|, CODE, READONLY
+; void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
+|vp8_short_fdct4x4_armv6| PROC
+
+    stmfd       sp!, {r4 - r12, lr}
+
+    ; PART 1
+
+    ; coeffs 0-3
+    ldrd        r4, r5, [r0]        ; [i1 | i0] [i3 | i2]
+
+    ldr         r10, c7500
+    ldr         r11, c14500
+    ldr         r12, c0x22a453a0    ; [2217*4 | 5352*4]
+    ldr         lr, c0x00080008
+    ror         r5, r5, #16         ; [i2 | i3]
+
+    qadd16      r6, r4, r5          ; [i1+i2 | i0+i3] = [b1 | a1] without shift
+    qsub16      r7, r4, r5          ; [i1-i2 | i0-i3] = [c1 | d1] without shift
+
+    add         r0, r0, r2          ; update input pointer
+
+    qadd16      r7, r7, r7          ; 2*[c1|d1] --> we can use smlad and smlsd
+                                    ; with 2217*4 and 5352*4 without losing the
+                                    ; sign bit (overflow)
+
+    smuad       r4, r6, lr          ; o0 = (i1+i2)*8 + (i0+i3)*8
+    smusd       r5, r6, lr          ; o2 = (i0+i3)*8 - (i1+i2)*8
+
+    smlad       r6, r7, r12, r11    ; o1 = (c1 * 2217 + d1 * 5352 +  14500)
+    smlsdx      r7, r7, r12, r10    ; o3 = (d1 * 2217 - c1 * 5352 +   7500)
+
+    ldrd        r8, r9, [r0]        ; [i5 | i4] [i7 | i6]
+
+    pkhbt       r3, r4, r6, lsl #4  ; [o1 | o0], keep in register for PART 2
+    pkhbt       r6, r5, r7, lsl #4  ; [o3 | o2]
+
+    str         r6, [r1, #4]
+
+    ; coeffs 4-7
+    ror         r9, r9, #16         ; [i6 | i7]
+
+    qadd16      r6, r8, r9          ; [i5+i6 | i4+i7] = [b1 | a1] without shift
+    qsub16      r7, r8, r9          ; [i5-i6 | i4-i7] = [c1 | d1] without shift
+
+    add         r0, r0, r2          ; update input pointer
+
+    qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd
+                                    ; with 2217*4 and 5352*4 without losing the
+                                    ; sign bit (overflow)
+
+    smuad       r9, r6, lr          ; o4 = (i5+i6)*8 + (i4+i7)*8
+    smusd       r8, r6, lr          ; o6 = (i4+i7)*8 - (i5+i6)*8
+
+    smlad       r6, r7, r12, r11    ; o5 = (c1 * 2217 + d1 * 5352 +  14500)
+    smlsdx      r7, r7, r12, r10    ; o7 = (d1 * 2217 - c1 * 5352 +   7500)
+
+    ldrd        r4, r5, [r0]        ; [i9 | i8] [i11 | i10]
+
+    pkhbt       r9, r9, r6, lsl #4  ; [o5 | o4], keep in register for PART 2
+    pkhbt       r6, r8, r7, lsl #4  ; [o7 | o6]
+
+    str         r6, [r1, #12]
+
+    ; coeffs 8-11
+    ror         r5, r5, #16         ; [i10 | i11]
+
+    qadd16      r6, r4, r5          ; [i9+i10 | i8+i11]=[b1 | a1] without shift
+    qsub16      r7, r4, r5          ; [i9-i10 | i8-i11]=[c1 | d1] without shift
+
+    add         r0, r0, r2          ; update input pointer
+
+    qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd
+                                    ; with 2217*4 and 5352*4 without losing the
+                                    ; sign bit (overflow)
+
+    smuad       r2, r6, lr          ; o8 = (i9+i10)*8 + (i8+i11)*8
+    smusd       r8, r6, lr          ; o10 = (i8+i11)*8 - (i9+i10)*8
+
+    smlad       r6, r7, r12, r11    ; o9 = (c1 * 2217 + d1 * 5352 +  14500)
+    smlsdx      r7, r7, r12, r10    ; o11 = (d1 * 2217 - c1 * 5352 +   7500)
+
+    ldrd        r4, r5, [r0]        ; [i13 | i12] [i15 | i14]
+
+    pkhbt       r2, r2, r6, lsl #4  ; [o9 | o8], keep in register for PART 2
+    pkhbt       r6, r8, r7, lsl #4  ; [o11 | o10]
+
+    str         r6, [r1, #20]
+
+    ; coeffs 12-15
+    ror         r5, r5, #16         ; [i14 | i15]
+
+    qadd16      r6, r4, r5          ; [i13+i14 | i12+i15]=[b1|a1] without shift
+    qsub16      r7, r4, r5          ; [i13-i14 | i12-i15]=[c1|d1] without shift
+
+    qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd
+                                    ; with 2217*4 and 5352*4 without losing the
+                                    ; sign bit (overflow)
+
+    smuad       r4, r6, lr          ; o12 = (i13+i14)*8 + (i12+i15)*8
+    smusd       r5, r6, lr          ; o14 = (i12+i15)*8 - (i13+i14)*8
+
+    smlad       r6, r7, r12, r11    ; o13 = (c1 * 2217 + d1 * 5352 +  14500)
+    smlsdx      r7, r7, r12, r10    ; o15 = (d1 * 2217 - c1 * 5352 +   7500)
+
+    pkhbt       r0, r4, r6, lsl #4  ; [o13 | o12], keep in register for PART 2
+    pkhbt       r6, r5, r7, lsl #4  ; [o15 | o14]
+
+    str         r6, [r1, #28]
+
+
+    ; PART 2 -------------------------------------------------
+    ldr         r11, c12000
+    ldr         r10, c51000
+    ldr         lr, c0x00070007
+
+    qadd16      r4, r3, r0          ; a1 = [i1+i13 | i0+i12]
+    qadd16      r5, r9, r2          ; b1 = [i5+i9  |  i4+i8]
+    qsub16      r6, r9, r2          ; c1 = [i5-i9  |  i4-i8]
+    qsub16      r7, r3, r0          ; d1 = [i1-i13 | i0-i12]
+
+    qadd16      r4, r4, lr          ; a1 + 7
+
+    add         r0, r11, #0x10000   ; 12000 + (1 << 16): +1 after >>16 (d!=0)
+
+    qadd16      r2, r4, r5          ; a1 + b1 + 7
+    qsub16      r3, r4, r5          ; a1 - b1 + 7
+
+    ldr         r12, c0x08a914e8    ; [2217 | 5352]
+
+    lsl         r8, r2, #16         ; prepare bottom halfword for scaling
+    asr         r2, r2, #4          ; scale top halfword
+    lsl         r9, r3, #16         ; prepare bottom halfword for scaling
+    asr         r3, r3, #4          ; scale top halfword
+    pkhtb       r4, r2, r8, asr #20 ; pack and scale bottom halfword
+    pkhtb       r5, r3, r9, asr #20 ; pack and scale bottom halfword
+
+    smulbt      r2, r6, r12         ; [ ------ | c1*2217]
+    str         r4, [r1, #0]        ; [     o1 |      o0]
+    smultt      r3, r6, r12         ; [c1*2217 | ------ ]
+    str         r5, [r1, #16]       ; [     o9 |      o8]
+
+    smlabb      r8, r7, r12, r2     ; [ ------ | d1*5352]
+    smlatb      r9, r7, r12, r3     ; [d1*5352 | ------ ]
+
+    smulbb      r2, r6, r12         ; [ ------ | c1*5352]
+    smultb      r3, r6, r12         ; [c1*5352 | ------ ]
+
+    lsls        r6, r7, #16         ; d1 != 0 ?
+    addeq       r8, r8, r11         ; c1_b*2217+d1_b*5352+12000 + (d==0)
+    addne       r8, r8, r0          ; c1_b*2217+d1_b*5352+12000 + (d!=0)
+    asrs        r6, r7, #16
+    addeq       r9, r9, r11         ; c1_t*2217+d1_t*5352+12000 + (d==0)
+    addne       r9, r9, r0          ; c1_t*2217+d1_t*5352+12000 + (d!=0)
+
+    smlabt      r4, r7, r12, r10    ; [ ------ | d1*2217] + 51000
+    smlatt      r5, r7, r12, r10    ; [d1*2217 | ------ ] + 51000
+
+    pkhtb       r9, r9, r8, asr #16
+
+    sub         r4, r4, r2
+    sub         r5, r5, r3
+
+    ldr         r3, [r1, #4]        ; [i3 | i2]
+
+    pkhtb       r5, r5, r4, asr #16 ; [o13|o12]
+
+    str         r9, [r1, #8]        ; [o5 | o4]
+
+    ldr         r9, [r1, #12]       ; [i7 | i6]
+    ldr         r8, [r1, #28]       ; [i15|i14]
+    ldr         r2, [r1, #20]       ; [i11|i10]
+    str         r5, [r1, #24]       ; [o13|o12]
+
+    qadd16      r4, r3, r8          ; a1 = [i3+i15 | i2+i14]
+    qadd16      r5, r9, r2          ; b1 = [i7+i11 | i6+i10]
+
+    qadd16      r4, r4, lr          ; a1 + 7
+
+    qsub16      r6, r9, r2          ; c1 = [i7-i11 | i6-i10]
+    qadd16      r2, r4, r5          ; a1 + b1 + 7
+    qsub16      r7, r3, r8          ; d1 = [i3-i15 | i2-i14]
+    qsub16      r3, r4, r5          ; a1 - b1 + 7
+
+    lsl         r8, r2, #16         ; prepare bottom halfword for scaling
+    asr         r2, r2, #4          ; scale top halfword
+    lsl         r9, r3, #16         ; prepare bottom halfword for scaling
+    asr         r3, r3, #4          ; scale top halfword
+    pkhtb       r4, r2, r8, asr #20 ; pack and scale bottom halfword
+    pkhtb       r5, r3, r9, asr #20 ; pack and scale bottom halfword
+
+    smulbt      r2, r6, r12         ; [ ------ | c1*2217]
+    str         r4, [r1, #4]        ; [     o3 |      o2]
+    smultt      r3, r6, r12         ; [c1*2217 | ------ ]
+    str         r5, [r1, #20]       ; [    o11 |     o10]
+
+    smlabb      r8, r7, r12, r2     ; [ ------ | d1*5352]
+    smlatb      r9, r7, r12, r3     ; [d1*5352 | ------ ]
+
+    smulbb      r2, r6, r12         ; [ ------ | c1*5352]
+    smultb      r3, r6, r12         ; [c1*5352 | ------ ]
+
+    lsls        r6, r7, #16         ; d1 != 0 ?
+    addeq       r8, r8, r11         ; c1_b*2217+d1_b*5352+12000 + (d==0)
+    addne       r8, r8, r0          ; c1_b*2217+d1_b*5352+12000 + (d!=0)
+
+    asrs        r6, r7, #16
+    addeq       r9, r9, r11         ; c1_t*2217+d1_t*5352+12000 + (d==0)
+    addne       r9, r9, r0          ; c1_t*2217+d1_t*5352+12000 + (d!=0)
+
+    smlabt      r4, r7, r12, r10    ; [ ------ | d1*2217] + 51000
+    smlatt      r5, r7, r12, r10    ; [d1*2217 | ------ ] + 51000
+
+    pkhtb       r9, r9, r8, asr #16
+
+    sub         r4, r4, r2
+    sub         r5, r5, r3
+
+    str         r9, [r1, #12]       ; [o7 | o6]
+    pkhtb       r5, r5, r4, asr #16 ; [o15|o14]
+
+    str         r5, [r1, #28]       ; [o15|o14]
+
+    ldmfd       sp!, {r4 - r12, pc}
+
+    ENDP
+
+; Used constants
+c7500
+    DCD     7500
+c14500
+    DCD     14500
+c0x22a453a0
+    DCD     0x22a453a0
+c0x00080008
+    DCD     0x00080008
+c12000
+    DCD     12000
+c51000
+    DCD     51000
+c0x00070007
+    DCD     0x00070007
+c0x08a914e8
+    DCD     0x08a914e8
+
+    END
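
The pooled constants above line up with the scalar transform named in the header comment: 0x22a453a0 and 0x08a914e8 pack the DCT multipliers 2217 and 5352 (quadrupled in pass 1), 14500/7500 are the pass-1 rounders with a >>12 scale, and 12000/51000 the pass-2 rounders with >>16 plus the (d1 != 0) correction. A C rendering of the same two-pass 4x4 forward DCT, reconstructed from those constants (treat it as a sketch inferred from the assembly, not an authoritative copy of the reference code):

    /* Sketch of the two-pass 4x4 forward DCT; pitch is in bytes. */
    static void short_fdct4x4_sketch(short *input, short *output, int pitch)
    {
        int i, a1, b1, c1, d1;
        short *ip = input, *op = output;
        for (i = 0; i < 4; i++) {              /* pass 1: rows    */
            a1 = (ip[0] + ip[3]) * 8;
            b1 = (ip[1] + ip[2]) * 8;
            c1 = (ip[1] - ip[2]) * 8;
            d1 = (ip[0] - ip[3]) * 8;
            op[0] = (short)(a1 + b1);
            op[2] = (short)(a1 - b1);
            op[1] = (short)((c1 * 2217 + d1 * 5352 + 14500) >> 12);
            op[3] = (short)((d1 * 2217 - c1 * 5352 + 7500) >> 12);
            ip += pitch / 2;
            op += 4;
        }
        ip = output; op = output;
        for (i = 0; i < 4; i++) {              /* pass 2: columns */
            a1 = ip[0] + ip[12];
            b1 = ip[4] + ip[8];
            c1 = ip[4] - ip[8];
            d1 = ip[0] - ip[12];
            op[0]  = (short)((a1 + b1 + 7) >> 4);
            op[8]  = (short)((a1 - b1 + 7) >> 4);
            op[4]  = (short)(((c1 * 2217 + d1 * 5352 + 12000) >> 16)
                             + (d1 != 0));
            op[12] = (short)((d1 * 2217 - c1 * 5352 + 51000) >> 16);
            ip++; op++;
        }
    }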
--- /dev/null
+++ b/vp9/encoder/arm/armv6/vp8_subtract_armv6.asm
@@ -1,0 +1,265 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_subtract_mby_armv6|
+    EXPORT  |vp8_subtract_mbuv_armv6|
+    EXPORT  |vp8_subtract_b_armv6|
+
+    INCLUDE asm_enc_offsets.asm
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    BLOCK *be
+; r1    BLOCKD *bd
+; r2    int pitch
+|vp8_subtract_b_armv6| PROC
+
+    stmfd   sp!, {r4-r9}
+
+    ldr     r4, [r0, #vp8_block_base_src]
+    ldr     r5, [r0, #vp8_block_src]
+    ldr     r6, [r0, #vp8_block_src_diff]
+
+    ldr     r3, [r4]
+    ldr     r7, [r0, #vp8_block_src_stride]
+    add     r3, r3, r5          ; src = *base_src + src
+    ldr     r8, [r1, #vp8_blockd_predictor]
+
+    mov     r9, #4              ; loop count
+
+loop_block
+
+    ldr     r0, [r3], r7        ; src
+    ldr     r1, [r8], r2        ; pred
+
+    uxtb16  r4, r0              ; [s2 | s0]
+    uxtb16  r5, r1              ; [p2 | p0]
+    uxtb16  r0, r0, ror #8      ; [s3 | s1]
+    uxtb16  r1, r1, ror #8      ; [p3 | p1]
+
+    usub16  r4, r4, r5          ; [d2 | d0]
+    usub16  r5, r0, r1          ; [d3 | d1]
+
+    subs    r9, r9, #1          ; decrement loop counter
+
+    pkhbt   r0, r4, r5, lsl #16 ; [d1 | d0]
+    pkhtb   r1, r5, r4, asr #16 ; [d3 | d2]
+
+    str     r0, [r6, #0]        ; diff
+    str     r1, [r6, #4]        ; diff
+
+    add     r6, r6, r2, lsl #1  ; update diff pointer
+    bne     loop_block
+
+    ldmfd   sp!, {r4-r9}
+    mov     pc, lr
+
+    ENDP
+
+
+; r0    short *diff
+; r1    unsigned char *usrc
+; r2    unsigned char *vsrc
+; r3    unsigned char *pred
+; stack int stride
+|vp8_subtract_mbuv_armv6| PROC
+
+    stmfd   sp!, {r4-r12, lr}
+
+    add     r0, r0, #512        ; set *diff to point to Cb
+    add     r3, r3, #256        ; set *pred to point to Cb
+
+    mov     r4, #8              ; loop count
+    ldr     r5, [sp, #40]       ; stride
+
+    ; Subtract U block
+loop_u
+    ldr     r6, [r1]            ; src       (A)
+    ldr     r7, [r3], #4        ; pred      (A)
+
+    uxtb16  r8, r6              ; [s2 | s0] (A)
+    uxtb16  r9, r7              ; [p2 | p0] (A)
+    uxtb16  r10, r6, ror #8     ; [s3 | s1] (A)
+    uxtb16  r11, r7, ror #8     ; [p3 | p1] (A)
+
+    usub16  r6, r8, r9          ; [d2 | d0] (A)
+    usub16  r7, r10, r11        ; [d3 | d1] (A)
+
+    ldr     r10, [r1, #4]       ; src       (B)
+    ldr     r11, [r3], #4       ; pred      (B)
+
+    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (A)
+    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (A)
+
+    str     r8, [r0], #4        ; diff      (A)
+    uxtb16  r8, r10             ; [s2 | s0] (B)
+    str     r9, [r0], #4        ; diff      (A)
+
+    uxtb16  r9, r11             ; [p2 | p0] (B)
+    uxtb16  r10, r10, ror #8    ; [s3 | s1] (B)
+    uxtb16  r11, r11, ror #8    ; [p3 | p1] (B)
+
+    usub16  r6, r8, r9          ; [d2 | d0] (B)
+    usub16  r7, r10, r11        ; [d3 | d1] (B)
+
+    add     r1, r1, r5          ; update usrc pointer
+
+    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (B)
+    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (B)
+
+    str     r8, [r0], #4        ; diff      (B)
+    subs    r4, r4, #1          ; update loop counter
+    str     r9, [r0], #4        ; diff      (B)
+
+    bne     loop_u
+
+    mov     r4, #8              ; loop count
+
+    ; Subtract V block
+loop_v
+    ldr     r6, [r2]            ; src       (A)
+    ldr     r7, [r3], #4        ; pred      (A)
+
+    uxtb16  r8, r6              ; [s2 | s0] (A)
+    uxtb16  r9, r7              ; [p2 | p0] (A)
+    uxtb16  r10, r6, ror #8     ; [s3 | s1] (A)
+    uxtb16  r11, r7, ror #8     ; [p3 | p1] (A)
+
+    usub16  r6, r8, r9          ; [d2 | d0] (A)
+    usub16  r7, r10, r11        ; [d3 | d1] (A)
+
+    ldr     r10, [r2, #4]       ; src       (B)
+    ldr     r11, [r3], #4       ; pred      (B)
+
+    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (A)
+    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (A)
+
+    str     r8, [r0], #4        ; diff      (A)
+    uxtb16  r8, r10             ; [s2 | s0] (B)
+    str     r9, [r0], #4        ; diff      (A)
+
+    uxtb16  r9, r11             ; [p2 | p0] (B)
+    uxtb16  r10, r10, ror #8    ; [s3 | s1] (B)
+    uxtb16  r11, r11, ror #8    ; [p3 | p1] (B)
+
+    usub16  r6, r8, r9          ; [d2 | d0] (B)
+    usub16  r7, r10, r11        ; [d3 | d1] (B)
+
+    add     r2, r2, r5          ; update vsrc pointer
+
+    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (B)
+    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (B)
+
+    str     r8, [r0], #4        ; diff      (B)
+    subs    r4, r4, #1          ; update loop counter
+    str     r9, [r0], #4        ; diff      (B)
+
+    bne     loop_v
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+
+; r0    short *diff
+; r1    unsigned char *src
+; r2    unsigned char *pred
+; r3    int stride
+|vp8_subtract_mby_armv6| PROC
+
+    stmfd   sp!, {r4-r11}
+
+    mov     r4, #16
+loop
+    ldr     r6, [r1]            ; src       (A)
+    ldr     r7, [r2], #4        ; pred      (A)
+
+    uxtb16  r8, r6              ; [s2 | s0] (A)
+    uxtb16  r9, r7              ; [p2 | p0] (A)
+    uxtb16  r10, r6, ror #8     ; [s3 | s1] (A)
+    uxtb16  r11, r7, ror #8     ; [p3 | p1] (A)
+
+    usub16  r6, r8, r9          ; [d2 | d0] (A)
+    usub16  r7, r10, r11        ; [d3 | d1] (A)
+
+    ldr     r10, [r1, #4]       ; src       (B)
+    ldr     r11, [r2], #4       ; pred      (B)
+
+    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (A)
+    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (A)
+
+    str     r8, [r0], #4        ; diff      (A)
+    uxtb16  r8, r10             ; [s2 | s0] (B)
+    str     r9, [r0], #4        ; diff      (A)
+
+    uxtb16  r9, r11             ; [p2 | p0] (B)
+    uxtb16  r10, r10, ror #8    ; [s3 | s1] (B)
+    uxtb16  r11, r11, ror #8    ; [p3 | p1] (B)
+
+    usub16  r6, r8, r9          ; [d2 | d0] (B)
+    usub16  r7, r10, r11        ; [d3 | d1] (B)
+
+    ldr     r10, [r1, #8]       ; src       (C)
+    ldr     r11, [r2], #4       ; pred      (C)
+
+    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (B)
+    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (B)
+
+    str     r8, [r0], #4        ; diff      (B)
+    uxtb16  r8, r10             ; [s2 | s0] (C)
+    str     r9, [r0], #4        ; diff      (B)
+
+    uxtb16  r9, r11             ; [p2 | p0] (C)
+    uxtb16  r10, r10, ror #8    ; [s3 | s1] (C)
+    uxtb16  r11, r11, ror #8    ; [p3 | p1] (C)
+
+    usub16  r6, r8, r9          ; [d2 | d0] (C)
+    usub16  r7, r10, r11        ; [d3 | d1] (C)
+
+    ldr     r10, [r1, #12]      ; src       (D)
+    ldr     r11, [r2], #4       ; pred      (D)
+
+    pkhbt   r8, r6, r7, lsl #16  ; [d1 | d0] (C)
+    pkhtb   r9, r7, r6, asr #16  ; [d3 | d2] (C)
+
+    str     r8, [r0], #4        ; diff      (C)
+    uxtb16  r8, r10             ; [s2 | s0] (D)
+    str     r9, [r0], #4        ; diff      (C)
+
+    uxtb16  r9, r11             ; [p2 | p0] (D)
+    uxtb16  r10, r10, ror #8    ; [s3 | s1] (D)
+    uxtb16  r11, r11, ror #8    ; [p3 | p1] (D)
+
+    usub16  r6, r8, r9          ; [d2 | d0] (D)
+    usub16  r7, r10, r11        ; [d3 | d1] (D)
+
+    add     r1, r1, r3          ; update src pointer
+
+    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (D)
+    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (D)
+
+    str     r8, [r0], #4        ; diff      (D)
+    subs    r4, r4, #1          ; update loop counter
+    str     r9, [r0], #4        ; diff      (D)
+
+    bne     loop
+
+    ldmfd   sp!, {r4-r11}
+    mov     pc, lr
+
+    ENDP
+
+    END
+
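
All three entry points above compute the same per-pixel residual and differ only in block geometry and pointer bookkeeping. The luma routine in scalar form, as a sketch (name and prototype illustrative, mirroring the register comments; pred is a contiguous 16-wide block):

    /* Sketch: 16x16 luma residual, diff[i] = src[i] - pred[i]. */
    static void subtract_mby_sketch(short *diff, const unsigned char *src,
                                    const unsigned char *pred, int stride)
    {
        int r, c;
        for (r = 0; r < 16; r++) {
            for (c = 0; c < 16; c++)
                diff[c] = (short)(src[c] - pred[c]);
            diff += 16;
            pred += 16;
            src  += stride;
        }
    }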
--- /dev/null
+++ b/vp9/encoder/arm/armv6/vp8_variance16x16_armv6.asm
@@ -1,0 +1,154 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_variance16x16_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp9_variance16x16_armv6| PROC
+
+    stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
+    mov     r8, #0              ; initialize sum = 0
+    mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+
+loop
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0]        ; load 4 src pixels
+    ldr     r5, [r2, #0]        ; load 4 ref pixels
+
+    mov     lr, #0              ; constant zero
+
+    usub8   r6, r4, r5          ; calculate difference
+    pld     [r0, r1, lsl #1]
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #4]        ; load 4 src pixels
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #8]        ; load 4 src pixels
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #12]       ; load 4 src pixels
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set ref_ptr to next row
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+
+    subs    r12, r12, #1
+
+    bne     loop
+
+    ; return stuff
+    ldr     r6, [sp, #40]       ; get address of sse
+    mul     r0, r8, r8          ; sum * sum
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+    END
+
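
The return expression sse - ((sum * sum) >> 8) is the usual integer variance: the shift divides by the pixel count, 256 here, and the 8x8 variant in the next file uses >> 6 for its 64 pixels. A scalar sketch covering both (names and the shift parameterization are illustrative):

    /* Sketch: variance = sse - sum^2 / (w*h), shift = log2(w*h),
     * e.g. 16x16 -> shift 8, 8x8 -> shift 6. */
    static unsigned int variance_sketch(const unsigned char *src_ptr,
                                        int src_stride,
                                        const unsigned char *ref_ptr,
                                        int ref_stride,
                                        int w, int h, int shift,
                                        unsigned int *sse)
    {
        int sum = 0, r, c;
        unsigned int sq = 0;
        for (r = 0; r < h; r++) {
            for (c = 0; c < w; c++) {
                int d = src_ptr[c] - ref_ptr[c];
                sum += d;
                sq  += d * d;
            }
            src_ptr += src_stride;
            ref_ptr += ref_stride;
        }
        *sse = sq;
        return sq - (((unsigned int)(sum * sum)) >> shift);
    }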
--- /dev/null
+++ b/vp9/encoder/arm/armv6/vp8_variance8x8_armv6.asm
@@ -1,0 +1,101 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_variance8x8_armv6|
+
+    ARM
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp9_variance8x8_armv6| PROC
+
+    push    {r4-r10, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
+    mov     r12, #8             ; set loop counter to 8 (=block height)
+    mov     r4, #0              ; initialize sum = 0
+    mov     r5, #0              ; initialize sse = 0
+
+loop
+    ; 1st 4 pixels
+    ldr     r6, [r0, #0x0]      ; load 4 src pixels
+    ldr     r7, [r2, #0x0]      ; load 4 ref pixels
+
+    mov     lr, #0              ; constant zero
+
+    usub8   r8, r6, r7          ; calculate difference
+    pld     [r0, r1, lsl #1]
+    sel     r10, r8, lr         ; select bytes with positive difference
+    usub8   r9, r7, r6          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r6, r10, lr         ; calculate sum of positive differences
+    usad8   r7, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r10         ; differences of all 4 pixels
+    ; calculate total sum
+    add    r4, r4, r6           ; add positive differences to sum
+    sub    r4, r4, r7           ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r7, r8              ; byte (two pixels) to halfwords
+    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
+    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r6, [r0, #0x4]      ; load 4 src pixels
+    ldr     r7, [r2, #0x4]      ; load 4 ref pixels
+    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r6, r7          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r10, r8, lr         ; select bytes with positive difference
+    usub8   r9, r7, r6          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set ref_ptr to next row
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r6, r10, lr         ; calculate sum of positive differences
+    usad8   r7, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r10         ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r4, r4, r6          ; add positive differences to sum
+    sub     r4, r4, r7          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r7, r8              ; byte (two pixels) to halfwords
+    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
+    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)
+    subs    r12, r12, #1        ; next row
+    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)
+
+    bne     loop
+
+    ; return stuff
+    ldr     r8, [sp, #32]       ; get address of sse
+    mul     r1, r4, r4          ; sum * sum
+    str     r5, [r8]            ; store sse
+    sub     r0, r5, r1, asr #6  ; return (sse - ((sum * sum) >> 6))
+
+    pop     {r4-r10, pc}
+
+    ENDP
+
+    END
--- /dev/null
+++ b/vp9/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
@@ -1,0 +1,182 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_variance_halfpixvar16x16_h_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp9_variance_halfpixvar16x16_h_armv6| PROC
+
+    stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
+    mov     r8, #0              ; initialize sum = 0
+    ldr     r10, c80808080
+    mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     lr, #0              ; constant zero
+loop
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0]        ; load 4 src pixels
+    ldr     r6, [r0, #1]        ; load 4 src pixels with 1 byte offset
+    ldr     r5, [r2, #0]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    pld     [r0, r1, lsl #1]
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #4]        ; load 4 src pixels
+    ldr     r6, [r0, #5]        ; load 4 src pixels with 1 byte offset
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #8]        ; load 4 src pixels
+    ldr     r6, [r0, #9]        ; load 4 src pixels with 1 byte offset
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #12]       ; load 4 src pixels
+    ldr     r6, [r0, #13]       ; load 4 src pixels with 1 byte offset
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set ref_ptr to next row
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    subs    r12, r12, #1
+
+    bne     loop
+
+    ; return stuff
+    ldr     r6, [sp, #40]       ; get address of sse
+    mul     r0, r8, r8          ; sum * sum
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+c80808080
+    DCD     0x80808080
+
+    END
+
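
The mvn/uhsub8/eor triple used above is a branch-free per-byte rounded average: uhsub8 with the complemented operand produces (a + b - 255) >> 1 in each byte lane, and XOR with 0x80 adds 128 modulo 256, which corrects that to the half-pel sample (a + b + 1) >> 1. A scalar sketch of the lane arithmetic (function name illustrative); the variance accumulation that follows each average matches vp9_variance16x16_armv6:

    /* Sketch: per-byte rounded average of two packed words, as
     * produced by mvn + uhsub8 + eor 0x80808080 above. */
    static unsigned int avg4_rounded(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;
        int i;
        for (i = 0; i < 4; i++) {
            unsigned int ba = (a >> (8 * i)) & 0xff;
            unsigned int bb = (b >> (8 * i)) & 0xff;
            r |= (((ba + bb + 1) >> 1) & 0xff) << (8 * i);
        }
        return r;
    }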
--- /dev/null
+++ b/vp9/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
@@ -1,0 +1,222 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_variance_halfpixvar16x16_hv_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp9_variance_halfpixvar16x16_hv_armv6| PROC
+
+    stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
+    mov     r8, #0              ; initialize sum = 0
+    ldr     r10, c80808080
+    mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     lr, #0              ; constant zero
+loop
+    add     r9, r0, r1          ; pointer to pixels on the next row
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0]        ; load source pixels a, row N
+    ldr     r6, [r0, #1]        ; load source pixels b, row N
+    ldr     r5, [r9, #0]        ; load source pixels c, row N+1
+    ldr     r7, [r9, #1]        ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn     r7, r7
+    uhsub8  r5, r5, r7
+    eor     r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn     r5, r5
+    uhsub8  r4, r4, r5
+    ldr     r5, [r2, #0]        ; load 4 ref pixels
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    pld     [r0, r1, lsl #1]
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #4]        ; load source pixels a, row N
+    ldr     r6, [r0, #5]        ; load source pixels b, row N
+    ldr     r5, [r9, #4]        ; load source pixels c, row N+1
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    ldr     r7, [r9, #5]        ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn     r7, r7
+    uhsub8  r5, r5, r7
+    eor     r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn     r5, r5
+    uhsub8  r4, r4, r5
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #8]        ; load source pixels a, row N
+    ldr     r6, [r0, #9]        ; load source pixels b, row N
+    ldr     r5, [r9, #8]        ; load source pixels c, row N+1
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    ldr     r7, [r9, #9]        ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn     r7, r7
+    uhsub8  r5, r5, r7
+    eor     r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn     r5, r5
+    uhsub8  r4, r4, r5
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #12]       ; load source pixels a, row N
+    ldr     r6, [r0, #13]       ; load source pixels b, row N
+    ldr     r5, [r9, #12]       ; load source pixels c, row N+1
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+    ldr     r7, [r9, #13]       ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn     r7, r7
+    uhsub8  r5, r5, r7
+    eor     r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn     r5, r5
+    uhsub8  r4, r4, r5
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set ref_ptr to next row
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    subs    r12, r12, #1
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    bne     loop
+
+    ; return stuff
+    ldr     r6, [sp, #40]       ; get address of sse
+    mul     r0, r8, r8          ; sum * sum
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+c80808080
+    DCD     0x80808080
+
+    END
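
Per the x/y/z comments above, the 2-D half-pel sample here is built from three cascaded rounded averages, horizontal on row N, horizontal on row N+1, then vertical, rather than a single (a + b + c + d + 2) >> 2. A per-pixel sketch of that predictor (function name illustrative):

    /* Sketch: hv half-pel predictor as cascaded rounded averages. */
    static unsigned char halfpix_hv(unsigned char a, unsigned char b,
                                    unsigned char c, unsigned char d)
    {
        unsigned char x = (unsigned char)((a + b + 1) >> 1);  /* row N    */
        unsigned char y = (unsigned char)((c + d + 1) >> 1);  /* row N+1  */
        return (unsigned char)((x + y + 1) >> 1);             /* vertical */
    }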
--- /dev/null
+++ b/vp9/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
@@ -1,0 +1,184 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_variance_halfpixvar16x16_v_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp9_variance_halfpixvar16x16_v_armv6| PROC
+
+    stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
+    mov     r8, #0              ; initialize sum = 0
+    ldr     r10, c80808080
+    mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     lr, #0              ; constant zero
+loop
+    add     r9, r0, r1          ; set src pointer to next row
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0]        ; load 4 src pixels
+    ldr     r6, [r9, #0]        ; load 4 src pixels from next row
+    ldr     r5, [r2, #0]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
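+    ; the mvn/uhsub8/eor sequence is the same complement trick used by the
+    ; two-pass variant above: each byte of r4 becomes the rounded average
+    ; (src[i] + src[i + stride] + 1) >> 1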
+
+    usub8   r6, r4, r5          ; calculate difference
+    pld     [r0, r1, lsl #1]
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #4]        ; load 4 src pixels
+    ldr     r6, [r9, #4]        ; load 4 src pixels from next row
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #8]        ; load 4 src pixels
+    ldr     r6, [r9, #8]        ; load 4 src pixels from next row
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #12]       ; load 4 src pixels
+    ldr     r6, [r9, #12]       ; load 4 src pixels from next row
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set dst_ptr to next row
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+
+    subs    r12, r12, #1
+
+    bne     loop
+
+    ; return stuff
+    ldr     r6, [sp, #40]       ; get address of sse
+    mul     r0, r8, r8          ; sum * sum
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+c80808080
+    DCD     0x80808080
+
+    END
+
--- /dev/null
+++ b/vp9/encoder/arm/armv6/walsh_v6.asm
@@ -1,0 +1,212 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT |vp8_short_walsh4x4_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+
+;short vp8_short_walsh4x4_armv6(short *input, short *output, int pitch)
+; r0    short *input,
+; r1    short *output,
+; r2    int pitch
+|vp8_short_walsh4x4_armv6| PROC
+
+    stmdb       sp!, {r4 - r11, lr}
+
+    ldrd        r4, r5, [r0], r2
+    ldr         lr, c00040004
+    ldrd        r6, r7, [r0], r2
+
+    ; 0-3
+    qadd16      r3, r4, r5          ; [d1|a1] [1+3   |   0+2]
+    qsub16      r4, r4, r5          ; [c1|b1] [1-3   |   0-2]
+
+    ldrd        r8, r9, [r0], r2
+    ; 4-7
+    qadd16      r5, r6, r7          ; [d1|a1] [5+7   |   4+6]
+    qsub16      r6, r6, r7          ; [c1|b1] [5-7   |   4-6]
+
+    ldrd        r10, r11, [r0]
+    ; 8-11
+    qadd16      r7, r8, r9          ; [d1|a1] [9+11  |  8+10]
+    qsub16      r8, r8, r9          ; [c1|b1] [9-11  |  8-10]
+
+    ; 12-15
+    qadd16      r9, r10, r11        ; [d1|a1] [13+15 | 12+14]
+    qsub16      r10, r10, r11       ; [c1|b1] [13-15 | 12-14]
+
+
+    lsls        r2, r3, #16
+    smuad       r11, r3, lr         ; A0 = a1<<2 + d1<<2
+    addne       r11, r11, #1        ; A0 += (a1!=0)
+
+    lsls        r2, r7, #16
+    smuad       r12, r7, lr         ; C0 = a1<<2 + d1<<2
+    addne       r12, r12, #1        ; C0 += (a1!=0)
+
+    add         r0, r11, r12        ; a1_0 = A0 + C0
+    sub         r11, r11, r12       ; b1_0 = A0 - C0
+
+    lsls        r2, r5, #16
+    smuad       r12, r5, lr         ; B0 = a1<<2 + d1<<2
+    addne       r12, r12, #1        ; B0 += (a1!=0)
+
+    lsls        r2, r9, #16
+    smuad       r2, r9, lr          ; D0 = a1<<2 + d1<<2
+    addne       r2, r2, #1          ; D0 += (a1!=0)
+
+    add         lr, r12, r2         ; d1_0 = B0 + D0
+    sub         r12, r12, r2        ; c1_0 = B0 - D0
+
+    ; op[0,4,8,12]
+    adds        r2, r0, lr          ; a2 = a1_0 + d1_0
+    addmi       r2, r2, #1          ; += a2 < 0
+    add         r2, r2, #3          ; += 3
+    subs        r0, r0, lr          ; d2 = a1_0 - d1_0
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1]            ; op[0]
+
+    addmi       r0, r0, #1          ; += d2 < 0
+    add         r0, r0, #3          ; += 3
+    ldr         lr, c00040004
+    mov         r0, r0, asr #3      ; >> 3
+    strh        r0, [r1, #24]       ; op[12]
+
+    adds        r2, r11, r12        ; b2 = b1_0 + c1_0
+    addmi       r2, r2, #1          ; += b2 < 0
+    add         r2, r2, #3          ; += 3
+    subs        r0, r11, r12        ; c2 = b1_0 - c1_0
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #8]        ; op[4]
+
+    addmi       r0, r0, #1          ; += c2 < 0
+    add         r0, r0, #3          ; += 3
+    smusd       r3, r3, lr          ; A3 = a1<<2 - d1<<2
+    smusd       r7, r7, lr          ; C3 = a1<<2 - d1<<2
+    mov         r0, r0, asr #3      ; >> 3
+    strh        r0, [r1, #16]       ; op[8]
+
+
+    ; op[3,7,11,15]
+    add         r0, r3, r7          ; a1_3 = A3 + C3
+    sub         r3, r3, r7          ; b1_3 = A3 - C3
+
+    smusd       r5, r5, lr          ; B3 = a1<<2 - d1<<2
+    smusd       r9, r9, lr          ; D3 = a1<<2 - d1<<2
+    add         r7, r5, r9          ; d1_3 = B3 + D3
+    sub         r5, r5, r9          ; c1_3 = B3 - D3
+
+    adds        r2, r0, r7          ; a2 = a1_3 + d1_3
+    addmi       r2, r2, #1          ; += a2 < 0
+    add         r2, r2, #3          ; += 3
+    adds        r9, r3, r5          ; b2 = b1_3 + c1_3
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #6]        ; op[3]
+
+    addmi       r9, r9, #1          ; += b2 < 0
+    add         r9, r9, #3          ; += 3
+    subs        r2, r3, r5          ; c2 = b1_3 - c1_3
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #14]       ; op[7]
+
+    addmi       r2, r2, #1          ; += c2 < 0
+    add         r2, r2, #3          ; += 3
+    subs        r9, r0, r7          ; d2 = a1_3 - d1_3
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #22]       ; op[11]
+
+    addmi       r9, r9, #1          ; += d2 < 0
+    add         r9, r9, #3          ; += 3
+    smuad       r3, r4, lr          ; A1 = b1<<2 + c1<<2
+    smuad       r5, r8, lr          ; C1 = b1<<2 + c1<<2
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #30]       ; op[15]
+
+    ; op[1,5,9,13]
+    add         r0, r3, r5          ; a1_1 = A1 + C1
+    sub         r3, r3, r5          ; b1_1 = A1 - C1
+
+    smuad       r7, r6, lr          ; B1 = b1<<2 + c1<<2
+    smuad       r9, r10, lr         ; D1 = b1<<2 + c1<<2
+    add         r5, r7, r9          ; d1_1 = B1 + D1
+    sub         r7, r7, r9          ; c1_1 = B1 - D1
+
+    adds        r2, r0, r5          ; a2 = a1_1 + d1_1
+    addmi       r2, r2, #1          ; += a2 < 0
+    add         r2, r2, #3          ; += 3
+    adds        r9, r3, r7          ; b2 = b1_1 + c1_1
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #2]        ; op[1]
+
+    addmi       r9, r9, #1          ; += b2 < 0
+    add         r9, r9, #3          ; += 3
+    subs        r2, r3, r7          ; c2 = b1_1 - c1_1
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #10]       ; op[5]
+
+    addmi       r2, r2, #1          ; += c2 < 0
+    add         r2, r2, #3          ; += 3
+    subs        r9, r0, r5          ; d2 = a1_1 - d1_1
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #18]       ; op[9]
+
+    addmi       r9, r9, #1          ; += d2 < 0
+    add         r9, r9, #3          ; += 3
+    smusd       r4, r4, lr          ; A2 = b1<<2 - c1<<2
+    smusd       r8, r8, lr          ; C2 = b1<<2 - c1<<2
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #26]       ; op[13]
+
+
+    ; op[2,6,10,14]
+    add         r11, r4, r8         ; a1_2 = A2 + C2
+    sub         r12, r4, r8         ; b1_2 = A2 - C2
+
+    smusd       r6, r6, lr          ; B2 = b1<<2 - c1<<2
+    smusd       r10, r10, lr        ; D2 = b1<<2 - c1<<2
+    add         r4, r6, r10         ; d1_2 = B2 + D2
+    sub         r8, r6, r10         ; c1_2 = B2 - D2
+
+    adds        r2, r11, r4         ; a2 = a1_2 + d1_2
+    addmi       r2, r2, #1          ; += a2 < 0
+    add         r2, r2, #3          ; += 3
+    adds        r9, r12, r8         ; b2 = b1_2 + c1_2
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #4]        ; op[2]
+
+    addmi       r9, r9, #1          ; += b2 < 0
+    add         r9, r9, #3          ; += 3
+    subs        r2, r12, r8         ; c2 = b1_2 - c1_2
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #12]       ; op[6]
+
+    addmi       r2, r2, #1          ; += c2 < 0
+    add         r2, r2, #3          ; += 3
+    subs        r9, r11, r4         ; d2 = a1_2 - d1_2
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #20]       ; op[10]
+
+    addmi       r9, r9, #1          ; += d2 < 0
+    add         r9, r9, #3          ; += 3
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #28]       ; op[14]
+
+
+    ldmia       sp!, {r4 - r11, pc}
+    ENDP        ; |vp8_short_walsh4x4_armv6|
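+
+; Each op[] above is rounded as (v + 3 + (v < 0)) >> 3, a symmetric divide
+; by 8; the addmi instructions supply the (v < 0) term. A minimal C sketch
+; of that rounding step (illustrative only):
+;
+;   short walsh_round(int v) {
+;     return (short)((v + 3 + (v < 0)) >> 3);
+;   }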
+
+c00040004
+    DCD         0x00040004
+
+    END
--- /dev/null
+++ b/vp9/encoder/arm/boolhuff_arm.c
@@ -1,0 +1,33 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp9/encoder/boolhuff.h"
+#include "vp9/common/blockd.h"
+
+const unsigned int vp9_prob_cost[256] = {
+  2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129, 1099, 1072, 1046,
+  1023, 1000,  979,  959,  940,  922,  905,  889,  873,  858,  843,  829,  816,  803,  790,  778,
+  767,  755,  744,  733,  723,  713,  703,  693,  684,  675,  666,  657,  649,  641,  633,  625,
+  617,  609,  602,  594,  587,  580,  573,  567,  560,  553,  547,  541,  534,  528,  522,  516,
+  511,  505,  499,  494,  488,  483,  477,  472,  467,  462,  457,  452,  447,  442,  437,  433,
+  428,  424,  419,  415,  410,  406,  401,  397,  393,  389,  385,  381,  377,  373,  369,  365,
+  361,  357,  353,  349,  346,  342,  338,  335,  331,  328,  324,  321,  317,  314,  311,  307,
+  304,  301,  297,  294,  291,  288,  285,  281,  278,  275,  272,  269,  266,  263,  260,  257,
+  255,  252,  249,  246,  243,  240,  238,  235,  232,  229,  227,  224,  221,  219,  216,  214,
+  211,  208,  206,  203,  201,  198,  196,  194,  191,  189,  186,  184,  181,  179,  177,  174,
+  172,  170,  168,  165,  163,  161,  159,  156,  154,  152,  150,  148,  145,  143,  141,  139,
+  137,  135,  133,  131,  129,  127,  125,  123,  121,  119,  117,  115,  113,  111,  109,  107,
+  105,  103,  101,   99,   97,   95,   93,   92,   90,   88,   86,   84,   82,   81,   79,   77,
+  75,   73,   72,   70,   68,   66,   65,   63,   61,   60,   58,   56,   55,   53,   51,   50,
+  48,   46,   45,   43,   41,   40,   38,   37,   35,   33,   32,   30,   29,   27,   25,   24,
+  22,   21,   19,   18,   16,   15,   13,   12,   10,    9,    7,    6,    4,    3,    1,   1
+};
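+
+/* The table is indexed by a probability in 1/256 units and gives the cost,
+ * in 1/256th-bit units, of coding a symbol with that probability. One
+ * generator that reproduces the values (a reconstruction, not taken from
+ * the source tree) is:
+ *
+ *   #include <math.h>
+ *   static void gen_prob_cost(unsigned int cost[256]) {
+ *     int p;
+ *     cost[0] = 2047;  // p == 0 is never coded; mirror cost[1]
+ *     for (p = 1; p < 256; p++) {
+ *       int c = (int)(-256.0 * log(p / 256.0) / log(2.0)) - 1;
+ *       cost[p] = c < 1 ? 1 : c;
+ *     }
+ *   }
+ */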
+
--- /dev/null
+++ b/vp9/encoder/arm/dct_arm.c
@@ -1,0 +1,21 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "./vpx_rtcd.h"
+
+#if HAVE_ARMV6
+
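+/* An 8x4 forward DCT is two 4x4 transforms side by side: the second block
+ * starts 4 samples into each input row and 16 coefficients into the output. */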
+void vp9_short_fdct8x4_armv6(short *input, short *output, int pitch) {
+  vp9_short_fdct4x4_armv6(input,     output,      pitch);
+  vp9_short_fdct4x4_armv6(input + 4, output + 16, pitch);
+}
+
+#endif /* HAVE_ARMV6 */
--- /dev/null
+++ b/vp9/encoder/arm/dct_arm.h
@@ -1,0 +1,65 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef DCT_ARM_H
+#define DCT_ARM_H
+
+#if HAVE_ARMV6
+extern prototype_fdct(vp9_short_walsh4x4_armv6);
+extern prototype_fdct(vp9_short_fdct4x4_armv6);
+extern prototype_fdct(vp9_short_fdct8x4_armv6);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp8_fdct_walsh_short4x4
+#define vp8_fdct_walsh_short4x4 vp9_short_walsh4x4_armv6
+
+#undef  vp8_fdct_short4x4
+#define vp8_fdct_short4x4 vp9_short_fdct4x4_armv6
+
+#undef  vp8_fdct_short8x4
+#define vp8_fdct_short8x4 vp9_short_fdct8x4_armv6
+
+#undef  vp8_fdct_fast4x4
+#define vp8_fdct_fast4x4 vp9_short_fdct4x4_armv6
+
+#undef  vp8_fdct_fast8x4
+#define vp8_fdct_fast8x4 vp9_short_fdct8x4_armv6
+#endif
+
+#endif /* HAVE_ARMV6 */
+
+#if HAVE_ARMV7
+extern prototype_fdct(vp9_short_fdct4x4_neon);
+extern prototype_fdct(vp9_short_fdct8x4_neon);
+extern prototype_fdct(vp8_fast_fdct4x4_neon);
+extern prototype_fdct(vp8_fast_fdct8x4_neon);
+extern prototype_fdct(vp9_short_walsh4x4_neon);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp8_fdct_short4x4
+#define vp8_fdct_short4x4 vp9_short_fdct4x4_neon
+
+#undef  vp8_fdct_short8x4
+#define vp8_fdct_short8x4 vp9_short_fdct8x4_neon
+
+#undef  vp8_fdct_fast4x4
+#define vp8_fdct_fast4x4 vp9_short_fdct4x4_neon
+
+#undef  vp8_fdct_fast8x4
+#define vp8_fdct_fast8x4 vp9_short_fdct8x4_neon
+
+#undef  vp8_fdct_walsh_short4x4
+#define vp8_fdct_walsh_short4x4 vp9_short_walsh4x4_neon
+#endif
+
+#endif
+
+#endif
--- /dev/null
+++ b/vp9/encoder/arm/encodemb_arm.h
@@ -1,0 +1,64 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef ENCODEMB_ARM_H
+#define ENCODEMB_ARM_H
+
+#if HAVE_ARMV6
+extern prototype_subb(vp9_subtract_b_armv6);
+extern prototype_submby(vp9_subtract_mby_armv6);
+extern prototype_submbuv(vp9_subtract_mbuv_armv6);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp8_encodemb_subb
+#define vp8_encodemb_subb vp9_subtract_b_armv6
+
+#undef  vp8_encodemb_submby
+#define vp8_encodemb_submby vp9_subtract_mby_armv6
+
+#undef  vp8_encodemb_submbuv
+#define vp8_encodemb_submbuv vp9_subtract_mbuv_armv6
+#endif
+
+#endif /* HAVE_ARMV6 */
+
+#if HAVE_ARMV7
+// extern prototype_berr(vp9_block_error_c);
+// extern prototype_mberr(vp9_mbblock_error_c);
+// extern prototype_mbuverr(vp9_mbuverror_c);
+
+extern prototype_subb(vp9_subtract_b_neon);
+extern prototype_submby(vp9_subtract_mby_neon);
+extern prototype_submbuv(vp9_subtract_mbuv_neon);
+
+// #undef  vp8_encodemb_berr
+// #define vp8_encodemb_berr vp9_block_error_c
+
+// #undef  vp8_encodemb_mberr
+// #define vp8_encodemb_mberr vp9_mbblock_error_c
+
+// #undef  vp8_encodemb_mbuverr
+// #define vp8_encodemb_mbuverr vp9_mbuverror_c
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp8_encodemb_subb
+#define vp8_encodemb_subb vp9_subtract_b_neon
+
+#undef  vp8_encodemb_submby
+#define vp8_encodemb_submby vp9_subtract_mby_neon
+
+#undef  vp8_encodemb_submbuv
+#define vp8_encodemb_submbuv vp9_subtract_mbuv_neon
+#endif
+
+#endif
+
+#endif
--- /dev/null
+++ b/vp9/encoder/arm/neon/fastquantizeb_neon.asm
@@ -1,0 +1,261 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_fast_quantize_b_neon|
+    EXPORT  |vp8_fast_quantize_b_pair_neon|
+
+    INCLUDE asm_enc_offsets.asm
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=4
+
+;vp8_fast_quantize_b_pair_neon(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2);
+|vp8_fast_quantize_b_pair_neon| PROC
+
+    stmfd           sp!, {r4-r9}
+    vstmdb          sp!, {q4-q7}
+
+    ldr             r4, [r0, #vp8_block_coeff]
+    ldr             r5, [r0, #vp8_block_quant_fast]
+    ldr             r6, [r0, #vp8_block_round]
+
+    vld1.16         {q0, q1}, [r4@128]  ; load z
+
+    ldr             r7, [r2, #vp8_blockd_qcoeff]
+
+    vabs.s16        q4, q0              ; calculate x = abs(z)
+    vabs.s16        q5, q1
+
+    ;arithmetic right shift by 15 extracts the sign: all 0s if positive, all 1s if negative
+    vshr.s16        q2, q0, #15         ; sz
+    vshr.s16        q3, q1, #15
+
+    vld1.s16        {q6, q7}, [r6@128]  ; load round_ptr [0-15]
+    vld1.s16        {q8, q9}, [r5@128]  ; load quant_ptr [0-15]
+
+    ldr             r4, [r1, #vp8_block_coeff]
+
+    vadd.s16        q4, q6              ; x + Round
+    vadd.s16        q5, q7
+
+    vld1.16         {q0, q1}, [r4@128]  ; load z2
+
+    vqdmulh.s16     q4, q8              ; y = ((Round+abs(z)) * Quant) >> 16
+    vqdmulh.s16     q5, q9
+
+    vabs.s16        q10, q0             ; calculate x2 = abs(z2)
+    vabs.s16        q11, q1
+    vshr.s16        q12, q0, #15        ; sz2
+    vshr.s16        q13, q1, #15
+
+    ;modify data to have its original sign
+    veor.s16        q4, q2              ; y^sz
+    veor.s16        q5, q3
+
+    vadd.s16        q10, q6             ; x2 + Round
+    vadd.s16        q11, q7
+
+    ldr             r8, [r2, #vp8_blockd_dequant]
+
+    vqdmulh.s16     q10, q8             ; y2 = ((Round+abs(z2)) * Quant) >> 16
+    vqdmulh.s16     q11, q9
+
+    vshr.s16        q4, #1              ; right shift 1 after vqdmulh
+    vshr.s16        q5, #1
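+    ; vqdmulh computes (2 * a * b) >> 16 with saturation, so the extra
+    ; right shift by 1 here (and below) yields the intended (a * b) >> 16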
+
+    vld1.s16        {q6, q7}, [r8@128]  ;load dequant_ptr[i]
+
+    vsub.s16        q4, q2              ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
+    vsub.s16        q5, q3
+
+    vshr.s16        q10, #1             ; right shift 1 after vqdmulh
+    vshr.s16        q11, #1
+
+    ldr             r9, [r2, #vp8_blockd_dqcoeff]
+
+    veor.s16        q10, q12            ; y2^sz2
+    veor.s16        q11, q13
+
+    vst1.s16        {q4, q5}, [r7]      ; store: qcoeff = x1
+
+
+    vsub.s16        q10, q12            ; x2=(y2^sz2)-sz2 = (y2^sz2)-(-1) (2's complement)
+    vsub.s16        q11, q13
+
+    ldr             r6, [r3, #vp8_blockd_qcoeff]
+
+    vmul.s16        q2, q6, q4          ; x * Dequant
+    vmul.s16        q3, q7, q5
+
+    ldr             r0, _inv_zig_zag_   ; load ptr of inverse zigzag table
+
+    vceq.s16        q8, q8              ; set q8 to all 1
+
+    vst1.s16        {q10, q11}, [r6]    ; store: qcoeff = x2
+
+    vmul.s16        q12, q6, q10        ; x2 * Dequant
+    vmul.s16        q13, q7, q11
+
+    vld1.16         {q6, q7}, [r0@128]  ; load inverse scan order
+
+    vtst.16         q14, q4, q8         ; now find eob
+    vtst.16         q15, q5, q8         ; non-zero element is set to all 1
+
+    vst1.s16        {q2, q3}, [r9]      ; store dqcoeff = x * Dequant
+
+    ldr             r7, [r3, #vp8_blockd_dqcoeff]
+
+    vand            q0, q6, q14         ; get all valid numbers from scan array
+    vand            q1, q7, q15
+
+    vst1.s16        {q12, q13}, [r7]    ; store dqcoeff = x2 * Dequant
+
+    vtst.16         q2, q10, q8         ; now find eob
+    vtst.16         q3, q11, q8         ; non-zero element is set to all 1
+
+    vmax.u16        q0, q0, q1          ; find maximum value in q0, q1
+
+    vand            q10, q6, q2         ; get all valid numbers from scan array
+    vand            q11, q7, q3
+    vmax.u16        q10, q10, q11       ; find maximum value in q10, q11
+
+    vmax.u16        d0, d0, d1
+    vmax.u16        d20, d20, d21
+    vmovl.u16       q0, d0
+    vmovl.u16       q10, d20
+
+
+    vmax.u32        d0, d0, d1
+    vmax.u32        d20, d20, d21
+    vpmax.u32       d0, d0, d0
+    vpmax.u32       d20, d20, d20
+
+    add             r4, r2, #vp8_blockd_eob
+    add             r5, r3, #vp8_blockd_eob
+
+    vst1.32         {d0[0]}, [r4@32]
+    vst1.32         {d20[0]}, [r5@32]
+
+    vldmia          sp!, {q4-q7}
+    ldmfd           sp!, {r4-r9}
+    bx              lr
+
+    ENDP
+
+;void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
+|vp8_fast_quantize_b_neon| PROC
+
+    stmfd           sp!, {r4-r7}
+
+    ldr             r3, [r0, #vp8_block_coeff]
+    ldr             r4, [r0, #vp8_block_quant_fast]
+    ldr             r5, [r0, #vp8_block_round]
+
+    vld1.16         {q0, q1}, [r3@128]  ; load z
+    vorr.s16        q14, q0, q1         ; check if all zero (step 1)
+    ldr             r6, [r1, #vp8_blockd_qcoeff]
+    ldr             r7, [r1, #vp8_blockd_dqcoeff]
+    vorr.s16        d28, d28, d29       ; check if all zero (step 2)
+
+    vabs.s16        q12, q0             ; calculate x = abs(z)
+    vabs.s16        q13, q1
+
+    ;arithmetic right shift by 15 extracts the sign: all 0s if positive, all 1s if negative
+    vshr.s16        q2, q0, #15         ; sz
+    vmov            r2, r3, d28         ; check if all zero (step 3)
+    vshr.s16        q3, q1, #15
+
+    vld1.s16        {q14, q15}, [r5@128]; load round_ptr [0-15]
+    vld1.s16        {q8, q9}, [r4@128]  ; load quant_ptr [0-15]
+
+    vadd.s16        q12, q14            ; x + Round
+    vadd.s16        q13, q15
+
+    ldr             r0, _inv_zig_zag_   ; load ptr of inverse zigzag table
+
+    vqdmulh.s16     q12, q8             ; y = ((Round+abs(z)) * Quant) >> 16
+    vqdmulh.s16     q13, q9
+
+    vld1.16         {q10, q11}, [r0@128]; load inverse scan order
+
+    vceq.s16        q8, q8              ; set q8 to all 1
+
+    ldr             r4, [r1, #vp8_blockd_dequant]
+
+    vshr.s16        q12, #1             ; right shift 1 after vqdmulh
+    vshr.s16        q13, #1
+
+    orr             r2, r2, r3          ; check if all zero (step 4)
+    cmp             r2, #0              ; check if all zero (step 5)
+    beq             zero_output         ; check if all zero (step 6)
+
+    ;modify data to have its original sign
+    veor.s16        q12, q2             ; y^sz
+    veor.s16        q13, q3
+
+    vsub.s16        q12, q2             ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
+    vsub.s16        q13, q3
+
+    vld1.s16        {q2, q3}, [r4@128]  ; load dequant_ptr[i]
+
+    vtst.16         q14, q12, q8        ; now find eob
+    vtst.16         q15, q13, q8        ; non-zero element is set to all 1
+
+    vst1.s16        {q12, q13}, [r6@128]; store: qcoeff = x1
+
+    vand            q10, q10, q14       ; get all valid numbers from scan array
+    vand            q11, q11, q15
+
+
+    vmax.u16        q0, q10, q11        ; find maximum value in q0, q1
+    vmax.u16        d0, d0, d1
+    vmovl.u16       q0, d0
+
+    vmul.s16        q2, q12             ; x * Dequant
+    vmul.s16        q3, q13
+
+    vmax.u32        d0, d0, d1
+    vpmax.u32       d0, d0, d0
+
+    vst1.s16        {q2, q3}, [r7@128]  ; store dqcoeff = x * Dequant
+
+    add             r4, r1, #vp8_blockd_eob
+    vst1.32         {d0[0]}, [r4@32]
+
+    ldmfd           sp!, {r4-r7}
+    bx              lr
+
+zero_output
+    str             r2, [r1, #vp8_blockd_eob]
+    vst1.s16        {q0, q1}, [r6@128]  ; qcoeff = 0
+    vst1.s16        {q0, q1}, [r7@128]  ; dqcoeff = 0
+
+    ldmfd           sp!, {r4-r7}
+    bx              lr
+
+    ENDP
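+
+; A hypothetical C reference for the quantizer above (illustrative names;
+; the NEON code additionally short-circuits an all-zero input block):
+;
+;   void fast_quantize_b(const short *z, const short *round,
+;                        const short *quant, const short *dequant,
+;                        short *qcoeff, short *dqcoeff, char *eob,
+;                        const short *inv_zig_zag) {
+;     int i, last = 0;
+;     for (i = 0; i < 16; i++) {
+;       int sz = z[i] >> 15;                     /* 0 or -1 */
+;       int x  = z[i] < 0 ? -z[i] : z[i];        /* abs(z) */
+;       int y  = ((x + round[i]) * quant[i]) >> 16;
+;       int x1 = (y ^ sz) - sz;                  /* restore the sign */
+;       qcoeff[i]  = (short)x1;
+;       dqcoeff[i] = (short)(x1 * dequant[i]);
+;       if (x1 && inv_zig_zag[i] > last)
+;         last = inv_zig_zag[i];
+;     }
+;     *eob = (char)last;
+;   }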
+
+; default inverse zigzag table is defined in vp9/common/entropy.c
+_inv_zig_zag_
+    DCD inv_zig_zag
+
+    ALIGN 16    ; enable use of @128 bit aligned loads
+inv_zig_zag
+    DCW 0x0001, 0x0002, 0x0006, 0x0007
+    DCW 0x0003, 0x0005, 0x0008, 0x000d
+    DCW 0x0004, 0x0009, 0x000c, 0x000e
+    DCW 0x000a, 0x000b, 0x000f, 0x0010
+
+    END
+
--- /dev/null
+++ b/vp9/encoder/arm/neon/picklpf_arm.c
@@ -1,0 +1,49 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp9/common/onyxc_int.h"
+#include "vp9/encoder/onyx_int.h"
+#include "vp9/encoder/quantize.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_scale/yv12extend.h"
+#include "vpx_scale/vpxscale.h"
+#include "vp9/common/alloccommon.h"
+
+extern void vp8_memcpy_neon(unsigned char *dst_ptr, unsigned char *src_ptr, int sz);
+
+
+void
+vpxyv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction) {
+  unsigned char *src_y, *dst_y;
+  int yheight;
+  int ystride;
+  int border;
+  int yoffset;
+  int linestocopy;
+
+  border   = src_ybc->border;
+  yheight  = src_ybc->y_height;
+  ystride  = src_ybc->y_stride;
+
+  linestocopy = (yheight >> (Fraction + 4));
+
+  if (linestocopy < 1)
+    linestocopy = 1;
+
+  linestocopy <<= 4;
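+  /* The shifts above select a whole number of 16-line units to copy:
+   * yheight >> (Fraction + 4) gives the unit count (clamped to at least
+   * one) and the << 4 converts it back to a line count. */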
+
+  yoffset  = ystride * ((yheight >> 5) * 16 - 8);
+  src_y = src_ybc->y_buffer + yoffset;
+  dst_y = dst_ybc->y_buffer + yoffset;
+
+  // vpx_memcpy (dst_y, src_y, ystride * (linestocopy +16));
+  vp8_memcpy_neon((unsigned char *)dst_y, (unsigned char *)src_y, (int)(ystride * (linestocopy + 16)));
+}
--- /dev/null
+++ b/vp9/encoder/arm/neon/sad16_neon.asm
@@ -1,0 +1,207 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_sad16x16_neon|
+    EXPORT  |vp8_sad16x8_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int  src_stride
+; r2    unsigned char *ref_ptr
+; r3    int  ref_stride
+|vp8_sad16x16_neon| PROC
+;;
+    vld1.8          {q0}, [r0], r1
+    vld1.8          {q4}, [r2], r3
+
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q5}, [r2], r3
+
+    vabdl.u8        q12, d0, d8
+    vabdl.u8        q13, d1, d9
+
+    vld1.8          {q2}, [r0], r1
+    vld1.8          {q6}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+    vabal.u8        q13, d3, d11
+
+    vld1.8          {q3}, [r0], r1
+    vld1.8          {q7}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+    vabal.u8        q13, d5, d13
+
+;;
+    vld1.8          {q0}, [r0], r1
+    vld1.8          {q4}, [r2], r3
+
+    vabal.u8        q12, d6, d14
+    vabal.u8        q13, d7, d15
+
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q5}, [r2], r3
+
+    vabal.u8        q12, d0, d8
+    vabal.u8        q13, d1, d9
+
+    vld1.8          {q2}, [r0], r1
+    vld1.8          {q6}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+    vabal.u8        q13, d3, d11
+
+    vld1.8          {q3}, [r0], r1
+    vld1.8          {q7}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+    vabal.u8        q13, d5, d13
+
+;;
+    vld1.8          {q0}, [r0], r1
+    vld1.8          {q4}, [r2], r3
+
+    vabal.u8        q12, d6, d14
+    vabal.u8        q13, d7, d15
+
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q5}, [r2], r3
+
+    vabal.u8        q12, d0, d8
+    vabal.u8        q13, d1, d9
+
+    vld1.8          {q2}, [r0], r1
+    vld1.8          {q6}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+    vabal.u8        q13, d3, d11
+
+    vld1.8          {q3}, [r0], r1
+    vld1.8          {q7}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+    vabal.u8        q13, d5, d13
+
+;;
+    vld1.8          {q0}, [r0], r1
+    vld1.8          {q4}, [r2], r3
+
+    vabal.u8        q12, d6, d14
+    vabal.u8        q13, d7, d15
+
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q5}, [r2], r3
+
+    vabal.u8        q12, d0, d8
+    vabal.u8        q13, d1, d9
+
+    vld1.8          {q2}, [r0], r1
+    vld1.8          {q6}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+    vabal.u8        q13, d3, d11
+
+    vld1.8          {q3}, [r0]
+    vld1.8          {q7}, [r2]
+
+    vabal.u8        q12, d4, d12
+    vabal.u8        q13, d5, d13
+
+    vabal.u8        q12, d6, d14
+    vabal.u8        q13, d7, d15
+
+    vadd.u16        q0, q12, q13
+
+    vpaddl.u16      q1, q0
+    vpaddl.u32      q0, q1
+
+    vadd.u32        d0, d0, d1
+
+    vmov.32         r0, d0[0]
+
+    bx              lr
+
+    ENDP
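+
+; A hypothetical C reference for the SAD kernels in this file (illustrative
+; names, not part of the source):
+;
+;   unsigned int sad_c(const unsigned char *src, int src_stride,
+;                      const unsigned char *ref, int ref_stride,
+;                      int width, int height) {
+;     unsigned int sad = 0;
+;     int r, c;
+;     for (r = 0; r < height; r++) {
+;       for (c = 0; c < width; c++) {
+;         int d = src[c] - ref[c];
+;         sad += d < 0 ? -d : d;
+;       }
+;       src += src_stride;
+;       ref += ref_stride;
+;     }
+;     return sad;
+;   }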
+
+;==============================
+;unsigned int vp8_sad16x8_c(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+|vp8_sad16x8_neon| PROC
+    vld1.8          {q0}, [r0], r1
+    vld1.8          {q4}, [r2], r3
+
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q5}, [r2], r3
+
+    vabdl.u8        q12, d0, d8
+    vabdl.u8        q13, d1, d9
+
+    vld1.8          {q2}, [r0], r1
+    vld1.8          {q6}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+    vabal.u8        q13, d3, d11
+
+    vld1.8          {q3}, [r0], r1
+    vld1.8          {q7}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+    vabal.u8        q13, d5, d13
+
+    vld1.8          {q0}, [r0], r1
+    vld1.8          {q4}, [r2], r3
+
+    vabal.u8        q12, d6, d14
+    vabal.u8        q13, d7, d15
+
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q5}, [r2], r3
+
+    vabal.u8        q12, d0, d8
+    vabal.u8        q13, d1, d9
+
+    vld1.8          {q2}, [r0], r1
+    vld1.8          {q6}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+    vabal.u8        q13, d3, d11
+
+    vld1.8          {q3}, [r0], r1
+    vld1.8          {q7}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+    vabal.u8        q13, d5, d13
+
+    vabal.u8        q12, d6, d14
+    vabal.u8        q13, d7, d15
+
+    vadd.u16        q0, q12, q13
+
+    vpaddl.u16      q1, q0
+    vpaddl.u32      q0, q1
+
+    vadd.u32        d0, d0, d1
+
+    vmov.32         r0, d0[0]
+
+    bx              lr
+
+    ENDP
+
+    END
--- /dev/null
+++ b/vp9/encoder/arm/neon/sad8_neon.asm
@@ -1,0 +1,209 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_sad8x8_neon|
+    EXPORT  |vp8_sad8x16_neon|
+    EXPORT  |vp8_sad4x4_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+; unsigned int vp8_sad8x8_c(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+
+|vp8_sad8x8_neon| PROC
+    vld1.8          {d0}, [r0], r1
+    vld1.8          {d8}, [r2], r3
+
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d10}, [r2], r3
+
+    vabdl.u8        q12, d0, d8
+
+    vld1.8          {d4}, [r0], r1
+    vld1.8          {d12}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+
+    vld1.8          {d6}, [r0], r1
+    vld1.8          {d14}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+
+    vld1.8          {d0}, [r0], r1
+    vld1.8          {d8}, [r2], r3
+
+    vabal.u8        q12, d6, d14
+
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d10}, [r2], r3
+
+    vabal.u8        q12, d0, d8
+
+    vld1.8          {d4}, [r0], r1
+    vld1.8          {d12}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+
+    vld1.8          {d6}, [r0], r1
+    vld1.8          {d14}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+    vabal.u8        q12, d6, d14
+
+    vpaddl.u16      q1, q12
+    vpaddl.u32      q0, q1
+    vadd.u32        d0, d0, d1
+
+    vmov.32         r0, d0[0]
+
+    bx              lr
+
+    ENDP
+
+;============================
+;unsigned int vp8_sad8x16_c(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+
+|vp8_sad8x16_neon| PROC
+    vld1.8          {d0}, [r0], r1
+    vld1.8          {d8}, [r2], r3
+
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d10}, [r2], r3
+
+    vabdl.u8        q12, d0, d8
+
+    vld1.8          {d4}, [r0], r1
+    vld1.8          {d12}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+
+    vld1.8          {d6}, [r0], r1
+    vld1.8          {d14}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+
+    vld1.8          {d0}, [r0], r1
+    vld1.8          {d8}, [r2], r3
+
+    vabal.u8        q12, d6, d14
+
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d10}, [r2], r3
+
+    vabal.u8        q12, d0, d8
+
+    vld1.8          {d4}, [r0], r1
+    vld1.8          {d12}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+
+    vld1.8          {d6}, [r0], r1
+    vld1.8          {d14}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+
+    vld1.8          {d0}, [r0], r1
+    vld1.8          {d8}, [r2], r3
+
+    vabal.u8        q12, d6, d14
+
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d10}, [r2], r3
+
+    vabal.u8        q12, d0, d8
+
+    vld1.8          {d4}, [r0], r1
+    vld1.8          {d12}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+
+    vld1.8          {d6}, [r0], r1
+    vld1.8          {d14}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+
+    vld1.8          {d0}, [r0], r1
+    vld1.8          {d8}, [r2], r3
+
+    vabal.u8        q12, d6, d14
+
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d10}, [r2], r3
+
+    vabal.u8        q12, d0, d8
+
+    vld1.8          {d4}, [r0], r1
+    vld1.8          {d12}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+
+    vld1.8          {d6}, [r0], r1
+    vld1.8          {d14}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+    vabal.u8        q12, d6, d14
+
+    vpaddl.u16      q1, q12
+    vpaddl.u32      q0, q1
+    vadd.u32        d0, d0, d1
+
+    vmov.32         r0, d0[0]
+
+    bx              lr
+
+    ENDP
+
+;===========================
+;unsigned int vp8_sad4x4_c(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+
+|vp8_sad4x4_neon| PROC
+    vld1.8          {d0}, [r0], r1
+    vld1.8          {d8}, [r2], r3
+
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d10}, [r2], r3
+
+    vabdl.u8        q12, d0, d8
+
+    vld1.8          {d4}, [r0], r1
+    vld1.8          {d12}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+
+    vld1.8          {d6}, [r0], r1
+    vld1.8          {d14}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+    vabal.u8        q12, d6, d14
+
+    vpaddl.u16      d1, d24
+    vpaddl.u32      d0, d1
+    vmov.32         r0, d0[0]
+
+    bx              lr
+
+    ENDP
+
+    END
--- /dev/null
+++ b/vp9/encoder/arm/neon/shortfdct_neon.asm
@@ -1,0 +1,221 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_short_fdct4x4_neon|
+    EXPORT  |vp8_short_fdct8x4_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=4
+
+
+    ALIGN 16    ; enable use of @128 bit aligned loads
+coeff
+    DCW      5352,  5352,  5352, 5352
+    DCW      2217,  2217,  2217, 2217
+    DCD     14500, 14500, 14500, 14500
+    DCD      7500,  7500,  7500, 7500
+    DCD     12000, 12000, 12000, 12000
+    DCD     51000, 51000, 51000, 51000
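+
+; The multipliers correspond to the 4x4 DCT rotation in Q12 fixed point:
+; 5352 = round(4096 * sqrt(2) * cos(pi/8)) and 2217 = round(4096 * sqrt(2) *
+; sin(pi/8)); 14500/7500 and 12000/51000 are the rounding biases applied
+; before the >>12 (first pass) and >>16 (second pass) shifts below.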
+
+;void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
+|vp8_short_fdct4x4_neon| PROC
+
+    ; Part one
+    vld1.16         {d0}, [r0@64], r2
+    adr             r12, coeff
+    vld1.16         {d1}, [r0@64], r2
+    vld1.16         {q8}, [r12@128]!        ; d16=5352,  d17=2217
+    vld1.16         {d2}, [r0@64], r2
+    vld1.32         {q9, q10}, [r12@128]!   ;  q9=14500, q10=7500
+    vld1.16         {d3}, [r0@64], r2
+
+    ; transpose d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
+    vtrn.32         d0, d2
+    vtrn.32         d1, d3
+    vld1.32         {q11,q12}, [r12@128]    ; q11=12000, q12=51000
+    vtrn.16         d0, d1
+    vtrn.16         d2, d3
+
+    vadd.s16        d4, d0, d3      ; a1 = ip[0] + ip[3]
+    vadd.s16        d5, d1, d2      ; b1 = ip[1] + ip[2]
+    vsub.s16        d6, d1, d2      ; c1 = ip[1] - ip[2]
+    vsub.s16        d7, d0, d3      ; d1 = ip[0] - ip[3]
+
+    vshl.s16        q2, q2, #3      ; (a1, b1) << 3
+    vshl.s16        q3, q3, #3      ; (c1, d1) << 3
+
+    vadd.s16        d0, d4, d5      ; op[0] = a1 + b1
+    vsub.s16        d2, d4, d5      ; op[2] = a1 - b1
+
+    vmlal.s16       q9, d7, d16     ; d1*5352 + 14500
+    vmlal.s16       q10, d7, d17    ; d1*2217 + 7500
+    vmlal.s16       q9, d6, d17     ; c1*2217 + d1*5352 + 14500
+    vmlsl.s16       q10, d6, d16    ; d1*2217 - c1*5352 + 7500
+
+    vshrn.s32       d1, q9, #12     ; op[1] = (c1*2217 + d1*5352 + 14500)>>12
+    vshrn.s32       d3, q10, #12    ; op[3] = (d1*2217 - c1*5352 +  7500)>>12
+
+
+    ; Part two
+
+    ; transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
+    vtrn.32         d0, d2
+    vtrn.32         d1, d3
+    vtrn.16         d0, d1
+    vtrn.16         d2, d3
+
+    vmov.s16        d26, #7
+
+    vadd.s16        d4, d0, d3      ; a1 = ip[0] + ip[12]
+    vadd.s16        d5, d1, d2      ; b1 = ip[4] + ip[8]
+    vsub.s16        d6, d1, d2      ; c1 = ip[4] - ip[8]
+    vadd.s16        d4, d4, d26     ; a1 + 7
+    vsub.s16        d7, d0, d3      ; d1 = ip[0] - ip[12]
+
+    vadd.s16        d0, d4, d5      ; op[0] = a1 + b1 + 7
+    vsub.s16        d2, d4, d5      ; op[8] = a1 - b1 + 7
+
+    vmlal.s16       q11, d7, d16    ; d1*5352 + 12000
+    vmlal.s16       q12, d7, d17    ; d1*2217 + 51000
+
+    vceq.s16        d4, d7, #0
+
+    vshr.s16        d0, d0, #4
+    vshr.s16        d2, d2, #4
+
+    vmlal.s16       q11, d6, d17    ; c1*2217 + d1*5352 + 12000
+    vmlsl.s16       q12, d6, d16    ; d1*2217 - c1*5352 + 51000
+
+    vmvn.s16        d4, d4
+    vshrn.s32       d1, q11, #16    ; op[4] = (c1*2217 + d1*5352 + 12000)>>16
+    vsub.s16        d1, d1, d4      ; op[4] += (d1!=0)
+    vshrn.s32       d3, q12, #16    ; op[12]= (d1*2217 - c1*5352 + 51000)>>16
+
+    vst1.16         {q0, q1}, [r1@128]
+
+    bx              lr
+
+    ENDP
+
+;void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
+|vp8_short_fdct8x4_neon| PROC
+
+    ; Part one
+
+    vld1.16         {q0}, [r0@128], r2
+    adr             r12, coeff
+    vld1.16         {q1}, [r0@128], r2
+    vld1.16         {q8}, [r12@128]!        ; d16=5352,  d17=2217
+    vld1.16         {q2}, [r0@128], r2
+    vld1.32         {q9, q10}, [r12@128]!   ;  q9=14500, q10=7500
+    vld1.16         {q3}, [r0@128], r2
+
+    ; transpose q0=ip[0], q1=ip[1], q2=ip[2], q3=ip[3]
+    vtrn.32         q0, q2          ; [A0|B0]
+    vtrn.32         q1, q3          ; [A1|B1]
+    vtrn.16         q0, q1          ; [A2|B2]
+    vtrn.16         q2, q3          ; [A3|B3]
+
+    vadd.s16        q11, q0, q3     ; a1 = ip[0] + ip[3]
+    vadd.s16        q12, q1, q2     ; b1 = ip[1] + ip[2]
+    vsub.s16        q13, q1, q2     ; c1 = ip[1] - ip[2]
+    vsub.s16        q14, q0, q3     ; d1 = ip[0] - ip[3]
+
+    vshl.s16        q11, q11, #3    ; a1 << 3
+    vshl.s16        q12, q12, #3    ; b1 << 3
+    vshl.s16        q13, q13, #3    ; c1 << 3
+    vshl.s16        q14, q14, #3    ; d1 << 3
+
+    vadd.s16        q0, q11, q12    ; [A0 | B0] = a1 + b1
+    vsub.s16        q2, q11, q12    ; [A2 | B2] = a1 - b1
+
+    vmov.s16        q11, q9         ; 14500
+    vmov.s16        q12, q10        ; 7500
+
+    vmlal.s16       q9, d28, d16    ; A[1] = d1*5352 + 14500
+    vmlal.s16       q10, d28, d17   ; A[3] = d1*2217 + 7500
+    vmlal.s16       q11, d29, d16   ; B[1] = d1*5352 + 14500
+    vmlal.s16       q12, d29, d17   ; B[3] = d1*2217 + 7500
+
+    vmlal.s16       q9, d26, d17    ; A[1] = c1*2217 + d1*5352 + 14500
+    vmlsl.s16       q10, d26, d16   ; A[3] = d1*2217 - c1*5352 + 7500
+    vmlal.s16       q11, d27, d17   ; B[1] = c1*2217 + d1*5352 + 14500
+    vmlsl.s16       q12, d27, d16   ; B[3] = d1*2217 - c1*5352 + 7500
+
+    vshrn.s32       d2, q9, #12     ; A[1] = (c1*2217 + d1*5352 + 14500)>>12
+    vshrn.s32       d6, q10, #12    ; A[3] = (d1*2217 - c1*5352 +  7500)>>12
+    vshrn.s32       d3, q11, #12    ; B[1] = (c1*2217 + d1*5352 + 14500)>>12
+    vshrn.s32       d7, q12, #12    ; B[3] = (d1*2217 - c1*5352 +  7500)>>12
+
+
+    ; Part two
+    vld1.32         {q9,q10}, [r12@128]    ; q9=12000, q10=51000
+
+    ; transpose q0=ip[0], q1=ip[4], q2=ip[8], q3=ip[12]
+    vtrn.32         q0, q2          ; q0=[A0 | B0]
+    vtrn.32         q1, q3          ; q1=[A4 | B4]
+    vtrn.16         q0, q1          ; q2=[A8 | B8]
+    vtrn.16         q2, q3          ; q3=[A12|B12]
+
+    vmov.s16        q15, #7
+
+    vadd.s16        q11, q0, q3     ; a1 = ip[0] + ip[12]
+    vadd.s16        q12, q1, q2     ; b1 = ip[4] + ip[8]
+    vadd.s16        q11, q11, q15   ; a1 + 7
+    vsub.s16        q13, q1, q2     ; c1 = ip[4] - ip[8]
+    vsub.s16        q14, q0, q3     ; d1 = ip[0] - ip[12]
+
+    vadd.s16        q0, q11, q12    ; a1 + b1 + 7
+    vsub.s16        q1, q11, q12    ; a1 - b1 + 7
+
+    vmov.s16        q11, q9         ; 12000
+    vmov.s16        q12, q10        ; 51000
+
+    vshr.s16        d0, d0, #4      ; A[0] = (a1 + b1 + 7)>>4
+    vshr.s16        d4, d1, #4      ; B[0] = (a1 + b1 + 7)>>4
+    vshr.s16        d2, d2, #4      ; A[8] = (a1 + b1 + 7)>>4
+    vshr.s16        d6, d3, #4      ; B[8] = (a1 + b1 + 7)>>4
+
+
+    vmlal.s16       q9, d28, d16    ; A[4]  = d1*5352 + 12000
+    vmlal.s16       q10, d28, d17   ; A[12] = d1*2217 + 51000
+    vmlal.s16       q11, d29, d16   ; B[4]  = d1*5352 + 12000
+    vmlal.s16       q12, d29, d17   ; B[12] = d1*2217 + 51000
+
+    vceq.s16        q14, q14, #0
+
+    vmlal.s16       q9, d26, d17    ; A[4]  = c1*2217 + d1*5352 + 12000
+    vmlsl.s16       q10, d26, d16   ; A[12] = d1*2217 - c1*5352 + 51000
+    vmlal.s16       q11, d27, d17   ; B[4]  = c1*2217 + d1*5352 + 12000
+    vmlsl.s16       q12, d27, d16   ; B[12] = d1*2217 - c1*5352 + 51000
+
+    vmvn.s16        q14, q14
+
+    vshrn.s32       d1, q9, #16     ; A[4] = (c1*2217 + d1*5352 + 12000)>>16
+    vshrn.s32       d3, q10, #16    ; A[12]= (d1*2217 - c1*5352 + 51000)>>16
+    vsub.s16        d1, d1, d28     ; A[4] += (d1!=0)
+
+    vshrn.s32       d5, q11, #16    ; B[4] = (c1*2217 + d1*5352 + 12000)>>16
+    vshrn.s32       d7, q12, #16    ; B[12]= (d1*2217 - c1*5352 + 51000)>>16
+    vsub.s16        d5, d5, d29     ; B[4] += (d1!=0)
+
+    vst1.16         {q0, q1}, [r1@128]! ; block A
+    vst1.16         {q2, q3}, [r1@128]! ; block B
+
+    bx              lr
+
+    ENDP
+
+    END
+
--- /dev/null
+++ b/vp9/encoder/arm/neon/subtract_neon.asm
@@ -1,0 +1,185 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT |vp8_subtract_b_neon|
+    EXPORT |vp8_subtract_mby_neon|
+    EXPORT |vp8_subtract_mbuv_neon|
+
+    INCLUDE asm_enc_offsets.asm
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vp8_subtract_b_neon(BLOCK *be, BLOCKD *bd, int pitch)
+|vp8_subtract_b_neon| PROC
+
+    stmfd   sp!, {r4-r7}
+
+    ldr     r3, [r0, #vp8_block_base_src]
+    ldr     r4, [r0, #vp8_block_src]
+    ldr     r5, [r0, #vp8_block_src_diff]
+    ldr     r3, [r3]
+    ldr     r6, [r0, #vp8_block_src_stride]
+    add     r3, r3, r4                      ; src = *base_src + src
+    ldr     r7, [r1, #vp8_blockd_predictor]
+
+    vld1.8          {d0}, [r3], r6          ;load src
+    vld1.8          {d1}, [r7], r2          ;load pred
+    vld1.8          {d2}, [r3], r6
+    vld1.8          {d3}, [r7], r2
+    vld1.8          {d4}, [r3], r6
+    vld1.8          {d5}, [r7], r2
+    vld1.8          {d6}, [r3], r6
+    vld1.8          {d7}, [r7], r2
+
+    vsubl.u8        q10, d0, d1
+    vsubl.u8        q11, d2, d3
+    vsubl.u8        q12, d4, d5
+    vsubl.u8        q13, d6, d7
+
+    mov             r2, r2, lsl #1
+
+    vst1.16         {d20}, [r5], r2         ;store diff
+    vst1.16         {d22}, [r5], r2
+    vst1.16         {d24}, [r5], r2
+    vst1.16         {d26}, [r5], r2
+
+    ldmfd   sp!, {r4-r7}
+    bx              lr
+
+    ENDP
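+
+; A hypothetical C reference for the 4x4 block subtraction above
+; (illustrative names):
+;
+;   void subtract_b(const unsigned char *src, int src_stride,
+;                   const unsigned char *pred, int pitch, short *diff) {
+;     int r, c;
+;     for (r = 0; r < 4; r++) {
+;       for (c = 0; c < 4; c++)
+;         diff[c] = (short)(src[c] - pred[c]);
+;       src  += src_stride;
+;       pred += pitch;
+;       diff += pitch;
+;     }
+;   }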
+
+
+;==========================================
+;void vp8_subtract_mby_neon(short *diff, unsigned char *src, unsigned char *pred, int stride)
+|vp8_subtract_mby_neon| PROC
+    mov             r12, #4
+
+subtract_mby_loop
+    vld1.8          {q0}, [r1], r3          ;load src
+    vld1.8          {q1}, [r2]!             ;load pred
+    vld1.8          {q2}, [r1], r3
+    vld1.8          {q3}, [r2]!
+    vld1.8          {q4}, [r1], r3
+    vld1.8          {q5}, [r2]!
+    vld1.8          {q6}, [r1], r3
+    vld1.8          {q7}, [r2]!
+
+    vsubl.u8        q8, d0, d2
+    vsubl.u8        q9, d1, d3
+    vsubl.u8        q10, d4, d6
+    vsubl.u8        q11, d5, d7
+    vsubl.u8        q12, d8, d10
+    vsubl.u8        q13, d9, d11
+    vsubl.u8        q14, d12, d14
+    vsubl.u8        q15, d13, d15
+
+    vst1.16         {q8}, [r0]!             ;store diff
+    vst1.16         {q9}, [r0]!
+    vst1.16         {q10}, [r0]!
+    vst1.16         {q11}, [r0]!
+    vst1.16         {q12}, [r0]!
+    vst1.16         {q13}, [r0]!
+    vst1.16         {q14}, [r0]!
+    vst1.16         {q15}, [r0]!
+
+    subs            r12, r12, #1
+    bne             subtract_mby_loop
+
+    bx              lr
+    ENDP
+
+;=================================
+;void vp8_subtract_mbuv_neon(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
+|vp8_subtract_mbuv_neon| PROC
+    ldr             r12, [sp]
+
+;u
+    add             r0, r0, #512        ;   short *udiff = diff + 256;
+    add             r3, r3, #256        ;   unsigned char *upred = pred + 256;
+
+    vld1.8          {d0}, [r1], r12         ;load src
+    vld1.8          {d1}, [r3]!             ;load pred
+    vld1.8          {d2}, [r1], r12
+    vld1.8          {d3}, [r3]!
+    vld1.8          {d4}, [r1], r12
+    vld1.8          {d5}, [r3]!
+    vld1.8          {d6}, [r1], r12
+    vld1.8          {d7}, [r3]!
+    vld1.8          {d8}, [r1], r12
+    vld1.8          {d9}, [r3]!
+    vld1.8          {d10}, [r1], r12
+    vld1.8          {d11}, [r3]!
+    vld1.8          {d12}, [r1], r12
+    vld1.8          {d13}, [r3]!
+    vld1.8          {d14}, [r1], r12
+    vld1.8          {d15}, [r3]!
+
+    vsubl.u8        q8, d0, d1
+    vsubl.u8        q9, d2, d3
+    vsubl.u8        q10, d4, d5
+    vsubl.u8        q11, d6, d7
+    vsubl.u8        q12, d8, d9
+    vsubl.u8        q13, d10, d11
+    vsubl.u8        q14, d12, d13
+    vsubl.u8        q15, d14, d15
+
+    vst1.16         {q8}, [r0]!             ;store diff
+    vst1.16         {q9}, [r0]!
+    vst1.16         {q10}, [r0]!
+    vst1.16         {q11}, [r0]!
+    vst1.16         {q12}, [r0]!
+    vst1.16         {q13}, [r0]!
+    vst1.16         {q14}, [r0]!
+    vst1.16         {q15}, [r0]!
+
+;v
+    vld1.8          {d0}, [r2], r12         ;load src
+    vld1.8          {d1}, [r3]!             ;load pred
+    vld1.8          {d2}, [r2], r12
+    vld1.8          {d3}, [r3]!
+    vld1.8          {d4}, [r2], r12
+    vld1.8          {d5}, [r3]!
+    vld1.8          {d6}, [r2], r12
+    vld1.8          {d7}, [r3]!
+    vld1.8          {d8}, [r2], r12
+    vld1.8          {d9}, [r3]!
+    vld1.8          {d10}, [r2], r12
+    vld1.8          {d11}, [r3]!
+    vld1.8          {d12}, [r2], r12
+    vld1.8          {d13}, [r3]!
+    vld1.8          {d14}, [r2], r12
+    vld1.8          {d15}, [r3]!
+
+    vsubl.u8        q8, d0, d1
+    vsubl.u8        q9, d2, d3
+    vsubl.u8        q10, d4, d5
+    vsubl.u8        q11, d6, d7
+    vsubl.u8        q12, d8, d9
+    vsubl.u8        q13, d10, d11
+    vsubl.u8        q14, d12, d13
+    vsubl.u8        q15, d14, d15
+
+    vst1.16         {q8}, [r0]!             ;store diff
+    vst1.16         {q9}, [r0]!
+    vst1.16         {q10}, [r0]!
+    vst1.16         {q11}, [r0]!
+    vst1.16         {q12}, [r0]!
+    vst1.16         {q13}, [r0]!
+    vst1.16         {q14}, [r0]!
+    vst1.16         {q15}, [r0]!
+
+    bx              lr
+    ENDP
+
+    END
--- /dev/null
+++ b/vp9/encoder/arm/neon/variance_neon.asm
@@ -1,0 +1,276 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_variance16x16_neon|
+    EXPORT  |vp9_variance16x8_neon|
+    EXPORT  |vp9_variance8x16_neon|
+    EXPORT  |vp9_variance8x8_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp9_variance16x16_neon| PROC
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+    mov             r12, #8
+
+variance16x16_neon_loop
+    vld1.8          {q0}, [r0], r1              ;Load up source and reference
+    vld1.8          {q2}, [r2], r3
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q3}, [r2], r3
+
+    vsubl.u8        q11, d0, d4                 ;calculate diff
+    vsubl.u8        q12, d1, d5
+    vsubl.u8        q13, d2, d6
+    vsubl.u8        q14, d3, d7
+
+    ;VPADAL adds adjacent pairs of elements of a vector and accumulates
+    ;the results into the elements of the destination vector. The explanation
+    ;in the ARM guide is wrong.
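+    ;A scalar sketch of vpadal.s16 q8, q11 (illustrative indexing):
+    ;  for (i = 0; i < 4; i++)
+    ;    q8.s32[i] += (int32_t)q11.s16[2 * i] + q11.s16[2 * i + 1];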
+    vpadal.s16      q8, q11                     ;calculate sum
+    vmlal.s16       q9, d22, d22                ;calculate sse
+    vmlal.s16       q10, d23, d23
+
+    subs            r12, r12, #1
+
+    vpadal.s16      q8, q12
+    vmlal.s16       q9, d24, d24
+    vmlal.s16       q10, d25, d25
+    vpadal.s16      q8, q13
+    vmlal.s16       q9, d26, d26
+    vmlal.s16       q10, d27, d27
+    vpadal.s16      q8, q14
+    vmlal.s16       q9, d28, d28
+    vmlal.s16       q10, d29, d29
+
+    bne             variance16x16_neon_loop
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    ldr             r12, [sp]                   ;load *sse from stack
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    ;vmov.32        r0, d0[0]                   ;this instruction costs a lot
+    ;vmov.32        r1, d1[0]
+    ;mul            r0, r0, r0
+    ;str            r1, [r12]
+    ;sub            r0, r1, r0, asr #8
+
+    ;sum is in [-255*256, 255*256], so sum*sum fits in 32 bits. The right
+    ;shift must sign-extend, which is what vshr.s does; s32 has to be used
+    ;to get this right.
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [r12]              ;store sse
+    vshr.s32        d10, d10, #8
+    vsub.s32        d0, d1, d10
+
+    vmov.32         r0, d0[0]                   ;return
+    bx              lr
+
+    ENDP
+
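+;For reference, a scalar sketch of what the function above computes
+;(illustrative names, not part of the build):
+;  unsigned int sse = 0; int sum = 0;
+;  for (i = 0; i < 16; i++)
+;    for (j = 0; j < 16; j++) {
+;      int d = src_ptr[i * source_stride + j] - ref_ptr[i * recon_stride + j];
+;      sum += d; sse += d * d;
+;    }
+;  *sse_ptr = sse;
+;  return sse - (unsigned int)(((int64_t)sum * sum) >> 8);  /* >>8 == /256 */
+;The 16x8 and 8x16 variants below shift by 7 (128 pixels), 8x8 by 6 (64).
+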
+;================================
+;unsigned int vp9_variance16x8_c(
+;    unsigned char *src_ptr,
+;    int  source_stride,
+;    unsigned char *ref_ptr,
+;    int  recon_stride,
+;   unsigned int *sse)
+|vp9_variance16x8_neon| PROC
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+    mov             r12, #4
+
+variance16x8_neon_loop
+    vld1.8          {q0}, [r0], r1              ;Load up source and reference
+    vld1.8          {q2}, [r2], r3
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q3}, [r2], r3
+
+    vsubl.u8        q11, d0, d4                 ;calculate diff
+    vsubl.u8        q12, d1, d5
+    vsubl.u8        q13, d2, d6
+    vsubl.u8        q14, d3, d7
+
+    vpadal.s16      q8, q11                     ;calculate sum
+    vmlal.s16       q9, d22, d22                ;calculate sse
+    vmlal.s16       q10, d23, d23
+
+    subs            r12, r12, #1
+
+    vpadal.s16      q8, q12
+    vmlal.s16       q9, d24, d24
+    vmlal.s16       q10, d25, d25
+    vpadal.s16      q8, q13
+    vmlal.s16       q9, d26, d26
+    vmlal.s16       q10, d27, d27
+    vpadal.s16      q8, q14
+    vmlal.s16       q9, d28, d28
+    vmlal.s16       q10, d29, d29
+
+    bne             variance16x8_neon_loop
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    ldr             r12, [sp]                   ;load *sse from stack
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [r12]              ;store sse
+    vshr.s32        d10, d10, #7
+    vsub.s32        d0, d1, d10
+
+    vmov.32         r0, d0[0]                   ;return
+    bx              lr
+
+    ENDP
+
+;=================================
+;unsigned int vp9_variance8x16_c(
+;    unsigned char *src_ptr,
+;    int  source_stride,
+;    unsigned char *ref_ptr,
+;    int  recon_stride,
+;   unsigned int *sse)
+
+|vp9_variance8x16_neon| PROC
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+    mov             r12, #8
+
+variance8x16_neon_loop
+    vld1.8          {d0}, [r0], r1              ;Load up source and reference
+    vld1.8          {d4}, [r2], r3
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d6}, [r2], r3
+
+    vsubl.u8        q11, d0, d4                 ;calculate diff
+    vsubl.u8        q12, d2, d6
+
+    vpadal.s16      q8, q11                     ;calculate sum
+    vmlal.s16       q9, d22, d22                ;calculate sse
+    vmlal.s16       q10, d23, d23
+
+    subs            r12, r12, #1
+
+    vpadal.s16      q8, q12
+    vmlal.s16       q9, d24, d24
+    vmlal.s16       q10, d25, d25
+
+    bne             variance8x16_neon_loop
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    ldr             r12, [sp]                   ;load *sse from stack
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [r12]              ;store sse
+    vshr.s32        d10, d10, #7
+    vsub.s32        d0, d1, d10
+
+    vmov.32         r0, d0[0]                   ;return
+    bx              lr
+
+    ENDP
+
+;==================================
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp9_variance8x8_neon| PROC
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+    mov             r12, #2
+
+variance8x8_neon_loop
+    vld1.8          {d0}, [r0], r1              ;Load up source and reference
+    vld1.8          {d4}, [r2], r3
+    vld1.8          {d1}, [r0], r1
+    vld1.8          {d5}, [r2], r3
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d6}, [r2], r3
+    vld1.8          {d3}, [r0], r1
+    vld1.8          {d7}, [r2], r3
+
+    vsubl.u8        q11, d0, d4                 ;calculate diff
+    vsubl.u8        q12, d1, d5
+    vsubl.u8        q13, d2, d6
+    vsubl.u8        q14, d3, d7
+
+    vpadal.s16      q8, q11                     ;calculate sum
+    vmlal.s16       q9, d22, d22                ;calculate sse
+    vmlal.s16       q10, d23, d23
+
+    subs            r12, r12, #1
+
+    vpadal.s16      q8, q12
+    vmlal.s16       q9, d24, d24
+    vmlal.s16       q10, d25, d25
+    vpadal.s16      q8, q13
+    vmlal.s16       q9, d26, d26
+    vmlal.s16       q10, d27, d27
+    vpadal.s16      q8, q14
+    vmlal.s16       q9, d28, d28
+    vmlal.s16       q10, d29, d29
+
+    bne             variance8x8_neon_loop
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    ldr             r12, [sp]                   ;load *sse from stack
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [r12]              ;store sse
+    vshr.s32        d10, d10, #6
+    vsub.s32        d0, d1, d10
+
+    vmov.32         r0, d0[0]                   ;return
+    bx              lr
+
+    ENDP
+
+    END
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp8_memcpy_neon.asm
@@ -1,0 +1,68 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT |vp8_memcpy_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;=========================================
+;void vp8_memcpy_neon(unsigned char *dst_ptr, unsigned char *src_ptr, int sz);
+|vp8_memcpy_neon| PROC
+    ;pld                [r1]                        ;preload pred data
+    ;pld                [r1, #128]
+    ;pld                [r1, #256]
+    ;pld                [r1, #384]
+
+    mov             r12, r2, lsr #8                 ;copy 256 bytes of data at a time
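+
+    ;Rough C model of the copy strategy (illustrative): sz >> 8 full
+    ;256-byte blocks in the main loop, then the sz & 0xff remainder in
+    ;16-byte chunks -- so sz is assumed to be a multiple of 16:
+    ;  for (n = sz >> 8; n > 0; n--)        copy_256_bytes();
+    ;  for (r = sz & 0xff; r > 0; r -= 16)  copy_16_bytes();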
+
+memcpy_neon_loop
+    vld1.8          {q0, q1}, [r1]!                 ;load src data
+    subs            r12, r12, #1
+    vld1.8          {q2, q3}, [r1]!
+    vst1.8          {q0, q1}, [r0]!                 ;copy to dst_ptr
+    vld1.8          {q4, q5}, [r1]!
+    vst1.8          {q2, q3}, [r0]!
+    vld1.8          {q6, q7}, [r1]!
+    vst1.8          {q4, q5}, [r0]!
+    vld1.8          {q8, q9}, [r1]!
+    vst1.8          {q6, q7}, [r0]!
+    vld1.8          {q10, q11}, [r1]!
+    vst1.8          {q8, q9}, [r0]!
+    vld1.8          {q12, q13}, [r1]!
+    vst1.8          {q10, q11}, [r0]!
+    vld1.8          {q14, q15}, [r1]!
+    vst1.8          {q12, q13}, [r0]!
+    vst1.8          {q14, q15}, [r0]!
+
+    ;pld                [r1]                        ;preload pred data -- need to adjust for real device
+    ;pld                [r1, #128]
+    ;pld                [r1, #256]
+    ;pld                [r1, #384]
+
+    bne             memcpy_neon_loop
+
+    ands            r3, r2, #0xff                   ;extra copy
+    beq             done_copy_neon_loop
+
+extra_copy_neon_loop
+    vld1.8          {q0}, [r1]!                 ;load src data
+    subs            r3, r3, #16
+    vst1.8          {q0}, [r0]!
+    bne             extra_copy_neon_loop
+
+done_copy_neon_loop
+    bx              lr
+    ENDP
+
+    END
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp8_mse16x16_neon.asm
@@ -1,0 +1,116 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_mse16x16_neon|
+    EXPORT  |vp8_get4x4sse_cs_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;============================
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+;note: in this function the sum is never used, so the sum calculation
+;inherited from vp9_variance() can be removed.
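+;A scalar sketch (illustrative names):
+;  unsigned int sse = 0;
+;  for (i = 0; i < 16; i++)
+;    for (j = 0; j < 16; j++) {
+;      int d = src_ptr[i * source_stride + j] - ref_ptr[i * recon_stride + j];
+;      sse += d * d;
+;    }
+;  *sse_ptr = sse; return sse;   /* no sum/mean correction */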
+
+|vp8_mse16x16_neon| PROC
+    vmov.i8         q7, #0                      ;q7, q8, q9, q10 - sse
+    vmov.i8         q8, #0
+    vmov.i8         q9, #0
+    vmov.i8         q10, #0
+
+    mov             r12, #8
+
+mse16x16_neon_loop
+    vld1.8          {q0}, [r0], r1              ;Load up source and reference
+    vld1.8          {q2}, [r2], r3
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q3}, [r2], r3
+
+    vsubl.u8        q11, d0, d4
+    vsubl.u8        q12, d1, d5
+    vsubl.u8        q13, d2, d6
+    vsubl.u8        q14, d3, d7
+
+    vmlal.s16       q7, d22, d22
+    vmlal.s16       q8, d23, d23
+
+    subs            r12, r12, #1
+
+    vmlal.s16       q9, d24, d24
+    vmlal.s16       q10, d25, d25
+    vmlal.s16       q7, d26, d26
+    vmlal.s16       q8, d27, d27
+    vmlal.s16       q9, d28, d28
+    vmlal.s16       q10, d29, d29
+
+    bne             mse16x16_neon_loop
+
+    vadd.u32        q7, q7, q8
+    vadd.u32        q9, q9, q10
+
+    ldr             r12, [sp]               ;load *sse from stack
+
+    vadd.u32        q10, q7, q9
+    vpaddl.u32      q1, q10
+    vadd.u64        d0, d2, d3
+
+    vst1.32         {d0[0]}, [r12]
+    vmov.32         r0, d0[0]
+
+    bx              lr
+
+    ENDP
+
+
+;=============================
+; r0    unsigned char *src_ptr,
+; r1    int  source_stride,
+; r2    unsigned char *ref_ptr,
+; r3    int  recon_stride
+|vp8_get4x4sse_cs_neon| PROC
+    vld1.8          {d0}, [r0], r1              ;Load up source and reference
+    vld1.8          {d4}, [r2], r3
+    vld1.8          {d1}, [r0], r1
+    vld1.8          {d5}, [r2], r3
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d6}, [r2], r3
+    vld1.8          {d3}, [r0], r1
+    vld1.8          {d7}, [r2], r3
+
+    vsubl.u8        q11, d0, d4
+    vsubl.u8        q12, d1, d5
+    vsubl.u8        q13, d2, d6
+    vsubl.u8        q14, d3, d7
+
+    vmull.s16       q7, d22, d22
+    vmull.s16       q8, d24, d24
+    vmull.s16       q9, d26, d26
+    vmull.s16       q10, d28, d28
+
+    vadd.u32        q7, q7, q8
+    vadd.u32        q9, q9, q10
+    vadd.u32        q9, q7, q9
+
+    vpaddl.u32      q1, q9
+    vadd.u64        d0, d2, d3
+
+    vmov.32         r0, d0[0]
+    bx              lr
+
+    ENDP
+
+    END
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm
@@ -1,0 +1,103 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_short_walsh4x4_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;void vp8_short_walsh4x4_neon(short *input, short *output, int pitch)
+; r0   short *input,
+; r1   short *output,
+; r2   int pitch
+|vp8_short_walsh4x4_neon| PROC
+
+    vld1.16         {d0}, [r0@64], r2   ; load input
+    vld1.16         {d1}, [r0@64], r2
+    vld1.16         {d2}, [r0@64], r2
+    vld1.16         {d3}, [r0@64]
+
+    ;First for-loop
+    ;transpose d0, d1, d2, d3. Then, d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
+    vtrn.32         d0, d2
+    vtrn.32         d1, d3
+
+    vmov.s32        q15, #3             ; add 3 to all values
+
+    vtrn.16         d0, d1
+    vtrn.16         d2, d3
+
+    vadd.s16        d4, d0, d2          ; ip[0] + ip[2]
+    vadd.s16        d5, d1, d3          ; ip[1] + ip[3]
+    vsub.s16        d6, d1, d3          ; ip[1] - ip[3]
+    vsub.s16        d7, d0, d2          ; ip[0] - ip[2]
+
+    vshl.s16        d4, d4, #2          ; a1 = (ip[0] + ip[2]) << 2
+    vshl.s16        d5, d5, #2          ; d1 = (ip[1] + ip[3]) << 2
+    vshl.s16        d6, d6, #2          ; c1 = (ip[1] - ip[3]) << 2
+    vceq.s16        d16, d4, #0         ; a1 == 0
+    vshl.s16        d7, d7, #2          ; b1 = (ip[0] - ip[2]) << 2
+
+    vadd.s16        d0, d4, d5          ; a1 + d1
+    vmvn            d16, d16            ; a1 != 0
+    vsub.s16        d3, d4, d5          ; op[3] = a1 - d1
+    vadd.s16        d1, d7, d6          ; op[1] = b1 + c1
+    vsub.s16        d2, d7, d6          ; op[2] = b1 - c1
+    vsub.s16        d0, d0, d16         ; op[0] = a1 + d1 + (a1 != 0)
+
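+    ;First-pass result per column (C sketch of the lines above):
+    ;  op[0] = a1 + d1 + (a1 != 0);  op[1] = b1 + c1;
+    ;  op[2] = b1 - c1;              op[3] = a1 - d1;
+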
+    ;Second for-loop
+    ;transpose d0, d1, d2, d3. Then, d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
+    vtrn.32         d1, d3
+    vtrn.32         d0, d2
+    vtrn.16         d2, d3
+    vtrn.16         d0, d1
+
+    vaddl.s16       q8, d0, d2          ; a1 = ip[0]+ip[8]
+    vaddl.s16       q9, d1, d3          ; d1 = ip[4]+ip[12]
+    vsubl.s16       q10, d1, d3         ; c1 = ip[4]-ip[12]
+    vsubl.s16       q11, d0, d2         ; b1 = ip[0]-ip[8]
+
+    vadd.s32        q0, q8, q9          ; a2 = a1 + d1
+    vadd.s32        q1, q11, q10        ; b2 = b1 + c1
+    vsub.s32        q2, q11, q10        ; c2 = b1 - c1
+    vsub.s32        q3, q8, q9          ; d2 = a1 - d1
+
+    vclt.s32        q8, q0, #0
+    vclt.s32        q9, q1, #0
+    vclt.s32        q10, q2, #0
+    vclt.s32        q11, q3, #0
+
+    ; subtract -1 (or 0)
+    vsub.s32        q0, q0, q8          ; a2 += a2 < 0
+    vsub.s32        q1, q1, q9          ; b2 += b2 < 0
+    vsub.s32        q2, q2, q10         ; c2 += c2 < 0
+    vsub.s32        q3, q3, q11         ; d2 += d2 < 0
+
+    vadd.s32        q8, q0, q15         ; a2 + 3
+    vadd.s32        q9, q1, q15         ; b2 + 3
+    vadd.s32        q10, q2, q15        ; c2 + 3
+    vadd.s32        q11, q3, q15        ; d2 + 3
+
+    ; vrshrn is not used here: it would add a rounding constant of 1 << (3-1) = 4
+    vshrn.s32       d0, q8, #3
+    vshrn.s32       d1, q9, #3
+    vshrn.s32       d2, q10, #3
+    vshrn.s32       d3, q11, #3
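+
+    ;Net effect per value (sketch): op = (x + 3 + (x < 0)) >> 3, i.e. an
+    ;arithmetic divide by 8 with a +3 bias (+4 for negative values)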
+
+    vst1.16         {q0, q1}, [r1@128]
+
+    bx              lr
+
+    ENDP
+
+    END
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm
@@ -1,0 +1,425 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_sub_pixel_variance16x16_neon_func|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0    unsigned char  *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; stack(r4) unsigned char *dst_ptr,
+; stack(r5) int dst_pixels_per_line,
+; stack(r6) unsigned int *sse
+;note: most of the code is copied from bilinear_predict16x16_neon and vp9_variance16x16_neon.
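+;The filter pair is read from the bilinear_taps_coeff table at the end of this
+;file: Filter[0] = taps[2 * offset], Filter[1] = taps[2 * offset + 1]; the two
+;always sum to 128, so each pass computes, per pixel (sketch):
+;  out[i] = (src[i] * Filter[0] + src[i + step] * Filter[1] + 64) >> 7;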
+
+|vp9_sub_pixel_variance16x16_neon_func| PROC
+    push            {r4-r6, lr}
+
+    ldr             r12, _BilinearTaps_coeff_
+    ldr             r4, [sp, #16]           ;load *dst_ptr from stack
+    ldr             r5, [sp, #20]           ;load dst_pixels_per_line from stack
+    ldr             r6, [sp, #24]           ;load *sse from stack
+
+    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
+    beq             secondpass_bfilter16x16_only
+
+    add             r2, r12, r2, lsl #3     ;calculate filter location
+
+    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
+
+    vld1.s32        {d31}, [r2]             ;load first_pass filter
+
+    beq             firstpass_bfilter16x16_only
+
+    sub             sp, sp, #272            ;reserve space on stack for temporary storage
+    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data
+    mov             lr, sp
+    vld1.u8         {d5, d6, d7}, [r0], r1
+
+    mov             r2, #3                  ;loop counter
+    vld1.u8         {d8, d9, d10}, [r0], r1
+
+    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1)
+    vld1.u8         {d11, d12, d13}, [r0], r1
+
+    vdup.8          d1, d31[4]
+
+;First Pass: output_height lines x output_width columns (17x16)
+vp8e_filt_blk2d_fp16x16_loop_neon
+    pld             [r0]
+    pld             [r0, r1]
+    pld             [r0, r1, lsl #1]
+
+    vmull.u8        q7, d2, d0              ;(src_ptr[0] * Filter[0])
+    vmull.u8        q8, d3, d0
+    vmull.u8        q9, d5, d0
+    vmull.u8        q10, d6, d0
+    vmull.u8        q11, d8, d0
+    vmull.u8        q12, d9, d0
+    vmull.u8        q13, d11, d0
+    vmull.u8        q14, d12, d0
+
+    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
+    vext.8          d5, d5, d6, #1
+    vext.8          d8, d8, d9, #1
+    vext.8          d11, d11, d12, #1
+
+    vmlal.u8        q7, d2, d1              ;(src_ptr[1] * Filter[1])
+    vmlal.u8        q9, d5, d1
+    vmlal.u8        q11, d8, d1
+    vmlal.u8        q13, d11, d1
+
+    vext.8          d3, d3, d4, #1
+    vext.8          d6, d6, d7, #1
+    vext.8          d9, d9, d10, #1
+    vext.8          d12, d12, d13, #1
+
+    vmlal.u8        q8, d3, d1              ;(src_ptr[1] * Filter[1])
+    vmlal.u8        q10, d6, d1
+    vmlal.u8        q12, d9, d1
+    vmlal.u8        q14, d12, d1
+
+    subs            r2, r2, #1
+
+    vqrshrn.u16    d14, q7, #7              ;shift/round/saturate to u8
+    vqrshrn.u16    d15, q8, #7
+    vqrshrn.u16    d16, q9, #7
+    vqrshrn.u16    d17, q10, #7
+    vqrshrn.u16    d18, q11, #7
+    vqrshrn.u16    d19, q12, #7
+    vqrshrn.u16    d20, q13, #7
+
+    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data
+    vqrshrn.u16    d21, q14, #7
+    vld1.u8         {d5, d6, d7}, [r0], r1
+
+    vst1.u8         {d14, d15, d16, d17}, [lr]!     ;store result
+    vld1.u8         {d8, d9, d10}, [r0], r1
+    vst1.u8         {d18, d19, d20, d21}, [lr]!
+    vld1.u8         {d11, d12, d13}, [r0], r1
+
+    bne             vp8e_filt_blk2d_fp16x16_loop_neon
+
+;First-pass filtering for the remaining 5 lines
+    vld1.u8         {d14, d15, d16}, [r0], r1
+
+    vmull.u8        q9, d2, d0              ;(src_ptr[0] * Filter[0])
+    vmull.u8        q10, d3, d0
+    vmull.u8        q11, d5, d0
+    vmull.u8        q12, d6, d0
+    vmull.u8        q13, d8, d0
+    vmull.u8        q14, d9, d0
+
+    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
+    vext.8          d5, d5, d6, #1
+    vext.8          d8, d8, d9, #1
+
+    vmlal.u8        q9, d2, d1              ;(src_ptr[1] * Filter[1])
+    vmlal.u8        q11, d5, d1
+    vmlal.u8        q13, d8, d1
+
+    vext.8          d3, d3, d4, #1
+    vext.8          d6, d6, d7, #1
+    vext.8          d9, d9, d10, #1
+
+    vmlal.u8        q10, d3, d1             ;(src_ptr[1] * Filter[1])
+    vmlal.u8        q12, d6, d1
+    vmlal.u8        q14, d9, d1
+
+    vmull.u8        q1, d11, d0
+    vmull.u8        q2, d12, d0
+    vmull.u8        q3, d14, d0
+    vmull.u8        q4, d15, d0
+
+    vext.8          d11, d11, d12, #1       ;construct src_ptr[1]
+    vext.8          d14, d14, d15, #1
+
+    vmlal.u8        q1, d11, d1             ;(src_ptr[1] * Filter[1])
+    vmlal.u8        q3, d14, d1
+
+    vext.8          d12, d12, d13, #1
+    vext.8          d15, d15, d16, #1
+
+    vmlal.u8        q2, d12, d1             ;(src_ptr[1] * Filter[1])
+    vmlal.u8        q4, d15, d1
+
+    vqrshrn.u16    d10, q9, #7              ;shift/round/saturate to u8
+    vqrshrn.u16    d11, q10, #7
+    vqrshrn.u16    d12, q11, #7
+    vqrshrn.u16    d13, q12, #7
+    vqrshrn.u16    d14, q13, #7
+    vqrshrn.u16    d15, q14, #7
+    vqrshrn.u16    d16, q1, #7
+    vqrshrn.u16    d17, q2, #7
+    vqrshrn.u16    d18, q3, #7
+    vqrshrn.u16    d19, q4, #7
+
+    vst1.u8         {d10, d11, d12, d13}, [lr]!         ;store result
+    vst1.u8         {d14, d15, d16, d17}, [lr]!
+    vst1.u8         {d18, d19}, [lr]!
+
+;Second pass: 16x16
+;secondpass_filter
+    add             r3, r12, r3, lsl #3
+    sub             lr, lr, #272
+
+    vld1.u32        {d31}, [r3]             ;load second_pass filter
+
+    sub             sp, sp, #256
+    mov             r3, sp
+
+    vld1.u8         {d22, d23}, [lr]!       ;load src data
+
+    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
+    vdup.8          d1, d31[4]
+    mov             r12, #4                 ;loop counter
+
+vp8e_filt_blk2d_sp16x16_loop_neon
+    vld1.u8         {d24, d25}, [lr]!
+    vmull.u8        q1, d22, d0             ;(src_ptr[0] * Filter[0])
+    vld1.u8         {d26, d27}, [lr]!
+    vmull.u8        q2, d23, d0
+    vld1.u8         {d28, d29}, [lr]!
+    vmull.u8        q3, d24, d0
+    vld1.u8         {d30, d31}, [lr]!
+
+    vmull.u8        q4, d25, d0
+    vmull.u8        q5, d26, d0
+    vmull.u8        q6, d27, d0
+    vmull.u8        q7, d28, d0
+    vmull.u8        q8, d29, d0
+
+    vmlal.u8        q1, d24, d1             ;(src_ptr[pixel_step] * Filter[1])
+    vmlal.u8        q2, d25, d1
+    vmlal.u8        q3, d26, d1
+    vmlal.u8        q4, d27, d1
+    vmlal.u8        q5, d28, d1
+    vmlal.u8        q6, d29, d1
+    vmlal.u8        q7, d30, d1
+    vmlal.u8        q8, d31, d1
+
+    subs            r12, r12, #1
+
+    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
+    vqrshrn.u16    d3, q2, #7
+    vqrshrn.u16    d4, q3, #7
+    vqrshrn.u16    d5, q4, #7
+    vqrshrn.u16    d6, q5, #7
+    vqrshrn.u16    d7, q6, #7
+    vqrshrn.u16    d8, q7, #7
+    vqrshrn.u16    d9, q8, #7
+
+    vst1.u8         {d2, d3}, [r3]!         ;store result
+    vst1.u8         {d4, d5}, [r3]!
+    vst1.u8         {d6, d7}, [r3]!
+    vmov            q11, q15
+    vst1.u8         {d8, d9}, [r3]!
+
+    bne             vp8e_filt_blk2d_sp16x16_loop_neon
+
+    b               sub_pixel_variance16x16_neon
+
+;--------------------
+firstpass_bfilter16x16_only
+    mov             r2, #4                      ;loop counter
+    sub             sp, sp, #528            ;reserve space on stack for temporary storage
+    vdup.8          d0, d31[0]                  ;first_pass filter (d0 d1)
+    vdup.8          d1, d31[4]
+    mov             r3, sp
+
+;First Pass: output_height lines x output_width columns (16x16)
+vp8e_filt_blk2d_fpo16x16_loop_neon
+    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data
+    vld1.u8         {d5, d6, d7}, [r0], r1
+    vld1.u8         {d8, d9, d10}, [r0], r1
+    vld1.u8         {d11, d12, d13}, [r0], r1
+
+    pld             [r0]
+    pld             [r0, r1]
+    pld             [r0, r1, lsl #1]
+
+    vmull.u8        q7, d2, d0              ;(src_ptr[0] * Filter[0])
+    vmull.u8        q8, d3, d0
+    vmull.u8        q9, d5, d0
+    vmull.u8        q10, d6, d0
+    vmull.u8        q11, d8, d0
+    vmull.u8        q12, d9, d0
+    vmull.u8        q13, d11, d0
+    vmull.u8        q14, d12, d0
+
+    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
+    vext.8          d5, d5, d6, #1
+    vext.8          d8, d8, d9, #1
+    vext.8          d11, d11, d12, #1
+
+    vmlal.u8        q7, d2, d1              ;(src_ptr[1] * Filter[1])
+    vmlal.u8        q9, d5, d1
+    vmlal.u8        q11, d8, d1
+    vmlal.u8        q13, d11, d1
+
+    vext.8          d3, d3, d4, #1
+    vext.8          d6, d6, d7, #1
+    vext.8          d9, d9, d10, #1
+    vext.8          d12, d12, d13, #1
+
+    vmlal.u8        q8, d3, d1              ;(src_ptr[1] * Filter[1])
+    vmlal.u8        q10, d6, d1
+    vmlal.u8        q12, d9, d1
+    vmlal.u8        q14, d12, d1
+
+    subs            r2, r2, #1
+
+    vqrshrn.u16    d14, q7, #7              ;shift/round/saturate to u8
+    vqrshrn.u16    d15, q8, #7
+    vqrshrn.u16    d16, q9, #7
+    vqrshrn.u16    d17, q10, #7
+    vqrshrn.u16    d18, q11, #7
+    vqrshrn.u16    d19, q12, #7
+    vqrshrn.u16    d20, q13, #7
+    vst1.u8         {d14, d15}, [r3]!       ;store result
+    vqrshrn.u16    d21, q14, #7
+
+    vst1.u8         {d16, d17}, [r3]!
+    vst1.u8         {d18, d19}, [r3]!
+    vst1.u8         {d20, d21}, [r3]!
+
+    bne             vp8e_filt_blk2d_fpo16x16_loop_neon
+
+    b               sub_pixel_variance16x16_neon
+
+;---------------------
+secondpass_bfilter16x16_only
+;Second pass: 16x16
+;secondpass_filter
+    sub             sp, sp, #528            ;reserve space on stack for temporary storage
+    add             r3, r12, r3, lsl #3
+    mov             r12, #4                     ;loop counter
+    vld1.u32        {d31}, [r3]                 ;load second_pass filter
+    vld1.u8         {d22, d23}, [r0], r1        ;load src data
+    mov             r3, sp
+
+    vdup.8          d0, d31[0]                  ;second_pass filter parameters (d0 d1)
+    vdup.8          d1, d31[4]
+
+vp8e_filt_blk2d_spo16x16_loop_neon
+    vld1.u8         {d24, d25}, [r0], r1
+    vmull.u8        q1, d22, d0             ;(src_ptr[0] * Filter[0])
+    vld1.u8         {d26, d27}, [r0], r1
+    vmull.u8        q2, d23, d0
+    vld1.u8         {d28, d29}, [r0], r1
+    vmull.u8        q3, d24, d0
+    vld1.u8         {d30, d31}, [r0], r1
+
+    vmull.u8        q4, d25, d0
+    vmull.u8        q5, d26, d0
+    vmull.u8        q6, d27, d0
+    vmull.u8        q7, d28, d0
+    vmull.u8        q8, d29, d0
+
+    vmlal.u8        q1, d24, d1             ;(src_ptr[pixel_step] * Filter[1])
+    vmlal.u8        q2, d25, d1
+    vmlal.u8        q3, d26, d1
+    vmlal.u8        q4, d27, d1
+    vmlal.u8        q5, d28, d1
+    vmlal.u8        q6, d29, d1
+    vmlal.u8        q7, d30, d1
+    vmlal.u8        q8, d31, d1
+
+    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
+    vqrshrn.u16    d3, q2, #7
+    vqrshrn.u16    d4, q3, #7
+    vqrshrn.u16    d5, q4, #7
+    vqrshrn.u16    d6, q5, #7
+    vqrshrn.u16    d7, q6, #7
+    vqrshrn.u16    d8, q7, #7
+    vqrshrn.u16    d9, q8, #7
+
+    vst1.u8         {d2, d3}, [r3]!         ;store result
+    subs            r12, r12, #1
+    vst1.u8         {d4, d5}, [r3]!
+    vmov            q11, q15
+    vst1.u8         {d6, d7}, [r3]!
+    vst1.u8         {d8, d9}, [r3]!
+
+    bne             vp8e_filt_blk2d_spo16x16_loop_neon
+
+    b               sub_pixel_variance16x16_neon
+
+;----------------------------
+;variance16x16
+sub_pixel_variance16x16_neon
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+    sub             r3, r3, #256
+    mov             r12, #8
+
+sub_pixel_variance16x16_neon_loop
+    vld1.8          {q0}, [r3]!                 ;Load up source and reference
+    vld1.8          {q2}, [r4], r5
+    vld1.8          {q1}, [r3]!
+    vld1.8          {q3}, [r4], r5
+
+    vsubl.u8        q11, d0, d4                 ;diff
+    vsubl.u8        q12, d1, d5
+    vsubl.u8        q13, d2, d6
+    vsubl.u8        q14, d3, d7
+
+    vpadal.s16      q8, q11                     ;sum
+    vmlal.s16       q9, d22, d22                ;sse
+    vmlal.s16       q10, d23, d23
+
+    subs            r12, r12, #1
+
+    vpadal.s16      q8, q12
+    vmlal.s16       q9, d24, d24
+    vmlal.s16       q10, d25, d25
+    vpadal.s16      q8, q13
+    vmlal.s16       q9, d26, d26
+    vmlal.s16       q10, d27, d27
+    vpadal.s16      q8, q14
+    vmlal.s16       q9, d28, d28
+    vmlal.s16       q10, d29, d29
+
+    bne             sub_pixel_variance16x16_neon_loop
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [r6]               ;store sse
+    vshr.s32        d10, d10, #8
+    vsub.s32        d0, d1, d10
+
+    add             sp, sp, #528
+    vmov.32         r0, d0[0]                   ;return
+
+    pop             {r4-r6,pc}
+
+    ENDP
+
+;-----------------
+
+_BilinearTaps_coeff_
+    DCD     bilinear_taps_coeff
+bilinear_taps_coeff
+    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
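+;The eight pairs above are {128 - 16 * i, 16 * i} for i = 0..7: eighth-pel
+;bilinear weights in 7-bit fixed point, always summing to 128.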
+
+    END
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm
@@ -1,0 +1,572 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_variance_halfpixvar16x16_h_neon|
+    EXPORT  |vp9_variance_halfpixvar16x16_v_neon|
+    EXPORT  |vp9_variance_halfpixvar16x16_hv_neon|
+    EXPORT  |vp9_sub_pixel_variance16x16s_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;================================================
+;unsigned int vp9_variance_halfpixvar16x16_h_neon
+;(
+;    unsigned char  *src_ptr, r0
+;    int  src_pixels_per_line,  r1
+;    unsigned char *dst_ptr,  r2
+;    int dst_pixels_per_line,   r3
+;    unsigned int *sse
+;);
+;================================================
+|vp9_variance_halfpixvar16x16_h_neon| PROC
+    push            {lr}
+
+    mov             r12, #4                  ;loop counter
+    ldr             lr, [sp, #4]           ;load *sse from stack
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+;First Pass: output_height lines x output_width columns (16x16)
+vp8_filt_fpo16x16s_4_0_loop_neon
+    vld1.u8         {d0, d1, d2, d3}, [r0], r1      ;load src data
+    vld1.8          {q11}, [r2], r3
+    vld1.u8         {d4, d5, d6, d7}, [r0], r1
+    vld1.8          {q12}, [r2], r3
+    vld1.u8         {d8, d9, d10, d11}, [r0], r1
+    vld1.8          {q13}, [r2], r3
+    vld1.u8         {d12, d13, d14, d15}, [r0], r1
+
+    ;pld                [r0]
+    ;pld                [r0, r1]
+    ;pld                [r0, r1, lsl #1]
+
+    vext.8          q1, q0, q1, #1          ;construct src_ptr[1]
+    vext.8          q3, q2, q3, #1
+    vext.8          q5, q4, q5, #1
+    vext.8          q7, q6, q7, #1
+
+    vrhadd.u8       q0, q0, q1              ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+    vld1.8          {q14}, [r2], r3
+    vrhadd.u8       q1, q2, q3
+    vrhadd.u8       q2, q4, q5
+    vrhadd.u8       q3, q6, q7
+
+    vsubl.u8        q4, d0, d22                 ;diff
+    vsubl.u8        q5, d1, d23
+    vsubl.u8        q6, d2, d24
+    vsubl.u8        q7, d3, d25
+    vsubl.u8        q0, d4, d26
+    vsubl.u8        q1, d5, d27
+    vsubl.u8        q2, d6, d28
+    vsubl.u8        q3, d7, d29
+
+    vpadal.s16      q8, q4                     ;sum
+    vmlal.s16       q9, d8, d8                ;sse
+    vmlal.s16       q10, d9, d9
+
+    subs            r12, r12, #1
+
+    vpadal.s16      q8, q5
+    vmlal.s16       q9, d10, d10
+    vmlal.s16       q10, d11, d11
+    vpadal.s16      q8, q6
+    vmlal.s16       q9, d12, d12
+    vmlal.s16       q10, d13, d13
+    vpadal.s16      q8, q7
+    vmlal.s16       q9, d14, d14
+    vmlal.s16       q10, d15, d15
+
+    vpadal.s16      q8, q0                     ;sum
+    vmlal.s16       q9, d0, d0                ;sse
+    vmlal.s16       q10, d1, d1
+    vpadal.s16      q8, q1
+    vmlal.s16       q9, d2, d2
+    vmlal.s16       q10, d3, d3
+    vpadal.s16      q8, q2
+    vmlal.s16       q9, d4, d4
+    vmlal.s16       q10, d5, d5
+    vpadal.s16      q8, q3
+    vmlal.s16       q9, d6, d6
+    vmlal.s16       q10, d7, d7
+
+    bne             vp8_filt_fpo16x16s_4_0_loop_neon
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [lr]               ;store sse
+    vshr.s32        d10, d10, #8
+    vsub.s32        d0, d1, d10
+
+    vmov.32         r0, d0[0]                   ;return
+    pop             {pc}
+    ENDP
+
+;================================================
+;unsigned int vp9_variance_halfpixvar16x16_v_neon
+;(
+;    unsigned char  *src_ptr, r0
+;    int  src_pixels_per_line,  r1
+;    unsigned char *dst_ptr,  r2
+;    int dst_pixels_per_line,   r3
+;    unsigned int *sse
+;);
+;================================================
+|vp9_variance_halfpixvar16x16_v_neon| PROC
+    push            {lr}
+
+    mov             r12, #4                     ;loop counter
+
+    vld1.u8         {q0}, [r0], r1              ;load src data
+    ldr             lr, [sp, #4]                ;load *sse from stack
+
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+vp8_filt_spo16x16s_0_4_loop_neon
+    vld1.u8         {q2}, [r0], r1
+    vld1.8          {q1}, [r2], r3
+    vld1.u8         {q4}, [r0], r1
+    vld1.8          {q3}, [r2], r3
+    vld1.u8         {q6}, [r0], r1
+    vld1.8          {q5}, [r2], r3
+    vld1.u8         {q15}, [r0], r1
+
+    vrhadd.u8       q0, q0, q2
+    vld1.8          {q7}, [r2], r3
+    vrhadd.u8       q2, q2, q4
+    vrhadd.u8       q4, q4, q6
+    vrhadd.u8       q6, q6, q15
+
+    vsubl.u8        q11, d0, d2                 ;diff
+    vsubl.u8        q12, d1, d3
+    vsubl.u8        q13, d4, d6
+    vsubl.u8        q14, d5, d7
+    vsubl.u8        q0, d8, d10
+    vsubl.u8        q1, d9, d11
+    vsubl.u8        q2, d12, d14
+    vsubl.u8        q3, d13, d15
+
+    vpadal.s16      q8, q11                     ;sum
+    vmlal.s16       q9, d22, d22                ;sse
+    vmlal.s16       q10, d23, d23
+
+    subs            r12, r12, #1
+
+    vpadal.s16      q8, q12
+    vmlal.s16       q9, d24, d24
+    vmlal.s16       q10, d25, d25
+    vpadal.s16      q8, q13
+    vmlal.s16       q9, d26, d26
+    vmlal.s16       q10, d27, d27
+    vpadal.s16      q8, q14
+    vmlal.s16       q9, d28, d28
+    vmlal.s16       q10, d29, d29
+
+    vpadal.s16      q8, q0                     ;sum
+    vmlal.s16       q9, d0, d0                 ;sse
+    vmlal.s16       q10, d1, d1
+    vpadal.s16      q8, q1
+    vmlal.s16       q9, d2, d2
+    vmlal.s16       q10, d3, d3
+    vpadal.s16      q8, q2
+    vmlal.s16       q9, d4, d4
+    vmlal.s16       q10, d5, d5
+
+    vmov            q0, q15
+
+    vpadal.s16      q8, q3
+    vmlal.s16       q9, d6, d6
+    vmlal.s16       q10, d7, d7
+
+    bne             vp8_filt_spo16x16s_0_4_loop_neon
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [lr]               ;store sse
+    vshr.s32        d10, d10, #8
+    vsub.s32        d0, d1, d10
+
+    vmov.32         r0, d0[0]                   ;return
+    pop             {pc}
+    ENDP
+
+;================================================
+;unsigned int vp9_variance_halfpixvar16x16_hv_neon
+;(
+;    unsigned char  *src_ptr, r0
+;    int  src_pixels_per_line,  r1
+;    unsigned char *dst_ptr,  r2
+;    int dst_pixels_per_line,   r3
+;    unsigned int *sse
+;);
+;================================================
+|vp9_variance_halfpixvar16x16_hv_neon| PROC
+    push            {lr}
+
+    vld1.u8         {d0, d1, d2, d3}, [r0], r1      ;load src data
+
+    ldr             lr, [sp, #4]           ;load *sse from stack
+    vmov.i8         q13, #0                      ;q13 - sum
+    vext.8          q1, q0, q1, #1          ;construct src_ptr[1]
+
+    vmov.i8         q14, #0                      ;q14, q15 - sse
+    vmov.i8         q15, #0
+
+    mov             r12, #4                  ;loop counter
+    vrhadd.u8       q0, q0, q1              ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+
+;First Pass: output_height lines x output_width columns (17x16)
+vp8_filt16x16s_4_4_loop_neon
+    vld1.u8         {d4, d5, d6, d7}, [r0], r1
+    vld1.u8         {d8, d9, d10, d11}, [r0], r1
+    vld1.u8         {d12, d13, d14, d15}, [r0], r1
+    vld1.u8         {d16, d17, d18, d19}, [r0], r1
+
+    ;pld                [r0]
+    ;pld                [r0, r1]
+    ;pld                [r0, r1, lsl #1]
+
+    vext.8          q3, q2, q3, #1          ;construct src_ptr[1]
+    vext.8          q5, q4, q5, #1
+    vext.8          q7, q6, q7, #1
+    vext.8          q9, q8, q9, #1
+
+    vrhadd.u8       q1, q2, q3              ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+    vrhadd.u8       q2, q4, q5
+    vrhadd.u8       q3, q6, q7
+    vrhadd.u8       q4, q8, q9
+
+    vld1.8          {q5}, [r2], r3
+    vrhadd.u8       q0, q0, q1
+    vld1.8          {q6}, [r2], r3
+    vrhadd.u8       q1, q1, q2
+    vld1.8          {q7}, [r2], r3
+    vrhadd.u8       q2, q2, q3
+    vld1.8          {q8}, [r2], r3
+    vrhadd.u8       q3, q3, q4
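+
+    ;Sketch: each output pixel is the 2-D half-pel average of the
+    ;horizontally averaged rows computed above, i.e.
+    ;  out[i] = (hrow0[i] + hrow1[i] + 1) >> 1, where
+    ;  hrowN[i] = (rowN[i] + rowN[i + 1] + 1) >> 1;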
+
+    vsubl.u8        q9, d0, d10                 ;diff
+    vsubl.u8        q10, d1, d11
+    vsubl.u8        q11, d2, d12
+    vsubl.u8        q12, d3, d13
+
+    vsubl.u8        q0, d4, d14                 ;diff
+    vsubl.u8        q1, d5, d15
+    vsubl.u8        q5, d6, d16
+    vsubl.u8        q6, d7, d17
+
+    vpadal.s16      q13, q9                     ;sum
+    vmlal.s16       q14, d18, d18                ;sse
+    vmlal.s16       q15, d19, d19
+
+    vpadal.s16      q13, q10                     ;sum
+    vmlal.s16       q14, d20, d20                ;sse
+    vmlal.s16       q15, d21, d21
+
+    vpadal.s16      q13, q11                     ;sum
+    vmlal.s16       q14, d22, d22                ;sse
+    vmlal.s16       q15, d23, d23
+
+    vpadal.s16      q13, q12                     ;sum
+    vmlal.s16       q14, d24, d24                ;sse
+    vmlal.s16       q15, d25, d25
+
+    subs            r12, r12, #1
+
+    vpadal.s16      q13, q0                     ;sum
+    vmlal.s16       q14, d0, d0                ;sse
+    vmlal.s16       q15, d1, d1
+
+    vpadal.s16      q13, q1                     ;sum
+    vmlal.s16       q14, d2, d2                ;sse
+    vmlal.s16       q15, d3, d3
+
+    vpadal.s16      q13, q5                     ;sum
+    vmlal.s16       q14, d10, d10                ;sse
+    vmlal.s16       q15, d11, d11
+
+    vmov            q0, q4
+
+    vpadal.s16      q13, q6                     ;sum
+    vmlal.s16       q14, d12, d12                ;sse
+    vmlal.s16       q15, d13, d13
+
+    bne             vp8_filt16x16s_4_4_loop_neon
+
+    vadd.u32        q15, q14, q15                ;accumulate sse
+    vpaddl.s32      q0, q13                      ;accumulate sum
+
+    vpaddl.u32      q1, q15
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [lr]               ;store sse
+    vshr.s32        d10, d10, #8
+    vsub.s32        d0, d1, d10
+
+    vmov.32         r0, d0[0]                   ;return
+    pop             {pc}
+    ENDP
+
+;==============================
+; r0    unsigned char  *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; stack unsigned char *dst_ptr,
+; stack int dst_pixels_per_line,
+; stack unsigned int *sse
+;note: this function is called from vp8_find_best_half_pixel_step() (used when
+;8 < Speed < 15) and from the first call of vp8_find_best_sub_pixel_step()
+;(used when speed <= 8). In both cases xoffset/yoffset can only be 4 or 0,
+;which means the filter is either bypassed or its coefficients are {64, 64}.
+;This simplified routine only works in that situation.
+;note: it can happen that both xoffset and yoffset are zero; that case can be
+;handled in C code later.
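+;With {64, 64} coefficients the bilinear filter reduces to a half-pel average,
+;which vrhadd.u8 implements directly (sketch):
+;  out[i] = (src[i] + src[i + step] + 1) >> 1;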
+
+|vp9_sub_pixel_variance16x16s_neon| PROC
+    push            {r4, lr}
+
+    ldr             r4, [sp, #8]            ;load *dst_ptr from stack
+    ldr             r12, [sp, #12]          ;load dst_pixels_per_line from stack
+    ldr             lr, [sp, #16]           ;load *sse from stack
+
+    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
+    beq             secondpass_bfilter16x16s_only
+
+    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
+    beq             firstpass_bfilter16x16s_only
+
+    vld1.u8         {d0, d1, d2, d3}, [r0], r1      ;load src data
+    sub             sp, sp, #256            ;reserve space on stack for temporary storage
+    vext.8          q1, q0, q1, #1          ;construct src_ptr[1]
+    mov             r3, sp
+    mov             r2, #4                  ;loop counter
+    vrhadd.u8       q0, q0, q1              ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+
+;First Pass: output_height lines x output_width columns (17x16)
+vp8e_filt_blk2d_fp16x16s_loop_neon
+    vld1.u8         {d4, d5, d6, d7}, [r0], r1
+    vld1.u8         {d8, d9, d10, d11}, [r0], r1
+    vld1.u8         {d12, d13, d14, d15}, [r0], r1
+    vld1.u8         {d16, d17, d18, d19}, [r0], r1
+
+    ;pld                [r0]
+    ;pld                [r0, r1]
+    ;pld                [r0, r1, lsl #1]
+
+    vext.8          q3, q2, q3, #1          ;construct src_ptr[1]
+    vext.8          q5, q4, q5, #1
+    vext.8          q7, q6, q7, #1
+    vext.8          q9, q8, q9, #1
+
+    vrhadd.u8       q1, q2, q3              ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+    vrhadd.u8       q2, q4, q5
+    vrhadd.u8       q3, q6, q7
+    vrhadd.u8       q4, q8, q9
+
+    vrhadd.u8       q0, q0, q1
+    vrhadd.u8       q1, q1, q2
+    vrhadd.u8       q2, q2, q3
+    vrhadd.u8       q3, q3, q4
+
+    subs            r2, r2, #1
+    vst1.u8         {d0, d1 ,d2, d3}, [r3]!         ;store result
+    vmov            q0, q4
+    vst1.u8         {d4, d5, d6, d7}, [r3]!
+
+    bne             vp8e_filt_blk2d_fp16x16s_loop_neon
+
+    b               sub_pixel_variance16x16s_neon
+
+;--------------------
+firstpass_bfilter16x16s_only
+    mov             r2, #2                  ;loop counter
+    sub             sp, sp, #256            ;reserve space on stack for temporary storage
+    mov             r3, sp
+
+;First Pass: output_height lines x output_width columns (16x16)
+vp8e_filt_blk2d_fpo16x16s_loop_neon
+    vld1.u8         {d0, d1, d2, d3}, [r0], r1      ;load src data
+    vld1.u8         {d4, d5, d6, d7}, [r0], r1
+    vld1.u8         {d8, d9, d10, d11}, [r0], r1
+    vld1.u8         {d12, d13, d14, d15}, [r0], r1
+
+    ;pld                [r0]
+    ;pld                [r0, r1]
+    ;pld                [r0, r1, lsl #1]
+
+    vext.8          q1, q0, q1, #1          ;construct src_ptr[1]
+    vld1.u8         {d16, d17, d18, d19}, [r0], r1
+    vext.8          q3, q2, q3, #1
+    vld1.u8         {d20, d21, d22, d23}, [r0], r1
+    vext.8          q5, q4, q5, #1
+    vld1.u8         {d24, d25, d26, d27}, [r0], r1
+    vext.8          q7, q6, q7, #1
+    vld1.u8         {d28, d29, d30, d31}, [r0], r1
+    vext.8          q9, q8, q9, #1
+    vext.8          q11, q10, q11, #1
+    vext.8          q13, q12, q13, #1
+    vext.8          q15, q14, q15, #1
+
+    vrhadd.u8       q0, q0, q1              ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+    vrhadd.u8       q1, q2, q3
+    vrhadd.u8       q2, q4, q5
+    vrhadd.u8       q3, q6, q7
+    vrhadd.u8       q4, q8, q9
+    vrhadd.u8       q5, q10, q11
+    vrhadd.u8       q6, q12, q13
+    vrhadd.u8       q7, q14, q15
+
+    subs            r2, r2, #1
+
+    vst1.u8         {d0, d1, d2, d3}, [r3]!         ;store result
+    vst1.u8         {d4, d5, d6, d7}, [r3]!
+    vst1.u8         {d8, d9, d10, d11}, [r3]!
+    vst1.u8         {d12, d13, d14, d15}, [r3]!
+
+    bne             vp8e_filt_blk2d_fpo16x16s_loop_neon
+
+    b               sub_pixel_variance16x16s_neon
+
+;---------------------
+secondpass_bfilter16x16s_only
+    sub             sp, sp, #256            ;reserve space on stack for temporary storage
+
+    mov             r2, #2                  ;loop counter
+    vld1.u8         {d0, d1}, [r0], r1      ;load src data
+    mov             r3, sp
+
+vp8e_filt_blk2d_spo16x16s_loop_neon
+    vld1.u8         {d2, d3}, [r0], r1
+    vld1.u8         {d4, d5}, [r0], r1
+    vld1.u8         {d6, d7}, [r0], r1
+    vld1.u8         {d8, d9}, [r0], r1
+
+    vrhadd.u8       q0, q0, q1
+    vld1.u8         {d10, d11}, [r0], r1
+    vrhadd.u8       q1, q1, q2
+    vld1.u8         {d12, d13}, [r0], r1
+    vrhadd.u8       q2, q2, q3
+    vld1.u8         {d14, d15}, [r0], r1
+    vrhadd.u8       q3, q3, q4
+    vld1.u8         {d16, d17}, [r0], r1
+    vrhadd.u8       q4, q4, q5
+    vrhadd.u8       q5, q5, q6
+    vrhadd.u8       q6, q6, q7
+    vrhadd.u8       q7, q7, q8
+
+    subs            r2, r2, #1
+
+    vst1.u8         {d0, d1, d2, d3}, [r3]!         ;store result
+    vmov            q0, q8
+    vst1.u8         {d4, d5, d6, d7}, [r3]!
+    vst1.u8         {d8, d9, d10, d11}, [r3]!           ;store result
+    vst1.u8         {d12, d13, d14, d15}, [r3]!
+
+    bne             vp8e_filt_blk2d_spo16x16s_loop_neon
+
+    b               sub_pixel_variance16x16s_neon
+
+;----------------------------
+;variance16x16
+sub_pixel_variance16x16s_neon
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+    sub             r3, r3, #256
+    mov             r2, #4
+
+sub_pixel_variance16x16s_neon_loop
+    vld1.8          {q0}, [r3]!                 ;Load up source and reference
+    vld1.8          {q1}, [r4], r12
+    vld1.8          {q2}, [r3]!
+    vld1.8          {q3}, [r4], r12
+    vld1.8          {q4}, [r3]!
+    vld1.8          {q5}, [r4], r12
+    vld1.8          {q6}, [r3]!
+    vld1.8          {q7}, [r4], r12
+
+    vsubl.u8        q11, d0, d2                 ;diff
+    vsubl.u8        q12, d1, d3
+    vsubl.u8        q13, d4, d6
+    vsubl.u8        q14, d5, d7
+    vsubl.u8        q0, d8, d10
+    vsubl.u8        q1, d9, d11
+    vsubl.u8        q2, d12, d14
+    vsubl.u8        q3, d13, d15
+
+    vpadal.s16      q8, q11                     ;sum
+    vmlal.s16       q9, d22, d22                ;sse
+    vmlal.s16       q10, d23, d23
+
+    subs            r2, r2, #1
+
+    vpadal.s16      q8, q12
+    vmlal.s16       q9, d24, d24
+    vmlal.s16       q10, d25, d25
+    vpadal.s16      q8, q13
+    vmlal.s16       q9, d26, d26
+    vmlal.s16       q10, d27, d27
+    vpadal.s16      q8, q14
+    vmlal.s16       q9, d28, d28
+    vmlal.s16       q10, d29, d29
+
+    vpadal.s16      q8, q0                     ;sum
+    vmlal.s16       q9, d0, d0                ;sse
+    vmlal.s16       q10, d1, d1
+    vpadal.s16      q8, q1
+    vmlal.s16       q9, d2, d2
+    vmlal.s16       q10, d3, d3
+    vpadal.s16      q8, q2
+    vmlal.s16       q9, d4, d4
+    vmlal.s16       q10, d5, d5
+    vpadal.s16      q8, q3
+    vmlal.s16       q9, d6, d6
+    vmlal.s16       q10, d7, d7
+
+    bne             sub_pixel_variance16x16s_neon_loop
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [lr]               ;store sse
+    vshr.s32        d10, d10, #8
+    vsub.s32        d0, d1, d10
+
+    add             sp, sp, #256
+    vmov.32         r0, d0[0]                   ;return
+
+    pop             {r4, pc}
+    ENDP
+
+    END
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp8_subpixelvariance8x8_neon.asm
@@ -1,0 +1,224 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_sub_pixel_variance8x8_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0    unsigned char  *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; stack(r4) unsigned char *dst_ptr,
+; stack(r5) int dst_pixels_per_line,
+; stack(r6) unsigned int *sse
+;note: most of the code is copied from bilinear_predict8x8_neon and vp9_variance8x8_neon.
+
+|vp9_sub_pixel_variance8x8_neon| PROC
+    push            {r4-r5, lr}
+
+    ldr             r12, _BilinearTaps_coeff_
+    ldr             r4, [sp, #12]           ;load *dst_ptr from stack
+    ldr             r5, [sp, #16]           ;load dst_pixels_per_line from stack
+    ldr             lr, [sp, #20]           ;load *sse from stack
+
+    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
+    beq             skip_firstpass_filter
+
+;First pass: output_height lines x output_width columns (9x8)
+    add             r2, r12, r2, lsl #3     ;calculate filter location
+
+    vld1.u8         {q1}, [r0], r1          ;load src data
+    vld1.u32        {d31}, [r2]             ;load first_pass filter
+    vld1.u8         {q2}, [r0], r1
+    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1)
+    vld1.u8         {q3}, [r0], r1
+    vdup.8          d1, d31[4]
+    vld1.u8         {q4}, [r0], r1
+
+    vmull.u8        q6, d2, d0              ;(src_ptr[0] * Filter[0])
+    vmull.u8        q7, d4, d0
+    vmull.u8        q8, d6, d0
+    vmull.u8        q9, d8, d0
+
+    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
+    vext.8          d5, d4, d5, #1
+    vext.8          d7, d6, d7, #1
+    vext.8          d9, d8, d9, #1
+
+    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * Filter[1])
+    vmlal.u8        q7, d5, d1
+    vmlal.u8        q8, d7, d1
+    vmlal.u8        q9, d9, d1
+
+    vld1.u8         {q1}, [r0], r1          ;load src data
+    vqrshrn.u16    d22, q6, #7              ;shift/round/saturate to u8
+    vld1.u8         {q2}, [r0], r1
+    vqrshrn.u16    d23, q7, #7
+    vld1.u8         {q3}, [r0], r1
+    vqrshrn.u16    d24, q8, #7
+    vld1.u8         {q4}, [r0], r1
+    vqrshrn.u16    d25, q9, #7
+
+    ;first_pass filtering on the remaining 5 lines of data
+    vld1.u8         {q5}, [r0], r1
+
+    vmull.u8        q6, d2, d0              ;(src_ptr[0] * Filter[0])
+    vmull.u8        q7, d4, d0
+    vmull.u8        q8, d6, d0
+    vmull.u8        q9, d8, d0
+    vmull.u8        q10, d10, d0
+
+    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
+    vext.8          d5, d4, d5, #1
+    vext.8          d7, d6, d7, #1
+    vext.8          d9, d8, d9, #1
+    vext.8          d11, d10, d11, #1
+
+    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * Filter[1])
+    vmlal.u8        q7, d5, d1
+    vmlal.u8        q8, d7, d1
+    vmlal.u8        q9, d9, d1
+    vmlal.u8        q10, d11, d1
+
+    vqrshrn.u16    d26, q6, #7              ;shift/round/saturate to u8
+    vqrshrn.u16    d27, q7, #7
+    vqrshrn.u16    d28, q8, #7
+    vqrshrn.u16    d29, q9, #7
+    vqrshrn.u16    d30, q10, #7
+
+;Second pass: 8x8
+secondpass_filter
+    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
+    ;skip_secondpass_filter
+    beq             sub_pixel_variance8x8_neon
+
+    add             r3, r12, r3, lsl #3
+
+    vld1.u32        {d31}, [r3]             ;load second_pass filter
+
+    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
+    vdup.8          d1, d31[4]
+
+    vmull.u8        q1, d22, d0             ;(src_ptr[0] * Filter[0])
+    vmull.u8        q2, d23, d0
+    vmull.u8        q3, d24, d0
+    vmull.u8        q4, d25, d0
+    vmull.u8        q5, d26, d0
+    vmull.u8        q6, d27, d0
+    vmull.u8        q7, d28, d0
+    vmull.u8        q8, d29, d0
+
+    vmlal.u8        q1, d23, d1             ;(src_ptr[pixel_step] * Filter[1])
+    vmlal.u8        q2, d24, d1
+    vmlal.u8        q3, d25, d1
+    vmlal.u8        q4, d26, d1
+    vmlal.u8        q5, d27, d1
+    vmlal.u8        q6, d28, d1
+    vmlal.u8        q7, d29, d1
+    vmlal.u8        q8, d30, d1
+
+    vqrshrn.u16    d22, q1, #7              ;shift/round/saturate to u8
+    vqrshrn.u16    d23, q2, #7
+    vqrshrn.u16    d24, q3, #7
+    vqrshrn.u16    d25, q4, #7
+    vqrshrn.u16    d26, q5, #7
+    vqrshrn.u16    d27, q6, #7
+    vqrshrn.u16    d28, q7, #7
+    vqrshrn.u16    d29, q8, #7
+
+    b               sub_pixel_variance8x8_neon
+
+;--------------------
+skip_firstpass_filter
+    vld1.u8         {d22}, [r0], r1         ;load src data
+    vld1.u8         {d23}, [r0], r1
+    vld1.u8         {d24}, [r0], r1
+    vld1.u8         {d25}, [r0], r1
+    vld1.u8         {d26}, [r0], r1
+    vld1.u8         {d27}, [r0], r1
+    vld1.u8         {d28}, [r0], r1
+    vld1.u8         {d29}, [r0], r1
+    vld1.u8         {d30}, [r0], r1
+
+    b               secondpass_filter
+
+;----------------------
+;vp9_variance8x8_neon
+sub_pixel_variance8x8_neon
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+    mov             r12, #2
+
+sub_pixel_variance8x8_neon_loop
+    vld1.8          {d0}, [r4], r5              ;load dst data
+    subs            r12, r12, #1
+    vld1.8          {d1}, [r4], r5
+    vld1.8          {d2}, [r4], r5
+    vsubl.u8        q4, d22, d0                 ;calculate diff
+    vld1.8          {d3}, [r4], r5
+
+    vsubl.u8        q5, d23, d1
+    vsubl.u8        q6, d24, d2
+
+    vpadal.s16      q8, q4                      ;sum
+    vmlal.s16       q9, d8, d8                  ;sse
+    vmlal.s16       q10, d9, d9
+
+    vsubl.u8        q7, d25, d3
+
+    vpadal.s16      q8, q5
+    vmlal.s16       q9, d10, d10
+    vmlal.s16       q10, d11, d11
+
+    vmov            q11, q13
+
+    vpadal.s16      q8, q6
+    vmlal.s16       q9, d12, d12
+    vmlal.s16       q10, d13, d13
+
+    vmov            q12, q14
+
+    vpadal.s16      q8, q7
+    vmlal.s16       q9, d14, d14
+    vmlal.s16       q10, d15, d15
+
+    bne             sub_pixel_variance8x8_neon_loop
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
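+;variance = sse - sum*sum/64, where 64 is the pixel count of the 8x8 block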
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [lr]               ;store sse
+    vshr.s32        d10, d10, #6
+    vsub.s32        d0, d1, d10
+
+    vmov.32         r0, d0[0]                   ;return
+    pop             {r4-r5, pc}
+
+    ENDP
+
+;-----------------
+
+_BilinearTaps_coeff_
+    DCD     bilinear_taps_coeff
+bilinear_taps_coeff
+    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
+
+    END
--- /dev/null
+++ b/vp9/encoder/arm/quantize_arm.c
@@ -1,0 +1,59 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <math.h>
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp9/encoder/quantize.h"
+#include "vp9/common/entropy.h"
+
+
+#if HAVE_ARMV7
+
+/* The vp8_quantize_mbX functions here differ from the corresponding ones in
+ * quantize.c only by using the quantize_b_pair function pointer instead of
+ * the regular quantize_b function pointer. */
+void vp8_quantize_mby_neon(MACROBLOCK *x) {
+  int i;
+  int has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
+                       && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
+
+  for (i = 0; i < 16; i += 2)
+    x->quantize_b_pair(&x->block[i], &x->block[i + 1],
+                       &x->e_mbd.block[i], &x->e_mbd.block[i + 1]);
+
+  if (has_2nd_order)
+    x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
+}
+
+void vp8_quantize_mb_neon(MACROBLOCK *x) {
+  int i;
+  int has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
+                       && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
+
+  for (i = 0; i < 24; i += 2)
+    x->quantize_b_pair(&x->block[i], &x->block[i + 1],
+                       &x->e_mbd.block[i], &x->e_mbd.block[i + 1]);
+
+  if (has_2nd_order)
+    x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
+}
+
+
+void vp8_quantize_mbuv_neon(MACROBLOCK *x) {
+  int i;
+
+  for (i = 16; i < 24; i += 2)
+    x->quantize_b_pair(&x->block[i], &x->block[i + 1],
+                       &x->e_mbd.block[i], &x->e_mbd.block[i + 1]);
+}
+
+#endif /* HAVE_ARMV7 */
--- /dev/null
+++ b/vp9/encoder/arm/quantize_arm.h
@@ -1,0 +1,52 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef QUANTIZE_ARM_H
+#define QUANTIZE_ARM_H
+
+#if HAVE_ARMV6
+
+extern prototype_quantize_block(vp8_fast_quantize_b_armv6);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp8_quantize_fastquantb
+#define vp8_quantize_fastquantb vp8_fast_quantize_b_armv6
+#endif
+
+#endif /* HAVE_ARMV6 */
+
+
+#if HAVE_ARMV7
+
+extern prototype_quantize_block(vp8_fast_quantize_b_neon);
+extern prototype_quantize_block_pair(vp8_fast_quantize_b_pair_neon);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp8_quantize_fastquantb
+#define vp8_quantize_fastquantb vp8_fast_quantize_b_neon
+
+#undef  vp8_quantize_fastquantb_pair
+#define vp8_quantize_fastquantb_pair vp8_fast_quantize_b_pair_neon
+
+#undef vp8_quantize_mb
+#define vp8_quantize_mb vp8_quantize_mb_neon
+
+#undef vp8_quantize_mbuv
+#define vp8_quantize_mbuv vp8_quantize_mbuv_neon
+
+#undef vp8_quantize_mby
+#define vp8_quantize_mby vp8_quantize_mby_neon
+#endif
+
+#endif /* HAVE_ARMV7 */
+
+#endif
+
--- /dev/null
+++ b/vp9/encoder/arm/variance_arm.c
@@ -1,0 +1,112 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp9/encoder/variance.h"
+#include "vp9/common/filter.h"
+#include "vp9/common/arm/bilinearfilter_arm.h"
+
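+/* HALFNDX is the index of the half-pixel position in vp8_bilinear_filters
+ * (the tap pair with two equal taps); the half-pel cases are special-cased
+ * below to the dedicated halfpixvar kernels instead of the generic two-pass
+ * bilinear filter. */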
+#define HALFNDX 8
+
+#if HAVE_ARMV6
+
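+/* Sub-pixel variance is computed in two bilinear passes: the first filters
+ * horizontally into an (h + 1)-row intermediate buffer, the second filters
+ * vertically, and the result is compared against dst_ptr with the
+ * integer-pel variance kernel. */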
+unsigned int vp9_sub_pixel_variance8x8_armv6
+(
+  const unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  const unsigned char *dst_ptr,
+  int dst_pixels_per_line,
+  unsigned int *sse
+) {
+  unsigned short first_pass[10 * 8];
+  unsigned char  second_pass[8 * 8];
+  const short *HFilter, *VFilter;
+
+  HFilter = vp8_bilinear_filters[xoffset];
+  VFilter = vp8_bilinear_filters[yoffset];
+
+  vp9_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
+                                          src_pixels_per_line,
+                                          9, 8, HFilter);
+  vp9_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
+                                           8, 8, 8, VFilter);
+
+  return vp9_variance8x8_armv6(second_pass, 8, dst_ptr,
+                               dst_pixels_per_line, sse);
+}
+
+unsigned int vp9_sub_pixel_variance16x16_armv6
+(
+  const unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  const unsigned char *dst_ptr,
+  int dst_pixels_per_line,
+  unsigned int *sse
+) {
+  unsigned short first_pass[36 * 16];
+  unsigned char  second_pass[20 * 16];
+  const short *HFilter, *VFilter;
+  unsigned int var;
+
+  if (xoffset == HALFNDX && yoffset == 0) {
+    var = vp9_variance_halfpixvar16x16_h_armv6(src_ptr, src_pixels_per_line,
+                                               dst_ptr, dst_pixels_per_line, sse);
+  } else if (xoffset == 0 && yoffset == HALFNDX) {
+    var = vp9_variance_halfpixvar16x16_v_armv6(src_ptr, src_pixels_per_line,
+                                               dst_ptr, dst_pixels_per_line, sse);
+  } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
+    var = vp9_variance_halfpixvar16x16_hv_armv6(src_ptr, src_pixels_per_line,
+                                                dst_ptr, dst_pixels_per_line, sse);
+  } else {
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
+
+    vp9_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
+                                            src_pixels_per_line,
+                                            17, 16, HFilter);
+    vp9_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
+                                             16, 16, 16, VFilter);
+
+    var = vp9_variance16x16_armv6(second_pass, 16, dst_ptr,
+                                  dst_pixels_per_line, sse);
+  }
+  return var;
+}
+
+#endif /* HAVE_ARMV6 */
+
+
+#if HAVE_ARMV7
+
+unsigned int vp9_sub_pixel_variance16x16_neon
+(
+  const unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  const unsigned char *dst_ptr,
+  int dst_pixels_per_line,
+  unsigned int *sse
+) {
+  if (xoffset == HALFNDX && yoffset == 0)
+    return vp9_variance_halfpixvar16x16_h_neon(src_ptr, src_pixels_per_line,
+                                               dst_ptr, dst_pixels_per_line,
+                                               sse);
+  else if (xoffset == 0 && yoffset == HALFNDX)
+    return vp9_variance_halfpixvar16x16_v_neon(src_ptr, src_pixels_per_line,
+                                               dst_ptr, dst_pixels_per_line,
+                                               sse);
+  else if (xoffset == HALFNDX && yoffset == HALFNDX)
+    return vp9_variance_halfpixvar16x16_hv_neon(src_ptr, src_pixels_per_line,
+                                                dst_ptr, dst_pixels_per_line,
+                                                sse);
+  else
+    return vp9_sub_pixel_variance16x16_neon_func(src_ptr, src_pixels_per_line,
+                                                 xoffset, yoffset, dst_ptr,
+                                                 dst_pixels_per_line, sse);
+}
+
+#endif
--- /dev/null
+++ b/vp9/encoder/arm/variance_arm.h
@@ -1,0 +1,132 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VARIANCE_ARM_H
+#define VARIANCE_ARM_H
+
+#if HAVE_ARMV6
+
+extern prototype_sad(vp9_sad16x16_armv6);
+extern prototype_variance(vp9_variance16x16_armv6);
+extern prototype_variance(vp9_variance8x8_armv6);
+extern prototype_subpixvariance(vp9_sub_pixel_variance16x16_armv6);
+extern prototype_subpixvariance(vp9_sub_pixel_variance8x8_armv6);
+extern prototype_variance(vp9_variance_halfpixvar16x16_h_armv6);
+extern prototype_variance(vp9_variance_halfpixvar16x16_v_armv6);
+extern prototype_variance(vp9_variance_halfpixvar16x16_hv_armv6);
+extern prototype_variance(vp9_mse16x16_armv6);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef  vp9_variance_sad16x16
+#define vp9_variance_sad16x16 vp9_sad16x16_armv6
+
+#undef  vp9_variance_subpixvar16x16
+#define vp9_variance_subpixvar16x16 vp9_sub_pixel_variance16x16_armv6
+
+#undef  vp9_variance_subpixvar8x8
+#define vp9_variance_subpixvar8x8 vp9_sub_pixel_variance8x8_armv6
+
+#undef  vp9_variance_var16x16
+#define vp9_variance_var16x16 vp9_variance16x16_armv6
+
+#undef  vp9_variance_mse16x16
+#define vp9_variance_mse16x16 vp9_mse16x16_armv6
+
+#undef  vp9_variance_var8x8
+#define vp9_variance_var8x8 vp9_variance8x8_armv6
+
+#undef  vp9_variance_halfpixvar16x16_h
+#define vp9_variance_halfpixvar16x16_h vp9_variance_halfpixvar16x16_h_armv6
+
+#undef  vp9_variance_halfpixvar16x16_v
+#define vp9_variance_halfpixvar16x16_v vp9_variance_halfpixvar16x16_v_armv6
+
+#undef  vp9_variance_halfpixvar16x16_hv
+#define vp9_variance_halfpixvar16x16_hv vp9_variance_halfpixvar16x16_hv_armv6
+
+#endif /* !CONFIG_RUNTIME_CPU_DETECT */
+
+#endif /* HAVE_ARMV6 */
+
+
+#if HAVE_ARMV7
+extern prototype_sad(vp9_sad4x4_neon);
+extern prototype_sad(vp9_sad8x8_neon);
+extern prototype_sad(vp9_sad8x16_neon);
+extern prototype_sad(vp9_sad16x8_neon);
+extern prototype_sad(vp9_sad16x16_neon);
+
+extern prototype_variance(vp9_variance8x8_neon);
+extern prototype_variance(vp9_variance8x16_neon);
+extern prototype_variance(vp9_variance16x8_neon);
+extern prototype_variance(vp9_variance16x16_neon);
+
+extern prototype_subpixvariance(vp9_sub_pixel_variance8x8_neon);
+extern prototype_subpixvariance(vp9_sub_pixel_variance16x16_neon);
+extern prototype_subpixvariance(vp9_sub_pixel_variance16x16_neon_func);
+extern prototype_variance(vp9_variance_halfpixvar16x16_h_neon);
+extern prototype_variance(vp9_variance_halfpixvar16x16_v_neon);
+extern prototype_variance(vp9_variance_halfpixvar16x16_hv_neon);
+
+extern prototype_variance(vp9_mse16x16_neon);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp9_variance_sad4x4
+#define vp9_variance_sad4x4 vp9_sad4x4_neon
+
+#undef  vp9_variance_sad8x8
+#define vp9_variance_sad8x8 vp9_sad8x8_neon
+
+#undef  vp9_variance_sad8x16
+#define vp9_variance_sad8x16 vp9_sad8x16_neon
+
+#undef  vp9_variance_sad16x8
+#define vp9_variance_sad16x8 vp9_sad16x8_neon
+
+#undef  vp9_variance_sad16x16
+#define vp9_variance_sad16x16 vp9_sad16x16_neon
+
+#undef  vp9_variance_var8x8
+#define vp9_variance_var8x8 vp9_variance8x8_neon
+
+#undef  vp9_variance_var8x16
+#define vp9_variance_var8x16 vp9_variance8x16_neon
+
+#undef  vp9_variance_var16x8
+#define vp9_variance_var16x8 vp9_variance16x8_neon
+
+#undef  vp9_variance_var16x16
+#define vp9_variance_var16x16 vp9_variance16x16_neon
+
+#undef  vp9_variance_subpixvar8x8
+#define vp9_variance_subpixvar8x8 vp9_sub_pixel_variance8x8_neon
+
+#undef  vp9_variance_subpixvar16x16
+#define vp9_variance_subpixvar16x16 vp9_sub_pixel_variance16x16_neon
+
+#undef  vp9_variance_halfpixvar16x16_h
+#define vp9_variance_halfpixvar16x16_h vp9_variance_halfpixvar16x16_h_neon
+
+#undef  vp9_variance_halfpixvar16x16_v
+#define vp9_variance_halfpixvar16x16_v vp9_variance_halfpixvar16x16_v_neon
+
+#undef  vp9_variance_halfpixvar16x16_hv
+#define vp9_variance_halfpixvar16x16_hv vp9_variance_halfpixvar16x16_hv_neon
+
+#undef  vp9_variance_mse16x16
+#define vp9_variance_mse16x16 vp9_mse16x16_neon
+
+#endif
+
+#endif
+
+#endif
--- /dev/null
+++ b/vp9/encoder/asm_enc_offsets.c
@@ -1,0 +1,90 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/asm_offsets.h"
+#include "vpx_config.h"
+#include "block.h"
+#include "vp9/common/blockd.h"
+#include "onyx_int.h"
+#include "treewriter.h"
+#include "tokenize.h"
+
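+/* Each DEFINE() below emits a named constant into the generated assembly
+ * offsets file, so the hand-written encoder assembly stays in sync with the
+ * C structure layout. */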
+BEGIN
+
+/* regular quantize */
+DEFINE(vp9_block_coeff,                         offsetof(BLOCK, coeff));
+DEFINE(vp9_block_zbin,                          offsetof(BLOCK, zbin));
+DEFINE(vp9_block_round,                         offsetof(BLOCK, round));
+DEFINE(vp9_block_quant,                         offsetof(BLOCK, quant));
+DEFINE(vp9_block_quant_fast,                    offsetof(BLOCK, quant_fast));
+DEFINE(vp9_block_zbin_extra,                    offsetof(BLOCK, zbin_extra));
+DEFINE(vp9_block_zrun_zbin_boost,               offsetof(BLOCK, zrun_zbin_boost));
+DEFINE(vp9_block_quant_shift,                   offsetof(BLOCK, quant_shift));
+
+DEFINE(vp9_blockd_qcoeff,                       offsetof(BLOCKD, qcoeff));
+DEFINE(vp9_blockd_dequant,                      offsetof(BLOCKD, dequant));
+DEFINE(vp9_blockd_dqcoeff,                      offsetof(BLOCKD, dqcoeff));
+DEFINE(vp9_blockd_eob,                          offsetof(BLOCKD, eob));
+
+/* subtract */
+DEFINE(vp9_block_base_src,                      offsetof(BLOCK, base_src));
+DEFINE(vp9_block_src,                           offsetof(BLOCK, src));
+DEFINE(vp9_block_src_diff,                      offsetof(BLOCK, src_diff));
+DEFINE(vp9_block_src_stride,                    offsetof(BLOCK, src_stride));
+
+DEFINE(vp9_blockd_predictor,                    offsetof(BLOCKD, predictor));
+
+/* pack tokens */
+DEFINE(vp9_writer_lowvalue,                     offsetof(vp9_writer, lowvalue));
+DEFINE(vp9_writer_range,                        offsetof(vp9_writer, range));
+DEFINE(vp9_writer_value,                        offsetof(vp9_writer, value));
+DEFINE(vp9_writer_count,                        offsetof(vp9_writer, count));
+DEFINE(vp9_writer_pos,                          offsetof(vp9_writer, pos));
+DEFINE(vp9_writer_buffer,                       offsetof(vp9_writer, buffer));
+
+DEFINE(tokenextra_token,                        offsetof(TOKENEXTRA, Token));
+DEFINE(tokenextra_extra,                        offsetof(TOKENEXTRA, Extra));
+DEFINE(tokenextra_context_tree,                 offsetof(TOKENEXTRA, context_tree));
+DEFINE(tokenextra_skip_eob_node,                offsetof(TOKENEXTRA, skip_eob_node));
+DEFINE(TOKENEXTRA_SZ,                           sizeof(TOKENEXTRA));
+
+DEFINE(vp9_extra_bit_struct_sz,                 sizeof(vp9_extra_bit_struct));
+
+DEFINE(vp9_token_value,                         offsetof(vp9_token, value));
+DEFINE(vp9_token_len,                           offsetof(vp9_token, Len));
+
+DEFINE(vp9_extra_bit_struct_tree,               offsetof(vp9_extra_bit_struct, tree));
+DEFINE(vp9_extra_bit_struct_prob,               offsetof(vp9_extra_bit_struct, prob));
+DEFINE(vp9_extra_bit_struct_len,                offsetof(vp9_extra_bit_struct, Len));
+DEFINE(vp9_extra_bit_struct_base_val,           offsetof(vp9_extra_bit_struct, base_val));
+
+DEFINE(vp9_comp_tplist,                         offsetof(VP9_COMP, tplist));
+DEFINE(vp9_comp_common,                         offsetof(VP9_COMP, common));
+
+DEFINE(tokenlist_start,                         offsetof(TOKENLIST, start));
+DEFINE(tokenlist_stop,                          offsetof(TOKENLIST, stop));
+DEFINE(TOKENLIST_SZ,                            sizeof(TOKENLIST));
+
+DEFINE(vp9_common_mb_rows,                      offsetof(VP9_COMMON, mb_rows));
+
+END
+
+/* Add asserts for any offset that is not supported by assembly code, and
+ * for any size that is not supported by assembly code.
+ *
+ * These are used in vp8cx_pack_tokens.  They are hard coded so if their
+ * sizes change they will have to be adjusted.
+ */
+
+#if HAVE_ARMV5TE
+ct_assert(TOKENEXTRA_SZ, sizeof(TOKENEXTRA) == 8)
+ct_assert(vp9_extra_bit_struct_sz, sizeof(vp9_extra_bit_struct) == 16)
+#endif
--- /dev/null
+++ b/vp9/encoder/bitstream.c
@@ -1,0 +1,2394 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp9/common/header.h"
+#include "encodemv.h"
+#include "vp9/common/entropymode.h"
+#include "vp9/common/findnearmv.h"
+#include "mcomp.h"
+#include "vp9/common/systemdependent.h"
+#include <assert.h>
+#include <stdio.h>
+#include <limits.h>
+#include "vp9/common/pragmas.h"
+#include "vpx/vpx_encoder.h"
+#include "vpx_mem/vpx_mem.h"
+#include "bitstream.h"
+#include "segmentation.h"
+
+#include "vp9/common/seg_common.h"
+#include "vp9/common/pred_common.h"
+#include "vp9/common/entropy.h"
+#include "vp9/encoder/encodemv.h"
+#include "vp9/common/entropymv.h"
+
+#if CONFIG_NEWBESTREFMV
+#include "vp9/common/mvref_common.h"
+#endif
+
+#if defined(SECTIONBITS_OUTPUT)
+unsigned __int64 Sectionbits[500];
+#endif
+
+#ifdef ENTROPY_STATS
+int intra_mode_stats [VP9_BINTRAMODES] [VP9_BINTRAMODES] [VP9_BINTRAMODES];
+unsigned int tree_update_hist [BLOCK_TYPES]
+                              [COEF_BANDS]
+                              [PREV_COEF_CONTEXTS]
+                              [ENTROPY_NODES][2];
+unsigned int hybrid_tree_update_hist [BLOCK_TYPES]
+                                     [COEF_BANDS]
+                                     [PREV_COEF_CONTEXTS]
+                                     [ENTROPY_NODES][2];
+unsigned int tree_update_hist_8x8 [BLOCK_TYPES_8X8]
+                                  [COEF_BANDS]
+                                  [PREV_COEF_CONTEXTS]
+                                  [ENTROPY_NODES] [2];
+unsigned int hybrid_tree_update_hist_8x8 [BLOCK_TYPES_8X8]
+                                         [COEF_BANDS]
+                                         [PREV_COEF_CONTEXTS]
+                                         [ENTROPY_NODES] [2];
+unsigned int tree_update_hist_16x16 [BLOCK_TYPES_16X16]
+                                    [COEF_BANDS]
+                                    [PREV_COEF_CONTEXTS]
+                                    [ENTROPY_NODES] [2];
+unsigned int hybrid_tree_update_hist_16x16 [BLOCK_TYPES_16X16]
+                                           [COEF_BANDS]
+                                           [PREV_COEF_CONTEXTS]
+                                           [ENTROPY_NODES] [2];
+
+extern unsigned int active_section;
+#endif
+
+#ifdef MODE_STATS
+int count_mb_seg[4] = { 0, 0, 0, 0 };
+#endif
+
+#define vp9_cost_upd  ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)) >> 8)
+#define vp9_cost_upd256  ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)))
+
+#define SEARCH_NEWP
+static int update_bits[255];
+
+static void compute_update_table(void) {
+  int i;
+  for (i = 0; i < 255; i++)
+    update_bits[i] = vp9_count_term_subexp(i, SUBEXP_PARAM, 255);
+}
+
+static int split_index(int i, int n, int modulus) {
+  int max1 = (n - 1 - modulus / 2) / modulus + 1;
+  if (i % modulus == modulus / 2) i = i / modulus;
+  else i = max1 + i - (i + modulus - modulus / 2) / modulus;
+  return i;
+}
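+/* split_index() reorders indices so that those congruent to modulus/2
+ * (mod modulus) come first and therefore get the cheapest subexponential
+ * codes; e.g. with n = 10 and modulus = 3 (illustration only), indices
+ * 1, 4, 7 map to 0, 1, 2 and the remaining indices follow in order. */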
+
+static int remap_prob(int v, int m) {
+  const int n = 256;
+  const int modulus = MODULUS_PARAM;
+  int i;
+  if ((m << 1) <= n)
+    i = vp9_recenter_nonneg(v, m) - 1;
+  else
+    i = vp9_recenter_nonneg(n - 1 - v, n - 1 - m) - 1;
+
+  i = split_index(i, n - 1, modulus);
+  return i;
+}
+
+static void write_prob_diff_update(vp9_writer *const bc,
+                                   vp9_prob newp, vp9_prob oldp) {
+  int delp = remap_prob(newp, oldp);
+  vp9_encode_term_subexp(bc, delp, SUBEXP_PARAM, 255);
+}
+
+static int prob_diff_update_cost(vp9_prob newp, vp9_prob oldp) {
+  int delp = remap_prob(newp, oldp);
+  return update_bits[delp] * 256;
+}
+
+static void update_mode(
+  vp9_writer *const bc,
+  int n,
+  vp9_token tok               [/* n */],
+  vp9_tree tree,
+  vp9_prob Pnew               [/* n-1 */],
+  vp9_prob Pcur               [/* n-1 */],
+  unsigned int bct            [/* n-1 */] [2],
+  const unsigned int num_events[/* n */]
+) {
+  unsigned int new_b = 0, old_b = 0;
+  int i = 0;
+
+  vp9_tree_probs_from_distribution(
+    n--, tok, tree,
+    Pnew, bct, num_events,
+    256, 1
+  );
+
+  do {
+    new_b += cost_branch(bct[i], Pnew[i]);
+    old_b += cost_branch(bct[i], Pcur[i]);
+  } while (++i < n);
+
+  if (new_b + (n << 8) < old_b) {
+    int i = 0;
+
+    vp9_write_bit(bc, 1);
+
+    do {
+      const vp9_prob p = Pnew[i];
+
+      vp9_write_literal(bc, Pcur[i] = p ? p : 1, 8);
+    } while (++i < n);
+  } else
+    vp9_write_bit(bc, 0);
+}
+
+static void update_mbintra_mode_probs(VP9_COMP* const cpi,
+                                      vp9_writer* const bc) {
+  VP9_COMMON *const cm = &cpi->common;
+
+  {
+    vp9_prob Pnew   [VP9_YMODES - 1];
+    unsigned int bct [VP9_YMODES - 1] [2];
+
+    update_mode(
+      bc, VP9_YMODES, vp9_ymode_encodings, vp9_ymode_tree,
+      Pnew, cm->fc.ymode_prob, bct, (unsigned int *)cpi->ymode_count
+    );
+  }
+}
+
+static int get_prob(int num, int den) {
+  int p;
+  if (den <= 0)
+    return 128;
+  p = (num * 255 + (den >> 1)) / den;
+  if (p > 255)
+    return 255;
+  else if (p < 1)
+    return 1;
+  return p;
+}
+
+static int get_binary_prob(int n0, int n1) {
+  return get_prob(n0, n0 + n1);
+}
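+/* These are straight rounded ratios clamped to [1, 255], e.g.
+ * get_prob(3, 10) = (3 * 255 + 5) / 10 = 77 and
+ * get_binary_prob(1, 3) = get_prob(1, 4) = (255 + 2) / 4 = 64. */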
+
+void vp9_update_skip_probs(VP9_COMP *cpi) {
+  VP9_COMMON *const pc = &cpi->common;
+  int k;
+
+  for (k = 0; k < MBSKIP_CONTEXTS; ++k) {
+    pc->mbskip_pred_probs[k] = get_binary_prob(cpi->skip_false_count[k],
+                                               cpi->skip_true_count[k]);
+  }
+}
+
+static void update_switchable_interp_probs(VP9_COMP *cpi,
+                                           vp9_writer* const bc) {
+  VP9_COMMON *const pc = &cpi->common;
+  unsigned int branch_ct[32][2];
+  int i, j;
+  for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) {
+    vp9_tree_probs_from_distribution(
+        VP9_SWITCHABLE_FILTERS,
+        vp9_switchable_interp_encodings, vp9_switchable_interp_tree,
+        pc->fc.switchable_interp_prob[j], branch_ct,
+        cpi->switchable_interp_count[j], 256, 1);
+    for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) {
+      if (pc->fc.switchable_interp_prob[j][i] < 1)
+        pc->fc.switchable_interp_prob[j][i] = 1;
+      vp9_write_literal(bc, pc->fc.switchable_interp_prob[j][i], 8);
+    }
+  }
+}
+
+// This function updates the reference frame prediction stats
+static void update_refpred_stats(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  int i;
+  int tot_count;
+  vp9_prob new_pred_probs[PREDICTION_PROBS];
+  int old_cost, new_cost;
+
+  if (cm->frame_type == KEY_FRAME) {
+    // Set the prediction probabilities to defaults
+    cm->ref_pred_probs[0] = 120;
+    cm->ref_pred_probs[1] = 80;
+    cm->ref_pred_probs[2] = 40;
+
+    vpx_memset(cpi->ref_pred_probs_update, 0,
+               sizeof(cpi->ref_pred_probs_update));
+  } else {
+    // From the prediction counts set the probabilities for each context
+    for (i = 0; i < PREDICTION_PROBS; i++) {
+      new_pred_probs[i] = get_binary_prob(cpi->ref_pred_count[i][0],
+                                          cpi->ref_pred_count[i][1]);
+
+      // Decide whether or not to update the reference frame probs;
+      // the costs below are in 1/256 bit units.
+      old_cost =
+        (cpi->ref_pred_count[i][0] * vp9_cost_zero(cm->ref_pred_probs[i])) +
+        (cpi->ref_pred_count[i][1] * vp9_cost_one(cm->ref_pred_probs[i]));
+
+      new_cost =
+        (cpi->ref_pred_count[i][0] * vp9_cost_zero(new_pred_probs[i])) +
+        (cpi->ref_pred_count[i][1] * vp9_cost_one(new_pred_probs[i]));
+
+      // Cost saving must be >= 8 bits (2048 in these units)
+      if ((old_cost - new_cost) >= 2048) {
+        cpi->ref_pred_probs_update[i] = 1;
+        cm->ref_pred_probs[i] = new_pred_probs[i];
+      } else
+        cpi->ref_pred_probs_update[i] = 0;
+
+    }
+  }
+}
+
+static void update_mvcount(VP9_COMP *cpi, MACROBLOCK *x,
+                           int_mv *best_ref_mv, int_mv *second_best_ref_mv) {
+  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
+  MV mv;
+
+  if (mbmi->mode == SPLITMV) {
+    int i;
+
+    for (i = 0; i < x->partition_info->count; i++) {
+      if (x->partition_info->bmi[i].mode == NEW4X4) {
+        if (x->e_mbd.allow_high_precision_mv) {
+          mv.row = (x->partition_info->bmi[i].mv.as_mv.row
+                    - best_ref_mv->as_mv.row);
+          mv.col = (x->partition_info->bmi[i].mv.as_mv.col
+                    - best_ref_mv->as_mv.col);
+          vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount, 1);
+          if (x->e_mbd.mode_info_context->mbmi.second_ref_frame) {
+            mv.row = (x->partition_info->bmi[i].second_mv.as_mv.row
+                      - second_best_ref_mv->as_mv.row);
+            mv.col = (x->partition_info->bmi[i].second_mv.as_mv.col
+                      - second_best_ref_mv->as_mv.col);
+            vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv,
+                              &cpi->NMVcount, 1);
+          }
+        } else {
+          mv.row = (x->partition_info->bmi[i].mv.as_mv.row
+                    - best_ref_mv->as_mv.row);
+          mv.col = (x->partition_info->bmi[i].mv.as_mv.col
+                    - best_ref_mv->as_mv.col);
+          vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount, 0);
+          if (x->e_mbd.mode_info_context->mbmi.second_ref_frame) {
+            mv.row = (x->partition_info->bmi[i].second_mv.as_mv.row
+                      - second_best_ref_mv->as_mv.row);
+            mv.col = (x->partition_info->bmi[i].second_mv.as_mv.col
+                      - second_best_ref_mv->as_mv.col);
+            vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv,
+                              &cpi->NMVcount, 0);
+          }
+        }
+      }
+    }
+  } else if (mbmi->mode == NEWMV) {
+    if (x->e_mbd.allow_high_precision_mv) {
+      mv.row = (mbmi->mv[0].as_mv.row - best_ref_mv->as_mv.row);
+      mv.col = (mbmi->mv[0].as_mv.col - best_ref_mv->as_mv.col);
+      vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount, 1);
+      if (mbmi->second_ref_frame) {
+        mv.row = (mbmi->mv[1].as_mv.row - second_best_ref_mv->as_mv.row);
+        mv.col = (mbmi->mv[1].as_mv.col - second_best_ref_mv->as_mv.col);
+        vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv, &cpi->NMVcount, 1);
+      }
+    } else {
+      mv.row = (mbmi->mv[0].as_mv.row - best_ref_mv->as_mv.row);
+      mv.col = (mbmi->mv[0].as_mv.col - best_ref_mv->as_mv.col);
+      vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount, 0);
+      if (mbmi->second_ref_frame) {
+        mv.row = (mbmi->mv[1].as_mv.row - second_best_ref_mv->as_mv.row);
+        mv.col = (mbmi->mv[1].as_mv.col - second_best_ref_mv->as_mv.col);
+        vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv, &cpi->NMVcount, 0);
+      }
+    }
+  }
+}
+
+static void write_ymode(vp9_writer *bc, int m, const vp9_prob *p) {
+  write_token(bc, vp9_ymode_tree, p, vp9_ymode_encodings + m);
+}
+
+static void kfwrite_ymode(vp9_writer *bc, int m, const vp9_prob *p) {
+  write_token(bc, vp9_kf_ymode_tree, p, vp9_kf_ymode_encodings + m);
+}
+
+#if CONFIG_SUPERBLOCKS
+static void sb_kfwrite_ymode(vp9_writer *bc, int m, const vp9_prob *p) {
+  write_token(bc, vp9_uv_mode_tree, p, vp9_sb_kf_ymode_encodings + m);
+}
+#endif
+
+static void write_i8x8_mode(vp9_writer *bc, int m, const vp9_prob *p) {
+  write_token(bc, vp9_i8x8_mode_tree, p, vp9_i8x8_mode_encodings + m);
+}
+
+static void write_uv_mode(vp9_writer *bc, int m, const vp9_prob *p) {
+  write_token(bc, vp9_uv_mode_tree, p, vp9_uv_mode_encodings + m);
+}
+
+
+static void write_bmode(vp9_writer *bc, int m, const vp9_prob *p) {
+  write_token(bc, vp9_bmode_tree, p, vp9_bmode_encodings + m);
+}
+
+static void write_split(vp9_writer *bc, int x, const vp9_prob *p) {
+  write_token(bc, vp9_mbsplit_tree, p, vp9_mbsplit_encodings + x);
+}
+
+static int prob_update_savings(const unsigned int *ct,
+                               const vp9_prob oldp, const vp9_prob newp,
+                               const vp9_prob upd) {
+  const int old_b = cost_branch256(ct, oldp);
+  const int new_b = cost_branch256(ct, newp);
+  const int update_b = 2048 + vp9_cost_upd256;
+  return (old_b - new_b - update_b);
+}
+
+static int prob_diff_update_savings(const unsigned int *ct,
+                                    const vp9_prob oldp, const vp9_prob newp,
+                                    const vp9_prob upd) {
+  const int old_b = cost_branch256(ct, oldp);
+  const int new_b = cost_branch256(ct, newp);
+  const int update_b = (newp == oldp ? 0 :
+                        prob_diff_update_cost(newp, oldp) + vp9_cost_upd256);
+  return (old_b - new_b - update_b);
+}
+
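+/* Walks newp from the candidate *bestp back towards oldp one step at a
+ * time and keeps the probability whose coding gain minus the cost of
+ * signalling the update (all in 1/256 bit units) is largest. */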
+static int prob_diff_update_savings_search(const unsigned int *ct,
+                                           const vp9_prob oldp, vp9_prob *bestp,
+                                           const vp9_prob upd) {
+  const int old_b = cost_branch256(ct, oldp);
+  int new_b, update_b, savings, bestsavings, step;
+  vp9_prob newp, bestnewp;
+
+  bestsavings = 0;
+  bestnewp = oldp;
+
+  step = (*bestp > oldp ? -1 : 1);
+  for (newp = *bestp; newp != oldp; newp += step) {
+    new_b = cost_branch256(ct, newp);
+    update_b = prob_diff_update_cost(newp, oldp) + vp9_cost_upd256;
+    savings = old_b - new_b - update_b;
+    if (savings > bestsavings) {
+      bestsavings = savings;
+      bestnewp = newp;
+    }
+  }
+  *bestp = bestnewp;
+  return bestsavings;
+}
+
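+/* Boolean-coder write step, repeated inline below for speed: 'split'
+ * divides the current range in proportion to the node probability,
+ * renormalization shifts finished bits out through 'shift'/'count', and
+ * the while() loop over 0xff bytes propagates a carry back through the
+ * already-written part of the buffer. */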
+static void pack_mb_tokens(vp9_writer* const bc,
+                           TOKENEXTRA **tp,
+                           const TOKENEXTRA *const stop) {
+  unsigned int split;
+  unsigned int shift;
+  int count = bc->count;
+  unsigned int range = bc->range;
+  unsigned int lowvalue = bc->lowvalue;
+  TOKENEXTRA *p = *tp;
+
+  while (p < stop) {
+    const int t = p->Token;
+    vp9_token *const a = vp9_coef_encodings + t;
+    const vp9_extra_bit_struct *const b = vp9_extra_bits + t;
+    int i = 0;
+    const unsigned char *pp = p->context_tree;
+    int v = a->value;
+    int n = a->Len;
+
+    if (t == EOSB_TOKEN) {
+      ++p;
+      break;
+    }
+
+    /* skip one or two nodes */
+    if (p->skip_eob_node) {
+      n -= p->skip_eob_node;
+      i = 2 * p->skip_eob_node;
+    }
+
+    do {
+      const int bb = (v >> --n) & 1;
+      split = 1 + (((range - 1) * pp[i >> 1]) >> 8);
+      i = vp9_coef_tree[i + bb];
+
+      if (bb) {
+        lowvalue += split;
+        range = range - split;
+      } else {
+        range = split;
+      }
+
+      shift = vp9_norm[range];
+      range <<= shift;
+      count += shift;
+
+      if (count >= 0) {
+        int offset = shift - count;
+
+        if ((lowvalue << (offset - 1)) & 0x80000000) {
+          int x = bc->pos - 1;
+
+          while (x >= 0 && bc->buffer[x] == 0xff) {
+            bc->buffer[x] = (unsigned char)0;
+            x--;
+          }
+
+          bc->buffer[x] += 1;
+        }
+
+        bc->buffer[bc->pos++] = (lowvalue >> (24 - offset));
+        lowvalue <<= offset;
+        shift = count;
+        lowvalue &= 0xffffff;
+        count -= 8;
+      }
+
+      lowvalue <<= shift;
+    } while (n);
+
+    if (b->base_val) {
+      const int e = p->Extra, L = b->Len;
+
+      if (L) {
+        const unsigned char *pp = b->prob;
+        int v = e >> 1;
+        int n = L;              /* number of bits in v, assumed nonzero */
+        int i = 0;
+
+        do {
+          const int bb = (v >> --n) & 1;
+          split = 1 + (((range - 1) * pp[i >> 1]) >> 8);
+          i = b->tree[i + bb];
+
+          if (bb) {
+            lowvalue += split;
+            range = range - split;
+          } else {
+            range = split;
+          }
+
+          shift = vp9_norm[range];
+          range <<= shift;
+          count += shift;
+
+          if (count >= 0) {
+            int offset = shift - count;
+
+            if ((lowvalue << (offset - 1)) & 0x80000000) {
+              int x = bc->pos - 1;
+
+              while (x >= 0 && bc->buffer[x] == 0xff) {
+                bc->buffer[x] = (unsigned char)0;
+                x--;
+              }
+
+              bc->buffer[x] += 1;
+            }
+
+            bc->buffer[bc->pos++] = (lowvalue >> (24 - offset));
+            lowvalue <<= offset;
+            shift = count;
+            lowvalue &= 0xffffff;
+            count -= 8;
+          }
+
+          lowvalue <<= shift;
+        } while (n);
+      }
+
+      {
+        split = (range + 1) >> 1;
+
+        if (e & 1) {
+          lowvalue += split;
+          range = range - split;
+        } else {
+          range = split;
+        }
+
+        range <<= 1;
+
+        if ((lowvalue & 0x80000000)) {
+          int x = bc->pos - 1;
+
+          while (x >= 0 && bc->buffer[x] == 0xff) {
+            bc->buffer[x] = (unsigned char)0;
+            x--;
+          }
+
+          bc->buffer[x] += 1;
+        }
+
+        lowvalue <<= 1;
+
+        if (!++count) {
+          count = -8;
+          bc->buffer[bc->pos++] = (lowvalue >> 24);
+          lowvalue &= 0xffffff;
+        }
+      }
+
+    }
+    ++p;
+  }
+
+  bc->count = count;
+  bc->lowvalue = lowvalue;
+  bc->range = range;
+  *tp = p;
+}
+
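+/* Stores a partition size as three little-endian bytes; e.g. size 0x0301ff
+ * is written as ff 01 03. */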
+static void write_partition_size(unsigned char *cx_data, int size) {
+  signed char csize;
+
+  csize = size & 0xff;
+  *cx_data = csize;
+  csize = (size >> 8) & 0xff;
+  *(cx_data + 1) = csize;
+  csize = (size >> 16) & 0xff;
+  *(cx_data + 2) = csize;
+}
+
+static void write_mv_ref
+(
+  vp9_writer *bc, MB_PREDICTION_MODE m, const vp9_prob *p
+) {
+#if CONFIG_DEBUG
+  assert(NEARESTMV <= m  &&  m <= SPLITMV);
+#endif
+  write_token(bc, vp9_mv_ref_tree, p,
+              vp9_mv_ref_encoding_array - NEARESTMV + m);
+}
+
+#if CONFIG_SUPERBLOCKS
+static void write_sb_mv_ref(vp9_writer *bc, MB_PREDICTION_MODE m,
+                            const vp9_prob *p) {
+#if CONFIG_DEBUG
+  assert(NEARESTMV <= m  &&  m < SPLITMV);
+#endif
+  write_token(bc, vp9_sb_mv_ref_tree, p,
+              vp9_sb_mv_ref_encoding_array - NEARESTMV + m);
+}
+#endif
+
+static void write_sub_mv_ref
+(
+  vp9_writer *bc, B_PREDICTION_MODE m, const vp9_prob *p
+) {
+#if CONFIG_DEBUG
+  assert(LEFT4X4 <= m  &&  m <= NEW4X4);
+#endif
+  write_token(bc, vp9_sub_mv_ref_tree, p,
+              vp9_sub_mv_ref_encoding_array - LEFT4X4 + m);
+}
+
+static void write_nmv(vp9_writer *bc, const MV *mv, const int_mv *ref,
+                      const nmv_context *nmvc, int usehp) {
+  MV e;
+  e.row = mv->row - ref->as_mv.row;
+  e.col = mv->col - ref->as_mv.col;
+
+  vp9_encode_nmv(bc, &e, &ref->as_mv, nmvc);
+  vp9_encode_nmv_fp(bc, &e, &ref->as_mv, nmvc, usehp);
+}
+
+#if CONFIG_NEW_MVREF
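+/* The reference id is coded with a simple tree: id 0 -> "0", 1 -> "10",
+ * 2 -> "110", 3 -> "111", each branch conditioned on ref_id_probs[0..2];
+ * the two helpers below respectively cost and write that code. */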
+static int vp9_cost_mv_ref_id(vp9_prob * ref_id_probs, int mv_ref_id) {
+  int cost;
+
+  // Encode the index for the MV reference.
+  switch (mv_ref_id) {
+    case 0:
+      cost = vp9_cost_zero(ref_id_probs[0]);
+      break;
+    case 1:
+      cost = vp9_cost_one(ref_id_probs[0]);
+      cost += vp9_cost_zero(ref_id_probs[1]);
+      break;
+    case 2:
+      cost = vp9_cost_one(ref_id_probs[0]);
+      cost += vp9_cost_one(ref_id_probs[1]);
+      cost += vp9_cost_zero(ref_id_probs[2]);
+      break;
+    case 3:
+      cost = vp9_cost_one(ref_id_probs[0]);
+      cost += vp9_cost_one(ref_id_probs[1]);
+      cost += vp9_cost_one(ref_id_probs[2]);
+      break;
+
+      // TRAP.. This should not happen
+    default:
+      assert(0);
+      break;
+  }
+
+  return cost;
+}
+
+static void vp9_write_mv_ref_id(vp9_writer *w,
+                                vp9_prob * ref_id_probs,
+                                int mv_ref_id) {
+  // Encode the index for the MV reference.
+  switch (mv_ref_id) {
+    case 0:
+      vp9_write(w, 0, ref_id_probs[0]);
+      break;
+    case 1:
+      vp9_write(w, 1, ref_id_probs[0]);
+      vp9_write(w, 0, ref_id_probs[1]);
+      break;
+    case 2:
+      vp9_write(w, 1, ref_id_probs[0]);
+      vp9_write(w, 1, ref_id_probs[1]);
+      vp9_write(w, 0, ref_id_probs[2]);
+      break;
+    case 3:
+      vp9_write(w, 1, ref_id_probs[0]);
+      vp9_write(w, 1, ref_id_probs[1]);
+      vp9_write(w, 1, ref_id_probs[2]);
+      break;
+
+      // TRAP.. This should not happen
+    default:
+      assert(0);
+      break;
+  }
+}
+
+// Estimate the cost of coding the vector against each reference candidate
+static unsigned int pick_best_mv_ref(MACROBLOCK *x,
+                                     MV_REFERENCE_FRAME ref_frame,
+                                     int_mv target_mv,
+                                     int_mv * mv_ref_list,
+                                     int_mv * best_ref) {
+  int i;
+  int best_index = 0;
+  int cost, cost2;
+  int zero_seen = (mv_ref_list[0].as_int) ? FALSE : TRUE;
+  MACROBLOCKD *xd = &x->e_mbd;
+  int max_mv = MV_MAX;
+
+  cost = vp9_cost_mv_ref_id(xd->mb_mv_ref_id_probs[ref_frame], 0) +
+         vp9_mv_bit_cost(&target_mv,
+                         &mv_ref_list[0],
+                         XMVCOST, 96,
+                         xd->allow_high_precision_mv);
+
+
+  // Use 4 candidates for now: for (i = 1; i < MAX_MV_REFS; ++i) {
+  for (i = 1; i < 4; ++i) {
+    // If we see a 0,0 reference vector for a second time we have reached
+    // the end of the list of valid candidate vectors.
+    if (!mv_ref_list[i].as_int) {
+      if (zero_seen)
+        break;
+      else
+        zero_seen = TRUE;
+    }
+
+    // Check for cases where the reference choice would give rise to an
+    // uncodable/out of range residual for row or col.
+    if ((abs(target_mv.as_mv.row - mv_ref_list[i].as_mv.row) > max_mv) ||
+        (abs(target_mv.as_mv.col - mv_ref_list[i].as_mv.col) > max_mv)) {
+      continue;
+    }
+
+    cost2 = vp9_cost_mv_ref_id(xd->mb_mv_ref_id_probs[ref_frame], i) +
+            vp9_mv_bit_cost(&target_mv,
+                            &mv_ref_list[i],
+                            XMVCOST, 96,
+                            xd->allow_high_precision_mv);
+
+    if (cost2 < cost) {
+      cost = cost2;
+      best_index = i;
+    }
+  }
+
+  (*best_ref).as_int = mv_ref_list[best_index].as_int;
+
+  return best_index;
+}
+#endif
+
+// This function writes the current macroblock's segment id to the bitstream.
+// It should only be called if a segment map update is indicated.
+static void write_mb_segid(vp9_writer *bc,
+                           const MB_MODE_INFO *mi, const MACROBLOCKD *xd) {
+  // Encode the MB segment id.
+  int seg_id = mi->segment_id;
+#if CONFIG_SUPERBLOCKS
+  if (mi->encoded_as_sb) {
+    if (xd->mb_to_right_edge > 0)
+      seg_id = seg_id && xd->mode_info_context[1].mbmi.segment_id;
+    if (xd->mb_to_bottom_edge > 0) {
+      seg_id = seg_id &&
+               xd->mode_info_context[xd->mode_info_stride].mbmi.segment_id;
+      if (xd->mb_to_right_edge > 0)
+        seg_id = seg_id &&
+                xd->mode_info_context[xd->mode_info_stride + 1].mbmi.segment_id;
+    }
+  }
+#endif
+  if (xd->segmentation_enabled && xd->update_mb_segmentation_map) {
+    switch (seg_id) {
+      case 0:
+        vp9_write(bc, 0, xd->mb_segment_tree_probs[0]);
+        vp9_write(bc, 0, xd->mb_segment_tree_probs[1]);
+        break;
+      case 1:
+        vp9_write(bc, 0, xd->mb_segment_tree_probs[0]);
+        vp9_write(bc, 1, xd->mb_segment_tree_probs[1]);
+        break;
+      case 2:
+        vp9_write(bc, 1, xd->mb_segment_tree_probs[0]);
+        vp9_write(bc, 0, xd->mb_segment_tree_probs[2]);
+        break;
+      case 3:
+        vp9_write(bc, 1, xd->mb_segment_tree_probs[0]);
+        vp9_write(bc, 1, xd->mb_segment_tree_probs[2]);
+        break;
+
+        // TRAP.. This should not happen
+      default:
+        vp9_write(bc, 0, xd->mb_segment_tree_probs[0]);
+        vp9_write(bc, 0, xd->mb_segment_tree_probs[1]);
+        break;
+    }
+  }
+}
+
+// This function encodes the reference frame
+static void encode_ref_frame(vp9_writer *const bc,
+                             VP9_COMMON *const cm,
+                             MACROBLOCKD *xd,
+                             int segment_id,
+                             MV_REFERENCE_FRAME rf) {
+  int seg_ref_active;
+  int seg_ref_count = 0;
+  seg_ref_active = vp9_segfeature_active(xd,
+                                         segment_id,
+                                         SEG_LVL_REF_FRAME);
+
+  if (seg_ref_active) {
+    seg_ref_count = vp9_check_segref(xd, segment_id, INTRA_FRAME) +
+                    vp9_check_segref(xd, segment_id, LAST_FRAME) +
+                    vp9_check_segref(xd, segment_id, GOLDEN_FRAME) +
+                    vp9_check_segref(xd, segment_id, ALTREF_FRAME);
+  }
+
+  // If segment level coding of this signal is disabled...
+  // or the segment allows multiple reference frame options
+  if (!seg_ref_active || (seg_ref_count > 1)) {
+    // Values used in prediction model coding
+    unsigned char prediction_flag;
+    vp9_prob pred_prob;
+    MV_REFERENCE_FRAME pred_rf;
+
+    // Get the context probability for the prediction flag
+    pred_prob = vp9_get_pred_prob(cm, xd, PRED_REF);
+
+    // Get the predicted value.
+    pred_rf = vp9_get_pred_ref(cm, xd);
+
+    // Did the chosen reference frame match its predicted value?
+    prediction_flag =
+      (xd->mode_info_context->mbmi.ref_frame == pred_rf);
+
+    vp9_set_pred_flag(xd, PRED_REF, prediction_flag);
+    vp9_write(bc, prediction_flag, pred_prob);
+
+    // If not predicted correctly then code value explicitly
+    if (!prediction_flag) {
+      vp9_prob mod_refprobs[PREDICTION_PROBS];
+
+      vpx_memcpy(mod_refprobs,
+                 cm->mod_refprobs[pred_rf], sizeof(mod_refprobs));
+
+      // If segment coding is enabled, blank out options that can't occur by
+      // setting the branch probability to 0.
+      if (seg_ref_active) {
+        mod_refprobs[INTRA_FRAME] *=
+          vp9_check_segref(xd, segment_id, INTRA_FRAME);
+        mod_refprobs[LAST_FRAME] *=
+          vp9_check_segref(xd, segment_id, LAST_FRAME);
+        mod_refprobs[GOLDEN_FRAME] *=
+          (vp9_check_segref(xd, segment_id, GOLDEN_FRAME) *
+           vp9_check_segref(xd, segment_id, ALTREF_FRAME));
+      }
+
+      if (mod_refprobs[0]) {
+        vp9_write(bc, (rf != INTRA_FRAME), mod_refprobs[0]);
+      }
+
+      // Inter coded
+      if (rf != INTRA_FRAME) {
+        if (mod_refprobs[1]) {
+          vp9_write(bc, (rf != LAST_FRAME), mod_refprobs[1]);
+        }
+
+        if (rf != LAST_FRAME) {
+          if (mod_refprobs[2]) {
+            vp9_write(bc, (rf != GOLDEN_FRAME), mod_refprobs[2]);
+          }
+        }
+      }
+    }
+  }
+
+  // If using the prediction model we have nothing further to do because
+  // the reference frame is fully coded by the segment.
+}
+
+// Update the probabilities used to encode reference frame data
+static void update_ref_probs(VP9_COMP *const cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+
+  const int *const rfct = cpi->count_mb_ref_frame_usage;
+  const int rf_intra = rfct[INTRA_FRAME];
+  const int rf_inter = rfct[LAST_FRAME] +
+                       rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
+
+  cm->prob_intra_coded = get_binary_prob(rf_intra, rf_inter);
+  cm->prob_last_coded = get_prob(rfct[LAST_FRAME], rf_inter);
+  cm->prob_gf_coded = get_binary_prob(rfct[GOLDEN_FRAME], rfct[ALTREF_FRAME]);
+
+  // Compute a modified set of probabilities to use when prediction of the
+  // reference frame fails
+  vp9_compute_mod_refprobs(cm);
+}
+
+static void pack_inter_mode_mvs(VP9_COMP *const cpi, vp9_writer *const bc) {
+  VP9_COMMON *const pc = &cpi->common;
+  const nmv_context *nmvc = &pc->fc.nmvc;
+  MACROBLOCK *x = &cpi->mb;
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
+  MODE_INFO *m;
+  MODE_INFO *prev_m;
+  TOKENEXTRA *tok = cpi->tok;
+  TOKENEXTRA *tok_end = tok + cpi->tok_count;
+
+  const int mis = pc->mode_info_stride;
+  int mb_row, mb_col;
+  int row, col;
+
+  // Values used in prediction model coding
+  vp9_prob pred_prob;
+  unsigned char prediction_flag;
+
+  int row_delta[4] = { 0, +1,  0, -1};
+  int col_delta[4] = { +1, -1, +1, +1};
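+  // Starting from the top-left MB, the deltas visit (0,0) -> (0,1) ->
+  // (1,0) -> (1,1) within each 2x2 superblock and end at the top-left of
+  // the next superblock column.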
+
+  cpi->mb.partition_info = cpi->mb.pi;
+
+  mb_row = 0;
+  for (row = 0; row < pc->mb_rows; row += 2) {
+    m = pc->mi + row * mis;
+    prev_m = pc->prev_mi + row * mis;
+
+    mb_col = 0;
+    for (col = 0; col < pc->mb_cols; col += 2) {
+      int i;
+
+      // Process the 4 MBs in the order:
+      // top-left, top-right, bottom-left, bottom-right
+#if CONFIG_SUPERBLOCKS
+      vp9_write(bc, m->mbmi.encoded_as_sb, pc->sb_coded);
+#endif
+      for (i = 0; i < 4; i++) {
+        MB_MODE_INFO *mi;
+        MV_REFERENCE_FRAME rf;
+        MB_PREDICTION_MODE mode;
+        int segment_id;
+
+        int dy = row_delta[i];
+        int dx = col_delta[i];
+        int offset_extended = dy * mis + dx;
+
+        if ((mb_row >= pc->mb_rows) || (mb_col >= pc->mb_cols)) {
+          // MB lies outside frame, move on
+          mb_row += dy;
+          mb_col += dx;
+          m += offset_extended;
+          prev_m += offset_extended;
+          cpi->mb.partition_info += offset_extended;
+          continue;
+        }
+
+        mi = &m->mbmi;
+        rf = mi->ref_frame;
+        mode = mi->mode;
+        segment_id = mi->segment_id;
+
+        // Distance of MB to the various image edges.
+        // These are specified in 1/8th pel units as they are always
+        // compared to MV values that are in 1/8th pel units.
+        xd->mb_to_left_edge = -((mb_col * 16) << 3);
+        xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
+        xd->mb_to_top_edge = -((mb_row * 16) << 3);
+        xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
+
+        // Make sure the MacroBlockD mode info pointer is set correctly
+        xd->mode_info_context = m;
+        xd->prev_mode_info_context = prev_m;
+
+#ifdef ENTROPY_STATS
+        active_section = 9;
+#endif
+        if (cpi->mb.e_mbd.update_mb_segmentation_map) {
+          // Is temporal coding of the segment map enabled
+          if (pc->temporal_update) {
+            prediction_flag = vp9_get_pred_flag(xd, PRED_SEG_ID);
+            pred_prob = vp9_get_pred_prob(pc, xd, PRED_SEG_ID);
+
+            // Code the segment id prediction flag for this mb
+            vp9_write(bc, prediction_flag, pred_prob);
+
+            // If the mb segment id wasn't predicted code explicitly
+            if (!prediction_flag)
+              write_mb_segid(bc, mi, &cpi->mb.e_mbd);
+          } else {
+            // Normal unpredicted coding
+            write_mb_segid(bc, mi, &cpi->mb.e_mbd);
+          }
+        }
+
+        if (pc->mb_no_coeff_skip &&
+            (!vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
+             (vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) != 0))) {
+          int skip_coeff = mi->mb_skip_coeff;
+#if CONFIG_SUPERBLOCKS
+          if (mi->encoded_as_sb) {
+            skip_coeff &= m[1].mbmi.mb_skip_coeff;
+            skip_coeff &= m[mis].mbmi.mb_skip_coeff;
+            skip_coeff &= m[mis + 1].mbmi.mb_skip_coeff;
+          }
+#endif
+          vp9_write(bc, skip_coeff,
+                    vp9_get_pred_prob(pc, xd, PRED_MBSKIP));
+        }
+
+        // Encode the reference frame.
+        encode_ref_frame(bc, pc, xd, segment_id, rf);
+
+        if (rf == INTRA_FRAME) {
+#ifdef ENTROPY_STATS
+          active_section = 6;
+#endif
+
+          // TODO(rbultje) write using SB tree structure
+
+          if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
+            write_ymode(bc, mode, pc->fc.ymode_prob);
+          }
+
+          if (mode == B_PRED) {
+            int j = 0;
+#if CONFIG_COMP_INTRA_PRED
+            int uses_second =
+              m->bmi[0].as_mode.second !=
+              (B_PREDICTION_MODE)(B_DC_PRED - 1);
+            vp9_write(bc, uses_second, 128);
+#endif
+            do {
+#if CONFIG_COMP_INTRA_PRED
+              B_PREDICTION_MODE mode2 = m->bmi[j].as_mode.second;
+#endif
+              write_bmode(bc, m->bmi[j].as_mode.first,
+                          pc->fc.bmode_prob);
+#if CONFIG_COMP_INTRA_PRED
+              if (uses_second) {
+                write_bmode(bc, mode2, pc->fc.bmode_prob);
+              }
+#endif
+            } while (++j < 16);
+          }
+          if (mode == I8X8_PRED) {
+            write_i8x8_mode(bc, m->bmi[0].as_mode.first,
+                            pc->fc.i8x8_mode_prob);
+            write_i8x8_mode(bc, m->bmi[2].as_mode.first,
+                            pc->fc.i8x8_mode_prob);
+            write_i8x8_mode(bc, m->bmi[8].as_mode.first,
+                            pc->fc.i8x8_mode_prob);
+            write_i8x8_mode(bc, m->bmi[10].as_mode.first,
+                            pc->fc.i8x8_mode_prob);
+          } else {
+            write_uv_mode(bc, mi->uv_mode,
+                          pc->fc.uv_mode_prob[mode]);
+          }
+        } else {
+          int_mv best_mv, best_second_mv;
+          int ct[4];
+
+          vp9_prob mv_ref_p [VP9_MVREFS - 1];
+
+          {
+            int_mv n1, n2;
+
+            // Only used for context just now and soon to be deprecated.
+            vp9_find_near_mvs(xd, m, prev_m, &n1, &n2, &best_mv, ct,
+                              rf, cpi->common.ref_frame_sign_bias);
+#if CONFIG_NEWBESTREFMV
+            best_mv.as_int = mi->ref_mvs[rf][0].as_int;
+#endif
+
+            vp9_mv_ref_probs(&cpi->common, mv_ref_p, ct);
+
+#ifdef ENTROPY_STATS
+            accum_mv_refs(mode, ct);
+#endif
+          }
+
+#ifdef ENTROPY_STATS
+          active_section = 3;
+#endif
+
+          // Is the segment coding of mode enabled
+          if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
+#if CONFIG_SUPERBLOCKS
+            if (mi->encoded_as_sb) {
+              write_sb_mv_ref(bc, mode, mv_ref_p);
+            } else
+#endif
+            {
+              write_mv_ref(bc, mode, mv_ref_p);
+            }
+            vp9_accum_mv_refs(&cpi->common, mode, ct);
+          }
+
+#if CONFIG_PRED_FILTER
+          // Is the prediction filter enabled
+          if (mode >= NEARESTMV && mode < SPLITMV) {
+            if (cpi->common.pred_filter_mode == 2)
+              vp9_write(bc, mi->pred_filter_enabled,
+                        pc->prob_pred_filter_off);
+            else
+              assert(mi->pred_filter_enabled ==
+                     cpi->common.pred_filter_mode);
+          }
+#endif
+          if (mode >= NEARESTMV && mode <= SPLITMV) {
+            if (cpi->common.mcomp_filter_type == SWITCHABLE) {
+              write_token(bc, vp9_switchable_interp_tree,
+                          vp9_get_pred_probs(&cpi->common, xd,
+                                             PRED_SWITCHABLE_INTERP),
+                          vp9_switchable_interp_encodings +
+                              vp9_switchable_interp_map[mi->interp_filter]);
+            } else {
+              assert(mi->interp_filter ==
+                     cpi->common.mcomp_filter_type);
+            }
+          }
+          if (mi->second_ref_frame &&
+              (mode == NEWMV || mode == SPLITMV)) {
+            int_mv n1, n2;
+
+            // Only used for context just now and soon to be deprecated.
+            vp9_find_near_mvs(xd, m, prev_m,
+                              &n1, &n2, &best_second_mv, ct,
+                              mi->second_ref_frame,
+                              cpi->common.ref_frame_sign_bias);
+
+#if CONFIG_NEWBESTREFMV
+            best_second_mv.as_int =
+              mi->ref_mvs[mi->second_ref_frame][0].as_int;
+#endif
+          }
+
+          // does the feature use compound prediction or not
+          // (if not specified at the frame/segment level)
+          if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
+            vp9_write(bc, mi->second_ref_frame != INTRA_FRAME,
+                      vp9_get_pred_prob(pc, xd, PRED_COMP));
+          }
+
+          {
+            switch (mode) { /* new, split require MVs */
+              case NEWMV:
+#ifdef ENTROPY_STATS
+                active_section = 5;
+#endif
+
+#if CONFIG_NEW_MVREF
+                {
+                  unsigned int best_index;
+
+                  // Choose the best mv reference
+                  best_index = pick_best_mv_ref(x, rf, mi->mv[0],
+                                                mi->ref_mvs[rf], &best_mv);
+
+                  // Encode the index of the choice.
+                  vp9_write_mv_ref_id(bc,
+                                      xd->mb_mv_ref_id_probs[rf], best_index);
+
+                  cpi->best_ref_index_counts[rf][best_index]++;
+
+                }
+#endif
+
+                write_nmv(bc, &mi->mv[0].as_mv, &best_mv,
+                          (const nmv_context*) nmvc,
+                          xd->allow_high_precision_mv);
+
+                if (mi->second_ref_frame) {
+#if CONFIG_NEW_MVREF
+                  unsigned int best_index;
+                  MV_REFERENCE_FRAME sec_ref_frame = mi->second_ref_frame;
+
+                  best_index =
+                    pick_best_mv_ref(x, sec_ref_frame, mi->mv[1],
+                                     mi->ref_mvs[sec_ref_frame],
+                                     &best_second_mv);
+
+                  // Encode the index of the choice.
+                  vp9_write_mv_ref_id(bc,
+                                      xd->mb_mv_ref_id_probs[sec_ref_frame],
+                                      best_index);
+
+                  cpi->best_ref_index_counts[sec_ref_frame][best_index]++;
+#endif
+                  write_nmv(bc, &mi->mv[1].as_mv, &best_second_mv,
+                            (const nmv_context*) nmvc,
+                            xd->allow_high_precision_mv);
+                }
+                break;
+              case SPLITMV: {
+                int j = 0;
+
+#ifdef MODE_STATS
+                ++count_mb_seg [mi->partitioning];
+#endif
+
+                write_split(bc, mi->partitioning, cpi->common.fc.mbsplit_prob);
+                cpi->mbsplit_count[mi->partitioning]++;
+
+                do {
+                  B_PREDICTION_MODE blockmode;
+                  int_mv blockmv;
+                  const int *const  L =
+                    vp9_mbsplits [mi->partitioning];
+                  int k = -1;  /* first block in subset j */
+                  int mv_contz;
+                  int_mv leftmv, abovemv;
+
+                  blockmode = cpi->mb.partition_info->bmi[j].mode;
+                  blockmv = cpi->mb.partition_info->bmi[j].mv;
+#if CONFIG_DEBUG
+                  while (j != L[++k])
+                    if (k >= 16)
+                      assert(0);
+#else
+                  while (j != L[++k]);
+#endif
+                  leftmv.as_int = left_block_mv(m, k);
+                  abovemv.as_int = above_block_mv(m, k, mis);
+                  mv_contz = vp9_mv_cont(&leftmv, &abovemv);
+
+                  write_sub_mv_ref(bc, blockmode,
+                                   cpi->common.fc.sub_mv_ref_prob [mv_contz]);
+                  cpi->sub_mv_ref_count[mv_contz][blockmode - LEFT4X4]++;
+                  if (blockmode == NEW4X4) {
+#ifdef ENTROPY_STATS
+                    active_section = 11;
+#endif
+                    write_nmv(bc, &blockmv.as_mv, &best_mv,
+                              (const nmv_context*) nmvc,
+                              xd->allow_high_precision_mv);
+
+                    if (mi->second_ref_frame) {
+                      write_nmv(bc,
+                                &cpi->mb.partition_info->bmi[j].second_mv.as_mv,
+                                &best_second_mv,
+                                (const nmv_context*) nmvc,
+                                xd->allow_high_precision_mv);
+                    }
+                  }
+                } while (++j < cpi->mb.partition_info->count);
+              }
+              break;
+              default:
+                break;
+            }
+          }
+
+          // Update the mvcounts used to tune mv probs but only if this is
+          // the real pack run.
+          if ( !cpi->dummy_packing ) {
+            update_mvcount(cpi, x, &best_mv, &best_second_mv);
+          }
+        }
+
+        if (
+#if CONFIG_SUPERBLOCKS
+            !mi->encoded_as_sb &&
+#endif
+            ((rf == INTRA_FRAME && mode <= I8X8_PRED) ||
+             (rf != INTRA_FRAME && !(mode == SPLITMV &&
+                                     mi->partitioning == PARTITIONING_4X4))) &&
+            pc->txfm_mode == TX_MODE_SELECT &&
+            !((pc->mb_no_coeff_skip && mi->mb_skip_coeff) ||
+              (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
+               vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
+          TX_SIZE sz = mi->txfm_size;
+          // FIXME(rbultje) code ternary symbol once all experiments are merged
+          vp9_write(bc, sz != TX_4X4, pc->prob_tx[0]);
+          if (sz != TX_4X4 && mode != I8X8_PRED && mode != SPLITMV)
+            vp9_write(bc, sz != TX_8X8, pc->prob_tx[1]);
+        }
+
+#ifdef ENTROPY_STATS
+        active_section = 1;
+#endif
+        assert(tok < tok_end);
+        pack_mb_tokens(bc, &tok, tok_end);
+
+#if CONFIG_SUPERBLOCKS
+        if (m->mbmi.encoded_as_sb) {
+          assert(!i);
+          mb_col += 2;
+          m += 2;
+          cpi->mb.partition_info += 2;
+          prev_m += 2;
+          break;
+        }
+#endif
+
+        // Next MB
+        mb_row += dy;
+        mb_col += dx;
+        m += offset_extended;
+        prev_m += offset_extended;
+        cpi->mb.partition_info += offset_extended;
+#if CONFIG_DEBUG
+        assert((prev_m - cpi->common.prev_mip) == (m - cpi->common.mip));
+        assert((prev_m - cpi->common.prev_mi) == (m - cpi->common.mi));
+#endif
+      }
+    }
+
+    // Next SB
+    mb_row += 2;
+    m += mis + (1 - (pc->mb_cols & 0x1));
+    prev_m += mis + (1 - (pc->mb_cols & 0x1));
+    cpi->mb.partition_info += mis + (1 - (pc->mb_cols & 0x1));
+  }
+}
+
+
+static void write_mb_modes_kf(const VP9_COMMON  *c,
+                              const MACROBLOCKD *xd,
+                              const MODE_INFO   *m,
+                              int                mode_info_stride,
+                              vp9_writer *const  bc) {
+  const int mis = mode_info_stride;
+  int ym;
+  int segment_id;
+
+  ym = m->mbmi.mode;
+  segment_id = m->mbmi.segment_id;
+
+  if (xd->update_mb_segmentation_map) {
+    write_mb_segid(bc, &m->mbmi, xd);
+  }
+
+  if (c->mb_no_coeff_skip &&
+      (!vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
+       (vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) != 0))) {
+        int skip_coeff = m->mbmi.mb_skip_coeff;
+#if CONFIG_SUPERBLOCKS
+        if (m->mbmi.encoded_as_sb) {
+          skip_coeff &= m[1].mbmi.mb_skip_coeff;
+          skip_coeff &= m[mis].mbmi.mb_skip_coeff;
+          skip_coeff &= m[mis + 1].mbmi.mb_skip_coeff;
+        }
+#endif
+        vp9_write(bc, skip_coeff,
+                  vp9_get_pred_prob(c, xd, PRED_MBSKIP));
+  }
+
+#if CONFIG_SUPERBLOCKS
+  if (m->mbmi.encoded_as_sb) {
+    sb_kfwrite_ymode(bc, ym,
+                     c->sb_kf_ymode_prob[c->kf_ymode_probs_index]);
+  } else
+#endif
+  {
+    kfwrite_ymode(bc, ym,
+                  c->kf_ymode_prob[c->kf_ymode_probs_index]);
+  }
+
+  if (ym == B_PRED) {
+    const int mis = c->mode_info_stride;
+    int i = 0;
+#if CONFIG_COMP_INTRA_PRED
+    int uses_second =
+      m->bmi[0].as_mode.second !=
+      (B_PREDICTION_MODE)(B_DC_PRED - 1);
+    vp9_write(bc, uses_second, 128);
+#endif
+    do {
+      const B_PREDICTION_MODE A = above_block_mode(m, i, mis);
+      const B_PREDICTION_MODE L = left_block_mode(m, i);
+      const int bm = m->bmi[i].as_mode.first;
+#if CONFIG_COMP_INTRA_PRED
+      const int bm2 = m->bmi[i].as_mode.second;
+#endif
+
+#ifdef ENTROPY_STATS
+      ++intra_mode_stats [A] [L] [bm];
+#endif
+
+      write_bmode(bc, bm, c->kf_bmode_prob [A] [L]);
+      // printf("    mode: %d\n", bm);
+#if CONFIG_COMP_INTRA_PRED
+      if (uses_second) {
+        write_bmode(bc, bm2, c->kf_bmode_prob [A] [L]);
+      }
+#endif
+    } while (++i < 16);
+  }
+  if (ym == I8X8_PRED) {
+    write_i8x8_mode(bc, m->bmi[0].as_mode.first,
+                    c->fc.i8x8_mode_prob);
+    // printf("    mode: %d\n", m->bmi[0].as_mode.first); fflush(stdout);
+    write_i8x8_mode(bc, m->bmi[2].as_mode.first,
+                    c->fc.i8x8_mode_prob);
+    // printf("    mode: %d\n", m->bmi[2].as_mode.first); fflush(stdout);
+    write_i8x8_mode(bc, m->bmi[8].as_mode.first,
+                    c->fc.i8x8_mode_prob);
+    // printf("    mode: %d\n", m->bmi[8].as_mode.first); fflush(stdout);
+    write_i8x8_mode(bc, m->bmi[10].as_mode.first,
+                    c->fc.i8x8_mode_prob);
+    // printf("    mode: %d\n", m->bmi[10].as_mode.first); fflush(stdout);
+  } else
+    write_uv_mode(bc, m->mbmi.uv_mode, c->kf_uv_mode_prob[ym]);
+
+  if (
+#if CONFIG_SUPERBLOCKS
+      !m->mbmi.encoded_as_sb &&
+#endif
+      ym <= I8X8_PRED && c->txfm_mode == TX_MODE_SELECT &&
+      !((c->mb_no_coeff_skip && m->mbmi.mb_skip_coeff) ||
+        (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
+         vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
+    TX_SIZE sz = m->mbmi.txfm_size;
+    // FIXME(rbultje) code ternary symbol once all experiments are merged
+    vp9_write(bc, sz != TX_4X4, c->prob_tx[0]);
+    if (sz != TX_4X4 && ym <= TM_PRED)
+      vp9_write(bc, sz != TX_8X8, c->prob_tx[1]);
+  }
+}
+
+static void write_kfmodes(VP9_COMP* const cpi, vp9_writer* const bc) {
+  VP9_COMMON *const c = &cpi->common;
+  const int mis = c->mode_info_stride;
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
+  MODE_INFO *m;
+  int i;
+  int row, col;
+  int mb_row, mb_col;
+  int row_delta[4] = { 0, +1,  0, -1};
+  int col_delta[4] = { +1, -1, +1, +1};
+  TOKENEXTRA *tok = cpi->tok;
+  TOKENEXTRA *tok_end = tok + cpi->tok_count;
+
+  mb_row = 0;
+  for (row = 0; row < c->mb_rows; row += 2) {
+    m = c->mi + row * mis;
+
+    mb_col = 0;
+    for (col = 0; col < c->mb_cols; col += 2) {
+#if CONFIG_SUPERBLOCKS
+      vp9_write(bc, m->mbmi.encoded_as_sb, c->sb_coded);
+#endif
+      // Process the 4 MBs in the order:
+      // top-left, top-right, bottom-left, bottom-right
+      for (i = 0; i < 4; i++) {
+        int dy = row_delta[i];
+        int dx = col_delta[i];
+        int offset_extended = dy * mis + dx;
+
+        if ((mb_row >= c->mb_rows) || (mb_col >= c->mb_cols)) {
+          // MB lies outside frame, move on
+          mb_row += dy;
+          mb_col += dx;
+          m += offset_extended;
+          continue;
+        }
+
+        // Make sure the MacroBlockD mode info pointer is set correctly
+        xd->mode_info_context = m;
+
+        write_mb_modes_kf(c, xd, m, mis, bc);
+#ifdef ENTROPY_STATS
+        active_section = 8;
+#endif
+        assert(tok < tok_end);
+        pack_mb_tokens(bc, &tok, tok_end);
+
+#if CONFIG_SUPERBLOCKS
+        if (m->mbmi.encoded_as_sb) {
+          assert(!i);
+          mb_col += 2;
+          m += 2;
+          break;
+        }
+#endif
+        // Next MB
+        mb_row += dy;
+        mb_col += dx;
+        m += offset_extended;
+      }
+    }
+    mb_row += 2;
+  }
+}
+
+
+/* This function is used for debugging probability trees. */
+static void print_prob_tree(vp9_prob
+                            coef_probs[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES]) {
+  /* print coef probability tree */
+  int i, j, k, l;
+  FILE *f = fopen("enc_tree_probs.txt", "a");
+  fprintf(f, "{\n");
+  for (i = 0; i < BLOCK_TYPES; i++) {
+    fprintf(f, "  {\n");
+    for (j = 0; j < COEF_BANDS; j++) {
+      fprintf(f, "    {\n");
+      for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
+        fprintf(f, "      {");
+        for (l = 0; l < ENTROPY_NODES; l++) {
+          fprintf(f, "%3u, ",
+                  (unsigned int)(coef_probs [i][j][k][l]));
+        }
+        fprintf(f, " }\n");
+      }
+      fprintf(f, "    }\n");
+    }
+    fprintf(f, "  }\n");
+  }
+  fprintf(f, "}\n");
+  fclose(f);
+}
+
+static void build_coeff_contexts(VP9_COMP *cpi) {
+  int i = 0, j, k;
+#ifdef ENTROPY_STATS
+  int t = 0;
+#endif
+  for (i = 0; i < BLOCK_TYPES; ++i) {
+    for (j = 0; j < COEF_BANDS; ++j) {
+      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
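+        // These band/context combinations never occur during tokenization,
+        // so their probabilities need not be computed or updated.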
+        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+          continue;
+        vp9_tree_probs_from_distribution(
+          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+          cpi->frame_coef_probs [i][j][k],
+          cpi->frame_branch_ct [i][j][k],
+          cpi->coef_counts [i][j][k],
+          256, 1
+        );
+#ifdef ENTROPY_STATS
+        if (!cpi->dummy_packing)
+          for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+            context_counters[i][j][k][t] += cpi->coef_counts[i][j][k][t];
+#endif
+      }
+    }
+  }
+  for (i = 0; i < BLOCK_TYPES; ++i) {
+    for (j = 0; j < COEF_BANDS; ++j) {
+      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+          continue;
+        vp9_tree_probs_from_distribution(
+          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+          cpi->frame_hybrid_coef_probs [i][j][k],
+          cpi->frame_hybrid_branch_ct [i][j][k],
+          cpi->hybrid_coef_counts [i][j][k],
+          256, 1
+        );
+#ifdef ENTROPY_STATS
+        if (!cpi->dummy_packing)
+          for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+            hybrid_context_counters[i][j][k][t] += cpi->hybrid_coef_counts[i][j][k][t];
+#endif
+      }
+    }
+  }
+
+  if (cpi->common.txfm_mode != ONLY_4X4) {
+    for (i = 0; i < BLOCK_TYPES_8X8; ++i) {
+      for (j = 0; j < COEF_BANDS; ++j) {
+        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+          /* at every context */
+          /* calc probs and branch cts for this frame only */
+          // vp9_prob new_p           [ENTROPY_NODES];
+          // unsigned int branch_ct   [ENTROPY_NODES] [2];
+          if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+            continue;
+          vp9_tree_probs_from_distribution(
+            MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+            cpi->frame_coef_probs_8x8 [i][j][k],
+            cpi->frame_branch_ct_8x8 [i][j][k],
+            cpi->coef_counts_8x8 [i][j][k],
+            256, 1
+          );
+#ifdef ENTROPY_STATS
+          if (!cpi->dummy_packing)
+            for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+              context_counters_8x8[i][j][k][t] += cpi->coef_counts_8x8[i][j][k][t];
+#endif
+        }
+      }
+    }
+    for (i = 0; i < BLOCK_TYPES_8X8; ++i) {
+      for (j = 0; j < COEF_BANDS; ++j) {
+        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+          /* at every context */
+          /* calc probs and branch cts for this frame only */
+          // vp9_prob new_p           [ENTROPY_NODES];
+          // unsigned int branch_ct   [ENTROPY_NODES] [2];
+          if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+            continue;
+          vp9_tree_probs_from_distribution(
+            MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+            cpi->frame_hybrid_coef_probs_8x8 [i][j][k],
+            cpi->frame_hybrid_branch_ct_8x8 [i][j][k],
+            cpi->hybrid_coef_counts_8x8 [i][j][k],
+            256, 1
+          );
+#ifdef ENTROPY_STATS
+          if (!cpi->dummy_packing)
+            for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+              hybrid_context_counters_8x8[i][j][k][t] += cpi->hybrid_coef_counts_8x8[i][j][k][t];
+#endif
+        }
+      }
+    }
+  }
+
+  if (cpi->common.txfm_mode > ALLOW_8X8) {
+    for (i = 0; i < BLOCK_TYPES_16X16; ++i) {
+      for (j = 0; j < COEF_BANDS; ++j) {
+        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+          if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+            continue;
+          vp9_tree_probs_from_distribution(
+            MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+            cpi->frame_coef_probs_16x16[i][j][k],
+            cpi->frame_branch_ct_16x16[i][j][k],
+            cpi->coef_counts_16x16[i][j][k], 256, 1);
+#ifdef ENTROPY_STATS
+          if (!cpi->dummy_packing)
+            for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+              context_counters_16x16[i][j][k][t] += cpi->coef_counts_16x16[i][j][k][t];
+#endif
+        }
+      }
+    }
+  }
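+  // Note: unlike the guarded 16x16 loop above, this hybrid loop is not
+  // inside the txfm_mode > ALLOW_8X8 check; update_coef_probs() only writes
+  // 16x16 updates in that mode, so the values computed here are otherwise
+  // unused.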
+  for (i = 0; i < BLOCK_TYPES_16X16; ++i) {
+    for (j = 0; j < COEF_BANDS; ++j) {
+      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+          continue;
+        vp9_tree_probs_from_distribution(
+          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+          cpi->frame_hybrid_coef_probs_16x16[i][j][k],
+          cpi->frame_hybrid_branch_ct_16x16[i][j][k],
+          cpi->hybrid_coef_counts_16x16[i][j][k], 256, 1);
+#ifdef ENTROPY_STATS
+        if (!cpi->dummy_packing)
+          for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+            hybrid_context_counters_16x16[i][j][k][t] += cpi->hybrid_coef_counts_16x16[i][j][k][t];
+#endif
+      }
+    }
+  }
+}
+
+static void update_coef_probs_common(
+    vp9_writer* const bc,
+    vp9_prob new_frame_coef_probs[BLOCK_TYPES][COEF_BANDS]
+                                 [PREV_COEF_CONTEXTS][ENTROPY_NODES],
+    vp9_prob old_frame_coef_probs[BLOCK_TYPES][COEF_BANDS]
+                                 [PREV_COEF_CONTEXTS][ENTROPY_NODES],
+    unsigned int frame_branch_ct[BLOCK_TYPES][COEF_BANDS]
+                                [PREV_COEF_CONTEXTS][ENTROPY_NODES][2]) {
+  int i, j, k, t;
+  int update[2] = {0, 0};
+  int savings;
+  // vp9_prob bestupd = find_coef_update_prob(cpi);
+
+  /* dry run to see if there is any update at all needed */
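+  // An update is only worthwhile if the bits saved by coding with the new
+  // probabilities outweigh the cost of signalling the updates themselves.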
+  savings = 0;
+  for (i = 0; i < BLOCK_TYPES; ++i) {
+    for (j = !i; j < COEF_BANDS; ++j) {
+      int prev_coef_savings[ENTROPY_NODES] = {0};
+      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+        for (t = 0; t < ENTROPY_NODES; ++t) {
+          vp9_prob newp = new_frame_coef_probs[i][j][k][t];
+          const vp9_prob oldp = old_frame_coef_probs[i][j][k][t];
+          const vp9_prob upd = COEF_UPDATE_PROB;
+          int s = prev_coef_savings[t];
+          int u = 0;
+          if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+            continue;
+#if defined(SEARCH_NEWP)
+          s = prob_diff_update_savings_search(
+                frame_branch_ct[i][j][k][t],
+                oldp, &newp, upd);
+          if (s > 0 && newp != oldp)
+            u = 1;
+          if (u)
+            savings += s - (int)(vp9_cost_zero(upd));
+          else
+            savings -= (int)(vp9_cost_zero(upd));
+#else
+          s = prob_update_savings(
+                frame_branch_ct[i][j][k][t],
+                oldp, newp, upd);
+          if (s > 0)
+            u = 1;
+          if (u)
+            savings += s;
+#endif
+
+          update[u]++;
+        }
+      }
+    }
+  }
+
+  // printf("Update %d %d, savings %d\n", update[0], update[1], savings);
+  /* Is coef updated at all */
+  if (update[1] == 0 || savings < 0) {
+    vp9_write_bit(bc, 0);
+  } else {
+    vp9_write_bit(bc, 1);
+    for (i = 0; i < BLOCK_TYPES; ++i) {
+      for (j = !i; j < COEF_BANDS; ++j) {
+        int prev_coef_savings[ENTROPY_NODES] = {0};
+        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+          // calc probs and branch cts for this frame only
+          for (t = 0; t < ENTROPY_NODES; ++t) {
+            vp9_prob newp = new_frame_coef_probs[i][j][k][t];
+            vp9_prob *oldp = old_frame_coef_probs[i][j][k] + t;
+            const vp9_prob upd = COEF_UPDATE_PROB;
+            int s = prev_coef_savings[t];
+            int u = 0;
+            if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+              continue;
+
+#if defined(SEARCH_NEWP)
+            s = prob_diff_update_savings_search(
+                  frame_branch_ct[i][j][k][t],
+                  *oldp, &newp, upd);
+            if (s > 0 && newp != *oldp)
+              u = 1;
+#else
+            s = prob_update_savings(
+                  frame_branch_ct[i][j][k][t],
+                  *oldp, newp, upd);
+            if (s > 0)
+              u = 1;
+#endif
+            vp9_write(bc, u, upd);
+#ifdef ENTROPY_STATS
+            if (!cpi->dummy_packing)
+              ++ tree_update_hist [i][j][k][t] [u];
+#endif
+            if (u) {
+              /* send/use new probability */
+              write_prob_diff_update(bc, newp, *oldp);
+              *oldp = newp;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+static void update_coef_probs(VP9_COMP* const cpi, vp9_writer* const bc) {
+  vp9_clear_system_state();
+
+  // Build the coefficient contexts based on counts collected in the encode loop
+  build_coeff_contexts(cpi);
+
+  update_coef_probs_common(bc,
+                           cpi->frame_coef_probs,
+                           cpi->common.fc.coef_probs,
+                           cpi->frame_branch_ct);
+
+  update_coef_probs_common(bc,
+                           cpi->frame_hybrid_coef_probs,
+                           cpi->common.fc.hybrid_coef_probs,
+                           cpi->frame_hybrid_branch_ct);
+
+  /* do not do this if not even allowed */
+  if (cpi->common.txfm_mode != ONLY_4X4) {
+    update_coef_probs_common(bc,
+                             cpi->frame_coef_probs_8x8,
+                             cpi->common.fc.coef_probs_8x8,
+                             cpi->frame_branch_ct_8x8);
+
+    update_coef_probs_common(bc,
+                             cpi->frame_hybrid_coef_probs_8x8,
+                             cpi->common.fc.hybrid_coef_probs_8x8,
+                             cpi->frame_hybrid_branch_ct_8x8);
+  }
+
+  if (cpi->common.txfm_mode > ALLOW_8X8) {
+    update_coef_probs_common(bc,
+                             cpi->frame_coef_probs_16x16,
+                             cpi->common.fc.coef_probs_16x16,
+                             cpi->frame_branch_ct_16x16);
+    update_coef_probs_common(bc,
+                             cpi->frame_hybrid_coef_probs_16x16,
+                             cpi->common.fc.hybrid_coef_probs_16x16,
+                             cpi->frame_hybrid_branch_ct_16x16);
+  }
+}
+
+#ifdef PACKET_TESTING
+FILE *vpxlogc = 0;
+#endif
+
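+// A delta-q value is coded as an update flag, a 4-bit magnitude and a
+// sign bit.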
+static void put_delta_q(vp9_writer *bc, int delta_q) {
+  if (delta_q != 0) {
+    vp9_write_bit(bc, 1);
+    vp9_write_literal(bc, abs(delta_q), 4);
+
+    if (delta_q < 0)
+      vp9_write_bit(bc, 1);
+    else
+      vp9_write_bit(bc, 0);
+  } else
+    vp9_write_bit(bc, 0);
+}
+
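+// From the 8 candidate key-frame ymode probability sets, pick the one that
+// minimises the total cost of the ymodes counted during the encode pass.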
+static void decide_kf_ymode_entropy(VP9_COMP *cpi) {
+  int mode_cost[MB_MODE_COUNT];
+  int cost;
+  int bestcost = INT_MAX;
+  int bestindex = 0;
+  int i, j;
+
+  for (i = 0; i < 8; i++) {
+    vp9_cost_tokens(mode_cost, cpi->common.kf_ymode_prob[i], vp9_kf_ymode_tree);
+    cost = 0;
+    for (j = 0; j < VP9_YMODES; j++) {
+      cost += mode_cost[j] * cpi->ymode_count[j];
+    }
+#if CONFIG_SUPERBLOCKS
+    vp9_cost_tokens(mode_cost, cpi->common.sb_kf_ymode_prob[i],
+                    vp9_sb_ymode_tree);
+    for (j = 0; j < VP9_I32X32_MODES; j++) {
+      cost += mode_cost[j] * cpi->sb_ymode_count[j];
+    }
+#endif
+    if (cost < bestcost) {
+      bestindex = i;
+      bestcost = cost;
+    }
+  }
+  cpi->common.kf_ymode_probs_index = bestindex;
+}
+
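+// For each segment, collect a bitmask of the reference frames used by its
+// macroblocks and enable SEG_LVL_REF_FRAME accordingly. (The only call
+// site below is currently commented out.)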
+static void segment_reference_frames(VP9_COMP *cpi) {
+  VP9_COMMON *oci = &cpi->common;
+  MODE_INFO *mi = oci->mi;
+  int ref[MAX_MB_SEGMENTS] = {0};
+  int i, j;
+  int mb_index = 0;
+  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+
+  for (i = 0; i < oci->mb_rows; i++) {
+    for (j = 0; j < oci->mb_cols; j++, mb_index++) {
+      ref[mi[mb_index].mbmi.segment_id] |= (1 << mi[mb_index].mbmi.ref_frame);
+    }
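+    // Skip the border entry at the end of each mode_info row
+    // (mode_info_stride is mb_cols + 1).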
+    mb_index++;
+  }
+  for (i = 0; i < MAX_MB_SEGMENTS; i++) {
+    vp9_enable_segfeature(xd, i, SEG_LVL_REF_FRAME);
+    vp9_set_segdata(xd, i, SEG_LVL_REF_FRAME, ref[i]);
+  }
+}
+
+void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
+                        unsigned long *size) {
+  int i, j;
+  VP9_HEADER oh;
+  VP9_COMMON *const pc = &cpi->common;
+  vp9_writer header_bc, residual_bc;
+  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+  int extra_bytes_packed = 0;
+
+  unsigned char *cx_data = dest;
+
+  oh.show_frame = (int) pc->show_frame;
+  oh.type = (int)pc->frame_type;
+  oh.version = pc->version;
+  oh.first_partition_length_in_bytes = 0;
+
+  cx_data += 3;
+
+#if defined(SECTIONBITS_OUTPUT)
+  Sectionbits[active_section = 1] += sizeof(VP9_HEADER) * 8 * 256;
+#endif
+
+  compute_update_table();
+
+  /* vp9_kf_default_bmode_probs() is called in vp9_setup_key_frame() once
+   * for each key frame before the frame is encoded. pc->kf_bmode_prob
+   * doesn't get changed anywhere else, so there is no need to call it
+   * again here. --yw
+   * vp9_kf_default_bmode_probs( pc->kf_bmode_prob);
+   */
+
+  /* Every key frame sends a start code, width, height, scale factor,
+   * clamp and color type.
+   */
+  if (oh.type == KEY_FRAME) {
+    int v;
+
+    // Start / synch code
+    cx_data[0] = 0x9D;
+    cx_data[1] = 0x01;
+    cx_data[2] = 0x2a;
+
+    v = (pc->horiz_scale << 14) | pc->Width;
+    cx_data[3] = v;
+    cx_data[4] = v >> 8;
+
+    v = (pc->vert_scale << 14) | pc->Height;
+    cx_data[5] = v;
+    cx_data[6] = v >> 8;
+
+    extra_bytes_packed = 7;
+    cx_data += extra_bytes_packed;
+
+    vp9_start_encode(&header_bc, cx_data);
+
+    // Signal the color type and clamp type
+    vp9_write_bit(&header_bc, pc->clr_type);
+    vp9_write_bit(&header_bc, pc->clamp_type);
+
+  } else {
+    vp9_start_encode(&header_bc, cx_data);
+  }
+
+  // Signal whether or not Segmentation is enabled
+  vp9_write_bit(&header_bc, (xd->segmentation_enabled) ? 1 : 0);
+
+  // Indicate which features are enabled
+  if (xd->segmentation_enabled) {
+    // Indicate whether or not the segmentation map is being updated.
+    vp9_write_bit(&header_bc, (xd->update_mb_segmentation_map) ? 1 : 0);
+
+    // If it is, then indicate the method that will be used.
+    if (xd->update_mb_segmentation_map) {
+      // Select the coding strategy (temporal or spatial)
+      vp9_choose_segmap_coding_method(cpi);
+      // Send the tree probabilities used to decode unpredicted
+      // macro-block segments
+      for (i = 0; i < MB_FEATURE_TREE_PROBS; i++) {
+        int data = xd->mb_segment_tree_probs[i];
+
+        if (data != 255) {
+          vp9_write_bit(&header_bc, 1);
+          vp9_write_literal(&header_bc, data, 8);
+        } else {
+          vp9_write_bit(&header_bc, 0);
+        }
+      }
+
+      // Write out the chosen coding method.
+      vp9_write_bit(&header_bc, (pc->temporal_update) ? 1 : 0);
+      if (pc->temporal_update) {
+        for (i = 0; i < PREDICTION_PROBS; i++) {
+          int data = pc->segment_pred_probs[i];
+
+          if (data != 255) {
+            vp9_write_bit(&header_bc, 1);
+            vp9_write_literal(&header_bc, data, 8);
+          } else {
+            vp9_write_bit(&header_bc, 0);
+          }
+        }
+      }
+    }
+
+    vp9_write_bit(&header_bc, (xd->update_mb_segmentation_data) ? 1 : 0);
+
+    // segment_reference_frames(cpi);
+
+    if (xd->update_mb_segmentation_data) {
+      signed char Data;
+
+      vp9_write_bit(&header_bc, (xd->mb_segment_abs_delta) ? 1 : 0);
+
+      // For each segment id...
+      for (i = 0; i < MAX_MB_SEGMENTS; i++) {
+        // For each segmentation codable feature...
+        for (j = 0; j < SEG_LVL_MAX; j++) {
+          Data = vp9_get_segdata(xd, i, j);
+
+          // If the feature is enabled...
+          if (vp9_segfeature_active(xd, i, j)) {
+            vp9_write_bit(&header_bc, 1);
+
+            // Is the segment data signed?
+            if (vp9_is_segfeature_signed(j)) {
+              // Encode the relevant feature data
+              if (Data < 0) {
+                Data = - Data;
+                vp9_write_literal(&header_bc, Data,
+                                  vp9_seg_feature_data_bits(j));
+                vp9_write_bit(&header_bc, 1);
+              } else {
+                vp9_write_literal(&header_bc, Data,
+                                  vp9_seg_feature_data_bits(j));
+                vp9_write_bit(&header_bc, 0);
+              }
+            }
+            // Unsigned data element so no sign bit needed
+            else
+              vp9_write_literal(&header_bc, Data,
+                                vp9_seg_feature_data_bits(j));
+          } else
+            vp9_write_bit(&header_bc, 0);
+        }
+      }
+    }
+  }
+
+  // Encode any updates to the probabilities of the reference frame
+  // prediction status flags
+  update_refpred_stats(cpi);
+  if (pc->frame_type != KEY_FRAME) {
+    for (i = 0; i < PREDICTION_PROBS; i++) {
+      if (cpi->ref_pred_probs_update[i]) {
+        vp9_write_bit(&header_bc, 1);
+        vp9_write_literal(&header_bc, pc->ref_pred_probs[i], 8);
+      } else {
+        vp9_write_bit(&header_bc, 0);
+      }
+    }
+  }
+
+#if CONFIG_SUPERBLOCKS
+  {
+    /* sb mode probability */
+    const int sb_max = (((pc->mb_rows + 1) >> 1) * ((pc->mb_cols + 1) >> 1));
+
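+    // Probability that a 2x2 MB group is not coded as a single superblock.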
+    pc->sb_coded = get_prob(sb_max - cpi->sb_count, sb_max);
+    vp9_write_literal(&header_bc, pc->sb_coded, 8);
+  }
+#endif
+
+  {
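+    // prob_tx[0] is the probability that a block uses a 4x4 transform;
+    // prob_tx[1] the probability it uses 8x8 rather than 16x16, given not 4x4.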
+    if (pc->txfm_mode == TX_MODE_SELECT) {
+      pc->prob_tx[0] = get_prob(cpi->txfm_count[0] + cpi->txfm_count_8x8p[0],
+                                cpi->txfm_count[0] + cpi->txfm_count[1] + cpi->txfm_count[2] +
+                                cpi->txfm_count_8x8p[0] + cpi->txfm_count_8x8p[1]);
+      pc->prob_tx[1] = get_prob(cpi->txfm_count[1], cpi->txfm_count[1] + cpi->txfm_count[2]);
+    } else {
+      pc->prob_tx[0] = 128;
+      pc->prob_tx[1] = 128;
+    }
+    vp9_write_literal(&header_bc, pc->txfm_mode, 2);
+    if (pc->txfm_mode == TX_MODE_SELECT) {
+      vp9_write_literal(&header_bc, pc->prob_tx[0], 8);
+      vp9_write_literal(&header_bc, pc->prob_tx[1], 8);
+    }
+  }
+
+  // Encode the loop filter level and type
+  vp9_write_bit(&header_bc, pc->filter_type);
+  vp9_write_literal(&header_bc, pc->filter_level, 6);
+  vp9_write_literal(&header_bc, pc->sharpness_level, 3);
+
+  // Write out loop filter deltas applied at the MB level based on mode
+  // or ref frame (if they are enabled).
+  vp9_write_bit(&header_bc, (xd->mode_ref_lf_delta_enabled) ? 1 : 0);
+
+  if (xd->mode_ref_lf_delta_enabled) {
+    // Do the deltas need to be updated
+    int send_update = xd->mode_ref_lf_delta_update;
+
+    vp9_write_bit(&header_bc, send_update);
+    if (send_update) {
+      int Data;
+
+      // Send update
+      for (i = 0; i < MAX_REF_LF_DELTAS; i++) {
+        Data = xd->ref_lf_deltas[i];
+
+        // Frame level data
+        if (xd->ref_lf_deltas[i] != xd->last_ref_lf_deltas[i]) {
+          xd->last_ref_lf_deltas[i] = xd->ref_lf_deltas[i];
+          vp9_write_bit(&header_bc, 1);
+
+          if (Data > 0) {
+            vp9_write_literal(&header_bc, (Data & 0x3F), 6);
+            vp9_write_bit(&header_bc, 0);    // sign
+          } else {
+            Data = -Data;
+            vp9_write_literal(&header_bc, (Data & 0x3F), 6);
+            vp9_write_bit(&header_bc, 1);    // sign
+          }
+        } else {
+          vp9_write_bit(&header_bc, 0);
+        }
+      }
+
+      // Send update
+      for (i = 0; i < MAX_MODE_LF_DELTAS; i++) {
+        Data = xd->mode_lf_deltas[i];
+
+        if (xd->mode_lf_deltas[i] != xd->last_mode_lf_deltas[i]) {
+          xd->last_mode_lf_deltas[i] = xd->mode_lf_deltas[i];
+          vp9_write_bit(&header_bc, 1);
+
+          if (Data > 0) {
+            vp9_write_literal(&header_bc, (Data & 0x3F), 6);
+            vp9_write_bit(&header_bc, 0);    // sign
+          } else {
+            Data = -Data;
+            vp9_write_literal(&header_bc, (Data & 0x3F), 6);
+            vp9_write_bit(&header_bc, 1);    // sign
+          }
+        } else {
+          vp9_write_bit(&header_bc, 0);
+        }
+      }
+    }
+  }
+
+  // Signal here if multi-token partitioning is enabled
+  // vp9_write_literal(&header_bc, pc->multi_token_partition, 2);
+  vp9_write_literal(&header_bc, 0, 2);
+
+  // Frame Q baseline quantizer index
+  vp9_write_literal(&header_bc, pc->base_qindex, QINDEX_BITS);
+
+  // Transmit Dc, Second order and Uv quantizer delta information
+  put_delta_q(&header_bc, pc->y1dc_delta_q);
+  put_delta_q(&header_bc, pc->y2dc_delta_q);
+  put_delta_q(&header_bc, pc->y2ac_delta_q);
+  put_delta_q(&header_bc, pc->uvdc_delta_q);
+  put_delta_q(&header_bc, pc->uvac_delta_q);
+
+  // When there is a key frame all reference buffers are updated using the new key frame
+  if (pc->frame_type != KEY_FRAME) {
+    // Should the GF or ARF be updated using the transmitted frame or buffer
+    vp9_write_bit(&header_bc, pc->refresh_golden_frame);
+    vp9_write_bit(&header_bc, pc->refresh_alt_ref_frame);
+
+    // For inter frames the current default behavior is that when
+    // cm->refresh_golden_frame is set we copy the old GF over to
+    // the ARF buffer. This is purely an encoder decision at present.
+    if (pc->refresh_golden_frame)
+      pc->copy_buffer_to_arf  = 2;
+
+    // If not updated from the current frame, should the GF or ARF
+    // be updated from another buffer?
+    if (!pc->refresh_golden_frame)
+      vp9_write_literal(&header_bc, pc->copy_buffer_to_gf, 2);
+
+    if (!pc->refresh_alt_ref_frame)
+      vp9_write_literal(&header_bc, pc->copy_buffer_to_arf, 2);
+
+    // Indicate reference frame sign bias for Golden and ARF frames (always 0 for last frame buffer)
+    vp9_write_bit(&header_bc, pc->ref_frame_sign_bias[GOLDEN_FRAME]);
+    vp9_write_bit(&header_bc, pc->ref_frame_sign_bias[ALTREF_FRAME]);
+
+    // Signal whether to allow high MV precision
+    vp9_write_bit(&header_bc, (xd->allow_high_precision_mv) ? 1 : 0);
+    if (pc->mcomp_filter_type == SWITCHABLE) {
+      /* Check to see if only one of the filters is actually used */
+      int count[VP9_SWITCHABLE_FILTERS];
+      int i, j, c = 0;
+      for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
+        count[i] = 0;
+        for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) {
+          count[i] += cpi->switchable_interp_count[j][i];
+        }
+        c += (count[i] > 0);
+      }
+      if (c == 1) {
+        /* Only one filter is used. So set the filter at frame level */
+        for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
+          if (count[i]) {
+            pc->mcomp_filter_type = vp9_switchable_interp[i];
+            break;
+          }
+        }
+      }
+    }
+    // Signal the type of subpel filter to use
+    vp9_write_bit(&header_bc, (pc->mcomp_filter_type == SWITCHABLE));
+    if (pc->mcomp_filter_type != SWITCHABLE)
+      vp9_write_literal(&header_bc, (pc->mcomp_filter_type), 2);
+  }
+
+  vp9_write_bit(&header_bc, pc->refresh_entropy_probs);
+
+  if (pc->frame_type != KEY_FRAME)
+    vp9_write_bit(&header_bc, pc->refresh_last_frame);
+
+#ifdef ENTROPY_STATS
+  if (pc->frame_type == INTER_FRAME)
+    active_section = 0;
+  else
+    active_section = 7;
+#endif
+
+  vp9_clear_system_state();  // __asm emms;
+
+  vp9_copy(cpi->common.fc.pre_coef_probs, cpi->common.fc.coef_probs);
+  vp9_copy(cpi->common.fc.pre_hybrid_coef_probs, cpi->common.fc.hybrid_coef_probs);
+  vp9_copy(cpi->common.fc.pre_coef_probs_8x8, cpi->common.fc.coef_probs_8x8);
+  vp9_copy(cpi->common.fc.pre_hybrid_coef_probs_8x8, cpi->common.fc.hybrid_coef_probs_8x8);
+  vp9_copy(cpi->common.fc.pre_coef_probs_16x16, cpi->common.fc.coef_probs_16x16);
+  vp9_copy(cpi->common.fc.pre_hybrid_coef_probs_16x16, cpi->common.fc.hybrid_coef_probs_16x16);
+  vp9_copy(cpi->common.fc.pre_ymode_prob, cpi->common.fc.ymode_prob);
+  vp9_copy(cpi->common.fc.pre_uv_mode_prob, cpi->common.fc.uv_mode_prob);
+  vp9_copy(cpi->common.fc.pre_bmode_prob, cpi->common.fc.bmode_prob);
+  vp9_copy(cpi->common.fc.pre_sub_mv_ref_prob, cpi->common.fc.sub_mv_ref_prob);
+  vp9_copy(cpi->common.fc.pre_mbsplit_prob, cpi->common.fc.mbsplit_prob);
+  vp9_copy(cpi->common.fc.pre_i8x8_mode_prob, cpi->common.fc.i8x8_mode_prob);
+  cpi->common.fc.pre_nmvc = cpi->common.fc.nmvc;
+  vp9_zero(cpi->sub_mv_ref_count);
+  vp9_zero(cpi->mbsplit_count);
+  vp9_zero(cpi->common.fc.mv_ref_ct)
+  vp9_zero(cpi->common.fc.mv_ref_ct_a)
+
+  update_coef_probs(cpi, &header_bc);
+
+#ifdef ENTROPY_STATS
+  active_section = 2;
+#endif
+
+  // Write out the mb_no_coeff_skip flag
+  vp9_write_bit(&header_bc, pc->mb_no_coeff_skip);
+  if (pc->mb_no_coeff_skip) {
+    int k;
+
+    vp9_update_skip_probs(cpi);
+    for (k = 0; k < MBSKIP_CONTEXTS; ++k)
+      vp9_write_literal(&header_bc, pc->mbskip_pred_probs[k], 8);
+  }
+
+  if (pc->frame_type == KEY_FRAME) {
+    if (!pc->kf_ymode_probs_update) {
+      vp9_write_literal(&header_bc, pc->kf_ymode_probs_index, 3);
+    }
+  } else {
+    // Update the probabilities used to encode reference frame data
+    update_ref_probs(cpi);
+
+#ifdef ENTROPY_STATS
+    active_section = 1;
+#endif
+
+#if CONFIG_PRED_FILTER
+    // Write the prediction filter mode used for this frame
+    vp9_write_literal(&header_bc, pc->pred_filter_mode, 2);
+
+    // Write prediction filter on/off probability if signaling at MB level
+    if (pc->pred_filter_mode == 2)
+      vp9_write_literal(&header_bc, pc->prob_pred_filter_off, 8);
+
+#endif
+    if (pc->mcomp_filter_type == SWITCHABLE)
+      update_switchable_interp_probs(cpi, &header_bc);
+
+    vp9_write_literal(&header_bc, pc->prob_intra_coded, 8);
+    vp9_write_literal(&header_bc, pc->prob_last_coded, 8);
+    vp9_write_literal(&header_bc, pc->prob_gf_coded, 8);
+
+    {
+      const int comp_pred_mode = cpi->common.comp_pred_mode;
+      const int use_compound_pred = (comp_pred_mode != SINGLE_PREDICTION_ONLY);
+      const int use_hybrid_pred = (comp_pred_mode == HYBRID_PREDICTION);
+
+      vp9_write(&header_bc, use_compound_pred, 128);
+      if (use_compound_pred) {
+        vp9_write(&header_bc, use_hybrid_pred, 128);
+        if (use_hybrid_pred) {
+          for (i = 0; i < COMP_PRED_CONTEXTS; i++) {
+            pc->prob_comppred[i] = get_binary_prob(cpi->single_pred_count[i],
+                                                   cpi->comp_pred_count[i]);
+            vp9_write_literal(&header_bc, pc->prob_comppred[i], 8);
+          }
+        }
+      }
+    }
+
+    update_mbintra_mode_probs(cpi, &header_bc);
+
+#if CONFIG_NEW_MVREF
+    // Temporary default probabilities for encoding the MV ref id signal
+    vpx_memset(xd->mb_mv_ref_id_probs, 192, sizeof(xd->mb_mv_ref_id_probs));
+#endif
+
+    vp9_write_nmvprobs(cpi, xd->allow_high_precision_mv, &header_bc);
+  }
+
+  vp9_stop_encode(&header_bc);
+
+  oh.first_partition_length_in_bytes = header_bc.pos;
+
+  /* update frame tag */
+  {
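+    // 24-bit frame tag: 1 bit frame type, 3 bits version, 1 bit show_frame,
+    // 19 bits first partition length, packed little-endian.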
+    int v = (oh.first_partition_length_in_bytes << 5) |
+            (oh.show_frame << 4) |
+            (oh.version << 1) |
+            oh.type;
+
+    dest[0] = v;
+    dest[1] = v >> 8;
+    dest[2] = v >> 16;
+  }
+
+  *size = VP9_HEADER_SIZE + extra_bytes_packed + header_bc.pos;
+  vp9_start_encode(&residual_bc, cx_data + header_bc.pos);
+
+  if (pc->frame_type == KEY_FRAME) {
+    decide_kf_ymode_entropy(cpi);
+    write_kfmodes(cpi, &residual_bc);
+  } else {
+    pack_inter_mode_mvs(cpi, &residual_bc);
+    vp9_update_mode_context(&cpi->common);
+  }
+
+  vp9_stop_encode(&residual_bc);
+
+  *size += residual_bc.pos;
+}
+
+#ifdef ENTROPY_STATS
+void print_tree_update_probs() {
+  int i, j, k, l;
+  FILE *f = fopen("coefupdprob.h", "w");
+  fprintf(f, "\n/* Update probabilities for token entropy tree. */\n\n");
+
+  fprintf(f, "const vp9_prob\n"
+          "vp9_coef_update_probs[BLOCK_TYPES]\n"
+          "                     [COEF_BANDS]\n"
+          "                     [PREV_COEF_CONTEXTS]\n"
+          "                     [ENTROPY_NODES] = {\n");
+  for (i = 0; i < BLOCK_TYPES; i++) {
+    fprintf(f, "  { \n");
+    for (j = 0; j < COEF_BANDS; j++) {
+      fprintf(f, "    {\n");
+      for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
+        fprintf(f, "      {");
+        for (l = 0; l < ENTROPY_NODES; l++) {
+          fprintf(f, "%3ld, ",
+              get_binary_prob(tree_update_hist[i][j][k][l][0],
+                              tree_update_hist[i][j][k][l][1]));
+        }
+        fprintf(f, "},\n");
+      }
+      fprintf(f, "    },\n");
+    }
+    fprintf(f, "  },\n");
+  }
+  fprintf(f, "};\n");
+
+  fprintf(f, "const vp9_prob\n"
+          "vp9_coef_update_probs_8x8[BLOCK_TYPES_8X8]\n"
+          "                         [COEF_BANDS]\n"
+          "                         [PREV_COEF_CONTEXTS]\n"
+          "                         [ENTROPY_NODES] = {\n");
+  for (i = 0; i < BLOCK_TYPES_8X8; i++) {
+    fprintf(f, "  { \n");
+    for (j = 0; j < COEF_BANDS; j++) {
+      fprintf(f, "    {\n");
+      for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
+        fprintf(f, "      {");
+        for (l = 0; l < MAX_ENTROPY_TOKENS - 1; l++) {
+          fprintf(f, "%3ld, ",
+              get_binary_prob(tree_update_hist_8x8[i][j][k][l][0],
+                              tree_update_hist_8x8[i][j][k][l][1]));
+        }
+        fprintf(f, "},\n");
+      }
+      fprintf(f, "    },\n");
+    }
+    fprintf(f, "  },\n");
+  }
+  fprintf(f, "};\n");
+
+  fprintf(f, "const vp9_prob\n"
+          "vp9_coef_update_probs_16x16[BLOCK_TYPES_16X16]\n"
+          "                           [COEF_BANDS]\n"
+          "                           [PREV_COEF_CONTEXTS]\n"
+          "                           [ENTROPY_NODES] = {\n");
+  for (i = 0; i < BLOCK_TYPES_16X16; i++) {
+    fprintf(f, "  { \n");
+    for (j = 0; j < COEF_BANDS; j++) {
+      fprintf(f, "    {\n");
+      for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
+        fprintf(f, "      {");
+        for (l = 0; l < MAX_ENTROPY_TOKENS - 1; l++) {
+          fprintf(f, "%3ld, ",
+              get_binary_prob(tree_update_hist_16x16[i][j][k][l][0],
+                              tree_update_hist_16x16[i][j][k][l][1]));
+        }
+        fprintf(f, "},\n");
+      }
+      fprintf(f, "    },\n");
+    }
+    fprintf(f, "  },\n");
+  }
+  fprintf(f, "};\n");
+
+  fclose(f);
+  f = fopen("treeupdate.bin", "wb");
+  fwrite(tree_update_hist, sizeof(tree_update_hist), 1, f);
+  fwrite(tree_update_hist_8x8, sizeof(tree_update_hist_8x8), 1, f);
+  fwrite(tree_update_hist_16x16, sizeof(tree_update_hist_16x16), 1, f);
+  fclose(f);
+}
+#endif
--- /dev/null
+++ b/vp9/encoder/bitstream.h
@@ -1,0 +1,17 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_BITSTREAM_H
+#define __INC_BITSTREAM_H
+
+void vp9_update_skip_probs(VP9_COMP *cpi);
+
+#endif
--- /dev/null
+++ b/vp9/encoder/block.h
@@ -1,0 +1,184 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_BLOCK_H
+#define __INC_BLOCK_H
+
+#include "vp9/common/onyx.h"
+#include "vp9/common/entropymv.h"
+#include "vp9/common/entropy.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/onyxc_int.h"
+
+// motion search site
+typedef struct {
+  MV mv;
+  int offset;
+} search_site;
+
+typedef struct block {
+  // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries
+  short *src_diff;
+  short *coeff;
+
+  // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries
+  short *quant;
+  short *quant_fast;      // fast quant deprecated for now
+  unsigned char *quant_shift;
+  short *zbin;
+  short *zbin_8x8;
+  short *zbin_16x16;
+  short *zrun_zbin_boost;
+  short *zrun_zbin_boost_8x8;
+  short *zrun_zbin_boost_16x16;
+  short *round;
+
+  // Zbin Over Quant value
+  short zbin_extra;
+
+  unsigned char **base_src;
+  unsigned char **base_second_src;
+  int src;
+  int src_stride;
+
+  int eob_max_offset;
+  int eob_max_offset_8x8;
+  int eob_max_offset_16x16;
+} BLOCK;
+
+typedef struct {
+  int count;
+  struct {
+    B_PREDICTION_MODE mode;
+    int_mv mv;
+    int_mv second_mv;
+  } bmi[16];
+} PARTITION_INFO;
+
+// Structure to hold snapshot of coding context during the mode picking process
+// TODO: do we need all of these?
+typedef struct {
+  MODE_INFO mic;
+  PARTITION_INFO partition_info;
+  int_mv best_ref_mv;
+  int_mv second_best_ref_mv;
+#if CONFIG_NEWBESTREFMV || CONFIG_NEW_MVREF
+  int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REFS];
+#endif
+  int rate;
+  int distortion;
+  int64_t intra_error;
+  int best_mode_index;
+  int rddiv;
+  int rdmult;
+  int hybrid_pred_diff;
+  int comp_pred_diff;
+  int single_pred_diff;
+  int64_t txfm_rd_diff[NB_TXFM_MODES];
+} PICK_MODE_CONTEXT;
+
+typedef struct macroblock {
+  DECLARE_ALIGNED(16, short, src_diff[400]);  // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y
+  DECLARE_ALIGNED(16, short, coeff[400]);     // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y
+  DECLARE_ALIGNED(16, unsigned char, thismb[256]);    // 16x16 Y
+
+  unsigned char *thismb_ptr;
+  // 16 Y blocks, 4 U blocks, 4 V blocks,
+  // 1 DC 2nd order block each with 16 entries
+  BLOCK block[25];
+
+  YV12_BUFFER_CONFIG src;
+
+  MACROBLOCKD e_mbd;
+  PARTITION_INFO *partition_info; /* work pointer */
+  PARTITION_INFO *pi;   /* Corresponds to upper left visible macroblock */
+  PARTITION_INFO *pip;  /* Base of allocated array */
+
+  search_site *ss;
+  int ss_count;
+  int searches_per_step;
+
+  int errorperbit;
+  int sadperbit16;
+  int sadperbit4;
+  int rddiv;
+  int rdmult;
+  unsigned int *mb_activity_ptr;
+  int *mb_norm_activity_ptr;
+  signed int act_zbin_adj;
+
+  int nmvjointcost[MV_JOINTS];
+  int nmvcosts[2][MV_VALS];
+  int *nmvcost[2];
+  int nmvcosts_hp[2][MV_VALS];
+  int *nmvcost_hp[2];
+
+  int nmvjointsadcost[MV_JOINTS];
+  int nmvsadcosts[2][MV_VALS];
+  int *nmvsadcost[2];
+  int nmvsadcosts_hp[2][MV_VALS];
+  int *nmvsadcost_hp[2];
+
+  int mbmode_cost[2][MB_MODE_COUNT];
+  int intra_uv_mode_cost[2][MB_MODE_COUNT];
+  int bmode_costs[VP9_BINTRAMODES][VP9_BINTRAMODES][VP9_BINTRAMODES];
+  int i8x8_mode_costs[MB_MODE_COUNT];
+  int inter_bmode_costs[B_MODE_COUNT];
+  int switchable_interp_costs[VP9_SWITCHABLE_FILTERS + 1]
+                             [VP9_SWITCHABLE_FILTERS];
+
+  // These define limits to motion vector components to prevent them
+  // from extending outside the UMV borders
+  int mv_col_min;
+  int mv_col_max;
+  int mv_row_min;
+  int mv_row_max;
+
+  int skip;
+
+  int encode_breakout;
+
+  // char * gf_active_ptr;
+  signed char *gf_active_ptr;
+
+  unsigned char *active_ptr;
+
+  unsigned int token_costs[TX_SIZE_MAX][BLOCK_TYPES][COEF_BANDS]
+    [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
+  unsigned int hybrid_token_costs[TX_SIZE_MAX][BLOCK_TYPES][COEF_BANDS]
+    [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
+
+  int optimize;
+
+  // Structure to hold context for each of the 4 MBs within a SB:
+  // when encoded as 4 independent MBs:
+  PICK_MODE_CONTEXT mb_context[4];
+#if CONFIG_SUPERBLOCKS
+  // when 4 MBs share coding parameters:
+  PICK_MODE_CONTEXT sb_context[4];
+#endif
+
+  void (*vp9_short_fdct4x4)(short *input, short *output, int pitch);
+  void (*vp9_short_fdct8x4)(short *input, short *output, int pitch);
+  void (*short_walsh4x4)(short *input, short *output, int pitch);
+  void (*quantize_b_4x4)(BLOCK *b, BLOCKD *d);
+  void (*quantize_b_4x4_pair)(BLOCK *b1, BLOCK *b2, BLOCKD *d0, BLOCKD *d1);
+  void (*vp9_short_fdct8x8)(short *input, short *output, int pitch);
+  void (*vp9_short_fdct16x16)(short *input, short *output, int pitch);
+  void (*short_fhaar2x2)(short *input, short *output, int pitch);
+  void (*quantize_b_16x16)(BLOCK *b, BLOCKD *d);
+  void (*quantize_b_8x8)(BLOCK *b, BLOCKD *d);
+  void (*quantize_b_2x2)(BLOCK *b, BLOCKD *d);
+
+} MACROBLOCK;
+
+
+#endif
--- /dev/null
+++ b/vp9/encoder/boolhuff.c
@@ -1,0 +1,153 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "boolhuff.h"
+
+#if defined(SECTIONBITS_OUTPUT)
+unsigned __int64 Sectionbits[500];
+
+#endif
+
+#ifdef ENTROPY_STATS
+unsigned int active_section = 0;
+#endif
+
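+// Cost, in 1/256-bit units, of coding a symbol whose probability is p/256:
+// approximately -256 * log2(p / 256).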
+const unsigned int vp9_prob_cost[256] = {
+  2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129, 1099, 1072, 1046,
+  1023, 1000,  979,  959,  940,  922,  905,  889,  873,  858,  843,  829,  816,  803,  790,  778,
+  767,  755,  744,  733,  723,  713,  703,  693,  684,  675,  666,  657,  649,  641,  633,  625,
+  617,  609,  602,  594,  587,  580,  573,  567,  560,  553,  547,  541,  534,  528,  522,  516,
+  511,  505,  499,  494,  488,  483,  477,  472,  467,  462,  457,  452,  447,  442,  437,  433,
+  428,  424,  419,  415,  410,  406,  401,  397,  393,  389,  385,  381,  377,  373,  369,  365,
+  361,  357,  353,  349,  346,  342,  338,  335,  331,  328,  324,  321,  317,  314,  311,  307,
+  304,  301,  297,  294,  291,  288,  285,  281,  278,  275,  272,  269,  266,  263,  260,  257,
+  255,  252,  249,  246,  243,  240,  238,  235,  232,  229,  227,  224,  221,  219,  216,  214,
+  211,  208,  206,  203,  201,  198,  196,  194,  191,  189,  186,  184,  181,  179,  177,  174,
+  172,  170,  168,  165,  163,  161,  159,  156,  154,  152,  150,  148,  145,  143,  141,  139,
+  137,  135,  133,  131,  129,  127,  125,  123,  121,  119,  117,  115,  113,  111,  109,  107,
+  105,  103,  101,   99,   97,   95,   93,   92,   90,   88,   86,   84,   82,   81,   79,   77,
+  75,   73,   72,   70,   68,   66,   65,   63,   61,   60,   58,   56,   55,   53,   51,   50,
+  48,   46,   45,   43,   41,   40,   38,   37,   35,   33,   32,   30,   29,   27,   25,   24,
+  22,   21,   19,   18,   16,   15,   13,   12,   10,    9,    7,    6,    4,    3,    1,   1
+};
+
+void vp9_start_encode(BOOL_CODER *br, unsigned char *source) {
+
+  br->lowvalue = 0;
+  br->range    = 255;
+  br->value    = 0;
+  br->count    = -24;
+  br->buffer   = source;
+  br->pos      = 0;
+}
+
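+// Flush the bool coder by writing 32 trailing zero bits at probability 1/2.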
+void vp9_stop_encode(BOOL_CODER *br) {
+  int i;
+
+  for (i = 0; i < 32; i++)
+    encode_bool(br, 0, 128);
+}
+
+
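+// Write 'bits' bits of 'data', most significant bit first, as raw
+// (probability one-half) bits.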
+void vp9_encode_value(BOOL_CODER *br, int data, int bits) {
+  int bit;
+
+  for (bit = bits - 1; bit >= 0; bit--)
+    encode_bool(br, (1 & (data >> bit)), 0x80);
+}
+
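+// Recenter v around a prediction m so that values close to m map to small
+// non-negative codes: v > 2m is left as-is; otherwise |v - m| is folded
+// into alternating even (v >= m) and odd (v < m) values.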
+int vp9_recenter_nonneg(int v, int m) {
+  if (v > (m << 1)) return v;
+  else if (v >= m) return ((v - m) << 1);
+  else return ((m - v) << 1) - 1;
+}
+
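+// Number of bits needed to represent values in [0, num_values), i.e.
+// ceil(log2(num_values)).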
+static int get_unsigned_bits(unsigned num_values) {
+  int cat = 0;
+  if ((num_values--) <= 1) return 0;
+  while (num_values > 0) {
+    cat++;
+    num_values >>= 1;
+  }
+  return cat;
+}
+
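+// Quasi-uniform code for v in [0, n): with l = ceil(log2(n)) and
+// m = 2^l - n, the first m values take l - 1 bits and the rest take l bits.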
+void vp9_encode_uniform(BOOL_CODER *br, int v, int n) {
+  int l = get_unsigned_bits(n);
+  int m;
+  if (l == 0) return;
+  m = (1 << l) - n;
+  if (v < m)
+    vp9_encode_value(br, v, l - 1);
+  else {
+    vp9_encode_value(br, m + ((v - m) >> 1), l - 1);
+    vp9_encode_value(br, (v - m) & 1, 1);
+  }
+}
+
+int vp9_count_uniform(int v, int n) {
+  int l = get_unsigned_bits(n);
+  int m;
+  if (l == 0) return 0;
+  m = (1 << l) - n;
+  if (v < m)
+    return l - 1;
+  else
+    return l;
+}
+
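+// Terminated subexponential code: a unary prefix selects geometrically
+// growing value ranges of size 2^k, 2^k, 2^(k+1), ...; once the remaining
+// symbols fit within three ranges, the tail is coded with the
+// quasi-uniform code above.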
+void vp9_encode_term_subexp(BOOL_CODER *br, int word, int k, int num_syms) {
+  int i = 0;
+  int mk = 0;
+  while (1) {
+    int b = (i ? k + i - 1 : k);
+    int a = (1 << b);
+    if (num_syms <= mk + 3 * a) {
+      vp9_encode_uniform(br, word - mk, num_syms - mk);
+      break;
+    } else {
+      int t = (word >= mk + a);
+      vp9_encode_value(br, t, 1);
+      if (t) {
+        i = i + 1;
+        mk += a;
+      } else {
+        vp9_encode_value(br, word - mk, b);
+        break;
+      }
+    }
+  }
+}
+
+int vp9_count_term_subexp(int word, int k, int num_syms) {
+  int count = 0;
+  int i = 0;
+  int mk = 0;
+  while (1) {
+    int b = (i ? k + i - 1 : k);
+    int a = (1 << b);
+    if (num_syms <= mk + 3 * a) {
+      count += vp9_count_uniform(word - mk, num_syms - mk);
+      break;
+    } else {
+      int t = (word >= mk + a);
+      count++;
+      if (t) {
+        i = i + 1;
+        mk += a;
+      } else {
+        count += b;
+        break;
+      }
+    }
+  }
+  return count;
+}
--- /dev/null
+++ b/vp9/encoder/boolhuff.h
@@ -1,0 +1,111 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+*   Module Title :     boolhuff.h
+*
+*   Description  :     Bool Coder header file.
+*
+****************************************************************************/
+#ifndef __INC_BOOLHUFF_H
+#define __INC_BOOLHUFF_H
+
+#include "vpx_ports/mem.h"
+
+typedef struct {
+  unsigned int lowvalue;
+  unsigned int range;
+  unsigned int value;
+  int count;
+  unsigned int pos;
+  unsigned char *buffer;
+
+  // Variables used to track bit costs without outputting to the bitstream
+  unsigned int  measure_cost;
+  unsigned long bit_counter;
+} BOOL_CODER;
+
+extern void vp9_start_encode(BOOL_CODER *bc, unsigned char *buffer);
+
+extern void vp9_encode_value(BOOL_CODER *br, int data, int bits);
+extern void vp9_stop_encode(BOOL_CODER *bc);
+extern const unsigned int vp9_prob_cost[256];
+
+extern void vp9_encode_uniform(BOOL_CODER *bc, int v, int n);
+extern void vp9_encode_term_subexp(BOOL_CODER *bc, int v, int k, int n);
+extern int vp9_count_uniform(int v, int n);
+extern int vp9_count_term_subexp(int v, int k, int n);
+extern int vp9_recenter_nonneg(int v, int m);
+
+DECLARE_ALIGNED(16, extern const unsigned char, vp9_norm[256]);
+
+
+static void encode_bool(BOOL_CODER *br, int bit, int probability) {
+  unsigned int split;
+  int count = br->count;
+  unsigned int range = br->range;
+  unsigned int lowvalue = br->lowvalue;
+  register unsigned int shift;
+
+#ifdef ENTROPY_STATS
+#if defined(SECTIONBITS_OUTPUT)
+
+  if (bit)
+    Sectionbits[active_section] += vp9_prob_cost[255 - probability];
+  else
+    Sectionbits[active_section] += vp9_prob_cost[probability];
+
+#endif
+#endif
+
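+  // Split the coding range in proportion to the probability of a zero bit.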
+  split = 1 + (((range - 1) * probability) >> 8);
+
+  range = split;
+
+  if (bit) {
+    lowvalue += split;
+    range = br->range - split;
+  }
+
+  shift = vp9_norm[range];
+
+  range <<= shift;
+  count += shift;
+
+  if (count >= 0) {
+    int offset = shift - count;
+
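+    // If the pending low value would carry out of the top byte, propagate
+    // the carry back through any 0xff bytes already in the buffer.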
+    if ((lowvalue << (offset - 1)) & 0x80000000) {
+      int x = br->pos - 1;
+
+      while (x >= 0 && br->buffer[x] == 0xff) {
+        br->buffer[x] = (unsigned char)0;
+        x--;
+      }
+
+      br->buffer[x] += 1;
+    }
+
+    br->buffer[br->pos++] = (lowvalue >> (24 - offset));
+    lowvalue <<= offset;
+    shift = count;
+    lowvalue &= 0xffffff;
+    count -= 8;
+  }
+
+  lowvalue <<= shift;
+  br->count = count;
+  br->lowvalue = lowvalue;
+  br->range = range;
+}
+
+#endif
--- /dev/null
+++ b/vp9/encoder/dct.c
@@ -1,0 +1,1109 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <assert.h>
+#include <math.h>
+#include "vpx_ports/config.h"
+#include "vp9/common/idct.h"
+#include "vp9/common/systemdependent.h"
+
+#include "vp9/common/blockd.h"
+
+// TODO: these transforms can be converted into integer forms to reduce
+//       the complexity
+static const float dct_4[16] = {
+  0.500000000000000,  0.500000000000000,  0.500000000000000,  0.500000000000000,
+  0.653281482438188,  0.270598050073099, -0.270598050073099, -0.653281482438188,
+  0.500000000000000, -0.500000000000000, -0.500000000000000,  0.500000000000000,
+  0.270598050073099, -0.653281482438188,  0.653281482438188, -0.270598050073099
+};
+
+static const float adst_4[16] = {
+  0.228013428883779,  0.428525073124360,  0.577350269189626,  0.656538502008139,
+  0.577350269189626,  0.577350269189626,  0.000000000000000, -0.577350269189626,
+  0.656538502008139, -0.228013428883779, -0.577350269189626,  0.428525073124359,
+  0.428525073124360, -0.656538502008139,  0.577350269189626, -0.228013428883779
+};
+
+static const float dct_8[64] = {
+  0.353553390593274,   0.353553390593274,   0.353553390593274,   0.353553390593274,
+  0.353553390593274,   0.353553390593274,   0.353553390593274,   0.353553390593274,
+  0.490392640201615,   0.415734806151273,   0.277785116509801,   0.097545161008064,
+ -0.097545161008064,  -0.277785116509801,  -0.415734806151273,  -0.490392640201615,
+  0.461939766255643,   0.191341716182545,  -0.191341716182545,  -0.461939766255643,
+ -0.461939766255643,  -0.191341716182545,   0.191341716182545,   0.461939766255643,
+  0.415734806151273,  -0.097545161008064,  -0.490392640201615,  -0.277785116509801,
+  0.277785116509801,   0.490392640201615,   0.097545161008064,  -0.415734806151273,
+  0.353553390593274,  -0.353553390593274,  -0.353553390593274,   0.353553390593274,
+  0.353553390593274,  -0.353553390593274,  -0.353553390593274,   0.353553390593274,
+  0.277785116509801,  -0.490392640201615,   0.097545161008064,   0.415734806151273,
+ -0.415734806151273,  -0.097545161008064,   0.490392640201615,  -0.277785116509801,
+  0.191341716182545,  -0.461939766255643,   0.461939766255643,  -0.191341716182545,
+ -0.191341716182545,   0.461939766255643,  -0.461939766255643,   0.191341716182545,
+  0.097545161008064,  -0.277785116509801,   0.415734806151273,  -0.490392640201615,
+  0.490392640201615,  -0.415734806151273,   0.277785116509801,  -0.097545161008064
+};
+
+static const float adst_8[64] = {
+  0.089131608307533,   0.175227946595735,   0.255357107325376,   0.326790388032145,
+  0.387095214016349,   0.434217976756762,   0.466553967085785,   0.483002021635509,
+  0.255357107325376,   0.434217976756762,   0.483002021635509,   0.387095214016349,
+  0.175227946595735,  -0.089131608307533,  -0.326790388032145,  -0.466553967085785,
+  0.387095214016349,   0.466553967085785,   0.175227946595735,  -0.255357107325376,
+ -0.483002021635509,  -0.326790388032145,   0.089131608307533,   0.434217976756762,
+  0.466553967085785,   0.255357107325376,  -0.326790388032145,  -0.434217976756762,
+  0.089131608307533,   0.483002021635509,   0.175227946595735,  -0.387095214016348,
+  0.483002021635509,  -0.089131608307533,  -0.466553967085785,   0.175227946595735,
+  0.434217976756762,  -0.255357107325376,  -0.387095214016348,   0.326790388032145,
+  0.434217976756762,  -0.387095214016348,  -0.089131608307533,   0.466553967085786,
+ -0.326790388032145,  -0.175227946595735,   0.483002021635509,  -0.255357107325375,
+  0.326790388032145,  -0.483002021635509,   0.387095214016349,  -0.089131608307534,
+ -0.255357107325377,   0.466553967085785,  -0.434217976756762,   0.175227946595736,
+  0.175227946595735,  -0.326790388032145,   0.434217976756762,  -0.483002021635509,
+  0.466553967085785,  -0.387095214016348,   0.255357107325376,  -0.089131608307532
+};
+
+/* Integer (Q15) versions of the floating-point transforms above. */
+static const int16_t dct_i4[16] = {
+  16384,  16384,  16384,  16384,
+  21407,   8867,  -8867, -21407,
+  16384, -16384, -16384,  16384,
+   8867, -21407,  21407,  -8867
+};
+
+static const int16_t adst_i4[16] = {
+   7472,  14042,  18919,  21513,
+  18919,  18919,      0, -18919,
+  21513,  -7472, -18919,  14042,
+  14042, -21513,  18919,  -7472
+};
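+
+// Note: the integer tables appear to be the floating-point matrices above in
+// Q15 fixed point, i.e. round(f * 32768): 0.5 -> 16384, 0.228013 -> 7472.
+// A sketch of how such a table could be regenerated (hypothetical helper,
+// not part of this change):
+#if 0
+static void float_to_q15(const float *in, int16_t *out, int n) {
+  int i;
+  for (i = 0; i < n; i++)  /* round to nearest, away from zero */
+    out[i] = (int16_t)(in[i] * 32768.0f + (in[i] >= 0 ? 0.5f : -0.5f));
+}
+#endif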
+
+static const int16_t dct_i8[64] = {
+   11585,  11585,  11585,  11585,
+   11585,  11585,  11585,  11585,
+   16069,  13623,   9102,   3196,
+   -3196,  -9102, -13623, -16069,
+   15137,   6270,  -6270, -15137,
+  -15137,  -6270,   6270,  15137,
+   13623,  -3196, -16069,  -9102,
+    9102,  16069,   3196, -13623,
+   11585, -11585, -11585,  11585,
+   11585, -11585, -11585,  11585,
+    9102, -16069,   3196,  13623,
+  -13623,  -3196,  16069,  -9102,
+    6270, -15137,  15137,  -6270,
+   -6270,  15137, -15137,   6270,
+    3196,  -9102,  13623, -16069,
+   16069, -13623,   9102,  -3196
+};
+
+static const int16_t adst_i8[64] = {
+    2921,   5742,   8368,  10708,
+   12684,  14228,  15288,  15827,
+    8368,  14228,  15827,  12684,
+    5742,  -2921, -10708, -15288,
+   12684,  15288,   5742,  -8368,
+  -15827, -10708,   2921,  14228,
+   15288,   8368, -10708, -14228,
+    2921,  15827,   5742, -12684,
+   15827,  -2921, -15288,   5742,
+   14228,  -8368, -12684,  10708,
+   14228, -12684,  -2921,  15288,
+  -10708,  -5742,  15827,  -8368,
+   10708, -15827,  12684,  -2921,
+   -8368,  15288, -14228,   5742,
+    5742, -10708,  14228, -15827,
+   15288, -12684,   8368,  -2921
+};
+
+static const float dct_16[256] = {
+  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,
+  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,
+  0.351851,  0.338330,  0.311806,  0.273300,  0.224292,  0.166664,  0.102631,  0.034654,
+ -0.034654, -0.102631, -0.166664, -0.224292, -0.273300, -0.311806, -0.338330, -0.351851,
+  0.346760,  0.293969,  0.196424,  0.068975, -0.068975, -0.196424, -0.293969, -0.346760,
+ -0.346760, -0.293969, -0.196424, -0.068975,  0.068975,  0.196424,  0.293969,  0.346760,
+  0.338330,  0.224292,  0.034654, -0.166664, -0.311806, -0.351851, -0.273300, -0.102631,
+  0.102631,  0.273300,  0.351851,  0.311806,  0.166664, -0.034654, -0.224292, -0.338330,
+  0.326641,  0.135299, -0.135299, -0.326641, -0.326641, -0.135299,  0.135299,  0.326641,
+  0.326641,  0.135299, -0.135299, -0.326641, -0.326641, -0.135299,  0.135299,  0.326641,
+  0.311806,  0.034654, -0.273300, -0.338330, -0.102631,  0.224292,  0.351851,  0.166664,
+ -0.166664, -0.351851, -0.224292,  0.102631,  0.338330,  0.273300, -0.034654, -0.311806,
+  0.293969, -0.068975, -0.346760, -0.196424,  0.196424,  0.346760,  0.068975, -0.293969,
+ -0.293969,  0.068975,  0.346760,  0.196424, -0.196424, -0.346760, -0.068975,  0.293969,
+  0.273300, -0.166664, -0.338330,  0.034654,  0.351851,  0.102631, -0.311806, -0.224292,
+  0.224292,  0.311806, -0.102631, -0.351851, -0.034654,  0.338330,  0.166664, -0.273300,
+  0.250000, -0.250000, -0.250000,  0.250000,  0.250000, -0.250000, -0.250000,  0.250000,
+  0.250000, -0.250000, -0.250000,  0.250000,  0.250000, -0.250000, -0.250000,  0.250000,
+  0.224292, -0.311806, -0.102631,  0.351851, -0.034654, -0.338330,  0.166664,  0.273300,
+ -0.273300, -0.166664,  0.338330,  0.034654, -0.351851,  0.102631,  0.311806, -0.224292,
+  0.196424, -0.346760,  0.068975,  0.293969, -0.293969, -0.068975,  0.346760, -0.196424,
+ -0.196424,  0.346760, -0.068975, -0.293969,  0.293969,  0.068975, -0.346760,  0.196424,
+  0.166664, -0.351851,  0.224292,  0.102631, -0.338330,  0.273300,  0.034654, -0.311806,
+  0.311806, -0.034654, -0.273300,  0.338330, -0.102631, -0.224292,  0.351851, -0.166664,
+  0.135299, -0.326641,  0.326641, -0.135299, -0.135299,  0.326641, -0.326641,  0.135299,
+  0.135299, -0.326641,  0.326641, -0.135299, -0.135299,  0.326641, -0.326641,  0.135299,
+  0.102631, -0.273300,  0.351851, -0.311806,  0.166664,  0.034654, -0.224292,  0.338330,
+ -0.338330,  0.224292, -0.034654, -0.166664,  0.311806, -0.351851,  0.273300, -0.102631,
+  0.068975, -0.196424,  0.293969, -0.346760,  0.346760, -0.293969,  0.196424, -0.068975,
+ -0.068975,  0.196424, -0.293969,  0.346760, -0.346760,  0.293969, -0.196424,  0.068975,
+  0.034654, -0.102631,  0.166664, -0.224292,  0.273300, -0.311806,  0.338330, -0.351851,
+  0.351851, -0.338330,  0.311806, -0.273300,  0.224292, -0.166664,  0.102631, -0.034654
+};
+
+static const float adst_16[256] = {
+  0.033094,  0.065889,  0.098087,  0.129396,  0.159534,  0.188227,  0.215215,  0.240255,
+  0.263118,  0.283599,  0.301511,  0.316693,  0.329007,  0.338341,  0.344612,  0.347761,
+  0.098087,  0.188227,  0.263118,  0.316693,  0.344612,  0.344612,  0.316693,  0.263118,
+  0.188227,  0.098087,  0.000000, -0.098087, -0.188227, -0.263118, -0.316693, -0.344612,
+  0.159534,  0.283599,  0.344612,  0.329007,  0.240255,  0.098087, -0.065889, -0.215215,
+ -0.316693, -0.347761, -0.301511, -0.188227, -0.033094,  0.129396,  0.263118,  0.338341,
+  0.215215,  0.338341,  0.316693,  0.159534, -0.065889, -0.263118, -0.347761, -0.283599,
+ -0.098087,  0.129396,  0.301511,  0.344612,  0.240255,  0.033094, -0.188227, -0.329007,
+  0.263118,  0.344612,  0.188227, -0.098087, -0.316693, -0.316693, -0.098087,  0.188227,
+  0.344612,  0.263118,  0.000000, -0.263118, -0.344612, -0.188227,  0.098087,  0.316693,
+  0.301511,  0.301511,  0.000000, -0.301511, -0.301511, -0.000000,  0.301511,  0.301511,
+  0.000000, -0.301511, -0.301511, -0.000000,  0.301511,  0.301511,  0.000000, -0.301511,
+  0.329007,  0.215215, -0.188227, -0.338341, -0.033094,  0.316693,  0.240255, -0.159534,
+ -0.344612, -0.065889,  0.301511,  0.263118, -0.129396, -0.347761, -0.098087,  0.283599,
+  0.344612,  0.098087, -0.316693, -0.188227,  0.263118,  0.263118, -0.188227, -0.316693,
+  0.098087,  0.344612,  0.000000, -0.344612, -0.098087,  0.316693,  0.188227, -0.263118,
+  0.347761, -0.033094, -0.344612,  0.065889,  0.338341, -0.098087, -0.329007,  0.129396,
+  0.316693, -0.159534, -0.301511,  0.188227,  0.283599, -0.215215, -0.263118,  0.240255,
+  0.338341, -0.159534, -0.263118,  0.283599,  0.129396, -0.344612,  0.033094,  0.329007,
+ -0.188227, -0.240255,  0.301511,  0.098087, -0.347761,  0.065889,  0.316693, -0.215215,
+  0.316693, -0.263118, -0.098087,  0.344612, -0.188227, -0.188227,  0.344612, -0.098087,
+ -0.263118,  0.316693,  0.000000, -0.316693,  0.263118,  0.098087, -0.344612,  0.188227,
+  0.283599, -0.329007,  0.098087,  0.215215, -0.347761,  0.188227,  0.129396, -0.338341,
+  0.263118,  0.033094, -0.301511,  0.316693, -0.065889, -0.240255,  0.344612, -0.159534,
+  0.240255, -0.347761,  0.263118, -0.033094, -0.215215,  0.344612, -0.283599,  0.065889,
+  0.188227, -0.338341,  0.301511, -0.098087, -0.159534,  0.329007, -0.316693,  0.129396,
+  0.188227, -0.316693,  0.344612, -0.263118,  0.098087,  0.098087, -0.263118,  0.344612,
+ -0.316693,  0.188227,  0.000000, -0.188227,  0.316693, -0.344612,  0.263118, -0.098087,
+  0.129396, -0.240255,  0.316693, -0.347761,  0.329007, -0.263118,  0.159534, -0.033094,
+ -0.098087,  0.215215, -0.301511,  0.344612, -0.338341,  0.283599, -0.188227,  0.065889,
+  0.065889, -0.129396,  0.188227, -0.240255,  0.283599, -0.316693,  0.338341, -0.347761,
+  0.344612, -0.329007,  0.301511, -0.263118,  0.215215, -0.159534,  0.098087, -0.033094
+};
+
+/* Integer (Q15) versions of the 16x16 transforms above. */
+static const int16_t dct_i16[256] = {
+    8192,   8192,   8192,   8192,   8192,   8192,   8192,   8192,
+    8192,   8192,   8192,   8192,   8192,   8192,   8192,   8192,
+   11529,  11086,  10217,   8955,   7350,   5461,   3363,   1136,
+   -1136,  -3363,  -5461,  -7350,  -8955, -10217, -11086, -11529,
+   11363,   9633,   6436,   2260,  -2260,  -6436,  -9633, -11363,
+  -11363,  -9633,  -6436,  -2260,   2260,   6436,   9633,  11363,
+   11086,   7350,   1136,  -5461, -10217, -11529,  -8955,  -3363,
+    3363,   8955,  11529,  10217,   5461,  -1136,  -7350, -11086,
+   10703,   4433,  -4433, -10703, -10703,  -4433,   4433,  10703,
+   10703,   4433,  -4433, -10703, -10703,  -4433,   4433,  10703,
+   10217,   1136,  -8955, -11086,  -3363,   7350,  11529,   5461,
+   -5461, -11529,  -7350,   3363,  11086,   8955,  -1136, -10217,
+    9633,  -2260, -11363,  -6436,   6436,  11363,   2260,  -9633,
+   -9633,   2260,  11363,   6436,  -6436, -11363,  -2260,   9633,
+    8955,  -5461, -11086,   1136,  11529,   3363, -10217,  -7350,
+    7350,  10217,  -3363, -11529,  -1136,  11086,   5461,  -8955,
+    8192,  -8192,  -8192,   8192,   8192,  -8192,  -8192,   8192,
+    8192,  -8192,  -8192,   8192,   8192,  -8192,  -8192,   8192,
+    7350, -10217,  -3363,  11529,  -1136, -11086,   5461,   8955,
+   -8955,  -5461,  11086,   1136, -11529,   3363,  10217,  -7350,
+    6436, -11363,   2260,   9633,  -9633,  -2260,  11363,  -6436,
+   -6436,  11363,  -2260,  -9633,   9633,   2260, -11363,   6436,
+    5461, -11529,   7350,   3363, -11086,   8955,   1136, -10217,
+   10217,  -1136,  -8955,  11086,  -3363,  -7350,  11529,  -5461,
+    4433, -10703,  10703,  -4433,  -4433,  10703, -10703,   4433,
+    4433, -10703,  10703,  -4433,  -4433,  10703, -10703,   4433,
+    3363,  -8955,  11529, -10217,   5461,   1136,  -7350,  11086,
+  -11086,   7350,  -1136,  -5461,  10217, -11529,   8955,  -3363,
+    2260,  -6436,   9633, -11363,  11363,  -9633,   6436,  -2260,
+   -2260,   6436,  -9633,  11363, -11363,   9633,  -6436,   2260,
+    1136,  -3363,   5461,  -7350,   8955, -10217,  11086, -11529,
+   11529, -11086,  10217,  -8955,   7350,  -5461,   3363,  -1136
+};
+
+static const int16_t adst_i16[256] = {
+    1084,   2159,   3214,   4240,   5228,   6168,   7052,   7873,
+    8622,   9293,   9880,  10377,  10781,  11087,  11292,  11395,
+    3214,   6168,   8622,  10377,  11292,  11292,  10377,   8622,
+    6168,   3214,      0,  -3214,  -6168,  -8622, -10377, -11292,
+    5228,   9293,  11292,  10781,   7873,   3214,  -2159,  -7052,
+  -10377, -11395,  -9880,  -6168,  -1084,   4240,   8622,  11087,
+    7052,  11087,  10377,   5228,  -2159,  -8622, -11395,  -9293,
+   -3214,   4240,   9880,  11292,   7873,   1084,  -6168, -10781,
+    8622,  11292,   6168,  -3214, -10377, -10377,  -3214,   6168,
+   11292,   8622,      0,  -8622, -11292,  -6168,   3214,  10377,
+    9880,   9880,      0,  -9880,  -9880,      0,   9880,   9880,
+       0,  -9880,  -9880,      0,   9880,   9880,      0,  -9880,
+   10781,   7052,  -6168, -11087,  -1084,  10377,   7873,  -5228,
+  -11292,  -2159,   9880,   8622,  -4240, -11395,  -3214,   9293,
+   11292,   3214, -10377,  -6168,   8622,   8622,  -6168, -10377,
+    3214,  11292,      0, -11292,  -3214,  10377,   6168,  -8622,
+   11395,  -1084, -11292,   2159,  11087,  -3214, -10781,   4240,
+   10377,  -5228,  -9880,   6168,   9293,  -7052,  -8622,   7873,
+   11087,  -5228,  -8622,   9293,   4240, -11292,   1084,  10781,
+   -6168,  -7873,   9880,   3214, -11395,   2159,  10377,  -7052,
+   10377,  -8622,  -3214,  11292,  -6168,  -6168,  11292,  -3214,
+   -8622,  10377,      0, -10377,   8622,   3214, -11292,   6168,
+    9293, -10781,   3214,   7052, -11395,   6168,   4240, -11087,
+    8622,   1084,  -9880,  10377,  -2159,  -7873,  11292,  -5228,
+    7873, -11395,   8622,  -1084,  -7052,  11292,  -9293,   2159,
+    6168, -11087,   9880,  -3214,  -5228,  10781, -10377,   4240,
+    6168, -10377,  11292,  -8622,   3214,   3214,  -8622,  11292,
+  -10377,   6168,      0,  -6168,  10377, -11292,   8622,  -3214,
+    4240,  -7873,  10377, -11395,  10781,  -8622,   5228,  -1084,
+   -3214,   7052,  -9880,  11292, -11087,   9293,  -6168,   2159,
+    2159,  -4240,   6168,  -7873,   9293, -10377,  11087, -11395,
+   11292, -10781,   9880,  -8622,   7052,  -5228,   3214,  -1084
+};
+
+static const int xC1S7 = 16069;
+static const int xC2S6 = 15137;
+static const int xC3S5 = 13623;
+static const int xC4S4 = 11585;
+static const int xC5S3 =  9102;
+static const int xC6S2 =  6270;
+static const int xC7S1 =  3196;
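+
+// These constants appear to be cos(k * pi / 16) in Q14, with the names read
+// as xCkS(8-k) since cos(k * pi / 16) == sin((8 - k) * pi / 16); e.g.
+// xC4S4 = round(cos(pi/4) * 16384) = 11585.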
+
+#define SHIFT_BITS 14
+#define DOROUND(X) X += (1<<(SHIFT_BITS-1));
+
+#define FINAL_SHIFT 3
+#define FINAL_ROUNDING (1<<(FINAL_SHIFT -1))
+#define IN_SHIFT (FINAL_SHIFT+1)
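+
+// DOROUND adds half of the divisor before the arithmetic right shift, i.e.
+// a round-to-nearest Q14 rescale. A minimal standalone sketch of the idiom
+// (hypothetical helper, not part of this change):
+#if 0
+static int mul_q14(int x, int c_q14) {
+  int p = x * c_q14;            /* product carries a 2^14 scale factor */
+  p += 1 << (SHIFT_BITS - 1);   /* add 0.5 ulp so the shift rounds */
+  return p >> SHIFT_BITS;       /* back to the input scale */
+}
+#endif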
+
+
+void vp9_short_fdct8x8_c(short *InputData, short *OutputData, int pitch) {
+  int loop;
+  int short_pitch = pitch >> 1;
+  int is07, is12, is34, is56;
+  int is0734, is1256;
+  int id07, id12, id34, id56;
+  int irot_input_x, irot_input_y;
+  int icommon_product1;      // Re-used product  (c4s4 * (s12 - s56))
+  int icommon_product2;      // Re-used product  (c4s4 * (d12 + d56))
+  int temp1, temp2;          // intermediate variable for computation
+
+  int  InterData[64];
+  int  *ip = InterData;
+  short *op = OutputData;
+
+  for (loop = 0; loop < 8; loop++) {
+    // Precalculate some common sums and differences.
+    is07 = (InputData[0] + InputData[7]) << IN_SHIFT;
+    is12 = (InputData[1] + InputData[2]) << IN_SHIFT;
+    is34 = (InputData[3] + InputData[4]) << IN_SHIFT;
+    is56 = (InputData[5] + InputData[6]) << IN_SHIFT;
+    id07 = (InputData[0] - InputData[7]) << IN_SHIFT;
+    id12 = (InputData[1] - InputData[2]) << IN_SHIFT;
+    id34 = (InputData[3] - InputData[4]) << IN_SHIFT;
+    id56 = (InputData[5] - InputData[6]) << IN_SHIFT;
+
+    is0734 = is07 + is34;
+    is1256 = is12 + is56;
+
+    // Precalculate some common product terms.
+    icommon_product1 = xC4S4 * (is12 - is56);
+    DOROUND(icommon_product1)
+    icommon_product1 >>= SHIFT_BITS;
+
+    icommon_product2 = xC4S4 * (id12 + id56);
+    DOROUND(icommon_product2)
+    icommon_product2 >>= SHIFT_BITS;
+
+
+    ip[0] = (xC4S4 * (is0734 + is1256));
+    DOROUND(ip[0]);
+    ip[0] >>= SHIFT_BITS;
+
+    ip[4] = (xC4S4 * (is0734 - is1256));
+    DOROUND(ip[4]);
+    ip[4] >>= SHIFT_BITS;
+
+    // Define inputs to rotation for outputs 2 and 6
+    irot_input_x = id12 - id56;
+    irot_input_y = is07 - is34;
+
+    // Apply rotation for outputs 2 and 6.
+    temp1 = xC6S2 * irot_input_x;
+    DOROUND(temp1);
+    temp1 >>= SHIFT_BITS;
+    temp2 = xC2S6 * irot_input_y;
+    DOROUND(temp2);
+    temp2 >>= SHIFT_BITS;
+    ip[2] = temp1 + temp2;
+
+    temp1 = xC6S2 * irot_input_y;
+    DOROUND(temp1);
+    temp1 >>= SHIFT_BITS;
+    temp2 = xC2S6 * irot_input_x;
+    DOROUND(temp2);
+    temp2 >>= SHIFT_BITS;
+    ip[6] = temp1 - temp2;
+
+    // Define inputs to rotation for outputs 1 and 7
+    irot_input_x = icommon_product1 + id07;
+    irot_input_y = -(id34 + icommon_product2);
+
+    // Apply rotation for outputs 1 and 7.
+    temp1 = xC1S7 * irot_input_x;
+    DOROUND(temp1);
+    temp1 >>= SHIFT_BITS;
+    temp2 = xC7S1 * irot_input_y;
+    DOROUND(temp2);
+    temp2 >>= SHIFT_BITS;
+    ip[1] = temp1 - temp2;
+
+    temp1 = xC7S1 * irot_input_x;
+    DOROUND(temp1);
+    temp1 >>= SHIFT_BITS;
+    temp2 = xC1S7 * irot_input_y;
+    DOROUND(temp2);
+    temp2 >>= SHIFT_BITS;
+    ip[7] = temp1 + temp2;
+
+    // Define inputs to rotation for outputs 3 and 5
+    irot_input_x = id07 - icommon_product1;
+    irot_input_y = id34 - icommon_product2;
+
+    // Apply rotation for outputs 3 and 5.
+    temp1 = xC3S5 * irot_input_x;
+    DOROUND(temp1);
+    temp1 >>= SHIFT_BITS;
+    temp2 = xC5S3 * irot_input_y;
+    DOROUND(temp2);
+    temp2 >>= SHIFT_BITS;
+    ip[3] = temp1 - temp2;
+
+
+    temp1 = xC5S3 * irot_input_x;
+    DOROUND(temp1);
+    temp1 >>= SHIFT_BITS;
+    temp2 = xC3S5 * irot_input_y;
+    DOROUND(temp2);
+    temp2 >>= SHIFT_BITS;
+    ip[5] = temp1 + temp2;
+
+    // Increment data pointer for next row
+    InputData += short_pitch;
+    ip += 8;
+  }
+
+  // The rows have been transformed; now apply the DCT to the columns.
+  ip = InterData;
+  for (loop = 0; loop < 8; loop++) {
+    // Precalculate some common sums and differences.
+    is07 = ip[0 * 8] + ip[7 * 8];
+    is12 = ip[1 * 8] + ip[2 * 8];
+    is34 = ip[3 * 8] + ip[4 * 8];
+    is56 = ip[5 * 8] + ip[6 * 8];
+
+    id07 = ip[0 * 8] - ip[7 * 8];
+    id12 = ip[1 * 8] - ip[2 * 8];
+    id34 = ip[3 * 8] - ip[4 * 8];
+    id56 = ip[5 * 8] - ip[6 * 8];
+
+    is0734 = is07 + is34;
+    is1256 = is12 + is56;
+
+    // Precalculate some common product terms.
+    icommon_product1 = xC4S4 * (is12 - is56);
+    icommon_product2 = xC4S4 * (id12 + id56);
+    DOROUND(icommon_product1)
+    DOROUND(icommon_product2)
+    icommon_product1 >>= SHIFT_BITS;
+    icommon_product2 >>= SHIFT_BITS;
+
+
+    temp1 = xC4S4 * (is0734 + is1256);
+    temp2 = xC4S4 * (is0734 - is1256);
+    DOROUND(temp1);
+    DOROUND(temp2);
+    temp1 >>= SHIFT_BITS;
+
+    temp2 >>= SHIFT_BITS;
+    op[0 * 8] = (temp1 + FINAL_ROUNDING) >> FINAL_SHIFT;
+    op[4 * 8] = (temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
+
+    // Define inputs to rotation for outputs 2 and 6
+    irot_input_x = id12 - id56;
+    irot_input_y = is07 - is34;
+
+    // Apply rotation for outputs 2 and 6.
+    temp1 = xC6S2 * irot_input_x;
+    DOROUND(temp1);
+    temp1 >>= SHIFT_BITS;
+    temp2 = xC2S6 * irot_input_y;
+    DOROUND(temp2);
+    temp2 >>= SHIFT_BITS;
+    op[2 * 8] = (temp1 + temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
+
+    temp1 = xC6S2 * irot_input_y;
+    DOROUND(temp1);
+    temp1 >>= SHIFT_BITS;
+    temp2 = xC2S6 * irot_input_x;
+    DOROUND(temp2);
+    temp2 >>= SHIFT_BITS;
+    op[6 * 8] = (temp1 - temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
+
+    // Define inputs to rotation for outputs 1 and 7
+    irot_input_x = icommon_product1 + id07;
+    irot_input_y = -(id34 + icommon_product2);
+
+    // Apply rotation for outputs 1 and 7.
+    temp1 = xC1S7 * irot_input_x;
+    DOROUND(temp1);
+    temp1 >>= SHIFT_BITS;
+    temp2 = xC7S1 * irot_input_y;
+    DOROUND(temp2);
+    temp2 >>= SHIFT_BITS;
+    op[1 * 8] = (temp1 - temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
+
+    temp1 = xC7S1 * irot_input_x;
+    DOROUND(temp1);
+    temp1 >>= SHIFT_BITS;
+    temp2 = xC1S7 * irot_input_y;
+    DOROUND(temp2);
+    temp2 >>= SHIFT_BITS;
+    op[7 * 8] = (temp1 + temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
+
+    // Define inputs to rotation for outputs 3 and 5
+    irot_input_x = id07 - icommon_product1;
+    irot_input_y = id34 - icommon_product2;
+
+    // Apply rotation for outputs 3 and 5.
+    temp1 = xC3S5 * irot_input_x;
+    DOROUND(temp1);
+    temp1 >>= SHIFT_BITS;
+    temp2 = xC5S3 * irot_input_y;
+    DOROUND(temp2);
+    temp2 >>= SHIFT_BITS;
+    op[3 * 8] = (temp1 - temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
+
+
+    temp1 = xC5S3 * irot_input_x;
+    DOROUND(temp1);
+    temp1 >>= SHIFT_BITS;
+    temp2 = xC3S5 * irot_input_y;
+    DOROUND(temp2);
+    temp2 >>= SHIFT_BITS;
+    op[5 * 8] = (temp1 + temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
+
+    // Increment data pointer for next column.
+    ip++;
+    op++;
+  }
+}
+
+void vp9_short_fhaar2x2_c(short *input, short *output, int pitch) {
+  /* 2x2 Haar transform based on [1 1; 1 -1], applied to the values at */
+  /* positions 0, 1, 4 and 8 of the 4x4 block. */
+  int i;
+  short *ip1 = input;
+  short *op1 = output;
+  for (i = 0; i < 16; i++) {
+    op1[i] = 0;
+  }
+
+  op1[0] = (ip1[0] + ip1[1] + ip1[4] + ip1[8] + 1) >> 1;
+  op1[1] = (ip1[0] - ip1[1] + ip1[4] - ip1[8]) >> 1;
+  op1[4] = (ip1[0] + ip1[1] - ip1[4] - ip1[8]) >> 1;
+  op1[8] = (ip1[0] - ip1[1] - ip1[4] + ip1[8]) >> 1;
+}
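+
+// Worked example: for ip1[0,1,4,8] = {1, 2, 3, 4} the outputs are
+// op1[0] = (1+2+3+4+1)>>1 = 5, op1[1] = (1-2+3-4)>>1 = -1,
+// op1[4] = (1+2-3-4)>>1 = -2, op1[8] = (1-2-3+4)>>1 = 0;
+// only the DC term gets the +1 rounding before the shift.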
+
+/* Test switch: build with the integer (1) or floating-point (0) transform. */
+#define TEST_INT 1
+#if TEST_INT
+#define vp9_fht_int_c vp9_fht_c
+#else
+#define vp9_fht_float_c vp9_fht_c
+#endif
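+
+// With TEST_INT set, the define above renames vp9_fht_int_c to vp9_fht_c at
+// compile time, so callers of vp9_fht_c transparently get the integer
+// implementation; with TEST_INT unset they get the floating-point one.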
+
+void vp9_fht_float_c(const int16_t *input, int pitch, int16_t *output,
+                     TX_TYPE tx_type, int tx_dim) {
+  vp9_clear_system_state();  // Make it simd safe : __asm emms;
+  {
+    int i, j, k;
+    float bufa[256], bufb[256];  // buffers for the floating-point test;
+                                 // the implementation could be simplified
+                                 // in conjunction with the integer transform
+    const int16_t *ip = input;
+    int16_t *op = output;
+
+    float *pfa = &bufa[0];
+    float *pfb = &bufb[0];
+
+    // pointers to vertical and horizontal transforms
+    const float *ptv, *pth;
+
+    assert(tx_type != DCT_DCT);
+    // load and convert residual array into floating-point
+    for (j = 0; j < tx_dim; j++) {
+      for (i = 0; i < tx_dim; i++) {
+        pfa[i] = (float)ip[i];
+      }
+      pfa += tx_dim;
+      ip  += pitch / 2;
+    }
+
+    // vertical transformation
+    pfa = &bufa[0];
+    pfb = &bufb[0];
+
+    switch (tx_type) {
+      case ADST_ADST :
+      case ADST_DCT  :
+        ptv = (tx_dim == 4) ? &adst_4[0] :
+                              ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);
+        break;
+
+      default :
+        ptv = (tx_dim == 4) ? &dct_4[0] :
+                              ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
+        break;
+    }
+
+    for (j = 0; j < tx_dim; j++) {
+      for (i = 0; i < tx_dim; i++) {
+        pfb[i] = 0;
+        for (k = 0; k < tx_dim; k++) {
+          pfb[i] += ptv[k] * pfa[(k * tx_dim)];
+        }
+        pfa += 1;
+      }
+      pfb += tx_dim;
+      ptv += tx_dim;
+      pfa = &bufa[0];
+    }
+
+    // horizontal transformation
+    pfa = &bufa[0];
+    pfb = &bufb[0];
+
+    switch (tx_type) {
+      case ADST_ADST :
+      case  DCT_ADST :
+        pth = (tx_dim == 4) ? &adst_4[0] :
+                              ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);
+        break;
+
+      default :
+        pth = (tx_dim == 4) ? &dct_4[0] :
+                              ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
+        break;
+    }
+
+    for (j = 0; j < tx_dim; j++) {
+      for (i = 0; i < tx_dim; i++) {
+        pfa[i] = 0;
+        for (k = 0; k < tx_dim; k++) {
+          pfa[i] += pfb[k] * pth[k];
+        }
+        pth += tx_dim;
+      }
+
+      pfa += tx_dim;
+      pfb += tx_dim;
+      // pth -= tx_dim * tx_dim;
+
+      switch (tx_type) {
+        case ADST_ADST :
+        case  DCT_ADST :
+          pth = (tx_dim == 4) ? &adst_4[0] :
+                                ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);
+          break;
+
+        default :
+          pth = (tx_dim == 4) ? &dct_4[0] :
+                                ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
+          break;
+      }
+    }
+
+    // convert to short integer format and load BLOCKD buffer
+    op = output;
+    pfa = &bufa[0];
+
+    for (j = 0; j < tx_dim; j++) {
+      for (i = 0; i < tx_dim; i++) {
+        op[i] = (pfa[i] > 0) ? (int16_t)(8 * pfa[i] + 0.49) :
+                              -(int16_t)(-8 * pfa[i] + 0.49);
+      }
+      op  += tx_dim;
+      pfa += tx_dim;
+    }
+  }
+  vp9_clear_system_state();  // Make it simd safe : __asm emms;
+}
+
+/* Fixed-point implementation of the hybrid transform. */
+#define VERTICAL_SHIFT 11
+#define VERTICAL_ROUNDING ((1 << (VERTICAL_SHIFT - 1)) - 1)
+#define HORIZONTAL_SHIFT 16
+#define HORIZONTAL_ROUNDING ((1 << (HORIZONTAL_SHIFT - 1)) - 1)
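+
+// Shift accounting (assuming Q15 coefficient tables): the vertical pass keeps
+// 15 - 11 = 4 extra bits (x16), and the horizontal pass drops 16 of the
+// 15 + 4 = 19 accumulated scale bits, leaving an overall output scale of
+// 2^3 = 8 -- matching the "8 * pfa[i]" scaling of the floating-point path.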
+void vp9_fht_int_c(const int16_t *input, int pitch, int16_t *output,
+                   TX_TYPE tx_type, int tx_dim) {
+  int i, j, k;
+  int16_t imbuf[256];
+
+  const int16_t *ip = input;
+  int16_t *op = output;
+  int16_t *im = &imbuf[0];
+
+  /* pointers to vertical and horizontal transforms. */
+  const int16_t *ptv = NULL, *pth = NULL;
+
+  switch (tx_type) {
+    case ADST_ADST :
+      ptv = pth = (tx_dim == 4) ? &adst_i4[0]
+                                  : ((tx_dim == 8) ? &adst_i8[0]
+                                                     : &adst_i16[0]);
+      break;
+    case ADST_DCT  :
+      ptv = (tx_dim == 4) ? &adst_i4[0]
+                            : ((tx_dim == 8) ? &adst_i8[0] : &adst_i16[0]);
+      pth = (tx_dim == 4) ? &dct_i4[0]
+                            : ((tx_dim == 8) ? &dct_i8[0] : &dct_i16[0]);
+      break;
+    case  DCT_ADST :
+      ptv = (tx_dim == 4) ? &dct_i4[0]
+                            : ((tx_dim == 8) ? &dct_i8[0] : &dct_i16[0]);
+      pth = (tx_dim == 4) ? &adst_i4[0]
+                            : ((tx_dim == 8) ? &adst_i8[0] : &adst_i16[0]);
+      break;
+    case  DCT_DCT :
+      ptv = pth = (tx_dim == 4) ? &dct_i4[0]
+                                  : ((tx_dim == 8) ? &dct_i8[0] : &dct_i16[0]);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+
+  /* vertical transformation */
+  for (j = 0; j < tx_dim; j++) {
+    for (i = 0; i < tx_dim; i++) {
+      int temp = 0;
+
+      for (k = 0; k < tx_dim; k++) {
+        temp += ptv[k] * ip[(k * (pitch >> 1))];
+      }
+
+      im[i] = (int16_t)((temp + VERTICAL_ROUNDING) >> VERTICAL_SHIFT);
+      ip++;
+    }
+    im += tx_dim;  // advance one row in the tx_dim x tx_dim buffer
+    ptv += tx_dim;
+    ip = input;
+  }
+
+  /* horizontal transformation */
+  im = &imbuf[0];
+
+  for (j = 0; j < tx_dim; j++) {
+    const int16_t *pthc = pth;
+
+    for (i = 0; i < tx_dim; i++) {
+      int temp = 0;
+
+      for (k = 0; k < tx_dim; k++) {
+        temp += im[k] * pthc[k];
+      }
+
+      op[i] = (int16_t)((temp + HORIZONTAL_ROUNDING) >> HORIZONTAL_SHIFT);
+      pthc += tx_dim;
+    }
+
+    im += tx_dim;  // advance one row in the tx_dim x tx_dim buffer
+    op += tx_dim;
+  }
+}
+
+void vp9_short_fdct4x4_c(short *input, short *output, int pitch) {
+  int i;
+  int a1, b1, c1, d1;
+  short *ip = input;
+  short *op = output;
+
+  for (i = 0; i < 4; i++) {
+    a1 = ((ip[0] + ip[3]) << 5);
+    b1 = ((ip[1] + ip[2]) << 5);
+    c1 = ((ip[1] - ip[2]) << 5);
+    d1 = ((ip[0] - ip[3]) << 5);
+
+    op[0] = a1 + b1;
+    op[2] = a1 - b1;
+
+    op[1] = (c1 * 2217 + d1 * 5352 +  14500) >> 12;
+    op[3] = (d1 * 2217 - c1 * 5352 +   7500) >> 12;
+
+    ip += pitch / 2;
+    op += 4;
+  }
+  ip = output;
+  op = output;
+  for (i = 0; i < 4; i++) {
+    a1 = ip[0] + ip[12];
+    b1 = ip[4] + ip[8];
+    c1 = ip[4] - ip[8];
+    d1 = ip[0] - ip[12];
+
+    op[0]  = (a1 + b1 + 7) >> 4;
+    op[8]  = (a1 - b1 + 7) >> 4;
+
+    op[4]  = ((c1 * 2217 + d1 * 5352 +  12000) >> 16) + (d1 != 0);
+    op[12] = (d1 * 2217 - c1 * 5352 +  51000) >> 16;
+
+    ip++;
+    op++;
+  }
+}
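+
+// The constants 2217 and 5352 appear to be sqrt(2)*sin(pi/8) and
+// sqrt(2)*cos(pi/8) in Q12. Usage sketch (hypothetical values; pitch is in
+// bytes, i.e. twice the number of shorts per row):
+#if 0
+short residual[4 * 4] = {0};   /* 4x4 block of prediction residuals */
+short coeffs[16];
+vp9_short_fdct4x4_c(residual, coeffs, 8);   /* 4 shorts/row * 2 bytes */
+#endif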
+
+void vp9_short_fdct8x4_c(short *input, short *output, int pitch) {
+  vp9_short_fdct4x4_c(input,     output,      pitch);
+  vp9_short_fdct4x4_c(input + 4, output + 16, pitch);
+}
+
+void vp9_short_walsh4x4_c(short *input, short *output, int pitch) {
+  int i;
+  int a1, b1, c1, d1;
+  short *ip = input;
+  short *op = output;
+  int pitch_short = pitch >> 1;
+
+  for (i = 0; i < 4; i++) {
+    a1 = ip[0 * pitch_short] + ip[3 * pitch_short];
+    b1 = ip[1 * pitch_short] + ip[2 * pitch_short];
+    c1 = ip[1 * pitch_short] - ip[2 * pitch_short];
+    d1 = ip[0 * pitch_short] - ip[3 * pitch_short];
+
+    op[0] = (a1 + b1 + 1) >> 1;
+    op[4] = (c1 + d1) >> 1;
+    op[8] = (a1 - b1) >> 1;
+    op[12] = (d1 - c1) >> 1;
+
+    ip++;
+    op++;
+  }
+  ip = output;
+  op = output;
+
+  for (i = 0; i < 4; i++) {
+    a1 = ip[0] + ip[3];
+    b1 = ip[1] + ip[2];
+    c1 = ip[1] - ip[2];
+    d1 = ip[0] - ip[3];
+
+    op[0] = (a1 + b1 + 1) >> 1;
+    op[1] = (c1 + d1) >> 1;
+    op[2] = (a1 - b1) >> 1;
+    op[3] = (d1 - c1) >> 1;
+
+    ip += 4;
+    op += 4;
+  }
+}
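+
+// This is a 4x4 Walsh-Hadamard transform applied down the columns and then
+// across the rows; note that only the first output of each pass gets the +1
+// rounding term before the >> 1 scaling.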
+
+#if CONFIG_LOSSLESS
+void vp9_short_walsh4x4_lossless_c(short *input, short *output, int pitch) {
+  int i;
+  int a1, b1, c1, d1;
+  short *ip = input;
+  short *op = output;
+  int pitch_short = pitch >> 1;
+
+  for (i = 0; i < 4; i++) {
+    a1 = (ip[0 * pitch_short] + ip[3 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR;
+    b1 = (ip[1 * pitch_short] + ip[2 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR;
+    c1 = (ip[1 * pitch_short] - ip[2 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR;
+    d1 = (ip[0 * pitch_short] - ip[3 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR;
+
+    op[0] = (a1 + b1 + 1) >> 1;
+    op[4] = (c1 + d1) >> 1;
+    op[8] = (a1 - b1) >> 1;
+    op[12] = (d1 - c1) >> 1;
+
+    ip++;
+    op++;
+  }
+  ip = output;
+  op = output;
+
+  for (i = 0; i < 4; i++) {
+    a1 = ip[0] + ip[3];
+    b1 = ip[1] + ip[2];
+    c1 = ip[1] - ip[2];
+    d1 = ip[0] - ip[3];
+
+    op[0] = ((a1 + b1 + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
+    op[1] = ((c1 + d1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
+    op[2] = ((a1 - b1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
+    op[3] = ((d1 - c1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
+
+    ip += 4;
+    op += 4;
+  }
+}
+
+void vp9_short_walsh4x4_x8_c(short *input, short *output, int pitch) {
+  int i;
+  int a1, b1, c1, d1;
+  short *ip = input;
+  short *op = output;
+  int pitch_short = pitch >> 1;
+
+  for (i = 0; i < 4; i++) {
+    a1 = ip[0 * pitch_short] + ip[3 * pitch_short];
+    b1 = ip[1 * pitch_short] + ip[2 * pitch_short];
+    c1 = ip[1 * pitch_short] - ip[2 * pitch_short];
+    d1 = ip[0 * pitch_short] - ip[3 * pitch_short];
+
+    op[0] = (a1 + b1 + 1) >> 1;
+    op[4] = (c1 + d1) >> 1;
+    op[8] = (a1 - b1) >> 1;
+    op[12] = (d1 - c1) >> 1;
+
+    ip++;
+    op++;
+  }
+  ip = output;
+  op = output;
+
+  for (i = 0; i < 4; i++) {
+    a1 = ip[0] + ip[3];
+    b1 = ip[1] + ip[2];
+    c1 = ip[1] - ip[2];
+    d1 = ip[0] - ip[3];
+
+    op[0] = ((a1 + b1 + 1) >> 1) << WHT_UPSCALE_FACTOR;
+    op[1] = ((c1 + d1) >> 1) << WHT_UPSCALE_FACTOR;
+    op[2] = ((a1 - b1) >> 1) << WHT_UPSCALE_FACTOR;
+    op[3] = ((d1 - c1) >> 1) << WHT_UPSCALE_FACTOR;
+
+    ip += 4;
+    op += 4;
+  }
+}
+
+void vp9_short_walsh8x4_x8_c(short *input, short *output, int pitch) {
+  vp9_short_walsh4x4_x8_c(input,   output,    pitch);
+  vp9_short_walsh4x4_x8_c(input + 4, output + 16, pitch);
+}
+#endif
+
+static const double C1 = 0.995184726672197;
+static const double C2 = 0.98078528040323;
+static const double C3 = 0.956940335732209;
+static const double C4 = 0.923879532511287;
+static const double C5 = 0.881921264348355;
+static const double C6 = 0.831469612302545;
+static const double C7 = 0.773010453362737;
+static const double C8 = 0.707106781186548;
+static const double C9 = 0.634393284163646;
+static const double C10 = 0.555570233019602;
+static const double C11 = 0.471396736825998;
+static const double C12 = 0.38268343236509;
+static const double C13 = 0.290284677254462;
+static const double C14 = 0.195090322016128;
+static const double C15 = 0.098017140329561;
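+
+// Ck here appears to be cos(k * pi / 32); e.g. C8 = cos(pi/4) ~= 0.7071 and
+// C15 = cos(15*pi/32) ~= 0.0980, consistent with a 16-point DCT butterfly.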
+
+static void dct16x16_1d(double input[16], double output[16]) {
+  vp9_clear_system_state(); // Make it simd safe : __asm emms;
+  {
+    double step[16];
+    double intermediate[16];
+    double temp1, temp2;
+
+    // step 1
+    step[ 0] = input[0] + input[15];
+    step[ 1] = input[1] + input[14];
+    step[ 2] = input[2] + input[13];
+    step[ 3] = input[3] + input[12];
+    step[ 4] = input[4] + input[11];
+    step[ 5] = input[5] + input[10];
+    step[ 6] = input[6] + input[ 9];
+    step[ 7] = input[7] + input[ 8];
+    step[ 8] = input[7] - input[ 8];
+    step[ 9] = input[6] - input[ 9];
+    step[10] = input[5] - input[10];
+    step[11] = input[4] - input[11];
+    step[12] = input[3] - input[12];
+    step[13] = input[2] - input[13];
+    step[14] = input[1] - input[14];
+    step[15] = input[0] - input[15];
+
+    // step 2
+    output[0] = step[0] + step[7];
+    output[1] = step[1] + step[6];
+    output[2] = step[2] + step[5];
+    output[3] = step[3] + step[4];
+    output[4] = step[3] - step[4];
+    output[5] = step[2] - step[5];
+    output[6] = step[1] - step[6];
+    output[7] = step[0] - step[7];
+
+    temp1 = step[ 8]*C7;
+    temp2 = step[15]*C9;
+    output[ 8] = temp1 + temp2;
+
+    temp1 = step[ 9]*C11;
+    temp2 = step[14]*C5;
+    output[ 9] = temp1 - temp2;
+
+    temp1 = step[10]*C3;
+    temp2 = step[13]*C13;
+    output[10] = temp1 + temp2;
+
+    temp1 = step[11]*C15;
+    temp2 = step[12]*C1;
+    output[11] = temp1 - temp2;
+
+    temp1 = step[11]*C1;
+    temp2 = step[12]*C15;
+    output[12] = temp2 + temp1;
+
+    temp1 = step[10]*C13;
+    temp2 = step[13]*C3;
+    output[13] = temp2 - temp1;
+
+    temp1 = step[ 9]*C5;
+    temp2 = step[14]*C11;
+    output[14] = temp2 + temp1;
+
+    temp1 = step[ 8]*C9;
+    temp2 = step[15]*C7;
+    output[15] = temp2 - temp1;
+
+    // step 3
+    step[ 0] = output[0] + output[3];
+    step[ 1] = output[1] + output[2];
+    step[ 2] = output[1] - output[2];
+    step[ 3] = output[0] - output[3];
+
+    temp1 = output[4]*C14;
+    temp2 = output[7]*C2;
+    step[ 4] = temp1 + temp2;
+
+    temp1 = output[5]*C10;
+    temp2 = output[6]*C6;
+    step[ 5] = temp1 + temp2;
+
+    temp1 = output[5]*C6;
+    temp2 = output[6]*C10;
+    step[ 6] = temp2 - temp1;
+
+    temp1 = output[4]*C2;
+    temp2 = output[7]*C14;
+    step[ 7] = temp2 - temp1;
+
+    step[ 8] = output[ 8] + output[11];
+    step[ 9] = output[ 9] + output[10];
+    step[10] = output[ 9] - output[10];
+    step[11] = output[ 8] - output[11];
+
+    step[12] = output[12] + output[15];
+    step[13] = output[13] + output[14];
+    step[14] = output[13] - output[14];
+    step[15] = output[12] - output[15];
+
+    // step 4
+    output[ 0] = (step[ 0] + step[ 1]);
+    output[ 8] = (step[ 0] - step[ 1]);
+
+    temp1 = step[2]*C12;
+    temp2 = step[3]*C4;
+    temp1 = temp1 + temp2;
+    output[ 4] = 2*(temp1*C8);
+
+    temp1 = step[2]*C4;
+    temp2 = step[3]*C12;
+    temp1 = temp2 - temp1;
+    output[12] = 2*(temp1*C8);
+
+    output[ 2] = 2*((step[4] + step[ 5])*C8);
+    output[14] = 2*((step[7] - step[ 6])*C8);
+
+    temp1 = step[4] - step[5];
+    temp2 = step[6] + step[7];
+    output[ 6] = (temp1 + temp2);
+    output[10] = (temp1 - temp2);
+
+    intermediate[8] = step[8] + step[14];
+    intermediate[9] = step[9] + step[15];
+
+    temp1 = intermediate[8]*C12;
+    temp2 = intermediate[9]*C4;
+    temp1 = temp1 - temp2;
+    output[3] = 2*(temp1*C8);
+
+    temp1 = intermediate[8]*C4;
+    temp2 = intermediate[9]*C12;
+    temp1 = temp2 + temp1;
+    output[13] = 2*(temp1*C8);
+
+    output[ 9] = 2*((step[10] + step[11])*C8);
+
+    intermediate[11] = step[10] - step[11];
+    intermediate[12] = step[12] + step[13];
+    intermediate[13] = step[12] - step[13];
+    intermediate[14] = step[ 8] - step[14];
+    intermediate[15] = step[ 9] - step[15];
+
+    output[15] = (intermediate[11] + intermediate[12]);
+    output[ 1] = -(intermediate[11] - intermediate[12]);
+
+    output[ 7] = 2*(intermediate[13]*C8);
+
+    temp1 = intermediate[14]*C12;
+    temp2 = intermediate[15]*C4;
+    temp1 = temp1 - temp2;
+    output[11] = -2*(temp1*C8);
+
+    temp1 = intermediate[14]*C4;
+    temp2 = intermediate[15]*C12;
+    temp1 = temp2 + temp1;
+    output[ 5] = 2*(temp1*C8);
+  }
+  vp9_clear_system_state(); // Make it simd safe : __asm emms;
+}
+
+void vp9_short_fdct16x16_c(short *input, short *out, int pitch) {
+  vp9_clear_system_state(); // Make it simd safe : __asm emms;
+  {
+    int shortpitch = pitch >> 1;
+    int i, j;
+    double output[256];
+    // First transform columns
+    for (i = 0; i < 16; i++) {
+      double temp_in[16], temp_out[16];
+      for (j = 0; j < 16; j++)
+        temp_in[j] = input[j*shortpitch + i];
+      dct16x16_1d(temp_in, temp_out);
+      for (j = 0; j < 16; j++)
+        output[j*16 + i] = temp_out[j];
+    }
+    // Then transform rows
+    for (i = 0; i < 16; ++i) {
+      double temp_in[16], temp_out[16];
+      for (j = 0; j < 16; ++j)
+        temp_in[j] = output[j + i*16];
+      dct16x16_1d(temp_in, temp_out);
+      for (j = 0; j < 16; ++j)
+        output[j + i*16] = temp_out[j];
+    }
+    // Scale by some magic number
+    for (i = 0; i < 256; i++)
+      out[i] = (short)round(output[i]/2);
+  }
+  vp9_clear_system_state(); // Make it simd safe : __asm emms;
+}
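+
+// Usage sketch (hypothetical values): the transform is separable, columns
+// first and then rows, with pitch again given in bytes:
+#if 0
+short residual[16 * 16] = {0};
+short coeffs[256];
+vp9_short_fdct16x16_c(residual, coeffs, 32);  /* 16 shorts/row * 2 bytes */
+#endif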
--- /dev/null
+++ b/vp9/encoder/encodeframe.c
@@ -1,0 +1,2342 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "encodemb.h"
+#include "encodemv.h"
+#include "vp9/common/common.h"
+#include "onyx_int.h"
+#include "vp9/common/extend.h"
+#include "vp9/common/entropymode.h"
+#include "vp9/common/quant_common.h"
+#include "segmentation.h"
+#include "vp9/common/setupintrarecon.h"
+#include "vp9/common/reconintra4x4.h"
+#include "encodeintra.h"
+#include "vp9/common/reconinter.h"
+#include "vp9/common/invtrans.h"
+#include "rdopt.h"
+#include "vp9/common/findnearmv.h"
+#include "vp9/common/reconintra.h"
+#include "vp9/common/seg_common.h"
+#include "vpx_rtcd.h"
+#include <stdio.h>
+#include <math.h>
+#include <limits.h>
+#include "vp9/common/subpixel.h"
+#include "vpx_ports/vpx_timer.h"
+#include "vp9/common/pred_common.h"
+
+#define DBG_PRNT_SEGMAP 0
+#if CONFIG_NEWBESTREFMV
+#include "vp9/common/mvref_common.h"
+#endif
+
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define RTCD(x)     &cpi->common.rtcd.x
+#define IF_RTCD(x)  (x)
+#else
+#define RTCD(x)     NULL
+#define IF_RTCD(x)  NULL
+#endif
+
+#ifdef ENC_DEBUG
+int enc_debug = 0;
+int mb_row_debug, mb_col_debug;
+#endif
+
+extern void vp9_initialize_me_consts(VP9_COMP *cpi, int QIndex);
+
+extern void vp9_auto_select_speed(VP9_COMP *cpi);
+
+int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
+                              int recon_yoffset, int recon_uvoffset,
+                              int *returnrate, int *returndistortion);
+
+extern void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
+                                           int recon_yoffset,
+                                           int recon_uvoffset, int *r, int *d);
+
+void vp9_build_block_offsets(MACROBLOCK *x);
+
+void vp9_setup_block_ptrs(MACROBLOCK *x);
+
+void vp9_encode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
+                                 int recon_yoffset, int recon_uvoffset,
+                                 int output_enabled);
+
+void vp9_encode_inter_superblock(VP9_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
+                                 int recon_yoffset, int recon_uvoffset,
+                                 int mb_col, int mb_row);
+
+void vp9_encode_intra_macro_block(VP9_COMP *cpi, MACROBLOCK *x,
+                                  TOKENEXTRA **t, int output_enabled);
+
+void vp9_encode_intra_super_block(VP9_COMP *cpi, MACROBLOCK *x,
+                                  TOKENEXTRA **t, int mb_col);
+
+static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x);
+
+#ifdef MODE_STATS
+unsigned int inter_y_modes[MB_MODE_COUNT];
+unsigned int inter_uv_modes[VP9_UV_MODES];
+unsigned int inter_b_modes[B_MODE_COUNT];
+unsigned int y_modes[VP9_YMODES];
+unsigned int i8x8_modes[VP9_I8X8_MODES];
+unsigned int uv_modes[VP9_UV_MODES];
+unsigned int uv_modes_y[VP9_YMODES][VP9_UV_MODES];
+unsigned int b_modes[B_MODE_COUNT];
+#endif
+
+
+/* activity_avg must be positive, or flat regions could get a zero weight
+ *  (infinite lambda), which confounds analysis.
+ * This also avoids the need for divide by zero checks in
+ *  vp9_activity_masking().
+ */
+#define VP9_ACTIVITY_AVG_MIN (64)
+
+/* This is used as a reference when computing the source variance for the
+ *  purposes of activity masking.
+ * Eventually this should be replaced by custom no-reference routines,
+ *  which will be faster.
+ */
+static const unsigned char VP9_VAR_OFFS[16] = {
+  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128
+};
+
+
+// Original activity measure from Tim T's code.
+static unsigned int tt_activity_measure(VP9_COMP *cpi, MACROBLOCK *x) {
+  unsigned int act;
+  unsigned int sse;
+  /* TODO: This could also be done over smaller areas (8x8), but that would
+   *  require extensive changes elsewhere, as lambda is assumed to be fixed
+   *  over an entire MB in most of the code.
+   * Another option is to compute four 8x8 variances, and pick a single
+   *  lambda using a non-linear combination (e.g., the smallest, or second
+   *  smallest, etc.).
+   */
+  act = vp9_variance16x16(x->src.y_buffer, x->src.y_stride, VP9_VAR_OFFS, 0,
+                          &sse);
+  act = act << 4;
+
+  /* If the region is flat, lower the activity some more. */
+  if (act < 8 << 12)
+    act = act < 5 << 12 ? act : 5 << 12;
+
+  return act;
+}
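+
+// In the flatness clamp above, 8 << 12 and 5 << 12 are thresholds on the
+// variance scaled by 16: blocks measuring below 8 << 12 are treated as flat
+// and capped at 5 << 12, so very flat regions get a lower activity score.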
+
+// Stub for alternative experimental activity measures.
+static unsigned int alt_activity_measure(VP9_COMP *cpi,
+                                         MACROBLOCK *x, int use_dc_pred) {
+  return vp9_encode_intra(cpi, x, use_dc_pred);
+}
+
+
+// Measure the activity of the current macroblock.
+// Exactly what is measured is TBD, so it is abstracted into this function.
+#define ALT_ACT_MEASURE 1
+static unsigned int mb_activity_measure(VP9_COMP *cpi, MACROBLOCK *x,
+                                        int mb_row, int mb_col) {
+  unsigned int mb_activity;
+
+  if (ALT_ACT_MEASURE) {
+    int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
+
+    // Or use an alternative measure.
+    mb_activity = alt_activity_measure(cpi, x, use_dc_pred);
+  } else {
+    // Original activity measure from Tim T's code.
+    mb_activity = tt_activity_measure(cpi, x);
+  }
+
+  if (mb_activity < VP9_ACTIVITY_AVG_MIN)
+    mb_activity = VP9_ACTIVITY_AVG_MIN;
+
+  return mb_activity;
+}
+
+// Calculate an "average" mb activity value for the frame
+#define ACT_MEDIAN 0
+static void calc_av_activity(VP9_COMP *cpi, int64_t activity_sum) {
+#if ACT_MEDIAN
+  // Find median: Simple n^2 algorithm for experimentation
+  {
+    unsigned int median;
+    unsigned int i, j;
+    unsigned int *sortlist;
+    unsigned int tmp;
+
+    // Create a list to sort into
+    CHECK_MEM_ERROR(sortlist,
+                    vpx_calloc(sizeof(unsigned int), cpi->common.MBs));
+
+    // Copy map to sort list
+    vpx_memcpy(sortlist, cpi->mb_activity_map,
+               sizeof(unsigned int) * cpi->common.MBs);
+
+
+    // Ripple each value down to its correct position
+    for (i = 1; i < cpi->common.MBs; i++) {
+      for (j = i; j > 0; j--) {
+        if (sortlist[j] < sortlist[j - 1]) {
+          // Swap values
+          tmp = sortlist[j - 1];
+          sortlist[j - 1] = sortlist[j];
+          sortlist[j] = tmp;
+        } else
+          break;
+      }
+    }
+
+    // Even number of MBs, so estimate the median as the mean of the two
+    // values either side of the midpoint.
+    median = (1 + sortlist[cpi->common.MBs >> 1] +
+              sortlist[(cpi->common.MBs >> 1) + 1]) >> 1;
+
+    cpi->activity_avg = median;
+
+    vpx_free(sortlist);
+  }
+#else
+  // Simple mean for now
+  cpi->activity_avg = (unsigned int)(activity_sum / cpi->common.MBs);
+#endif
+
+  if (cpi->activity_avg < VP9_ACTIVITY_AVG_MIN)
+    cpi->activity_avg = VP9_ACTIVITY_AVG_MIN;
+
+  // Experimental code: return fixed value normalized for several clips
+  if (ALT_ACT_MEASURE)
+    cpi->activity_avg = 100000;
+}
+
+#define USE_ACT_INDEX   0
+#define OUTPUT_NORM_ACT_STATS   0
+
+#if USE_ACT_INDEX
+// Calculate an activity index for each mb
+static void calc_activity_index(VP9_COMP *cpi, MACROBLOCK *x) {
+  VP9_COMMON *const cm = &cpi->common;
+  int mb_row, mb_col;
+
+  int64_t act;
+  int64_t a;
+  int64_t b;
+
+#if OUTPUT_NORM_ACT_STATS
+  FILE *f = fopen("norm_act.stt", "a");
+  fprintf(f, "\n%12d\n", cpi->activity_avg);
+#endif
+
+  // Reset pointers to start of activity map
+  x->mb_activity_ptr = cpi->mb_activity_map;
+
+  // Calculate normalized mb activity number.
+  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
+    // for each macroblock col in image
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+      // Read activity from the map
+      act = *(x->mb_activity_ptr);
+
+      // Calculate a normalized activity number
+      a = act + 4 * cpi->activity_avg;
+      b = 4 * act + cpi->activity_avg;
+
+      if (b >= a)
+        *(x->activity_ptr) = (int)((b + (a >> 1)) / a) - 1;
+      else
+        *(x->activity_ptr) = 1 - (int)((a + (b >> 1)) / b);
+
+#if OUTPUT_NORM_ACT_STATS
+      fprintf(f, " %6d", *(x->mb_activity_ptr));
+#endif
+      // Increment activity map pointers
+      x->mb_activity_ptr++;
+    }
+
+#if OUTPUT_NORM_ACT_STATS
+    fprintf(f, "\n");
+#endif
+
+  }
+
+#if OUTPUT_NORM_ACT_STATS
+  fclose(f);
+#endif
+
+}
+#endif
+
+// Loop through all MBs. Note the activity of each, compute the frame
+// average, and calculate a normalized activity for each MB.
+static void build_activity_map(VP9_COMP *cpi) {
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *xd = &x->e_mbd;
+  VP9_COMMON *const cm = &cpi->common;
+
+#if ALT_ACT_MEASURE
+  YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
+  int recon_yoffset;
+  int recon_y_stride = new_yv12->y_stride;
+#endif
+
+  int mb_row, mb_col;
+  unsigned int mb_activity;
+  int64_t activity_sum = 0;
+
+  // for each macroblock row in image
+  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
+#if ALT_ACT_MEASURE
+    // reset above block coeffs
+    xd->up_available = (mb_row != 0);
+    recon_yoffset = (mb_row * recon_y_stride * 16);
+#endif
+    // for each macroblock col in image
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+#if ALT_ACT_MEASURE
+      xd->dst.y_buffer = new_yv12->y_buffer + recon_yoffset;
+      xd->left_available = (mb_col != 0);
+      recon_yoffset += 16;
+#endif
+      // Copy current mb to a buffer
+      vp9_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
+
+      // measure activity
+      mb_activity = mb_activity_measure(cpi, x, mb_row, mb_col);
+
+      // Keep frame sum
+      activity_sum += mb_activity;
+
+      // Store MB level activity details.
+      *x->mb_activity_ptr = mb_activity;
+
+      // Increment activity map pointer
+      x->mb_activity_ptr++;
+
+      // adjust to the next column of source macroblocks
+      x->src.y_buffer += 16;
+    }
+
+
+    // adjust to the next row of mbs
+    x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
+
+#if ALT_ACT_MEASURE
+    // extend the recon for intra prediction
+    vp9_extend_mb_row(new_yv12, xd->dst.y_buffer + 16,
+                      xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
+#endif
+
+  }
+
+  // Calculate an "average" MB activity
+  calc_av_activity(cpi, activity_sum);
+
+#if USE_ACT_INDEX
+  // Calculate an activity index number of each mb
+  calc_activity_index(cpi, x);
+#endif
+
+}
+
+// Macroblock activity masking
+void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x) {
+#if USE_ACT_INDEX
+  x->rdmult += *(x->mb_activity_ptr) * (x->rdmult >> 2);
+  x->errorperbit = x->rdmult * 100 / (110 * x->rddiv);
+  x->errorperbit += (x->errorperbit == 0);
+#else
+  int64_t a;
+  int64_t b;
+  int64_t act = *(x->mb_activity_ptr);
+
+  // Apply the masking to the RD multiplier.
+  a = act + (2 * cpi->activity_avg);
+  b = (2 * act) + cpi->activity_avg;
+
+  x->rdmult = (unsigned int)(((int64_t)x->rdmult * b + (a >> 1)) / a);
+  x->errorperbit = x->rdmult * 100 / (110 * x->rddiv);
+  x->errorperbit += (x->errorperbit == 0);
+#endif
+
+  // Activity based Zbin adjustment
+  adjust_act_zbin(cpi, x);
+}
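+
+// The non-index path above rescales the RD multiplier by b/a, where
+// a = act + 2*avg and b = 2*act + avg; this leaves rdmult unchanged when
+// act == avg, halves it as act -> 0, and at most doubles it as act grows,
+// so unusually active MBs tolerate more distortion per bit.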
+
+static void update_state(VP9_COMP *cpi, MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
+  int i;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MODE_INFO *mi = &ctx->mic;
+  MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
+  int mb_mode = mi->mbmi.mode;
+  int mb_mode_index = ctx->best_mode_index;
+
+#if CONFIG_DEBUG
+  assert(mb_mode < MB_MODE_COUNT);
+  assert(mb_mode_index < MAX_MODES);
+  assert(mi->mbmi.ref_frame < MAX_REF_FRAMES);
+#endif
+
+  // Restore the coding context of the MB to that which was in place
+  // when the mode was picked for it
+  vpx_memcpy(xd->mode_info_context, mi, sizeof(MODE_INFO));
+#if CONFIG_SUPERBLOCKS
+  if (mi->mbmi.encoded_as_sb) {
+    const int mis = cpi->common.mode_info_stride;
+    if (xd->mb_to_right_edge > 0)
+      vpx_memcpy(xd->mode_info_context + 1, mi, sizeof(MODE_INFO));
+    if (xd->mb_to_bottom_edge > 0) {
+      vpx_memcpy(xd->mode_info_context + mis, mi, sizeof(MODE_INFO));
+      if (xd->mb_to_right_edge > 0)
+        vpx_memcpy(xd->mode_info_context + mis + 1, mi, sizeof(MODE_INFO));
+    }
+  }
+#endif
+
+  if (mb_mode == B_PRED) {
+    for (i = 0; i < 16; i++) {
+      xd->block[i].bmi.as_mode = xd->mode_info_context->bmi[i].as_mode;
+      assert(xd->block[i].bmi.as_mode.first < MB_MODE_COUNT);
+    }
+  } else if (mb_mode == I8X8_PRED) {
+    for (i = 0; i < 16; i++) {
+      xd->block[i].bmi = xd->mode_info_context->bmi[i];
+    }
+  } else if (mb_mode == SPLITMV) {
+    vpx_memcpy(x->partition_info, &ctx->partition_info,
+               sizeof(PARTITION_INFO));
+
+    mbmi->mv[0].as_int = x->partition_info->bmi[15].mv.as_int;
+    mbmi->mv[1].as_int = x->partition_info->bmi[15].second_mv.as_int;
+  }
+
+  {
+    int segment_id = mbmi->segment_id;
+    if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
+        vp9_get_segdata(xd, segment_id, SEG_LVL_EOB)) {
+      for (i = 0; i < NB_TXFM_MODES; i++) {
+        cpi->rd_tx_select_diff[i] += ctx->txfm_rd_diff[i];
+      }
+    }
+  }
+
+  if (cpi->common.frame_type == KEY_FRAME) {
+    // Restore the coding modes to that held in the coding context
+    // if (mb_mode == B_PRED)
+    //    for (i = 0; i < 16; i++)
+    //    {
+    //        xd->block[i].bmi.as_mode =
+    //                          xd->mode_info_context->bmi[i].as_mode;
+    //        assert(xd->mode_info_context->bmi[i].as_mode < MB_MODE_COUNT);
+    //    }
+#if CONFIG_INTERNAL_STATS
+    static const int kf_mode_index[] = {
+      THR_DC /*DC_PRED*/,
+      THR_V_PRED /*V_PRED*/,
+      THR_H_PRED /*H_PRED*/,
+      THR_D45_PRED /*D45_PRED*/,
+      THR_D135_PRED /*D135_PRED*/,
+      THR_D117_PRED /*D117_PRED*/,
+      THR_D153_PRED /*D153_PRED*/,
+      THR_D27_PRED /*D27_PRED*/,
+      THR_D63_PRED /*D63_PRED*/,
+      THR_TM /*TM_PRED*/,
+      THR_I8X8_PRED /*I8X8_PRED*/,
+      THR_B_PRED /*B_PRED*/,
+    };
+    cpi->mode_chosen_counts[kf_mode_index[mb_mode]]++;
+#endif
+  } else {
+    /*
+            // Reduce the activation RD thresholds for the best choice mode
+            if ((cpi->rd_baseline_thresh[mb_mode_index] > 0) &&
+                (cpi->rd_baseline_thresh[mb_mode_index] < (INT_MAX >> 2)))
+            {
+                int best_adjustment = (cpi->rd_thresh_mult[mb_mode_index] >> 2);
+
+                cpi->rd_thresh_mult[mb_mode_index] =
+                        (cpi->rd_thresh_mult[mb_mode_index]
+                         >= (MIN_THRESHMULT + best_adjustment)) ?
+                                cpi->rd_thresh_mult[mb_mode_index] - best_adjustment :
+                                MIN_THRESHMULT;
+                cpi->rd_threshes[mb_mode_index] =
+                        (cpi->rd_baseline_thresh[mb_mode_index] >> 7)
+                        * cpi->rd_thresh_mult[mb_mode_index];
+
+            }
+    */
+    // Note how often each mode chosen as best
+    cpi->mode_chosen_counts[mb_mode_index]++;
+
+    cpi->prediction_error += ctx->distortion;
+    cpi->intra_error += ctx->intra_error;
+
+    cpi->rd_comp_pred_diff[0] += ctx->single_pred_diff;
+    cpi->rd_comp_pred_diff[1] += ctx->comp_pred_diff;
+    cpi->rd_comp_pred_diff[2] += ctx->hybrid_pred_diff;
+  }
+}
+
+static void pick_mb_modes(VP9_COMP *cpi,
+                          VP9_COMMON *cm,
+                          int mb_row,
+                          int mb_col,
+                          MACROBLOCK  *x,
+                          MACROBLOCKD *xd,
+                          TOKENEXTRA **tp,
+                          int *totalrate,
+                          int *totaldist) {
+  int i;
+  int map_index;
+  int recon_yoffset, recon_uvoffset;
+  int ref_fb_idx = cm->lst_fb_idx;
+  int dst_fb_idx = cm->new_fb_idx;
+  int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
+  int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
+  ENTROPY_CONTEXT_PLANES left_context[2];
+  ENTROPY_CONTEXT_PLANES above_context[2];
+  ENTROPY_CONTEXT_PLANES *initial_above_context_ptr = cm->above_context
+                                                      + mb_col;
+
+  // Offsets to move pointers from MB to MB within a SB in raster order
+  int row_delta[4] = { 0, +1,  0, -1};
+  int col_delta[4] = { +1, -1, +1, +1};
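+  // Applied after each MB is coded, these deltas walk the 2x2 SB in raster
+  // order: (+1 col), (+1 row, -1 col), (+1 col), then (-1 row, +1 col),
+  // leaving the pointers at the top-left MB of the next SB.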
+
+  /* Function should not modify L & A contexts; save and restore on exit */
+  vpx_memcpy(left_context,
+             cm->left_context,
+             sizeof(left_context));
+  vpx_memcpy(above_context,
+             initial_above_context_ptr,
+             sizeof(above_context));
+
+  /* Encode MBs in raster order within the SB */
+  for (i = 0; i < 4; i++) {
+    int dy = row_delta[i];
+    int dx = col_delta[i];
+    int offset_unextended = dy * cm->mb_cols + dx;
+    int offset_extended   = dy * xd->mode_info_stride + dx;
+    MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
+
+    // TODO Many of the index items here can be computed more efficiently!
+
+    if ((mb_row >= cm->mb_rows) || (mb_col >= cm->mb_cols)) {
+      // MB lies outside frame, move on
+      mb_row += dy;
+      mb_col += dx;
+
+      // Update pointers
+      x->src.y_buffer += 16 * (dx + dy * x->src.y_stride);
+      x->src.u_buffer += 8  * (dx + dy * x->src.uv_stride);
+      x->src.v_buffer += 8  * (dx + dy * x->src.uv_stride);
+
+      x->gf_active_ptr += offset_unextended;
+      x->partition_info += offset_extended;
+      xd->mode_info_context += offset_extended;
+      xd->prev_mode_info_context += offset_extended;
+#if CONFIG_DEBUG
+      assert((xd->prev_mode_info_context - cpi->common.prev_mip) ==
+             (xd->mode_info_context - cpi->common.mip));
+#endif
+      continue;
+    }
+
+    // Index of the MB in the SB 0..3
+    xd->mb_index = i;
+
+    map_index = (mb_row * cpi->common.mb_cols) + mb_col;
+    x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
+
+    // set above context pointer
+    xd->above_context = cm->above_context + mb_col;
+
+    // Restore the appropriate left context depending on which row of
+    // the SB the MB is situated in
+    xd->left_context = cm->left_context + (i >> 1);
+
+    // Set up distance of MB to edge of frame in 1/8th pel units
+    xd->mb_to_top_edge    = -((mb_row * 16) << 3);
+    xd->mb_to_left_edge   = -((mb_col * 16) << 3);
+    xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
+    xd->mb_to_right_edge  = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
+
+    // Set up limit values for MV components to prevent them from
+    // extending beyond the UMV borders assuming 16x16 block size
+    x->mv_row_min = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+    x->mv_col_min = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+    x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
+                     (VP8BORDERINPIXELS - 16 - INTERP_EXTEND));
+    x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
+                     (VP8BORDERINPIXELS - 16 - INTERP_EXTEND));
+
+    xd->up_available   = (mb_row != 0);
+    xd->left_available = (mb_col != 0);
+
+    recon_yoffset  = (mb_row * recon_y_stride * 16) + (mb_col * 16);
+    recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col *  8);
+
+    xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
+    xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
+    xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
+
+    // Copy current MB to a work buffer
+    vp9_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
+
+    x->rddiv = cpi->RDDIV;
+    x->rdmult = cpi->RDMULT;
+
+    if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
+      vp9_activity_masking(cpi, x);
+
+    // Is segmentation enabled
+    if (xd->segmentation_enabled) {
+      // Code to set segment id in xd->mbmi.segment_id
+      if (xd->update_mb_segmentation_map)
+        mbmi->segment_id = cpi->segmentation_map[map_index];
+      else
+        mbmi->segment_id = cm->last_frame_seg_map[map_index];
+      if (mbmi->segment_id > 3)
+        mbmi->segment_id = 0;
+
+      vp9_mb_init_quantizer(cpi, x);
+    } else
+      // Set to Segment 0 by default
+      mbmi->segment_id = 0;
+
+    x->active_ptr = cpi->active_map + map_index;
+
+#if CONFIG_SUPERBLOCKS
+    xd->mode_info_context->mbmi.encoded_as_sb = 0;
+#endif
+
+    cpi->update_context = 0;    // TODO Do we need this now??
+
+    vp9_intra_prediction_down_copy(xd);
+
+    // Find best coding mode & reconstruct the MB so it is available
+    // as a predictor for MBs that follow in the SB
+    if (cm->frame_type == KEY_FRAME) {
+      int r, d;
+      vp9_rd_pick_intra_mode(cpi, x, &r, &d);
+      *totalrate += r;
+      *totaldist += d;
+
+      // Dummy encode, do not do the tokenization
+      vp9_encode_intra_macro_block(cpi, x, tp, 0);
+      // Note the encoder may have changed the segment_id
+
+      // Save the coding context
+      vpx_memcpy(&x->mb_context[i].mic, xd->mode_info_context,
+                 sizeof(MODE_INFO));
+    } else {
+      int seg_id, r, d;
+
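+      // seg0_progress is a 16.16 fixed-point fraction of how far through
+      // the frame's segment-0 MBs the encoder is. When segment 1 is pinned
+      // to exactly one reference frame, track it per segment-0 MB coded;
+      // otherwise estimate it from the raster position.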
+      if (xd->segmentation_enabled && cpi->seg0_cnt > 0 &&
+          !vp9_segfeature_active(xd, 0, SEG_LVL_REF_FRAME) &&
+          vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME) &&
+          vp9_check_segref(xd, 1, INTRA_FRAME)  +
+          vp9_check_segref(xd, 1, LAST_FRAME)   +
+          vp9_check_segref(xd, 1, GOLDEN_FRAME) +
+          vp9_check_segref(xd, 1, ALTREF_FRAME) == 1) {
+        cpi->seg0_progress = (cpi->seg0_idx << 16) / cpi->seg0_cnt;
+      } else {
+        cpi->seg0_progress =
+            (((mb_col & ~1) * 2 + (mb_row & ~1) * cm->mb_cols + i) << 16) /
+            cm->MBs;
+      }
+
+      vp9_pick_mode_inter_macroblock(cpi, x, recon_yoffset,
+                                     recon_uvoffset, &r, &d);
+      *totalrate += r;
+      *totaldist += d;
+
+      // Dummy encode, do not do the tokenization
+      vp9_encode_inter_macroblock(cpi, x, tp,
+                                  recon_yoffset, recon_uvoffset, 0);
+
+      seg_id = mbmi->segment_id;
+      if (cpi->mb.e_mbd.segmentation_enabled && seg_id == 0) {
+        cpi->seg0_idx++;
+      }
+      if (!xd->segmentation_enabled ||
+          !vp9_segfeature_active(xd, seg_id, SEG_LVL_REF_FRAME) ||
+          vp9_check_segref(xd, seg_id, INTRA_FRAME)  +
+          vp9_check_segref(xd, seg_id, LAST_FRAME)   +
+          vp9_check_segref(xd, seg_id, GOLDEN_FRAME) +
+          vp9_check_segref(xd, seg_id, ALTREF_FRAME) > 1) {
+        // Get the prediction context and status
+        int pred_flag = vp9_get_pred_flag(xd, PRED_REF);
+        int pred_context = vp9_get_pred_context(cm, xd, PRED_REF);
+
+        // Count prediction success
+        cpi->ref_pred_count[pred_context][pred_flag]++;
+      }
+    }
+
+    // Next MB
+    mb_row += dy;
+    mb_col += dx;
+
+    x->src.y_buffer += 16 * (dx + dy * x->src.y_stride);
+    x->src.u_buffer += 8  * (dx + dy * x->src.uv_stride);
+    x->src.v_buffer += 8  * (dx + dy * x->src.uv_stride);
+
+    x->gf_active_ptr += offset_unextended;
+    x->partition_info += offset_extended;
+    xd->mode_info_context += offset_extended;
+    xd->prev_mode_info_context += offset_extended;
+
+#if CONFIG_DEBUG
+    assert((xd->prev_mode_info_context - cpi->common.prev_mip) ==
+           (xd->mode_info_context - cpi->common.mip));
+#endif
+  }
+
+  /* Restore L & A coding context to those in place on entry */
+  vpx_memcpy(cm->left_context,
+             left_context,
+             sizeof(left_context));
+  vpx_memcpy(initial_above_context_ptr,
+             above_context,
+             sizeof(above_context));
+}
+
+#if CONFIG_SUPERBLOCKS
+static void pick_sb_modes(VP9_COMP *cpi,
+                          VP9_COMMON *cm,
+                          int mb_row,
+                          int mb_col,
+                          MACROBLOCK  *x,
+                          MACROBLOCKD *xd,
+                          TOKENEXTRA **tp,
+                          int *totalrate,
+                          int *totaldist) {
+  int map_index;
+  int recon_yoffset, recon_uvoffset;
+  int ref_fb_idx = cm->lst_fb_idx;
+  int dst_fb_idx = cm->new_fb_idx;
+  int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
+  int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
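+  /* A 32x32 SB spans two MB rows and two MB columns, so two planes of
+   * left and above entropy context are saved */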
+  ENTROPY_CONTEXT_PLANES left_context[2];
+  ENTROPY_CONTEXT_PLANES above_context[2];
+  ENTROPY_CONTEXT_PLANES *initial_above_context_ptr = cm->above_context
+    + mb_col;
+
+  /* Function should not modify L & A contexts; save and restore on exit */
+  vpx_memcpy(left_context,
+             cm->left_context,
+             sizeof(left_context));
+  vpx_memcpy(above_context,
+             initial_above_context_ptr,
+             sizeof(above_context));
+
+  map_index = (mb_row * cpi->common.mb_cols) + mb_col;
+  x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
+
+  /* set above context pointer */
+  xd->above_context = cm->above_context + mb_col;
+
+  /* The SB spans both rows of the left context, so point at the start
+   * of the row's context */
+  xd->left_context = cm->left_context;
+
+  // Set up distance of MB to edge of frame in 1/8th pel units
+  xd->mb_to_top_edge    = -((mb_row * 16) << 3);
+  xd->mb_to_left_edge   = -((mb_col * 16) << 3);
+  xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
+  xd->mb_to_right_edge  = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
+
+  /* Set up limit values for MV components to prevent them from
+   * extending beyond the UMV borders, assuming a 32x32 block size */
+  x->mv_row_min = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+  x->mv_col_min = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+  x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
+                   (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
+  x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
+                   (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
+
+  xd->up_available   = (mb_row != 0);
+  xd->left_available = (mb_col != 0);
+
+  recon_yoffset  = (mb_row * recon_y_stride * 16) + (mb_col * 16);
+  recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col *  8);
+
+  xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
+  xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
+  xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
+#if 0 // FIXME
+  /* Copy current MB to a work buffer */
+  vp9_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
+#endif
+  x->rddiv = cpi->RDDIV;
+  x->rdmult = cpi->RDMULT;
+  if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
+    vp9_activity_masking(cpi, x);
+  /* Is segmentation enabled */
+  if (xd->segmentation_enabled) {
+    /* The SB is given segment id 1 only if all four of its MBs have a
+     * nonzero segment id; otherwise it falls back to segment 0 */
+    if (xd->update_mb_segmentation_map)
+      xd->mode_info_context->mbmi.segment_id =
+            cpi->segmentation_map[map_index] &&
+            cpi->segmentation_map[map_index + 1] &&
+            cpi->segmentation_map[map_index + cm->mb_cols] &&
+            cpi->segmentation_map[map_index + cm->mb_cols + 1];
+    else
+      xd->mode_info_context->mbmi.segment_id =
+            cm->last_frame_seg_map[map_index] &&
+            cm->last_frame_seg_map[map_index + 1] &&
+            cm->last_frame_seg_map[map_index + cm->mb_cols] &&
+            cm->last_frame_seg_map[map_index + cm->mb_cols + 1];
+    if (xd->mode_info_context->mbmi.segment_id > 3)
+      xd->mode_info_context->mbmi.segment_id = 0;
+
+    vp9_mb_init_quantizer(cpi, x);
+  } else
+    /* Set to Segment 0 by default */
+    xd->mode_info_context->mbmi.segment_id = 0;
+
+  x->active_ptr = cpi->active_map + map_index;
+
+  cpi->update_context = 0;    // TODO: do we need this now?
+
+  /* Find best coding mode & reconstruct the MB so it is available
+   * as a predictor for MBs that follow in the SB */
+  if (cm->frame_type == KEY_FRAME) {
+    vp9_rd_pick_intra_mode_sb(cpi, x,
+                              totalrate,
+                              totaldist);
+
+    /* Save the coding context */
+    vpx_memcpy(&x->sb_context[0].mic, xd->mode_info_context,
+               sizeof(MODE_INFO));
+  } else {
+    if (xd->segmentation_enabled && cpi->seg0_cnt > 0 &&
+        !vp9_segfeature_active(xd, 0, SEG_LVL_REF_FRAME) &&
+        vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME) &&
+        vp9_check_segref(xd, 1, INTRA_FRAME)  +
+        vp9_check_segref(xd, 1, LAST_FRAME)   +
+        vp9_check_segref(xd, 1, GOLDEN_FRAME) +
+        vp9_check_segref(xd, 1, ALTREF_FRAME) == 1) {
+      cpi->seg0_progress = (cpi->seg0_idx << 16) / cpi->seg0_cnt;
+    } else {
+      cpi->seg0_progress =
+        (((mb_col & ~1) * 2 + (mb_row & ~1) * cm->mb_cols) << 16) / cm->MBs;
+    }
+
+    vp9_rd_pick_inter_mode_sb(cpi, x,
+                              recon_yoffset,
+                              recon_uvoffset,
+                              totalrate,
+                              totaldist);
+  }
+
+  /* Restore L & A coding context to those in place on entry */
+  vpx_memcpy(cm->left_context,
+             left_context,
+             sizeof(left_context));
+  vpx_memcpy(initial_above_context_ptr,
+             above_context,
+             sizeof(above_context));
+}
+#endif
+
+static void encode_sb(VP9_COMP *cpi,
+                      VP9_COMMON *cm,
+                      int mbrow,
+                      int mbcol,
+                      MACROBLOCK  *x,
+                      MACROBLOCKD *xd,
+                      TOKENEXTRA **tp) {
+  int i;
+  int map_index;
+  int mb_row, mb_col;
+  int recon_yoffset, recon_uvoffset;
+  int ref_fb_idx = cm->lst_fb_idx;
+  int dst_fb_idx = cm->new_fb_idx;
+  int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
+  int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
+  int row_delta[4] = { 0, +1,  0, -1};
+  int col_delta[4] = { +1, -1, +1, +1};
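+  // Applied after each MB, these deltas walk the SB's four MBs in raster
+  // order: right, then down and back left, then right; the final step
+  // (up and right) leaves the pointers at the next SB.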
+
+  mb_row = mbrow;
+  mb_col = mbcol;
+
+  /* Encode MBs in raster order within the SB */
+  for (i = 0; i < 4; i++) {
+    int dy = row_delta[i];
+    int dx = col_delta[i];
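+    // The mode-info arrays carry a border entry per row (the stride is
+    // mb_cols + 1), while gf_active_flags does not; hence two step sizes.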
+    int offset_extended   = dy * xd->mode_info_stride + dx;
+    int offset_unextended = dy * cm->mb_cols + dx;
+    MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
+
+    if ((mb_row >= cm->mb_rows) || (mb_col >= cm->mb_cols)) {
+      // MB lies outside frame, move on
+      mb_row += dy;
+      mb_col += dx;
+
+      x->src.y_buffer += 16 * (dx + dy * x->src.y_stride);
+      x->src.u_buffer += 8  * (dx + dy * x->src.uv_stride);
+      x->src.v_buffer += 8  * (dx + dy * x->src.uv_stride);
+
+      x->gf_active_ptr      += offset_unextended;
+      x->partition_info     += offset_extended;
+      xd->mode_info_context += offset_extended;
+      xd->prev_mode_info_context += offset_extended;
+
+#if CONFIG_DEBUG
+      assert((xd->prev_mode_info_context - cpi->common.prev_mip) ==
+             (xd->mode_info_context - cpi->common.mip));
+#endif
+      continue;
+    }
+
+    xd->mb_index = i;
+
+#ifdef ENC_DEBUG
+    enc_debug = (cpi->common.current_video_frame == 0 &&
+                 mb_row == 0 && mb_col == 0);
+    mb_col_debug = mb_col;
+    mb_row_debug = mb_row;
+#endif
+
+    // Restore MB state to that when it was picked
+#if CONFIG_SUPERBLOCKS
+    if (xd->mode_info_context->mbmi.encoded_as_sb) {
+      update_state(cpi, x, &x->sb_context[i]);
+      cpi->sb_count++;
+    } else
+#endif
+      update_state(cpi, x, &x->mb_context[i]);
+
+    map_index = (mb_row * cpi->common.mb_cols) + mb_col;
+    x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
+
+    // reset above block coeffs
+    xd->above_context = cm->above_context + mb_col;
+    xd->left_context  = cm->left_context + (i >> 1);
+
+    // Set up distance of MB to edge of the frame in 1/8th pel units
+    xd->mb_to_top_edge    = -((mb_row * 16) << 3);
+    xd->mb_to_left_edge   = -((mb_col * 16) << 3);
+    xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
+    xd->mb_to_right_edge  = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
+
+#if CONFIG_SUPERBLOCKS
+    if (xd->mode_info_context->mbmi.encoded_as_sb) {
+      // Set up limit values for MV components to prevent them from
+      // extending beyond the UMV borders assuming 32x32 block size
+      x->mv_row_min = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+      x->mv_col_min = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+      x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
+                       (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
+      x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
+                       (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
+    } else {
+#endif
+      // Set up limit values for MV components to prevent them from
+      // extending beyond the UMV borders assuming 16x16 block size
+      x->mv_row_min = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+      x->mv_col_min = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+      x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
+                       (VP8BORDERINPIXELS - 16 - INTERP_EXTEND));
+      x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
+                       (VP8BORDERINPIXELS - 16 - INTERP_EXTEND));
+#if CONFIG_SUPERBLOCKS
+    }
+#endif
+
+    xd->up_available = (mb_row != 0);
+    xd->left_available = (mb_col != 0);
+
+    recon_yoffset = (mb_row * recon_y_stride * 16) + (mb_col * 16);
+    recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col * 8);
+
+    xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
+    xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
+    xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
+
+    // Copy current MB to a work buffer
+    vp9_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
+
+    if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
+      vp9_activity_masking(cpi, x);
+
+    // Is segmentation enabled
+    if (xd->segmentation_enabled) {
+      vp9_mb_init_quantizer(cpi, x);
+    }
+
+    x->active_ptr = cpi->active_map + map_index;
+
+    cpi->update_context = 0;
+
+#if CONFIG_SUPERBLOCKS
+    if (!xd->mode_info_context->mbmi.encoded_as_sb)
+#endif
+      vp9_intra_prediction_down_copy(xd);
+
+    if (cm->frame_type == KEY_FRAME) {
+#if CONFIG_SUPERBLOCKS
+      if (xd->mode_info_context->mbmi.encoded_as_sb)
+        vp9_encode_intra_super_block(cpi, x, tp, mb_col);
+      else
+#endif
+        vp9_encode_intra_macro_block(cpi, x, tp, 1);
+        // Note the encoder may have changed the segment_id
+
+#ifdef MODE_STATS
+      y_modes[mbmi->mode]++;
+#endif
+    } else {
+      unsigned char *segment_id;
+      int seg_ref_active;
+
+      if (xd->mode_info_context->mbmi.ref_frame) {
+        unsigned char pred_context;
+
+        pred_context = vp9_get_pred_context(cm, xd, PRED_COMP);
+
+        if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME)
+          cpi->single_pred_count[pred_context]++;
+        else
+          cpi->comp_pred_count[pred_context]++;
+      }
+
+#if CONFIG_SUPERBLOCKS
+      if (xd->mode_info_context->mbmi.encoded_as_sb)
+        vp9_encode_inter_superblock(cpi, x, tp, recon_yoffset, recon_uvoffset,
+                                    mb_col, mb_row);
+      else
+#endif
+        vp9_encode_inter_macroblock(cpi, x, tp,
+                                    recon_yoffset, recon_uvoffset, 1);
+        // Note the encoder may have changed the segment_id
+
+#ifdef MODE_STATS
+      inter_y_modes[mbmi->mode]++;
+
+      if (mbmi->mode == SPLITMV) {
+        int b;
+
+        for (b = 0; b < x->partition_info->count; b++) {
+          inter_b_modes[x->partition_info->bmi[b].mode]++;
+        }
+      }
+
+#endif
+
+      // If only a single reference frame is coded for a segment, exclude
+      // it from the reference frame counts used to work out the
+      // probabilities. NOTE: at the moment we don't support custom trees
+      // for the reference frame coding of each segment, but this is a
+      // possible future action.
+      segment_id = &mbmi->segment_id;
+      seg_ref_active = vp9_segfeature_active(xd, *segment_id,
+                                             SEG_LVL_REF_FRAME);
+      if (!seg_ref_active ||
+          ((vp9_check_segref(xd, *segment_id, INTRA_FRAME) +
+            vp9_check_segref(xd, *segment_id, LAST_FRAME) +
+            vp9_check_segref(xd, *segment_id, GOLDEN_FRAME) +
+            vp9_check_segref(xd, *segment_id, ALTREF_FRAME)) > 1)) {
+        cpi->count_mb_ref_frame_usage[mbmi->ref_frame]++;
+      }
+
+      // Count of last ref frame 0,0 usage
+      if ((mbmi->mode == ZEROMV) && (mbmi->ref_frame == LAST_FRAME))
+        cpi->inter_zz_count++;
+    }
+
+#if CONFIG_SUPERBLOCKS
+    if (xd->mode_info_context->mbmi.encoded_as_sb) {
+      x->src.y_buffer += 32;
+      x->src.u_buffer += 16;
+      x->src.v_buffer += 16;
+
+      x->gf_active_ptr      += 2;
+      x->partition_info     += 2;
+      xd->mode_info_context += 2;
+      xd->prev_mode_info_context += 2;
+
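+      // An EOSB token marks where this superblock's token run ends for
+      // the bitstream writer.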
+      (*tp)->Token = EOSB_TOKEN;
+      (*tp)++;
+      if (mb_row < cm->mb_rows) cpi->tplist[mb_row].stop = *tp;
+      break;
+    }
+#endif
+
+    // Next MB
+    mb_row += dy;
+    mb_col += dx;
+
+    x->src.y_buffer += 16 * (dx + dy * x->src.y_stride);
+    x->src.u_buffer += 8  * (dx + dy * x->src.uv_stride);
+    x->src.v_buffer += 8  * (dx + dy * x->src.uv_stride);
+
+    x->gf_active_ptr      += offset_unextended;
+    x->partition_info     += offset_extended;
+    xd->mode_info_context += offset_extended;
+    xd->prev_mode_info_context += offset_extended;
+
+#if CONFIG_DEBUG
+    assert((xd->prev_mode_info_context - cpi->common.prev_mip) ==
+           (xd->mode_info_context - cpi->common.mip));
+#endif
+    (*tp)->Token = EOSB_TOKEN;
+    (*tp)++;
+    if (mb_row < cm->mb_rows) cpi->tplist[mb_row].stop = *tp;
+  }
+
+  // debug output
+#if DBG_PRNT_SEGMAP
+  {
+    FILE *statsfile;
+    statsfile = fopen("segmap2.stt", "a");
+    fprintf(statsfile, "\n");
+    fclose(statsfile);
+  }
+#endif
+}
+
+static void encode_sb_row(VP9_COMP *cpi,
+                          VP9_COMMON *cm,
+                          int mb_row,
+                          MACROBLOCK  *x,
+                          MACROBLOCKD *xd,
+                          TOKENEXTRA **tp,
+                          int *totalrate) {
+  int mb_col;
+  int mb_cols = cm->mb_cols;
+
+  // Initialize the left context for the new SB row
+  vpx_memset(cm->left_context, 0, sizeof(cm->left_context));
+
+  // Code each SB in the row
+  for (mb_col = 0; mb_col < mb_cols; mb_col += 2) {
+    int mb_rate = 0, mb_dist = 0;
+#if CONFIG_SUPERBLOCKS
+    int sb_rate = INT_MAX, sb_dist;
+#endif
+
+#if CONFIG_DEBUG
+    MODE_INFO *mic = xd->mode_info_context;
+    PARTITION_INFO *pi = x->partition_info;
+    signed char  *gfa = x->gf_active_ptr;
+    unsigned char *yb = x->src.y_buffer;
+    unsigned char *ub = x->src.u_buffer;
+    unsigned char *vb = x->src.v_buffer;
+#endif
+
+#if CONFIG_SUPERBLOCKS
+    // Pick modes assuming the SB is coded as 4 independent MBs
+    xd->mode_info_context->mbmi.encoded_as_sb = 0;
+#endif
+    pick_mb_modes(cpi, cm, mb_row, mb_col, x, xd, tp, &mb_rate, &mb_dist);
+#if CONFIG_SUPERBLOCKS
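+    // Include the cost of the bit that signals "not coded as a SB"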
+    mb_rate += vp9_cost_bit(cm->sb_coded, 0);
+#endif
+
+    x->src.y_buffer -= 32;
+    x->src.u_buffer -= 16;
+    x->src.v_buffer -= 16;
+
+    x->gf_active_ptr -= 2;
+    x->partition_info -= 2;
+    xd->mode_info_context -= 2;
+    xd->prev_mode_info_context -= 2;
+
+#if CONFIG_DEBUG
+    assert(x->gf_active_ptr == gfa);
+    assert(x->partition_info == pi);
+    assert(xd->mode_info_context == mic);
+    assert(x->src.y_buffer == yb);
+    assert(x->src.u_buffer == ub);
+    assert(x->src.v_buffer == vb);
+#endif
+
+#if CONFIG_SUPERBLOCKS
+    if (!(((    mb_cols & 1) && mb_col ==     mb_cols - 1) ||
+          ((cm->mb_rows & 1) && mb_row == cm->mb_rows - 1))) {
+      /* Pick a mode assuming that it applies to all 4 of the MBs in the SB */
+      xd->mode_info_context->mbmi.encoded_as_sb = 1;
+      pick_sb_modes(cpi, cm, mb_row, mb_col, x, xd, tp, &sb_rate, &sb_dist);
+      sb_rate += vp9_cost_bit(cm->sb_coded, 1);
+    }
+
+    /* Decide whether to encode as a SB or 4xMBs */
+    if (sb_rate < INT_MAX &&
+        RDCOST(x->rdmult, x->rddiv, sb_rate, sb_dist) <
+          RDCOST(x->rdmult, x->rddiv, mb_rate, mb_dist)) {
+      xd->mode_info_context->mbmi.encoded_as_sb = 1;
+      xd->mode_info_context[1].mbmi.encoded_as_sb = 1;
+      xd->mode_info_context[cm->mode_info_stride].mbmi.encoded_as_sb = 1;
+      xd->mode_info_context[1 + cm->mode_info_stride].mbmi.encoded_as_sb = 1;
+      *totalrate += sb_rate;
+    } else
+#endif
+    {
+#if CONFIG_SUPERBLOCKS
+      xd->mode_info_context->mbmi.encoded_as_sb = 0;
+      if (cm->mb_cols - 1 > mb_col)
+        xd->mode_info_context[1].mbmi.encoded_as_sb = 0;
+      if (cm->mb_rows - 1 > mb_row) {
+        xd->mode_info_context[cm->mode_info_stride].mbmi.encoded_as_sb = 0;
+        if (cm->mb_cols - 1 > mb_col)
+          xd->mode_info_context[1 + cm->mode_info_stride].mbmi.encoded_as_sb = 0;
+      }
+#endif
+      *totalrate += mb_rate;
+    }
+
+    /* Encode SB using best computed mode(s) */
+    encode_sb(cpi, cm, mb_row, mb_col, x, xd, tp);
+
+#if CONFIG_DEBUG
+    assert(x->gf_active_ptr == gfa + 2);
+    assert(x->partition_info == pi + 2);
+    assert(xd->mode_info_context == mic + 2);
+    assert(x->src.y_buffer == yb + 32);
+    assert(x->src.u_buffer == ub + 16);
+    assert(x->src.v_buffer == vb + 16);
+#endif
+  }
+
+  // Advance pointers to the start of the next SB row (two MB rows down);
+  // the mode-info arrays carry a one-entry border column, gf_active does not
+  x->gf_active_ptr += mb_cols - (mb_cols & 0x1);
+  x->partition_info += xd->mode_info_stride + 1 - (mb_cols & 0x1);
+  xd->mode_info_context += xd->mode_info_stride + 1 - (mb_cols & 0x1);
+  xd->prev_mode_info_context += xd->mode_info_stride + 1 - (mb_cols & 0x1);
+
+#if CONFIG_DEBUG
+  assert((xd->prev_mode_info_context - cpi->common.prev_mip) ==
+         (xd->mode_info_context - cpi->common.mip));
+#endif
+}
+
+static void init_encode_frame_mb_context(VP9_COMP *cpi) {
+  MACROBLOCK *const x = &cpi->mb;
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  // GF active flags data structure
+  x->gf_active_ptr = (signed char *)cpi->gf_active_flags;
+
+  // Activity map pointer
+  x->mb_activity_ptr = cpi->mb_activity_map;
+
+  x->act_zbin_adj = 0;
+  cpi->seg0_idx = 0;
+  vpx_memset(cpi->ref_pred_count, 0, sizeof(cpi->ref_pred_count));
+
+  x->partition_info = x->pi;
+
+  xd->mode_info_context = cm->mi;
+  xd->mode_info_stride = cm->mode_info_stride;
+  xd->prev_mode_info_context = cm->prev_mi;
+
+  xd->frame_type = cm->frame_type;
+
+  xd->frames_since_golden = cm->frames_since_golden;
+  xd->frames_till_alt_ref_frame = cm->frames_till_alt_ref_frame;
+
+  // reset intra mode contexts
+  if (cm->frame_type == KEY_FRAME)
+    vp9_init_mbmode_probs(cm);
+
+  // Copy data over into macro block data structures.
+  x->src = * cpi->Source;
+  xd->pre = cm->yv12_fb[cm->lst_fb_idx];
+  xd->dst = cm->yv12_fb[cm->new_fb_idx];
+
+  // set up frame for intra coded blocks
+  vp9_setup_intra_recon(&cm->yv12_fb[cm->new_fb_idx]);
+
+  vp9_build_block_offsets(x);
+
+  vp9_setup_block_dptrs(&x->e_mbd);
+
+  vp9_setup_block_ptrs(x);
+
+  xd->mode_info_context->mbmi.mode = DC_PRED;
+  xd->mode_info_context->mbmi.uv_mode = DC_PRED;
+
+  vp9_zero(cpi->count_mb_ref_frame_usage)
+  vp9_zero(cpi->bmode_count)
+  vp9_zero(cpi->ymode_count)
+  vp9_zero(cpi->i8x8_mode_count)
+  vp9_zero(cpi->y_uv_mode_count)
+  vp9_zero(cpi->sub_mv_ref_count)
+  vp9_zero(cpi->mbsplit_count)
+  vp9_zero(cpi->common.fc.mv_ref_ct)
+  vp9_zero(cpi->common.fc.mv_ref_ct_a)
+#if CONFIG_SUPERBLOCKS
+  vp9_zero(cpi->sb_ymode_count)
+  cpi->sb_count = 0;
+#endif
+
+  vpx_memset(cm->above_context, 0,
+             sizeof(ENTROPY_CONTEXT_PLANES) * cm->mb_cols);
+
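+  // Clearing the three fractional (1/8-pel) MV bits forces full-pixel
+  // motion when the frame is coded in full-pel mode.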
+  xd->fullpixel_mask = 0xffffffff;
+  if (cm->full_pixel)
+    xd->fullpixel_mask = 0xfffffff8;
+}
+
+static void encode_frame_internal(VP9_COMP *cpi) {
+  int mb_row;
+  MACROBLOCK *const x = &cpi->mb;
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  TOKENEXTRA *tp = cpi->tok;
+  int totalrate;
+
+  //printf("encode_frame_internal\n");
+
+  // Compute a modified set of reference frame probabilities to use when
+  // prediction fails. These are based on the current general estimates for
+  // this frame which may be updated with each iteration of the recode loop.
+  vp9_compute_mod_refprobs(cm);
+
+#if CONFIG_NEW_MVREF
+  // temp stats reset
+  vp9_zero( cpi->best_ref_index_counts );
+#endif
+
+  // debug output
+#if DBG_PRNT_SEGMAP
+  {
+    FILE *statsfile;
+    statsfile = fopen("segmap2.stt", "a");
+    fprintf(statsfile, "\n");
+    fclose(statsfile);
+  }
+#endif
+
+  totalrate = 0;
+
+  // Functions setup for all frame types so we can use MC in AltRef
+  vp9_setup_interp_filters(xd, cm->mcomp_filter_type, cm);
+
+  // Reset frame count of inter 0,0 motion vector usage.
+  cpi->inter_zz_count = 0;
+
+  cpi->prediction_error = 0;
+  cpi->intra_error = 0;
+  cpi->skip_true_count[0] = cpi->skip_true_count[1] =
+      cpi->skip_true_count[2] = 0;
+  cpi->skip_false_count[0] = cpi->skip_false_count[1] =
+      cpi->skip_false_count[2] = 0;
+
+#if CONFIG_PRED_FILTER
+  if (cm->current_video_frame == 0) {
+    // Initially assume that we'll signal the prediction filter
+    // state at the frame level and that it is off.
+    cpi->common.pred_filter_mode = 0;
+    cpi->common.prob_pred_filter_off = 128;
+  }
+  cpi->pred_filter_on_count = 0;
+  cpi->pred_filter_off_count = 0;
+#endif
+  vp9_zero(cpi->switchable_interp_count);
+
+  xd->mode_info_context = cm->mi;
+  xd->prev_mode_info_context = cm->prev_mi;
+
+  vp9_zero(cpi->NMVcount);
+  vp9_zero(cpi->coef_counts);
+  vp9_zero(cpi->hybrid_coef_counts);
+  vp9_zero(cpi->coef_counts_8x8);
+  vp9_zero(cpi->hybrid_coef_counts_8x8);
+  vp9_zero(cpi->coef_counts_16x16);
+  vp9_zero(cpi->hybrid_coef_counts_16x16);
+
+  vp9_frame_init_quantizer(cpi);
+
+  vp9_initialize_rd_consts(cpi, cm->base_qindex + cm->y1dc_delta_q);
+  vp9_initialize_me_consts(cpi, cm->base_qindex);
+
+  if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
+    // Initialize encode frame context.
+    init_encode_frame_mb_context(cpi);
+
+    // Build a frame level activity map
+    build_activity_map(cpi);
+  }
+
+  // Re-initialize the encode frame context.
+  init_encode_frame_mb_context(cpi);
+
+  vpx_memset(cpi->rd_comp_pred_diff, 0, sizeof(cpi->rd_comp_pred_diff));
+  vpx_memset(cpi->single_pred_count, 0, sizeof(cpi->single_pred_count));
+  vpx_memset(cpi->comp_pred_count, 0, sizeof(cpi->comp_pred_count));
+  vpx_memset(cpi->txfm_count, 0, sizeof(cpi->txfm_count));
+  vpx_memset(cpi->txfm_count_8x8p, 0, sizeof(cpi->txfm_count_8x8p));
+  vpx_memset(cpi->rd_tx_select_diff, 0, sizeof(cpi->rd_tx_select_diff));
+  {
+    struct vpx_usec_timer  emr_timer;
+    vpx_usec_timer_start(&emr_timer);
+
+    {
+      // For each row of SBs in the frame
+      for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 2) {
+        int offset = (cm->mb_cols + 1) & ~0x1;
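+        // offset = MB columns stepped over while coding the SB row,
+        // i.e. mb_cols rounded up to an even count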
+
+        encode_sb_row(cpi, cm, mb_row, x, xd, &tp, &totalrate);
+
+        // adjust to the next row of SBs
+        x->src.y_buffer += 32 * x->src.y_stride - 16 * offset;
+        x->src.u_buffer += 16 * x->src.uv_stride - 8 * offset;
+        x->src.v_buffer += 16 * x->src.uv_stride - 8 * offset;
+      }
+
+      cpi->tok_count = tp - cpi->tok;
+    }
+
+    vpx_usec_timer_mark(&emr_timer);
+    cpi->time_encode_mb_row += vpx_usec_timer_elapsed(&emr_timer);
+
+  }
+
+  // 256 rate units to the bit,
+  // projected_frame_size in units of BYTES
+  cpi->projected_frame_size = totalrate >> 8;
+
+
+#if 0
+  // Keep record of the total distortion this time around for future use
+  cpi->last_frame_distortion = cpi->frame_distortion;
+#endif
+
+}
+
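+// Check whether more than one reference frame is actually usable this
+// frame, honouring any reference-frame restriction on segment 1; compound
+// prediction is only considered when it is.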
+static int check_dual_ref_flags(VP9_COMP *cpi) {
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
+  int ref_flags = cpi->ref_frame_flags;
+
+  if (vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME)) {
+    if ((ref_flags & (VP9_LAST_FLAG | VP9_GOLD_FLAG)) ==
+            (VP9_LAST_FLAG | VP9_GOLD_FLAG) &&
+        vp9_check_segref(xd, 1, LAST_FRAME))
+      return 1;
+    if ((ref_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) ==
+            (VP9_GOLD_FLAG | VP9_ALT_FLAG) &&
+        vp9_check_segref(xd, 1, GOLDEN_FRAME))
+      return 1;
+    if ((ref_flags & (VP9_ALT_FLAG | VP9_LAST_FLAG)) ==
+            (VP9_ALT_FLAG | VP9_LAST_FLAG) &&
+        vp9_check_segref(xd, 1, ALTREF_FRAME))
+      return 1;
+    return 0;
+  } else {
+    return (!!(ref_flags & VP9_GOLD_FLAG) +
+            !!(ref_flags & VP9_LAST_FLAG) +
+            !!(ref_flags & VP9_ALT_FLAG)) >= 2;
+  }
+}
+
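+// Once the frame-level transform mode has been narrowed, clamp the
+// recorded txfm_size of any MB that signalled a larger size; the assert
+// checks that this only happens for MBs with no coded coefficients.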
+static void reset_skip_txfm_size(VP9_COMP *cpi, TX_SIZE txfm_max) {
+  VP9_COMMON *cm = &cpi->common;
+  int mb_row, mb_col, mis = cm->mode_info_stride, segment_id;
+  MODE_INFO *mi, *mi_ptr = cm->mi;
+#if CONFIG_SUPERBLOCKS
+  MODE_INFO *sb_mi_ptr = cm->mi, *sb_mi;
+  MB_MODE_INFO *sb_mbmi;
+#endif
+  MB_MODE_INFO *mbmi;
+  MACROBLOCK *x = &cpi->mb;
+  MACROBLOCKD *xd = &x->e_mbd;
+
+  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++, mi_ptr += mis) {
+    mi = mi_ptr;
+#if CONFIG_SUPERBLOCKS
+    sb_mi = sb_mi_ptr;
+#endif
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++, mi++) {
+      mbmi = &mi->mbmi;
+#if CONFIG_SUPERBLOCKS
+      sb_mbmi = &sb_mi->mbmi;
+#endif
+      if (
+#if CONFIG_SUPERBLOCKS
+          !sb_mbmi->encoded_as_sb &&
+#endif
+          mbmi->txfm_size > txfm_max) {
+        segment_id = mbmi->segment_id;
+        xd->mode_info_context = mi;
+        assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
+                vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) ||
+               (cm->mb_no_coeff_skip && mbmi->mb_skip_coeff));
+        mbmi->txfm_size = txfm_max;
+      }
+#if CONFIG_SUPERBLOCKS
+      if (mb_col & 1)
+        sb_mi += 2;
+#endif
+    }
+#if CONFIG_SUPERBLOCKS
+    if (mb_row & 1)
+      sb_mi_ptr += 2 * mis;
+#endif
+  }
+}
+
+void vp9_encode_frame(VP9_COMP *cpi) {
+  if (cpi->sf.RD) {
+    int i, frame_type, pred_type;
+    TXFM_MODE txfm_type;
+
+    /*
+     * This code does a single RD pass over the whole frame assuming
+     * either compound, single or hybrid prediction as per whatever has
+     * worked best for that type of frame in the past.
+     * It also predicts whether another coding mode would have worked
+     * better than this coding mode. If that is the case, it remembers
+     * that for subsequent frames.
+     * It does the same analysis for transform size selection.
+     */
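+    /* frame_type buckets used for the adaptive thresholds:
+     * 0 = key frame, 1 = golden/alt-ref update, 2 = normal inter frame,
+     * 3 = alt-ref overlay (the source frame is the alt-ref) */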
+    if (cpi->common.frame_type == KEY_FRAME)
+      frame_type = 0;
+    else if (cpi->is_src_frame_alt_ref && cpi->common.refresh_golden_frame)
+      frame_type = 3;
+    else if (cpi->common.refresh_golden_frame ||
+             cpi->common.refresh_alt_ref_frame)
+      frame_type = 1;
+    else
+      frame_type = 2;
+
+    /* prediction (compound, single or hybrid) mode selection */
+    if (frame_type == 3)
+      pred_type = SINGLE_PREDICTION_ONLY;
+    else if (cpi->rd_prediction_type_threshes[frame_type][1] >
+                 cpi->rd_prediction_type_threshes[frame_type][0] &&
+             cpi->rd_prediction_type_threshes[frame_type][1] >
+                 cpi->rd_prediction_type_threshes[frame_type][2] &&
+             check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100)
+      pred_type = COMP_PREDICTION_ONLY;
+    else if (cpi->rd_prediction_type_threshes[frame_type][0] >
+                 cpi->rd_prediction_type_threshes[frame_type][2])
+      pred_type = SINGLE_PREDICTION_ONLY;
+    else
+      pred_type = HYBRID_PREDICTION;
+
+    /* transform size (4x4, 8x8, 16x16 or select-per-mb) selection */
+#if CONFIG_LOSSLESS
+    if (cpi->oxcf.lossless) {
+      txfm_type = ONLY_4X4;
+    } else
+#endif
+    /* FIXME (rbultje)
+     * this is a hack (no really), basically to work around the complete
+     * nonsense coefficient cost prediction for keyframes. The probabilities
+     * are reset to defaults, and thus we basically have no idea how expensive
+     * a 4x4 vs. 8x8 will really be. The result is that any estimate at which
+     * of the two is better is utterly bogus.
+     * I'd like to eventually remove this hack, but in order to do that, we
+     * need to move the frame reset code from the frame encode init to the
+     * bitstream write code, or alternatively keep a backup of the previous
+     * keyframe's probabilities as an estimate of what the current keyframe's
+     * coefficient cost distributions may look like. */
+    if (frame_type == 0) {
+      txfm_type = ALLOW_16X16;
+    } else
+#if 0
+    /* FIXME (rbultje)
+     * this code is disabled for a similar reason as the code above; the
+     * problem is that each time we "revert" to 4x4 only (or even 8x8 only),
+     * the coefficient probabilities for 16x16 (and 8x8) start lagging behind,
+     * thus leading to them lagging further behind and not being chosen for
+     * subsequent frames either. This is essentially a local minimum problem
+     * that we can probably fix by estimating real costs more closely within
+     * a frame, perhaps by re-calculating costs on-the-fly as frame encoding
+     * progresses. */
+    if (cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
+            cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] &&
+        cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
+            cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] &&
+        cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
+            cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) {
+      txfm_type = TX_MODE_SELECT;
+    } else if (cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] >
+                  cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]
+            && cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] >
+                  cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16]
+               ) {
+      txfm_type = ONLY_4X4;
+    } else if (cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] >=
+                  cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) {
+      txfm_type = ALLOW_16X16;
+    } else
+      txfm_type = ALLOW_8X8;
+#else
+    txfm_type = cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] >=
+                    cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ?
+                ALLOW_16X16 : TX_MODE_SELECT;
+#endif
+    cpi->common.txfm_mode = txfm_type;
+    if (txfm_type != TX_MODE_SELECT) {
+      cpi->common.prob_tx[0] = 128;
+      cpi->common.prob_tx[1] = 128;
+    }
+    cpi->common.comp_pred_mode = pred_type;
+    encode_frame_internal(cpi);
+
+    for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
+      const int diff = cpi->rd_comp_pred_diff[i] / cpi->common.MBs;
+      cpi->rd_prediction_type_threshes[frame_type][i] += diff;
+      cpi->rd_prediction_type_threshes[frame_type][i] >>= 1;
+    }
+
+    for (i = 0; i < NB_TXFM_MODES; ++i) {
+      int64_t pd = cpi->rd_tx_select_diff[i];
+      int diff;
+      if (i == TX_MODE_SELECT)
+        pd -= RDCOST(cpi->mb.rdmult, cpi->mb.rddiv, 2048 * (TX_SIZE_MAX - 1), 0);
+      diff = pd / cpi->common.MBs;
+      cpi->rd_tx_select_threshes[frame_type][i] += diff;
+      cpi->rd_tx_select_threshes[frame_type][i] /= 2;
+    }
+
+    if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
+      int single_count_zero = 0;
+      int comp_count_zero = 0;
+
+      for (i = 0; i < COMP_PRED_CONTEXTS; i++) {
+        single_count_zero += cpi->single_pred_count[i];
+        comp_count_zero += cpi->comp_pred_count[i];
+      }
+
+      if (comp_count_zero == 0) {
+        cpi->common.comp_pred_mode = SINGLE_PREDICTION_ONLY;
+      } else if (single_count_zero == 0) {
+        cpi->common.comp_pred_mode = COMP_PREDICTION_ONLY;
+      }
+    }
+
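+    // If the per-MB selection collapsed to a single size in practice,
+    // fall back to a frame-level mode so no per-MB bits are spent.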
+    if (cpi->common.txfm_mode == TX_MODE_SELECT) {
+      const int count4x4 = cpi->txfm_count[TX_4X4] + cpi->txfm_count_8x8p[TX_4X4];
+      const int count8x8 = cpi->txfm_count[TX_8X8];
+      const int count8x8_8x8p = cpi->txfm_count_8x8p[TX_8X8];
+      const int count16x16 = cpi->txfm_count[TX_16X16];
+
+      if (count4x4 == 0 && count16x16 == 0) {
+        cpi->common.txfm_mode = ALLOW_8X8;
+        reset_skip_txfm_size(cpi, TX_8X8);
+      } else if (count8x8 == 0 && count16x16 == 0 && count8x8_8x8p == 0) {
+        cpi->common.txfm_mode = ONLY_4X4;
+        reset_skip_txfm_size(cpi, TX_4X4);
+      } else if (count8x8 == 0 && count4x4 == 0) {
+        cpi->common.txfm_mode = ALLOW_16X16;
+      }
+    }
+  } else {
+    encode_frame_internal(cpi);
+  }
+
+}
+
+void vp9_setup_block_ptrs(MACROBLOCK *x) {
+  int r, c;
+  int i;
+
+  for (r = 0; r < 4; r++) {
+    for (c = 0; c < 4; c++) {
+      x->block[r * 4 + c].src_diff = x->src_diff + r * 4 * 16 + c * 4;
+    }
+  }
+
+  for (r = 0; r < 2; r++) {
+    for (c = 0; c < 2; c++) {
+      x->block[16 + r * 2 + c].src_diff = x->src_diff + 256 + r * 4 * 8 + c * 4;
+    }
+  }
+
+
+  for (r = 0; r < 2; r++) {
+    for (c = 0; c < 2; c++) {
+      x->block[20 + r * 2 + c].src_diff = x->src_diff + 320 + r * 4 * 8 + c * 4;
+    }
+  }
+
+  x->block[24].src_diff = x->src_diff + 384;
+
+
+  for (i = 0; i < 25; i++) {
+    x->block[i].coeff = x->coeff + i * 16;
+  }
+}
+
+void vp9_build_block_offsets(MACROBLOCK *x) {
+  int block = 0;
+  int br, bc;
+
+  vp9_build_block_doffsets(&x->e_mbd);
+
+  // y blocks
+  x->thismb_ptr = &x->thismb[0];
+  for (br = 0; br < 4; br++) {
+    for (bc = 0; bc < 4; bc++) {
+      BLOCK *this_block = &x->block[block];
+      // this_block->base_src = &x->src.y_buffer;
+      // this_block->src_stride = x->src.y_stride;
+      // this_block->src = 4 * br * this_block->src_stride + 4 * bc;
+      this_block->base_src = &x->thismb_ptr;
+      this_block->src_stride = 16;
+      this_block->src = 4 * br * 16 + 4 * bc;
+      ++block;
+    }
+  }
+
+  // u blocks
+  for (br = 0; br < 2; br++) {
+    for (bc = 0; bc < 2; bc++) {
+      BLOCK *this_block = &x->block[block];
+      this_block->base_src = &x->src.u_buffer;
+      this_block->src_stride = x->src.uv_stride;
+      this_block->src = 4 * br * this_block->src_stride + 4 * bc;
+      ++block;
+    }
+  }
+
+  // v blocks
+  for (br = 0; br < 2; br++) {
+    for (bc = 0; bc < 2; bc++) {
+      BLOCK *this_block = &x->block[block];
+      this_block->base_src = &x->src.v_buffer;
+      this_block->src_stride = x->src.uv_stride;
+      this_block->src = 4 * br * this_block->src_stride + 4 * bc;
+      ++block;
+    }
+  }
+}
+
+static void sum_intra_stats(VP9_COMP *cpi, MACROBLOCK *x) {
+  const MACROBLOCKD *xd = &x->e_mbd;
+  const MB_PREDICTION_MODE m = xd->mode_info_context->mbmi.mode;
+  const MB_PREDICTION_MODE uvm = xd->mode_info_context->mbmi.uv_mode;
+
+#ifdef MODE_STATS
+  const int is_key = cpi->common.frame_type == KEY_FRAME;
+
+  ++ (is_key ? uv_modes : inter_uv_modes)[uvm];
+  ++ uv_modes_y[m][uvm];
+
+  if (m == B_PRED) {
+    unsigned int *const bct = is_key ? b_modes : inter_b_modes;
+
+    int b = 0;
+
+    do {
+      ++ bct[xd->block[b].bmi.as_mode.first];
+    } while (++b < 16);
+  }
+
+  if (m == I8X8_PRED) {
+    i8x8_modes[xd->block[0].bmi.as_mode.first]++;
+    i8x8_modes[xd->block[2].bmi.as_mode.first]++;
+    i8x8_modes[xd->block[8].bmi.as_mode.first]++;
+    i8x8_modes[xd->block[10].bmi.as_mode.first]++;
+  }
+#endif
+
+#if CONFIG_SUPERBLOCKS
+  if (xd->mode_info_context->mbmi.encoded_as_sb) {
+    ++cpi->sb_ymode_count[m];
+  } else
+#endif
+    ++cpi->ymode_count[m];
+  if (m != I8X8_PRED)
+    ++cpi->y_uv_mode_count[m][uvm];
+  else {
+    cpi->i8x8_mode_count[xd->block[0].bmi.as_mode.first]++;
+    cpi->i8x8_mode_count[xd->block[2].bmi.as_mode.first]++;
+    cpi->i8x8_mode_count[xd->block[8].bmi.as_mode.first]++;
+    cpi->i8x8_mode_count[xd->block[10].bmi.as_mode.first]++;
+  }
+  if (m == B_PRED) {
+    int b = 0;
+    do {
+      ++ cpi->bmode_count[xd->block[b].bmi.as_mode.first];
+    } while (++b < 16);
+  }
+}
+
+// Experimental stub function to create a per MB zbin adjustment based on
+// some previously calculated measure of MB activity.
+static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x) {
+#if USE_ACT_INDEX
+  x->act_zbin_adj = *(x->mb_activity_ptr);
+#else
+  int64_t a;
+  int64_t b;
+  int64_t act = *(x->mb_activity_ptr);
+
+  // Derive a zbin adjustment from this MB's activity relative to the
+  // frame average.
+  a = act + 4 * cpi->activity_avg;
+  b = 4 * act + cpi->activity_avg;
+
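+  // For act > activity_avg, b > a, so the rounded ratio exceeds one and
+  // busy MBs get a positive zbin boost (and quiet MBs a negative one).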
+  if (act > cpi->activity_avg)
+    x->act_zbin_adj = (int)(((int64_t)b + (a >> 1)) / a) - 1;
+  else
+    x->act_zbin_adj = 1 - (int)(((int64_t)a + (b >> 1)) / b);
+#endif
+}
+
+#if CONFIG_SUPERBLOCKS
+static void update_sb_skip_coeff_state(VP9_COMP *cpi,
+                                       MACROBLOCK *x,
+                                       ENTROPY_CONTEXT_PLANES ta[4],
+                                       ENTROPY_CONTEXT_PLANES tl[4],
+                                       TOKENEXTRA *t[4],
+                                       TOKENEXTRA **tp,
+                                       int skip[4]) {
+  TOKENEXTRA tokens[4][16 * 24];
+  int n_tokens[4], n;
+
+  // if there were no skips, we don't need to do anything
+  if (!skip[0] && !skip[1] && !skip[2] && !skip[3])
+    return;
+
+  // if we don't do coeff skipping for this frame, we don't
+  // need to do anything here
+  if (!cpi->common.mb_no_coeff_skip)
+    return;
+
+  // if all 4 MBs skipped coeff coding, nothing to be done
+  if (skip[0] && skip[1] && skip[2] && skip[3])
+    return;
+
+  // so the situation now is that we want to skip coeffs
+  // for some MBs, but not all, and we didn't code EOB
+  // coefficients for them. However, the skip flag for this
+  // SB will be 0 overall, so we need to insert EOBs in the
+  // middle of the token tree. Do so here.
+  n_tokens[0] = t[1] - t[0];
+  n_tokens[1] = t[2] - t[1];
+  n_tokens[2] = t[3] - t[2];
+  n_tokens[3] = *tp  - t[3];
+  if (n_tokens[0])
+    memcpy(tokens[0], t[0], n_tokens[0] * sizeof(*t[0]));
+  if (n_tokens[1])
+    memcpy(tokens[1], t[1], n_tokens[1] * sizeof(*t[0]));
+  if (n_tokens[2])
+    memcpy(tokens[2], t[2], n_tokens[2] * sizeof(*t[0]));
+  if (n_tokens[3])
+    memcpy(tokens[3], t[3], n_tokens[3] * sizeof(*t[0]));
+
+  // reset pointer, stuff EOBs where necessary
+  *tp = t[0];
+  for (n = 0; n < 4; n++) {
+    if (skip[n]) {
+      x->e_mbd.above_context = &ta[n];
+      x->e_mbd.left_context  = &tl[n];
+      vp9_stuff_mb(cpi, &x->e_mbd, tp, 0);
+    } else {
+      if (n_tokens[n]) {
+        memcpy(*tp, tokens[n], sizeof(*t[0]) * n_tokens[n]);
+      }
+      (*tp) += n_tokens[n];
+    }
+  }
+}
+
+void vp9_encode_intra_super_block(VP9_COMP *cpi,
+                                  MACROBLOCK *x,
+                                  TOKENEXTRA **t,
+                                  int mb_col) {
+  const int output_enabled = 1;
+  int n;
+  MACROBLOCKD *xd = &x->e_mbd;
+  VP9_COMMON *cm = &cpi->common;
+  const uint8_t *src = x->src.y_buffer;
+  uint8_t *dst = xd->dst.y_buffer;
+  const uint8_t *usrc = x->src.u_buffer;
+  uint8_t *udst = xd->dst.u_buffer;
+  const uint8_t *vsrc = x->src.v_buffer;
+  uint8_t *vdst = xd->dst.v_buffer;
+  int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
+  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
+  const VP9_ENCODER_RTCD *rtcd = IF_RTCD(&cpi->rtcd);
+  TOKENEXTRA *tp[4];
+  int skip[4];
+  MODE_INFO *mi = x->e_mbd.mode_info_context;
+  ENTROPY_CONTEXT_PLANES ta[4], tl[4];
+
+  if ((cpi->oxcf.tuning == VP8_TUNE_SSIM) && output_enabled) {
+    adjust_act_zbin(cpi, x);
+    vp9_update_zbin_extra(cpi, x);
+  }
+
+  vp9_build_intra_predictors_sby_s(&x->e_mbd);
+  vp9_build_intra_predictors_sbuv_s(&x->e_mbd);
+
+  assert(x->e_mbd.mode_info_context->mbmi.txfm_size == TX_8X8);
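+  // Code the SB as four 16x16 MBs in raster order; x_idx selects the MB
+  // column within the SB and y_idx the MB row.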
+  for (n = 0; n < 4; n++) {
+    int x_idx = n & 1, y_idx = n >> 1;
+
+    xd->above_context = cm->above_context + mb_col + (n & 1);
+    xd->left_context = cm->left_context + (n >> 1);
+
+    vp9_subtract_mby_s_c(x->src_diff,
+                         src + x_idx * 16 + y_idx * 16 * src_y_stride,
+                         src_y_stride,
+                         dst + x_idx * 16 + y_idx * 16 * dst_y_stride,
+                         dst_y_stride);
+    vp9_subtract_mbuv_s_c(x->src_diff,
+                          usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                          vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                          src_uv_stride,
+                          udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                          vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                          dst_uv_stride);
+    vp9_transform_mb_8x8(x);
+    vp9_quantize_mb_8x8(x);
+    if (x->optimize) {
+      vp9_optimize_mby_8x8(x, rtcd);
+      vp9_optimize_mbuv_8x8(x, rtcd);
+    }
+    vp9_inverse_transform_mb_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+    vp9_recon_mby_s_c(&x->e_mbd, dst + x_idx * 16 + y_idx * 16 * dst_y_stride);
+    vp9_recon_mbuv_s_c(&x->e_mbd,
+                       udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                       vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride);
+
+    if (output_enabled) {
+      memcpy(&ta[n], xd->above_context, sizeof(ta[n]));
+      memcpy(&tl[n], xd->left_context, sizeof(tl[n]));
+      tp[n] = *t;
+      xd->mode_info_context = mi + x_idx + y_idx * cm->mode_info_stride;
+      vp9_tokenize_mb(cpi, &x->e_mbd, t, 0);
+      skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff;
+    }
+  }
+
+  if (output_enabled) {
+    // Tokenize
+    xd->mode_info_context = mi;
+    sum_intra_stats(cpi, x);
+    update_sb_skip_coeff_state(cpi, x, ta, tl, tp, t, skip);
+  }
+}
+#endif /* CONFIG_SUPERBLOCKS */
+
+void vp9_encode_intra_macro_block(VP9_COMP *cpi,
+                                  MACROBLOCK *x,
+                                  TOKENEXTRA **t,
+                                  int output_enabled) {
+  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
+  if ((cpi->oxcf.tuning == VP8_TUNE_SSIM) && output_enabled) {
+    adjust_act_zbin(cpi, x);
+    vp9_update_zbin_extra(cpi, x);
+  }
+  if (mbmi->mode == I8X8_PRED) {
+    vp9_encode_intra8x8mby(IF_RTCD(&cpi->rtcd), x);
+    vp9_encode_intra8x8mbuv(IF_RTCD(&cpi->rtcd), x);
+  } else if (mbmi->mode == B_PRED) {
+    vp9_encode_intra4x4mby(IF_RTCD(&cpi->rtcd), x);
+  } else {
+    vp9_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
+  }
+
+  if (mbmi->mode != I8X8_PRED) {
+    vp9_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
+  }
+
+  if (output_enabled) {
+    int segment_id = mbmi->segment_id;
+
+    // Tokenize
+    sum_intra_stats(cpi, x);
+    vp9_tokenize_mb(cpi, &x->e_mbd, t, 0);
+
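+    // The per-MB transform size is only counted when the MB actually has
+    // coded coefficients; for skipped MBs, force the largest size the
+    // frame's transform mode allows.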
+    if (cpi->common.txfm_mode == TX_MODE_SELECT &&
+        !((cpi->common.mb_no_coeff_skip && mbmi->mb_skip_coeff) ||
+          (vp9_segfeature_active(&x->e_mbd, segment_id, SEG_LVL_EOB) &&
+           vp9_get_segdata(&x->e_mbd, segment_id, SEG_LVL_EOB) == 0))) {
+      if (mbmi->mode != B_PRED && mbmi->mode != I8X8_PRED) {
+        cpi->txfm_count[mbmi->txfm_size]++;
+      } else if (mbmi->mode == I8X8_PRED) {
+        cpi->txfm_count_8x8p[mbmi->txfm_size]++;
+      }
+    } else if (cpi->common.txfm_mode >= ALLOW_16X16 && mbmi->mode <= TM_PRED) {
+      mbmi->txfm_size = TX_16X16;
+    } else if (cpi->common.txfm_mode >= ALLOW_8X8 && mbmi->mode != B_PRED) {
+      mbmi->txfm_size = TX_8X8;
+    } else {
+      mbmi->txfm_size = TX_4X4;
+    }
+  }
+#if CONFIG_NEWBESTREFMV
+  else
+    vp9_tokenize_mb(cpi, &x->e_mbd, t, 1);
+#endif
+}
+
+extern void vp9_fix_contexts(MACROBLOCKD *xd);
+
+void vp9_encode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
+                                 TOKENEXTRA **t, int recon_yoffset,
+                                 int recon_uvoffset, int output_enabled) {
+  VP9_COMMON *cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
+  unsigned char *segment_id = &mbmi->segment_id;
+  int seg_ref_active;
+  unsigned char ref_pred_flag;
+
+  x->skip = 0;
+#if CONFIG_SUPERBLOCKS
+  assert(!xd->mode_info_context->mbmi.encoded_as_sb);
+#endif
+
+  vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
+  if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
+    // Adjust the zbin based on this MB rate.
+    adjust_act_zbin(cpi, x);
+  }
+
+  {
+    // Experimental code. Special case for gf and arf zeromv modes.
+    // Increase zbin size to suppress noise
+    cpi->zbin_mode_boost = 0;
+    if (cpi->zbin_mode_boost_enabled) {
+      if (mbmi->ref_frame != INTRA_FRAME) {
+        if (mbmi->mode == ZEROMV) {
+          if (mbmi->ref_frame != LAST_FRAME)
+            cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
+          else
+            cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
+        } else if (mbmi->mode == SPLITMV)
+          cpi->zbin_mode_boost = 0;
+        else
+          cpi->zbin_mode_boost = MV_ZBIN_BOOST;
+      }
+    }
+
+    vp9_update_zbin_extra(cpi, x);
+  }
+
+  seg_ref_active = vp9_segfeature_active(xd, *segment_id, SEG_LVL_REF_FRAME);
+
+  // SET VARIOUS PREDICTION FLAGS
+
+  // Did the chosen reference frame match its predicted value?
+  ref_pred_flag = (mbmi->ref_frame == vp9_get_pred_ref(cm, xd));
+  vp9_set_pred_flag(xd, PRED_REF, ref_pred_flag);
+
+  if (mbmi->ref_frame == INTRA_FRAME) {
+    if (mbmi->mode == B_PRED) {
+      vp9_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
+      vp9_encode_intra4x4mby(IF_RTCD(&cpi->rtcd), x);
+    } else if (mbmi->mode == I8X8_PRED) {
+      vp9_encode_intra8x8mby(IF_RTCD(&cpi->rtcd), x);
+      vp9_encode_intra8x8mbuv(IF_RTCD(&cpi->rtcd), x);
+    } else {
+      vp9_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
+      vp9_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
+    }
+
+    if (output_enabled)
+      sum_intra_stats(cpi, x);
+  } else {
+    int ref_fb_idx;
+
+    if (mbmi->ref_frame == LAST_FRAME)
+      ref_fb_idx = cpi->common.lst_fb_idx;
+    else if (mbmi->ref_frame == GOLDEN_FRAME)
+      ref_fb_idx = cpi->common.gld_fb_idx;
+    else
+      ref_fb_idx = cpi->common.alt_fb_idx;
+
+    xd->pre.y_buffer = cpi->common.yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
+    xd->pre.u_buffer = cpi->common.yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
+    xd->pre.v_buffer = cpi->common.yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
+
+    if (mbmi->second_ref_frame) {
+      int second_ref_fb_idx;
+
+      if (mbmi->second_ref_frame == LAST_FRAME)
+        second_ref_fb_idx = cpi->common.lst_fb_idx;
+      else if (mbmi->second_ref_frame == GOLDEN_FRAME)
+        second_ref_fb_idx = cpi->common.gld_fb_idx;
+      else
+        second_ref_fb_idx = cpi->common.alt_fb_idx;
+
+      xd->second_pre.y_buffer = cpi->common.yv12_fb[second_ref_fb_idx].y_buffer +
+                                recon_yoffset;
+      xd->second_pre.u_buffer = cpi->common.yv12_fb[second_ref_fb_idx].u_buffer +
+                                recon_uvoffset;
+      xd->second_pre.v_buffer = cpi->common.yv12_fb[second_ref_fb_idx].v_buffer +
+                                recon_uvoffset;
+    }
+
+    if (!x->skip) {
+      vp9_encode_inter16x16(IF_RTCD(&cpi->rtcd), x);
+
+      // Clear mb_skip_coeff if mb_no_coeff_skip is not set
+      if (!cpi->common.mb_no_coeff_skip)
+        mbmi->mb_skip_coeff = 0;
+
+    } else {
+      vp9_build_1st_inter16x16_predictors_mb(xd, xd->dst.y_buffer,
+                                             xd->dst.u_buffer, xd->dst.v_buffer,
+                                             xd->dst.y_stride,
+                                             xd->dst.uv_stride);
+    }
+  }
+
+  if (!x->skip) {
+#ifdef ENC_DEBUG
+    if (enc_debug) {
+      int i;
+      printf("Segment=%d [%d, %d]: %d %d:\n", mbmi->segment_id, mb_col_debug,
+             mb_row_debug, xd->mb_to_left_edge, xd->mb_to_top_edge);
+      for (i = 0; i < 400; i++) {
+        printf("%3d ", xd->qcoeff[i]);
+        if (i % 16 == 15) printf("\n");
+      }
+      printf("\n");
+      printf("eobs = ");
+      for (i = 0; i < 25; i++)
+        printf("%d:%d ", i, xd->block[i].eob);
+      printf("\n");
+      fflush(stdout);
+    }
+#endif
+
+    vp9_tokenize_mb(cpi, xd, t, !output_enabled);
+
+#ifdef ENC_DEBUG
+    if (enc_debug) {
+      printf("Tokenized\n");
+      fflush(stdout);
+    }
+#endif
+  } else {
+    int mb_skip_context = cpi->common.mb_no_coeff_skip ?
+        (x->e_mbd.mode_info_context - 1)->mbmi.mb_skip_coeff +
+            (x->e_mbd.mode_info_context -
+             cpi->common.mode_info_stride)->mbmi.mb_skip_coeff :
+        0;
+    if (cpi->common.mb_no_coeff_skip) {
+      mbmi->mb_skip_coeff = 1;
+      if (output_enabled)
+        cpi->skip_true_count[mb_skip_context]++;
+      vp9_fix_contexts(xd);
+    } else {
+      vp9_stuff_mb(cpi, xd, t, !output_enabled);
+      mbmi->mb_skip_coeff = 0;
+      if (output_enabled)
+        cpi->skip_false_count[mb_skip_context]++;
+    }
+  }
+
+  if (output_enabled) {
+    int segment_id = mbmi->segment_id;
+    if (cpi->common.txfm_mode == TX_MODE_SELECT &&
+        !((cpi->common.mb_no_coeff_skip && mbmi->mb_skip_coeff) ||
+          (vp9_segfeature_active(&x->e_mbd, segment_id, SEG_LVL_EOB) &&
+           vp9_get_segdata(&x->e_mbd, segment_id, SEG_LVL_EOB) == 0))) {
+      if (mbmi->mode != B_PRED && mbmi->mode != I8X8_PRED &&
+          mbmi->mode != SPLITMV) {
+        cpi->txfm_count[mbmi->txfm_size]++;
+      } else if (mbmi->mode == I8X8_PRED ||
+                 (mbmi->mode == SPLITMV &&
+                  mbmi->partitioning != PARTITIONING_4X4)) {
+        cpi->txfm_count_8x8p[mbmi->txfm_size]++;
+      }
+    } else if (mbmi->mode != B_PRED && mbmi->mode != I8X8_PRED &&
+        mbmi->mode != SPLITMV && cpi->common.txfm_mode >= ALLOW_16X16) {
+      mbmi->txfm_size = TX_16X16;
+    } else if (mbmi->mode != B_PRED &&
+               !(mbmi->mode == SPLITMV &&
+                 mbmi->partitioning == PARTITIONING_4X4) &&
+               cpi->common.txfm_mode >= ALLOW_8X8) {
+      mbmi->txfm_size = TX_8X8;
+    } else {
+      mbmi->txfm_size = TX_4X4;
+    }
+  }
+}
+
+#if CONFIG_SUPERBLOCKS
+void vp9_encode_inter_superblock(VP9_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
+                                 int recon_yoffset, int recon_uvoffset,
+                                 int mb_col, int mb_row) {
+  const int output_enabled = 1;
+  VP9_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  const uint8_t *src = x->src.y_buffer;
+  uint8_t *dst = xd->dst.y_buffer;
+  const uint8_t *usrc = x->src.u_buffer;
+  uint8_t *udst = xd->dst.u_buffer;
+  const uint8_t *vsrc = x->src.v_buffer;
+  uint8_t *vdst = xd->dst.v_buffer;
+  int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
+  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
+  const VP9_ENCODER_RTCD *rtcd = IF_RTCD(&cpi->rtcd);
+  unsigned int segment_id = xd->mode_info_context->mbmi.segment_id;
+  int seg_ref_active;
+  unsigned char ref_pred_flag;
+  int n;
+  TOKENEXTRA *tp[4];
+  int skip[4];
+  MODE_INFO *mi = x->e_mbd.mode_info_context;
+  ENTROPY_CONTEXT_PLANES ta[4], tl[4];
+
+  x->skip = 0;
+
+  if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
+    // Adjust the zbin based on this MB rate.
+    adjust_act_zbin(cpi, x);
+  }
+
+  {
+    // Experimental code. Special case for gf and arf zeromv modes.
+    // Increase zbin size to suppress noise
+    cpi->zbin_mode_boost = 0;
+    if (cpi->zbin_mode_boost_enabled) {
+      if (xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME) {
+        if (xd->mode_info_context->mbmi.mode == ZEROMV) {
+          if (xd->mode_info_context->mbmi.ref_frame != LAST_FRAME)
+            cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
+          else
+            cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
+        } else if (xd->mode_info_context->mbmi.mode == SPLITMV)
+          cpi->zbin_mode_boost = 0;
+        else
+          cpi->zbin_mode_boost = MV_ZBIN_BOOST;
+      }
+    }
+
+    vp9_update_zbin_extra(cpi, x);
+  }
+
+  seg_ref_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME);
+
+  // SET VARIOUS PREDICTION FLAGS
+
+  // Did the chosen reference frame match its predicted value?
+  ref_pred_flag = (xd->mode_info_context->mbmi.ref_frame ==
+                   vp9_get_pred_ref(cm, xd));
+  vp9_set_pred_flag(xd, PRED_REF, ref_pred_flag);
+
+  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
+    vp9_build_intra_predictors_sby_s(&x->e_mbd);
+    vp9_build_intra_predictors_sbuv_s(&x->e_mbd);
+  } else {
+    int ref_fb_idx;
+
+    if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
+      ref_fb_idx = cpi->common.lst_fb_idx;
+    else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
+      ref_fb_idx = cpi->common.gld_fb_idx;
+    else
+      ref_fb_idx = cpi->common.alt_fb_idx;
+
+    xd->pre.y_buffer = cpi->common.yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
+    xd->pre.u_buffer = cpi->common.yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
+    xd->pre.v_buffer = cpi->common.yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
+
+    if (xd->mode_info_context->mbmi.second_ref_frame) {
+      int second_ref_fb_idx;
+
+      if (xd->mode_info_context->mbmi.second_ref_frame == LAST_FRAME)
+        second_ref_fb_idx = cpi->common.lst_fb_idx;
+      else if (xd->mode_info_context->mbmi.second_ref_frame == GOLDEN_FRAME)
+        second_ref_fb_idx = cpi->common.gld_fb_idx;
+      else
+        second_ref_fb_idx = cpi->common.alt_fb_idx;
+
+      xd->second_pre.y_buffer = cpi->common.yv12_fb[second_ref_fb_idx].y_buffer +
+                                    recon_yoffset;
+      xd->second_pre.u_buffer = cpi->common.yv12_fb[second_ref_fb_idx].u_buffer +
+                                    recon_uvoffset;
+      xd->second_pre.v_buffer = cpi->common.yv12_fb[second_ref_fb_idx].v_buffer +
+                                    recon_uvoffset;
+    }
+
+    vp9_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,
+                                       xd->dst.u_buffer, xd->dst.v_buffer,
+                                       xd->dst.y_stride, xd->dst.uv_stride);
+  }
+
+  assert(x->e_mbd.mode_info_context->mbmi.txfm_size == TX_8X8);
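+  // Encode the 32x32 superblock as four 16x16 quadrants in raster order;
+  // x_idx/y_idx locate each quadrant within the source and reconstruction
+  // buffers.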
+  for (n = 0; n < 4; n++) {
+    int x_idx = n & 1, y_idx = n >> 1;
+
+    vp9_subtract_mby_s_c(x->src_diff,
+                         src + x_idx * 16 + y_idx * 16 * src_y_stride,
+                         src_y_stride,
+                         dst + x_idx * 16 + y_idx * 16 * dst_y_stride,
+                         dst_y_stride);
+    vp9_subtract_mbuv_s_c(x->src_diff,
+                          usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                          vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                          src_uv_stride,
+                          udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                          vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                          dst_uv_stride);
+    vp9_transform_mb_8x8(x);
+    vp9_quantize_mb_8x8(x);
+    if (x->optimize) {
+      vp9_optimize_mby_8x8(x, rtcd);
+      vp9_optimize_mbuv_8x8(x, rtcd);
+    }
+    vp9_inverse_transform_mb_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+    vp9_recon_mby_s_c(&x->e_mbd,
+                      dst + x_idx * 16 + y_idx * 16 * dst_y_stride);
+    vp9_recon_mbuv_s_c(&x->e_mbd,
+                       udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                       vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride);
+
+    if (!x->skip) {
+      if (output_enabled) {
+        xd->left_context = cm->left_context + (n >> 1);
+        xd->above_context = cm->above_context + mb_col + (n & 1);
+        memcpy(&ta[n], xd->above_context, sizeof(ta[n]));
+        memcpy(&tl[n], xd->left_context, sizeof(tl[n]));
+        tp[n] = *t;
+        xd->mode_info_context = mi + x_idx + y_idx * cm->mode_info_stride;
+        vp9_tokenize_mb(cpi, &x->e_mbd, t, 0);
+        skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff;
+      }
+    } else {
+      int mb_skip_context =
+        cpi->common.mb_no_coeff_skip ?
+          (x->e_mbd.mode_info_context - 1)->mbmi.mb_skip_coeff +
+            (x->e_mbd.mode_info_context -
+             cpi->common.mode_info_stride)->mbmi.mb_skip_coeff :
+          0;
+      if (cpi->common.mb_no_coeff_skip) {
+        skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff = 1;
+        xd->left_context = cm->left_context + (n >> 1);
+        xd->above_context = cm->above_context + mb_col + (n & 1);
+        memcpy(&ta[n], xd->above_context, sizeof(ta[n]));
+        memcpy(&tl[n], xd->left_context, sizeof(tl[n]));
+        tp[n] = *t;
+        cpi->skip_true_count[mb_skip_context]++;
+        vp9_fix_contexts(xd);
+      } else {
+        vp9_stuff_mb(cpi, xd, t, 0);
+        xd->mode_info_context->mbmi.mb_skip_coeff = 0;
+        cpi->skip_false_count[mb_skip_context]++;
+      }
+    }
+  }
+
+  xd->mode_info_context = mi;
+  update_sb_skip_coeff_state(cpi, x, ta, tl, tp, t, skip);
+}
+#endif
--- /dev/null
+++ b/vp9/encoder/encodeintra.c
@@ -1,0 +1,289 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/config.h"
+#include "vpx_rtcd.h"
+#include "vp9/common/idct.h"
+#include "quantize.h"
+#include "vp9/common/reconintra.h"
+#include "vp9/common/reconintra4x4.h"
+#include "encodemb.h"
+#include "vp9/common/invtrans.h"
+#include "encodeintra.h"
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define IF_RTCD(x) (x)
+#else
+#define IF_RTCD(x) NULL
+#endif
+
+int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred) {
+  int i;
+  int intra_pred_var = 0;
+  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
+  (void) cpi;
+
+  if (use_16x16_pred) {
+    mbmi->mode = DC_PRED;
+#if CONFIG_COMP_INTRA_PRED
+    mbmi->second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
+#endif
+    mbmi->uv_mode = DC_PRED;
+    mbmi->ref_frame = INTRA_FRAME;
+
+    vp9_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
+  } else {
+    for (i = 0; i < 16; i++) {
+      x->e_mbd.block[i].bmi.as_mode.first = B_DC_PRED;
+      vp9_encode_intra4x4block(IF_RTCD(&cpi->rtcd), x, i);
+    }
+  }
+
+  intra_pred_var = vp9_get_mb_ss(x->src_diff);
+
+  return intra_pred_var;
+}
+
+void vp9_encode_intra4x4block(const VP9_ENCODER_RTCD *rtcd,
+                              MACROBLOCK *x, int ib) {
+  BLOCKD *b = &x->e_mbd.block[ib];
+  BLOCK *be = &x->block[ib];
+  TX_TYPE tx_type;
+
+#if CONFIG_COMP_INTRA_PRED
+  if (b->bmi.as_mode.second == (B_PREDICTION_MODE)(B_DC_PRED - 1)) {
+#endif
+    vp9_intra4x4_predict(b, b->bmi.as_mode.first, b->predictor);
+#if CONFIG_COMP_INTRA_PRED
+  } else {
+    vp9_comp_intra4x4_predict(b, b->bmi.as_mode.first, b->bmi.as_mode.second,
+                              b->predictor);
+  }
+#endif
+
+  vp9_subtract_b(be, b, 16);
+
+  tx_type = get_tx_type(&x->e_mbd, b);
+  if (tx_type != DCT_DCT) {
+    vp9_fht(be->src_diff, 32, be->coeff, tx_type, 4);
+    vp9_ht_quantize_b_4x4(be, b, tx_type);
+    vp9_ihtllm_c(b->dqcoeff, b->diff, 32, tx_type, 4);
+  } else {
+    x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
+    x->quantize_b_4x4(be, b);
+    vp9_inverse_transform_b_4x4(IF_RTCD(&rtcd->common->idct), b, 32);
+  }
+
+  vp9_recon_b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+}
+
+void vp9_encode_intra4x4mby(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *mb) {
+  int i;
+
+  for (i = 0; i < 16; i++)
+    vp9_encode_intra4x4block(rtcd, mb, i);
+}
+
+void vp9_encode_intra16x16mby(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  BLOCK *b = &x->block[0];
+  TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
+  TX_TYPE tx_type;
+
+#if CONFIG_COMP_INTRA_PRED
+  if (xd->mode_info_context->mbmi.second_mode ==
+      (MB_PREDICTION_MODE)(DC_PRED - 1))
+#endif
+    vp9_build_intra_predictors_mby(xd);
+#if CONFIG_COMP_INTRA_PRED
+  else
+    vp9_build_comp_intra_predictors_mby(xd);
+#endif
+
+  vp9_subtract_mby(x->src_diff, *(b->base_src), xd->predictor, b->src_stride);
+
+  if (tx_size == TX_16X16) {
+    BLOCKD  *bd = &xd->block[0];
+    tx_type = get_tx_type(xd, bd);
+    if (tx_type != DCT_DCT) {
+      vp9_fht(b->src_diff, 32, b->coeff, tx_type, 16);
+      vp9_quantize_mby_16x16(x);
+      if (x->optimize)
+        vp9_optimize_mby_16x16(x, rtcd);
+      vp9_ihtllm_c(bd->dqcoeff, bd->diff, 32, tx_type, 16);
+    } else {
+      vp9_transform_mby_16x16(x);
+      vp9_quantize_mby_16x16(x);
+      if (x->optimize)
+        vp9_optimize_mby_16x16(x, rtcd);
+      vp9_inverse_transform_mby_16x16(IF_RTCD(&rtcd->common->idct), xd);
+    }
+  } else if (tx_size == TX_8X8) {
+    vp9_transform_mby_8x8(x);
+    vp9_quantize_mby_8x8(x);
+    if (x->optimize)
+      vp9_optimize_mby_8x8(x, rtcd);
+    vp9_inverse_transform_mby_8x8(IF_RTCD(&rtcd->common->idct), xd);
+  } else {
+    vp9_transform_mby_4x4(x);
+    vp9_quantize_mby_4x4(x);
+    if (x->optimize)
+      vp9_optimize_mby_4x4(x, rtcd);
+    vp9_inverse_transform_mby_4x4(IF_RTCD(&rtcd->common->idct), xd);
+  }
+
+  vp9_recon_mby(xd);
+}
+
+void vp9_encode_intra16x16mbuv(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
+
+#if CONFIG_COMP_INTRA_PRED
+  if (xd->mode_info_context->mbmi.second_uv_mode ==
+      (MB_PREDICTION_MODE)(DC_PRED - 1)) {
+#endif
+    vp9_build_intra_predictors_mbuv(xd);
+#if CONFIG_COMP_INTRA_PRED
+  } else {
+    vp9_build_comp_intra_predictors_mbuv(xd);
+  }
+#endif
+
+  vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
+                    xd->predictor, x->src.uv_stride);
+
+  if (tx_size == TX_4X4) {
+    vp9_transform_mbuv_4x4(x);
+    vp9_quantize_mbuv_4x4(x);
+    if (x->optimize)
+      vp9_optimize_mbuv_4x4(x, rtcd);
+    vp9_inverse_transform_mbuv_4x4(IF_RTCD(&rtcd->common->idct), xd);
+  } else /* 16x16 or 8x8 */ {
+    vp9_transform_mbuv_8x8(x);
+    vp9_quantize_mbuv_8x8(x);
+    if (x->optimize)
+      vp9_optimize_mbuv_8x8(x, rtcd);
+    vp9_inverse_transform_mbuv_8x8(IF_RTCD(&rtcd->common->idct), xd);
+  }
+
+  vp9_recon_intra_mbuv(xd);
+}
+
+void vp9_encode_intra8x8(const VP9_ENCODER_RTCD *rtcd,
+                         MACROBLOCK *x, int ib) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  BLOCKD *b = &xd->block[ib];
+  BLOCK *be = &x->block[ib];
+  const int iblock[4] = {0, 1, 4, 5};
+  int i;
+  TX_TYPE tx_type;
+
+#if CONFIG_COMP_INTRA_PRED
+  if (b->bmi.as_mode.second == (MB_PREDICTION_MODE)(DC_PRED - 1)) {
+#endif
+    vp9_intra8x8_predict(b, b->bmi.as_mode.first, b->predictor);
+#if CONFIG_COMP_INTRA_PRED
+  } else {
+    vp9_comp_intra8x8_predict(b, b->bmi.as_mode.first, b->bmi.as_mode.second,
+                              b->predictor);
+  }
+#endif
+
+  if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
+    int idx = (ib & 0x02) ? (ib + 2) : ib;
+
+    // generate residual blocks
+    vp9_subtract_4b_c(be, b, 16);
+
+    tx_type = get_tx_type(xd, xd->block + idx);
+    if (tx_type != DCT_DCT) {
+      vp9_fht(be->src_diff, 32, (x->block + idx)->coeff,
+                tx_type, 8);
+      x->quantize_b_8x8(x->block + idx, xd->block + idx);
+      vp9_ihtllm_c(xd->block[idx].dqcoeff, xd->block[ib].diff, 32,
+                   tx_type, 8);
+    } else {
+      x->vp9_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32);
+      x->quantize_b_8x8(x->block + idx, xd->block + idx);
+      vp9_idct_idct8(xd->block[idx].dqcoeff, xd->block[ib].diff, 32);
+    }
+  } else {
+    for (i = 0; i < 4; i++) {
+      b = &xd->block[ib + iblock[i]];
+      be = &x->block[ib + iblock[i]];
+      vp9_subtract_b(be, b, 16);
+      x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
+      x->quantize_b_4x4(be, b);
+      vp9_inverse_transform_b_4x4(IF_RTCD(&rtcd->common->idct), b, 32);
+    }
+  }
+
+  // reconstruct submacroblock
+  for (i = 0; i < 4; i++) {
+    b = &xd->block[ib + iblock[i]];
+    vp9_recon_b_c(b->predictor, b->diff, *(b->base_dst) + b->dst,
+                  b->dst_stride);
+  }
+}
+
+void vp9_encode_intra8x8mby(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
+  int i, ib;
+
+  for (i = 0; i < 4; i++) {
+    ib = vp9_i8x8_block[i];
+    vp9_encode_intra8x8(rtcd, x, ib);
+  }
+}
+
+void vp9_encode_intra_uv4x4(const VP9_ENCODER_RTCD *rtcd,
+                            MACROBLOCK *x, int ib,
+                            int mode, int second) {
+  BLOCKD *b = &x->e_mbd.block[ib];
+  BLOCK *be = &x->block[ib];
+
+#if CONFIG_COMP_INTRA_PRED
+  if (second == -1) {
+#endif
+    vp9_intra_uv4x4_predict(b, mode, b->predictor);
+#if CONFIG_COMP_INTRA_PRED
+  } else {
+    vp9_comp_intra_uv4x4_predict(b, mode, second, b->predictor);
+  }
+#endif
+
+  vp9_subtract_b(be, b, 8);
+
+  x->vp9_short_fdct4x4(be->src_diff, be->coeff, 16);
+  x->quantize_b_4x4(be, b);
+  vp9_inverse_transform_b_4x4(IF_RTCD(&rtcd->common->idct), b, 16);
+
+  vp9_recon_uv_b_c(b->predictor, b->diff, *(b->base_dst) + b->dst,
+                   b->dst_stride);
+}
+
+void vp9_encode_intra8x8mbuv(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
+  int i, ib, mode, second;
+  BLOCKD *b;
+
+  for (i = 0; i < 4; i++) {
+    ib = vp9_i8x8_block[i];
+    b = &x->e_mbd.block[ib];
+    mode = b->bmi.as_mode.first;
+#if CONFIG_COMP_INTRA_PRED
+    second = b->bmi.as_mode.second;
+#else
+    second = -1;
+#endif
+    /* u */
+    vp9_encode_intra_uv4x4(rtcd, x, i + 16, mode, second);
+    /* v */
+    vp9_encode_intra_uv4x4(rtcd, x, i + 20, mode, second);
+  }
+}
--- /dev/null
+++ b/vp9/encoder/encodeintra.h
@@ -1,0 +1,27 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef __ENCODEINTRA_H_
+#define __ENCODEINTRA_H_
+
+#include "onyx_int.h"
+
+int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred);
+void vp9_encode_intra16x16mby(const VP9_ENCODER_RTCD *, MACROBLOCK *x);
+void vp9_encode_intra16x16mbuv(const VP9_ENCODER_RTCD *, MACROBLOCK *x);
+void vp9_encode_intra4x4mby(const VP9_ENCODER_RTCD *, MACROBLOCK *mb);
+void vp9_encode_intra4x4block(const VP9_ENCODER_RTCD *rtcd,
+                              MACROBLOCK *x, int ib);
+void vp9_encode_intra8x8mby(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x);
+void vp9_encode_intra8x8mbuv(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x);
+void vp9_encode_intra8x8(const VP9_ENCODER_RTCD *rtcd,
+                         MACROBLOCK *x, int ib);
+
+#endif  // __ENCODEINTRA_H_
--- /dev/null
+++ b/vp9/encoder/encodemb.c
@@ -1,0 +1,950 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/config.h"
+#include "encodemb.h"
+#include "vp9/common/reconinter.h"
+#include "quantize.h"
+#include "tokenize.h"
+#include "vp9/common/invtrans.h"
+#include "vp9/common/reconintra.h"
+#include "vpx_mem/vpx_mem.h"
+#include "rdopt.h"
+#include "vp9/common/systemdependent.h"
+#include "vpx_rtcd.h"
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define IF_RTCD(x) (x)
+#else
+#define IF_RTCD(x) NULL
+#endif
+
+void vp9_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch) {
+  unsigned char *src_ptr = (*(be->base_src) + be->src);
+  short *diff_ptr = be->src_diff;
+  unsigned char *pred_ptr = bd->predictor;
+  int src_stride = be->src_stride;
+
+  int r, c;
+
+  for (r = 0; r < 4; r++) {
+    for (c = 0; c < 4; c++) {
+      diff_ptr[c] = src_ptr[c] - pred_ptr[c];
+    }
+
+    diff_ptr += pitch;
+    pred_ptr += pitch;
+    src_ptr  += src_stride;
+  }
+}
+
+void vp9_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch) {
+  unsigned char *src_ptr = (*(be->base_src) + be->src);
+  short *diff_ptr = be->src_diff;
+  unsigned char *pred_ptr = bd->predictor;
+  int src_stride = be->src_stride;
+  int r, c;
+
+  for (r = 0; r < 8; r++) {
+    for (c = 0; c < 8; c++) {
+      diff_ptr[c] = src_ptr[c] - pred_ptr[c];
+    }
+    diff_ptr += pitch;
+    pred_ptr += pitch;
+    src_ptr  += src_stride;
+  }
+}
+
+void vp9_subtract_mbuv_s_c(short *diff, const unsigned char *usrc,
+                           const unsigned char *vsrc, int src_stride,
+                           const unsigned char *upred,
+                           const unsigned char *vpred, int dst_stride) {
+  short *udiff = diff + 256;
+  short *vdiff = diff + 320;
+  int r, c;
+
+  for (r = 0; r < 8; r++) {
+    for (c = 0; c < 8; c++) {
+      udiff[c] = usrc[c] - upred[c];
+    }
+
+    udiff += 8;
+    upred += dst_stride;
+    usrc  += src_stride;
+  }
+
+  for (r = 0; r < 8; r++) {
+    for (c = 0; c < 8; c++) {
+      vdiff[c] = vsrc[c] - vpred[c];
+    }
+
+    vdiff += 8;
+    vpred += dst_stride;
+    vsrc  += src_stride;
+  }
+}
+
+void vp9_subtract_mbuv_c(short *diff, unsigned char *usrc,
+                         unsigned char *vsrc, unsigned char *pred, int stride) {
+  unsigned char *upred = pred + 256;
+  unsigned char *vpred = pred + 320;
+
+  vp9_subtract_mbuv_s_c(diff, usrc, vsrc, stride, upred, vpred, 8);
+}
+
+void vp9_subtract_mby_s_c(short *diff, const unsigned char *src, int src_stride,
+                          const unsigned char *pred, int dst_stride) {
+  int r, c;
+
+  for (r = 0; r < 16; r++) {
+    for (c = 0; c < 16; c++) {
+      diff[c] = src[c] - pred[c];
+    }
+
+    diff += 16;
+    pred += dst_stride;
+    src  += src_stride;
+  }
+}
+
+void vp9_subtract_mby_c(short *diff, unsigned char *src,
+                        unsigned char *pred, int stride) {
+  vp9_subtract_mby_s_c(diff, src, stride, pred, 16);
+}
+
+static void subtract_mb(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
+  BLOCK *b = &x->block[0];
+
+  vp9_subtract_mby(x->src_diff, *(b->base_src), x->e_mbd.predictor,
+                   b->src_stride);
+  vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
+                    x->e_mbd.predictor, x->src.uv_stride);
+}
+
+static void build_dcblock_4x4(MACROBLOCK *x) {
+  short *src_diff_ptr = &x->src_diff[384];
+  int i;
+
+  for (i = 0; i < 16; i++) {
+    src_diff_ptr[i] = x->coeff[i * 16];
+  }
+}
+
+void vp9_transform_mby_4x4(MACROBLOCK *x) {
+  int i;
+
+  for (i = 0; i < 16; i += 2) {
+    x->vp9_short_fdct8x4(&x->block[i].src_diff[0],
+                         &x->block[i].coeff[0], 32);
+  }
+
+  if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV) {
+    // build dc block from 16 y dc values
+    build_dcblock_4x4(x);
+
+    // do 2nd order transform on the dc block
+    x->short_walsh4x4(&x->block[24].src_diff[0],
+                      &x->block[24].coeff[0], 8);
+  }
+}
+
+void vp9_transform_mbuv_4x4(MACROBLOCK *x) {
+  int i;
+
+  for (i = 16; i < 24; i += 2) {
+    x->vp9_short_fdct8x4(&x->block[i].src_diff[0],
+                         &x->block[i].coeff[0], 16);
+  }
+}
+
+static void transform_mb_4x4(MACROBLOCK *x) {
+  vp9_transform_mby_4x4(x);
+  vp9_transform_mbuv_4x4(x);
+}
+
+static void build_dcblock_8x8(MACROBLOCK *x) {
+  int16_t *src_diff_ptr = x->block[24].src_diff;
+  int i;
+
+  for (i = 0; i < 16; i++) {
+    src_diff_ptr[i] = 0;
+  }
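+  // Scatter the four 8x8 DC terms (coefficient blocks 0, 4, 8, 12) to the
+  // input positions read by the 2x2 second order Haar transform.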
+  src_diff_ptr[0] = x->coeff[0 * 16];
+  src_diff_ptr[1] = x->coeff[4 * 16];
+  src_diff_ptr[4] = x->coeff[8 * 16];
+  src_diff_ptr[8] = x->coeff[12 * 16];
+}
+
+void vp9_transform_mby_8x8(MACROBLOCK *x) {
+  int i;
+
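+  // The four luma 8x8 transforms read from blocks 0, 2, 8 and 10 (the
+  // top-left 4x4 of each 8x8) and each write 64 coefficients, landing at
+  // coefficient-block offsets 0, 4, 8 and 12.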
+  for (i = 0; i < 9; i += 8) {
+    x->vp9_short_fdct8x8(&x->block[i].src_diff[0],
+                         &x->block[i].coeff[0], 32);
+  }
+  for (i = 2; i < 11; i += 8) {
+    x->vp9_short_fdct8x8(&x->block[i].src_diff[0],
+                         &x->block[i + 2].coeff[0], 32);
+  }
+
+  if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV) {
+    // build dc block from 2x2 y dc values
+    build_dcblock_8x8(x);
+
+    // do 2nd order transform on the dc block
+    x->short_fhaar2x2(&x->block[24].src_diff[0],
+                      &x->block[24].coeff[0], 8);
+  }
+}
+
+void vp9_transform_mbuv_8x8(MACROBLOCK *x) {
+  int i;
+
+  for (i = 16; i < 24; i += 4) {
+    x->vp9_short_fdct8x8(&x->block[i].src_diff[0],
+                         &x->block[i].coeff[0], 16);
+  }
+}
+
+void vp9_transform_mb_8x8(MACROBLOCK *x) {
+  vp9_transform_mby_8x8(x);
+  vp9_transform_mbuv_8x8(x);
+}
+
+void vp9_transform_mby_16x16(MACROBLOCK *x) {
+  vp9_clear_system_state();
+  x->vp9_short_fdct16x16(&x->block[0].src_diff[0],
+                         &x->block[0].coeff[0], 32);
+}
+
+void vp9_transform_mb_16x16(MACROBLOCK *x) {
+  vp9_transform_mby_16x16(x);
+  vp9_transform_mbuv_8x8(x);
+}
+
+#define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
+#define RDTRUNC_8x8(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
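+// Deterministic tie-breaker applied when two RD costs compare equal; only
+// the rate term affects the result (the distortion arguments are unused).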
+typedef struct vp9_token_state vp9_token_state;
+
+struct vp9_token_state {
+  int           rate;
+  int           error;
+  int           next;
+  signed char   token;
+  short         qc;
+};
+
+// TODO: run experiments to find the optimal multiplier values
+#define Y1_RD_MULT 4
+#define UV_RD_MULT 2
+#define Y2_RD_MULT 4
+
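+// Per-plane RD error weights, indexed by PLANE_TYPE
+// (Y without DC, Y2, UV, Y with DC).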
+static const int plane_rd_mult[4] = {
+  Y1_RD_MULT,
+  Y2_RD_MULT,
+  UV_RD_MULT,
+  Y1_RD_MULT
+};
+
+#define UPDATE_RD_COST()\
+{\
+  rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);\
+  rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);\
+  if (rd_cost0 == rd_cost1) {\
+    rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);\
+    rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);\
+  }\
+}
+
+static void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type,
+                       ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
+                       const VP9_ENCODER_RTCD *rtcd, int tx_size) {
+  BLOCK *b;
+  BLOCKD *d;
+  vp9_token_state tokens[65][2];
+  uint64_t best_mask[2];
+  const short *dequant_ptr;
+  const short *coeff_ptr;
+  short *qcoeff_ptr;
+  short *dqcoeff_ptr;
+  int eob;
+  int i0;
+  int rc;
+  int x;
+  int sz = 0;
+  int next;
+  int rdmult;
+  int rddiv;
+  int final_eob;
+  int64_t rd_cost0, rd_cost1;
+  int rate0, rate1;
+  int error0, error1;
+  int t0, t1;
+  int best;
+  int band;
+  int pt;
+  int err_mult = plane_rd_mult[type];
+  int default_eob;
+  int const *scan, *bands;
+
+  b = &mb->block[i];
+  d = &mb->e_mbd.block[i];
+  switch (tx_size) {
+    default:
+    case TX_4X4:
+      scan = vp9_default_zig_zag1d;
+      bands = vp9_coef_bands;
+      default_eob = 16;
+      // TODO: this isn't called (for intra4x4 modes), but will be left in
+      // since it could be used later
+      {
+        TX_TYPE tx_type = get_tx_type(&mb->e_mbd, d);
+        if (tx_type != DCT_DCT) {
+          switch (tx_type) {
+            case ADST_DCT:
+              scan = vp9_row_scan;
+              break;
+
+            case DCT_ADST:
+              scan = vp9_col_scan;
+              break;
+
+            default:
+              scan = vp9_default_zig_zag1d;
+              break;
+          }
+        } else {
+          scan = vp9_default_zig_zag1d;
+        }
+      }
+      break;
+    case TX_8X8:
+      scan = vp9_default_zig_zag1d_8x8;
+      bands = vp9_coef_bands_8x8;
+      default_eob = 64;
+      break;
+  }
+
+  dequant_ptr = d->dequant;
+  coeff_ptr = b->coeff;
+  qcoeff_ptr = d->qcoeff;
+  dqcoeff_ptr = d->dqcoeff;
+  i0 = (type == PLANE_TYPE_Y_NO_DC);
+  eob = d->eob;
+
+  /* Now set up a Viterbi trellis to evaluate alternative roundings. */
+  rdmult = mb->rdmult * err_mult;
+  if (mb->e_mbd.mode_info_context->mbmi.ref_frame == INTRA_FRAME)
+    rdmult = (rdmult * 9) >> 4;
+  rddiv = mb->rddiv;
+  best_mask[0] = best_mask[1] = 0;
+  /* Initialize the sentinel node of the trellis. */
+  tokens[eob][0].rate = 0;
+  tokens[eob][0].error = 0;
+  tokens[eob][0].next = default_eob;
+  tokens[eob][0].token = DCT_EOB_TOKEN;
+  tokens[eob][0].qc = 0;
+  *(tokens[eob] + 1) = *(tokens[eob] + 0);
+  next = eob;
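+  /* Each trellis node holds two states per coefficient position: state 0
+   * keeps the quantized value as-is, state 1 evaluates the value pulled one
+   * step towards zero.
+   */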
+  for (i = eob; i-- > i0;) {
+    int base_bits;
+    int d2;
+    int dx;
+
+    rc = scan[i];
+    x = qcoeff_ptr[rc];
+    /* Only add a trellis state for non-zero coefficients. */
+    if (x) {
+      int shortcut = 0;
+      error0 = tokens[next][0].error;
+      error1 = tokens[next][1].error;
+      /* Evaluate the first possibility for this state. */
+      rate0 = tokens[next][0].rate;
+      rate1 = tokens[next][1].rate;
+      t0 = (vp9_dct_value_tokens_ptr + x)->Token;
+      /* Consider both possible successor states. */
+      if (next < default_eob) {
+        band = bands[i + 1];
+        pt = vp9_prev_token_class[t0];
+        rate0 +=
+          mb->token_costs[tx_size][type][band][pt][tokens[next][0].token];
+        rate1 +=
+          mb->token_costs[tx_size][type][band][pt][tokens[next][1].token];
+      }
+      UPDATE_RD_COST();
+      /* And pick the best. */
+      best = rd_cost1 < rd_cost0;
+      base_bits = *(vp9_dct_value_cost_ptr + x);
+      dx = dqcoeff_ptr[rc] - coeff_ptr[rc];
+      d2 = dx * dx;
+      tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
+      tokens[i][0].error = d2 + (best ? error1 : error0);
+      tokens[i][0].next = next;
+      tokens[i][0].token = t0;
+      tokens[i][0].qc = x;
+      best_mask[0] |= best << i;
+      /* Evaluate the second possibility for this state. */
+      rate0 = tokens[next][0].rate;
+      rate1 = tokens[next][1].rate;
+
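+      /* A one-step reduction is only worth evaluating when the dequantized
+       * magnitude overshoots the original coefficient by less than one
+       * quantizer step; the distortion change can then be updated
+       * incrementally below.
+       */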
+      if ((abs(x) * dequant_ptr[rc != 0] > abs(coeff_ptr[rc])) &&
+          (abs(x) * dequant_ptr[rc != 0] <
+           abs(coeff_ptr[rc]) + dequant_ptr[rc != 0]))
+        shortcut = 1;
+      else
+        shortcut = 0;
+
+      if (shortcut) {
+        sz = -(x < 0);
+        x -= 2 * sz + 1;
+      }
+
+      /* Consider both possible successor states. */
+      if (!x) {
+        /* If we reduced this coefficient to zero, check to see if
+         *  we need to move the EOB back here.
+         */
+        t0 = tokens[next][0].token == DCT_EOB_TOKEN ?
+             DCT_EOB_TOKEN : ZERO_TOKEN;
+        t1 = tokens[next][1].token == DCT_EOB_TOKEN ?
+             DCT_EOB_TOKEN : ZERO_TOKEN;
+      } else {
+        t0 = t1 = (vp9_dct_value_tokens_ptr + x)->Token;
+      }
+      if (next < default_eob) {
+        band = bands[i + 1];
+        if (t0 != DCT_EOB_TOKEN) {
+          pt = vp9_prev_token_class[t0];
+          rate0 += mb->token_costs[tx_size][type][band][pt][
+              tokens[next][0].token];
+        }
+        if (t1 != DCT_EOB_TOKEN) {
+          pt = vp9_prev_token_class[t1];
+          rate1 += mb->token_costs[tx_size][type][band][pt][
+              tokens[next][1].token];
+        }
+      }
+
+      UPDATE_RD_COST();
+      /* And pick the best. */
+      best = rd_cost1 < rd_cost0;
+      base_bits = *(vp9_dct_value_cost_ptr + x);
+
+      if (shortcut) {
+        dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
+        d2 = dx * dx;
+      }
+      tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
+      tokens[i][1].error = d2 + (best ? error1 : error0);
+      tokens[i][1].next = next;
+      tokens[i][1].token = best ? t1 : t0;
+      tokens[i][1].qc = x;
+      best_mask[1] |= best << i;
+      /* Finally, make this the new head of the trellis. */
+      next = i;
+    }
+    /* There's no choice to make for a zero coefficient, so we don't
+     *  add a new trellis node, but we do need to update the costs.
+     */
+    else {
+      band = bands[i + 1];
+      t0 = tokens[next][0].token;
+      t1 = tokens[next][1].token;
+      /* Update the cost of each path if we're past the EOB token. */
+      if (t0 != DCT_EOB_TOKEN) {
+        tokens[next][0].rate += mb->token_costs[tx_size][type][band][0][t0];
+        tokens[next][0].token = ZERO_TOKEN;
+      }
+      if (t1 != DCT_EOB_TOKEN) {
+        tokens[next][1].rate += mb->token_costs[tx_size][type][band][0][t1];
+        tokens[next][1].token = ZERO_TOKEN;
+      }
+      /* Don't update next, because we didn't add a new node. */
+    }
+  }
+
+  /* Now pick the best path through the whole trellis. */
+  band = bands[i + 1];
+  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+  rate0 = tokens[next][0].rate;
+  rate1 = tokens[next][1].rate;
+  error0 = tokens[next][0].error;
+  error1 = tokens[next][1].error;
+  t0 = tokens[next][0].token;
+  t1 = tokens[next][1].token;
+  rate0 += mb->token_costs[tx_size][type][band][pt][t0];
+  rate1 += mb->token_costs[tx_size][type][band][pt][t1];
+  UPDATE_RD_COST();
+  best = rd_cost1 < rd_cost0;
+  final_eob = i0 - 1;
+  for (i = next; i < eob; i = next) {
+    x = tokens[i][best].qc;
+    if (x)
+      final_eob = i;
+    rc = scan[i];
+    qcoeff_ptr[rc] = x;
+    dqcoeff_ptr[rc] = (x * dequant_ptr[rc != 0]);
+
+    next = tokens[i][best].next;
+    best = (best_mask[best] >> i) & 1;
+  }
+  final_eob++;
+
+  d->eob = final_eob;
+  *a = *l = (d->eob != !type);
+}
+
+/**************************************************************************
+Our inverse Hadamard transform is effectively a weighted sum of all 16
+inputs, with each weight either +1 or -1, and a final scaling stage of
+(sum + 1) >> 2. The DC-only idct is (dc + 16) >> 5. So if every sum lies
+between -65 and 63, the output after the inverse WHT and idct will be all
+zero. A sum of absolute values smaller than 65 guarantees that all 16
+(+1/-1)-weighted sums in the WHT fall between -65 and +65.
+**************************************************************************/
+#define SUM_2ND_COEFF_THRESH 65
+
+static void check_reset_2nd_coeffs(MACROBLOCKD *xd,
+                                   ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) {
+  int sum = 0;
+  int i;
+  BLOCKD *bd = &xd->block[24];
+  if (bd->dequant[0] >= SUM_2ND_COEFF_THRESH
+      && bd->dequant[1] >= SUM_2ND_COEFF_THRESH)
+    return;
+
+  for (i = 0; i < bd->eob; i++) {
+    int coef = bd->dqcoeff[vp9_default_zig_zag1d[i]];
+    sum += (coef >= 0) ? coef : -coef;
+    if (sum >= SUM_2ND_COEFF_THRESH)
+      return;
+  }
+
+  if (sum < SUM_2ND_COEFF_THRESH) {
+    for (i = 0; i < bd->eob; i++) {
+      int rc = vp9_default_zig_zag1d[i];
+      bd->qcoeff[rc] = 0;
+      bd->dqcoeff[rc] = 0;
+    }
+    bd->eob = 0;
+    *a = *l = (bd->eob != 0);
+  }
+}
+
+#define SUM_2ND_COEFF_THRESH_8X8 32
+static void check_reset_8x8_2nd_coeffs(MACROBLOCKD *xd,
+                                       ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) {
+  int sum = 0;
+  BLOCKD *bd = &xd->block[24];
+  int coef;
+
+  coef = bd->dqcoeff[0];
+  sum += (coef >= 0) ? coef : -coef;
+  coef = bd->dqcoeff[1];
+  sum += (coef >= 0) ? coef : -coef;
+  coef = bd->dqcoeff[4];
+  sum += (coef >= 0) ? coef : -coef;
+  coef = bd->dqcoeff[8];
+  sum += (coef >= 0) ? coef : -coef;
+
+  if (sum < SUM_2ND_COEFF_THRESH_8X8) {
+    bd->qcoeff[0] = 0;
+    bd->dqcoeff[0] = 0;
+    bd->qcoeff[1] = 0;
+    bd->dqcoeff[1] = 0;
+    bd->qcoeff[4] = 0;
+    bd->dqcoeff[4] = 0;
+    bd->qcoeff[8] = 0;
+    bd->dqcoeff[8] = 0;
+    bd->eob = 0;
+    *a = *l = (bd->eob != 0);
+  }
+}
+
+void vp9_optimize_mby_4x4(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
+  int b;
+  PLANE_TYPE type;
+  int has_2nd_order;
+  ENTROPY_CONTEXT_PLANES t_above, t_left;
+  ENTROPY_CONTEXT *ta;
+  ENTROPY_CONTEXT *tl;
+  MB_PREDICTION_MODE mode = x->e_mbd.mode_info_context->mbmi.mode;
+
+  if (!x->e_mbd.above_context || !x->e_mbd.left_context)
+    return;
+
+  vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+  ta = (ENTROPY_CONTEXT *)&t_above;
+  tl = (ENTROPY_CONTEXT *)&t_left;
+
+  has_2nd_order = (mode != B_PRED && mode != I8X8_PRED && mode != SPLITMV);
+  type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC;
+
+  for (b = 0; b < 16; b++) {
+    optimize_b(x, b, type,
+               ta + vp9_block2above[b], tl + vp9_block2left[b], rtcd, TX_4X4);
+  }
+
+  if (has_2nd_order) {
+    b = 24;
+    optimize_b(x, b, PLANE_TYPE_Y2,
+               ta + vp9_block2above[b], tl + vp9_block2left[b], rtcd, TX_4X4);
+    check_reset_2nd_coeffs(&x->e_mbd,
+                           ta + vp9_block2above[b], tl + vp9_block2left[b]);
+  }
+}
+
+void vp9_optimize_mbuv_4x4(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
+  int b;
+  ENTROPY_CONTEXT_PLANES t_above, t_left;
+  ENTROPY_CONTEXT *ta;
+  ENTROPY_CONTEXT *tl;
+
+  if (!x->e_mbd.above_context || !x->e_mbd.left_context)
+    return;
+
+  vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+  ta = (ENTROPY_CONTEXT *)&t_above;
+  tl = (ENTROPY_CONTEXT *)&t_left;
+
+  for (b = 16; b < 24; b++) {
+    optimize_b(x, b, PLANE_TYPE_UV,
+               ta + vp9_block2above[b], tl + vp9_block2left[b], rtcd, TX_4X4);
+  }
+}
+
+static void optimize_mb_4x4(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
+  vp9_optimize_mby_4x4(x, rtcd);
+  vp9_optimize_mbuv_4x4(x, rtcd);
+}
+
+void vp9_optimize_mby_8x8(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
+  int b;
+  PLANE_TYPE type;
+  ENTROPY_CONTEXT_PLANES t_above, t_left;
+  ENTROPY_CONTEXT *ta;
+  ENTROPY_CONTEXT *tl;
+  int has_2nd_order = x->e_mbd.mode_info_context->mbmi.mode != SPLITMV;
+
+  if (!x->e_mbd.above_context || !x->e_mbd.left_context)
+    return;
+
+  vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+  ta = (ENTROPY_CONTEXT *)&t_above;
+  tl = (ENTROPY_CONTEXT *)&t_left;
+  type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC;
+  for (b = 0; b < 16; b += 4) {
+    optimize_b(x, b, type,
+               ta + vp9_block2above_8x8[b], tl + vp9_block2left_8x8[b],
+               rtcd, TX_8X8);
+    ta[vp9_block2above_8x8[b] + 1] = ta[vp9_block2above_8x8[b]];
+    tl[vp9_block2left_8x8[b] + 1]  = tl[vp9_block2left_8x8[b]];
+  }
+
+  // 8x8 transforms always have a 2nd order Haar block.
+  if (has_2nd_order) {
+    check_reset_8x8_2nd_coeffs(&x->e_mbd,
+                               ta + vp9_block2above_8x8[24],
+                               tl + vp9_block2left_8x8[24]);
+  }
+}
+
+void vp9_optimize_mbuv_8x8(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
+  int b;
+  ENTROPY_CONTEXT_PLANES t_above, t_left;
+  ENTROPY_CONTEXT *ta;
+  ENTROPY_CONTEXT *tl;
+
+  if (!x->e_mbd.above_context || !x->e_mbd.left_context)
+    return;
+
+  vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+  ta = (ENTROPY_CONTEXT *)&t_above;
+  tl = (ENTROPY_CONTEXT *)&t_left;
+
+  for (b = 16; b < 24; b += 4) {
+    optimize_b(x, b, PLANE_TYPE_UV,
+               ta + vp9_block2above_8x8[b], tl + vp9_block2left_8x8[b],
+               rtcd, TX_8X8);
+    ta[vp9_block2above_8x8[b] + 1] = ta[vp9_block2above_8x8[b]];
+    tl[vp9_block2left_8x8[b] + 1]  = tl[vp9_block2left_8x8[b]];
+  }
+}
+
+static void optimize_mb_8x8(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
+  vp9_optimize_mby_8x8(x, rtcd);
+  vp9_optimize_mbuv_8x8(x, rtcd);
+}
+
+static void optimize_b_16x16(MACROBLOCK *mb, int i, PLANE_TYPE type,
+                             ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
+                             const VP9_ENCODER_RTCD *rtcd) {
+  BLOCK *b = &mb->block[i];
+  BLOCKD *d = &mb->e_mbd.block[i];
+  vp9_token_state tokens[257][2];
+  unsigned best_index[257][2];
+  const short *dequant_ptr = d->dequant, *coeff_ptr = b->coeff;
+  short *qcoeff_ptr = d->qcoeff;
+  short *dqcoeff_ptr = d->dqcoeff;
+  int eob = d->eob, final_eob, sz = 0;
+  int rc, x, next;
+  int64_t rdmult, rddiv, rd_cost0, rd_cost1;
+  int rate0, rate1, error0, error1, t0, t1;
+  int best, band, pt;
+  int err_mult = plane_rd_mult[type];
+
+  /* Now set up a Viterbi trellis to evaluate alternative roundings. */
+  rdmult = mb->rdmult * err_mult;
+  if (mb->e_mbd.mode_info_context->mbmi.ref_frame == INTRA_FRAME)
+    rdmult = (rdmult * 9) >> 4;
+  rddiv = mb->rddiv;
+  memset(best_index, 0, sizeof(best_index));
+  /* Initialize the sentinel node of the trellis. */
+  tokens[eob][0].rate = 0;
+  tokens[eob][0].error = 0;
+  tokens[eob][0].next = 256;
+  tokens[eob][0].token = DCT_EOB_TOKEN;
+  tokens[eob][0].qc = 0;
+  *(tokens[eob] + 1) = *(tokens[eob] + 0);
+  next = eob;
+  for (i = eob; i-- > 0;) {
+    int base_bits, d2, dx;
+
+    rc = vp9_default_zig_zag1d_16x16[i];
+    x = qcoeff_ptr[rc];
+    /* Only add a trellis state for non-zero coefficients. */
+    if (x) {
+      int shortcut = 0;
+      error0 = tokens[next][0].error;
+      error1 = tokens[next][1].error;
+      /* Evaluate the first possibility for this state. */
+      rate0 = tokens[next][0].rate;
+      rate1 = tokens[next][1].rate;
+      t0 = (vp9_dct_value_tokens_ptr + x)->Token;
+      /* Consider both possible successor states. */
+      if (next < 256) {
+        band = vp9_coef_bands_16x16[i + 1];
+        pt = vp9_prev_token_class[t0];
+        rate0 += mb->token_costs[TX_16X16][type][band][pt]
+            [tokens[next][0].token];
+        rate1 += mb->token_costs[TX_16X16][type][band][pt]
+            [tokens[next][1].token];
+      }
+      UPDATE_RD_COST();
+      /* And pick the best. */
+      best = rd_cost1 < rd_cost0;
+      base_bits = *(vp9_dct_value_cost_ptr + x);
+      dx = dqcoeff_ptr[rc] - coeff_ptr[rc];
+      d2 = dx * dx;
+      tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
+      tokens[i][0].error = d2 + (best ? error1 : error0);
+      tokens[i][0].next = next;
+      tokens[i][0].token = t0;
+      tokens[i][0].qc = x;
+      best_index[i][0] = best;
+      /* Evaluate the second possibility for this state. */
+      rate0 = tokens[next][0].rate;
+      rate1 = tokens[next][1].rate;
+
+      if ((abs(x) * dequant_ptr[rc != 0] > abs(coeff_ptr[rc])) &&
+          (abs(x) * dequant_ptr[rc != 0] <
+           abs(coeff_ptr[rc]) + dequant_ptr[rc != 0]))
+        shortcut = 1;
+      else
+        shortcut = 0;
+
+      if (shortcut) {
+        sz = -(x < 0);
+        x -= 2 * sz + 1;
+      }
+
+      /* Consider both possible successor states. */
+      if (!x) {
+        /* If we reduced this coefficient to zero, check to see if
+         *  we need to move the EOB back here.
+         */
+        t0 = tokens[next][0].token == DCT_EOB_TOKEN ?
+             DCT_EOB_TOKEN : ZERO_TOKEN;
+        t1 = tokens[next][1].token == DCT_EOB_TOKEN ?
+             DCT_EOB_TOKEN : ZERO_TOKEN;
+      } else {
+        t0 = t1 = (vp9_dct_value_tokens_ptr + x)->Token;
+      }
+      if (next < 256) {
+        band = vp9_coef_bands_16x16[i + 1];
+        if (t0 != DCT_EOB_TOKEN) {
+          pt = vp9_prev_token_class[t0];
+          rate0 += mb->token_costs[TX_16X16][type][band][pt]
+              [tokens[next][0].token];
+        }
+        if (t1 != DCT_EOB_TOKEN) {
+          pt = vp9_prev_token_class[t1];
+          rate1 += mb->token_costs[TX_16X16][type][band][pt]
+              [tokens[next][1].token];
+        }
+      }
+      UPDATE_RD_COST();
+      /* And pick the best. */
+      best = rd_cost1 < rd_cost0;
+      base_bits = *(vp9_dct_value_cost_ptr + x);
+
+      if (shortcut) {
+        dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
+        d2 = dx * dx;
+      }
+      tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
+      tokens[i][1].error = d2 + (best ? error1 : error0);
+      tokens[i][1].next = next;
+      tokens[i][1].token = best ? t1 : t0;
+      tokens[i][1].qc = x;
+      best_index[i][1] = best;
+      /* Finally, make this the new head of the trellis. */
+      next = i;
+    }
+    /* There's no choice to make for a zero coefficient, so we don't
+     *  add a new trellis node, but we do need to update the costs.
+     */
+    else {
+      band = vp9_coef_bands_16x16[i + 1];
+      t0 = tokens[next][0].token;
+      t1 = tokens[next][1].token;
+      /* Update the cost of each path if we're past the EOB token. */
+      if (t0 != DCT_EOB_TOKEN) {
+        tokens[next][0].rate += mb->token_costs[TX_16X16][type][band][0][t0];
+        tokens[next][0].token = ZERO_TOKEN;
+      }
+      if (t1 != DCT_EOB_TOKEN) {
+        tokens[next][1].rate += mb->token_costs[TX_16X16][type][band][0][t1];
+        tokens[next][1].token = ZERO_TOKEN;
+      }
+      /* Don't update next, because we didn't add a new node. */
+    }
+  }
+
+  /* Now pick the best path through the whole trellis. */
+  band = vp9_coef_bands_16x16[i + 1];
+  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+  rate0 = tokens[next][0].rate;
+  rate1 = tokens[next][1].rate;
+  error0 = tokens[next][0].error;
+  error1 = tokens[next][1].error;
+  t0 = tokens[next][0].token;
+  t1 = tokens[next][1].token;
+  rate0 += mb->token_costs[TX_16X16][type][band][pt][t0];
+  rate1 += mb->token_costs[TX_16X16][type][band][pt][t1];
+  UPDATE_RD_COST();
+  best = rd_cost1 < rd_cost0;
+  final_eob = -1;
+
+  for (i = next; i < eob; i = next) {
+    x = tokens[i][best].qc;
+    if (x)
+      final_eob = i;
+    rc = vp9_default_zig_zag1d_16x16[i];
+    qcoeff_ptr[rc] = x;
+    dqcoeff_ptr[rc] = (x * dequant_ptr[rc != 0]);
+
+    next = tokens[i][best].next;
+    best = best_index[i][best];
+  }
+  final_eob++;
+
+  d->eob = final_eob;
+  *a = *l = (d->eob != !type);
+}
+
+void vp9_optimize_mby_16x16(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
+  ENTROPY_CONTEXT_PLANES t_above, t_left;
+  ENTROPY_CONTEXT *ta, *tl;
+
+  if (!x->e_mbd.above_context || !x->e_mbd.left_context)
+    return;
+
+  vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+  ta = (ENTROPY_CONTEXT *)&t_above;
+  tl = (ENTROPY_CONTEXT *)&t_left;
+  optimize_b_16x16(x, 0, PLANE_TYPE_Y_WITH_DC, ta, tl, rtcd);
+}
+
+static void optimize_mb_16x16(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
+  vp9_optimize_mby_16x16(x, rtcd);
+  vp9_optimize_mbuv_8x8(x, rtcd);
+}
+
+void vp9_encode_inter16x16(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
+
+  vp9_build_inter_predictors_mb(xd);
+  subtract_mb(rtcd, x);
+
+  if (tx_size == TX_16X16) {
+    vp9_transform_mb_16x16(x);
+    vp9_quantize_mb_16x16(x);
+    if (x->optimize)
+      optimize_mb_16x16(x, rtcd);
+    vp9_inverse_transform_mb_16x16(IF_RTCD(&rtcd->common->idct), xd);
+  } else if (tx_size == TX_8X8) {
+    if (xd->mode_info_context->mbmi.mode == SPLITMV) {
+      assert(xd->mode_info_context->mbmi.partitioning != PARTITIONING_4X4);
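+      // An 8x8 luma partition maps to a 4x4 chroma area, so chroma keeps
+      // the 4x4 transform even though luma uses 8x8.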
+      vp9_transform_mby_8x8(x);
+      vp9_transform_mbuv_4x4(x);
+      vp9_quantize_mby_8x8(x);
+      vp9_quantize_mbuv_4x4(x);
+      if (x->optimize) {
+        vp9_optimize_mby_8x8(x, rtcd);
+        vp9_optimize_mbuv_4x4(x, rtcd);
+      }
+      vp9_inverse_transform_mby_8x8(IF_RTCD(&rtcd->common->idct), xd);
+      vp9_inverse_transform_mbuv_4x4(IF_RTCD(&rtcd->common->idct), xd);
+    } else {
+      vp9_transform_mb_8x8(x);
+      vp9_quantize_mb_8x8(x);
+      if (x->optimize)
+        optimize_mb_8x8(x, rtcd);
+      vp9_inverse_transform_mb_8x8(IF_RTCD(&rtcd->common->idct), xd);
+    }
+  } else {
+    transform_mb_4x4(x);
+    vp9_quantize_mb_4x4(x);
+    if (x->optimize)
+      optimize_mb_4x4(x, rtcd);
+    vp9_inverse_transform_mb_4x4(IF_RTCD(&rtcd->common->idct), xd);
+  }
+
+  vp9_recon_mb(xd);
+}
+
+/* this function is used by first pass only */
+void vp9_encode_inter16x16y(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  BLOCK *b = &x->block[0];
+
+#if CONFIG_PRED_FILTER
+  // Disable the prediction filter for firstpass
+  xd->mode_info_context->mbmi.pred_filter_enabled = 0;
+#endif
+
+  vp9_build_1st_inter16x16_predictors_mby(xd, xd->predictor, 16, 0);
+
+  vp9_subtract_mby(x->src_diff, *(b->base_src), xd->predictor, b->src_stride);
+
+  vp9_transform_mby_4x4(x);
+  vp9_quantize_mby_4x4(x);
+  vp9_inverse_transform_mby_4x4(IF_RTCD(&rtcd->common->idct), xd);
+
+  vp9_recon_mby(xd);
+}
--- /dev/null
+++ b/vp9/encoder/encodemb.h
@@ -1,0 +1,70 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ENCODEMB_H
+#define __INC_ENCODEMB_H
+
+#include "vpx_ports/config.h"
+#include "block.h"
+
+typedef struct {
+  MB_PREDICTION_MODE mode;
+  MV_REFERENCE_FRAME ref_frame;
+  MV_REFERENCE_FRAME second_ref_frame;
+#if CONFIG_PRED_FILTER
+  int pred_filter_flag;
+#endif
+} MODE_DEFINITION;
+
+
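+// With runtime CPU detection enabled, encodemb ops dispatch through the
+// RTCD context; otherwise they bind at compile time to the corresponding
+// vp9_encodemb_* functions.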
+#if CONFIG_RUNTIME_CPU_DETECT
+#define ENCODEMB_INVOKE(ctx,fn) (ctx)->fn
+#else
+#define ENCODEMB_INVOKE(ctx,fn) vp9_encodemb_##fn
+#endif
+
+#include "onyx_int.h"
+struct VP9_ENCODER_RTCD;
+void vp9_encode_inter16x16(const struct VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x);
+
+void vp9_transform_mbuv_4x4(MACROBLOCK *x);
+void vp9_transform_mby_4x4(MACROBLOCK *x);
+
+void vp9_optimize_mby_4x4(MACROBLOCK *x, const struct VP9_ENCODER_RTCD *rtcd);
+void vp9_optimize_mbuv_4x4(MACROBLOCK *x, const struct VP9_ENCODER_RTCD *rtcd);
+void vp9_encode_inter16x16y(const struct VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x);
+
+void vp9_transform_mb_8x8(MACROBLOCK *mb);
+void vp9_transform_mby_8x8(MACROBLOCK *x);
+void vp9_transform_mbuv_8x8(MACROBLOCK *x);
+void vp9_build_dcblock_8x8(MACROBLOCK *b);
+void vp9_optimize_mby_8x8(MACROBLOCK *x, const struct VP9_ENCODER_RTCD *rtcd);
+void vp9_optimize_mbuv_8x8(MACROBLOCK *x, const struct VP9_ENCODER_RTCD *rtcd);
+
+void vp9_transform_mb_16x16(MACROBLOCK *mb);
+void vp9_transform_mby_16x16(MACROBLOCK *x);
+void vp9_optimize_mby_16x16(MACROBLOCK *x, const struct VP9_ENCODER_RTCD *rtcd);
+
+void vp9_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch);
+
+#if CONFIG_SUPERBLOCKS
+void vp9_subtract_mbuv_s_c(short *diff, const unsigned char *usrc,
+                           const unsigned char *vsrc, int src_stride,
+                           const unsigned char *upred,
+                           const unsigned char *vpred, int dst_stride);
+void vp9_subtract_mby_s_c(short *diff, const unsigned char *src,
+                          int src_stride, const unsigned char *pred,
+                          int dst_stride);
+#endif
+
+#endif
--- /dev/null
+++ b/vp9/encoder/encodemv.c
@@ -1,0 +1,547 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp9/common/common.h"
+#include "encodemv.h"
+#include "vp9/common/entropymode.h"
+#include "vp9/common/systemdependent.h"
+
+#include <math.h>
+
+#ifdef ENTROPY_STATS
+extern unsigned int active_section;
+#endif
+
+#ifdef NMV_STATS
+nmv_context_counts tnmvcounts;
+#endif
+
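+/* A motion vector component is coded as a sign bit, a magnitude class and
+ * the integer offset within that class; the fractional and high-precision
+ * bits are written separately by encode_nmv_component_fp() below.
+ */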
+static void encode_nmv_component(vp9_writer* const bc,
+                                 int v,
+                                 int r,
+                                 const nmv_component* const mvcomp) {
+  int s, z, c, o, d;
+  assert(v != 0);             /* should not be zero */
+  s = v < 0;
+  vp9_write(bc, s, mvcomp->sign);
+  z = (s ? -v : v) - 1;       /* magnitude - 1 */
+
+  c = vp9_get_mv_class(z, &o);
+
+  write_token(bc, vp9_mv_class_tree, mvcomp->classes,
+              vp9_mv_class_encodings + c);
+
+  d = (o >> 3);               /* int mv data */
+
+  if (c == MV_CLASS_0) {
+    write_token(bc, vp9_mv_class0_tree, mvcomp->class0,
+                vp9_mv_class0_encodings + d);
+  } else {
+    int i, b;
+    b = c + CLASS0_BITS - 1;  /* number of bits */
+    for (i = 0; i < b; ++i)
+      vp9_write(bc, ((d >> i) & 1), mvcomp->bits[i]);
+  }
+}
+
+static void encode_nmv_component_fp(vp9_writer *bc,
+                                    int v,
+                                    int r,
+                                    const nmv_component* const mvcomp,
+                                    int usehp) {
+  int s, z, c, o, d, f, e;
+  assert(v != 0);             /* should not be zero */
+  s = v < 0;
+  z = (s ? -v : v) - 1;       /* magnitude - 1 */
+
+  c = vp9_get_mv_class(z, &o);
+
+  d = (o >> 3);               /* int mv data */
+  f = (o >> 1) & 3;           /* fractional pel mv data */
+  e = (o & 1);                /* high precision mv data */
+
+  /* Code the fractional pel bits */
+  if (c == MV_CLASS_0) {
+    write_token(bc, vp9_mv_fp_tree, mvcomp->class0_fp[d],
+                vp9_mv_fp_encodings + f);
+  } else {
+    write_token(bc, vp9_mv_fp_tree, mvcomp->fp,
+                vp9_mv_fp_encodings + f);
+  }
+  /* Code the high precision bit */
+  if (usehp) {
+    if (c == MV_CLASS_0) {
+      vp9_write(bc, e, mvcomp->class0_hp);
+    } else {
+      vp9_write(bc, e, mvcomp->hp);
+    }
+  }
+}
+
+static void build_nmv_component_cost_table(int *mvcost,
+                                           const nmv_component* const mvcomp,
+                                           int usehp) {
+  int i, v;
+  int sign_cost[2], class_cost[MV_CLASSES], class0_cost[CLASS0_SIZE];
+  int bits_cost[MV_OFFSET_BITS][2];
+  int class0_fp_cost[CLASS0_SIZE][4], fp_cost[4];
+  int class0_hp_cost[2], hp_cost[2];
+
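+  /* Precompute the cost of each magnitude once; mvcost[v] and mvcost[-v]
+   * then differ only in the sign-bit cost.
+   */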
+  sign_cost[0] = vp9_cost_zero(mvcomp->sign);
+  sign_cost[1] = vp9_cost_one(mvcomp->sign);
+  vp9_cost_tokens(class_cost, mvcomp->classes, vp9_mv_class_tree);
+  vp9_cost_tokens(class0_cost, mvcomp->class0, vp9_mv_class0_tree);
+  for (i = 0; i < MV_OFFSET_BITS; ++i) {
+    bits_cost[i][0] = vp9_cost_zero(mvcomp->bits[i]);
+    bits_cost[i][1] = vp9_cost_one(mvcomp->bits[i]);
+  }
+
+  for (i = 0; i < CLASS0_SIZE; ++i)
+    vp9_cost_tokens(class0_fp_cost[i], mvcomp->class0_fp[i], vp9_mv_fp_tree);
+  vp9_cost_tokens(fp_cost, mvcomp->fp, vp9_mv_fp_tree);
+
+  if (usehp) {
+    class0_hp_cost[0] = vp9_cost_zero(mvcomp->class0_hp);
+    class0_hp_cost[1] = vp9_cost_one(mvcomp->class0_hp);
+    hp_cost[0] = vp9_cost_zero(mvcomp->hp);
+    hp_cost[1] = vp9_cost_one(mvcomp->hp);
+  }
+  mvcost[0] = 0;
+  for (v = 1; v <= MV_MAX; ++v) {
+    int z, c, o, d, e, f, cost = 0;
+    z = v - 1;
+    c = vp9_get_mv_class(z, &o);
+    cost += class_cost[c];
+    d = (o >> 3);               /* int mv data */
+    f = (o >> 1) & 3;           /* fractional pel mv data */
+    e = (o & 1);                /* high precision mv data */
+    if (c == MV_CLASS_0) {
+      cost += class0_cost[d];
+    } else {
+      int i, b;
+      b = c + CLASS0_BITS - 1;  /* number of bits */
+      for (i = 0; i < b; ++i)
+        cost += bits_cost[i][((d >> i) & 1)];
+    }
+    if (c == MV_CLASS_0) {
+      cost += class0_fp_cost[d][f];
+    } else {
+      cost += fp_cost[f];
+    }
+    if (usehp) {
+      if (c == MV_CLASS_0) {
+        cost += class0_hp_cost[e];
+      } else {
+        cost += hp_cost[e];
+      }
+    }
+    mvcost[v] = cost + sign_cost[0];
+    mvcost[-v] = cost + sign_cost[1];
+  }
+}
+
+static int update_nmv_savings(const unsigned int ct[2],
+                              const vp9_prob cur_p,
+                              const vp9_prob new_p,
+                              const vp9_prob upd_p) {
+
+#ifdef LOW_PRECISION_MV_UPDATE
+  vp9_prob mod_p = new_p | 1;
+#else
+  vp9_prob mod_p = new_p;
+#endif
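+  /* Signalling an update costs 7 literal bits for the new probability
+   * (8 without the low-precision form) plus the differential cost of the
+   * update flag itself, all in the 1/256-bit units used by vp9_cost_*.
+   */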
+  const int cur_b = cost_branch256(ct, cur_p);
+  const int mod_b = cost_branch256(ct, mod_p);
+  const int cost = 7 * 256 +
+#ifndef LOW_PRECISION_MV_UPDATE
+      256 +
+#endif
+      (vp9_cost_one(upd_p) - vp9_cost_zero(upd_p));
+  if (cur_b - mod_b - cost > 0) {
+    return cur_b - mod_b - cost;
+  } else {
+    return -vp9_cost_zero(upd_p);
+  }
+}
+
+static int update_nmv(
+  vp9_writer *const bc,
+  const unsigned int ct[2],
+  vp9_prob *const cur_p,
+  const vp9_prob new_p,
+  const vp9_prob upd_p) {
+
+#ifdef LOW_PRECISION_MV_UPDATE
+  vp9_prob mod_p = new_p | 1;
+#else
+  vp9_prob mod_p = new_p;
+#endif
+
+  const int cur_b = cost_branch256(ct, *cur_p);
+  const int mod_b = cost_branch256(ct, mod_p);
+  const int cost = 7 * 256 +
+#ifndef LOW_PRECISION_MV_UPDATE
+      256 +
+#endif
+      (vp9_cost_one(upd_p) - vp9_cost_zero(upd_p));
+
+  if (cur_b - mod_b > cost) {
+    *cur_p = mod_p;
+    vp9_write(bc, 1, upd_p);
+#ifdef LOW_PRECISION_MV_UPDATE
+    vp9_write_literal(bc, mod_p >> 1, 7);
+#else
+    vp9_write_literal(bc, mod_p, 8);
+#endif
+    return 1;
+  } else {
+    vp9_write(bc, 0, upd_p);
+    return 0;
+  }
+}
+
+#ifdef NMV_STATS
+void init_nmvstats() {
+  vp9_zero(tnmvcounts);
+}
+
+void print_nmvstats() {
+  nmv_context prob;
+  unsigned int branch_ct_joint[MV_JOINTS - 1][2];
+  unsigned int branch_ct_sign[2][2];
+  unsigned int branch_ct_classes[2][MV_CLASSES - 1][2];
+  unsigned int branch_ct_class0[2][CLASS0_SIZE - 1][2];
+  unsigned int branch_ct_bits[2][MV_OFFSET_BITS][2];
+  unsigned int branch_ct_class0_fp[2][CLASS0_SIZE][4 - 1][2];
+  unsigned int branch_ct_fp[2][4 - 1][2];
+  unsigned int branch_ct_class0_hp[2][2];
+  unsigned int branch_ct_hp[2][2];
+  int i, j, k;
+  vp9_counts_to_nmv_context(&tnmvcounts, &prob, 1,
+                            branch_ct_joint, branch_ct_sign, branch_ct_classes,
+                            branch_ct_class0, branch_ct_bits,
+                            branch_ct_class0_fp, branch_ct_fp,
+                            branch_ct_class0_hp, branch_ct_hp);
+
+  printf("\nCounts =\n  { ");
+  for (j = 0; j < MV_JOINTS; ++j)
+    printf("%d, ", tnmvcounts.joints[j]);
+  printf("},\n");
+  for (i = 0; i < 2; ++i) {
+    printf("  {\n");
+    printf("    %d/%d,\n", tnmvcounts.comps[i].sign[0],
+                           tnmvcounts.comps[i].sign[1]);
+    printf("    { ");
+    for (j = 0; j < MV_CLASSES; ++j)
+      printf("%d, ", tnmvcounts.comps[i].classes[j]);
+    printf("},\n");
+    printf("    { ");
+    for (j = 0; j < CLASS0_SIZE; ++j)
+      printf("%d, ", tnmvcounts.comps[i].class0[j]);
+    printf("},\n");
+    printf("    { ");
+    for (j = 0; j < MV_OFFSET_BITS; ++j)
+      printf("%d/%d, ", tnmvcounts.comps[i].bits[j][0],
+                        tnmvcounts.comps[i].bits[j][1]);
+    printf("},\n");
+
+    printf("    {");
+    for (j = 0; j < CLASS0_SIZE; ++j) {
+      printf("{");
+      for (k = 0; k < 4; ++k)
+        printf("%d, ", tnmvcounts.comps[i].class0_fp[j][k]);
+      printf("}, ");
+    }
+    printf("},\n");
+
+    printf("    { ");
+    for (j = 0; j < 4; ++j)
+      printf("%d, ", tnmvcounts.comps[i].fp[j]);
+    printf("},\n");
+
+    printf("    %d/%d,\n",
+           tnmvcounts.comps[i].class0_hp[0],
+           tnmvcounts.comps[i].class0_hp[1]);
+    printf("    %d/%d,\n",
+           tnmvcounts.comps[i].hp[0],
+           tnmvcounts.comps[i].hp[1]);
+    printf("  },\n");
+  }
+
+  printf("\nProbs =\n  { ");
+  for (j = 0; j < MV_JOINTS - 1; ++j)
+    printf("%d, ", prob.joints[j]);
+  printf("},\n");
+  for (i = 0; i < 2; ++i) {
+    printf("  {\n");
+    printf("    %d,\n", prob.comps[i].sign);
+    printf("    { ");
+    for (j = 0; j < MV_CLASSES - 1; ++j)
+      printf("%d, ", prob.comps[i].classes[j]);
+    printf("},\n");
+    printf("    { ");
+    for (j = 0; j < CLASS0_SIZE - 1; ++j)
+      printf("%d, ", prob.comps[i].class0[j]);
+    printf("},\n");
+    printf("    { ");
+    for (j = 0; j < MV_OFFSET_BITS; ++j)
+      printf("%d, ", prob.comps[i].bits[j]);
+    printf("},\n");
+    printf("    { ");
+    for (j = 0; j < CLASS0_SIZE; ++j) {
+      printf("{");
+      for (k = 0; k < 3; ++k)
+        printf("%d, ", prob.comps[i].class0_fp[j][k]);
+      printf("}, ");
+    }
+    printf("},\n");
+    printf("    { ");
+    for (j = 0; j < 3; ++j)
+      printf("%d, ", prob.comps[i].fp[j]);
+    printf("},\n");
+
+    printf("    %d,\n", prob.comps[i].class0_hp);
+    printf("    %d,\n", prob.comps[i].hp);
+    printf("  },\n");
+  }
+}
+
+static void add_nmvcount(nmv_context_counts* const dst,
+                         const nmv_context_counts* const src) {
+  int i, j, k;
+  for (j = 0; j < MV_JOINTS; ++j) {
+    dst->joints[j] += src->joints[j];
+  }
+  for (i = 0; i < 2; ++i) {
+    for (j = 0; j < MV_VALS; ++j) {
+      dst->comps[i].mvcount[j] += src->comps[i].mvcount[j];
+    }
+    dst->comps[i].sign[0] += src->comps[i].sign[0];
+    dst->comps[i].sign[1] += src->comps[i].sign[1];
+    for (j = 0; j < MV_CLASSES; ++j) {
+      dst->comps[i].classes[j] += src->comps[i].classes[j];
+    }
+    for (j = 0; j < CLASS0_SIZE; ++j) {
+      dst->comps[i].class0[j] += src->comps[i].class0[j];
+    }
+    for (j = 0; j < MV_OFFSET_BITS; ++j) {
+      dst->comps[i].bits[j][0] += src->comps[i].bits[j][0];
+      dst->comps[i].bits[j][1] += src->comps[i].bits[j][1];
+    }
+  }
+  for (i = 0; i < 2; ++i) {
+    for (j = 0; j < CLASS0_SIZE; ++j) {
+      for (k = 0; k < 4; ++k) {
+        dst->comps[i].class0_fp[j][k] += src->comps[i].class0_fp[j][k];
+      }
+    }
+    for (j = 0; j < 4; ++j) {
+      dst->comps[i].fp[j] += src->comps[i].fp[j];
+    }
+    dst->comps[i].class0_hp[0] += src->comps[i].class0_hp[0];
+    dst->comps[i].class0_hp[1] += src->comps[i].class0_hp[1];
+    dst->comps[i].hp[0] += src->comps[i].hp[0];
+    dst->comps[i].hp[1] += src->comps[i].hp[1];
+  }
+}
+#endif
+
+void vp9_write_nmvprobs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) {
+  int i, j;
+  nmv_context prob;
+  unsigned int branch_ct_joint[MV_JOINTS - 1][2];
+  unsigned int branch_ct_sign[2][2];
+  unsigned int branch_ct_classes[2][MV_CLASSES - 1][2];
+  unsigned int branch_ct_class0[2][CLASS0_SIZE - 1][2];
+  unsigned int branch_ct_bits[2][MV_OFFSET_BITS][2];
+  unsigned int branch_ct_class0_fp[2][CLASS0_SIZE][4 - 1][2];
+  unsigned int branch_ct_fp[2][4 - 1][2];
+  unsigned int branch_ct_class0_hp[2][2];
+  unsigned int branch_ct_hp[2][2];
+  int savings = 0;
+
+#ifdef NMV_STATS
+  if (!cpi->dummy_packing)
+    add_nmvcount(&tnmvcounts, &cpi->NMVcount);
+#endif
+  vp9_counts_to_nmv_context(&cpi->NMVcount, &prob, usehp,
+                            branch_ct_joint, branch_ct_sign, branch_ct_classes,
+                            branch_ct_class0, branch_ct_bits,
+                            branch_ct_class0_fp, branch_ct_fp,
+                            branch_ct_class0_hp, branch_ct_hp);
+  /* write updates if they help */
+#ifdef MV_GROUP_UPDATE
+  for (j = 0; j < MV_JOINTS - 1; ++j) {
+    savings += update_nmv_savings(branch_ct_joint[j],
+                                  cpi->common.fc.nmvc.joints[j],
+                                  prob.joints[j],
+                                  VP9_NMV_UPDATE_PROB);
+  }
+  for (i = 0; i < 2; ++i) {
+    savings += update_nmv_savings(branch_ct_sign[i],
+                                  cpi->common.fc.nmvc.comps[i].sign,
+                                  prob.comps[i].sign,
+                                  VP9_NMV_UPDATE_PROB);
+    for (j = 0; j < MV_CLASSES - 1; ++j) {
+      savings += update_nmv_savings(branch_ct_classes[i][j],
+                                    cpi->common.fc.nmvc.comps[i].classes[j],
+                                    prob.comps[i].classes[j],
+                                    VP9_NMV_UPDATE_PROB);
+    }
+    for (j = 0; j < CLASS0_SIZE - 1; ++j) {
+      savings += update_nmv_savings(branch_ct_class0[i][j],
+                                    cpi->common.fc.nmvc.comps[i].class0[j],
+                                    prob.comps[i].class0[j],
+                                    VP9_NMV_UPDATE_PROB);
+    }
+    for (j = 0; j < MV_OFFSET_BITS; ++j) {
+      savings += update_nmv_savings(branch_ct_bits[i][j],
+                                    cpi->common.fc.nmvc.comps[i].bits[j],
+                                    prob.comps[i].bits[j],
+                                    VP9_NMV_UPDATE_PROB);
+    }
+  }
+  for (i = 0; i < 2; ++i) {
+    for (j = 0; j < CLASS0_SIZE; ++j) {
+      int k;
+      for (k = 0; k < 3; ++k) {
+        savings += update_nmv_savings(branch_ct_class0_fp[i][j][k],
+                                      cpi->common.fc.nmvc.comps[i].class0_fp[j][k],
+                                      prob.comps[i].class0_fp[j][k],
+                                      VP9_NMV_UPDATE_PROB);
+      }
+    }
+    for (j = 0; j < 3; ++j) {
+      savings += update_nmv_savings(branch_ct_fp[i][j],
+                                    cpi->common.fc.nmvc.comps[i].fp[j],
+                                    prob.comps[i].fp[j],
+                                    VP9_NMV_UPDATE_PROB);
+    }
+  }
+  if (usehp) {
+    for (i = 0; i < 2; ++i) {
+      savings += update_nmv_savings(branch_ct_class0_hp[i],
+                                    cpi->common.fc.nmvc.comps[i].class0_hp,
+                                    prob.comps[i].class0_hp,
+                                    VP9_NMV_UPDATE_PROB);
+      savings += update_nmv_savings(branch_ct_hp[i],
+                                    cpi->common.fc.nmvc.comps[i].hp,
+                                    prob.comps[i].hp,
+                                    VP9_NMV_UPDATE_PROB);
+    }
+  }
+  if (savings <= 0) {
+    vp9_write_bit(bc, 0);
+    return;
+  }
+  vp9_write_bit(bc, 1);
+#endif
+
+  for (j = 0; j < MV_JOINTS - 1; ++j) {
+    update_nmv(bc, branch_ct_joint[j],
+               &cpi->common.fc.nmvc.joints[j],
+               prob.joints[j],
+               VP9_NMV_UPDATE_PROB);
+  }
+  for (i = 0; i < 2; ++i) {
+    update_nmv(bc, branch_ct_sign[i],
+               &cpi->common.fc.nmvc.comps[i].sign,
+               prob.comps[i].sign,
+               VP9_NMV_UPDATE_PROB);
+    for (j = 0; j < MV_CLASSES - 1; ++j) {
+      update_nmv(bc, branch_ct_classes[i][j],
+                 &cpi->common.fc.nmvc.comps[i].classes[j],
+                 prob.comps[i].classes[j],
+                 VP9_NMV_UPDATE_PROB);
+    }
+    for (j = 0; j < CLASS0_SIZE - 1; ++j) {
+      update_nmv(bc, branch_ct_class0[i][j],
+                 &cpi->common.fc.nmvc.comps[i].class0[j],
+                 prob.comps[i].class0[j],
+                 VP9_NMV_UPDATE_PROB);
+    }
+    for (j = 0; j < MV_OFFSET_BITS; ++j) {
+      update_nmv(bc, branch_ct_bits[i][j],
+                 &cpi->common.fc.nmvc.comps[i].bits[j],
+                 prob.comps[i].bits[j],
+                 VP9_NMV_UPDATE_PROB);
+    }
+  }
+  for (i = 0; i < 2; ++i) {
+    for (j = 0; j < CLASS0_SIZE; ++j) {
+      int k;
+      for (k = 0; k < 3; ++k) {
+        update_nmv(bc, branch_ct_class0_fp[i][j][k],
+                   &cpi->common.fc.nmvc.comps[i].class0_fp[j][k],
+                   prob.comps[i].class0_fp[j][k],
+                   VP9_NMV_UPDATE_PROB);
+      }
+    }
+    for (j = 0; j < 3; ++j) {
+      update_nmv(bc, branch_ct_fp[i][j],
+                 &cpi->common.fc.nmvc.comps[i].fp[j],
+                 prob.comps[i].fp[j],
+                 VP9_NMV_UPDATE_PROB);
+    }
+  }
+  if (usehp) {
+    for (i = 0; i < 2; ++i) {
+      update_nmv(bc, branch_ct_class0_hp[i],
+                 &cpi->common.fc.nmvc.comps[i].class0_hp,
+                 prob.comps[i].class0_hp,
+                 VP9_NMV_UPDATE_PROB);
+      update_nmv(bc, branch_ct_hp[i],
+                 &cpi->common.fc.nmvc.comps[i].hp,
+                 prob.comps[i].hp,
+                 VP9_NMV_UPDATE_PROB);
+    }
+  }
+}
+
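+// Encode one motion vector relative to its reference. The joint symbol
+// signals which of the row / column components are non-zero; only the
+// non-zero components are then coded (row via comps[0], col via comps[1]).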
+void vp9_encode_nmv(vp9_writer* const bc, const MV* const mv,
+                    const MV* const ref, const nmv_context* const mvctx) {
+  MV_JOINT_TYPE j = vp9_get_mv_joint(*mv);
+  write_token(bc, vp9_mv_joint_tree, mvctx->joints,
+              vp9_mv_joint_encodings + j);
+  if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
+    encode_nmv_component(bc, mv->row, ref->row, &mvctx->comps[0]);
+  }
+  if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {
+    encode_nmv_component(bc, mv->col, ref->col, &mvctx->comps[1]);
+  }
+}
+
+void vp9_encode_nmv_fp(vp9_writer* const bc, const MV* const mv,
+                       const MV* const ref, const nmv_context* const mvctx,
+                       int usehp) {
+  MV_JOINT_TYPE j = vp9_get_mv_joint(*mv);
+  usehp = usehp && vp9_use_nmv_hp(ref);
+  if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
+    encode_nmv_component_fp(bc, mv->row, ref->row, &mvctx->comps[0], usehp);
+  }
+  if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {
+    encode_nmv_component_fp(bc, mv->col, ref->col, &mvctx->comps[1], usehp);
+  }
+}
+
+void vp9_build_nmv_cost_table(int *mvjoint,
+                              int *mvcost[2],
+                              const nmv_context* const mvctx,
+                              int usehp,
+                              int mvc_flag_v,
+                              int mvc_flag_h) {
+  vp9_clear_system_state();
+  vp9_cost_tokens(mvjoint, mvctx->joints, vp9_mv_joint_tree);
+  if (mvc_flag_v)
+    build_nmv_component_cost_table(mvcost[0], &mvctx->comps[0], usehp);
+  if (mvc_flag_h)
+    build_nmv_component_cost_table(mvcost[1], &mvctx->comps[1], usehp);
+}
--- /dev/null
+++ b/vp9/encoder/encodemv.h
@@ -1,0 +1,30 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ENCODEMV_H
+#define __INC_ENCODEMV_H
+
+#include "onyx_int.h"
+
+void vp9_write_nmvprobs(VP9_COMP* const, int usehp, vp9_writer* const);
+void vp9_encode_nmv(vp9_writer* const w, const MV* const mv,
+                    const MV* const ref, const nmv_context* const mvctx);
+void vp9_encode_nmv_fp(vp9_writer* const w, const MV* const mv,
+                       const MV* const ref, const nmv_context *mvctx,
+                       int usehp);
+void vp9_build_nmv_cost_table(int *mvjoint,
+                              int *mvcost[2],
+                              const nmv_context *mvctx,
+                              int usehp,
+                              int mvc_flag_v,
+                              int mvc_flag_h);
+
+#endif
--- /dev/null
+++ b/vp9/encoder/firstpass.c
@@ -1,0 +1,2533 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "math.h"
+#include "limits.h"
+#include "block.h"
+#include "onyx_int.h"
+#include "variance.h"
+#include "encodeintra.h"
+#include "vp9/common/setupintrarecon.h"
+#include "mcomp.h"
+#include "firstpass.h"
+#include "vpx_scale/vpxscale.h"
+#include "encodemb.h"
+#include "vp9/common/extend.h"
+#include "vp9/common/systemdependent.h"
+#include "vpx_scale/yv12extend.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp9/common/swapyv12buffer.h"
+#include <stdio.h>
+#include "rdopt.h"
+#include "ratectrl.h"
+#include "vp9/common/quant_common.h"
+#include "vp9/common/entropymv.h"
+#include "encodemv.h"
+
+#define OUTPUT_FPF 0
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define IF_RTCD(x) (x)
+#else
+#define IF_RTCD(x) NULL
+#endif
+
+extern void vp9_build_block_offsets(MACROBLOCK *x);
+
+extern void vp9_setup_block_ptrs(MACROBLOCK *x);
+
+extern void vp9_frame_init_quantizer(VP9_COMP *cpi);
+
+extern void vp9_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb,
+                                   int_mv *mv);
+
+extern void vp9_alloc_compressor_data(VP9_COMP *cpi);
+
+#define IIFACTOR   12.5
+#define IIKFACTOR1 12.5
+#define IIKFACTOR2 15.0
+#define RMAX       128.0
+#define GF_RMAX    96.0
+#define ERR_DIVISOR   150.0
+
+#define KF_MB_INTRA_MIN 300
+#define GF_MB_INTRA_MIN 200
+
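+// Nudge a value slightly away from zero so it is safe to use as a divisor.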
+#define DOUBLE_DIVIDE_CHECK(X) ((X)<0?(X)-.000001:(X)+.000001)
+
+#define POW1 (double)cpi->oxcf.two_pass_vbrbias/100.0
+#define POW2 (double)cpi->oxcf.two_pass_vbrbias/100.0
+
+static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame);
+
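+// Map a quantizer index to a constrained-quality level: take roughly
+// 0.58x the real quantizer value implied by qindex (plus 1.0) as the
+// target, and return the lowest index whose quantizer reaches it.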
+static int select_cq_level(int qindex) {
+  int ret_val = QINDEX_RANGE - 1;
+  int i;
+
+  double target_q = (vp9_convert_qindex_to_q(qindex) * 0.5847) + 1.0;
+
+  for (i = 0; i < QINDEX_RANGE; i++) {
+    if (target_q <= vp9_convert_qindex_to_q(i)) {
+      ret_val = i;
+      break;
+    }
+  }
+
+  return ret_val;
+}
+
+// Reset the first pass stats "file" position to the given point
+static void reset_fpf_position(VP9_COMP *cpi, FIRSTPASS_STATS *Position) {
+  cpi->twopass.stats_in = Position;
+}
+
+static int lookup_next_frame_stats(VP9_COMP *cpi, FIRSTPASS_STATS *next_frame) {
+  if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end)
+    return EOF;
+
+  *next_frame = *cpi->twopass.stats_in;
+  return 1;
+}
+
+// Read frame stats at an offset from the current position
+static int read_frame_stats(VP9_COMP *cpi,
+                            FIRSTPASS_STATS *frame_stats,
+                            int offset) {
+  FIRSTPASS_STATS *fps_ptr = cpi->twopass.stats_in;
+
+  // Check legality of offset
+  if (offset >= 0) {
+    if (&fps_ptr[offset] >= cpi->twopass.stats_in_end)
+      return EOF;
+  } else if (offset < 0) {
+    if (&fps_ptr[offset] < cpi->twopass.stats_in_start)
+      return EOF;
+  }
+
+  *frame_stats = fps_ptr[offset];
+  return 1;
+}
+
+static int input_stats(VP9_COMP *cpi, FIRSTPASS_STATS *fps) {
+  if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end)
+    return EOF;
+
+  *fps = *cpi->twopass.stats_in;
+  cpi->twopass.stats_in =
+    (void *)((char *)cpi->twopass.stats_in + sizeof(FIRSTPASS_STATS));
+  return 1;
+}
+
+static void output_stats(const VP9_COMP            *cpi,
+                         struct vpx_codec_pkt_list *pktlist,
+                         FIRSTPASS_STATS            *stats) {
+  struct vpx_codec_cx_pkt pkt;
+  pkt.kind = VPX_CODEC_STATS_PKT;
+  pkt.data.twopass_stats.buf = stats;
+  pkt.data.twopass_stats.sz = sizeof(FIRSTPASS_STATS);
+  vpx_codec_pkt_list_add(pktlist, &pkt);
+
+// TEMP debug code
+#if OUTPUT_FPF
+
+  {
+    FILE *fpfile;
+    fpfile = fopen("firstpass.stt", "a");
+
+    fprintf(fpfile, "%12.0f %12.0f %12.0f %12.0f %12.0f %12.4f %12.4f"
+            "%12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f"
+            "%12.0f %12.0f %12.4f %12.0f %12.0f %12.4f\n",
+            stats->frame,
+            stats->intra_error,
+            stats->coded_error,
+            stats->sr_coded_error,
+            stats->ssim_weighted_pred_err,
+            stats->pcnt_inter,
+            stats->pcnt_motion,
+            stats->pcnt_second_ref,
+            stats->pcnt_neutral,
+            stats->MVr,
+            stats->mvr_abs,
+            stats->MVc,
+            stats->mvc_abs,
+            stats->MVrv,
+            stats->MVcv,
+            stats->mv_in_out_count,
+            stats->new_mv_count,
+            stats->count,
+            stats->duration);
+    fclose(fpfile);
+  }
+#endif
+}
+
+static void zero_stats(FIRSTPASS_STATS *section) {
+  section->frame      = 0.0;
+  section->intra_error = 0.0;
+  section->coded_error = 0.0;
+  section->sr_coded_error = 0.0;
+  section->ssim_weighted_pred_err = 0.0;
+  section->pcnt_inter  = 0.0;
+  section->pcnt_motion  = 0.0;
+  section->pcnt_second_ref = 0.0;
+  section->pcnt_neutral = 0.0;
+  section->MVr        = 0.0;
+  section->mvr_abs     = 0.0;
+  section->MVc        = 0.0;
+  section->mvc_abs     = 0.0;
+  section->MVrv       = 0.0;
+  section->MVcv       = 0.0;
+  section->mv_in_out_count  = 0.0;
+  section->new_mv_count = 0.0;
+  section->count      = 0.0;
+  section->duration   = 1.0;
+}
+
+static void accumulate_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame) {
+  section->frame += frame->frame;
+  section->intra_error += frame->intra_error;
+  section->coded_error += frame->coded_error;
+  section->sr_coded_error += frame->sr_coded_error;
+  section->ssim_weighted_pred_err += frame->ssim_weighted_pred_err;
+  section->pcnt_inter  += frame->pcnt_inter;
+  section->pcnt_motion += frame->pcnt_motion;
+  section->pcnt_second_ref += frame->pcnt_second_ref;
+  section->pcnt_neutral += frame->pcnt_neutral;
+  section->MVr        += frame->MVr;
+  section->mvr_abs     += frame->mvr_abs;
+  section->MVc        += frame->MVc;
+  section->mvc_abs     += frame->mvc_abs;
+  section->MVrv       += frame->MVrv;
+  section->MVcv       += frame->MVcv;
+  section->mv_in_out_count  += frame->mv_in_out_count;
+  section->new_mv_count += frame->new_mv_count;
+  section->count      += frame->count;
+  section->duration   += frame->duration;
+}
+
+static void subtract_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame) {
+  section->frame -= frame->frame;
+  section->intra_error -= frame->intra_error;
+  section->coded_error -= frame->coded_error;
+  section->sr_coded_error -= frame->sr_coded_error;
+  section->ssim_weighted_pred_err -= frame->ssim_weighted_pred_err;
+  section->pcnt_inter  -= frame->pcnt_inter;
+  section->pcnt_motion -= frame->pcnt_motion;
+  section->pcnt_second_ref -= frame->pcnt_second_ref;
+  section->pcnt_neutral -= frame->pcnt_neutral;
+  section->MVr        -= frame->MVr;
+  section->mvr_abs     -= frame->mvr_abs;
+  section->MVc        -= frame->MVc;
+  section->mvc_abs     -= frame->mvc_abs;
+  section->MVrv       -= frame->MVrv;
+  section->MVcv       -= frame->MVcv;
+  section->mv_in_out_count  -= frame->mv_in_out_count;
+  section->new_mv_count -= frame->new_mv_count;
+  section->count      -= frame->count;
+  section->duration   -= frame->duration;
+}
+
+static void avg_stats(FIRSTPASS_STATS *section) {
+  if (section->count < 1.0)
+    return;
+
+  section->intra_error /= section->count;
+  section->coded_error /= section->count;
+  section->sr_coded_error /= section->count;
+  section->ssim_weighted_pred_err /= section->count;
+  section->pcnt_inter  /= section->count;
+  section->pcnt_second_ref /= section->count;
+  section->pcnt_neutral /= section->count;
+  section->pcnt_motion /= section->count;
+  section->MVr        /= section->count;
+  section->mvr_abs     /= section->count;
+  section->MVc        /= section->count;
+  section->mvc_abs     /= section->count;
+  section->MVrv       /= section->count;
+  section->MVcv       /= section->count;
+  section->mv_in_out_count   /= section->count;
+  section->duration   /= section->count;
+}
+
+// Calculate a modified error, used when distributing bits between easier
+// and harder frames.
+static double calculate_modified_err(VP9_COMP *cpi,
+                                     FIRSTPASS_STATS *this_frame) {
+  double av_err = (cpi->twopass.total_stats->ssim_weighted_pred_err /
+                   cpi->twopass.total_stats->count);
+  double this_err = this_frame->ssim_weighted_pred_err;
+  double modified_err;
+
+  if (this_err > av_err)
+    modified_err = av_err * pow((this_err / DOUBLE_DIVIDE_CHECK(av_err)), POW1);
+  else
+    modified_err = av_err * pow((this_err / DOUBLE_DIVIDE_CHECK(av_err)), POW2);
+
+  return modified_err;
+}
+
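+// Per-pixel weights indexed by luma value: very dark pixels (below 32)
+// contribute only 0.02, the weight then ramps linearly up to 1.0 at 64,
+// and brighter pixels count fully. simple_weight() below averages this
+// over the Y plane, so dark or near-black frames end up with a reduced
+// ssim_weighted_pred_err.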
+static const double weight_table[256] = {
+  0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+  0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+  0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+  0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+  0.020000, 0.031250, 0.062500, 0.093750, 0.125000, 0.156250, 0.187500, 0.218750,
+  0.250000, 0.281250, 0.312500, 0.343750, 0.375000, 0.406250, 0.437500, 0.468750,
+  0.500000, 0.531250, 0.562500, 0.593750, 0.625000, 0.656250, 0.687500, 0.718750,
+  0.750000, 0.781250, 0.812500, 0.843750, 0.875000, 0.906250, 0.937500, 0.968750,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000
+};
+
+static double simple_weight(YV12_BUFFER_CONFIG *source) {
+  int i, j;
+
+  unsigned char *src = source->y_buffer;
+  double sum_weights = 0.0;
+
+  // Loop through the raw Y plane, examining pixel levels to create
+  // a weight for the image.
+  i = source->y_height;
+  do {
+    j = source->y_width;
+    do {
+      sum_weights += weight_table[*src];
+      src++;
+    } while (--j);
+    src -= source->y_width;
+    src += source->y_stride;
+  } while (--i);
+
+  sum_weights /= (source->y_height * source->y_width);
+
+  return sum_weights;
+}
+
+// This function returns the current per frame maximum bitrate target
+static int frame_max_bits(VP9_COMP *cpi) {
+  // Max allocation for a single frame based on the max section guidelines
+  // passed in and how many bits are left.
+  int max_bits;
+
+  // For VBR base this on the bits and frames left plus the
+  // two_pass_vbrmax_section rate passed in by the user.
+  max_bits = (int)(((double)cpi->twopass.bits_left /
+                    (cpi->twopass.total_stats->count -
+                     (double)cpi->common.current_video_frame)) *
+                   ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0));
+
+  // Trap case where we are out of bits
+  if (max_bits < 0)
+    max_bits = 0;
+
+  return max_bits;
+}
+
+void vp9_init_first_pass(VP9_COMP *cpi) {
+  zero_stats(cpi->twopass.total_stats);
+}
+
+void vp9_end_first_pass(VP9_COMP *cpi) {
+  output_stats(cpi, cpi->output_pkt_list, cpi->twopass.total_stats);
+}
+
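+// Error for zero motion: the 16x16 MSE between the source macroblock and
+// the co-located block in the reconstruction buffer, with no mv overhead.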
+static void zz_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+                             YV12_BUFFER_CONFIG *recon_buffer,
+                             int *best_motion_err, int recon_yoffset) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  BLOCK *b = &x->block[0];
+  BLOCKD *d = &x->e_mbd.block[0];
+
+  unsigned char *src_ptr = (*(b->base_src) + b->src);
+  int src_stride = b->src_stride;
+  unsigned char *ref_ptr;
+  int ref_stride = d->pre_stride;
+
+  // Set up pointers for this macro block recon buffer
+  xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
+
+  ref_ptr = (unsigned char *)(*(d->base_pre) + d->pre);
+
+  vp9_mse16x16(src_ptr, src_stride, ref_ptr, ref_stride,
+               (unsigned int *)(best_motion_err));
+}
+
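+// First pass motion search: an initial diamond search centred on the
+// full-pel reference mv, followed by further diamond searches at
+// incremented step_param (num00 lets repeated centre results be skipped).
+// Each candidate error is charged a fixed new-mv penalty so a plain 0,0
+// match wins when the errors are close.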
+static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+                                     int_mv *ref_mv, MV *best_mv,
+                                     YV12_BUFFER_CONFIG *recon_buffer,
+                                     int *best_motion_err, int recon_yoffset) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  BLOCK *b = &x->block[0];
+  BLOCKD *d = &x->e_mbd.block[0];
+  int num00;
+
+  int_mv tmp_mv;
+  int_mv ref_mv_full;
+
+  int tmp_err;
+  int step_param = 3;
+  int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
+  int n;
+  vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
+  int new_mv_mode_penalty = 256;
+
+  // override the default variance function to use MSE
+  v_fn_ptr.vf = vp9_mse16x16;
+
+  // Set up pointers for this macro block recon buffer
+  xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
+
+  // Initial step/diamond search centred on best mv
+  tmp_mv.as_int = 0;
+  ref_mv_full.as_mv.col = ref_mv->as_mv.col >> 3;
+  ref_mv_full.as_mv.row = ref_mv->as_mv.row >> 3;
+  tmp_err = cpi->diamond_search_sad(x, b, d, &ref_mv_full, &tmp_mv, step_param,
+                                    x->sadperbit16, &num00, &v_fn_ptr,
+                                    XMVCOST, ref_mv);
+  if (tmp_err < INT_MAX - new_mv_mode_penalty)
+    tmp_err += new_mv_mode_penalty;
+
+  if (tmp_err < *best_motion_err) {
+    *best_motion_err = tmp_err;
+    best_mv->row = tmp_mv.as_mv.row;
+    best_mv->col = tmp_mv.as_mv.col;
+  }
+
+  // Further step/diamond searches as necessary
+  n = num00;
+  num00 = 0;
+
+  while (n < further_steps) {
+    n++;
+
+    if (num00)
+      num00--;
+    else {
+      tmp_err = cpi->diamond_search_sad(x, b, d, &ref_mv_full, &tmp_mv,
+                                        step_param + n, x->sadperbit16,
+                                        &num00, &v_fn_ptr,
+                                        XMVCOST, ref_mv);
+      if (tmp_err < INT_MAX - new_mv_mode_penalty)
+        tmp_err += new_mv_mode_penalty;
+
+      if (tmp_err < *best_motion_err) {
+        *best_motion_err = tmp_err;
+        best_mv->row = tmp_mv.as_mv.row;
+        best_mv->col = tmp_mv.as_mv.col;
+      }
+    }
+  }
+}
+
+void vp9_first_pass(VP9_COMP *cpi) {
+  int mb_row, mb_col;
+  MACROBLOCK *const x = &cpi->mb;
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  int recon_yoffset, recon_uvoffset;
+  YV12_BUFFER_CONFIG *lst_yv12 = &cm->yv12_fb[cm->lst_fb_idx];
+  YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
+  YV12_BUFFER_CONFIG *gld_yv12 = &cm->yv12_fb[cm->gld_fb_idx];
+  int recon_y_stride = lst_yv12->y_stride;
+  int recon_uv_stride = lst_yv12->uv_stride;
+  int64_t intra_error = 0;
+  int64_t coded_error = 0;
+  int64_t sr_coded_error = 0;
+
+  int sum_mvr = 0, sum_mvc = 0;
+  int sum_mvr_abs = 0, sum_mvc_abs = 0;
+  int sum_mvrs = 0, sum_mvcs = 0;
+  int mvcount = 0;
+  int intercount = 0;
+  int second_ref_count = 0;
+  int intrapenalty = 256;
+  int neutral_count = 0;
+  int new_mv_count = 0;
+  int sum_in_vectors = 0;
+  uint32_t lastmv_as_int = 0;
+
+  int_mv zero_ref_mv;
+
+  zero_ref_mv.as_int = 0;
+
+  vp9_clear_system_state();  // __asm emms;
+
+  x->src = *cpi->Source;
+  xd->pre = *lst_yv12;
+  xd->dst = *new_yv12;
+
+  x->partition_info = x->pi;
+
+  xd->mode_info_context = cm->mi;
+
+  vp9_build_block_offsets(x);
+
+  vp9_setup_block_dptrs(&x->e_mbd);
+
+  vp9_setup_block_ptrs(x);
+
+  // set up the new frame for intra coded blocks
+  vp9_setup_intra_recon(new_yv12);
+  vp9_frame_init_quantizer(cpi);
+
+  // Initialise the MV cost table to the defaults
+  // if( cm->current_video_frame == 0)
+  // if ( 0 )
+  {
+    int flag[2] = {1, 1};
+    vp9_init_mv_probs(cm);
+    vp9_initialize_rd_consts(cpi, cm->base_qindex + cm->y1dc_delta_q);
+  }
+
+  // for each macroblock row in image
+  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
+    int_mv best_ref_mv;
+
+    best_ref_mv.as_int = 0;
+
+    // reset above block coeffs
+    xd->up_available = (mb_row != 0);
+    recon_yoffset = (mb_row * recon_y_stride * 16);
+    recon_uvoffset = (mb_row * recon_uv_stride * 8);
+
+    // Set up limit values for motion vectors to prevent them extending
+    // outside the UMV borders.
+    x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
+    x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16);
+
+    // for each macroblock col in image
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+      int this_error;
+      int gf_motion_error = INT_MAX;
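+      // DC prediction is used on the top row or left column of the frame,
+      // but not at the top-left corner and not in the frame interior.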
+      int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
+
+      xd->dst.y_buffer = new_yv12->y_buffer + recon_yoffset;
+      xd->dst.u_buffer = new_yv12->u_buffer + recon_uvoffset;
+      xd->dst.v_buffer = new_yv12->v_buffer + recon_uvoffset;
+      xd->left_available = (mb_col != 0);
+
+      // Copy current mb to a buffer
+      vp9_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
+
+      // do intra 16x16 prediction
+      this_error = vp9_encode_intra(cpi, x, use_dc_pred);
+
+      // "intrapenalty" below deals with situations where the intra and inter error scores are very low (eg a plain black frame)
+      // We do not have special cases in first pass for 0,0 and nearest etc so all inter modes carry an overhead cost estimate fot the mv.
+      // When the error score is very low this causes us to pick all or lots of INTRA modes and throw lots of key frames.
+      // This penalty adds a cost matching that of a 0,0 mv to the intra case.
+      this_error += intrapenalty;
+
+      // Cumulative intra error total
+      intra_error += (int64_t)this_error;
+
+      // Set up limit values for motion vectors to prevent them extending
+      // outside the UMV borders.
+      x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
+      x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) +
+                      (VP8BORDERINPIXELS - 16);
+
+      // Other than for the first frame do a motion search
+      if (cm->current_video_frame > 0) {
+        int tmp_err;
+        int motion_error = INT_MAX;
+        int_mv mv, tmp_mv;
+
+        // Simple 0,0 motion with no mv overhead
+        zz_motion_search(cpi, x, lst_yv12, &motion_error, recon_yoffset);
+        mv.as_int = tmp_mv.as_int = 0;
+
+        // Test last reference frame using the previous best mv as the
+        // starting point (best reference) for the search
+        first_pass_motion_search(cpi, x, &best_ref_mv,
+                                 &mv.as_mv, lst_yv12,
+                                 &motion_error, recon_yoffset);
+
+        // If the current best reference mv is not centred on 0,0 then do
+        // a 0,0 based search as well.
+        if (best_ref_mv.as_int) {
+          tmp_err = INT_MAX;
+          first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv.as_mv,
+                                   lst_yv12, &tmp_err, recon_yoffset);
+
+          if (tmp_err < motion_error) {
+            motion_error = tmp_err;
+            mv.as_int = tmp_mv.as_int;
+          }
+        }
+
+        // Experimental search in an older reference frame
+        if (cm->current_video_frame > 1) {
+          // Simple 0,0 motion with no mv overhead
+          zz_motion_search(cpi, x, gld_yv12,
+                           &gf_motion_error, recon_yoffset);
+
+          first_pass_motion_search(cpi, x, &zero_ref_mv,
+                                   &tmp_mv.as_mv, gld_yv12,
+                                   &gf_motion_error, recon_yoffset);
+
+          if ((gf_motion_error < motion_error) &&
+              (gf_motion_error < this_error)) {
+            second_ref_count++;
+          }
+
+          // Reset to last frame as reference buffer
+          xd->pre.y_buffer = lst_yv12->y_buffer + recon_yoffset;
+          xd->pre.u_buffer = lst_yv12->u_buffer + recon_uvoffset;
+          xd->pre.v_buffer = lst_yv12->v_buffer + recon_uvoffset;
+
+          // In accumulating a score for the older reference frame take
+          // the best of the motion predicted score and the intra coded
+          // error (just as will be done for accumulation of "coded_error"
+          // for the last frame).
+          if (gf_motion_error < this_error)
+            sr_coded_error += gf_motion_error;
+          else
+            sr_coded_error += this_error;
+        } else
+          sr_coded_error += motion_error;
+
+        /* Intra assumed best */
+        best_ref_mv.as_int = 0;
+
+        if (motion_error <= this_error) {
+          // Keep a count of cases where the inter and intra were
+          // very close and very low. This helps with scene cut
+          // detection for example in cropped clips with black bars
+          // at the sides or top and bottom.
+          if ((((this_error - intrapenalty) * 9) <=
+               (motion_error * 10)) &&
+              (this_error < (2 * intrapenalty))) {
+            neutral_count++;
+          }
+
+          mv.as_mv.row <<= 3;
+          mv.as_mv.col <<= 3;
+          this_error = motion_error;
+          vp9_set_mbmode_and_mvs(x, NEWMV, &mv);
+          xd->mode_info_context->mbmi.txfm_size = TX_4X4;
+          vp9_encode_inter16x16y(IF_RTCD(&cpi->rtcd), x);
+          sum_mvr += mv.as_mv.row;
+          sum_mvr_abs += abs(mv.as_mv.row);
+          sum_mvc += mv.as_mv.col;
+          sum_mvc_abs += abs(mv.as_mv.col);
+          sum_mvrs += mv.as_mv.row * mv.as_mv.row;
+          sum_mvcs += mv.as_mv.col * mv.as_mv.col;
+          intercount++;
+
+          best_ref_mv.as_int = mv.as_int;
+
+          // Was the vector non-zero
+          if (mv.as_int) {
+            mvcount++;
+
+            // Was it different from the last non-zero vector?
+            if (mv.as_int != lastmv_as_int)
+              new_mv_count++;
+            lastmv_as_int = mv.as_int;
+
+            // Does the Row vector point inwards or outwards
+            if (mb_row < cm->mb_rows / 2) {
+              if (mv.as_mv.row > 0)
+                sum_in_vectors--;
+              else if (mv.as_mv.row < 0)
+                sum_in_vectors++;
+            } else if (mb_row > cm->mb_rows / 2) {
+              if (mv.as_mv.row > 0)
+                sum_in_vectors++;
+              else if (mv.as_mv.row < 0)
+                sum_in_vectors--;
+            }
+
+            // Does the Column vector point inwards or outwards
+            if (mb_col < cm->mb_cols / 2) {
+              if (mv.as_mv.col > 0)
+                sum_in_vectors--;
+              else if (mv.as_mv.col < 0)
+                sum_in_vectors++;
+            } else if (mb_col > cm->mb_cols / 2) {
+              if (mv.as_mv.col > 0)
+                sum_in_vectors++;
+              else if (mv.as_mv.col < 0)
+                sum_in_vectors--;
+            }
+          }
+        }
+      } else
+        sr_coded_error += (int64_t)this_error;
+
+      coded_error += (int64_t)this_error;
+
+      // adjust to the next column of macroblocks
+      x->src.y_buffer += 16;
+      x->src.u_buffer += 8;
+      x->src.v_buffer += 8;
+
+      recon_yoffset += 16;
+      recon_uvoffset += 8;
+    }
+
+    // adjust to the next row of mbs
+    x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
+    x->src.u_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
+    x->src.v_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
+
+    // extend the recon for intra prediction
+    vp9_extend_mb_row(new_yv12, xd->dst.y_buffer + 16,
+                      xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
+    vp9_clear_system_state();  // __asm emms;
+  }
+
+  vp9_clear_system_state();  // __asm emms;
+  {
+    double weight = 0.0;
+
+    FIRSTPASS_STATS fps;
+
+    fps.frame      = cm->current_video_frame;
+    fps.intra_error = intra_error >> 8;
+    fps.coded_error = coded_error >> 8;
+    fps.sr_coded_error = sr_coded_error >> 8;
+    weight = simple_weight(cpi->Source);
+
+    if (weight < 0.1)
+      weight = 0.1;
+
+    fps.ssim_weighted_pred_err = fps.coded_error * weight;
+
+    fps.pcnt_inter  = 0.0;
+    fps.pcnt_motion = 0.0;
+    fps.MVr        = 0.0;
+    fps.mvr_abs     = 0.0;
+    fps.MVc        = 0.0;
+    fps.mvc_abs     = 0.0;
+    fps.MVrv       = 0.0;
+    fps.MVcv       = 0.0;
+    fps.mv_in_out_count  = 0.0;
+    fps.new_mv_count = 0.0;
+    fps.count      = 1.0;
+
+    fps.pcnt_inter   = 1.0 * (double)intercount / cm->MBs;
+    fps.pcnt_second_ref = 1.0 * (double)second_ref_count / cm->MBs;
+    fps.pcnt_neutral = 1.0 * (double)neutral_count / cm->MBs;
+
+    if (mvcount > 0) {
+      fps.MVr = (double)sum_mvr / (double)mvcount;
+      fps.mvr_abs = (double)sum_mvr_abs / (double)mvcount;
+      fps.MVc = (double)sum_mvc / (double)mvcount;
+      fps.mvc_abs = (double)sum_mvc_abs / (double)mvcount;
+      fps.MVrv = ((double)sum_mvrs -
+                  (fps.MVr * fps.MVr / (double)mvcount)) / (double)mvcount;
+      fps.MVcv = ((double)sum_mvcs -
+                  (fps.MVc * fps.MVc / (double)mvcount)) / (double)mvcount;
+      fps.mv_in_out_count = (double)sum_in_vectors / (double)(mvcount * 2);
+      fps.new_mv_count = new_mv_count;
+
+      fps.pcnt_motion = 1.0 * (double)mvcount / cpi->common.MBs;
+    }
+
+    // TODO: handle the case when duration is set to 0, or something less
+    // than the full time between subsequent cpi->source_time_stamp values.
+    fps.duration = cpi->source->ts_end - cpi->source->ts_start;
+
+    // don't want to do output stats with a stack variable!
+    memcpy(cpi->twopass.this_frame_stats,
+           &fps,
+           sizeof(FIRSTPASS_STATS));
+    output_stats(cpi, cpi->output_pkt_list, cpi->twopass.this_frame_stats);
+    accumulate_stats(cpi->twopass.total_stats, &fps);
+  }
+
+  // Copy the previous Last Frame back into gf and arf buffers if the
+  // prediction is good enough... but also don't allow it to lag too far.
+  if ((cpi->twopass.sr_update_lag > 3) ||
+      ((cm->current_video_frame > 0) &&
+       (cpi->twopass.this_frame_stats->pcnt_inter > 0.20) &&
+       ((cpi->twopass.this_frame_stats->intra_error /
+         cpi->twopass.this_frame_stats->coded_error) > 2.0))) {
+    vp8_yv12_copy_frame_ptr(lst_yv12, gld_yv12);
+    cpi->twopass.sr_update_lag = 1;
+  } else
+    cpi->twopass.sr_update_lag++;
+
+  // swap frame pointers so last frame refers to the frame we just compressed
+  vp9_swap_yv12_buffer(lst_yv12, new_yv12);
+  vp8_yv12_extend_frame_borders(lst_yv12);
+
+  // Special case for the first frame. Copy into the GF buffer as a
+  // second reference.
+  if (cm->current_video_frame == 0) {
+    vp8_yv12_copy_frame_ptr(lst_yv12, gld_yv12);
+  }
+
+  // use this to see what the first pass reconstruction looks like
+  if (0) {
+    char filename[512];
+    FILE *recon_file;
+    sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame);
+
+    if (cm->current_video_frame == 0)
+      recon_file = fopen(filename, "wb");
+    else
+      recon_file = fopen(filename, "ab");
+
+    (void)fwrite(lst_yv12->buffer_alloc, lst_yv12->frame_size, 1,
+                 recon_file);
+    fclose(recon_file);
+  }
+
+  cm->current_video_frame++;
+}
+
+// Estimate a cost per mb attributable to overheads such as the coding of
+// modes and motion vectors.
+// Currently simplistic in its assumptions for testing.
+
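+// Ideal cost, in bits, of coding an event of probability 'prob':
+// -log2(prob).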
+static double bitcost(double prob) {
+  return -(log(prob) / log(2.0));
+}
+
+static long long estimate_modemvcost(VP9_COMP *cpi,
+                                     FIRSTPASS_STATS *fpstats) {
+  int mv_cost;
+  int mode_cost;
+
+  double av_pct_inter = fpstats->pcnt_inter / fpstats->count;
+  double av_pct_motion = fpstats->pcnt_motion / fpstats->count;
+  double av_intra = (1.0 - av_pct_inter);
+
+  double zz_cost;
+  double motion_cost;
+  double intra_cost;
+
+  zz_cost = bitcost(av_pct_inter - av_pct_motion);
+  motion_cost = bitcost(av_pct_motion);
+  intra_cost = bitcost(av_intra);
+
+  // Estimate of extra bits per mv overhead for mbs
+  // << 9 is the normalization to the (bits * 512) used in vp9_bits_per_mb
+  mv_cost = ((int)(fpstats->new_mv_count / fpstats->count) * 8) << 9;
+
+  // Crude estimate of overhead cost from modes
+  // << 9 is the normalization to (bits * 512) used in vp9_bits_per_mb
+  mode_cost =
+    (int)((((av_pct_inter - av_pct_motion) * zz_cost) +
+           (av_pct_motion * motion_cost) +
+           (av_intra * intra_cost)) * cpi->common.MBs) << 9;
+
+  // return mv_cost + mode_cost;
+  // TODO PGW Fix overhead costs for extended Q range
+  return 0;
+}
+
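+// Correction factor applied to the baseline bits-per-mb estimate:
+// pow(err_per_mb / err_divisor, power_term), where power_term grows with
+// the actual quantizer (capped at pt_high) and the result is clipped to
+// the range [0.05, 2.0].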
+static double calc_correction_factor(double err_per_mb,
+                                     double err_divisor,
+                                     double pt_low,
+                                     double pt_high,
+                                     int Q) {
+  double power_term;
+  double error_term = err_per_mb / err_divisor;
+  double correction_factor;
+
+  // Adjustment based on actual quantizer to power term.
+  power_term = (vp9_convert_qindex_to_q(Q) * 0.01) + pt_low;
+  power_term = (power_term > pt_high) ? pt_high : power_term;
+
+  // Adjustments to error term
+  // TBD
+
+  // Calculate correction factor
+  correction_factor = pow(error_term, power_term);
+
+  // Clip range
+  correction_factor =
+    (correction_factor < 0.05)
+    ? 0.05 : (correction_factor > 2.0) ? 2.0 : correction_factor;
+
+  return correction_factor;
+}
+
+// Given a current maxQ value sets a range for future values.
+// PGW TODO..
+// This code removes direct dependency on QIndex to determine the range
+// (now uses the actual quantizer) but has not been tuned.
+static void adjust_maxq_qrange(VP9_COMP *cpi) {
+  int i;
+  double q;
+
+  // Set the max corresponding to cpi->avg_q * 2.0
+  q = cpi->avg_q * 2.0;
+  cpi->twopass.maxq_max_limit = cpi->worst_quality;
+  for (i = cpi->best_quality; i <= cpi->worst_quality; i++) {
+    cpi->twopass.maxq_max_limit = i;
+    if (vp9_convert_qindex_to_q(i) >= q)
+      break;
+  }
+
+  // Set the min corresponding to cpi->avg_q * 0.5
+  q = cpi->avg_q * 0.5;
+  cpi->twopass.maxq_min_limit = cpi->best_quality;
+  for (i = cpi->worst_quality; i >= cpi->best_quality; i--) {
+    cpi->twopass.maxq_min_limit = i;
+    if (vp9_convert_qindex_to_q(i) <= q)
+      break;
+  }
+}
+
+static int estimate_max_q(VP9_COMP *cpi,
+                          FIRSTPASS_STATS *fpstats,
+                          int section_target_bandwitdh,
+                          int overhead_bits) {
+  int Q;
+  int num_mbs = cpi->common.MBs;
+  int target_norm_bits_per_mb;
+
+  double section_err = (fpstats->coded_error / fpstats->count);
+  double sr_err_diff;
+  double sr_correction;
+  double err_per_mb = section_err / num_mbs;
+  double err_correction_factor;
+  double speed_correction = 1.0;
+  int overhead_bits_per_mb;
+
+  if (section_target_bandwitdh <= 0)
+    return cpi->twopass.maxq_max_limit;  // Highest value allowed
+
+  target_norm_bits_per_mb =
+    (section_target_bandwitdh < (1 << 20))
+    ? (512 * section_target_bandwitdh) / num_mbs
+    : 512 * (section_target_bandwitdh / num_mbs);
+
+  // Look at the drop in prediction quality between the last frame
+  // and the GF buffer (which contained an older frame).
+  sr_err_diff =
+    (fpstats->sr_coded_error - fpstats->coded_error) /
+    (fpstats->count * cpi->common.MBs);
+  sr_correction = (sr_err_diff / 32.0);
+  sr_correction = pow(sr_correction, 0.25);
+  if (sr_correction < 0.75)
+    sr_correction = 0.75;
+  else if (sr_correction > 1.25)
+    sr_correction = 1.25;
+
+  // Calculate a corrective factor based on a rolling ratio of bits spent
+  // vs target bits
+  if ((cpi->rolling_target_bits > 0) &&
+      (cpi->active_worst_quality < cpi->worst_quality)) {
+    double rolling_ratio;
+
+    rolling_ratio = (double)cpi->rolling_actual_bits /
+                    (double)cpi->rolling_target_bits;
+
+    if (rolling_ratio < 0.95)
+      cpi->twopass.est_max_qcorrection_factor -= 0.005;
+    else if (rolling_ratio > 1.05)
+      cpi->twopass.est_max_qcorrection_factor += 0.005;
+
+    cpi->twopass.est_max_qcorrection_factor =
+      (cpi->twopass.est_max_qcorrection_factor < 0.1)
+      ? 0.1
+      : (cpi->twopass.est_max_qcorrection_factor > 10.0)
+      ? 10.0 : cpi->twopass.est_max_qcorrection_factor;
+  }
+
+  // Corrections for higher compression speed settings
+  // (reduced compression expected)
+  if (cpi->compressor_speed == 1) {
+    if (cpi->oxcf.cpu_used <= 5)
+      speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
+    else
+      speed_correction = 1.25;
+  }
+
+  // Estimate of overhead bits per mb
+  // Correction to overhead bits for min allowed Q.
+  // PGW TODO.. This code is broken for the extended Q range
+  //            for now overhead set to 0.
+  overhead_bits_per_mb = overhead_bits / num_mbs;
+  overhead_bits_per_mb *= pow(0.98, (double)cpi->twopass.maxq_min_limit);
+
+  // Try and pick a max Q that will be high enough to encode the
+  // content at the given rate.
+  for (Q = cpi->twopass.maxq_min_limit; Q < cpi->twopass.maxq_max_limit; Q++) {
+    int bits_per_mb_at_this_q;
+
+    err_correction_factor =
+      calc_correction_factor(err_per_mb, ERR_DIVISOR, 0.4, 0.90, Q) *
+      sr_correction * speed_correction *
+      cpi->twopass.est_max_qcorrection_factor;
+
+    if (err_correction_factor < 0.05)
+      err_correction_factor = 0.05;
+    else if (err_correction_factor > 5.0)
+      err_correction_factor = 5.0;
+
+    bits_per_mb_at_this_q =
+      vp9_bits_per_mb(INTER_FRAME, Q) + overhead_bits_per_mb;
+
+    bits_per_mb_at_this_q = (int)(.5 + err_correction_factor *
+                                  (double)bits_per_mb_at_this_q);
+
+    // Mode and motion overhead
+    // As Q rises in real encode loop rd code will force overhead down
+    // We make a crude adjustment for this here as *.98 per Q step.
+    // PGW TODO.. This code is broken for the extended Q range
+    //            for now overhead set to 0.
+    // overhead_bits_per_mb = (int)((double)overhead_bits_per_mb * 0.98);
+
+    if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
+      break;
+  }
+
+  // Restriction on active max q for constrained quality mode.
+  if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
+      (Q < cpi->cq_target_quality)) {
+    Q = cpi->cq_target_quality;
+  }
+
+  // Adjust maxq_min_limit and maxq_max_limit limits based on
+  // average q observed in clip for non kf/gf/arf frames.
+  // Give average a chance to settle though.
+  // PGW TODO.. This code is broken for the extended Q range
+  if ((cpi->ni_frames >
+       ((unsigned int)cpi->twopass.total_stats->count >> 8)) &&
+      (cpi->ni_frames > 150)) {
+    adjust_maxq_qrange(cpi);
+  }
+
+  return Q;
+}
+
+// For cq mode estimate a cq level that matches the observed
+// complexity and data rate.
+static int estimate_cq(VP9_COMP *cpi,
+                       FIRSTPASS_STATS *fpstats,
+                       int section_target_bandwitdh,
+                       int overhead_bits) {
+  int Q;
+  int num_mbs = cpi->common.MBs;
+  int target_norm_bits_per_mb;
+
+  double section_err = (fpstats->coded_error / fpstats->count);
+  double err_per_mb = section_err / num_mbs;
+  double err_correction_factor;
+  double sr_err_diff;
+  double sr_correction;
+  double speed_correction = 1.0;
+  double clip_iiratio;
+  double clip_iifactor;
+  int overhead_bits_per_mb;
+
+  target_norm_bits_per_mb = (section_target_bandwitdh < (1 << 20))
+                            ? (512 * section_target_bandwitdh) / num_mbs
+                            : 512 * (section_target_bandwitdh / num_mbs);
+
+  // Estimate of overhead bits per mb
+  overhead_bits_per_mb = overhead_bits / num_mbs;
+
+  // Corrections for higher compression speed settings
+  // (reduced compression expected)
+  if (cpi->compressor_speed == 1) {
+    if (cpi->oxcf.cpu_used <= 5)
+      speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
+    else
+      speed_correction = 1.25;
+  }
+
+  // Look at the drop in prediction quality between the last frame
+  // and the GF buffer (which contained an older frame).
+  sr_err_diff =
+    (fpstats->sr_coded_error - fpstats->coded_error) /
+    (fpstats->count * cpi->common.MBs);
+  sr_correction = (sr_err_diff / 32.0);
+  sr_correction = pow(sr_correction, 0.25);
+  if (sr_correction < 0.75)
+    sr_correction = 0.75;
+  else if (sr_correction > 1.25)
+    sr_correction = 1.25;
+
+  // II ratio correction factor for clip as a whole
+  clip_iiratio = cpi->twopass.total_stats->intra_error /
+                 DOUBLE_DIVIDE_CHECK(cpi->twopass.total_stats->coded_error);
+  clip_iifactor = 1.0 - ((clip_iiratio - 10.0) * 0.025);
+  if (clip_iifactor < 0.80)
+    clip_iifactor = 0.80;
+
+  // Try and pick a Q that can encode the content at the given rate.
+  for (Q = 0; Q < MAXQ; Q++) {
+    int bits_per_mb_at_this_q;
+
+    // Error per MB based correction factor
+    err_correction_factor =
+      calc_correction_factor(err_per_mb, 100.0, 0.4, 0.90, Q) *
+      sr_correction * speed_correction * clip_iifactor;
+
+    if (err_correction_factor < 0.05)
+      err_correction_factor = 0.05;
+    else if (err_correction_factor > 5.0)
+      err_correction_factor = 5.0;
+
+    bits_per_mb_at_this_q =
+      vp9_bits_per_mb(INTER_FRAME, Q) + overhead_bits_per_mb;
+
+    bits_per_mb_at_this_q = (int)(.5 + err_correction_factor *
+                                  (double)bits_per_mb_at_this_q);
+
+    // Mode and motion overhead
+    // As Q rises in real encode loop rd code will force overhead down
+    // We make a crude adjustment for this here as *.98 per Q step.
+    // PGW TODO.. This code is broken for the extended Q range
+    //            for now overhead set to 0.
+    overhead_bits_per_mb = (int)((double)overhead_bits_per_mb * 0.98);
+
+    if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
+      break;
+  }
+
+  // Clip value to range "best allowed to (worst allowed - 1)"
+  Q = select_cq_level(Q);
+  if (Q >= cpi->worst_quality)
+    Q = cpi->worst_quality - 1;
+  if (Q < cpi->best_quality)
+    Q = cpi->best_quality;
+
+  return Q;
+}
+
+extern void vp9_new_frame_rate(VP9_COMP *cpi, double framerate);
+
+void vp9_init_second_pass(VP9_COMP *cpi) {
+  FIRSTPASS_STATS this_frame;
+  FIRSTPASS_STATS *start_pos;
+
+  double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.frame_rate;
+  double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth
+                                      * cpi->oxcf.two_pass_vbrmin_section / 100);
+
+  if (two_pass_min_rate < lower_bounds_min_rate)
+    two_pass_min_rate = lower_bounds_min_rate;
+
+  zero_stats(cpi->twopass.total_stats);
+  zero_stats(cpi->twopass.total_left_stats);
+
+  if (!cpi->twopass.stats_in_end)
+    return;
+
+  *cpi->twopass.total_stats = *cpi->twopass.stats_in_end;
+  *cpi->twopass.total_left_stats = *cpi->twopass.total_stats;
+
+  // Each frame can have a different duration, as the frame rate in the
+  // source isn't guaranteed to be constant. The frame rate prior to the
+  // first frame encoded in the second pass is a guess. However, the sum
+  // duration is not; it is calculated based on the actual durations of
+  // all frames from the first pass.
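+  // (The 10000000.0 factors below suggest timestamps are kept in units of
+  // 1/10,000,000 of a second.)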
+  vp9_new_frame_rate(cpi,
+                     10000000.0 * cpi->twopass.total_stats->count /
+                     cpi->twopass.total_stats->duration);
+
+  cpi->output_frame_rate = cpi->oxcf.frame_rate;
+  cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats->duration *
+                                     cpi->oxcf.target_bandwidth / 10000000.0);
+  cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats->duration *
+                                      two_pass_min_rate / 10000000.0);
+
+  // Calculate a minimum intra value to be used in determining the IIratio
+  // scores used in the second pass. We have this minimum to make sure
+  // that clips that are static but "low complexity" in the intra domain
+  // are still boosted appropriately for KF/GF/ARF
+  cpi->twopass.kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs;
+  cpi->twopass.gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs;
+
+  // This variable monitors how far behind the second ref update is lagging
+  cpi->twopass.sr_update_lag = 1;
+
+  // Scan the first pass file and calculate an average Intra / Inter error
+  // score ratio for the sequence.
+  {
+    double sum_iiratio = 0.0;
+    double IIRatio;
+
+    start_pos = cpi->twopass.stats_in;  // Note starting "file" position
+
+    while (input_stats(cpi, &this_frame) != EOF) {
+      IIRatio = this_frame.intra_error /
+                DOUBLE_DIVIDE_CHECK(this_frame.coded_error);
+      IIRatio = (IIRatio < 1.0) ? 1.0 : (IIRatio > 20.0) ? 20.0 : IIRatio;
+      sum_iiratio += IIRatio;
+    }
+
+    cpi->twopass.avg_iiratio =
+      sum_iiratio /
+      DOUBLE_DIVIDE_CHECK((double)cpi->twopass.total_stats->count);
+
+    // Reset file position
+    reset_fpf_position(cpi, start_pos);
+  }
+
+  // Scan the first pass file and calculate a modified total error based
+  // upon the bias/power function used to allocate bits.
+  {
+    start_pos = cpi->twopass.stats_in;  // Note starting "file" position
+
+    cpi->twopass.modified_error_total = 0.0;
+    cpi->twopass.modified_error_used = 0.0;
+
+    while (input_stats(cpi, &this_frame) != EOF) {
+      cpi->twopass.modified_error_total +=
+        calculate_modified_err(cpi, &this_frame);
+    }
+    cpi->twopass.modified_error_left = cpi->twopass.modified_error_total;
+
+    // Reset file position
+    reset_fpf_position(cpi, start_pos);
+  }
+}
+
+void vp9_end_second_pass(VP9_COMP *cpi) {
+}
+
+// This function gives an estimate of how badly we believe the prediction
+// quality is decaying from frame to frame.
+static double get_prediction_decay_rate(VP9_COMP *cpi,
+                                        FIRSTPASS_STATS *next_frame) {
+  double prediction_decay_rate;
+  double second_ref_decay;
+  double mb_sr_err_diff;
+
+  // Initial basis is the % mbs inter coded
+  prediction_decay_rate = next_frame->pcnt_inter;
+
+  // Look at the observed drop in prediction quality between the last frame
+  // and the GF buffer (which contains an older frame).
+  mb_sr_err_diff =
+    (next_frame->sr_coded_error - next_frame->coded_error) /
+    (cpi->common.MBs);
+  second_ref_decay = 1.0 - (mb_sr_err_diff / 512.0);
+  second_ref_decay = pow(second_ref_decay, 0.5);
+  if (second_ref_decay < 0.85)
+    second_ref_decay = 0.85;
+  else if (second_ref_decay > 1.0)
+    second_ref_decay = 1.0;
+
+  if (second_ref_decay < prediction_decay_rate)
+    prediction_decay_rate = second_ref_decay;
+
+  return prediction_decay_rate;
+}
+
+// Function to test for a condition where a complex transition is followed
+// by a static section. For example in slide shows where there is a fade
+// between slides. This is to help with more optimal kf and gf positioning.
+static int detect_transition_to_still(
+  VP9_COMP *cpi,
+  int frame_interval,
+  int still_interval,
+  double loop_decay_rate,
+  double last_decay_rate) {
+  BOOL trans_to_still = FALSE;
+
+  // Break clause to detect very still sections after motion
+  // For example a static image after a fade or other transition
+  // instead of a clean scene cut.
+  if ((frame_interval > MIN_GF_INTERVAL) &&
+      (loop_decay_rate >= 0.999) &&
+      (last_decay_rate < 0.9)) {
+    int j;
+    FIRSTPASS_STATS *position = cpi->twopass.stats_in;
+    FIRSTPASS_STATS tmp_next_frame;
+    double zz_inter;
+
+    // Look ahead a few frames to see if static condition
+    // persists...
+    for (j = 0; j < still_interval; j++) {
+      if (EOF == input_stats(cpi, &tmp_next_frame))
+        break;
+
+      zz_inter =
+        (tmp_next_frame.pcnt_inter - tmp_next_frame.pcnt_motion);
+      if (zz_inter < 0.999)
+        break;
+    }
+    // Reset file position
+    reset_fpf_position(cpi, position);
+
+    // Only if it does do we signal a transition to still
+    if (j == still_interval)
+      trans_to_still = TRUE;
+  }
+
+  return trans_to_still;
+}
+
+// This function detects a flash through a high relative pcnt_second_ref
+// score in the frame following a flash frame. The offset passed in should
+// reflect this.
+static BOOL detect_flash(VP9_COMP *cpi, int offset) {
+  FIRSTPASS_STATS next_frame;
+
+  BOOL flash_detected = FALSE;
+
+  // Read the frame data.
+  // The return is FALSE (no flash detected) if not a valid frame
+  if (read_frame_stats(cpi, &next_frame, offset) != EOF) {
+    // What we are looking for here is a situation where there is a
+    // brief break in prediction (such as a flash) but subsequent frames
+    // are reasonably well predicted by an earlier (pre flash) frame.
+    // The recovery after a flash is indicated by a high pcnt_second_ref
+    // compared to pcnt_inter.
+    if ((next_frame.pcnt_second_ref > next_frame.pcnt_inter) &&
+        (next_frame.pcnt_second_ref >= 0.5)) {
+      flash_detected = TRUE;
+    }
+  }
+
+  return flash_detected;
+}
+
+// Update the motion related elements to the GF arf boost calculation
+static void accumulate_frame_motion_stats(
+  VP9_COMP *cpi,
+  FIRSTPASS_STATS *this_frame,
+  double *this_frame_mv_in_out,
+  double *mv_in_out_accumulator,
+  double *abs_mv_in_out_accumulator,
+  double *mv_ratio_accumulator) {
+  // double this_frame_mv_in_out;
+  double this_frame_mvr_ratio;
+  double this_frame_mvc_ratio;
+  double motion_pct;
+
+  // Accumulate motion stats.
+  motion_pct = this_frame->pcnt_motion;
+
+  // Accumulate Motion In/Out of frame stats
+  *this_frame_mv_in_out = this_frame->mv_in_out_count * motion_pct;
+  *mv_in_out_accumulator += this_frame->mv_in_out_count * motion_pct;
+  *abs_mv_in_out_accumulator +=
+    fabs(this_frame->mv_in_out_count * motion_pct);
+
+  // Accumulate a measure of how uniform (or conversely how random)
+  // the motion field is. (A ratio of absmv / mv)
+  if (motion_pct > 0.05) {
+    this_frame_mvr_ratio = fabs(this_frame->mvr_abs) /
+                           DOUBLE_DIVIDE_CHECK(fabs(this_frame->MVr));
+
+    this_frame_mvc_ratio = fabs(this_frame->mvc_abs) /
+                           DOUBLE_DIVIDE_CHECK(fabs(this_frame->MVc));
+
+    *mv_ratio_accumulator +=
+      (this_frame_mvr_ratio < this_frame->mvr_abs)
+      ? (this_frame_mvr_ratio * motion_pct)
+      : this_frame->mvr_abs * motion_pct;
+
+    *mv_ratio_accumulator +=
+      (this_frame_mvc_ratio < this_frame->mvc_abs)
+      ? (this_frame_mvc_ratio * motion_pct)
+      : this_frame->mvc_abs * motion_pct;
+
+  }
+}
+
+// Calculate a baseline boost number for the current frame.
+static double calc_frame_boost(
+  VP9_COMP *cpi,
+  FIRSTPASS_STATS *this_frame,
+  double this_frame_mv_in_out) {
+  double frame_boost;
+
+  // Underlying boost factor is based on inter intra error ratio
+  if (this_frame->intra_error > cpi->twopass.gf_intra_err_min)
+    frame_boost = (IIFACTOR * this_frame->intra_error /
+                   DOUBLE_DIVIDE_CHECK(this_frame->coded_error));
+  else
+    frame_boost = (IIFACTOR * cpi->twopass.gf_intra_err_min /
+                   DOUBLE_DIVIDE_CHECK(this_frame->coded_error));
+
+  // Increase boost for frames where new data is coming into the frame
+  // (e.g. zoom out). Slightly reduce boost if there is a net balance
+  // of motion out of the frame (zoom in).
+  // The range for this_frame_mv_in_out is -1.0 to +1.0.
+  if (this_frame_mv_in_out > 0.0)
+    frame_boost += frame_boost * (this_frame_mv_in_out * 2.0);
+  // In the extreme case the boost is halved.
+  else
+    frame_boost += frame_boost * (this_frame_mv_in_out / 2.0);
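+  // e.g. this_frame_mv_in_out = +1.0 triples the boost, while -1.0
+  // halves it.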
+
+  // Clip to maximum
+  if (frame_boost > GF_RMAX)
+    frame_boost = GF_RMAX;
+
+  return frame_boost;
+}
+
+static int calc_arf_boost(
+  VP9_COMP *cpi,
+  int offset,
+  int f_frames,
+  int b_frames,
+  int *f_boost,
+  int *b_boost) {
+  FIRSTPASS_STATS this_frame;
+
+  int i;
+  double boost_score = 0.0;
+  double mv_ratio_accumulator = 0.0;
+  double decay_accumulator = 1.0;
+  double this_frame_mv_in_out = 0.0;
+  double mv_in_out_accumulator = 0.0;
+  double abs_mv_in_out_accumulator = 0.0;
+  int arf_boost;
+  BOOL flash_detected = FALSE;
+
+  // Search forward from the proposed arf/next gf position
+  for (i = 0; i < f_frames; i++) {
+    if (read_frame_stats(cpi, &this_frame, (i + offset)) == EOF)
+      break;
+
+    // Update the motion related elements to the boost calculation
+    accumulate_frame_motion_stats(cpi, &this_frame,
+                                  &this_frame_mv_in_out, &mv_in_out_accumulator,
+                                  &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+
+    // We want to discount the flash frame itself and the recovery
+    // frame that follows as both will have poor scores.
+    flash_detected = detect_flash(cpi, (i + offset)) ||
+                     detect_flash(cpi, (i + offset + 1));
+
+    // Cumulative effect of prediction quality decay
+    if (!flash_detected) {
+      decay_accumulator =
+        decay_accumulator *
+        get_prediction_decay_rate(cpi, &this_frame);
+      decay_accumulator =
+        decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
+    }
+
+    boost_score += (decay_accumulator *
+                    calc_frame_boost(cpi, &this_frame, this_frame_mv_in_out));
+  }
+
+  *f_boost = (int)boost_score;
+
+  // Reset for backward looking loop
+  boost_score = 0.0;
+  mv_ratio_accumulator = 0.0;
+  decay_accumulator = 1.0;
+  this_frame_mv_in_out = 0.0;
+  mv_in_out_accumulator = 0.0;
+  abs_mv_in_out_accumulator = 0.0;
+
+  // Search backward towards last gf position
+  for (i = -1; i >= -b_frames; i--) {
+    if (read_frame_stats(cpi, &this_frame, (i + offset)) == EOF)
+      break;
+
+    // Update the motion related elements to the boost calculation
+    accumulate_frame_motion_stats(cpi, &this_frame,
+                                  &this_frame_mv_in_out, &mv_in_out_accumulator,
+                                  &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+
+    // We want to discount the flash frame itself and the recovery
+    // frame that follows as both will have poor scores.
+    flash_detected = detect_flash(cpi, (i + offset)) ||
+                     detect_flash(cpi, (i + offset + 1));
+
+    // Cumulative effect of prediction quality decay
+    if (!flash_detected) {
+      decay_accumulator =
+        decay_accumulator *
+        get_prediction_decay_rate(cpi, &this_frame);
+      decay_accumulator =
+        decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
+    }
+
+    boost_score += (decay_accumulator *
+                    calc_frame_boost(cpi, &this_frame, this_frame_mv_in_out));
+
+  }
+  *b_boost = (int)boost_score;
+
+  arf_boost = (*f_boost + *b_boost);
+  if (arf_boost < ((b_frames + f_frames) * 20))
+    arf_boost = ((b_frames + f_frames) * 20);
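+  // e.g. with f_frames = b_frames = 7 the combined boost is floored
+  // at 280.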
+
+  return arf_boost;
+}
+
+static void configure_arnr_filter(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+  int half_gf_int;
+  int frames_after_arf;
+  int frames_bwd = cpi->oxcf.arnr_max_frames - 1;
+  int frames_fwd = cpi->oxcf.arnr_max_frames - 1;
+
+  // Define the arnr filter width for this group of frames:
+  // We only filter frames that lie within a distance of half
+  // the GF interval from the ARF frame. We also have to trap
+  // cases where the filter extends beyond the end of the clip.
+  // Note: this_frame->frame has been updated in the loop
+  // so it now points at the ARF frame.
+  half_gf_int = cpi->baseline_gf_interval >> 1;
+  frames_after_arf = cpi->twopass.total_stats->count -
+                     this_frame->frame - 1;
+
+  switch (cpi->oxcf.arnr_type) {
+    case 1: // Backward filter
+      frames_fwd = 0;
+      if (frames_bwd > half_gf_int)
+        frames_bwd = half_gf_int;
+      break;
+
+    case 2: // Forward filter
+      if (frames_fwd > half_gf_int)
+        frames_fwd = half_gf_int;
+      if (frames_fwd > frames_after_arf)
+        frames_fwd = frames_after_arf;
+      frames_bwd = 0;
+      break;
+
+    case 3: // Centered filter
+    default:
+      frames_fwd >>= 1;
+      if (frames_fwd > frames_after_arf)
+        frames_fwd = frames_after_arf;
+      if (frames_fwd > half_gf_int)
+        frames_fwd = half_gf_int;
+
+      frames_bwd = frames_fwd;
+
+      // For even length filter there is one more frame backward
+      // than forward: e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff.
+      if (frames_bwd < half_gf_int)
+        frames_bwd += (cpi->oxcf.arnr_max_frames + 1) & 0x1;
+      break;
+  }
+
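+  // e.g. arnr_max_frames = 7 with the centered filter and half_gf_int = 6
+  // gives frames_fwd = frames_bwd = 3, i.e. 7 active filter frames
+  // (assuming enough frames remain after the ARF).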
+  cpi->active_arnr_frames = frames_bwd + 1 + frames_fwd;
+}
+
+// Analyse and define a gf/arf group.
+static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+  FIRSTPASS_STATS next_frame;
+  FIRSTPASS_STATS *start_pos;
+  int i;
+  double boost_score = 0.0;
+  double old_boost_score = 0.0;
+  double gf_group_err = 0.0;
+  double gf_first_frame_err = 0.0;
+  double mod_frame_err = 0.0;
+
+  double mv_ratio_accumulator = 0.0;
+  double decay_accumulator = 1.0;
+  double zero_motion_accumulator = 1.0;
+
+  double loop_decay_rate = 1.00;          // Starting decay rate
+  double last_loop_decay_rate = 1.00;
+
+  double this_frame_mv_in_out = 0.0;
+  double mv_in_out_accumulator = 0.0;
+  double abs_mv_in_out_accumulator = 0.0;
+
+  int max_bits = frame_max_bits(cpi);     // Max for a single frame
+
+  unsigned int allow_alt_ref =
+    cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames;
+
+  int f_boost = 0;
+  int b_boost = 0;
+  BOOL flash_detected;
+
+  cpi->twopass.gf_group_bits = 0;
+
+  vp9_clear_system_state();  // __asm emms;
+
+  start_pos = cpi->twopass.stats_in;
+
+  vpx_memset(&next_frame, 0, sizeof(next_frame)); // ensure a clean start
+
+  // Load stats for the current frame.
+  mod_frame_err = calculate_modified_err(cpi, this_frame);
+
+  // Note the error of the frame at the start of the group (this will be
+  // the GF frame error if we code a normal gf).
+  gf_first_frame_err = mod_frame_err;
+
+  // Special treatment if the current frame is a key frame (which is also
+  // a gf). If it is, then its error score (and hence bit allocation) needs
+  // to be subtracted out from the calculation for the GF group.
+  if (cpi->common.frame_type == KEY_FRAME)
+    gf_group_err -= gf_first_frame_err;
+
+  // Scan forward to try and work out how many frames the next gf group
+  // should contain and what level of boost is appropriate for the GF
+  // or ARF that will be coded with the group.
+  i = 0;
+
+  while (((i < cpi->twopass.static_scene_max_gf_interval) ||
+          ((cpi->twopass.frames_to_key - i) < MIN_GF_INTERVAL)) &&
+         (i < cpi->twopass.frames_to_key)) {
+    i++;    // Increment the loop counter
+
+    // Accumulate error score of frames in this gf group
+    mod_frame_err = calculate_modified_err(cpi, this_frame);
+    gf_group_err += mod_frame_err;
+
+    if (EOF == input_stats(cpi, &next_frame))
+      break;
+
+    // Test for the case where there is a brief flash but the prediction
+    // quality back to an earlier frame is then restored.
+    flash_detected = detect_flash(cpi, 0);
+
+    // Update the motion related elements to the boost calculation
+    accumulate_frame_motion_stats(cpi, &next_frame,
+                                  &this_frame_mv_in_out, &mv_in_out_accumulator,
+                                  &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+
+    // Cumulative effect of prediction quality decay
+    if (!flash_detected) {
+      last_loop_decay_rate = loop_decay_rate;
+      loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+      decay_accumulator = decay_accumulator * loop_decay_rate;
+
+      // Monitor for static sections.
+      if ((next_frame.pcnt_inter - next_frame.pcnt_motion) <
+          zero_motion_accumulator) {
+        zero_motion_accumulator =
+          (next_frame.pcnt_inter - next_frame.pcnt_motion);
+      }
+
+      // Break clause to detect very still sections after motion
+      // (for example a static image after a fade or other transition).
+      if (detect_transition_to_still(cpi, i, 5, loop_decay_rate,
+                                     last_loop_decay_rate)) {
+        allow_alt_ref = FALSE;
+        break;
+      }
+    }
+
+    // Calculate a boost number for this frame
+    boost_score +=
+      (decay_accumulator *
+       calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out));
+
+    // Break out conditions.
+    if (
+      // Break at cpi->max_gf_interval unless almost totally static
+      (i >= cpi->max_gf_interval && (zero_motion_accumulator < 0.995)) ||
+      (
+      // Don't break out with a very short interval
+        (i > MIN_GF_INTERVAL) &&
+      // Don't break out very close to a key frame
+        ((cpi->twopass.frames_to_key - i) >= MIN_GF_INTERVAL) &&
+        ((boost_score > 125.0) || (next_frame.pcnt_inter < 0.75)) &&
+        (!flash_detected) &&
+        ((mv_ratio_accumulator > 100.0) ||
+         (abs_mv_in_out_accumulator > 3.0) ||
+         (mv_in_out_accumulator < -2.0) ||
+         ((boost_score - old_boost_score) < 12.5))
+      )) {
+      boost_score = old_boost_score;
+      break;
+    }
+
+    vpx_memcpy(this_frame, &next_frame, sizeof(*this_frame));
+
+    old_boost_score = boost_score;
+  }
+
+  // Don't allow a gf too near the next kf
+  if ((cpi->twopass.frames_to_key - i) < MIN_GF_INTERVAL) {
+    while (i < cpi->twopass.frames_to_key) {
+      i++;
+
+      if (EOF == input_stats(cpi, this_frame))
+        break;
+
+      if (i < cpi->twopass.frames_to_key) {
+        mod_frame_err = calculate_modified_err(cpi, this_frame);
+        gf_group_err += mod_frame_err;
+      }
+    }
+  }
+
+  // Set the interval till the next gf or arf.
+  cpi->baseline_gf_interval = i;
+
+  // Should we use the alternate reference frame?
+  if (allow_alt_ref &&
+      (i < cpi->oxcf.lag_in_frames) &&
+      (i >= MIN_GF_INTERVAL) &&
+      // Don't use an ARF very near the next kf
+      (i <= (cpi->twopass.frames_to_key - MIN_GF_INTERVAL)) &&
+      ((next_frame.pcnt_inter > 0.75) ||
+       (next_frame.pcnt_second_ref > 0.5)) &&
+      ((mv_in_out_accumulator / (double)i > -0.2) ||
+       (mv_in_out_accumulator > -2.0)) &&
+      (boost_score > 100)) {
+    // Alternative boost calculation for alt ref
+    cpi->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost);
+    cpi->source_alt_ref_pending = TRUE;
+
+    configure_arnr_filter(cpi, this_frame);
+  } else {
+    cpi->gfu_boost = (int)boost_score;
+    cpi->source_alt_ref_pending = FALSE;
+  }
+
+  // Now decide how many bits should be allocated to the GF group as a
+  // proportion of those remaining in the kf group.
+  // The final key frame group in the clip is treated as a special case
+  // where cpi->twopass.kf_group_bits is tied to cpi->twopass.bits_left.
+  // This is also important for short clips where there may only be one
+  // key frame.
+  if (cpi->twopass.frames_to_key >= (int)(cpi->twopass.total_stats->count -
+                                          cpi->common.current_video_frame)) {
+    cpi->twopass.kf_group_bits =
+      (cpi->twopass.bits_left > 0) ? cpi->twopass.bits_left : 0;
+  }
+
+  // Calculate the bits to be allocated to the group as a whole
+  if ((cpi->twopass.kf_group_bits > 0) &&
+      (cpi->twopass.kf_group_error_left > 0)) {
+    cpi->twopass.gf_group_bits =
+      (int)((double)cpi->twopass.kf_group_bits *
+            (gf_group_err / (double)cpi->twopass.kf_group_error_left));
+  } else
+    cpi->twopass.gf_group_bits = 0;
+
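+  // Clamp the group allocation to the range [0, kf_group_bits].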
+  cpi->twopass.gf_group_bits =
+    (cpi->twopass.gf_group_bits < 0)
+    ? 0
+    : (cpi->twopass.gf_group_bits > cpi->twopass.kf_group_bits)
+    ? cpi->twopass.kf_group_bits : cpi->twopass.gf_group_bits;
+
+  // Clip cpi->twopass.gf_group_bits based on user supplied data rate
+  // variability limit (cpi->oxcf.two_pass_vbrmax_section)
+  if (cpi->twopass.gf_group_bits > max_bits * cpi->baseline_gf_interval)
+    cpi->twopass.gf_group_bits = max_bits * cpi->baseline_gf_interval;
+
+  // Reset the file position
+  reset_fpf_position(cpi, start_pos);
+
+  // Update the record of error used so far (only done once per gf group)
+  cpi->twopass.modified_error_used += gf_group_err;
+
+  // Assign bits to the arf or gf.
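+  // When an ARF is pending on a non-key frame this loop runs twice:
+  // pass 0 allocates the ARF bits and pass 1 the GF bits. Otherwise a
+  // single pass covers the frame being coded.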
+  for (i = 0;
+       i <= (cpi->source_alt_ref_pending &&
+             cpi->common.frame_type != KEY_FRAME);
+       i++) {
+    int boost;
+    int allocation_chunks;
+    int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
+    int gf_bits;
+
+    boost = (cpi->gfu_boost * vp9_gfboost_qadjust(Q)) / 100;
+
+    // Set max and minimum boost and hence minimum allocation
+    if (boost > ((cpi->baseline_gf_interval + 1) * 200))
+      boost = ((cpi->baseline_gf_interval + 1) * 200);
+    else if (boost < 125)
+      boost = 125;
+
+    if (cpi->source_alt_ref_pending && i == 0)
+      allocation_chunks =
+        ((cpi->baseline_gf_interval + 1) * 100) + boost;
+    else
+      allocation_chunks =
+        (cpi->baseline_gf_interval * 100) + (boost - 100);
+
+    // Prevent overflow
+    if (boost > 1028) {
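+      // e.g. boost = 4096: divisor = 4, so boost drops to 1024 and
+      // allocation_chunks shrinks by the same factor, preserving
+      // the boost / allocation_chunks ratio.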
+      int divisor = boost >> 10;
+      boost /= divisor;
+      allocation_chunks /= divisor;
+    }
+
+    // Calculate the number of bits to be spent on the gf or arf based on
+    // the boost number
+    gf_bits = (int)((double)boost *
+                    (cpi->twopass.gf_group_bits /
+                     (double)allocation_chunks));
+
+    // If the frame that is to be boosted is simpler than the average for
+    // the gf/arf group then use an alternative calculation
+    // based on the error score of the frame itself
+    if (mod_frame_err < gf_group_err / (double)cpi->baseline_gf_interval) {
+      double  alt_gf_grp_bits;
+      int     alt_gf_bits;
+
+      alt_gf_grp_bits =
+        (double)cpi->twopass.kf_group_bits  *
+        (mod_frame_err * (double)cpi->baseline_gf_interval) /
+        DOUBLE_DIVIDE_CHECK((double)cpi->twopass.kf_group_error_left);
+
+      alt_gf_bits = (int)((double)boost * (alt_gf_grp_bits /
+                                           (double)allocation_chunks));
+
+      if (gf_bits > alt_gf_bits) {
+        gf_bits = alt_gf_bits;
+      }
+    }
+    // Else if it is harder than other frames in the group, make sure it
+    // at least receives an allocation in keeping with its relative error
+    // score, otherwise it may be worse off than an "un-boosted" frame.
+    else {
+      int alt_gf_bits =
+        (int)((double)cpi->twopass.kf_group_bits *
+              mod_frame_err /
+              DOUBLE_DIVIDE_CHECK((double)cpi->twopass.kf_group_error_left));
+
+      if (alt_gf_bits > gf_bits) {
+        gf_bits = alt_gf_bits;
+      }
+    }
+
+    // Don't allow a negative value for gf_bits
+    if (gf_bits < 0)
+      gf_bits = 0;
+
+    gf_bits += cpi->min_frame_bandwidth;  // Add in minimum for a frame
+
+    if (i == 0) {
+      cpi->twopass.gf_bits = gf_bits;
+    }
+    if (i == 1 || (!cpi->source_alt_ref_pending &&
+                   (cpi->common.frame_type != KEY_FRAME))) {
+      cpi->per_frame_bandwidth = gf_bits;  // Per frame bit target for this frame
+    }
+  }
+
+  {
+    // Adjust KF group bits and error remaining.
+    cpi->twopass.kf_group_error_left -= gf_group_err;
+    cpi->twopass.kf_group_bits -= cpi->twopass.gf_group_bits;
+
+    if (cpi->twopass.kf_group_bits < 0)
+      cpi->twopass.kf_group_bits = 0;
+
+    // Note the error score left in the remaining frames of the group.
+    // For normal GFs we want to remove the error score for the first frame
+    // of the group (except in Key frame case where this has already
+    // happened)
+    if (!cpi->source_alt_ref_pending && cpi->common.frame_type != KEY_FRAME)
+      cpi->twopass.gf_group_error_left = gf_group_err - gf_first_frame_err;
+    else
+      cpi->twopass.gf_group_error_left = gf_group_err;
+
+    cpi->twopass.gf_group_bits -= cpi->twopass.gf_bits - cpi->min_frame_bandwidth;
+
+    if (cpi->twopass.gf_group_bits < 0)
+      cpi->twopass.gf_group_bits = 0;
+
+    // This condition could fail if there are two kfs very close together
+    // despite (MIN_GF_INTERVAL) and would cause a divide by 0 in the
+    // calculation of cpi->twopass.alt_extra_bits.
+    if (cpi->baseline_gf_interval >= 3) {
+      int boost = (cpi->source_alt_ref_pending)
+                  ? b_boost : cpi->gfu_boost;
+
+      if (boost >= 150) {
+        int pct_extra;
+
+        pct_extra = (boost - 100) / 50;
+        pct_extra = (pct_extra > 20) ? 20 : pct_extra;
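+        // e.g. boost = 600 gives pct_extra = 10 (10% of the group bits);
+        // the cap of 20 is reached once boost hits 1100.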
+
+        cpi->twopass.alt_extra_bits =
+          (cpi->twopass.gf_group_bits * pct_extra) / 100;
+        cpi->twopass.gf_group_bits -= cpi->twopass.alt_extra_bits;
+        cpi->twopass.alt_extra_bits /=
+          ((cpi->baseline_gf_interval - 1) >> 1);
+      } else
+        cpi->twopass.alt_extra_bits = 0;
+    } else
+      cpi->twopass.alt_extra_bits = 0;
+  }
+
+  if (cpi->common.frame_type != KEY_FRAME) {
+    FIRSTPASS_STATS sectionstats;
+
+    zero_stats(&sectionstats);
+    reset_fpf_position(cpi, start_pos);
+
+    for (i = 0; i < cpi->baseline_gf_interval; i++) {
+      input_stats(cpi, &next_frame);
+      accumulate_stats(&sectionstats, &next_frame);
+    }
+
+    avg_stats(&sectionstats);
+
+    cpi->twopass.section_intra_rating =
+      sectionstats.intra_error /
+      DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
+
+    reset_fpf_position(cpi, start_pos);
+  }
+}
+
+// Allocate bits to a normal frame that is neither a gf, an arf, nor a
+// key frame.
+static void assign_std_frame_bits(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+  int    target_frame_size;
+
+  double modified_err;
+  double err_fraction;   // Portion of the remaining GF group error used by this frame
+
+  int max_bits = frame_max_bits(cpi);    // Max for a single frame
+
+  // Calculate modified prediction error used in bit allocation
+  modified_err = calculate_modified_err(cpi, this_frame);
+
+  if (cpi->twopass.gf_group_error_left > 0)
+    err_fraction = modified_err / cpi->twopass.gf_group_error_left;
+  else
+    err_fraction = 0.0;
+
+  // How many of the bits available for allocation should this frame get?
+  target_frame_size = (int)((double)cpi->twopass.gf_group_bits * err_fraction);
+
+  // Clip the target size to 0 at the bottom and max_bits (or
+  // cpi->twopass.gf_group_bits) at the top end.
+  if (target_frame_size < 0)
+    target_frame_size = 0;
+  else {
+    if (target_frame_size > max_bits)
+      target_frame_size = max_bits;
+
+    if (target_frame_size > cpi->twopass.gf_group_bits)
+      target_frame_size = cpi->twopass.gf_group_bits;
+  }
+
+  cpi->twopass.gf_group_error_left -= modified_err;  // Adjust error remaining
+  cpi->twopass.gf_group_bits -= target_frame_size;   // Adjust bits remaining
+
+  if (cpi->twopass.gf_group_bits < 0)
+    cpi->twopass.gf_group_bits = 0;
+
+  // Add in the minimum number of bits that is set aside for every frame.
+  target_frame_size += cpi->min_frame_bandwidth;
+
+  // Per frame bit target for this frame.
+  cpi->per_frame_bandwidth = target_frame_size;
+}
+
+// Make a damped adjustment to the active max q.
+static int adjust_active_maxq(int old_maxqi, int new_maxqi) {
+  int i;
+  int ret_val = new_maxqi;
+  double old_q;
+  double new_q;
+  double target_q;
+
+  old_q = vp9_convert_qindex_to_q(old_maxqi);
+  new_q = vp9_convert_qindex_to_q(new_maxqi);
+
+  target_q = ((old_q * 7.0) + new_q) / 8.0;
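+  // e.g. old_q = 40.0, new_q = 80.0 gives target_q = 45.0, so only about
+  // one eighth of the requested change is applied per call.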
+
+  if (target_q > old_q) {
+    for (i = old_maxqi; i <= new_maxqi; i++) {
+      if (vp9_convert_qindex_to_q(i) >= target_q) {
+        ret_val = i;
+        break;
+      }
+    }
+  } else {
+    for (i = old_maxqi; i >= new_maxqi; i--) {
+      if (vp9_convert_qindex_to_q(i) <= target_q) {
+        ret_val = i;
+        break;
+      }
+    }
+  }
+
+  return ret_val;
+}
+
+void vp9_second_pass(VP9_COMP *cpi) {
+  int tmp_q;
+  int frames_left = (int)(cpi->twopass.total_stats->count -
+                          cpi->common.current_video_frame);
+
+  FIRSTPASS_STATS this_frame;
+  FIRSTPASS_STATS this_frame_copy;
+
+  double this_frame_error;
+  double this_frame_intra_error;
+  double this_frame_coded_error;
+
+  FIRSTPASS_STATS *start_pos;
+
+  int overhead_bits;
+
+  if (!cpi->twopass.stats_in) {
+    return;
+  }
+
+  vp9_clear_system_state();
+
+  vpx_memset(&this_frame, 0, sizeof(FIRSTPASS_STATS));
+
+  if (EOF == input_stats(cpi, &this_frame))
+    return;
+
+  this_frame_error = this_frame.ssim_weighted_pred_err;
+  this_frame_intra_error = this_frame.intra_error;
+  this_frame_coded_error = this_frame.coded_error;
+
+  start_pos = cpi->twopass.stats_in;
+
+  // Keyframe and section processing.
+  if (cpi->twopass.frames_to_key == 0) {
+    // Define next KF group and assign bits to it
+    vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+    find_next_key_frame(cpi, &this_frame_copy);
+  }
+
+  // Is this a GF / ARF? (Note that a KF is always also a GF.)
+  if (cpi->frames_till_gf_update_due == 0) {
+    // Define next gf group and assign bits to it
+    vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+    define_gf_group(cpi, &this_frame_copy);
+
+    // If we are going to code an altref frame at the end of the group
+    // and the current frame is not a key frame:
+    // if the previous group used an arf, this frame has already benefited
+    // from that arf boost and should not be given extra bits; if the
+    // previous group was NOT coded using an arf we may want to apply some
+    // boost to this GF as well.
+    if (cpi->source_alt_ref_pending && (cpi->common.frame_type != KEY_FRAME)) {
+      // Assign a standard frame's worth of bits from those allocated
+      // to the GF group.
+      int bak = cpi->per_frame_bandwidth;
+      vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+      assign_std_frame_bits(cpi, &this_frame_copy);
+      cpi->per_frame_bandwidth = bak;
+    }
+  }
+
+  // Otherwise this is an ordinary frame
+  else {
+    // Assign bits from those allocated to the GF group
+    vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+    assign_std_frame_bits(cpi, &this_frame_copy);
+  }
+
+  // Keep a globally available copy of this and the next frame's iiratio.
+  cpi->twopass.this_iiratio = this_frame_intra_error /
+                              DOUBLE_DIVIDE_CHECK(this_frame_coded_error);
+  {
+    FIRSTPASS_STATS next_frame;
+    if (lookup_next_frame_stats(cpi, &next_frame) != EOF) {
+      cpi->twopass.next_iiratio = next_frame.intra_error /
+                                  DOUBLE_DIVIDE_CHECK(next_frame.coded_error);
+    }
+  }
+
+  // Set nominal per second bandwidth for this frame
+  cpi->target_bandwidth = cpi->per_frame_bandwidth * cpi->output_frame_rate;
+  if (cpi->target_bandwidth < 0)
+    cpi->target_bandwidth = 0;
+
+
+  // Account for mv, mode and other overheads.
+  overhead_bits = estimate_modemvcost(
+                    cpi, cpi->twopass.total_left_stats);
+
+  // Special case code for first frame.
+  if (cpi->common.current_video_frame == 0) {
+    cpi->twopass.est_max_qcorrection_factor = 1.0;
+
+    // Set a cq_level in constrained quality mode.
+    if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
+      int est_cq;
+
+      est_cq =
+        estimate_cq(cpi,
+                    cpi->twopass.total_left_stats,
+                    (int)(cpi->twopass.bits_left / frames_left),
+                    overhead_bits);
+
+      cpi->cq_target_quality = cpi->oxcf.cq_level;
+      if (est_cq > cpi->cq_target_quality)
+        cpi->cq_target_quality = est_cq;
+    }
+
+    // Guess at the maxq needed in the second pass.
+    cpi->twopass.maxq_max_limit = cpi->worst_quality;
+    cpi->twopass.maxq_min_limit = cpi->best_quality;
+
+    tmp_q = estimate_max_q(
+              cpi,
+              cpi->twopass.total_left_stats,
+              (int)(cpi->twopass.bits_left / frames_left),
+              overhead_bits);
+
+    cpi->active_worst_quality         = tmp_q;
+    cpi->ni_av_qi                     = tmp_q;
+    cpi->avg_q                        = vp9_convert_qindex_to_q(tmp_q);
+
+    // Limit the maxq value returned subsequently.
+    // This increases the risk of overspend or underspend if the initial
+    // estimate for the clip is bad, but helps prevent excessive
+    // variation in Q, especially near the end of a clip
+    // where for example a small overspend may cause Q to crash
+    adjust_maxq_qrange(cpi);
+  }
+
+  // The last few frames of a clip almost always have too few or too many
+  // bits, and we don't want to make radical adjustments to the allowed
+  // quantizer range, in pursuit of over-exact rate control, just to use
+  // up a few surplus bits or get beneath the target rate.
+  else if ((cpi->common.current_video_frame <
+            (((unsigned int)cpi->twopass.total_stats->count * 255) >> 8)) &&
+           ((cpi->common.current_video_frame + cpi->baseline_gf_interval) <
+            (unsigned int)cpi->twopass.total_stats->count)) {
+    if (frames_left < 1)
+      frames_left = 1;
+
+    tmp_q = estimate_max_q(
+              cpi,
+              cpi->twopass.total_left_stats,
+              (int)(cpi->twopass.bits_left / frames_left),
+              overhead_bits);
+
+    // Make a damped adjustment to active max Q
+    cpi->active_worst_quality =
+      adjust_active_maxq(cpi->active_worst_quality, tmp_q);
+  }
+
+  cpi->twopass.frames_to_key--;
+
+  // Update the total stats remaining structure.
+  subtract_stats(cpi->twopass.total_left_stats, &this_frame);
+}
+
+
+static BOOL test_candidate_kf(VP9_COMP *cpi,
+                              FIRSTPASS_STATS *last_frame,
+                              FIRSTPASS_STATS *this_frame,
+                              FIRSTPASS_STATS *next_frame) {
+  BOOL is_viable_kf = FALSE;
+
+  // Does the frame satisfy the primary criteria of a key frame?
+  // If so, then examine how well it predicts subsequent frames.
+  if ((this_frame->pcnt_second_ref < 0.10) &&
+      (next_frame->pcnt_second_ref < 0.10) &&
+      ((this_frame->pcnt_inter < 0.05) ||
+       (
+         ((this_frame->pcnt_inter - this_frame->pcnt_neutral) < .35) &&
+         ((this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < 2.5) &&
+         ((fabs(last_frame->coded_error - this_frame->coded_error) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > .40) ||
+          (fabs(last_frame->intra_error - this_frame->intra_error) / DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > .40) ||
+          ((next_frame->intra_error / DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) > 3.5)
+         )
+       )
+      )
+     ) {
+    int i;
+    FIRSTPASS_STATS *start_pos;
+
+    FIRSTPASS_STATS local_next_frame;
+
+    double boost_score = 0.0;
+    double old_boost_score = 0.0;
+    double decay_accumulator = 1.0;
+    double next_iiratio;
+
+    vpx_memcpy(&local_next_frame, next_frame, sizeof(*next_frame));
+
+    // Note the starting file position so we can reset to it
+    start_pos = cpi->twopass.stats_in;
+
+    // Examine how well the key frame predicts subsequent frames
+    for (i = 0; i < 16; i++) {
+      next_iiratio = (IIKFACTOR1 * local_next_frame.intra_error /
+                      DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error));
+
+      if (next_iiratio > RMAX)
+        next_iiratio = RMAX;
+
+      // Cumulative effect of decay in prediction quality
+      if (local_next_frame.pcnt_inter > 0.85)
+        decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter;
+      else
+        decay_accumulator = decay_accumulator *
+                            ((0.85 + local_next_frame.pcnt_inter) / 2.0);
+
+
+      // Keep a running total
+      boost_score += (decay_accumulator * next_iiratio);
+
+      // Test various breakout clauses
+      if ((local_next_frame.pcnt_inter < 0.05) ||
+          (next_iiratio < 1.5) ||
+          (((local_next_frame.pcnt_inter -
+             local_next_frame.pcnt_neutral) < 0.20) &&
+           (next_iiratio < 3.0)) ||
+          ((boost_score - old_boost_score) < 3.0) ||
+          (local_next_frame.intra_error < 200)
+         ) {
+        break;
+      }
+
+      old_boost_score = boost_score;
+
+      // Get the next frame details
+      if (EOF == input_stats(cpi, &local_next_frame))
+        break;
+    }
+
+    // If there is tolerable prediction for at least the next 3 frames
+    // then break out, else discard this potential key frame and move on.
+    if (boost_score > 30.0 && (i > 3))
+      is_viable_kf = TRUE;
+    else {
+      // Reset the file position
+      reset_fpf_position(cpi, start_pos);
+
+      is_viable_kf = FALSE;
+    }
+  }
+
+  return is_viable_kf;
+}
+
+static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+  int i, j;
+  FIRSTPASS_STATS last_frame;
+  FIRSTPASS_STATS first_frame;
+  FIRSTPASS_STATS next_frame;
+  FIRSTPASS_STATS *start_position;
+
+  double decay_accumulator = 1.0;
+  double zero_motion_accumulator = 1.0;
+  double boost_score = 0;
+  double old_boost_score = 0.0;
+  double loop_decay_rate;
+
+  double kf_mod_err = 0.0;
+  double kf_group_err = 0.0;
+  double kf_group_intra_err = 0.0;
+  double kf_group_coded_err = 0.0;
+  double recent_loop_decay[8] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
+
+  vpx_memset(&next_frame, 0, sizeof(next_frame)); // ensure a clean start
+
+  vp9_clear_system_state();  // __asm emms;
+  start_position = cpi->twopass.stats_in;
+
+  cpi->common.frame_type = KEY_FRAME;
+
+  // Is this a key frame forced by the key frame interval?
+  cpi->this_key_frame_forced = cpi->next_key_frame_forced;
+
+  // Clear the alt ref active flag as this can never be active on a key frame
+  cpi->source_alt_ref_active = FALSE;
+
+  // Kf is always a gf so clear frames till next gf counter
+  cpi->frames_till_gf_update_due = 0;
+
+  cpi->twopass.frames_to_key = 1;
+
+  // Take a copy of the initial frame details
+  vpx_memcpy(&first_frame, this_frame, sizeof(*this_frame));
+
+  cpi->twopass.kf_group_bits = 0;        // Total bits available to kf group
+  cpi->twopass.kf_group_error_left = 0;  // Group modified error score.
+
+  kf_mod_err = calculate_modified_err(cpi, this_frame);
+
+  // find the next keyframe
+  i = 0;
+  while (cpi->twopass.stats_in < cpi->twopass.stats_in_end) {
+    // Accumulate kf group error
+    kf_group_err += calculate_modified_err(cpi, this_frame);
+
+    // These figures keep intra and coded error counts for all frames
+    // including key frames in the group. The effect of the key frame
+    // itself can be subtracted out using the first_frame data collected
+    // above.
+    kf_group_intra_err += this_frame->intra_error;
+    kf_group_coded_err += this_frame->coded_error;
+
+    // Load the next frame's stats.
+    vpx_memcpy(&last_frame, this_frame, sizeof(*this_frame));
+    input_stats(cpi, this_frame);
+
+    // Provided that we are not at the end of the file...
+    if (cpi->oxcf.auto_key
+        && lookup_next_frame_stats(cpi, &next_frame) != EOF) {
+      // Normal scene cut check
+      if (test_candidate_kf(cpi, &last_frame, this_frame, &next_frame)) {
+        break;
+      }
+
+      // How fast is prediction quality decaying
+      loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+
+      // We want to know something about the recent past... rather than,
+      // as used elsewhere, where we are concerned with decay in prediction
+      // quality since the last GF or KF.
+      recent_loop_decay[i % 8] = loop_decay_rate;
+      decay_accumulator = 1.0;
+      for (j = 0; j < 8; j++) {
+        decay_accumulator = decay_accumulator * recent_loop_decay[j];
+      }
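+      // decay_accumulator now holds the product of the last (up to) 8
+      // loop decay rates: a short-horizon measure of prediction decay.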
+
+      // Special check for a transition or high motion followed by a
+      // static scene.
+      if (detect_transition_to_still(cpi, i,
+                                     (cpi->key_frame_frequency - i),
+                                     loop_decay_rate,
+                                     decay_accumulator)) {
+        break;
+      }
+
+      // Step on to the next frame
+      cpi->twopass.frames_to_key++;
+
+      // If we don't have a real key frame within the next two
+      // key frame frequency intervals then break out of the loop.
+      if (cpi->twopass.frames_to_key >= 2 * (int)cpi->key_frame_frequency)
+        break;
+    } else
+      cpi->twopass.frames_to_key++;
+
+    i++;
+  }
+
+  // If there is a max kf interval set by the user we must obey it.
+  // We already break out of the loop above at 2x max.
+  // This code centers the extra kf if the actual natural
+  // interval is between 1x and 2x.
+  if (cpi->oxcf.auto_key
+      && cpi->twopass.frames_to_key > (int)cpi->key_frame_frequency) {
+    FIRSTPASS_STATS *current_pos = cpi->twopass.stats_in;
+    FIRSTPASS_STATS tmp_frame;
+
+    cpi->twopass.frames_to_key /= 2;
+
+    // Copy first frame details
+    vpx_memcpy(&tmp_frame, &first_frame, sizeof(first_frame));
+
+    // Reset to the start of the group
+    reset_fpf_position(cpi, start_position);
+
+    kf_group_err = 0;
+    kf_group_intra_err = 0;
+    kf_group_coded_err = 0;
+
+    // Rescan to get the correct error data for the forced kf group
+    for (i = 0; i < cpi->twopass.frames_to_key; i++) {
+      // Accumulate kf group errors
+      kf_group_err += calculate_modified_err(cpi, &tmp_frame);
+      kf_group_intra_err += tmp_frame.intra_error;
+      kf_group_coded_err += tmp_frame.coded_error;
+
+      // Load the next frame's stats.
+      input_stats(cpi, &tmp_frame);
+    }
+
+    // Reset to the start of the group
+    reset_fpf_position(cpi, current_pos);
+
+    cpi->next_key_frame_forced = TRUE;
+  } else
+    cpi->next_key_frame_forced = FALSE;
+
+  // Special case for the last frame of the file
+  if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end) {
+    // Accumulate kf group error
+    kf_group_err += calculate_modified_err(cpi, this_frame);
+
+    // These figures keep intra and coded error counts for all frames
+    // including key frames in the group. The effect of the key frame
+    // itself can be subtracted out using the first_frame data collected
+    // above.
+    kf_group_intra_err += this_frame->intra_error;
+    kf_group_coded_err += this_frame->coded_error;
+  }
+
+  // Calculate the number of bits that should be assigned to the kf group.
+  if ((cpi->twopass.bits_left > 0) && (cpi->twopass.modified_error_left > 0.0)) {
+    // Max for a single normal frame (not key frame)
+    int max_bits = frame_max_bits(cpi);
+
+    // Maximum bits for the kf group
+    int64_t max_grp_bits;
+
+    // Default allocation based on bits left and relative
+    // complexity of the section
+    cpi->twopass.kf_group_bits = (int64_t)(cpi->twopass.bits_left *
+                                           (kf_group_err /
+                                            cpi->twopass.modified_error_left));
+
+    // Clip based on maximum per frame rate defined by the user.
+    max_grp_bits = (int64_t)max_bits * (int64_t)cpi->twopass.frames_to_key;
+    if (cpi->twopass.kf_group_bits > max_grp_bits)
+      cpi->twopass.kf_group_bits = max_grp_bits;
+  } else
+    cpi->twopass.kf_group_bits = 0;
+
+  // Reset the first pass file position
+  reset_fpf_position(cpi, start_position);
+
+  // Determine how big to make this keyframe based on how well the
+  // subsequent frames use inter blocks.
+  decay_accumulator = 1.0;
+  boost_score = 0.0;
+  loop_decay_rate = 1.00;       // Starting decay rate
+
+  for (i = 0; i < cpi->twopass.frames_to_key; i++) {
+    double r;
+
+    if (EOF == input_stats(cpi, &next_frame))
+      break;
+
+    if (next_frame.intra_error > cpi->twopass.kf_intra_err_min)
+      r = (IIKFACTOR2 * next_frame.intra_error /
+           DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
+    else
+      r = (IIKFACTOR2 * cpi->twopass.kf_intra_err_min /
+           DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
+
+    if (r > RMAX)
+      r = RMAX;
+
+    // Monitor for static sections.
+    if ((next_frame.pcnt_inter - next_frame.pcnt_motion) <
+        zero_motion_accumulator) {
+      zero_motion_accumulator =
+        (next_frame.pcnt_inter - next_frame.pcnt_motion);
+    }
+
+    // How fast is prediction quality decaying
+    if (!detect_flash(cpi, 0)) {
+      loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+      decay_accumulator = decay_accumulator * loop_decay_rate;
+      decay_accumulator = decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
+    }
+
+    boost_score += (decay_accumulator * r);
+
+    if ((i > MIN_GF_INTERVAL) &&
+        ((boost_score - old_boost_score) < 6.25)) {
+      break;
+    }
+
+    old_boost_score = boost_score;
+  }
+
+  {
+    FIRSTPASS_STATS sectionstats;
+
+    zero_stats(&sectionstats);
+    reset_fpf_position(cpi, start_position);
+
+    for (i = 0; i < cpi->twopass.frames_to_key; i++) {
+      input_stats(cpi, &next_frame);
+      accumulate_stats(&sectionstats, &next_frame);
+    }
+
+    avg_stats(&sectionstats);
+
+    cpi->twopass.section_intra_rating =
+      sectionstats.intra_error
+      / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
+  }
+
+  // Reset the first pass file position
+  reset_fpf_position(cpi, start_position);
+
+  // Work out how many bits to allocate for the key frame itself
+  if (1) {
+    int kf_boost = (int)boost_score;
+    int allocation_chunks;
+    int alt_kf_bits;
+
+    if (kf_boost < 300) {
+      kf_boost += (cpi->twopass.frames_to_key * 3);
+      if (kf_boost > 300)
+        kf_boost = 300;
+    }
+
+    if (kf_boost < 250)  // Min KF boost
+      kf_boost = 250;
+
+    // Make a note of baseline boost and the zero motion
+    // accumulator value for use elsewhere.
+    cpi->kf_boost = kf_boost;
+    cpi->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0);
+
+    // We do three calculations for kf size.
+    // The first is based on the error score for the whole kf group.
+    // The second (optionally) on the key frame's own error if this is
+    // smaller than the average for the group.
+    // The final one ensures that the frame receives at least the
+    // allocation it would have received based on its own error score vs
+    // the error score remaining.
+    // Special case if the sequence appears almost totally static:
+    // in this case we want to spend almost all of the bits on the
+    // key frame.
+    // cpi->twopass.frames_to_key-1 because the key frame itself is taken
+    // care of by kf_boost.
+    if (zero_motion_accumulator >= 0.99) {
+      allocation_chunks =
+        ((cpi->twopass.frames_to_key - 1) * 10) + kf_boost;
+    } else {
+      allocation_chunks =
+        ((cpi->twopass.frames_to_key - 1) * 100) + kf_boost;
+    }
+
+    // Prevent overflow
+    if (kf_boost > 1028) {
+      int divisor = kf_boost >> 10;
+      kf_boost /= divisor;
+      allocation_chunks /= divisor;
+    }
+
+    cpi->twopass.kf_group_bits =
+      (cpi->twopass.kf_group_bits < 0) ? 0 : cpi->twopass.kf_group_bits;
+
+    // Calculate the number of bits to be spent on the key frame.
+    cpi->twopass.kf_bits =
+      (int)((double)kf_boost *
+            ((double)cpi->twopass.kf_group_bits / (double)allocation_chunks));
+
+    // If the key frame is actually easier than the average for the
+    // kf group (which does sometimes happen, e.g. a blank intro frame)
+    // then use an alternate calculation based on the kf error score
+    // which should give a smaller key frame.
+    if (kf_mod_err < kf_group_err / cpi->twopass.frames_to_key) {
+      double  alt_kf_grp_bits =
+        ((double)cpi->twopass.bits_left *
+         (kf_mod_err * (double)cpi->twopass.frames_to_key) /
+         DOUBLE_DIVIDE_CHECK(cpi->twopass.modified_error_left));
+
+      alt_kf_bits = (int)((double)kf_boost *
+                          (alt_kf_grp_bits / (double)allocation_chunks));
+
+      if (cpi->twopass.kf_bits > alt_kf_bits) {
+        cpi->twopass.kf_bits = alt_kf_bits;
+      }
+    }
+    // Else if it is much harder than other frames in the group, make
+    // sure it at least receives an allocation in keeping with its
+    // relative error score.
+    else {
+      alt_kf_bits =
+        (int)((double)cpi->twopass.bits_left *
+              (kf_mod_err /
+               DOUBLE_DIVIDE_CHECK(cpi->twopass.modified_error_left)));
+
+      if (alt_kf_bits > cpi->twopass.kf_bits) {
+        cpi->twopass.kf_bits = alt_kf_bits;
+      }
+    }
+
+    cpi->twopass.kf_group_bits -= cpi->twopass.kf_bits;
+    cpi->twopass.kf_bits += cpi->min_frame_bandwidth;  // Add in the minimum frame allowance
+
+    cpi->per_frame_bandwidth = cpi->twopass.kf_bits;   // Per frame bit target for this frame
+    // Convert to a per second bitrate.
+    cpi->target_bandwidth = cpi->twopass.kf_bits * cpi->output_frame_rate;
+  }
+
+  // Note the total error score of the kf group minus the key frame itself
+  cpi->twopass.kf_group_error_left = (int)(kf_group_err - kf_mod_err);
+
+  // Adjust the count of total modified error left.
+  // The count of bits left is adjusted elsewhere based on real coded
+  // frame sizes.
+  cpi->twopass.modified_error_left -= kf_group_err;
+}
--- /dev/null
+++ b/vp9/encoder/firstpass.h
@@ -1,0 +1,23 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#if !defined __INC_FIRSTPASS_H
+#define      __INC_FIRSTPASS_H
+
+extern void vp9_init_first_pass(VP9_COMP *cpi);
+extern void vp9_first_pass(VP9_COMP *cpi);
+extern void vp9_end_first_pass(VP9_COMP *cpi);
+
+extern void vp9_init_second_pass(VP9_COMP *cpi);
+extern void vp9_second_pass(VP9_COMP *cpi);
+extern void vp9_end_second_pass(VP9_COMP *cpi);
+
+#endif
--- /dev/null
+++ b/vp9/encoder/generic/csystemdependent.c
@@ -1,0 +1,48 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vp9/encoder/variance.h"
+#include "vp9/encoder/onyx_int.h"
+
+
+void vp9_arch_x86_encoder_init(VP9_COMP *cpi);
+void vp9_arch_arm_encoder_init(VP9_COMP *cpi);
+
+void (*vp9_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc,
+                                        YV12_BUFFER_CONFIG *dst_ybc,
+                                        int fraction);
+extern void vp9_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc,
+                                        YV12_BUFFER_CONFIG *dst_ybc,
+                                        int fraction);
+
+void vp9_cmachine_specific_config(VP9_COMP *cpi) {
+#if CONFIG_RUNTIME_CPU_DETECT
+  cpi->rtcd.common                    = &cpi->common.rtcd;
+
+  cpi->rtcd.search.full_search             = vp9_full_search_sad;
+  cpi->rtcd.search.refining_search         = vp9_refining_search_sad;
+  cpi->rtcd.search.diamond_search          = vp9_diamond_search_sad;
+  cpi->rtcd.temporal.apply                 = vp9_temporal_filter_apply_c;
+#endif
+
+  vp9_yv12_copy_partial_frame_ptr = vp9_yv12_copy_partial_frame;
+
+#if ARCH_X86 || ARCH_X86_64
+  vp9_arch_x86_encoder_init(cpi);
+#endif
+
+#if ARCH_ARM
+  vp9_arch_arm_encoder_init(cpi);
+#endif
+
+}
--- /dev/null
+++ b/vp9/encoder/lookahead.c
@@ -1,0 +1,191 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <assert.h>
+#include <stdlib.h>
+#include "vpx_config.h"
+#include "lookahead.h"
+#include "vp9/common/extend.h"
+
+#define MAX_LAG_BUFFERS 25
+
+struct lookahead_ctx {
+  unsigned int max_sz;         /* Absolute size of the queue */
+  unsigned int sz;             /* Number of buffers currently in the queue */
+  unsigned int read_idx;       /* Read index */
+  unsigned int write_idx;      /* Write index */
+  struct lookahead_entry *buf; /* Buffer list */
+};
+
+
+/* Return the buffer at the given absolute index and increment the index */
+static struct lookahead_entry *
+pop(struct lookahead_ctx *ctx,
+    unsigned int         *idx) {
+  unsigned int            index = *idx;
+  struct lookahead_entry *buf = ctx->buf + index;
+
+  assert(index < ctx->max_sz);
+  if (++index >= ctx->max_sz)
+    index -= ctx->max_sz;
+  *idx = index;
+  return buf;
+}
+
+
+void
+vp9_lookahead_destroy(struct lookahead_ctx *ctx) {
+  if (ctx) {
+    if (ctx->buf) {
+      int i;
+
+      for (i = 0; i < ctx->max_sz; i++)
+        vp8_yv12_de_alloc_frame_buffer(&ctx->buf[i].img);
+      free(ctx->buf);
+    }
+    free(ctx);
+  }
+}
+
+
+struct lookahead_ctx *
+vp9_lookahead_init(unsigned int width,
+                   unsigned int height,
+                   unsigned int depth) {
+  struct lookahead_ctx *ctx = NULL;
+  int i;
+
+  /* Clamp the lookahead queue depth */
+  if (depth < 1)
+    depth = 1;
+  else if (depth > MAX_LAG_BUFFERS)
+    depth = MAX_LAG_BUFFERS;
+
+  /* Align the buffer dimensions */
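+  /* e.g. width 177 becomes 192 (the next multiple of 16, one macroblock) */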
+  width = (width + 15) & ~15;
+  height = (height + 15) & ~15;
+
+  /* Allocate the lookahead structures */
+  ctx = calloc(1, sizeof(*ctx));
+  if (ctx) {
+    ctx->max_sz = depth;
+    ctx->buf = calloc(depth, sizeof(*ctx->buf));
+    if (!ctx->buf)
+      goto bail;
+    for (i = 0; i < depth; i++)
+      if (vp8_yv12_alloc_frame_buffer(&ctx->buf[i].img,
+                                      width, height, VP8BORDERINPIXELS))
+        goto bail;
+  }
+  return ctx;
+bail:
+  vp9_lookahead_destroy(ctx);
+  return NULL;
+}
+
+
+int
+vp9_lookahead_push(struct lookahead_ctx *ctx,
+                   YV12_BUFFER_CONFIG   *src,
+                   int64_t               ts_start,
+                   int64_t               ts_end,
+                   unsigned int          flags,
+                   unsigned char        *active_map) {
+  struct lookahead_entry *buf;
+  int row, col, active_end;
+  int mb_rows = (src->y_height + 15) >> 4;
+  int mb_cols = (src->y_width + 15) >> 4;
+
+  if (ctx->sz + 1 > ctx->max_sz)
+    return 1;
+  ctx->sz++;
+  buf = pop(ctx, &ctx->write_idx);
+
+  // Only do this partial copy if the following conditions are all met:
+  // 1. Lookahead queue has a size of 1.
+  // 2. Active map is provided.
+  // 3. This is not a key frame, golden frame, or altref frame.
+  if (ctx->max_sz == 1 && active_map && !flags) {
+    for (row = 0; row < mb_rows; ++row) {
+      col = 0;
+
+      while (1) {
+        // Find the first active macroblock in this row.
+        for (; col < mb_cols; ++col) {
+          if (active_map[col])
+            break;
+        }
+
+        // No more active macroblocks in this row.
+        if (col == mb_cols)
+          break;
+
+        // Find the end of active region in this row.
+        active_end = col;
+
+        for (; active_end < mb_cols; ++active_end) {
+          if (!active_map[active_end])
+            break;
+        }
+
+        // Only copy this active region.
+        vp9_copy_and_extend_frame_with_rect(src, &buf->img,
+                                            row << 4,
+                                            col << 4, 16,
+                                            (active_end - col) << 4);
+
+        // Start again from the end of this active region.
+        col = active_end;
+      }
+
+      active_map += mb_cols;
+    }
+  } else {
+    vp9_copy_and_extend_frame(src, &buf->img);
+  }
+  buf->ts_start = ts_start;
+  buf->ts_end = ts_end;
+  buf->flags = flags;
+  return 0;
+}
+
+
+struct lookahead_entry *
+vp9_lookahead_pop(struct lookahead_ctx *ctx,
+                  int                   drain) {
+  struct lookahead_entry *buf = NULL;
+
+  if (ctx->sz && (drain || ctx->sz == ctx->max_sz)) {
+    buf = pop(ctx, &ctx->read_idx);
+    ctx->sz--;
+  }
+  return buf;
+}
+
+
+struct lookahead_entry *
+vp9_lookahead_peek(struct lookahead_ctx *ctx,
+                   int                   index) {
+  struct lookahead_entry *buf = NULL;
+
+  assert(index < ctx->max_sz);
+  if (index < ctx->sz) {
+    index += ctx->read_idx;
+    if (index >= ctx->max_sz)
+      index -= ctx->max_sz;
+    buf = ctx->buf + index;
+  }
+  return buf;
+}
+
+
+unsigned int
+vp9_lookahead_depth(struct lookahead_ctx *ctx) {
+  return ctx->sz;
+}
--- /dev/null
+++ b/vp9/encoder/lookahead.h
@@ -1,0 +1,105 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef LOOKAHEAD_H
+#define LOOKAHEAD_H
+#include "vpx_scale/yv12config.h"
+#include "vpx/vpx_integer.h"
+
+struct lookahead_entry {
+  YV12_BUFFER_CONFIG  img;
+  int64_t             ts_start;
+  int64_t             ts_end;
+  unsigned int        flags;
+};
+
+
+struct lookahead_ctx;
+
+/**\brief Initializes the lookahead stage
+ *
+ * The lookahead stage is a queue of frame buffers on which some analysis
+ * may be done when buffers are enqueued.
+ *
+ *
+ */
+struct lookahead_ctx *vp9_lookahead_init(unsigned int width,
+                                         unsigned int height,
+                                         unsigned int depth
+                                        );
+
+
+/**\brief Destroys the lookahead stage
+ *
+ */
+void vp9_lookahead_destroy(struct lookahead_ctx *ctx);
+
+
+/**\brief Enqueue a source buffer
+ *
+ * This function will copy the source image into a new framebuffer with
+ * the expected stride/border.
+ *
+ * If active_map is non-NULL and the queue depth is one, then only
+ * active macroblocks are copied.
+ *
+ * \param[in] ctx         Pointer to the lookahead context
+ * \param[in] src         Pointer to the image to enqueue
+ * \param[in] ts_start    Timestamp for the start of this frame
+ * \param[in] ts_end      Timestamp for the end of this frame
+ * \param[in] flags       Flags set on this frame
+ * \param[in] active_map  Map that specifies which macroblock is active
+ */
+int
+vp9_lookahead_push(struct lookahead_ctx *ctx,
+                   YV12_BUFFER_CONFIG   *src,
+                   int64_t               ts_start,
+                   int64_t               ts_end,
+                   unsigned int          flags,
+                   unsigned char        *active_map);
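+
+/* A minimal usage sketch (illustrative only: src is an assumed,
+ * already-populated YV12_BUFFER_CONFIG, and the timestamps are
+ * placeholders; error handling elided):
+ *
+ *   struct lookahead_ctx *la = vp9_lookahead_init(640, 480, 8);
+ *   if (la && !vp9_lookahead_push(la, &src, ts_start, ts_end, 0, NULL)) {
+ *     struct lookahead_entry *e = vp9_lookahead_pop(la, 0);
+ *     // e is NULL until the queue reaches its configured depth
+ *   }
+ *   vp9_lookahead_destroy(la);
+ */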
+
+
+/**\brief Get the next source buffer to encode
+ *
+ *
+ * \param[in] ctx       Pointer to the lookahead context
+ * \param[in] drain     Flag indicating the buffer should be drained
+ *                      (return a buffer regardless of the current queue depth)
+ *
+ * \retval NULL, if drain set and queue is empty
+ * \retval NULL, if drain not set and queue not of the configured depth
+ *
+ */
+struct lookahead_entry *
+vp9_lookahead_pop(struct lookahead_ctx *ctx,
+                  int                   drain);
+
+
+/**\brief Get a future source buffer to encode
+ *
+ * \param[in] ctx       Pointer to the lookahead context
+ * \param[in] index     Index of the frame to be returned, 0 == next frame
+ *
+ * \retval NULL, if no buffer exists at the specified index
+ *
+ */
+struct lookahead_entry *
+vp9_lookahead_peek(struct lookahead_ctx *ctx,
+                   int                   index);
+
+
+/**\brief Get the number of frames currently in the lookahead queue
+ *
+ * \param[in] ctx       Pointer to the lookahead context
+ */
+unsigned int
+vp9_lookahead_depth(struct lookahead_ctx *ctx);
+
+
+#endif
--- /dev/null
+++ b/vp9/encoder/mbgraph.c
@@ -1,0 +1,480 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+#include <vp9/encoder/encodeintra.h>
+#include <vp9/encoder/rdopt.h>
+#include <vp9/common/setupintrarecon.h>
+#include <vp9/common/blockd.h>
+#include <vp9/common/reconinter.h>
+#include <vp9/common/systemdependent.h>
+#include <vpx_mem/vpx_mem.h>
+#include <vp9/encoder/segmentation.h>
+
+static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
+                                              int_mv *ref_mv,
+                                              int_mv *dst_mv) {
+  MACROBLOCK   *const x  = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  BLOCK *b  = &x->block[0];
+  BLOCKD *d = &xd->block[0];
+  vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
+  unsigned int best_err;
+  int step_param, further_steps;
+
+  int tmp_col_min = x->mv_col_min;
+  int tmp_col_max = x->mv_col_max;
+  int tmp_row_min = x->mv_row_min;
+  int tmp_row_max = x->mv_row_max;
+  int_mv ref_full;
+
+  // Further step/diamond searches as necessary
+  if (cpi->Speed < 8) {
+    step_param = cpi->sf.first_step + ((cpi->Speed > 5) ? 1 : 0);
+    further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
+  } else {
+    step_param = cpi->sf.first_step + 2;
+    further_steps = 0;
+  }
+
+  vp9_clamp_mv_min_max(x, ref_mv);
+
+  ref_full.as_mv.col = ref_mv->as_mv.col >> 3;
+  ref_full.as_mv.row = ref_mv->as_mv.row >> 3;
+
+  /*cpi->sf.search_method == HEX*/
+  best_err = vp9_hex_search(
+      x, b, d,
+      &ref_full, dst_mv,
+      step_param,
+      x->errorperbit,
+      &v_fn_ptr,
+      NULLMVCOST,
+      NULLMVCOST,
+      ref_mv);
+
+  // Try sub-pixel MC
+  // if (bestsme > error_thresh && bestsme < INT_MAX)
+  {
+    int distortion;
+    unsigned int sse;
+    best_err = cpi->find_fractional_mv_step(
+        x, b, d,
+        dst_mv, ref_mv,
+        x->errorperbit, &v_fn_ptr,
+        NULLMVCOST,
+        & distortion, &sse);
+  }
+
+#if CONFIG_PRED_FILTER
+  // Disable the prediction filter
+  xd->mode_info_context->mbmi.pred_filter_enabled = 0;
+#endif
+
+  vp9_set_mbmode_and_mvs(x, NEWMV, dst_mv);
+  vp9_build_1st_inter16x16_predictors_mby(xd, xd->predictor, 16, 0);
+  best_err = vp9_sad16x16(xd->dst.y_buffer, xd->dst.y_stride,
+                          xd->predictor, 16, INT_MAX);
+
+  /* restore UMV window */
+  x->mv_col_min = tmp_col_min;
+  x->mv_col_max = tmp_col_max;
+  x->mv_row_min = tmp_row_min;
+  x->mv_row_max = tmp_row_max;
+
+  return best_err;
+}
+
+static int do_16x16_motion_search(VP9_COMP *cpi,
+                                  int_mv *ref_mv,
+                                  int_mv *dst_mv,
+                                  YV12_BUFFER_CONFIG *buf,
+                                  int buf_mb_y_offset,
+                                  YV12_BUFFER_CONFIG *ref,
+                                  int mb_y_offset) {
+  MACROBLOCK   *const x  = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  unsigned int err, tmp_err;
+  int_mv tmp_mv;
+  int n;
+
+  for (n = 0; n < 16; n++) {
+    BLOCKD *d = &xd->block[n];
+    BLOCK *b  = &x->block[n];
+
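+    // Sub-block n covers row (n >> 2), column (n & 3) of the MB's 4x4 grid;
+    // (n & 12) == (n >> 2) * 4, so the offsets below address each 4x4
+    // sub-block's pixel position within the 16x16 macroblock.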
+    b->base_src   = &buf->y_buffer;
+    b->src_stride = buf->y_stride;
+    b->src        = buf->y_stride * (n & 12) + (n & 3) * 4 + buf_mb_y_offset;
+
+    d->base_pre   = &ref->y_buffer;
+    d->pre_stride = ref->y_stride;
+    d->pre        = ref->y_stride * (n & 12) + (n & 3) * 4 + mb_y_offset;
+  }
+
+  // Try zero MV first
+  // FIXME should really use something like near/nearest MV and/or MV prediction
+  xd->pre.y_buffer = ref->y_buffer + mb_y_offset;
+  xd->pre.y_stride = ref->y_stride;
+  err = vp9_sad16x16(ref->y_buffer + mb_y_offset, ref->y_stride,
+                     xd->dst.y_buffer, xd->dst.y_stride, INT_MAX);
+  dst_mv->as_int = 0;
+
+  // Test last reference frame using the previous best mv as the
+  // starting point (best reference) for the search
+  tmp_err = do_16x16_motion_iteration(cpi, ref_mv, &tmp_mv);
+  if (tmp_err < err) {
+    err            = tmp_err;
+    dst_mv->as_int = tmp_mv.as_int;
+  }
+
+  // If the current best reference mv is not centered on 0,0, then do a
+  // 0,0-based search as well.
+  if (ref_mv->as_int) {
+    int tmp_err;
+    int_mv zero_ref_mv, tmp_mv;
+
+    zero_ref_mv.as_int = 0;
+    tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv, &tmp_mv);
+    if (tmp_err < err) {
+      dst_mv->as_int = tmp_mv.as_int;
+      err = tmp_err;
+    }
+  }
+
+  return err;
+}
+
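+// Error of the zero-MV prediction only: no motion search is performed;
+// dst_mv is always set to 0 and the SAD of the co-located reference block
+// against the source is returned.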
+static int do_16x16_zerozero_search(VP9_COMP *cpi,
+                                    int_mv *dst_mv,
+                                    YV12_BUFFER_CONFIG *buf,
+                                    int buf_mb_y_offset,
+                                    YV12_BUFFER_CONFIG *ref,
+                                    int mb_y_offset) {
+  MACROBLOCK   *const x  = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  unsigned int err;
+  int n;
+
+  for (n = 0; n < 16; n++) {
+    BLOCKD *d = &xd->block[n];
+    BLOCK *b  = &x->block[n];
+
+    b->base_src   = &buf->y_buffer;
+    b->src_stride = buf->y_stride;
+    b->src        = buf->y_stride * (n & 12) + (n & 3) * 4 + buf_mb_y_offset;
+
+    d->base_pre   = &ref->y_buffer;
+    d->pre_stride = ref->y_stride;
+    d->pre        = ref->y_stride * (n & 12) + (n & 3) * 4 + mb_y_offset;
+  }
+
+  // Try zero MV first
+  // FIXME should really use something like near/nearest MV and/or MV prediction
+  xd->pre.y_buffer = ref->y_buffer + mb_y_offset;
+  xd->pre.y_stride = ref->y_stride;
+  // VARIANCE_INVOKE(&cpi->rtcd.variance, satd16x16)
+  err = vp9_sad16x16(ref->y_buffer + mb_y_offset, ref->y_stride,
+                     xd->dst.y_buffer, xd->dst.y_stride, INT_MAX);
+
+  dst_mv->as_int = 0;
+
+  return err;
+}
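+
+// Pick the best whole-MB intra mode (DC_PRED through TM_PRED) by SAD and
+// return its error; optionally report the winning mode via pbest_mode.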
+static int find_best_16x16_intra(VP9_COMP *cpi,
+                                 YV12_BUFFER_CONFIG *buf,
+                                 int mb_y_offset,
+                                 MB_PREDICTION_MODE *pbest_mode) {
+  MACROBLOCK   *const x  = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_PREDICTION_MODE best_mode = -1, mode;
+  int best_err = INT_MAX;
+
+  // calculate SAD for each intra prediction mode;
+  // we're intentionally not doing 4x4, we just want a rough estimate
+  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+    unsigned int err;
+
+    xd->mode_info_context->mbmi.mode = mode;
+    vp9_build_intra_predictors_mby(xd);
+    err = vp9_sad16x16(xd->predictor, 16, buf->y_buffer + mb_y_offset,
+                       buf->y_stride, best_err);
+    // find best
+    if (err < best_err) {
+      best_err  = err;
+      best_mode = mode;
+    }
+  }
+
+  if (pbest_mode)
+    *pbest_mode = best_mode;
+
+  return best_err;
+}
+
+static void update_mbgraph_mb_stats(VP9_COMP *cpi,
+                                    MBGRAPH_MB_STATS *stats,
+                                    YV12_BUFFER_CONFIG *buf,
+                                    int mb_y_offset,
+                                    YV12_BUFFER_CONFIG *golden_ref,
+                                    int_mv *prev_golden_ref_mv,
+                                    int gld_y_offset,
+                                    YV12_BUFFER_CONFIG *alt_ref,
+                                    int_mv *prev_alt_ref_mv,
+                                    int arf_y_offset) {
+  MACROBLOCK   *const x  = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int intra_error;
+
+  // FIXME in practice we're completely ignoring chroma here
+  xd->dst.y_buffer = buf->y_buffer + mb_y_offset;
+
+  // do intra 16x16 prediction
+  intra_error = find_best_16x16_intra(cpi, buf, mb_y_offset,
+                                      &stats->ref[INTRA_FRAME].m.mode);
+  if (intra_error <= 0)
+    intra_error = 1;
+  stats->ref[INTRA_FRAME].err = intra_error;
+
+  // Golden frame MV search, if it exists and is different from the last frame
+  if (golden_ref) {
+    int g_motion_error = do_16x16_motion_search(cpi, prev_golden_ref_mv,
+                                                &stats->ref[GOLDEN_FRAME].m.mv,
+                                                buf, mb_y_offset,
+                                                golden_ref, gld_y_offset);
+    stats->ref[GOLDEN_FRAME].err = g_motion_error;
+  } else {
+    stats->ref[GOLDEN_FRAME].err = INT_MAX;
+    stats->ref[GOLDEN_FRAME].m.mv.as_int = 0;
+  }
+
+  // Alt-ref frame MV search, if it exists and is different from the
+  // last/golden frame
+  if (alt_ref) {
+    // int a_motion_error = do_16x16_motion_search(cpi, prev_alt_ref_mv,
+    //                                            &stats->ref[ALTREF_FRAME].m.mv,
+    //                                            buf, mb_y_offset,
+    //                                            alt_ref, arf_y_offset);
+
+    int a_motion_error =
+      do_16x16_zerozero_search(cpi,
+                               &stats->ref[ALTREF_FRAME].m.mv,
+                               buf, mb_y_offset,
+                               alt_ref, arf_y_offset);
+
+    stats->ref[ALTREF_FRAME].err = a_motion_error;
+  } else {
+    stats->ref[ALTREF_FRAME].err = INT_MAX;
+    stats->ref[ALTREF_FRAME].m.mv.as_int = 0;
+  }
+}
+
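+// Collect per-MB stats for a whole frame in raster order, propagating each
+// MB's best golden/alt-ref MVs to its right neighbor (and the first
+// column's MVs down to the next row) as search predictors.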
+static void update_mbgraph_frame_stats(VP9_COMP *cpi,
+                                       MBGRAPH_FRAME_STATS *stats,
+                                       YV12_BUFFER_CONFIG *buf,
+                                       YV12_BUFFER_CONFIG *golden_ref,
+                                       YV12_BUFFER_CONFIG *alt_ref) {
+  MACROBLOCK   *const x  = &cpi->mb;
+  VP9_COMMON   *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int mb_col, mb_row, offset = 0;
+  int mb_y_offset = 0, arf_y_offset = 0, gld_y_offset = 0;
+  int_mv arf_top_mv, gld_top_mv;
+  MODE_INFO mi_local;
+
+  // Set up limit values for motion vectors to prevent them from extending
+  // outside the UMV borders.
+  arf_top_mv.as_int = 0;
+  gld_top_mv.as_int = 0;
+  x->mv_row_min     = -(VP8BORDERINPIXELS - 16 - INTERP_EXTEND);
+  x->mv_row_max     = (cm->mb_rows - 1) * 16 + VP8BORDERINPIXELS - 16 - INTERP_EXTEND;
+  xd->up_available  = 0;
+  xd->dst.y_stride  = buf->y_stride;
+  xd->pre.y_stride  = buf->y_stride;
+  xd->dst.uv_stride = buf->uv_stride;
+  xd->mode_info_context = &mi_local;
+
+  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
+    int_mv arf_left_mv, gld_left_mv;
+    int mb_y_in_offset  = mb_y_offset;
+    int arf_y_in_offset = arf_y_offset;
+    int gld_y_in_offset = gld_y_offset;
+
+    // Set up limit values for motion vectors to prevent them from extending
+    // outside the UMV borders.
+    arf_left_mv.as_int = arf_top_mv.as_int;
+    gld_left_mv.as_int = gld_top_mv.as_int;
+    x->mv_col_min      = -(VP8BORDERINPIXELS - 16 - INTERP_EXTEND);
+    x->mv_col_max      = (cm->mb_cols - 1) * 16 + VP8BORDERINPIXELS - 16 - INTERP_EXTEND;
+    xd->left_available = 0;
+
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+      MBGRAPH_MB_STATS *mb_stats = &stats->mb_stats[offset + mb_col];
+
+      update_mbgraph_mb_stats(cpi, mb_stats, buf, mb_y_in_offset,
+                              golden_ref, &gld_left_mv, gld_y_in_offset,
+                              alt_ref,    &arf_left_mv, arf_y_in_offset);
+      arf_left_mv.as_int = mb_stats->ref[ALTREF_FRAME].m.mv.as_int;
+      gld_left_mv.as_int = mb_stats->ref[GOLDEN_FRAME].m.mv.as_int;
+      if (mb_col == 0) {
+        arf_top_mv.as_int = arf_left_mv.as_int;
+        gld_top_mv.as_int = gld_left_mv.as_int;
+      }
+      xd->left_available = 1;
+      mb_y_in_offset    += 16;
+      gld_y_in_offset   += 16;
+      arf_y_in_offset   += 16;
+      x->mv_col_min     -= 16;
+      x->mv_col_max     -= 16;
+    }
+    xd->up_available = 1;
+    mb_y_offset     += buf->y_stride * 16;
+    gld_y_offset    += golden_ref->y_stride * 16;
+    if (alt_ref)
+      arf_y_offset    += alt_ref->y_stride * 16;
+    x->mv_row_min   -= 16;
+    x->mv_row_max   -= 16;
+    offset          += cm->mb_cols;
+  }
+}
+
+// void separate_arf_mbs_byzz
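+// Split the frame's MBs into two segments: segment 1 holds MBs whose
+// zero-MV alt-ref prediction stayed good across the whole GF group,
+// segment 0 holds everything else.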
+static void separate_arf_mbs(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  int mb_col, mb_row, offset, i;
+  int ncnt[4];
+  int n_frames = cpi->mbgraph_n_frames;
+
+  int *arf_not_zz;
+
+  CHECK_MEM_ERROR(arf_not_zz,
+                  vpx_calloc(cm->mb_rows * cm->mb_cols * sizeof(*arf_not_zz), 1));
+
+  vpx_memset(arf_not_zz, 0,
+             cm->mb_rows * cm->mb_cols * sizeof(*arf_not_zz));
+
+  // We are not interested in results beyond the alt ref itself.
+  if (n_frames > cpi->frames_till_gf_update_due)
+    n_frames = cpi->frames_till_gf_update_due;
+
+  // defer cost to reference frames
+  for (i = n_frames - 1; i >= 0; i--) {
+    MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
+
+    for (offset = 0, mb_row = 0; mb_row < cm->mb_rows;
+         offset += cm->mb_cols, mb_row++) {
+      for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+        MBGRAPH_MB_STATS *mb_stats =
+          &frame_stats->mb_stats[offset + mb_col];
+
+        int altref_err = mb_stats->ref[ALTREF_FRAME].err;
+        int intra_err  = mb_stats->ref[INTRA_FRAME ].err;
+        int golden_err = mb_stats->ref[GOLDEN_FRAME].err;
+
+        // Mark the MB if the zero-MV alt-ref prediction was poor, either in
+        // absolute terms or relative to the intra/golden predictions.
+        if ((altref_err > 1000) ||
+            (altref_err > intra_err) ||
+            (altref_err > golden_err)) {
+          arf_not_zz[offset + mb_col]++;
+        }
+      }
+    }
+  }
+
+  vpx_memset(ncnt, 0, sizeof(ncnt));
+  for (offset = 0, mb_row = 0; mb_row < cm->mb_rows;
+       offset += cm->mb_cols, mb_row++) {
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+      // If any of the blocks in the sequence failed then the MB
+      // goes in segment 0
+      if (arf_not_zz[offset + mb_col]) {
+        ncnt[0]++;
+        cpi->segmentation_map[offset + mb_col] = 0;
+      } else {
+        ncnt[1]++;
+        cpi->segmentation_map[offset + mb_col] = 1;
+      }
+    }
+  }
+
+  // Only bother with segmentation if over 10% of the MBs are in the static
+  // segment
+  // if ( ncnt[1] && (ncnt[0] / ncnt[1] < 10) )
+  if (1) {
+    // Note % of blocks that are marked as static
+    if (cm->MBs)
+      cpi->static_mb_pct = (ncnt[1] * 100) / cm->MBs;
+
+    // This error case should not be reachable as this function should
+    // never be called with the common data structure uninitialized.
+    else
+      cpi->static_mb_pct = 0;
+
+    cpi->seg0_cnt = ncnt[0];
+    vp9_enable_segmentation((VP9_PTR) cpi);
+  } else {
+    cpi->static_mb_pct = 0;
+    vp9_disable_segmentation((VP9_PTR) cpi);
+  }
+
+  // Free locally allocated storage
+  vpx_free(arf_not_zz);
+}
+
+void vp9_update_mbgraph_stats(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  int i, n_frames = vp9_lookahead_depth(cpi->lookahead);
+  YV12_BUFFER_CONFIG *golden_ref = &cm->yv12_fb[cm->gld_fb_idx];
+
+  // We need to look ahead beyond where the ARF transitions into being a GF,
+  // so exit if we don't look ahead beyond that point.
+  if (n_frames <= cpi->frames_till_gf_update_due)
+    return;
+  if (n_frames > cpi->common.frames_till_alt_ref_frame)
+    n_frames = cpi->common.frames_till_alt_ref_frame;
+  if (n_frames > MAX_LAG_BUFFERS)
+    n_frames = MAX_LAG_BUFFERS;
+
+  cpi->mbgraph_n_frames = n_frames;
+  for (i = 0; i < n_frames; i++) {
+    MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
+    vpx_memset(frame_stats->mb_stats, 0,
+               cm->mb_rows * cm->mb_cols *
+               sizeof(*cpi->mbgraph_stats[i].mb_stats));
+  }
+
+  // do motion search to find contribution of each reference to data
+  // later on in this GF group
+  // FIXME really, the GF/last MC search should be done forward, and
+  // the ARF MC search backwards, to get optimal results for MV caching
+  for (i = 0; i < n_frames; i++) {
+    MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
+    struct lookahead_entry *q_cur =
+      vp9_lookahead_peek(cpi->lookahead, i);
+
+    assert(q_cur != NULL);
+
+    update_mbgraph_frame_stats(cpi, frame_stats, &q_cur->img,
+                               golden_ref, cpi->Source);
+  }
+
+  vp9_clear_system_state();  // __asm emms;
+
+  separate_arf_mbs(cpi);
+}
--- /dev/null
+++ b/vp9/encoder/mbgraph.h
@@ -1,0 +1,16 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef __INC_MBGRAPH_H__
+#define __INC_MBGRAPH_H__ 1
+
+extern void vp9_update_mbgraph_stats(VP9_COMP *cpi);
+
+#endif /* __INC_MBGRAPH_H__ */
--- /dev/null
+++ b/vp9/encoder/mcomp.c
@@ -1,0 +1,2203 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp9/encoder/onyx_int.h"
+#include "mcomp.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/config.h"
+#include <stdio.h>
+#include <limits.h>
+#include <math.h>
+#include "vp9/common/findnearmv.h"
+
+#ifdef ENTROPY_STATS
+static int mv_ref_ct[31][4][2];
+static int mv_mode_cts[4][2];
+#endif
+
+void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv) {
+  int col_min = (ref_mv->as_mv.col >> 3) - MAX_FULL_PEL_VAL +
+      ((ref_mv->as_mv.col & 7) ? 1 : 0);
+  int row_min = (ref_mv->as_mv.row >> 3) - MAX_FULL_PEL_VAL +
+      ((ref_mv->as_mv.row & 7) ? 1 : 0);
+  int col_max = (ref_mv->as_mv.col >> 3) + MAX_FULL_PEL_VAL;
+  int row_max = (ref_mv->as_mv.row >> 3) + MAX_FULL_PEL_VAL;
+
+  /* Get the intersection of the UMV window and the valid MV window to
+   * reduce the number of checks in the diamond search. */
+  if (x->mv_col_min < col_min)
+    x->mv_col_min = col_min;
+  if (x->mv_col_max > col_max)
+    x->mv_col_max = col_max;
+  if (x->mv_row_min < row_min)
+    x->mv_row_min = row_min;
+  if (x->mv_row_max > row_max)
+    x->mv_row_max = row_max;
+}
+
+int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, DEC_MVCOSTS,
+                    int Weight, int ishp) {
+  MV v;
+  v.row = (mv->as_mv.row - ref->as_mv.row);
+  v.col = (mv->as_mv.col - ref->as_mv.col);
+  return ((mvjcost[vp9_get_mv_joint(v)] +
+           mvcost[0][v.row] + mvcost[1][v.col]) *
+          Weight) >> 7;
+}
+
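+// Rate cost of coding (mv - ref): joint + per-component rates, scaled by
+// error_per_bit with Q8 rounding. Returns 0 when no cost tables are given.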
+static int mv_err_cost(int_mv *mv, int_mv *ref, DEC_MVCOSTS,
+                       int error_per_bit, int ishp) {
+  if (mvcost) {
+    MV v;
+    v.row = (mv->as_mv.row - ref->as_mv.row);
+    v.col = (mv->as_mv.col - ref->as_mv.col);
+    return ((mvjcost[vp9_get_mv_joint(v)] +
+             mvcost[0][v.row] + mvcost[1][v.col]) *
+            error_per_bit + 128) >> 8;
+  }
+  return 0;
+}
+
+static int mvsad_err_cost(int_mv *mv, int_mv *ref, DEC_MVSADCOSTS,
+                          int error_per_bit) {
+  if (mvsadcost) {
+    MV v;
+    v.row = (mv->as_mv.row - ref->as_mv.row);
+    v.col = (mv->as_mv.col - ref->as_mv.col);
+    return ((mvjsadcost[vp9_get_mv_joint(v)] +
+             mvsadcost[0][v.row] + mvsadcost[1][v.col]) *
+            error_per_bit + 128) >> 8;
+  }
+  return 0;
+}
+
+void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride) {
+  int Len;
+  int search_site_count = 0;
+
+  // Generate offsets for 4 search sites per step.
+  Len = MAX_FIRST_STEP;
+  x->ss[search_site_count].mv.col = 0;
+  x->ss[search_site_count].mv.row = 0;
+  x->ss[search_site_count].offset = 0;
+  search_site_count++;
+
+  while (Len > 0) {
+
+    // Compute offsets for search sites.
+    x->ss[search_site_count].mv.col = 0;
+    x->ss[search_site_count].mv.row = -Len;
+    x->ss[search_site_count].offset = -Len * stride;
+    search_site_count++;
+
+    // Compute offsets for search sites.
+    x->ss[search_site_count].mv.col = 0;
+    x->ss[search_site_count].mv.row = Len;
+    x->ss[search_site_count].offset = Len * stride;
+    search_site_count++;
+
+    // Compute offsets for search sites.
+    x->ss[search_site_count].mv.col = -Len;
+    x->ss[search_site_count].mv.row = 0;
+    x->ss[search_site_count].offset = -Len;
+    search_site_count++;
+
+    // Compute offsets for search sites.
+    x->ss[search_site_count].mv.col = Len;
+    x->ss[search_site_count].mv.row = 0;
+    x->ss[search_site_count].offset = Len;
+    search_site_count++;
+
+    // Contract.
+    Len /= 2;
+  }
+
+  x->ss_count = search_site_count;
+  x->searches_per_step = 4;
+}
+
+void vp9_init3smotion_compensation(MACROBLOCK *x, int stride) {
+  int Len;
+  int search_site_count = 0;
+
+  // Generate offsets for 8 search sites per step.
+  Len = MAX_FIRST_STEP;
+  x->ss[search_site_count].mv.col = 0;
+  x->ss[search_site_count].mv.row = 0;
+  x->ss[search_site_count].offset = 0;
+  search_site_count++;
+
+  while (Len > 0) {
+
+    // Compute offsets for search sites.
+    x->ss[search_site_count].mv.col = 0;
+    x->ss[search_site_count].mv.row = -Len;
+    x->ss[search_site_count].offset = -Len * stride;
+    search_site_count++;
+
+    // Compute offsets for search sites.
+    x->ss[search_site_count].mv.col = 0;
+    x->ss[search_site_count].mv.row = Len;
+    x->ss[search_site_count].offset = Len * stride;
+    search_site_count++;
+
+    // Compute offsets for search sites.
+    x->ss[search_site_count].mv.col = -Len;
+    x->ss[search_site_count].mv.row = 0;
+    x->ss[search_site_count].offset = -Len;
+    search_site_count++;
+
+    // Compute offsets for search sites.
+    x->ss[search_site_count].mv.col = Len;
+    x->ss[search_site_count].mv.row = 0;
+    x->ss[search_site_count].offset = Len;
+    search_site_count++;
+
+    // Compute offsets for search sites.
+    x->ss[search_site_count].mv.col = -Len;
+    x->ss[search_site_count].mv.row = -Len;
+    x->ss[search_site_count].offset = -Len * stride - Len;
+    search_site_count++;
+
+    // Compute offsets for search sites.
+    x->ss[search_site_count].mv.col = Len;
+    x->ss[search_site_count].mv.row = -Len;
+    x->ss[search_site_count].offset = -Len * stride + Len;
+    search_site_count++;
+
+    // Compute offsets for search sites.
+    x->ss[search_site_count].mv.col = -Len;
+    x->ss[search_site_count].mv.row = Len;
+    x->ss[search_site_count].offset = Len * stride - Len;
+    search_site_count++;
+
+    // Compute offsets for search sites.
+    x->ss[search_site_count].mv.col = Len;
+    x->ss[search_site_count].mv.row = Len;
+    x->ss[search_site_count].offset = Len * stride + Len;
+    search_site_count++;
+
+    // Contract.
+    Len /= 2;
+  }
+
+  x->ss_count = search_site_count;
+  x->searches_per_step = 8;
+}
+
+/*
+ * To avoid the penalty of a read crossing a cache line, preload the
+ * reference area into a small aligned buffer, so that reads from that
+ * buffer never cross a cache line. This reduces the CPU cycles spent
+ * reading ref data in the sub-pixel filter functions.
+ * TODO: Currently, since the sub-pixel search range here is -3 to +3, we
+ * copy a 22-row x 32-col area, which is enough for a 16x16 macroblock.
+ * Later, for SPLITMV, we could reduce the area.
+ */
+
+/* estimated cost of a motion vector (r,c) */
+#define MVC(r, c)                                       \
+    (mvcost ?                                           \
+     ((mvjcost[((r) != rr) * 2 + ((c) != rc)] +         \
+       mvcost[0][((r) - rr)] + mvcost[1][((c) - rc)]) * \
+      error_per_bit + 128) >> 8 : 0)
+
+#define SP(x) (((x) & 7) << 1)  // convert motion vector component to offset
+                                // for svf calc
+
+#define IFMVCV(r, c, s, e)                                \
+    if (c >= minc && c <= maxc && r >= minr && r <= maxr) \
+      s                                                   \
+    else                                                  \
+      e;
+
+/* pointer to predictor base of a motionvector */
+#define PRE(r, c) (y + (((r) >> 3) * y_stride + ((c) >> 3) -(offset)))
+
+/* returns subpixel variance error function */
+#define DIST(r, c) \
+    vfp->svf(PRE(r, c), y_stride, SP(c), SP(r), z, b->src_stride, &sse)
+
+/* checks if (r, c) has better score than previous best */
+#define CHECK_BETTER(v, r, c) \
+    IFMVCV(r, c, {                                                       \
+      thismse = (DIST(r, c));                                            \
+      if ((v = MVC(r, c) + thismse) < besterr) {                         \
+        besterr = v;                                                     \
+        br = r;                                                          \
+        bc = c;                                                          \
+        *distortion = thismse;                                           \
+        *sse1 = sse;                                                     \
+      }                                                                  \
+    },                                                                   \
+    v = INT_MAX;)
+
+#define MIN(x,y) (((x)<(y))?(x):(y))
+#define MAX(x,y) (((x)>(y))?(x):(y))
+
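+// Iterative sub-pixel refinement: starting from the best full-pel MV, run a
+// bounded number of iterations at 1/2-, 1/4- and (when high-precision MVs
+// are allowed) 1/8-pel, testing the four axial points plus the most
+// promising diagonal via CHECK_BETTER at each step.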
+int vp9_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+                                             int_mv *bestmv, int_mv *ref_mv,
+                                             int error_per_bit,
+                                             const vp9_variance_fn_ptr_t *vfp,
+                                             DEC_MVCOSTS,
+                                             int *distortion,
+                                             unsigned int *sse1) {
+  unsigned char *z = (*(b->base_src) + b->src);
+  MACROBLOCKD *xd = &x->e_mbd;
+
+  int rr, rc, br, bc, hstep;
+  int tr, tc;
+  unsigned int besterr = INT_MAX;
+  unsigned int left, right, up, down, diag;
+  unsigned int sse;
+  unsigned int whichdir;
+  unsigned int halfiters = 4;
+  unsigned int quarteriters = 4;
+  unsigned int eighthiters = 4;
+  int thismse;
+  int maxc, minc, maxr, minr;
+  int y_stride;
+  int offset;
+  int usehp = xd->allow_high_precision_mv;
+
+#if !CONFIG_SUPERBLOCKS && (ARCH_X86 || ARCH_X86_64)
+  unsigned char *y0 = *(d->base_pre) + d->pre +
+      (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
+  unsigned char *y;
+  int buf_r1, buf_r2, buf_c1, buf_c2;
+
+  // Clamping to avoid out-of-range data access
+  buf_r1 = ((bestmv->as_mv.row - INTERP_EXTEND) < x->mv_row_min) ?
+      (bestmv->as_mv.row - x->mv_row_min) : INTERP_EXTEND - 1;
+  buf_r2 = ((bestmv->as_mv.row + INTERP_EXTEND) > x->mv_row_max) ?
+      (x->mv_row_max - bestmv->as_mv.row) : INTERP_EXTEND - 1;
+  buf_c1 = ((bestmv->as_mv.col - INTERP_EXTEND) < x->mv_col_min) ?
+      (bestmv->as_mv.col - x->mv_col_min) : INTERP_EXTEND - 1;
+  buf_c2 = ((bestmv->as_mv.col + INTERP_EXTEND) > x->mv_col_max) ?
+      (x->mv_col_max - bestmv->as_mv.col) : INTERP_EXTEND - 1;
+  y_stride = 32;
+
+  /* Copy to intermediate buffer before searching. */
+  vfp->copymem(y0 - buf_c1 - d->pre_stride * buf_r1, d->pre_stride,
+               xd->y_buf, y_stride, 16 + buf_r1 + buf_r2);
+  y = xd->y_buf + y_stride * buf_r1 + buf_c1;
+#else
+  unsigned char *y = *(d->base_pre) + d->pre +
+      (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
+  y_stride = d->pre_stride;
+#endif
+
+  rr = ref_mv->as_mv.row;
+  rc = ref_mv->as_mv.col;
+  br = bestmv->as_mv.row << 3;
+  bc = bestmv->as_mv.col << 3;
+  hstep = 4;
+  minc = MAX(x->mv_col_min << 3, (ref_mv->as_mv.col) - ((1 << MV_MAX_BITS) - 1));
+  maxc = MIN(x->mv_col_max << 3, (ref_mv->as_mv.col) + ((1 << MV_MAX_BITS) - 1));
+  minr = MAX(x->mv_row_min << 3, (ref_mv->as_mv.row) - ((1 << MV_MAX_BITS) - 1));
+  maxr = MIN(x->mv_row_max << 3, (ref_mv->as_mv.row) + ((1 << MV_MAX_BITS) - 1));
+
+  tr = br;
+  tc = bc;
+
+  offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
+
+  // central mv
+  bestmv->as_mv.row <<= 3;
+  bestmv->as_mv.col <<= 3;
+
+  // calculate central point error
+  besterr = vfp->vf(y, y_stride, z, b->src_stride, sse1);
+  *distortion = besterr;
+  besterr += mv_err_cost(bestmv, ref_mv, MVCOSTS,
+                         error_per_bit, xd->allow_high_precision_mv);
+
+  // TODO: Each subsequent iteration checks at least one point in common
+  // with the last iteration (two, if the diagonal was selected).
+  while (--halfiters) {
+    // 1/2 pel
+    CHECK_BETTER(left, tr, tc - hstep);
+    CHECK_BETTER(right, tr, tc + hstep);
+    CHECK_BETTER(up, tr - hstep, tc);
+    CHECK_BETTER(down, tr + hstep, tc);
+
+    whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+    switch (whichdir) {
+      case 0:
+        CHECK_BETTER(diag, tr - hstep, tc - hstep);
+        break;
+      case 1:
+        CHECK_BETTER(diag, tr - hstep, tc + hstep);
+        break;
+      case 2:
+        CHECK_BETTER(diag, tr + hstep, tc - hstep);
+        break;
+      case 3:
+        CHECK_BETTER(diag, tr + hstep, tc + hstep);
+        break;
+    }
+
+    // no reason to check the same one again.
+    if (tr == br && tc == bc)
+      break;
+
+    tr = br;
+    tc = bc;
+  }
+
+  // TODO: Each subsequent iteration checks at least one point in common
+  // with the last iteration (two, if the diagonal was selected).
+
+  // 1/4 pel
+  hstep >>= 1;
+  while (--quarteriters) {
+    CHECK_BETTER(left, tr, tc - hstep);
+    CHECK_BETTER(right, tr, tc + hstep);
+    CHECK_BETTER(up, tr - hstep, tc);
+    CHECK_BETTER(down, tr + hstep, tc);
+
+    whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+    switch (whichdir) {
+      case 0:
+        CHECK_BETTER(diag, tr - hstep, tc - hstep);
+        break;
+      case 1:
+        CHECK_BETTER(diag, tr - hstep, tc + hstep);
+        break;
+      case 2:
+        CHECK_BETTER(diag, tr + hstep, tc - hstep);
+        break;
+      case 3:
+        CHECK_BETTER(diag, tr + hstep, tc + hstep);
+        break;
+    }
+
+    // no reason to check the same one again.
+    if (tr == br && tc == bc)
+      break;
+
+    tr = br;
+    tc = bc;
+  }
+
+  if (xd->allow_high_precision_mv) {
+    usehp = vp9_use_nmv_hp(&ref_mv->as_mv);
+  } else {
+    usehp = 0;
+  }
+
+  if (usehp) {
+    hstep >>= 1;
+    while (--eighthiters) {
+      CHECK_BETTER(left, tr, tc - hstep);
+      CHECK_BETTER(right, tr, tc + hstep);
+      CHECK_BETTER(up, tr - hstep, tc);
+      CHECK_BETTER(down, tr + hstep, tc);
+
+      whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+      switch (whichdir) {
+        case 0:
+          CHECK_BETTER(diag, tr - hstep, tc - hstep);
+          break;
+        case 1:
+          CHECK_BETTER(diag, tr - hstep, tc + hstep);
+          break;
+        case 2:
+          CHECK_BETTER(diag, tr + hstep, tc - hstep);
+          break;
+        case 3:
+          CHECK_BETTER(diag, tr + hstep, tc + hstep);
+          break;
+      }
+
+      // no reason to check the same one again.
+      if (tr == br && tc == bc)
+        break;
+
+      tr = br;
+      tc = bc;
+    }
+  }
+  bestmv->as_mv.row = br;
+  bestmv->as_mv.col = bc;
+
+  if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) ||
+      (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3)))
+    return INT_MAX;
+
+  return besterr;
+}
+#undef MVC
+#undef PRE
+#undef DIST
+#undef IFMVCV
+#undef CHECK_BETTER
+#undef MIN
+#undef MAX
+
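+// Single-pass sub-pixel search: test half-pel left/right/up/down plus one
+// diagonal, then repeat the same pattern at quarter-pel and, if
+// high-precision MVs are in use, at eighth-pel around the running best.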
+int vp9_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+                                 int_mv *bestmv, int_mv *ref_mv,
+                                 int error_per_bit,
+                                 const vp9_variance_fn_ptr_t *vfp,
+                                 DEC_MVCOSTS, int *distortion,
+                                 unsigned int *sse1) {
+  int bestmse = INT_MAX;
+  int_mv startmv;
+  int_mv this_mv;
+  int_mv orig_mv;
+  int yrow_movedback = 0, ycol_movedback = 0;
+  unsigned char *z = (*(b->base_src) + b->src);
+  int left, right, up, down, diag;
+  unsigned int sse;
+  int whichdir;
+  int thismse;
+  int y_stride;
+  MACROBLOCKD *xd = &x->e_mbd;
+  int usehp = xd->allow_high_precision_mv;
+
+#if !CONFIG_SUPERBLOCKS && (ARCH_X86 || ARCH_X86_64)
+  unsigned char *y0 = *(d->base_pre) + d->pre +
+      (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
+  unsigned char *y;
+
+  y_stride = 32;
+  /* Copy 18 rows x 32 cols area to intermediate buffer before searching. */
+  vfp->copymem(y0 - 1 - d->pre_stride, d->pre_stride, xd->y_buf, y_stride, 18);
+  y = xd->y_buf + y_stride + 1;
+#else
+  unsigned char *y = *(d->base_pre) + d->pre +
+      (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
+  y_stride = d->pre_stride;
+#endif
+
+  // central mv
+  bestmv->as_mv.row <<= 3;
+  bestmv->as_mv.col <<= 3;
+  startmv = *bestmv;
+  orig_mv = *bestmv;
+
+  // calculate central point error
+  bestmse = vfp->vf(y, y_stride, z, b->src_stride, sse1);
+  *distortion = bestmse;
+  bestmse += mv_err_cost(bestmv, ref_mv, MVCOSTS, error_per_bit,
+                         xd->allow_high_precision_mv);
+
+  // go left then right and check error
+  this_mv.as_mv.row = startmv.as_mv.row;
+  this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4);
+  thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, b->src_stride, &sse);
+  left = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+                               xd->allow_high_precision_mv);
+
+  if (left < bestmse) {
+    *bestmv = this_mv;
+    bestmse = left;
+    *distortion = thismse;
+    *sse1 = sse;
+  }
+
+  this_mv.as_mv.col += 8;
+  thismse = vfp->svf_halfpix_h(y, y_stride, z, b->src_stride, &sse);
+  right = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+                                xd->allow_high_precision_mv);
+
+  if (right < bestmse) {
+    *bestmv = this_mv;
+    bestmse = right;
+    *distortion = thismse;
+    *sse1 = sse;
+  }
+
+  // go up then down and check error
+  this_mv.as_mv.col = startmv.as_mv.col;
+  this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4);
+  thismse = vfp->svf_halfpix_v(y - y_stride, y_stride, z, b->src_stride, &sse);
+  up = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+                             xd->allow_high_precision_mv);
+
+  if (up < bestmse) {
+    *bestmv = this_mv;
+    bestmse = up;
+    *distortion = thismse;
+    *sse1 = sse;
+  }
+
+  this_mv.as_mv.row += 8;
+  thismse = vfp->svf_halfpix_v(y, y_stride, z, b->src_stride, &sse);
+  down = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+                               xd->allow_high_precision_mv);
+
+  if (down < bestmse) {
+    *bestmv = this_mv;
+    bestmse = down;
+    *distortion = thismse;
+    *sse1 = sse;
+  }
+
+  // now check 1 more diagonal
+  whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+  // for(whichdir =0;whichdir<4;whichdir++)
+  // {
+  this_mv = startmv;
+
+  switch (whichdir) {
+    case 0:
+      this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
+      this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
+      thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride, z, b->src_stride, &sse);
+      break;
+    case 1:
+      this_mv.as_mv.col += 4;
+      this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
+      thismse = vfp->svf_halfpix_hv(y - y_stride, y_stride, z, b->src_stride, &sse);
+      break;
+    case 2:
+      this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
+      this_mv.as_mv.row += 4;
+      thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, b->src_stride, &sse);
+      break;
+    case 3:
+    default:
+      this_mv.as_mv.col += 4;
+      this_mv.as_mv.row += 4;
+      thismse = vfp->svf_halfpix_hv(y, y_stride, z, b->src_stride, &sse);
+      break;
+  }
+
+  diag = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+                               xd->allow_high_precision_mv);
+
+  if (diag < bestmse) {
+    *bestmv = this_mv;
+    bestmse = diag;
+    *distortion = thismse;
+    *sse1 = sse;
+  }
+
+//  }
+
+  // time to check quarter pels.
+  if (bestmv->as_mv.row < startmv.as_mv.row) {
+    y -= y_stride;
+    yrow_movedback = 1;
+  }
+
+  if (bestmv->as_mv.col < startmv.as_mv.col) {
+    y--;
+    ycol_movedback = 1;
+  }
+
+  startmv = *bestmv;
+
+  // go left then right and check error
+  this_mv.as_mv.row = startmv.as_mv.row;
+
+  if (startmv.as_mv.col & 7) {
+    this_mv.as_mv.col = startmv.as_mv.col - 2;
+    thismse = vfp->svf(y, y_stride,
+                       SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+                       z, b->src_stride, &sse);
+  } else {
+    this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
+    thismse = vfp->svf(y - 1, y_stride, SP(6), SP(this_mv.as_mv.row), z,
+                       b->src_stride, &sse);
+  }
+
+  left = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+                               xd->allow_high_precision_mv);
+
+  if (left < bestmse) {
+    *bestmv = this_mv;
+    bestmse = left;
+    *distortion = thismse;
+    *sse1 = sse;
+  }
+
+  this_mv.as_mv.col += 4;
+  thismse = vfp->svf(y, y_stride,
+                     SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+                     z, b->src_stride, &sse);
+  right = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+                                xd->allow_high_precision_mv);
+
+  if (right < bestmse) {
+    *bestmv = this_mv;
+    bestmse = right;
+    *distortion = thismse;
+    *sse1 = sse;
+  }
+
+  // go up then down and check error
+  this_mv.as_mv.col = startmv.as_mv.col;
+
+  if (startmv.as_mv.row & 7) {
+    this_mv.as_mv.row = startmv.as_mv.row - 2;
+    thismse = vfp->svf(y, y_stride,
+                       SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+                       z, b->src_stride, &sse);
+  } else {
+    this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6;
+    thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(6),
+                       z, b->src_stride, &sse);
+  }
+
+  up = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+                             xd->allow_high_precision_mv);
+
+  if (up < bestmse) {
+    *bestmv = this_mv;
+    bestmse = up;
+    *distortion = thismse;
+    *sse1 = sse;
+  }
+
+  this_mv.as_mv.row += 4;
+  thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+                     z, b->src_stride, &sse);
+  down = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+                               xd->allow_high_precision_mv);
+
+  if (down < bestmse) {
+    *bestmv = this_mv;
+    bestmse = down;
+    *distortion = thismse;
+    *sse1 = sse;
+  }
+
+  // now check 1 more diagonal
+  whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+//  for(whichdir=0;whichdir<4;whichdir++)
+//  {
+  this_mv = startmv;
+
+  switch (whichdir) {
+    case 0:
+
+      if (startmv.as_mv.row & 7) {
+        this_mv.as_mv.row -= 2;
+
+        if (startmv.as_mv.col & 7) {
+          this_mv.as_mv.col -= 2;
+          thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+        } else {
+          this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
+          thismse = vfp->svf(y - 1, y_stride, SP(6), SP(this_mv.as_mv.row),
+                             z, b->src_stride, &sse);
+        }
+      } else {
+        this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6;
+
+        if (startmv.as_mv.col & 7) {
+          this_mv.as_mv.col -= 2;
+          thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(6), z, b->src_stride, &sse);
+        } else {
+          this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
+          thismse = vfp->svf(y - y_stride - 1, y_stride, SP(6), SP(6), z, b->src_stride, &sse);
+        }
+      }
+
+      break;
+    case 1:
+      this_mv.as_mv.col += 2;
+
+      if (startmv.as_mv.row & 7) {
+        this_mv.as_mv.row -= 2;
+        thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+      } else {
+        this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6;
+        thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(6), z, b->src_stride, &sse);
+      }
+
+      break;
+    case 2:
+      this_mv.as_mv.row += 2;
+
+      if (startmv.as_mv.col & 7) {
+        this_mv.as_mv.col -= 2;
+        thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+                           z, b->src_stride, &sse);
+      } else {
+        this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
+        thismse = vfp->svf(y - 1, y_stride, SP(6), SP(this_mv.as_mv.row), z,
+                           b->src_stride, &sse);
+      }
+
+      break;
+    case 3:
+      this_mv.as_mv.col += 2;
+      this_mv.as_mv.row += 2;
+      thismse = vfp->svf(y, y_stride,
+                         SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+                         z, b->src_stride, &sse);
+      break;
+  }
+
+  diag = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+                               xd->allow_high_precision_mv);
+
+  if (diag < bestmse) {
+    *bestmv = this_mv;
+    bestmse = diag;
+    *distortion = thismse;
+    *sse1 = sse;
+  }
+
+  if (x->e_mbd.allow_high_precision_mv) {
+    usehp = vp9_use_nmv_hp(&ref_mv->as_mv);
+  } else {
+    usehp = 0;
+  }
+  if (!usehp)
+    return bestmse;
+
+  /* Now do 1/8th pixel */
+  if (bestmv->as_mv.row < orig_mv.as_mv.row && !yrow_movedback) {
+    y -= y_stride;
+    yrow_movedback = 1;
+  }
+
+  if (bestmv->as_mv.col < orig_mv.as_mv.col && !ycol_movedback) {
+    y--;
+    ycol_movedback = 1;
+  }
+
+  startmv = *bestmv;
+
+  // go left then right and check error
+  this_mv.as_mv.row = startmv.as_mv.row;
+
+  if (startmv.as_mv.col & 7) {
+    this_mv.as_mv.col = startmv.as_mv.col - 1;
+    thismse = vfp->svf(y, y_stride,
+                       SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+                       z, b->src_stride, &sse);
+  } else {
+    this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7;
+    thismse = vfp->svf(y - 1, y_stride, SP(7), SP(this_mv.as_mv.row),
+                       z, b->src_stride, &sse);
+  }
+
+  left = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+                               xd->allow_high_precision_mv);
+
+  if (left < bestmse) {
+    *bestmv = this_mv;
+    bestmse = left;
+    *distortion = thismse;
+    *sse1 = sse;
+  }
+
+  this_mv.as_mv.col += 2;
+  thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+                     z, b->src_stride, &sse);
+  right = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+                                xd->allow_high_precision_mv);
+
+  if (right < bestmse) {
+    *bestmv = this_mv;
+    bestmse = right;
+    *distortion = thismse;
+    *sse1 = sse;
+  }
+
+  // go up then down and check error
+  this_mv.as_mv.col = startmv.as_mv.col;
+
+  if (startmv.as_mv.row & 7) {
+    this_mv.as_mv.row = startmv.as_mv.row - 1;
+    thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+  } else {
+    this_mv.as_mv.row = (startmv.as_mv.row - 8) | 7;
+    thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(7), z, b->src_stride, &sse);
+  }
+
+  up = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+                             xd->allow_high_precision_mv);
+
+  if (up < bestmse) {
+    *bestmv = this_mv;
+    bestmse = up;
+    *distortion = thismse;
+    *sse1 = sse;
+  }
+
+  this_mv.as_mv.row += 2;
+  thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+  down = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+                               xd->allow_high_precision_mv);
+
+  if (down < bestmse) {
+    *bestmv = this_mv;
+    bestmse = down;
+    *distortion = thismse;
+    *sse1 = sse;
+  }
+
+  // now check 1 more diagonal
+  whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+//  for(whichdir=0;whichdir<4;whichdir++)
+//  {
+  this_mv = startmv;
+
+  switch (whichdir) {
+    case 0:
+
+      if (startmv.as_mv.row & 7) {
+        this_mv.as_mv.row -= 1;
+
+        if (startmv.as_mv.col & 7) {
+          this_mv.as_mv.col -= 1;
+          thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+        } else {
+          this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7;
+          thismse = vfp->svf(y - 1, y_stride, SP(7), SP(this_mv.as_mv.row),
+                             z, b->src_stride, &sse);
+        }
+      } else {
+        this_mv.as_mv.row = (startmv.as_mv.row - 8) | 7;
+
+        if (startmv.as_mv.col & 7) {
+          this_mv.as_mv.col -= 1;
+          thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(7), z, b->src_stride, &sse);
+        } else {
+          this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7;
+          thismse = vfp->svf(y - y_stride - 1, y_stride, SP(7), SP(7), z, b->src_stride, &sse);
+        }
+      }
+
+      break;
+    case 1:
+      this_mv.as_mv.col += 1;
+
+      if (startmv.as_mv.row & 7) {
+        this_mv.as_mv.row -= 1;
+        thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+      } else {
+        this_mv.as_mv.row = (startmv.as_mv.row - 8) | 7;
+        thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(7), z, b->src_stride, &sse);
+      }
+
+      break;
+    case 2:
+      this_mv.as_mv.row += 1;
+
+      if (startmv.as_mv.col & 7) {
+        this_mv.as_mv.col -= 1;
+        thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+      } else {
+        this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7;
+        thismse = vfp->svf(y - 1, y_stride, SP(7), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+      }
+
+      break;
+    case 3:
+      this_mv.as_mv.col += 1;
+      this_mv.as_mv.row += 1;
+      thismse = vfp->svf(y, y_stride,  SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+      break;
+  }
+
+  diag = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+                               xd->allow_high_precision_mv);
+
+  if (diag < bestmse) {
+    *bestmv = this_mv;
+    bestmse = diag;
+    *distortion = thismse;
+    *sse1 = sse;
+  }
+
+  return bestmse;
+}
+
+#undef SP
+
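+// Half-pel-only variant of the sub-pixel search above: test the four axial
+// half-pel points plus one diagonal and return the best cost found.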
+int vp9_find_best_half_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+                                  int_mv *bestmv, int_mv *ref_mv,
+                                  int error_per_bit,
+                                  const vp9_variance_fn_ptr_t *vfp,
+                                  DEC_MVCOSTS,
+                                  int *distortion,
+                                  unsigned int *sse1) {
+  int bestmse = INT_MAX;
+  int_mv startmv;
+  int_mv this_mv;
+  unsigned char *z = (*(b->base_src) + b->src);
+  int left, right, up, down, diag;
+  unsigned int sse;
+  int whichdir;
+  int thismse;
+  int y_stride;
+  MACROBLOCKD *xd = &x->e_mbd;
+
+#if !CONFIG_SUPERBLOCKS && (ARCH_X86 || ARCH_X86_64)
+  unsigned char *y0 = *(d->base_pre) + d->pre +
+      (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
+  unsigned char *y;
+
+  y_stride = 32;
+  /* Copy 18 rows x 32 cols area to intermediate buffer before searching. */
+  vfp->copymem(y0 - 1 - d->pre_stride, d->pre_stride, xd->y_buf, y_stride, 18);
+  y = xd->y_buf + y_stride + 1;
+#else
+  unsigned char *y = *(d->base_pre) + d->pre +
+      (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
+  y_stride = d->pre_stride;
+#endif
+
+  // central mv
+  bestmv->as_mv.row <<= 3;
+  bestmv->as_mv.col <<= 3;
+  startmv = *bestmv;
+
+  // calculate central point error
+  bestmse = vfp->vf(y, y_stride, z, b->src_stride, sse1);
+  *distortion = bestmse;
+  bestmse += mv_err_cost(bestmv, ref_mv, MVCOSTS, error_per_bit,
+                         xd->allow_high_precision_mv);
+
+  // go left then right and check error
+  this_mv.as_mv.row = startmv.as_mv.row;
+  this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4);
+  thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, b->src_stride, &sse);
+  left = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+                               xd->allow_high_precision_mv);
+
+  if (left < bestmse) {
+    *bestmv = this_mv;
+    bestmse = left;
+    *distortion = thismse;
+    *sse1 = sse;
+  }
+
+  this_mv.as_mv.col += 8;
+  thismse = vfp->svf_halfpix_h(y, y_stride, z, b->src_stride, &sse);
+  right = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+                                xd->allow_high_precision_mv);
+
+  if (right < bestmse) {
+    *bestmv = this_mv;
+    bestmse = right;
+    *distortion = thismse;
+    *sse1 = sse;
+  }
+
+  // go up then down and check error
+  this_mv.as_mv.col = startmv.as_mv.col;
+  this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4);
+  thismse = vfp->svf_halfpix_v(y - y_stride, y_stride, z, b->src_stride, &sse);
+  up = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+                             xd->allow_high_precision_mv);
+
+  if (up < bestmse) {
+    *bestmv = this_mv;
+    bestmse = up;
+    *distortion = thismse;
+    *sse1 = sse;
+  }
+
+  this_mv.as_mv.row += 8;
+  thismse = vfp->svf_halfpix_v(y, y_stride, z, b->src_stride, &sse);
+  down = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+                               xd->allow_high_precision_mv);
+
+  if (down < bestmse) {
+    *bestmv = this_mv;
+    bestmse = down;
+    *distortion = thismse;
+    *sse1 = sse;
+  }
+
+  // now check 1 more diagonal -
+  whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+  this_mv = startmv;
+
+  switch (whichdir) {
+    case 0:
+      this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
+      this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
+      thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride, z, b->src_stride, &sse);
+      break;
+    case 1:
+      this_mv.as_mv.col += 4;
+      this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
+      thismse = vfp->svf_halfpix_hv(y - y_stride, y_stride, z, b->src_stride, &sse);
+      break;
+    case 2:
+      this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
+      this_mv.as_mv.row += 4;
+      thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, b->src_stride, &sse);
+      break;
+    case 3:
+    default:
+      this_mv.as_mv.col += 4;
+      this_mv.as_mv.row += 4;
+      thismse = vfp->svf_halfpix_hv(y, y_stride, z, b->src_stride, &sse);
+      break;
+  }
+
+  diag = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+                               xd->allow_high_precision_mv);
+
+  if (diag < bestmse) {
+    *bestmv = this_mv;
+    bestmse = diag;
+    *distortion = thismse;
+    *sse1 = sse;
+  }
+
+  return bestmse;
+}
+
+#define CHECK_BOUNDS(range) \
+  {\
+    all_in = 1;\
+    all_in &= ((br-range) >= x->mv_row_min);\
+    all_in &= ((br+range) <= x->mv_row_max);\
+    all_in &= ((bc-range) >= x->mv_col_min);\
+    all_in &= ((bc+range) <= x->mv_col_max);\
+  }
+
+#define CHECK_POINT \
+  {\
+    if (this_mv.as_mv.col < x->mv_col_min) continue;\
+    if (this_mv.as_mv.col > x->mv_col_max) continue;\
+    if (this_mv.as_mv.row < x->mv_row_min) continue;\
+    if (this_mv.as_mv.row > x->mv_row_max) continue;\
+  }
+
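+// Accept a candidate only if its raw SAD already beats the current best;
+// the MV rate cost is added before the final comparison, so the cost
+// computation is skipped for clearly worse points.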
+#define CHECK_BETTER \
+  {\
+    if (thissad < bestsad)\
+    {\
+      thissad += mvsad_err_cost(&this_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);\
+      if (thissad < bestsad)\
+      {\
+        bestsad = thissad;\
+        best_site = i;\
+      }\
+    }\
+  }
+
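+// After a move in hexagon direction k, only three of the six hexagon points
+// around the new center are unvisited; this table lists them for each k.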
+static const MV next_chkpts[6][3] = {
+  {{ -2, 0}, { -1, -2}, {1, -2}},
+  {{ -1, -2}, {1, -2}, {2, 0}},
+  {{1, -2}, {2, 0}, {1, 2}},
+  {{2, 0}, {1, 2}, { -1, 2}},
+  {{1, 2}, { -1, 2}, { -2, 0}},
+  {{ -1, 2}, { -2, 0}, { -1, -2}}
+};
+
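+// Hexagon-based integer-pel search: evaluate the six hexagon points around
+// the clamped starting MV, then repeatedly step toward the best point,
+// checking only the three new hexagon neighbors each time, and finish with
+// up to 32 rounds of 4-neighbor refinement.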
+int vp9_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+                   int_mv *ref_mv, int_mv *best_mv,
+                   int search_param, int sad_per_bit,
+                   const vp9_variance_fn_ptr_t *vfp,
+                   DEC_MVSADCOSTS, DEC_MVCOSTS,
+                   int_mv *center_mv) {
+  MV hex[6] = { { -1, -2}, {1, -2}, {2, 0}, {1, 2}, { -1, 2}, { -2, 0} };
+  MV neighbors[4] = {{0, -1}, { -1, 0}, {1, 0}, {0, 1}};
+  int i, j;
+
+  unsigned char *what = (*(b->base_src) + b->src);
+  int what_stride = b->src_stride;
+  int in_what_stride = d->pre_stride;
+  int br, bc;
+  int_mv this_mv;
+  unsigned int bestsad = 0x7fffffff;
+  unsigned int thissad;
+  unsigned char *base_offset;
+  unsigned char *this_offset;
+  int k = -1;
+  int all_in;
+  int best_site = -1;
+
+  int_mv fcenter_mv;
+  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+  // adjust ref_mv to make sure it is within MV range
+  clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+  br = ref_mv->as_mv.row;
+  bc = ref_mv->as_mv.col;
+
+  // Work out the start point for the search
+  base_offset = (unsigned char *)(*(d->base_pre) + d->pre);
+  this_offset = base_offset + (br * (d->pre_stride)) + bc;
+  this_mv.as_mv.row = br;
+  this_mv.as_mv.col = bc;
+  bestsad = vfp->sdf(what, what_stride, this_offset,
+                     in_what_stride, 0x7fffffff)
+            + mvsad_err_cost(&this_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);
+
+  // hex search
+  // j=0
+  CHECK_BOUNDS(2)
+
+  if (all_in) {
+    for (i = 0; i < 6; i++) {
+      this_mv.as_mv.row = br + hex[i].row;
+      this_mv.as_mv.col = bc + hex[i].col;
+      this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col;
+      thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+      CHECK_BETTER
+    }
+  } else {
+    for (i = 0; i < 6; i++) {
+      this_mv.as_mv.row = br + hex[i].row;
+      this_mv.as_mv.col = bc + hex[i].col;
+      CHECK_POINT
+      this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col;
+      thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+      CHECK_BETTER
+    }
+  }
+
+  if (best_site == -1)
+    goto cal_neighbors;
+  else {
+    br += hex[best_site].row;
+    bc += hex[best_site].col;
+    k = best_site;
+  }
+
+  for (j = 1; j < 127; j++) {
+    best_site = -1;
+    CHECK_BOUNDS(2)
+
+    if (all_in) {
+      for (i = 0; i < 3; i++) {
+        this_mv.as_mv.row = br + next_chkpts[k][i].row;
+        this_mv.as_mv.col = bc + next_chkpts[k][i].col;
+        this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
+        thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+        CHECK_BETTER
+      }
+    } else {
+      for (i = 0; i < 3; i++) {
+        this_mv.as_mv.row = br + next_chkpts[k][i].row;
+        this_mv.as_mv.col = bc + next_chkpts[k][i].col;
+        CHECK_POINT
+        this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
+        thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+        CHECK_BETTER
+      }
+    }
+
+    if (best_site == -1)
+      break;
+    else {
+      br += next_chkpts[k][best_site].row;
+      bc += next_chkpts[k][best_site].col;
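+      // The new direction is (k + best_site - 1) mod 6; the addition and
+      // conditional subtractions below compute this without a divide.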
+      k += 5 + best_site;
+      if (k >= 12) k -= 12;
+      else if (k >= 6) k -= 6;
+    }
+  }
+
+  // check 4 1-away neighbors
+cal_neighbors:
+  for (j = 0; j < 32; j++) {
+    best_site = -1;
+    CHECK_BOUNDS(1)
+
+    if (all_in) {
+      for (i = 0; i < 4; i++) {
+        this_mv.as_mv.row = br + neighbors[i].row;
+        this_mv.as_mv.col = bc + neighbors[i].col;
+        this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
+        thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+        CHECK_BETTER
+      }
+    } else {
+      for (i = 0; i < 4; i++) {
+        this_mv.as_mv.row = br + neighbors[i].row;
+        this_mv.as_mv.col = bc + neighbors[i].col;
+        CHECK_POINT
+        this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
+        thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+        CHECK_BETTER
+      }
+    }
+
+    if (best_site == -1)
+      break;
+    else {
+      br += neighbors[best_site].row;
+      bc += neighbors[best_site].col;
+    }
+  }
+
+  best_mv->as_mv.row = br;
+  best_mv->as_mv.col = bc;
+
+  return bestsad;
+}
+#undef CHECK_BOUNDS
+#undef CHECK_POINT
+#undef CHECK_BETTER
+
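+// Step-down diamond search: starting from the clamped ref_mv, evaluate the
+// precomputed search-site offsets at each step size and recenter on any
+// improvement; num00 counts steps in which the best point stayed at the
+// starting position. Returns the variance-based error of the best MV plus
+// its rate cost.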
+int vp9_diamond_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+                           int_mv *ref_mv, int_mv *best_mv,
+                           int search_param, int sad_per_bit, int *num00,
+                           vp9_variance_fn_ptr_t *fn_ptr, DEC_MVCOSTS,
+                           int_mv *center_mv) {
+  int i, j, step;
+
+  unsigned char *what = (*(b->base_src) + b->src);
+  int what_stride = b->src_stride;
+  unsigned char *in_what;
+  int in_what_stride = d->pre_stride;
+  unsigned char *best_address;
+
+  int tot_steps;
+  int_mv this_mv;
+
+  int bestsad = INT_MAX;
+  int best_site = 0;
+  int last_site = 0;
+
+  int ref_row, ref_col;
+  int this_row_offset, this_col_offset;
+  search_site *ss;
+
+  unsigned char *check_here;
+  int thissad;
+  MACROBLOCKD *xd = &x->e_mbd;
+  int_mv fcenter_mv;
+
+  int *mvjsadcost = x->nmvjointsadcost;
+  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
+
+  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+  clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+  ref_row = ref_mv->as_mv.row;
+  ref_col = ref_mv->as_mv.col;
+  *num00 = 0;
+  best_mv->as_mv.row = ref_row;
+  best_mv->as_mv.col = ref_col;
+
+  // Work out the start point for the search
+  in_what = (unsigned char *)(*(d->base_pre) + d->pre +
+                              (ref_row * (d->pre_stride)) + ref_col);
+  best_address = in_what;
+
+  // Check the starting position
+  bestsad = fn_ptr->sdf(what, what_stride, in_what,
+                        in_what_stride, 0x7fffffff)
+            + mvsad_err_cost(best_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);
+
+  // search_param determines the length of the initial step and hence the
+  // number of iterations:
+  // 0 = initial step (MAX_FIRST_STEP) pel; 1 = (MAX_FIRST_STEP/2) pel;
+  // 2 = (MAX_FIRST_STEP/4) pel, etc.
+  ss = &x->ss[search_param * x->searches_per_step];
+  tot_steps = (x->ss_count / x->searches_per_step) - search_param;
+
+  i = 1;
+
+  for (step = 0; step < tot_steps; step++) {
+    for (j = 0; j < x->searches_per_step; j++) {
+      // Trap illegal vectors
+      this_row_offset = best_mv->as_mv.row + ss[i].mv.row;
+      this_col_offset = best_mv->as_mv.col + ss[i].mv.col;
+
+      if ((this_col_offset > x->mv_col_min) &&
+          (this_col_offset < x->mv_col_max) &&
+          (this_row_offset > x->mv_row_min) &&
+          (this_row_offset < x->mv_row_max)) {
+        check_here = ss[i].offset + best_address;
+        thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+
+        if (thissad < bestsad) {
+          this_mv.as_mv.row = this_row_offset;
+          this_mv.as_mv.col = this_col_offset;
+          thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+                                    MVSADCOSTS, sad_per_bit);
+
+          if (thissad < bestsad) {
+            bestsad = thissad;
+            best_site = i;
+          }
+        }
+      }
+
+      i++;
+    }
+
+    if (best_site != last_site) {
+      best_mv->as_mv.row += ss[best_site].mv.row;
+      best_mv->as_mv.col += ss[best_site].mv.col;
+      best_address += ss[best_site].offset;
+      last_site = best_site;
+    } else if (best_address == in_what)
+      (*num00)++;
+  }
+
+  this_mv.as_mv.row = best_mv->as_mv.row << 3;
+  this_mv.as_mv.col = best_mv->as_mv.col << 3;
+
+  if (bestsad == INT_MAX)
+    return INT_MAX;
+
+  return
+      fn_ptr->vf(what, what_stride, best_address, in_what_stride,
+                 (unsigned int *)(&thissad)) +
+      mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
+                  xd->allow_high_precision_mv);
+}
+
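+// Same as vp9_diamond_search_sad, but when all four candidate points of a
+// group are within bounds, their SADs are computed in one sdx4df call.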
+int vp9_diamond_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+                             int_mv *ref_mv, int_mv *best_mv, int search_param,
+                             int sad_per_bit, int *num00,
+                             vp9_variance_fn_ptr_t *fn_ptr,
+                             DEC_MVCOSTS, int_mv *center_mv) {
+  int i, j, step;
+
+  unsigned char *what = (*(b->base_src) + b->src);
+  int what_stride = b->src_stride;
+  unsigned char *in_what;
+  int in_what_stride = d->pre_stride;
+  unsigned char *best_address;
+
+  int tot_steps;
+  int_mv this_mv;
+
+  int bestsad = INT_MAX;
+  int best_site = 0;
+  int last_site = 0;
+
+  int ref_row;
+  int ref_col;
+  int this_row_offset;
+  int this_col_offset;
+  search_site *ss;
+
+  unsigned char *check_here;
+  unsigned int thissad;
+  MACROBLOCKD *xd = &x->e_mbd;
+  int_mv fcenter_mv;
+
+  int *mvjsadcost = x->nmvjointsadcost;
+  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
+
+  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+  clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+  ref_row = ref_mv->as_mv.row;
+  ref_col = ref_mv->as_mv.col;
+  *num00 = 0;
+  best_mv->as_mv.row = ref_row;
+  best_mv->as_mv.col = ref_col;
+
+  // Work out the start point for the search
+  in_what = (unsigned char *)(*(d->base_pre) + d->pre + (ref_row * (d->pre_stride)) + ref_col);
+  best_address = in_what;
+
+  // Check the starting position
+  bestsad = fn_ptr->sdf(what, what_stride,
+                        in_what, in_what_stride, 0x7fffffff)
+            + mvsad_err_cost(best_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);
+
+  // search_param determines the length of the initial step and hence the
+  // number of iterations:
+  // 0 = initial step (MAX_FIRST_STEP) pel, 1 = (MAX_FIRST_STEP/2) pel,
+  // 2 = (MAX_FIRST_STEP/4) pel... etc.
+  ss = &x->ss[search_param * x->searches_per_step];
+  tot_steps = (x->ss_count / x->searches_per_step) - search_param;
+
+  i = 1;
+
+  for (step = 0; step < tot_steps; step++) {
+    int all_in = 1, t;
+
+    // To know whether all neighbor points are within the bounds, checking
+    // the 4 bounds once is enough instead of checking them for each point.
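+    // This relies on the search-site tables (x->ss, set up by the
+    // vp9_init_dsmotion_compensation / vp9_init3smotion_compensation
+    // helpers) listing the up/down/left/right extremes first in each step.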
+    all_in &= ((best_mv->as_mv.row + ss[i].mv.row) > x->mv_row_min);
+    all_in &= ((best_mv->as_mv.row + ss[i + 1].mv.row) < x->mv_row_max);
+    all_in &= ((best_mv->as_mv.col + ss[i + 2].mv.col) > x->mv_col_min);
+    all_in &= ((best_mv->as_mv.col + ss[i + 3].mv.col) < x->mv_col_max);
+
+    if (all_in) {
+      unsigned int sad_array[4];
+
+      for (j = 0; j < x->searches_per_step; j += 4) {
+        unsigned char *block_offset[4];
+
+        for (t = 0; t < 4; t++)
+          block_offset[t] = ss[i + t].offset + best_address;
+
+        fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride,
+                       sad_array);
+
+        for (t = 0; t < 4; t++, i++) {
+          if (sad_array[t] < bestsad) {
+            this_mv.as_mv.row = best_mv->as_mv.row + ss[i].mv.row;
+            this_mv.as_mv.col = best_mv->as_mv.col + ss[i].mv.col;
+            sad_array[t] += mvsad_err_cost(&this_mv, &fcenter_mv,
+                                           MVSADCOSTS, sad_per_bit);
+
+            if (sad_array[t] < bestsad) {
+              bestsad = sad_array[t];
+              best_site = i;
+            }
+          }
+        }
+      }
+    } else {
+      for (j = 0; j < x->searches_per_step; j++) {
+        // Trap illegal vectors
+        this_row_offset = best_mv->as_mv.row + ss[i].mv.row;
+        this_col_offset = best_mv->as_mv.col + ss[i].mv.col;
+
+        if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
+            (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) {
+          check_here = ss[i].offset + best_address;
+          thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+
+          if (thissad < bestsad) {
+            this_mv.as_mv.row = this_row_offset;
+            this_mv.as_mv.col = this_col_offset;
+            thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+                                      MVSADCOSTS, sad_per_bit);
+
+            if (thissad < bestsad) {
+              bestsad = thissad;
+              best_site = i;
+            }
+          }
+        }
+        i++;
+      }
+    }
+
+    if (best_site != last_site) {
+      best_mv->as_mv.row += ss[best_site].mv.row;
+      best_mv->as_mv.col += ss[best_site].mv.col;
+      best_address += ss[best_site].offset;
+      last_site = best_site;
+    } else if (best_address == in_what)
+      (*num00)++;
+  }
+
+  this_mv.as_mv.row = best_mv->as_mv.row << 3;
+  this_mv.as_mv.col = best_mv->as_mv.col << 3;
+
+  if (bestsad == INT_MAX)
+    return INT_MAX;
+
+  return
+      fn_ptr->vf(what, what_stride, best_address, in_what_stride,
+                 (unsigned int *)(&thissad)) +
+      mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
+                  xd->allow_high_precision_mv);
+}
+
+/* do_refine: If last step (1-away) of n-step search doesn't pick the center
+              point as the best match, we will do a final 1-away diamond
+              refining search  */
+int vp9_full_pixel_diamond(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *b,
+                           BLOCKD *d, int_mv *mvp_full, int step_param,
+                           int sadpb, int further_steps,
+                           int do_refine, vp9_variance_fn_ptr_t *fn_ptr,
+                           int_mv *ref_mv, int_mv *dst_mv) {
+  int_mv temp_mv;
+  int thissme, n, num00;
+  int bestsme = cpi->diamond_search_sad(x, b, d, mvp_full, &temp_mv,
+                                        step_param, sadpb, &num00,
+                                        fn_ptr, XMVCOST, ref_mv);
+  dst_mv->as_int = temp_mv.as_int;
+
+  n = num00;
+  num00 = 0;
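+  // num00 reports how many step sizes left the best match at the search
+  // centre; that many of the subsequent finer-step searches can be skipped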
+
+  /* If there won't be more n-step searches, check whether a refining search
+     is needed. */
+  if (n > further_steps)
+    do_refine = 0;
+
+  while (n < further_steps) {
+    n++;
+
+    if (num00)
+      num00--;
+    else {
+      thissme = cpi->diamond_search_sad(x, b, d, mvp_full, &temp_mv,
+                                        step_param + n, sadpb, &num00,
+                                        fn_ptr, XMVCOST, ref_mv);
+
+      /* check to see if refining search is needed. */
+      if (num00 > (further_steps - n))
+        do_refine = 0;
+
+      if (thissme < bestsme) {
+        bestsme = thissme;
+        dst_mv->as_int = temp_mv.as_int;
+      }
+    }
+  }
+
+  /* final 1-away diamond refining search */
+  if (do_refine == 1) {
+    int search_range = 8;
+    int_mv best_mv;
+    best_mv.as_int = dst_mv->as_int;
+    thissme = cpi->refining_search_sad(x, b, d, &best_mv, sadpb, search_range,
+                                       fn_ptr, XMVCOST, ref_mv);
+
+    if (thissme < bestsme) {
+      bestsme = thissme;
+      dst_mv->as_int = best_mv.as_int;
+    }
+  }
+  return bestsme;
+}
+
+int vp9_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+                        int sad_per_bit, int distance,
+                        vp9_variance_fn_ptr_t *fn_ptr, DEC_MVCOSTS,
+                        int_mv *center_mv) {
+  unsigned char *what = (*(b->base_src) + b->src);
+  int what_stride = b->src_stride;
+  unsigned char *in_what;
+  int in_what_stride = d->pre_stride;
+  int mv_stride = d->pre_stride;
+  unsigned char *bestaddress;
+  int_mv *best_mv = &d->bmi.as_mv.first;
+  int_mv this_mv;
+  int bestsad = INT_MAX;
+  int r, c;
+
+  unsigned char *check_here;
+  int thissad;
+  MACROBLOCKD *xd = &x->e_mbd;
+
+  int ref_row = ref_mv->as_mv.row;
+  int ref_col = ref_mv->as_mv.col;
+
+  int row_min = ref_row - distance;
+  int row_max = ref_row + distance;
+  int col_min = ref_col - distance;
+  int col_max = ref_col + distance;
+  int_mv fcenter_mv;
+
+  int *mvjsadcost = x->nmvjointsadcost;
+  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
+
+  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+  // Work out the mid point for the search
+  in_what = *(d->base_pre) + d->pre;
+  bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
+
+  best_mv->as_mv.row = ref_row;
+  best_mv->as_mv.col = ref_col;
+
+  // Baseline value at the centre
+  bestsad = fn_ptr->sdf(what, what_stride, bestaddress,
+                        in_what_stride, 0x7fffffff)
+            + mvsad_err_cost(best_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);
+
+  // Apply further limits to prevent us from using vectors that stretch
+  // beyond the UMV border
+  if (col_min < x->mv_col_min)
+    col_min = x->mv_col_min;
+
+  if (col_max > x->mv_col_max)
+    col_max = x->mv_col_max;
+
+  if (row_min < x->mv_row_min)
+    row_min = x->mv_row_min;
+
+  if (row_max > x->mv_row_max)
+    row_max = x->mv_row_max;
+
+  for (r = row_min; r < row_max; r++) {
+    this_mv.as_mv.row = r;
+    check_here = r * mv_stride + in_what + col_min;
+
+    for (c = col_min; c < col_max; c++) {
+      thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+
+      this_mv.as_mv.col = c;
+      thissad  += mvsad_err_cost(&this_mv, &fcenter_mv,
+                                 MVSADCOSTS, sad_per_bit);
+
+      if (thissad < bestsad) {
+        bestsad = thissad;
+        best_mv->as_mv.row = r;
+        best_mv->as_mv.col = c;
+        bestaddress = check_here;
+      }
+
+      check_here++;
+    }
+  }
+
+  this_mv.as_mv.row = best_mv->as_mv.row << 3;
+  this_mv.as_mv.col = best_mv->as_mv.col << 3;
+
+  if (bestsad < INT_MAX)
+    return
+        fn_ptr->vf(what, what_stride, bestaddress, in_what_stride,
+                   (unsigned int *)(&thissad)) +
+        mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
+                    xd->allow_high_precision_mv);
+  else
+    return INT_MAX;
+}
+
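+// sadx3/sadx8 variants of the full search: while enough columns remain in
+// the current row, fn_ptr->sdx3f (or sdx8f) computes the SAD for 3 (or 8)
+// consecutive positions per call, and a scalar loop handles the remainder.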
+int vp9_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+                          int sad_per_bit, int distance,
+                          vp9_variance_fn_ptr_t *fn_ptr, DEC_MVCOSTS,
+                          int_mv *center_mv) {
+  unsigned char *what = (*(b->base_src) + b->src);
+  int what_stride = b->src_stride;
+  unsigned char *in_what;
+  int in_what_stride = d->pre_stride;
+  int mv_stride = d->pre_stride;
+  unsigned char *bestaddress;
+  int_mv *best_mv = &d->bmi.as_mv.first;
+  int_mv this_mv;
+  int bestsad = INT_MAX;
+  int r, c;
+
+  unsigned char *check_here;
+  unsigned int thissad;
+  MACROBLOCKD *xd = &x->e_mbd;
+
+  int ref_row = ref_mv->as_mv.row;
+  int ref_col = ref_mv->as_mv.col;
+
+  int row_min = ref_row - distance;
+  int row_max = ref_row + distance;
+  int col_min = ref_col - distance;
+  int col_max = ref_col + distance;
+
+  unsigned int sad_array[3];
+  int_mv fcenter_mv;
+
+  int *mvjsadcost = x->nmvjointsadcost;
+  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
+
+  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+  // Work out the mid point for the search
+  in_what = *(d->base_pre) + d->pre;
+  bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
+
+  best_mv->as_mv.row = ref_row;
+  best_mv->as_mv.col = ref_col;
+
+  // Baseline value at the centre
+  bestsad = fn_ptr->sdf(what, what_stride,
+                        bestaddress, in_what_stride, 0x7fffffff)
+            + mvsad_err_cost(best_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);
+
+  // Apply further limits to prevent us from using vectors that stretch
+  // beyond the UMV border
+  if (col_min < x->mv_col_min)
+    col_min = x->mv_col_min;
+
+  if (col_max > x->mv_col_max)
+    col_max = x->mv_col_max;
+
+  if (row_min < x->mv_row_min)
+    row_min = x->mv_row_min;
+
+  if (row_max > x->mv_row_max)
+    row_max = x->mv_row_max;
+
+  for (r = row_min; r < row_max; r++) {
+    this_mv.as_mv.row = r;
+    check_here = r * mv_stride + in_what + col_min;
+    c = col_min;
+
+    while ((c + 2) < col_max) {
+      int i;
+
+      fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array);
+
+      for (i = 0; i < 3; i++) {
+        thissad = sad_array[i];
+
+        if (thissad < bestsad) {
+          this_mv.as_mv.col = c;
+          thissad  += mvsad_err_cost(&this_mv, &fcenter_mv,
+                                     MVSADCOSTS, sad_per_bit);
+
+          if (thissad < bestsad) {
+            bestsad = thissad;
+            best_mv->as_mv.row = r;
+            best_mv->as_mv.col = c;
+            bestaddress = check_here;
+          }
+        }
+
+        check_here++;
+        c++;
+      }
+    }
+
+    while (c < col_max) {
+      thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+
+      if (thissad < bestsad) {
+        this_mv.as_mv.col = c;
+        thissad  += mvsad_err_cost(&this_mv, &fcenter_mv,
+                                   MVSADCOSTS, sad_per_bit);
+
+        if (thissad < bestsad) {
+          bestsad = thissad;
+          best_mv->as_mv.row = r;
+          best_mv->as_mv.col = c;
+          bestaddress = check_here;
+        }
+      }
+
+      check_here++;
+      c++;
+    }
+
+  }
+
+  this_mv.as_mv.row = best_mv->as_mv.row << 3;
+  this_mv.as_mv.col = best_mv->as_mv.col << 3;
+
+  if (bestsad < INT_MAX)
+    return
+        fn_ptr->vf(what, what_stride, bestaddress, in_what_stride,
+                   (unsigned int *)(&thissad)) +
+        mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
+                    xd->allow_high_precision_mv);
+  else
+    return INT_MAX;
+}
+
+int vp9_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+                          int sad_per_bit, int distance,
+                          vp9_variance_fn_ptr_t *fn_ptr,
+                          DEC_MVCOSTS,
+                          int_mv *center_mv) {
+  unsigned char *what = (*(b->base_src) + b->src);
+  int what_stride = b->src_stride;
+  unsigned char *in_what;
+  int in_what_stride = d->pre_stride;
+  int mv_stride = d->pre_stride;
+  unsigned char *bestaddress;
+  int_mv *best_mv = &d->bmi.as_mv.first;
+  int_mv this_mv;
+  int bestsad = INT_MAX;
+  int r, c;
+
+  unsigned char *check_here;
+  unsigned int thissad;
+  MACROBLOCKD *xd = &x->e_mbd;
+
+  int ref_row = ref_mv->as_mv.row;
+  int ref_col = ref_mv->as_mv.col;
+
+  int row_min = ref_row - distance;
+  int row_max = ref_row + distance;
+  int col_min = ref_col - distance;
+  int col_max = ref_col + distance;
+
+  DECLARE_ALIGNED_ARRAY(16, unsigned short, sad_array8, 8);
+  unsigned int sad_array[3];
+  int_mv fcenter_mv;
+
+  int *mvjsadcost = x->nmvjointsadcost;
+  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
+
+  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+  // Work out the mid point for the search
+  in_what = *(d->base_pre) + d->pre;
+  bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
+
+  best_mv->as_mv.row = ref_row;
+  best_mv->as_mv.col = ref_col;
+
+  // Baseline value at the centre
+  bestsad = fn_ptr->sdf(what, what_stride,
+                        bestaddress, in_what_stride, 0x7fffffff)
+            + mvsad_err_cost(best_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);
+
+  // Apply further limits to prevent us from using vectors that stretch
+  // beyond the UMV border
+  if (col_min < x->mv_col_min)
+    col_min = x->mv_col_min;
+
+  if (col_max > x->mv_col_max)
+    col_max = x->mv_col_max;
+
+  if (row_min < x->mv_row_min)
+    row_min = x->mv_row_min;
+
+  if (row_max > x->mv_row_max)
+    row_max = x->mv_row_max;
+
+  for (r = row_min; r < row_max; r++) {
+    this_mv.as_mv.row = r;
+    check_here = r * mv_stride + in_what + col_min;
+    c = col_min;
+
+    while ((c + 7) < col_max) {
+      int i;
+
+      fn_ptr->sdx8f(what, what_stride, check_here, in_what_stride, sad_array8);
+
+      for (i = 0; i < 8; i++) {
+        thissad = (unsigned int)sad_array8[i];
+
+        if (thissad < bestsad) {
+          this_mv.as_mv.col = c;
+          thissad  += mvsad_err_cost(&this_mv, &fcenter_mv,
+                                     MVSADCOSTS, sad_per_bit);
+
+          if (thissad < bestsad) {
+            bestsad = thissad;
+            best_mv->as_mv.row = r;
+            best_mv->as_mv.col = c;
+            bestaddress = check_here;
+          }
+        }
+
+        check_here++;
+        c++;
+      }
+    }
+
+    while ((c + 2) < col_max) {
+      int i;
+
+      fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array);
+
+      for (i = 0; i < 3; i++) {
+        thissad = sad_array[i];
+
+        if (thissad < bestsad) {
+          this_mv.as_mv.col = c;
+          thissad  += mvsad_err_cost(&this_mv, &fcenter_mv,
+                                     MVSADCOSTS, sad_per_bit);
+
+          if (thissad < bestsad) {
+            bestsad = thissad;
+            best_mv->as_mv.row = r;
+            best_mv->as_mv.col = c;
+            bestaddress = check_here;
+          }
+        }
+
+        check_here++;
+        c++;
+      }
+    }
+
+    while (c < col_max) {
+      thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+
+      if (thissad < bestsad) {
+        this_mv.as_mv.col = c;
+        thissad  += mvsad_err_cost(&this_mv, &fcenter_mv,
+                                   MVSADCOSTS, sad_per_bit);
+
+        if (thissad < bestsad) {
+          bestsad = thissad;
+          best_mv->as_mv.row = r;
+          best_mv->as_mv.col = c;
+          bestaddress = check_here;
+        }
+      }
+
+      check_here++;
+      c++;
+    }
+  }
+
+  this_mv.as_mv.row = best_mv->as_mv.row << 3;
+  this_mv.as_mv.col = best_mv->as_mv.col << 3;
+
+  if (bestsad < INT_MAX)
+    return
+        fn_ptr->vf(what, what_stride, bestaddress, in_what_stride,
+                   (unsigned int *)(&thissad)) +
+        mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
+                    xd->allow_high_precision_mv);
+  else
+    return INT_MAX;
+}
+
+int vp9_refining_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+                            int error_per_bit, int search_range,
+                            vp9_variance_fn_ptr_t *fn_ptr, DEC_MVCOSTS,
+                            int_mv *center_mv) {
+  MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
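+  // 1-away diamond pattern: above, left, right, below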
+  int i, j;
+  short this_row_offset, this_col_offset;
+
+  int what_stride = b->src_stride;
+  int in_what_stride = d->pre_stride;
+  unsigned char *what = (*(b->base_src) + b->src);
+  unsigned char *best_address = (unsigned char *)(*(d->base_pre) + d->pre +
+                                                  (ref_mv->as_mv.row * (d->pre_stride)) + ref_mv->as_mv.col);
+  unsigned char *check_here;
+  unsigned int thissad;
+  int_mv this_mv;
+  unsigned int bestsad = INT_MAX;
+  MACROBLOCKD *xd = &x->e_mbd;
+  int_mv fcenter_mv;
+
+  int *mvjsadcost = x->nmvjointsadcost;
+  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
+
+  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+  bestsad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride, 0x7fffffff) +
+      mvsad_err_cost(ref_mv, &fcenter_mv, MVSADCOSTS, error_per_bit);
+
+  for (i = 0; i < search_range; i++) {
+    int best_site = -1;
+
+    for (j = 0; j < 4; j++) {
+      this_row_offset = ref_mv->as_mv.row + neighbors[j].row;
+      this_col_offset = ref_mv->as_mv.col + neighbors[j].col;
+
+      if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
+          (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) {
+        check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col + best_address;
+        thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+
+        if (thissad < bestsad) {
+          this_mv.as_mv.row = this_row_offset;
+          this_mv.as_mv.col = this_col_offset;
+          thissad += mvsad_err_cost(&this_mv, &fcenter_mv, MVSADCOSTS, error_per_bit);
+
+          if (thissad < bestsad) {
+            bestsad = thissad;
+            best_site = j;
+          }
+        }
+      }
+    }
+
+    if (best_site == -1)
+      break;
+    else {
+      ref_mv->as_mv.row += neighbors[best_site].row;
+      ref_mv->as_mv.col += neighbors[best_site].col;
+      best_address += (neighbors[best_site].row) * in_what_stride + neighbors[best_site].col;
+    }
+  }
+
+  this_mv.as_mv.row = ref_mv->as_mv.row << 3;
+  this_mv.as_mv.col = ref_mv->as_mv.col << 3;
+
+  if (bestsad < INT_MAX)
+    return
+        fn_ptr->vf(what, what_stride, best_address, in_what_stride,
+                   (unsigned int *)(&thissad)) +
+        mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
+                    xd->allow_high_precision_mv);
+  else
+    return INT_MAX;
+}
+
+int vp9_refining_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+                              int_mv *ref_mv, int error_per_bit,
+                              int search_range, vp9_variance_fn_ptr_t *fn_ptr,
+                              DEC_MVCOSTS, int_mv *center_mv) {
+  MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
+  int i, j;
+  short this_row_offset, this_col_offset;
+
+  int what_stride = b->src_stride;
+  int in_what_stride = d->pre_stride;
+  unsigned char *what = (*(b->base_src) + b->src);
+  unsigned char *best_address = (unsigned char *)(*(d->base_pre) + d->pre +
+                                                  (ref_mv->as_mv.row * (d->pre_stride)) + ref_mv->as_mv.col);
+  unsigned char *check_here;
+  unsigned int thissad;
+  int_mv this_mv;
+  unsigned int bestsad = INT_MAX;
+  MACROBLOCKD *xd = &x->e_mbd;
+  int_mv fcenter_mv;
+
+  int *mvjsadcost = x->nmvjointsadcost;
+  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
+
+  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+  bestsad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride, 0x7fffffff) +
+      mvsad_err_cost(ref_mv, &fcenter_mv, MVSADCOSTS, error_per_bit);
+
+  for (i = 0; i < search_range; i++) {
+    int best_site = -1;
+    int all_in = 1;
+
+    all_in &= ((ref_mv->as_mv.row - 1) > x->mv_row_min);
+    all_in &= ((ref_mv->as_mv.row + 1) < x->mv_row_max);
+    all_in &= ((ref_mv->as_mv.col - 1) > x->mv_col_min);
+    all_in &= ((ref_mv->as_mv.col + 1) < x->mv_col_max);
+
+    if (all_in) {
+      unsigned int sad_array[4];
+      unsigned char *block_offset[4];
+      block_offset[0] = best_address - in_what_stride;
+      block_offset[1] = best_address - 1;
+      block_offset[2] = best_address + 1;
+      block_offset[3] = best_address + in_what_stride;
+
+      fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, sad_array);
+
+      for (j = 0; j < 4; j++) {
+        if (sad_array[j] < bestsad) {
+          this_mv.as_mv.row = ref_mv->as_mv.row + neighbors[j].row;
+          this_mv.as_mv.col = ref_mv->as_mv.col + neighbors[j].col;
+          sad_array[j] += mvsad_err_cost(&this_mv, &fcenter_mv, MVSADCOSTS, error_per_bit);
+
+          if (sad_array[j] < bestsad) {
+            bestsad = sad_array[j];
+            best_site = j;
+          }
+        }
+      }
+    } else {
+      for (j = 0; j < 4; j++) {
+        this_row_offset = ref_mv->as_mv.row + neighbors[j].row;
+        this_col_offset = ref_mv->as_mv.col + neighbors[j].col;
+
+        if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
+            (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) {
+          check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col + best_address;
+          thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+
+          if (thissad < bestsad) {
+            this_mv.as_mv.row = this_row_offset;
+            this_mv.as_mv.col = this_col_offset;
+            thissad += mvsad_err_cost(&this_mv, &fcenter_mv, MVSADCOSTS, error_per_bit);
+
+            if (thissad < bestsad) {
+              bestsad = thissad;
+              best_site = j;
+            }
+          }
+        }
+      }
+    }
+
+    if (best_site == -1)
+      break;
+    else {
+      ref_mv->as_mv.row += neighbors[best_site].row;
+      ref_mv->as_mv.col += neighbors[best_site].col;
+      best_address += (neighbors[best_site].row) * in_what_stride + neighbors[best_site].col;
+    }
+  }
+
+  this_mv.as_mv.row = ref_mv->as_mv.row << 3;
+  this_mv.as_mv.col = ref_mv->as_mv.col << 3;
+
+  if (bestsad < INT_MAX)
+    return
+        fn_ptr->vf(what, what_stride, best_address, in_what_stride,
+                   (unsigned int *)(&thissad)) +
+        mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
+                    xd->allow_high_precision_mv);
+  else
+    return INT_MAX;
+}
+
+
+
+#ifdef ENTROPY_STATS
+void print_mode_context(void) {
+  FILE *f = fopen("modecont.c", "a");
+  int i, j;
+
+  // Guard against fopen failure in this debug-stats helper.
+  if (!f)
+    return;
+
+  fprintf(f, "#include \"entropy.h\"\n");
+  fprintf(f, "const int vp9_mode_contexts[6][4] =");
+  fprintf(f, "{\n");
+  for (j = 0; j < 6; j++) {
+    fprintf(f, "  {/* %d */ ", j);
+    fprintf(f, "    ");
+    for (i = 0; i < 4; i++) {
+      int this_prob;
+      int count;
+
+      // context probs
+      count = mv_ref_ct[j][i][0] + mv_ref_ct[j][i][1];
+      if (count)
+        this_prob = 256 * mv_ref_ct[j][i][0] / count;
+      else
+        this_prob = 128;
+
+      if (this_prob == 0)
+        this_prob = 1;
+      fprintf(f, "%5d, ", this_prob);
+    }
+    fprintf(f, "  },\n");
+  }
+
+  fprintf(f, "};\n");
+  fclose(f);
+}
+
+/* MV ref count ENTROPY_STATS code */
+void init_mv_ref_counts() {
+  vpx_memset(mv_ref_ct, 0, sizeof(mv_ref_ct));
+  vpx_memset(mv_mode_cts, 0, sizeof(mv_mode_cts));
+}
+
+void accum_mv_refs(MB_PREDICTION_MODE m, const int ct[4]) {
+  if (m == ZEROMV) {
+    ++mv_ref_ct[ct[0]][0][0];
+    ++mv_mode_cts[0][0];
+  } else {
+    ++mv_ref_ct[ct[0]][0][1];
+    ++mv_mode_cts[0][1];
+
+    if (m == NEARESTMV) {
+      ++mv_ref_ct[ct[1]][1][0];
+      ++mv_mode_cts[1][0];
+    } else {
+      ++mv_ref_ct[ct[1]][1][1];
+      ++mv_mode_cts[1][1];
+
+      if (m == NEARMV) {
+        ++mv_ref_ct[ct[2]][2][0];
+        ++mv_mode_cts[2][0];
+      } else {
+        ++mv_ref_ct[ct[2]][2][1];
+        ++mv_mode_cts[2][1];
+
+        if (m == NEWMV) {
+          ++mv_ref_ct[ct[3]][3][0];
+          ++mv_mode_cts[3][0];
+        } else {
+          ++mv_ref_ct[ct[3]][3][1];
+          ++mv_mode_cts[3][1];
+        }
+      }
+    }
+  }
+}
+
+#endif  /* END MV ref count ENTROPY_STATS code */
--- /dev/null
+++ b/vp9/encoder/mcomp.h
@@ -1,0 +1,159 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_MCOMP_H
+#define __INC_MCOMP_H
+
+#include "block.h"
+#include "variance.h"
+
+#define MVCOSTS mvjcost, mvcost
+#define MVSADCOSTS mvjsadcost, mvsadcost
+#define DEC_MVCOSTS int *mvjcost, int *mvcost[2]
+#define DEC_MVSADCOSTS int *mvjsadcost, int *mvsadcost[2]
+#define NULLMVCOST NULL, NULL
+#define XMVCOST x->nmvjointcost, (x->e_mbd.allow_high_precision_mv?x->nmvcost_hp:x->nmvcost)
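+// MVCOSTS/MVSADCOSTS pass on the cost tables that DEC_MVCOSTS/DEC_MVSADCOSTS
+// declare, keeping call sites and prototypes in sync. XMVCOST selects the
+// high-precision nmv cost tables when 1/8-pel motion vectors are allowed.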
+
+#ifdef ENTROPY_STATS
+extern void init_mv_ref_counts();
+extern void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]);
+#endif
+
+
+// The maximum number of steps in a step search, given the largest allowed
+// initial step.
+#define MAX_MVSEARCH_STEPS 8
+// Max full pel mv specified in 1 pel units.
+#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS)) - 1)
+// Maximum size of the first step in full pel units.
+#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS - 1))
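+// With MAX_MVSEARCH_STEPS == 8, MAX_FULL_PEL_VAL is 255 and MAX_FIRST_STEP
+// is 128 full-pel units.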
+
+extern void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv);
+extern int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, DEC_MVCOSTS,
+                           int Weight, int ishp);
+extern void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride);
+extern void vp9_init3smotion_compensation(MACROBLOCK *x,  int stride);
+// Runs a sequence of diamond searches with progressively smaller steps for RD
+struct VP9_COMP;
+int vp9_full_pixel_diamond(struct VP9_COMP *cpi, MACROBLOCK *x, BLOCK *b,
+                           BLOCKD *d, int_mv *mvp_full, int step_param,
+                           int sadpb, int further_steps, int do_refine,
+                           vp9_variance_fn_ptr_t *fn_ptr,
+                           int_mv *ref_mv, int_mv *dst_mv);
+
+extern int vp9_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+                          int_mv *ref_mv, int_mv *best_mv, int search_param,
+                          int error_per_bit,
+                          const vp9_variance_fn_ptr_t *vf,
+                          DEC_MVSADCOSTS, DEC_MVCOSTS, int_mv *center_mv);
+
+typedef int (fractional_mv_step_fp)
+(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *bestmv, int_mv *ref_mv,
+ int error_per_bit, const vp9_variance_fn_ptr_t *vfp, DEC_MVCOSTS,
+ int *distortion, unsigned int *sse);
+extern fractional_mv_step_fp vp9_find_best_sub_pixel_step_iteratively;
+extern fractional_mv_step_fp vp9_find_best_sub_pixel_step;
+extern fractional_mv_step_fp vp9_find_best_half_pixel_step;
+
+#define prototype_full_search_sad(sym)\
+  int (sym)\
+  (\
+   MACROBLOCK *x, \
+   BLOCK *b, \
+   BLOCKD *d, \
+   int_mv *ref_mv, \
+   int sad_per_bit, \
+   int distance, \
+   vp9_variance_fn_ptr_t *fn_ptr, \
+   DEC_MVSADCOSTS, \
+   int_mv *center_mv \
+  )
+
+#define prototype_refining_search_sad(sym)\
+  int (sym)\
+  (\
+   MACROBLOCK *x, \
+   BLOCK *b, \
+   BLOCKD *d, \
+   int_mv *ref_mv, \
+   int sad_per_bit, \
+   int distance, \
+   vp9_variance_fn_ptr_t *fn_ptr, \
+   DEC_MVSADCOSTS, \
+   int_mv *center_mv \
+  )
+
+#define prototype_diamond_search_sad(sym)\
+  int (sym)\
+  (\
+   MACROBLOCK *x, \
+   BLOCK *b, \
+   BLOCKD *d, \
+   int_mv *ref_mv, \
+   int_mv *best_mv, \
+   int search_param, \
+   int sad_per_bit, \
+   int *num00, \
+   vp9_variance_fn_ptr_t *fn_ptr, \
+   DEC_MVSADCOSTS, \
+   int_mv *center_mv \
+  )
+
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/mcomp_x86.h"
+#endif
+
+typedef prototype_full_search_sad(*vp9_full_search_fn_t);
+extern prototype_full_search_sad(vp9_full_search_sad);
+extern prototype_full_search_sad(vp9_full_search_sadx3);
+extern prototype_full_search_sad(vp9_full_search_sadx8);
+
+typedef prototype_refining_search_sad(*vp9_refining_search_fn_t);
+extern prototype_refining_search_sad(vp9_refining_search_sad);
+extern prototype_refining_search_sad(vp9_refining_search_sadx4);
+
+typedef prototype_diamond_search_sad(*vp9_diamond_search_fn_t);
+extern prototype_diamond_search_sad(vp9_diamond_search_sad);
+extern prototype_diamond_search_sad(vp9_diamond_search_sadx4);
+
+#ifndef vp9_search_full_search
+#define vp9_search_full_search vp9_full_search_sad
+#endif
+extern prototype_full_search_sad(vp9_search_full_search);
+
+#ifndef vp9_search_refining_search
+#define vp9_search_refining_search vp9_refining_search_sad
+#endif
+extern prototype_refining_search_sad(vp9_search_refining_search);
+
+#ifndef vp9_search_diamond_search
+#define vp9_search_diamond_search vp9_diamond_search_sad
+#endif
+extern prototype_diamond_search_sad(vp9_search_diamond_search);
+
+typedef struct {
+  prototype_full_search_sad(*full_search);
+  prototype_refining_search_sad(*refining_search);
+  prototype_diamond_search_sad(*diamond_search);
+} vp9_search_rtcd_vtable_t;
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define SEARCH_INVOKE(ctx,fn) (ctx)->fn
+#else
+#define SEARCH_INVOKE(ctx,fn) vp9_search_##fn
+#endif
+
+#endif
--- /dev/null
+++ b/vp9/encoder/modecosts.c
@@ -1,0 +1,49 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp9/common/blockd.h"
+#include "onyx_int.h"
+#include "treewriter.h"
+#include "vp9/common/entropymode.h"
+
+
+void vp9_init_mode_costs(VP9_COMP *c) {
+  VP9_COMMON *x = &c->common;
+  const vp9_tree_p T = vp9_bmode_tree;
+  int i, j;
+
+  for (i = 0; i < VP9_BINTRAMODES; i++) {
+    for (j = 0; j < VP9_BINTRAMODES; j++) {
+      vp9_cost_tokens((int *)c->mb.bmode_costs[i][j],
+                      x->kf_bmode_prob[i][j], T);
+    }
+  }
+
+  vp9_cost_tokens((int *)c->mb.inter_bmode_costs, x->fc.bmode_prob, T);
+  vp9_cost_tokens((int *)c->mb.inter_bmode_costs,
+                  x->fc.sub_mv_ref_prob[0], vp9_sub_mv_ref_tree);
+
+  vp9_cost_tokens(c->mb.mbmode_cost[1], x->fc.ymode_prob, vp9_ymode_tree);
+  vp9_cost_tokens(c->mb.mbmode_cost[0],
+                  x->kf_ymode_prob[c->common.kf_ymode_probs_index],
+                  vp9_kf_ymode_tree);
+  vp9_cost_tokens(c->mb.intra_uv_mode_cost[1],
+                  x->fc.uv_mode_prob[VP9_YMODES - 1], vp9_uv_mode_tree);
+  vp9_cost_tokens(c->mb.intra_uv_mode_cost[0],
+                  x->kf_uv_mode_prob[VP9_YMODES - 1], vp9_uv_mode_tree);
+  vp9_cost_tokens(c->mb.i8x8_mode_costs,
+                  x->fc.i8x8_mode_prob, vp9_i8x8_mode_tree);
+
+  for (i = 0; i <= VP9_SWITCHABLE_FILTERS; ++i)
+    vp9_cost_tokens((int *)c->mb.switchable_interp_costs[i],
+                    x->fc.switchable_interp_prob[i],
+                    vp9_switchable_interp_tree);
+}
--- /dev/null
+++ b/vp9/encoder/modecosts.h
@@ -1,0 +1,17 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_MODECOSTS_H
+#define __INC_MODECOSTS_H
+
+void vp9_init_mode_costs(VP9_COMP *x);
+
+#endif
--- /dev/null
+++ b/vp9/encoder/onyx_if.c
@@ -1,0 +1,4486 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp9/common/onyxc_int.h"
+#include "onyx_int.h"
+#include "vp9/common/systemdependent.h"
+#include "quantize.h"
+#include "vp9/common/alloccommon.h"
+#include "mcomp.h"
+#include "firstpass.h"
+#include "psnr.h"
+#include "vpx_scale/vpxscale.h"
+#include "vp9/common/extend.h"
+#include "ratectrl.h"
+#include "vp9/common/quant_common.h"
+#include "segmentation.h"
+#include "vpx_scale/yv12extend.h"
+#if CONFIG_POSTPROC
+#include "vp9/common/postproc.h"
+#endif
+#include "vpx_mem/vpx_mem.h"
+#include "vp9/common/swapyv12buffer.h"
+#include "vpx_ports/vpx_timer.h"
+#include "temporal_filter.h"
+
+#include "vp9/common/seg_common.h"
+#include "mbgraph.h"
+#include "vp9/common/pred_common.h"
+#include "vp9/encoder/rdopt.h"
+#include "bitstream.h"
+#include "ratectrl.h"
+
+#if CONFIG_NEWBESTREFMV
+#include "vp9/common/mvref_common.h"
+#endif
+
+#if ARCH_ARM
+#include "vpx_ports/arm.h"
+#endif
+
+#include <math.h>
+#include <stdio.h>
+#include <limits.h>
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define IF_RTCD(x) (x)
+#define RTCD(x) &cpi->common.rtcd.x
+#else
+#define IF_RTCD(x) NULL
+#define RTCD(x) NULL
+#endif
+
+extern void vp9_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi);
+
+extern void vp9_set_alt_lf_level(VP9_COMP *cpi, int filt_val);
+
+extern void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi);
+
+extern void vp9_cmachine_specific_config(VP9_COMP *cpi);
+
+extern void vp9_deblock_frame(YV12_BUFFER_CONFIG *source,
+                              YV12_BUFFER_CONFIG *post,
+                              int filt_lvl, int low_var_thresh, int flag);
+
+extern void print_tree_update_probs();
+
+#if HAVE_ARMV7
+extern void vp8_yv12_copy_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc,
+                                          YV12_BUFFER_CONFIG *dst_ybc);
+
+extern void vp8_yv12_copy_src_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc,
+                                              YV12_BUFFER_CONFIG *dst_ybc);
+#endif
+
+int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest);
+
+extern void vp9_temporal_filter_prepare_c(VP9_COMP *cpi, int distance);
+
+static void set_default_lf_deltas(VP9_COMP *cpi);
+
+#define DEFAULT_INTERP_FILTER EIGHTTAP  /* SWITCHABLE for better performance */
+#define SEARCH_BEST_FILTER 0            /* to search exhaustively for
+                                           best filter */
+#define RESET_FOREACH_FILTER 0          /* whether to reset the encoder state
+                                           before trying each new filter */
+#define SHARP_FILTER_QTHRESH 0          /* Q threshold for 8-tap sharp filter */
+
+#define ALTREF_HIGH_PRECISION_MV 1      /* whether to use high precision mv
+                                           for altref computation */
+#define HIGH_PRECISION_MV_QTHRESH 200   /* Q threshold for use of high precision
+                                           mv. Choose a very high value for
+                                           now so that HIGH_PRECISION is always
+                                           chosen */
+
+#if CONFIG_INTERNAL_STATS
+#include "math.h"
+
+extern double vp9_calc_ssim(YV12_BUFFER_CONFIG *source,
+                            YV12_BUFFER_CONFIG *dest, int lumamask,
+                            double *weight);
+
+
+extern double vp9_calc_ssimg(YV12_BUFFER_CONFIG *source,
+                             YV12_BUFFER_CONFIG *dest, double *ssim_y,
+                             double *ssim_u, double *ssim_v);
+
+
+#endif
+
+// #define OUTPUT_YUV_REC
+
+#ifdef OUTPUT_YUV_SRC
+FILE *yuv_file;
+#endif
+#ifdef OUTPUT_YUV_REC
+FILE *yuv_rec_file;
+#endif
+
+#if 0
+FILE *framepsnr;
+FILE *kf_list;
+FILE *keyfile;
+#endif
+
+#if 0
+extern int skip_true_count;
+extern int skip_false_count;
+#endif
+
+
+#ifdef ENTROPY_STATS
+extern int intra_mode_stats[VP9_BINTRAMODES][VP9_BINTRAMODES][VP9_BINTRAMODES];
+#endif
+
+#ifdef NMV_STATS
+extern void init_nmvstats();
+extern void print_nmvstats();
+#endif
+
+#ifdef SPEEDSTATS
+unsigned int frames_at_speed[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+#endif
+
+#if defined(SECTIONBITS_OUTPUT)
+extern unsigned __int64 Sectionbits[500];
+#endif
+#ifdef MODE_STATS
+extern INT64 Sectionbits[500];
+extern unsigned int y_modes[VP9_YMODES];
+extern unsigned int i8x8_modes[VP9_I8X8_MODES];
+extern unsigned int uv_modes[VP9_UV_MODES];
+extern unsigned int uv_modes_y[VP9_YMODES][VP9_UV_MODES];
+extern unsigned int b_modes[B_MODE_COUNT];
+extern unsigned int inter_y_modes[MB_MODE_COUNT];
+extern unsigned int inter_uv_modes[VP9_UV_MODES];
+extern unsigned int inter_b_modes[B_MODE_COUNT];
+#endif
+
+extern void vp9_init_quantizer(VP9_COMP *cpi);
+
+static int base_skip_false_prob[QINDEX_RANGE][3];
+
+// Tables relating active max Q to active min Q
+static int kf_low_motion_minq[QINDEX_RANGE];
+static int kf_high_motion_minq[QINDEX_RANGE];
+static int gf_low_motion_minq[QINDEX_RANGE];
+static int gf_high_motion_minq[QINDEX_RANGE];
+static int inter_minq[QINDEX_RANGE];
+
+// Functions to compute the active minq lookup table entries based on a
+// formulaic approach to facilitate easier adjustment of the Q tables.
+// The formulae were derived from computing a 3rd order polynomial best
+// fit to the original data (after plotting real maxq vs minq (not q index))
+static int calculate_minq_index(double maxq,
+                                double x3, double x2, double x, double c) {
+  int i;
+  double minqtarget;
+  double thisq;
+
+  minqtarget = ((x3 * maxq * maxq * maxq) +
+                (x2 * maxq * maxq) +
+                (x * maxq) +
+                c);
+
+  if (minqtarget > maxq)
+    minqtarget = maxq;
+
+  for (i = 0; i < QINDEX_RANGE; i++) {
+    thisq = vp9_convert_qindex_to_q(i);
+    if (minqtarget <= thisq)
+      return i;
+  }
+  return QINDEX_RANGE - 1;
+}
+
+static void init_minq_luts(void) {
+  int i;
+  double maxq;
+
+  for (i = 0; i < QINDEX_RANGE; i++) {
+    maxq = vp9_convert_qindex_to_q(i);
+
+
+    kf_low_motion_minq[i] = calculate_minq_index(maxq,
+                                                 0.0000003,
+                                                 -0.000015,
+                                                 0.074,
+                                                 0.0);
+    kf_high_motion_minq[i] = calculate_minq_index(maxq,
+                                                  0.0000004,
+                                                  -0.000125,
+                                                  0.14,
+                                                  0.0);
+    gf_low_motion_minq[i] = calculate_minq_index(maxq,
+                                                 0.0000015,
+                                                 -0.0009,
+                                                 0.33,
+                                                 0.0);
+    gf_high_motion_minq[i] = calculate_minq_index(maxq,
+                                                  0.0000021,
+                                                  -0.00125,
+                                                  0.45,
+                                                  0.0);
+    inter_minq[i] = calculate_minq_index(maxq,
+                                         0.00000271,
+                                         -0.00113,
+                                         0.697,
+                                         0.0);
+
+  }
+}
+
+static void init_base_skip_probs(void) {
+  int i;
+  double q;
+  int skip_prob, t;
+
+  for (i = 0; i < QINDEX_RANGE; i++) {
+    q = vp9_convert_qindex_to_q(i);
+
+    // Exponential decay calculation of baseline skip prob with clamping
+    // Based on crude best fit of old table.
+    t = (int)(564.25 * pow(2.71828, (-0.012 * q)));
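+    // e.g. q == 0 gives t == 564 (clamped to 255 below), while q == 100
+    // gives t of roughly 170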
+
+    skip_prob = t;
+    if (skip_prob < 1)
+      skip_prob = 1;
+    else if (skip_prob > 255)
+      skip_prob = 255;
+    base_skip_false_prob[i][1] = skip_prob;
+
+    skip_prob = t * 0.75;
+    if (skip_prob < 1)
+      skip_prob = 1;
+    else if (skip_prob > 255)
+      skip_prob = 255;
+    base_skip_false_prob[i][2] = skip_prob;
+
+    skip_prob = t * 1.25;
+    if (skip_prob < 1)
+      skip_prob = 1;
+    else if (skip_prob > 255)
+      skip_prob = 255;
+    base_skip_false_prob[i][0] = skip_prob;
+  }
+}
+
+static void update_base_skip_probs(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+
+  if (cm->frame_type != KEY_FRAME) {
+    vp9_update_skip_probs(cpi);
+
+    if (cm->refresh_alt_ref_frame) {
+      int k;
+      for (k = 0; k < MBSKIP_CONTEXTS; ++k)
+        cpi->last_skip_false_probs[2][k] = cm->mbskip_pred_probs[k];
+      cpi->last_skip_probs_q[2] = cm->base_qindex;
+    } else if (cpi->common.refresh_golden_frame) {
+      int k;
+      for (k = 0; k < MBSKIP_CONTEXTS; ++k)
+        cpi->last_skip_false_probs[1][k] = cm->mbskip_pred_probs[k];
+      cpi->last_skip_probs_q[1] = cm->base_qindex;
+    } else {
+      int k;
+      for (k = 0; k < MBSKIP_CONTEXTS; ++k)
+        cpi->last_skip_false_probs[0][k] = cm->mbskip_pred_probs[k];
+      cpi->last_skip_probs_q[0] = cm->base_qindex;
+
+      // update the baseline table for the current q
+      for (k = 0; k < MBSKIP_CONTEXTS; ++k)
+        cpi->base_skip_false_prob[cm->base_qindex][k] =
+          cm->mbskip_pred_probs[k];
+    }
+  }
+
+}
+
+void vp9_initialize_enc() {
+  static int init_done = 0;
+
+  if (!init_done) {
+    vp8_scale_machine_specific_config();
+    vp9_initialize_common();
+    vp9_tokenize_initialize();
+    vp9_init_quant_tables();
+    vp9_init_me_luts();
+    init_minq_luts();
+    init_base_skip_probs();
+    init_done = 1;
+  }
+}
+#ifdef PACKET_TESTING
+extern FILE *vpxlogc;
+#endif
+
+static void setup_features(VP9_COMP *cpi) {
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
+
+  // Set up default state for MB feature flags
+
+  xd->segmentation_enabled = 0;   // Default segmentation disabled
+
+  xd->update_mb_segmentation_map = 0;
+  xd->update_mb_segmentation_data = 0;
+  vpx_memset(xd->mb_segment_tree_probs, 255, sizeof(xd->mb_segment_tree_probs));
+
+  vp9_clearall_segfeatures(xd);
+
+  xd->mode_ref_lf_delta_enabled = 0;
+  xd->mode_ref_lf_delta_update = 0;
+  vpx_memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas));
+  vpx_memset(xd->mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas));
+  vpx_memset(xd->last_ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas));
+  vpx_memset(xd->last_mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas));
+
+  set_default_lf_deltas(cpi);
+
+}
+
+
+static void dealloc_compressor_data(VP9_COMP *cpi) {
+  vpx_free(cpi->tplist);
+  cpi->tplist = NULL;
+
+  // Delete last frame MV storage buffers
+  vpx_free(cpi->lfmv);
+  cpi->lfmv = 0;
+
+  vpx_free(cpi->lf_ref_frame_sign_bias);
+  cpi->lf_ref_frame_sign_bias = 0;
+
+  vpx_free(cpi->lf_ref_frame);
+  cpi->lf_ref_frame = 0;
+
+  // Delete segmentation map
+  vpx_free(cpi->segmentation_map);
+  cpi->segmentation_map = 0;
+  vpx_free(cpi->common.last_frame_seg_map);
+  cpi->common.last_frame_seg_map = 0;
+  vpx_free(cpi->coding_context.last_frame_seg_map_copy);
+  cpi->coding_context.last_frame_seg_map_copy = 0;
+
+  vpx_free(cpi->active_map);
+  cpi->active_map = 0;
+
+  vp9_de_alloc_frame_buffers(&cpi->common);
+
+  vp8_yv12_de_alloc_frame_buffer(&cpi->last_frame_uf);
+  vp8_yv12_de_alloc_frame_buffer(&cpi->scaled_source);
+#if VP9_TEMPORAL_ALT_REF
+  vp8_yv12_de_alloc_frame_buffer(&cpi->alt_ref_buffer);
+#endif
+  vp9_lookahead_destroy(cpi->lookahead);
+
+  vpx_free(cpi->tok);
+  cpi->tok = 0;
+
+  // Structure used to monitor GF usage
+  vpx_free(cpi->gf_active_flags);
+  cpi->gf_active_flags = 0;
+
+  // Activity mask based per mb zbin adjustments
+  vpx_free(cpi->mb_activity_map);
+  cpi->mb_activity_map = 0;
+  vpx_free(cpi->mb_norm_activity_map);
+  cpi->mb_norm_activity_map = 0;
+
+  vpx_free(cpi->mb.pip);
+  cpi->mb.pip = 0;
+
+  vpx_free(cpi->twopass.total_stats);
+  cpi->twopass.total_stats = 0;
+
+  vpx_free(cpi->twopass.total_left_stats);
+  cpi->twopass.total_left_stats = 0;
+
+  vpx_free(cpi->twopass.this_frame_stats);
+  cpi->twopass.this_frame_stats = 0;
+}
+
+// Computes a q delta (in "q index" terms) to get from a starting q value
+// to a target q value.
+static int compute_qdelta(VP9_COMP *cpi, double qstart, double qtarget) {
+  int i;
+  int start_index = cpi->worst_quality;
+  int target_index = cpi->worst_quality;
+
+  // Convert the average q value to an index.
+  for (i = cpi->best_quality; i < cpi->worst_quality; i++) {
+    start_index = i;
+    if (vp9_convert_qindex_to_q(i) >= qstart)
+      break;
+  }
+
+  // Convert the q target to an index
+  for (i = cpi->best_quality; i < cpi->worst_quality; i++) {
+    target_index = i;
+    if (vp9_convert_qindex_to_q(i) >= qtarget)
+      break;
+  }
+
+  return target_index - start_index;
+}
+
+static void init_seg_features(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
+
+  int high_q = (int)(cpi->avg_q > 48.0);
+  int qi_delta;
+
+  // Disable and clear down for KF
+  if (cm->frame_type == KEY_FRAME) {
+    // Clear down the global segmentation map
+    vpx_memset(cpi->segmentation_map, 0, (cm->mb_rows * cm->mb_cols));
+    xd->update_mb_segmentation_map = 0;
+    xd->update_mb_segmentation_data = 0;
+    cpi->static_mb_pct = 0;
+
+    // Disable segmentation
+    vp9_disable_segmentation((VP9_PTR)cpi);
+
+    // Clear down the segment features.
+    vp9_clearall_segfeatures(xd);
+  }
+
+  // If this is an alt ref frame
+  else if (cm->refresh_alt_ref_frame) {
+    // Clear down the global segmentation map
+    vpx_memset(cpi->segmentation_map, 0, (cm->mb_rows * cm->mb_cols));
+    xd->update_mb_segmentation_map = 0;
+    xd->update_mb_segmentation_data = 0;
+    cpi->static_mb_pct = 0;
+
+    // Disable segmentation and individual segment features by default
+    vp9_disable_segmentation((VP9_PTR)cpi);
+    vp9_clearall_segfeatures(xd);
+
+    // Scan frames from current to arf frame.
+    // This function re-enables segmentation if appropriate.
+    vp9_update_mbgraph_stats(cpi);
+
+    // If segmentation was enabled set those features needed for the
+    // arf itself.
+    if (xd->segmentation_enabled) {
+      xd->update_mb_segmentation_map = 1;
+      xd->update_mb_segmentation_data = 1;
+
+      qi_delta = compute_qdelta(cpi, cpi->avg_q, (cpi->avg_q * 0.875));
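+      // qi_delta is negative here (the target q is 87.5% of avg_q), so the
+      // arf segment is coded at a slightly lower (better) q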
+      vp9_set_segdata(xd, 1, SEG_LVL_ALT_Q, (qi_delta - 2));
+      vp9_set_segdata(xd, 1, SEG_LVL_ALT_LF, -2);
+
+      vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_Q);
+      vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_LF);
+
+      // Where relevant assume segment data is delta data
+      xd->mb_segment_abs_delta = SEGMENT_DELTADATA;
+
+    }
+  }
+  // All other frames if segmentation has been enabled
+  else if (xd->segmentation_enabled) {
+    // First normal frame in a valid gf or alt ref group
+    if (cpi->common.frames_since_golden == 0) {
+      // Set up segment features for normal frames in an af group
+      if (cpi->source_alt_ref_active) {
+        xd->update_mb_segmentation_map = 0;
+        xd->update_mb_segmentation_data = 1;
+        xd->mb_segment_abs_delta = SEGMENT_DELTADATA;
+
+        qi_delta = compute_qdelta(cpi, cpi->avg_q,
+                                  (cpi->avg_q * 1.125));
+        vp9_set_segdata(xd, 1, SEG_LVL_ALT_Q, (qi_delta + 2));
+        vp9_set_segdata(xd, 1, SEG_LVL_ALT_Q, 0);
+        vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_Q);
+
+        vp9_set_segdata(xd, 1, SEG_LVL_ALT_LF, -2);
+        vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_LF);
+
+        // Segment coding disabled for compred testing
+        if (high_q || (cpi->static_mb_pct == 100)) {
+          // set_segref(xd, 1, LAST_FRAME);
+          vp9_set_segref(xd, 1, ALTREF_FRAME);
+          vp9_enable_segfeature(xd, 1, SEG_LVL_REF_FRAME);
+
+          vp9_set_segdata(xd, 1, SEG_LVL_MODE, ZEROMV);
+          vp9_enable_segfeature(xd, 1, SEG_LVL_MODE);
+
+          // EOB segment coding not fixed for 8x8 yet
+          vp9_set_segdata(xd, 1, SEG_LVL_EOB, 0);
+          vp9_enable_segfeature(xd, 1, SEG_LVL_EOB);
+        }
+      }
+      // Disable segmentation and clear down features if alt ref
+      // is not active for this group
+      else {
+        vp9_disable_segmentation((VP9_PTR)cpi);
+
+        vpx_memset(cpi->segmentation_map, 0,
+                   (cm->mb_rows * cm->mb_cols));
+
+        xd->update_mb_segmentation_map = 0;
+        xd->update_mb_segmentation_data = 0;
+
+        vp9_clearall_segfeatures(xd);
+      }
+    }
+
+    // Special case where we are coding over the top of a previous
+    // alt ref frame
+    // Segment coding disabled for compred testing
+    else if (cpi->is_src_frame_alt_ref) {
+      // Enable mode and ref frame features for segment 0 as well
+      vp9_enable_segfeature(xd, 0, SEG_LVL_REF_FRAME);
+      vp9_enable_segfeature(xd, 0, SEG_LVL_MODE);
+      vp9_enable_segfeature(xd, 1, SEG_LVL_REF_FRAME);
+      vp9_enable_segfeature(xd, 1, SEG_LVL_MODE);
+
+      // All mbs should use ALTREF_FRAME, ZEROMV exclusively
+      vp9_clear_segref(xd, 0);
+      vp9_set_segref(xd, 0, ALTREF_FRAME);
+      vp9_clear_segref(xd, 1);
+      vp9_set_segref(xd, 1, ALTREF_FRAME);
+      vp9_set_segdata(xd, 0, SEG_LVL_MODE, ZEROMV);
+      vp9_set_segdata(xd, 1, SEG_LVL_MODE, ZEROMV);
+
+      // Skip all MBs if high Q
+      if (high_q) {
+        vp9_enable_segfeature(xd, 0, SEG_LVL_EOB);
+        vp9_set_segdata(xd, 0, SEG_LVL_EOB, 0);
+        vp9_enable_segfeature(xd, 1, SEG_LVL_EOB);
+        vp9_set_segdata(xd, 1, SEG_LVL_EOB, 0);
+      }
+      // Enable data update
+      xd->update_mb_segmentation_data = 1;
+    }
+    // All other frames.
+    else {
+      // No updates.. leave things as they are.
+      xd->update_mb_segmentation_map = 0;
+      xd->update_mb_segmentation_data = 0;
+    }
+  }
+}
+
+// DEBUG: Print out the segment id of each MB in the current frame.
+static void print_seg_map(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+  int row, col;
+  int map_index = 0;
+  FILE *statsfile;
+
+  statsfile = fopen("segmap.stt", "a");
+  if (!statsfile)
+    return;
+
+  fprintf(statsfile, "%10d\n", cm->current_video_frame);
+
+  for (row = 0; row < cpi->common.mb_rows; row++) {
+    for (col = 0; col < cpi->common.mb_cols; col++) {
+      fprintf(statsfile, "%10d",
+              cpi->segmentation_map[map_index]);
+      map_index++;
+    }
+    fprintf(statsfile, "\n");
+  }
+  fprintf(statsfile, "\n");
+
+  fclose(statsfile);
+}
+
+static void update_reference_segmentation_map(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+  int row, col, sb_rows = (cm->mb_rows + 1) >> 1, sb_cols = (cm->mb_cols + 1) >> 1;
+  MODE_INFO *mi = cm->mi;
+  uint8_t *segmap = cpi->segmentation_map;
+  uint8_t *segcache = cm->last_frame_seg_map;
+
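+  // Each iteration copies the segment ids of one superblock (a 2x2 group of
+  // MBs); the (mb_cols & 1) / (mb_rows & 1) tests skip the out-of-frame
+  // halves of edge superblocks when the frame dimensions are odd.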
+  for (row = 0; row < sb_rows; row++) {
+    for (col = 0; col < sb_cols; col++) {
+      MODE_INFO *miptr = mi + col * 2;
+      uint8_t *cache = segcache + col * 2;
+#if CONFIG_SUPERBLOCKS
+      if (miptr->mbmi.encoded_as_sb) {
+        cache[0] = miptr->mbmi.segment_id;
+        if (!(cm->mb_cols & 1) || col < sb_cols - 1)
+          cache[1] = miptr->mbmi.segment_id;
+        if (!(cm->mb_rows & 1) || row < sb_rows - 1) {
+          cache[cm->mb_cols] = miptr->mbmi.segment_id;
+          if (!(cm->mb_cols & 1) || col < sb_cols - 1)
+            cache[cm->mb_cols + 1] = miptr->mbmi.segment_id;
+        }
+      } else
+#endif
+      {
+        cache[0] = miptr[0].mbmi.segment_id;
+        if (!(cm->mb_cols & 1) || col < sb_cols - 1)
+          cache[1] = miptr[1].mbmi.segment_id;
+        if (!(cm->mb_rows & 1) || row < sb_rows - 1) {
+          cache[cm->mb_cols] = miptr[cm->mode_info_stride].mbmi.segment_id;
+          if (!(cm->mb_cols & 1) || col < sb_cols - 1)
+            cache[cm->mb_cols + 1] =
+                miptr[cm->mode_info_stride + 1].mbmi.segment_id;
+        }
+      }
+    }
+    segmap += 2 * cm->mb_cols;
+    segcache += 2 * cm->mb_cols;
+    mi += 2 * cm->mode_info_stride;
+  }
+}
+
+static void set_default_lf_deltas(VP9_COMP *cpi) {
+  cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 1;
+  cpi->mb.e_mbd.mode_ref_lf_delta_update = 1;
+
+  vpx_memset(cpi->mb.e_mbd.ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas));
+  vpx_memset(cpi->mb.e_mbd.mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas));
+
+  // Test of ref frame deltas
+  cpi->mb.e_mbd.ref_lf_deltas[INTRA_FRAME] = 2;
+  cpi->mb.e_mbd.ref_lf_deltas[LAST_FRAME] = 0;
+  cpi->mb.e_mbd.ref_lf_deltas[GOLDEN_FRAME] = -2;
+  cpi->mb.e_mbd.ref_lf_deltas[ALTREF_FRAME] = -2;
+
+  cpi->mb.e_mbd.mode_lf_deltas[0] = 4;               // BPRED
+  cpi->mb.e_mbd.mode_lf_deltas[1] = -2;              // Zero
+  cpi->mb.e_mbd.mode_lf_deltas[2] = 2;               // New mv
+  cpi->mb.e_mbd.mode_lf_deltas[3] = 4;               // Split mv
+}
+
+void vp9_set_speed_features(VP9_COMP *cpi) {
+  SPEED_FEATURES *sf = &cpi->sf;
+  int Mode = cpi->compressor_speed;
+  int Speed = cpi->Speed;
+  int i;
+  VP9_COMMON *cm = &cpi->common;
+
+  // Only modes 0 and 1 supported for now in experimental code base
+  if (Mode > 1)
+    Mode = 1;
+
+  // Initialise default mode frequency sampling variables
+  for (i = 0; i < MAX_MODES; i++) {
+    cpi->mode_check_freq[i] = 0;
+    cpi->mode_test_hit_counts[i] = 0;
+    cpi->mode_chosen_counts[i] = 0;
+  }
+
+  // best quality defaults
+  sf->RD = 1;
+  sf->search_method = NSTEP;
+  sf->improved_dct = 1;
+  sf->auto_filter = 1;
+  sf->recode_loop = 1;
+  sf->quarter_pixel_search = 1;
+  sf->half_pixel_search = 1;
+  sf->iterative_sub_pixel = 1;
+#if CONFIG_LOSSLESS
+  sf->optimize_coefficients = 0;
+#else
+  sf->optimize_coefficients = 1;
+#endif
+  sf->no_skip_block4x4_search = 1;
+
+  sf->first_step = 0;
+  sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
+  sf->improved_mv_pred = 1;
+
+  // default thresholds to 0
+  for (i = 0; i < MAX_MODES; i++)
+    sf->thresh_mult[i] = 0;
+
+  switch (Mode) {
+    case 0: // best quality mode
+#if CONFIG_PRED_FILTER
+      sf->thresh_mult[THR_ZEROMV        ] = 0;
+      sf->thresh_mult[THR_ZEROMV_FILT   ] = 0;
+      sf->thresh_mult[THR_ZEROG         ] = 0;
+      sf->thresh_mult[THR_ZEROG_FILT    ] = 0;
+      sf->thresh_mult[THR_ZEROA         ] = 0;
+      sf->thresh_mult[THR_ZEROA_FILT    ] = 0;
+      sf->thresh_mult[THR_NEARESTMV     ] = 0;
+      sf->thresh_mult[THR_NEARESTMV_FILT] = 0;
+      sf->thresh_mult[THR_NEARESTG      ] = 0;
+      sf->thresh_mult[THR_NEARESTG_FILT ] = 0;
+      sf->thresh_mult[THR_NEARESTA      ] = 0;
+      sf->thresh_mult[THR_NEARESTA_FILT ] = 0;
+      sf->thresh_mult[THR_NEARMV        ] = 0;
+      sf->thresh_mult[THR_NEARMV_FILT   ] = 0;
+      sf->thresh_mult[THR_NEARG         ] = 0;
+      sf->thresh_mult[THR_NEARG_FILT    ] = 0;
+      sf->thresh_mult[THR_NEARA         ] = 0;
+      sf->thresh_mult[THR_NEARA_FILT    ] = 0;
+
+      sf->thresh_mult[THR_DC       ] = 0;
+
+      sf->thresh_mult[THR_V_PRED   ] = 1000;
+      sf->thresh_mult[THR_H_PRED   ] = 1000;
+      sf->thresh_mult[THR_D45_PRED ] = 1000;
+      sf->thresh_mult[THR_D135_PRED] = 1000;
+      sf->thresh_mult[THR_D117_PRED] = 1000;
+      sf->thresh_mult[THR_D153_PRED] = 1000;
+      sf->thresh_mult[THR_D27_PRED ] = 1000;
+      sf->thresh_mult[THR_D63_PRED ] = 1000;
+      sf->thresh_mult[THR_B_PRED   ] = 2000;
+      sf->thresh_mult[THR_I8X8_PRED] = 2000;
+      sf->thresh_mult[THR_TM       ] = 1000;
+
+      sf->thresh_mult[THR_NEWMV    ] = 1000;
+      sf->thresh_mult[THR_NEWG     ] = 1000;
+      sf->thresh_mult[THR_NEWA     ] = 1000;
+      sf->thresh_mult[THR_NEWMV_FILT    ] = 1000;
+      sf->thresh_mult[THR_NEWG_FILT     ] = 1000;
+      sf->thresh_mult[THR_NEWA_FILT     ] = 1000;
+#else
+      sf->thresh_mult[THR_ZEROMV   ] = 0;
+      sf->thresh_mult[THR_ZEROG    ] = 0;
+      sf->thresh_mult[THR_ZEROA    ] = 0;
+      sf->thresh_mult[THR_NEARESTMV] = 0;
+      sf->thresh_mult[THR_NEARESTG ] = 0;
+      sf->thresh_mult[THR_NEARESTA ] = 0;
+      sf->thresh_mult[THR_NEARMV   ] = 0;
+      sf->thresh_mult[THR_NEARG    ] = 0;
+      sf->thresh_mult[THR_NEARA    ] = 0;
+
+      sf->thresh_mult[THR_DC       ] = 0;
+
+      sf->thresh_mult[THR_V_PRED   ] = 1000;
+      sf->thresh_mult[THR_H_PRED   ] = 1000;
+      sf->thresh_mult[THR_D45_PRED ] = 1000;
+      sf->thresh_mult[THR_D135_PRED] = 1000;
+      sf->thresh_mult[THR_D117_PRED] = 1000;
+      sf->thresh_mult[THR_D153_PRED] = 1000;
+      sf->thresh_mult[THR_D27_PRED ] = 1000;
+      sf->thresh_mult[THR_D63_PRED ] = 1000;
+      sf->thresh_mult[THR_B_PRED   ] = 2000;
+      sf->thresh_mult[THR_I8X8_PRED] = 2000;
+      sf->thresh_mult[THR_TM       ] = 1000;
+
+      sf->thresh_mult[THR_NEWMV    ] = 1000;
+      sf->thresh_mult[THR_NEWG     ] = 1000;
+      sf->thresh_mult[THR_NEWA     ] = 1000;
+#endif
+      sf->thresh_mult[THR_SPLITMV  ] = 2500;
+      sf->thresh_mult[THR_SPLITG   ] = 5000;
+      sf->thresh_mult[THR_SPLITA   ] = 5000;
+
+      sf->thresh_mult[THR_COMP_ZEROLG   ] = 0;
+      sf->thresh_mult[THR_COMP_NEARESTLG] = 0;
+      sf->thresh_mult[THR_COMP_NEARLG   ] = 0;
+      sf->thresh_mult[THR_COMP_ZEROLA   ] = 0;
+      sf->thresh_mult[THR_COMP_NEARESTLA] = 0;
+      sf->thresh_mult[THR_COMP_NEARLA   ] = 0;
+      sf->thresh_mult[THR_COMP_ZEROGA   ] = 0;
+      sf->thresh_mult[THR_COMP_NEARESTGA] = 0;
+      sf->thresh_mult[THR_COMP_NEARGA   ] = 0;
+
+      sf->thresh_mult[THR_COMP_NEWLG    ] = 1000;
+      sf->thresh_mult[THR_COMP_NEWLA    ] = 1000;
+      sf->thresh_mult[THR_COMP_NEWGA    ] = 1000;
+
+      sf->thresh_mult[THR_COMP_SPLITLA  ] = 2500;
+      sf->thresh_mult[THR_COMP_SPLITGA  ] = 5000;
+      sf->thresh_mult[THR_COMP_SPLITLG  ] = 5000;
+
+      sf->first_step = 0;
+      sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
+      sf->search_best_filter = SEARCH_BEST_FILTER;
+      break;
+    case 1:
+#if CONFIG_PRED_FILTER
+      sf->thresh_mult[THR_NEARESTMV] = 0;
+      sf->thresh_mult[THR_NEARESTMV_FILT] = 0;
+      sf->thresh_mult[THR_ZEROMV   ] = 0;
+      sf->thresh_mult[THR_ZEROMV_FILT   ] = 0;
+      sf->thresh_mult[THR_DC       ] = 0;
+      sf->thresh_mult[THR_NEARMV   ] = 0;
+      sf->thresh_mult[THR_NEARMV_FILT   ] = 0;
+      sf->thresh_mult[THR_V_PRED   ] = 1000;
+      sf->thresh_mult[THR_H_PRED   ] = 1000;
+      sf->thresh_mult[THR_D45_PRED ] = 1000;
+      sf->thresh_mult[THR_D135_PRED] = 1000;
+      sf->thresh_mult[THR_D117_PRED] = 1000;
+      sf->thresh_mult[THR_D153_PRED] = 1000;
+      sf->thresh_mult[THR_D27_PRED ] = 1000;
+      sf->thresh_mult[THR_D63_PRED ] = 1000;
+      sf->thresh_mult[THR_B_PRED   ] = 2500;
+      sf->thresh_mult[THR_I8X8_PRED] = 2500;
+      sf->thresh_mult[THR_TM       ] = 1000;
+
+      sf->thresh_mult[THR_NEARESTG ] = 1000;
+      sf->thresh_mult[THR_NEARESTG_FILT ] = 1000;
+      sf->thresh_mult[THR_NEARESTA ] = 1000;
+      sf->thresh_mult[THR_NEARESTA_FILT ] = 1000;
+
+      sf->thresh_mult[THR_ZEROG    ] = 1000;
+      sf->thresh_mult[THR_ZEROA    ] = 1000;
+      sf->thresh_mult[THR_NEARG    ] = 1000;
+      sf->thresh_mult[THR_NEARA    ] = 1000;
+      sf->thresh_mult[THR_ZEROG_FILT    ] = 1000;
+      sf->thresh_mult[THR_ZEROA_FILT    ] = 1000;
+      sf->thresh_mult[THR_NEARG_FILT    ] = 1000;
+      sf->thresh_mult[THR_NEARA_FILT    ] = 1000;
+
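+      // Note: the following block resets to 0 several thresholds that were
+      // assigned 1000 just above; the zeros are the values that take effect.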
+      sf->thresh_mult[THR_ZEROMV   ] = 0;
+      sf->thresh_mult[THR_ZEROG    ] = 0;
+      sf->thresh_mult[THR_ZEROA    ] = 0;
+      sf->thresh_mult[THR_NEARESTMV] = 0;
+      sf->thresh_mult[THR_NEARESTG ] = 0;
+      sf->thresh_mult[THR_NEARESTA ] = 0;
+      sf->thresh_mult[THR_NEARMV   ] = 0;
+      sf->thresh_mult[THR_NEARG    ] = 0;
+      sf->thresh_mult[THR_NEARA    ] = 0;
+      sf->thresh_mult[THR_ZEROMV_FILT   ] = 0;
+      sf->thresh_mult[THR_ZEROG_FILT    ] = 0;
+      sf->thresh_mult[THR_ZEROA_FILT    ] = 0;
+      sf->thresh_mult[THR_NEARESTMV_FILT] = 0;
+      sf->thresh_mult[THR_NEARESTG_FILT ] = 0;
+      sf->thresh_mult[THR_NEARESTA_FILT ] = 0;
+      sf->thresh_mult[THR_NEARMV_FILT   ] = 0;
+      sf->thresh_mult[THR_NEARG_FILT    ] = 0;
+      sf->thresh_mult[THR_NEARA_FILT    ] = 0;
+
+      sf->thresh_mult[THR_NEWMV    ] = 1000;
+      sf->thresh_mult[THR_NEWG     ] = 1000;
+      sf->thresh_mult[THR_NEWA     ] = 1000;
+      sf->thresh_mult[THR_NEWMV_FILT    ] = 1000;
+      sf->thresh_mult[THR_NEWG_FILT     ] = 1000;
+      sf->thresh_mult[THR_NEWA_FILT     ] = 1000;
+#else
+      sf->thresh_mult[THR_NEARESTMV] = 0;
+      sf->thresh_mult[THR_ZEROMV   ] = 0;
+      sf->thresh_mult[THR_DC       ] = 0;
+      sf->thresh_mult[THR_NEARMV   ] = 0;
+      sf->thresh_mult[THR_V_PRED   ] = 1000;
+      sf->thresh_mult[THR_H_PRED   ] = 1000;
+      sf->thresh_mult[THR_D45_PRED ] = 1000;
+      sf->thresh_mult[THR_D135_PRED] = 1000;
+      sf->thresh_mult[THR_D117_PRED] = 1000;
+      sf->thresh_mult[THR_D153_PRED] = 1000;
+      sf->thresh_mult[THR_D27_PRED ] = 1000;
+      sf->thresh_mult[THR_D63_PRED ] = 1000;
+      sf->thresh_mult[THR_B_PRED   ] = 2500;
+      sf->thresh_mult[THR_I8X8_PRED] = 2500;
+      sf->thresh_mult[THR_TM       ] = 1000;
+
+      sf->thresh_mult[THR_NEARESTG ] = 1000;
+      sf->thresh_mult[THR_NEARESTA ] = 1000;
+
+      sf->thresh_mult[THR_ZEROG    ] = 1000;
+      sf->thresh_mult[THR_ZEROA    ] = 1000;
+      sf->thresh_mult[THR_NEARG    ] = 1000;
+      sf->thresh_mult[THR_NEARA    ] = 1000;
+
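+      // Note: as in the CONFIG_PRED_FILTER branch, these zeros override the
+      // 1000 defaults assigned just above.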
+      sf->thresh_mult[THR_ZEROMV   ] = 0;
+      sf->thresh_mult[THR_ZEROG    ] = 0;
+      sf->thresh_mult[THR_ZEROA    ] = 0;
+      sf->thresh_mult[THR_NEARESTMV] = 0;
+      sf->thresh_mult[THR_NEARESTG ] = 0;
+      sf->thresh_mult[THR_NEARESTA ] = 0;
+      sf->thresh_mult[THR_NEARMV   ] = 0;
+      sf->thresh_mult[THR_NEARG    ] = 0;
+      sf->thresh_mult[THR_NEARA    ] = 0;
+
+      sf->thresh_mult[THR_NEWMV    ] = 1000;
+      sf->thresh_mult[THR_NEWG     ] = 1000;
+      sf->thresh_mult[THR_NEWA     ] = 1000;
+#endif
+      sf->thresh_mult[THR_SPLITMV  ] = 1700;
+      sf->thresh_mult[THR_SPLITG   ] = 4500;
+      sf->thresh_mult[THR_SPLITA   ] = 4500;
+
+      sf->thresh_mult[THR_COMP_ZEROLG   ] = 0;
+      sf->thresh_mult[THR_COMP_NEARESTLG] = 0;
+      sf->thresh_mult[THR_COMP_NEARLG   ] = 0;
+      sf->thresh_mult[THR_COMP_ZEROLA   ] = 0;
+      sf->thresh_mult[THR_COMP_NEARESTLA] = 0;
+      sf->thresh_mult[THR_COMP_NEARLA   ] = 0;
+      sf->thresh_mult[THR_COMP_ZEROGA   ] = 0;
+      sf->thresh_mult[THR_COMP_NEARESTGA] = 0;
+      sf->thresh_mult[THR_COMP_NEARGA   ] = 0;
+
+      sf->thresh_mult[THR_COMP_NEWLG    ] = 1000;
+      sf->thresh_mult[THR_COMP_NEWLA    ] = 1000;
+      sf->thresh_mult[THR_COMP_NEWGA    ] = 1000;
+
+      sf->thresh_mult[THR_COMP_SPLITLA  ] = 1700;
+      sf->thresh_mult[THR_COMP_SPLITGA  ] = 4500;
+      sf->thresh_mult[THR_COMP_SPLITLG  ] = 4500;
+
+      if (Speed > 0) {
+        /* Disable coefficient optimization above speed 0 */
+        sf->optimize_coefficients = 0;
+        sf->no_skip_block4x4_search = 0;
+
+        sf->first_step = 1;
+
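+        // A nonzero mode_check_freq[i] makes mode i be evaluated on only a
+        // fraction of the macroblocks, skipping the expensive split-mv
+        // searches more often as speed increases.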
+        cpi->mode_check_freq[THR_SPLITG] = 2;
+        cpi->mode_check_freq[THR_SPLITA] = 2;
+        cpi->mode_check_freq[THR_SPLITMV] = 0;
+
+        cpi->mode_check_freq[THR_COMP_SPLITGA] = 2;
+        cpi->mode_check_freq[THR_COMP_SPLITLG] = 2;
+        cpi->mode_check_freq[THR_COMP_SPLITLA] = 0;
+      }
+
+      if (Speed > 1) {
+        cpi->mode_check_freq[THR_SPLITG] = 4;
+        cpi->mode_check_freq[THR_SPLITA] = 4;
+        cpi->mode_check_freq[THR_SPLITMV] = 2;
+
+        cpi->mode_check_freq[THR_COMP_SPLITGA] = 4;
+        cpi->mode_check_freq[THR_COMP_SPLITLG] = 4;
+        cpi->mode_check_freq[THR_COMP_SPLITLA] = 2;
+
+        sf->thresh_mult[THR_TM       ] = 1500;
+        sf->thresh_mult[THR_V_PRED   ] = 1500;
+        sf->thresh_mult[THR_H_PRED   ] = 1500;
+        sf->thresh_mult[THR_D45_PRED ] = 1500;
+        sf->thresh_mult[THR_D135_PRED] = 1500;
+        sf->thresh_mult[THR_D117_PRED] = 1500;
+        sf->thresh_mult[THR_D153_PRED] = 1500;
+        sf->thresh_mult[THR_D27_PRED ] = 1500;
+        sf->thresh_mult[THR_D63_PRED ] = 1500;
+        sf->thresh_mult[THR_B_PRED   ] = 5000;
+        sf->thresh_mult[THR_I8X8_PRED] = 5000;
+
+        if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
+          sf->thresh_mult[THR_NEWMV    ] = 2000;
+#if CONFIG_PRED_FILTER
+          sf->thresh_mult[THR_NEWMV_FILT    ] = 2000;
+#endif
+          sf->thresh_mult[THR_SPLITMV  ] = 10000;
+          sf->thresh_mult[THR_COMP_SPLITLG  ] = 20000;
+        }
+
+        if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
+          sf->thresh_mult[THR_NEARESTG ] = 1500;
+          sf->thresh_mult[THR_ZEROG    ] = 1500;
+          sf->thresh_mult[THR_NEARG    ] = 1500;
+          sf->thresh_mult[THR_NEWG     ] = 2000;
+#if CONFIG_PRED_FILTER
+          sf->thresh_mult[THR_NEARESTG_FILT ] = 1500;
+          sf->thresh_mult[THR_ZEROG_FILT    ] = 1500;
+          sf->thresh_mult[THR_NEARG_FILT    ] = 1500;
+          sf->thresh_mult[THR_NEWG_FILT     ] = 2000;
+#endif
+          sf->thresh_mult[THR_SPLITG   ] = 20000;
+          sf->thresh_mult[THR_COMP_SPLITGA  ] = 20000;
+        }
+
+        if (cpi->ref_frame_flags & VP9_ALT_FLAG) {
+          sf->thresh_mult[THR_NEARESTA ] = 1500;
+          sf->thresh_mult[THR_ZEROA    ] = 1500;
+          sf->thresh_mult[THR_NEARA    ] = 1500;
+          sf->thresh_mult[THR_NEWA     ] = 2000;
+#if CONFIG_PRED_FILTER
+          sf->thresh_mult[THR_NEARESTA_FILT ] = 1500;
+          sf->thresh_mult[THR_ZEROA_FILT    ] = 1500;
+          sf->thresh_mult[THR_NEARA_FILT    ] = 1500;
+          sf->thresh_mult[THR_NEWA_FILT     ] = 2000;
+#endif
+          sf->thresh_mult[THR_SPLITA   ] = 20000;
+          sf->thresh_mult[THR_COMP_SPLITLA  ] = 10000;
+        }
+
+        sf->thresh_mult[THR_COMP_ZEROLG   ] = 1500;
+        sf->thresh_mult[THR_COMP_NEARESTLG] = 1500;
+        sf->thresh_mult[THR_COMP_NEARLG   ] = 1500;
+        sf->thresh_mult[THR_COMP_ZEROLA   ] = 1500;
+        sf->thresh_mult[THR_COMP_NEARESTLA] = 1500;
+        sf->thresh_mult[THR_COMP_NEARLA   ] = 1500;
+        sf->thresh_mult[THR_COMP_ZEROGA   ] = 1500;
+        sf->thresh_mult[THR_COMP_NEARESTGA] = 1500;
+        sf->thresh_mult[THR_COMP_NEARGA   ] = 1500;
+
+        sf->thresh_mult[THR_COMP_NEWLG    ] = 2000;
+        sf->thresh_mult[THR_COMP_NEWLA    ] = 2000;
+        sf->thresh_mult[THR_COMP_NEWGA    ] = 2000;
+      }
+
+      if (Speed > 2) {
+        cpi->mode_check_freq[THR_SPLITG] = 15;
+        cpi->mode_check_freq[THR_SPLITA] = 15;
+        cpi->mode_check_freq[THR_SPLITMV] = 7;
+
+        cpi->mode_check_freq[THR_COMP_SPLITGA] = 15;
+        cpi->mode_check_freq[THR_COMP_SPLITLG] = 15;
+        cpi->mode_check_freq[THR_COMP_SPLITLA] = 7;
+
+        sf->thresh_mult[THR_TM       ] = 2000;
+        sf->thresh_mult[THR_V_PRED   ] = 2000;
+        sf->thresh_mult[THR_H_PRED   ] = 2000;
+        sf->thresh_mult[THR_D45_PRED ] = 2000;
+        sf->thresh_mult[THR_D135_PRED] = 2000;
+        sf->thresh_mult[THR_D117_PRED] = 2000;
+        sf->thresh_mult[THR_D153_PRED] = 2000;
+        sf->thresh_mult[THR_D27_PRED ] = 2000;
+        sf->thresh_mult[THR_D63_PRED ] = 2000;
+        sf->thresh_mult[THR_B_PRED   ] = 7500;
+        sf->thresh_mult[THR_I8X8_PRED] = 7500;
+
+        if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
+          sf->thresh_mult[THR_NEWMV    ] = 2000;
+#if CONFIG_PRED_FILTER
+          sf->thresh_mult[THR_NEWMV_FILT    ] = 2000;
+#endif
+          sf->thresh_mult[THR_SPLITMV  ] = 25000;
+          sf->thresh_mult[THR_COMP_SPLITLG  ] = 50000;
+        }
+
+        if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
+          sf->thresh_mult[THR_NEARESTG ] = 2000;
+          sf->thresh_mult[THR_ZEROG    ] = 2000;
+          sf->thresh_mult[THR_NEARG    ] = 2000;
+          sf->thresh_mult[THR_NEWG     ] = 2500;
+#if CONFIG_PRED_FILTER
+          sf->thresh_mult[THR_NEARESTG_FILT ] = 2000;
+          sf->thresh_mult[THR_ZEROG_FILT    ] = 2000;
+          sf->thresh_mult[THR_NEARG_FILT    ] = 2000;
+          sf->thresh_mult[THR_NEWG_FILT     ] = 2500;
+#endif
+          sf->thresh_mult[THR_SPLITG   ] = 50000;
+          sf->thresh_mult[THR_COMP_SPLITGA  ] = 50000;
+        }
+
+        if (cpi->ref_frame_flags & VP9_ALT_FLAG) {
+          sf->thresh_mult[THR_NEARESTA ] = 2000;
+          sf->thresh_mult[THR_ZEROA    ] = 2000;
+          sf->thresh_mult[THR_NEARA    ] = 2000;
+          sf->thresh_mult[THR_NEWA     ] = 2500;
+#if CONFIG_PRED_FILTER
+          sf->thresh_mult[THR_NEARESTA_FILT ] = 2000;
+          sf->thresh_mult[THR_ZEROA_FILT    ] = 2000;
+          sf->thresh_mult[THR_NEARA_FILT    ] = 2000;
+          sf->thresh_mult[THR_NEWA_FILT     ] = 2500;
+#endif
+          sf->thresh_mult[THR_SPLITA   ] = 50000;
+          sf->thresh_mult[THR_COMP_SPLITLA  ] = 25000;
+        }
+
+        sf->thresh_mult[THR_COMP_ZEROLG   ] = 2000;
+        sf->thresh_mult[THR_COMP_NEARESTLG] = 2000;
+        sf->thresh_mult[THR_COMP_NEARLG   ] = 2000;
+        sf->thresh_mult[THR_COMP_ZEROLA   ] = 2000;
+        sf->thresh_mult[THR_COMP_NEARESTLA] = 2000;
+        sf->thresh_mult[THR_COMP_NEARLA   ] = 2000;
+        sf->thresh_mult[THR_COMP_ZEROGA   ] = 2000;
+        sf->thresh_mult[THR_COMP_NEARESTGA] = 2000;
+        sf->thresh_mult[THR_COMP_NEARGA   ] = 2000;
+
+        sf->thresh_mult[THR_COMP_NEWLG    ] = 2500;
+        sf->thresh_mult[THR_COMP_NEWLA    ] = 2500;
+        sf->thresh_mult[THR_COMP_NEWGA    ] = 2500;
+
+        sf->improved_dct = 0;
+
+        // Only do recode loop on key frames, golden frames and
+        // alt ref frames
+        sf->recode_loop = 2;
+
+      }
+
+      break;
+
+  } /* switch */
+
+  /* disable frame modes if flags not set */
+  if (!(cpi->ref_frame_flags & VP9_LAST_FLAG)) {
+    sf->thresh_mult[THR_NEWMV    ] = INT_MAX;
+    sf->thresh_mult[THR_NEARESTMV] = INT_MAX;
+    sf->thresh_mult[THR_ZEROMV   ] = INT_MAX;
+    sf->thresh_mult[THR_NEARMV   ] = INT_MAX;
+#if CONFIG_PRED_FILTER
+    sf->thresh_mult[THR_NEWMV_FILT    ] = INT_MAX;
+    sf->thresh_mult[THR_NEARESTMV_FILT] = INT_MAX;
+    sf->thresh_mult[THR_ZEROMV_FILT   ] = INT_MAX;
+    sf->thresh_mult[THR_NEARMV_FILT   ] = INT_MAX;
+#endif
+    sf->thresh_mult[THR_SPLITMV  ] = INT_MAX;
+  }
+
+  if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG)) {
+    sf->thresh_mult[THR_NEARESTG ] = INT_MAX;
+    sf->thresh_mult[THR_ZEROG    ] = INT_MAX;
+    sf->thresh_mult[THR_NEARG    ] = INT_MAX;
+    sf->thresh_mult[THR_NEWG     ] = INT_MAX;
+#if CONFIG_PRED_FILTER
+    sf->thresh_mult[THR_NEARESTG_FILT ] = INT_MAX;
+    sf->thresh_mult[THR_ZEROG_FILT    ] = INT_MAX;
+    sf->thresh_mult[THR_NEARG_FILT    ] = INT_MAX;
+    sf->thresh_mult[THR_NEWG_FILT     ] = INT_MAX;
+#endif
+    sf->thresh_mult[THR_SPLITG   ] = INT_MAX;
+  }
+
+  if (!(cpi->ref_frame_flags & VP9_ALT_FLAG)) {
+    sf->thresh_mult[THR_NEARESTA ] = INT_MAX;
+    sf->thresh_mult[THR_ZEROA    ] = INT_MAX;
+    sf->thresh_mult[THR_NEARA    ] = INT_MAX;
+    sf->thresh_mult[THR_NEWA     ] = INT_MAX;
+#if CONFIG_PRED_FILTER
+    sf->thresh_mult[THR_NEARESTA_FILT ] = INT_MAX;
+    sf->thresh_mult[THR_ZEROA_FILT    ] = INT_MAX;
+    sf->thresh_mult[THR_NEARA_FILT    ] = INT_MAX;
+    sf->thresh_mult[THR_NEWA_FILT     ] = INT_MAX;
+#endif
+    sf->thresh_mult[THR_SPLITA   ] = INT_MAX;
+  }
+
+  if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_GOLD_FLAG)) != (VP9_LAST_FLAG | VP9_GOLD_FLAG)) {
+    sf->thresh_mult[THR_COMP_ZEROLG   ] = INT_MAX;
+    sf->thresh_mult[THR_COMP_NEARESTLG] = INT_MAX;
+    sf->thresh_mult[THR_COMP_NEARLG   ] = INT_MAX;
+    sf->thresh_mult[THR_COMP_NEWLG    ] = INT_MAX;
+    sf->thresh_mult[THR_COMP_SPLITLG  ] = INT_MAX;
+  }
+
+  if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_ALT_FLAG)) != (VP9_LAST_FLAG | VP9_ALT_FLAG)) {
+    sf->thresh_mult[THR_COMP_ZEROLA   ] = INT_MAX;
+    sf->thresh_mult[THR_COMP_NEARESTLA] = INT_MAX;
+    sf->thresh_mult[THR_COMP_NEARLA   ] = INT_MAX;
+    sf->thresh_mult[THR_COMP_NEWLA    ] = INT_MAX;
+    sf->thresh_mult[THR_COMP_SPLITLA  ] = INT_MAX;
+  }
+
+  if ((cpi->ref_frame_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) != (VP9_GOLD_FLAG | VP9_ALT_FLAG)) {
+    sf->thresh_mult[THR_COMP_ZEROGA   ] = INT_MAX;
+    sf->thresh_mult[THR_COMP_NEARESTGA] = INT_MAX;
+    sf->thresh_mult[THR_COMP_NEARGA   ] = INT_MAX;
+    sf->thresh_mult[THR_COMP_NEWGA    ] = INT_MAX;
+    sf->thresh_mult[THR_COMP_SPLITGA  ] = INT_MAX;
+  }
+
+  // Slow quant, dct and trellis not worthwhile for first pass
+  // so make sure they are always turned off.
+  if (cpi->pass == 1) {
+    sf->optimize_coefficients = 0;
+    sf->improved_dct = 0;
+  }
+
+  if (cpi->sf.search_method == NSTEP) {
+    vp9_init3smotion_compensation(&cpi->mb,
+                                  cm->yv12_fb[cm->lst_fb_idx].y_stride);
+  } else if (cpi->sf.search_method == DIAMOND) {
+    vp9_init_dsmotion_compensation(&cpi->mb,
+                                   cm->yv12_fb[cm->lst_fb_idx].y_stride);
+  }
+
+  cpi->mb.vp9_short_fdct16x16 = vp9_short_fdct16x16;
+  cpi->mb.vp9_short_fdct8x8 = vp9_short_fdct8x8;
+  cpi->mb.vp9_short_fdct8x4 = vp9_short_fdct8x4;
+  cpi->mb.vp9_short_fdct4x4 = vp9_short_fdct4x4;
+  cpi->mb.short_walsh4x4 = vp9_short_walsh4x4;
+  cpi->mb.short_fhaar2x2 = vp9_short_fhaar2x2;
+
+#if CONFIG_LOSSLESS
+  if (cpi->oxcf.lossless) {
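+    // Lossless coding swaps the DCTs for Walsh-Hadamard transforms, which
+    // are exactly invertible in integer arithmetic.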
+    cpi->mb.vp9_short_fdct8x4 = vp9_short_walsh8x4_x8;
+    cpi->mb.vp9_short_fdct4x4 = vp9_short_walsh4x4_x8;
+    cpi->mb.short_walsh4x4 = vp9_short_walsh4x4_lossless;
+  }
+#endif
+
+  cpi->mb.quantize_b_4x4      = vp9_regular_quantize_b_4x4;
+  cpi->mb.quantize_b_4x4_pair = vp9_regular_quantize_b_4x4_pair;
+  cpi->mb.quantize_b_8x8      = vp9_regular_quantize_b_8x8;
+  cpi->mb.quantize_b_16x16    = vp9_regular_quantize_b_16x16;
+  cpi->mb.quantize_b_2x2      = vp9_regular_quantize_b_2x2;
+
+  vp9_init_quantizer(cpi);
+
+#if CONFIG_RUNTIME_CPU_DETECT
+  cpi->mb.e_mbd.rtcd = &cpi->common.rtcd;
+#endif
+
+  if (cpi->sf.iterative_sub_pixel == 1) {
+    cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_step_iteratively;
+  } else if (cpi->sf.quarter_pixel_search) {
+    cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_step;
+  } else if (cpi->sf.half_pixel_search) {
+    cpi->find_fractional_mv_step = vp9_find_best_half_pixel_step;
+  }
+
+  if (cpi->sf.optimize_coefficients == 1 && cpi->pass != 1)
+    cpi->mb.optimize = 1;
+  else
+    cpi->mb.optimize = 0;
+
+#ifdef SPEEDSTATS
+  frames_at_speed[cpi->Speed]++;
+#endif
+}
+
+static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
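+  // Dimensions rounded up to a multiple of 16 (one macroblock); used for
+  // the temporal-filter alt-ref buffer below.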
+  int width = (cpi->oxcf.Width + 15) & ~15;
+  int height = (cpi->oxcf.Height + 15) & ~15;
+
+  cpi->lookahead = vp9_lookahead_init(cpi->oxcf.Width, cpi->oxcf.Height,
+                                      cpi->oxcf.lag_in_frames);
+  if (!cpi->lookahead)
+    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate lag buffers");
+
+#if VP9_TEMPORAL_ALT_REF
+
+  if (vp8_yv12_alloc_frame_buffer(&cpi->alt_ref_buffer,
+                                  width, height, VP8BORDERINPIXELS))
+    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate altref buffer");
+
+#endif
+}
+
+static int alloc_partition_data(VP9_COMP *cpi) {
+  vpx_free(cpi->mb.pip);
+
+  cpi->mb.pip = vpx_calloc((cpi->common.mb_cols + 1) *
+                           (cpi->common.mb_rows + 1),
+                           sizeof(PARTITION_INFO));
+  if (!cpi->mb.pip)
+    return 1;
+
+  cpi->mb.pi = cpi->mb.pip + cpi->common.mode_info_stride + 1;
+
+  return 0;
+}
+
+void vp9_alloc_compressor_data(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+
+  int width = cm->Width;
+  int height = cm->Height;
+
+  if (vp9_alloc_frame_buffers(cm, width, height))
+    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate frame buffers");
+
+  if (alloc_partition_data(cpi))
+    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate partition data");
+
+
+  if ((width & 0xf) != 0)
+    width += 16 - (width & 0xf);
+
+  if ((height & 0xf) != 0)
+    height += 16 - (height & 0xf);
+
+
+  if (vp8_yv12_alloc_frame_buffer(&cpi->last_frame_uf,
+                                  width, height, VP8BORDERINPIXELS))
+    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate last frame buffer");
+
+  if (vp8_yv12_alloc_frame_buffer(&cpi->scaled_source,
+                                  width, height, VP8BORDERINPIXELS))
+    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate scaled source buffer");
+
+
+  vpx_free(cpi->tok);
+
+  {
+    unsigned int tokens = cm->mb_rows * cm->mb_cols * 24 * 16;
+
+    CHECK_MEM_ERROR(cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok)));
+  }
+
+  // Data used for real time vc mode to see if gf needs refreshing
+  cpi->inter_zz_count = 0;
+  cpi->gf_bad_count = 0;
+  cpi->gf_update_recommended = 0;
+
+
+  // Structures used to monitor GF usage
+  vpx_free(cpi->gf_active_flags);
+  CHECK_MEM_ERROR(cpi->gf_active_flags,
+                  vpx_calloc(1, cm->mb_rows * cm->mb_cols));
+  cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
+
+  vpx_free(cpi->mb_activity_map);
+  CHECK_MEM_ERROR(cpi->mb_activity_map,
+                  vpx_calloc(sizeof(unsigned int),
+                             cm->mb_rows * cm->mb_cols));
+
+  vpx_free(cpi->mb_norm_activity_map);
+  CHECK_MEM_ERROR(cpi->mb_norm_activity_map,
+                  vpx_calloc(sizeof(unsigned int),
+                             cm->mb_rows * cm->mb_cols));
+
+  vpx_free(cpi->twopass.total_stats);
+
+  cpi->twopass.total_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
+
+  vpx_free(cpi->twopass.total_left_stats);
+  cpi->twopass.total_left_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
+
+  vpx_free(cpi->twopass.this_frame_stats);
+
+  cpi->twopass.this_frame_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
+
+  if (!cpi->twopass.total_stats ||
+      !cpi->twopass.total_left_stats ||
+      !cpi->twopass.this_frame_stats)
+    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate firstpass stats");
+
+  vpx_free(cpi->tplist);
+
+  CHECK_MEM_ERROR(cpi->tplist,
+                  vpx_malloc(sizeof(TOKENLIST) * (cpi->common.mb_rows)));
+}
+
+
+// TODO: perhaps change the number of steps exposed to the outside world when
+// setting max and min limits. This will also likely want refining for the
+// extended Q range.
+//
+// Table that converts 0-63 Q range values passed in outside to the Qindex
+// range used internally.
+static const int q_trans[] = {
+  0,    4,   8,  12,  16,  20,  24,  28,
+  32,   36,  40,  44,  48,  52,  56,  60,
+  64,   68,  72,  76,  80,  84,  88,  92,
+  96,  100, 104, 108, 112, 116, 120, 124,
+  128, 132, 136, 140, 144, 148, 152, 156,
+  160, 164, 168, 172, 176, 180, 184, 188,
+  192, 196, 200, 204, 208, 212, 216, 220,
+  224, 228, 232, 236, 240, 244, 249, 255,
+};
+
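+// For example, q_trans[10] == 40: an application-level Q of 10 selects
+// internal Qindex 40. vp9_reverse_trans() below is the inverse lookup,
+// so vp9_reverse_trans(40) == 10.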
+int vp9_reverse_trans(int x) {
+  int i;
+
+  for (i = 0; i < 64; i++)
+    if (q_trans[i] >= x)
+      return i;
+
+  return 63;
+}
+
+void vp9_new_frame_rate(VP9_COMP *cpi, double framerate) {
+  if (framerate < .1)
+    framerate = 30;
+
+  cpi->oxcf.frame_rate        = framerate;
+  cpi->output_frame_rate      = cpi->oxcf.frame_rate;
+  cpi->per_frame_bandwidth    = (int)(cpi->oxcf.target_bandwidth /
+                                      cpi->output_frame_rate);
+  cpi->av_per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth /
+                                      cpi->output_frame_rate);
+  cpi->min_frame_bandwidth    = (int)(cpi->av_per_frame_bandwidth *
+                                      cpi->oxcf.two_pass_vbrmin_section / 100);
+
+  if (cpi->min_frame_bandwidth < FRAME_OVERHEAD_BITS)
+    cpi->min_frame_bandwidth = FRAME_OVERHEAD_BITS;
+
+  // Set Maximum gf/arf interval
+  cpi->max_gf_interval = ((int)(cpi->output_frame_rate / 2.0) + 2);
+
+  if (cpi->max_gf_interval < 12)
+    cpi->max_gf_interval = 12;
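+  // e.g. at 30 fps this gives (int)(30 / 2.0) + 2 = 17 frames between
+  // GF/ARF updates, comfortably above the floor of 12.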
+
+  // Extended interval for genuinely static scenes
+  cpi->twopass.static_scene_max_gf_interval = cpi->key_frame_frequency >> 1;
+
+  // Special conditions when the alt ref frame is enabled in lagged compress mode
+  if (cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames) {
+    if (cpi->max_gf_interval > cpi->oxcf.lag_in_frames - 1)
+      cpi->max_gf_interval = cpi->oxcf.lag_in_frames - 1;
+
+    if (cpi->twopass.static_scene_max_gf_interval > cpi->oxcf.lag_in_frames - 1)
+      cpi->twopass.static_scene_max_gf_interval = cpi->oxcf.lag_in_frames - 1;
+  }
+
+  if (cpi->max_gf_interval > cpi->twopass.static_scene_max_gf_interval)
+    cpi->max_gf_interval = cpi->twopass.static_scene_max_gf_interval;
+}
+
+
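+// Linear rescale using 64-bit intermediates so that val * num cannot
+// overflow; used below to convert buffer levels, supplied in milliseconds,
+// into bits (level_ms * target_bandwidth / 1000).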
+static int rescale(int val, int num, int denom) {
+  int64_t llnum = num;
+  int64_t llden = denom;
+  int64_t llval = val;
+
+  return (int)(llval * llnum / llden);
+}
+
+
+static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
+  VP9_COMP *cpi = (VP9_COMP *)(ptr);
+  VP9_COMMON *cm = &cpi->common;
+
+  cpi->oxcf = *oxcf;
+
+  cpi->goldfreq = 7;
+
+  cm->version = oxcf->Version;
+  vp9_setup_version(cm);
+
+  // change includes all joint functionality
+  vp9_change_config(ptr, oxcf);
+
+  // Initialize active best and worst q and average q values.
+  cpi->active_worst_quality         = cpi->oxcf.worst_allowed_q;
+  cpi->active_best_quality          = cpi->oxcf.best_allowed_q;
+  cpi->avg_frame_qindex             = cpi->oxcf.worst_allowed_q;
+
+  // Initialise the starting buffer levels
+  cpi->buffer_level                 = cpi->oxcf.starting_buffer_level;
+  cpi->bits_off_target              = cpi->oxcf.starting_buffer_level;
+
+  cpi->rolling_target_bits          = cpi->av_per_frame_bandwidth;
+  cpi->rolling_actual_bits          = cpi->av_per_frame_bandwidth;
+  cpi->long_rolling_target_bits     = cpi->av_per_frame_bandwidth;
+  cpi->long_rolling_actual_bits     = cpi->av_per_frame_bandwidth;
+
+  cpi->total_actual_bits            = 0;
+  cpi->total_target_vs_actual       = 0;
+
+  cpi->static_mb_pct = 0;
+
+#if VP9_TEMPORAL_ALT_REF
+  {
+    int i;
+
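+    // Precompute 2^19 / i so the temporal (ARNR) filter can replace its
+    // divisions with a multiply and shift; index 0 stays unused.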
+    cpi->fixed_divide[0] = 0;
+
+    for (i = 1; i < 512; i++)
+      cpi->fixed_divide[i] = 0x80000 / i;
+  }
+#endif
+}
+
+
+void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
+  VP9_COMP *cpi = (VP9_COMP *)(ptr);
+  VP9_COMMON *cm = &cpi->common;
+
+  if (!cpi)
+    return;
+
+  if (!oxcf)
+    return;
+
+  if (cm->version != oxcf->Version) {
+    cm->version = oxcf->Version;
+    vp9_setup_version(cm);
+  }
+
+  cpi->oxcf = *oxcf;
+
+  switch (cpi->oxcf.Mode) {
+      // Real-time and one-pass modes are deprecated in this test code base
+    case MODE_FIRSTPASS:
+      cpi->pass = 1;
+      cpi->compressor_speed = 1;
+      break;
+
+    case MODE_SECONDPASS:
+      cpi->pass = 2;
+      cpi->compressor_speed = 1;
+
+      if (cpi->oxcf.cpu_used < -5) {
+        cpi->oxcf.cpu_used = -5;
+      }
+
+      if (cpi->oxcf.cpu_used > 5)
+        cpi->oxcf.cpu_used = 5;
+
+      break;
+
+    case MODE_SECONDPASS_BEST:
+      cpi->pass = 2;
+      cpi->compressor_speed = 0;
+      break;
+  }
+
+  cpi->oxcf.worst_allowed_q = q_trans[oxcf->worst_allowed_q];
+  cpi->oxcf.best_allowed_q = q_trans[oxcf->best_allowed_q];
+  cpi->oxcf.cq_level = q_trans[cpi->oxcf.cq_level];
+
+#if CONFIG_LOSSLESS
+  cpi->oxcf.lossless = oxcf->lossless;
+  if (cpi->oxcf.lossless) {
+    cpi->common.rtcd.idct.idct1        = vp9_short_inv_walsh4x4_1_x8_c;
+    cpi->common.rtcd.idct.idct16       = vp9_short_inv_walsh4x4_x8_c;
+    cpi->common.rtcd.idct.idct1_scalar_add  = vp9_dc_only_inv_walsh_add_c;
+    cpi->common.rtcd.idct.iwalsh1      = vp9_short_inv_walsh4x4_1_c;
+    cpi->common.rtcd.idct.iwalsh16     = vp9_short_inv_walsh4x4_lossless_c;
+  }
+#endif
+
+  cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL;
+
+  cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
+
+  // cpi->use_golden_frame_only = 0;
+  // cpi->use_last_frame_only = 0;
+  cm->refresh_golden_frame = 0;
+  cm->refresh_last_frame = 1;
+  cm->refresh_entropy_probs = 1;
+
+  setup_features(cpi);
+  cpi->mb.e_mbd.allow_high_precision_mv = 0;   // Default mv precision adaptation
+
+  {
+    int i;
+
+    for (i = 0; i < MAX_MB_SEGMENTS; i++)
+      cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout;
+  }
+
+  // At the moment the first order values may not be > MAXQ
+  if (cpi->oxcf.fixed_q > MAXQ)
+    cpi->oxcf.fixed_q = MAXQ;
+
+  // local file playback mode == really big buffer
+  if (cpi->oxcf.end_usage == USAGE_LOCAL_FILE_PLAYBACK) {
+    cpi->oxcf.starting_buffer_level   = 60000;
+    cpi->oxcf.optimal_buffer_level    = 60000;
+    cpi->oxcf.maximum_buffer_size     = 240000;
+  }
+
+  // Convert target bandwidth from Kbit/s to Bit/s
+  cpi->oxcf.target_bandwidth       *= 1000;
+
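+  // e.g. a 60000 ms starting level at 1,000,000 bit/s becomes
+  // 60000 * 1000000 / 1000 = 60,000,000 bits.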
+  cpi->oxcf.starting_buffer_level =
+    rescale(cpi->oxcf.starting_buffer_level,
+            cpi->oxcf.target_bandwidth, 1000);
+
+  // Set or reset optimal and maximum buffer levels.
+  if (cpi->oxcf.optimal_buffer_level == 0)
+    cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;
+  else
+    cpi->oxcf.optimal_buffer_level =
+      rescale(cpi->oxcf.optimal_buffer_level,
+              cpi->oxcf.target_bandwidth, 1000);
+
+  if (cpi->oxcf.maximum_buffer_size == 0)
+    cpi->oxcf.maximum_buffer_size = cpi->oxcf.target_bandwidth / 8;
+  else
+    cpi->oxcf.maximum_buffer_size =
+      rescale(cpi->oxcf.maximum_buffer_size,
+              cpi->oxcf.target_bandwidth, 1000);
+
+  // Set up frame rate and related rate control parameters.
+  vp9_new_frame_rate(cpi, cpi->oxcf.frame_rate);
+
+  // Set absolute upper and lower quality limits
+  cpi->worst_quality               = cpi->oxcf.worst_allowed_q;
+  cpi->best_quality                = cpi->oxcf.best_allowed_q;
+
+  // active values should only be modified if out of new range
+  if (cpi->active_worst_quality > cpi->oxcf.worst_allowed_q) {
+    cpi->active_worst_quality = cpi->oxcf.worst_allowed_q;
+  }
+  // less likely
+  else if (cpi->active_worst_quality < cpi->oxcf.best_allowed_q) {
+    cpi->active_worst_quality = cpi->oxcf.best_allowed_q;
+  }
+  if (cpi->active_best_quality < cpi->oxcf.best_allowed_q) {
+    cpi->active_best_quality = cpi->oxcf.best_allowed_q;
+  }
+  // less likely
+  else if (cpi->active_best_quality > cpi->oxcf.worst_allowed_q) {
+    cpi->active_best_quality = cpi->oxcf.worst_allowed_q;
+  }
+
+  cpi->buffered_mode = (cpi->oxcf.optimal_buffer_level > 0) ? TRUE : FALSE;
+
+  cpi->cq_target_quality = cpi->oxcf.cq_level;
+
+  if (!cm->use_bilinear_mc_filter)
+    cm->mcomp_filter_type = DEFAULT_INTERP_FILTER;
+  else
+    cm->mcomp_filter_type = BILINEAR;
+
+  cpi->target_bandwidth = cpi->oxcf.target_bandwidth;
+
+  cm->Width       = cpi->oxcf.Width;
+  cm->Height      = cpi->oxcf.Height;
+
+  cm->horiz_scale  = cpi->horiz_scale;
+  cm->vert_scale   = cpi->vert_scale;
+
+  // VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs)
+  if (cpi->oxcf.Sharpness > 7)
+    cpi->oxcf.Sharpness = 7;
+
+  cm->sharpness_level = cpi->oxcf.Sharpness;
+
+  if (cm->horiz_scale != NORMAL || cm->vert_scale != NORMAL) {
+    int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs);
+    int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs);
+
+    Scale2Ratio(cm->horiz_scale, &hr, &hs);
+    Scale2Ratio(cm->vert_scale, &vr, &vs);
+
+    // always go to the next whole number
+    cm->Width = (hs - 1 + cpi->oxcf.Width * hr) / hs;
+    cm->Height = (vs - 1 + cpi->oxcf.Height * vr) / vs;
+  }
+
+  if (((cm->Width + 15) & 0xfffffff0) !=
+      cm->yv12_fb[cm->lst_fb_idx].y_width ||
+      ((cm->Height + 15) & 0xfffffff0) !=
+      cm->yv12_fb[cm->lst_fb_idx].y_height ||
+      cm->yv12_fb[cm->lst_fb_idx].y_width == 0) {
+    alloc_raw_frame_buffers(cpi);
+    vp9_alloc_compressor_data(cpi);
+  }
+
+  if (cpi->oxcf.fixed_q >= 0) {
+    cpi->last_q[0] = cpi->oxcf.fixed_q;
+    cpi->last_q[1] = cpi->oxcf.fixed_q;
+    cpi->last_boosted_qindex = cpi->oxcf.fixed_q;
+  }
+
+  cpi->Speed = cpi->oxcf.cpu_used;
+
+  // Force allow_lag to 0 if lag_in_frames is 0.
+  if (cpi->oxcf.lag_in_frames == 0) {
+    cpi->oxcf.allow_lag = 0;
+  }
+  // Limit on lag buffers as these are not currently dynamically allocated
+  else if (cpi->oxcf.lag_in_frames > MAX_LAG_BUFFERS)
+    cpi->oxcf.lag_in_frames = MAX_LAG_BUFFERS;
+
+  // YX Temp
+  cpi->alt_ref_source = NULL;
+  cpi->is_src_frame_alt_ref = 0;
+
+#if 0
+  // Experimental RD Code
+  cpi->frame_distortion = 0;
+  cpi->last_frame_distortion = 0;
+#endif
+
+}
+
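+// Note: despite the name, the constant below is ln(2); log(x) / ln(2) is
+// log2(x). The macro also shadows C99's log2f(), presumably to cover
+// pre-C99 toolchains.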
+#define M_LOG2_E 0.693147180559945309417
+#define log2f(x) (log(x) / (float) M_LOG2_E)
+
+static void cal_nmvjointsadcost(int *mvjointsadcost) {
+  mvjointsadcost[0] = 600;
+  mvjointsadcost[1] = 300;
+  mvjointsadcost[2] = 300;
+  mvjointsadcost[3] = 300;
+}
+
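+// SAD costs grow roughly logarithmically with motion vector magnitude and
+// are symmetric around zero; cal_nmvsadcosts_hp() below is currently an
+// identical copy used for the high-precision MV tables.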
+static void cal_nmvsadcosts(int *mvsadcost[2]) {
+  int i = 1;
+
+  mvsadcost[0][0] = 0;
+  mvsadcost[1][0] = 0;
+
+  do {
+    double z = 256 * (2 * (log2f(8 * i) + .6));
+    mvsadcost[0][i] = (int)z;
+    mvsadcost[1][i] = (int)z;
+    mvsadcost[0][-i] = (int)z;
+    mvsadcost[1][-i] = (int)z;
+  } while (++i <= MV_MAX);
+}
+
+static void cal_nmvsadcosts_hp(int *mvsadcost[2]) {
+  int i = 1;
+
+  mvsadcost[0][0] = 0;
+  mvsadcost[1][0] = 0;
+
+  do {
+    double z = 256 * (2 * (log2f(8 * i) + .6));
+    mvsadcost[0][i] = (int)z;
+    mvsadcost[1][i] = (int)z;
+    mvsadcost[0][-i] = (int)z;
+    mvsadcost[1][-i] = (int)z;
+  } while (++i <= MV_MAX);
+}
+
+VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
+  int i;
+  volatile union {
+    VP9_COMP *cpi;
+    VP9_PTR   ptr;
+  } ctx;
+
+  VP9_COMP *cpi;
+  VP9_COMMON *cm;
+
+  cpi = ctx.cpi = vpx_memalign(32, sizeof(VP9_COMP));
+  // Check that the CPI instance is valid
+  if (!cpi)
+    return 0;
+
+  cm = &cpi->common;
+
+  vpx_memset(cpi, 0, sizeof(VP9_COMP));
+
+  if (setjmp(cm->error.jmp)) {
+    VP9_PTR ptr = ctx.ptr;
+
+    ctx.cpi->common.error.setjmp = 0;
+    vp9_remove_compressor(&ptr);
+    return 0;
+  }
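+  // Any later vpx_internal_error() call longjmp()s back to the handler
+  // above; ctx is a volatile union so the compressor pointer stays valid
+  // across the jump.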
+
+  cpi->common.error.setjmp = 1;
+
+  CHECK_MEM_ERROR(cpi->mb.ss, vpx_calloc(sizeof(search_site), (MAX_MVSEARCH_STEPS * 8) + 1));
+
+  vp9_create_common(&cpi->common);
+  vp9_cmachine_specific_config(cpi);
+
+  init_config((VP9_PTR)cpi, oxcf);
+
+  memcpy(cpi->base_skip_false_prob, base_skip_false_prob, sizeof(base_skip_false_prob));
+  cpi->common.current_video_frame   = 0;
+  cpi->kf_overspend_bits            = 0;
+  cpi->kf_bitrate_adjustment        = 0;
+  cpi->frames_till_gf_update_due    = 0;
+  cpi->gf_overspend_bits            = 0;
+  cpi->non_gf_bitrate_adjustment    = 0;
+  cm->prob_last_coded               = 128;
+  cm->prob_gf_coded                 = 128;
+  cm->prob_intra_coded              = 63;
+#if CONFIG_SUPERBLOCKS
+  cm->sb_coded                      = 200;
+#endif
+  for (i = 0; i < COMP_PRED_CONTEXTS; i++)
+    cm->prob_comppred[i]         = 128;
+  for (i = 0; i < TX_SIZE_MAX - 1; i++)
+    cm->prob_tx[i]               = 128;
+
+  // Prime the recent reference frame usage counters.
+  // Hereafter they will be maintained as a sort of moving average
+  cpi->recent_ref_frame_usage[INTRA_FRAME]  = 1;
+  cpi->recent_ref_frame_usage[LAST_FRAME]   = 1;
+  cpi->recent_ref_frame_usage[GOLDEN_FRAME] = 1;
+  cpi->recent_ref_frame_usage[ALTREF_FRAME] = 1;
+
+  // Set reference frame sign bias for ALTREF frame to 1 (for now)
+  cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 1;
+
+  cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL;
+
+  cpi->gold_is_last = 0;
+  cpi->alt_is_last  = 0;
+  cpi->gold_is_alt  = 0;
+
+  // allocate memory for storing last frame's MVs for MV prediction.
+  CHECK_MEM_ERROR(cpi->lfmv, vpx_calloc((cpi->common.mb_rows + 2) * (cpi->common.mb_cols + 2), sizeof(int_mv)));
+  CHECK_MEM_ERROR(cpi->lf_ref_frame_sign_bias, vpx_calloc((cpi->common.mb_rows + 2) * (cpi->common.mb_cols + 2), sizeof(int)));
+  CHECK_MEM_ERROR(cpi->lf_ref_frame, vpx_calloc((cpi->common.mb_rows + 2) * (cpi->common.mb_cols + 2), sizeof(int)));
+
+  // Create the encoder segmentation map and set all entries to 0
+  CHECK_MEM_ERROR(cpi->segmentation_map, vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
+
+  // And a copy in common for temporal coding
+  CHECK_MEM_ERROR(cm->last_frame_seg_map,
+                  vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
+
+  // And a placeholder structure in the coding context
+  // for use if we want to save and restore it
+  CHECK_MEM_ERROR(cpi->coding_context.last_frame_seg_map_copy,
+                  vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
+
+  CHECK_MEM_ERROR(cpi->active_map, vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
+  vpx_memset(cpi->active_map, 1, (cpi->common.mb_rows * cpi->common.mb_cols));
+  cpi->active_map_enabled = 0;
+
+  for (i = 0; i < (sizeof(cpi->mbgraph_stats) /
+                   sizeof(cpi->mbgraph_stats[0])); i++) {
+    CHECK_MEM_ERROR(cpi->mbgraph_stats[i].mb_stats,
+                    vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols *
+                               sizeof(*cpi->mbgraph_stats[i].mb_stats),
+                               1));
+  }
+
+#ifdef ENTROPY_STATS
+  if (cpi->pass != 1)
+    init_context_counters();
+#endif
+#ifdef MODE_STATS
+  vp9_zero(y_modes);
+  vp9_zero(i8x8_modes);
+  vp9_zero(uv_modes);
+  vp9_zero(uv_modes_y);
+  vp9_zero(b_modes);
+  vp9_zero(inter_y_modes);
+  vp9_zero(inter_uv_modes);
+  vp9_zero(inter_b_modes);
+#endif
+#ifdef NMV_STATS
+  init_nmvstats();
+#endif
+
+  /* Initialize the feed-forward activity masking. */
+  cpi->activity_avg = 90 << 12;
+
+  cpi->frames_since_key = 8;        // Give a sensible default for the first frame.
+  cpi->key_frame_frequency = cpi->oxcf.key_freq;
+  cpi->this_key_frame_forced = FALSE;
+  cpi->next_key_frame_forced = FALSE;
+
+  cpi->source_alt_ref_pending = FALSE;
+  cpi->source_alt_ref_active = FALSE;
+  cpi->common.refresh_alt_ref_frame = 0;
+
+  cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
+#if CONFIG_INTERNAL_STATS
+  cpi->b_calculate_ssimg = 0;
+
+  cpi->count = 0;
+  cpi->bytes = 0;
+
+  if (cpi->b_calculate_psnr) {
+    cpi->total_sq_error = 0.0;
+    cpi->total_sq_error2 = 0.0;
+    cpi->total_y = 0.0;
+    cpi->total_u = 0.0;
+    cpi->total_v = 0.0;
+    cpi->total = 0.0;
+    cpi->totalp_y = 0.0;
+    cpi->totalp_u = 0.0;
+    cpi->totalp_v = 0.0;
+    cpi->totalp = 0.0;
+    cpi->tot_recode_hits = 0;
+    cpi->summed_quality = 0;
+    cpi->summed_weights = 0;
+  }
+
+  if (cpi->b_calculate_ssimg) {
+    cpi->total_ssimg_y = 0;
+    cpi->total_ssimg_u = 0;
+    cpi->total_ssimg_v = 0;
+    cpi->total_ssimg_all = 0;
+  }
+
+#endif
+
+#ifndef LLONG_MAX
+#define LLONG_MAX  9223372036854775807LL
+#endif
+  cpi->first_time_stamp_ever = LLONG_MAX;
+
+  cpi->frames_till_gf_update_due          = 0;
+  cpi->key_frame_count                    = 1;
+
+  cpi->ni_av_qi                           = cpi->oxcf.worst_allowed_q;
+  cpi->ni_tot_qi                          = 0;
+  cpi->ni_frames                          = 0;
+  cpi->tot_q                              = 0.0;
+  cpi->avg_q = vp9_convert_qindex_to_q(cpi->oxcf.worst_allowed_q);
+  cpi->total_byte_count                   = 0;
+
+  cpi->rate_correction_factor             = 1.0;
+  cpi->key_frame_rate_correction_factor   = 1.0;
+  cpi->gf_rate_correction_factor          = 1.0;
+  cpi->twopass.est_max_qcorrection_factor = 1.0;
+
+  cal_nmvjointsadcost(cpi->mb.nmvjointsadcost);
+  cpi->mb.nmvcost[0] = &cpi->mb.nmvcosts[0][MV_MAX];
+  cpi->mb.nmvcost[1] = &cpi->mb.nmvcosts[1][MV_MAX];
+  cpi->mb.nmvsadcost[0] = &cpi->mb.nmvsadcosts[0][MV_MAX];
+  cpi->mb.nmvsadcost[1] = &cpi->mb.nmvsadcosts[1][MV_MAX];
+  cal_nmvsadcosts(cpi->mb.nmvsadcost);
+
+  cpi->mb.nmvcost_hp[0] = &cpi->mb.nmvcosts_hp[0][MV_MAX];
+  cpi->mb.nmvcost_hp[1] = &cpi->mb.nmvcosts_hp[1][MV_MAX];
+  cpi->mb.nmvsadcost_hp[0] = &cpi->mb.nmvsadcosts_hp[0][MV_MAX];
+  cpi->mb.nmvsadcost_hp[1] = &cpi->mb.nmvsadcosts_hp[1][MV_MAX];
+  cal_nmvsadcosts_hp(cpi->mb.nmvsadcost_hp);
+
+  for (i = 0; i < KEY_FRAME_CONTEXT; i++) {
+    cpi->prior_key_frame_distance[i] = (int)cpi->output_frame_rate;
+  }
+
+#ifdef OUTPUT_YUV_SRC
+  yuv_file = fopen("bd.yuv", "ab");
+#endif
+#ifdef OUTPUT_YUV_REC
+  yuv_rec_file = fopen("rec.yuv", "wb");
+#endif
+
+#if 0
+  framepsnr = fopen("framepsnr.stt", "a");
+  kf_list = fopen("kf_list.stt", "w");
+#endif
+
+  cpi->output_pkt_list = oxcf->output_pkt_list;
+
+  if (cpi->pass == 1) {
+    vp9_init_first_pass(cpi);
+  } else if (cpi->pass == 2) {
+    size_t packet_sz = sizeof(FIRSTPASS_STATS);
+    int packets = oxcf->two_pass_stats_in.sz / packet_sz;
+
+    cpi->twopass.stats_in_start = oxcf->two_pass_stats_in.buf;
+    cpi->twopass.stats_in = cpi->twopass.stats_in_start;
+    cpi->twopass.stats_in_end = (void *)((char *)cpi->twopass.stats_in
+                                         + (packets - 1) * packet_sz);
+    vp9_init_second_pass(cpi);
+  }
+
+  vp9_set_speed_features(cpi);
+
+  // Set starting values of RD threshold multipliers (128 = *1)
+  for (i = 0; i < MAX_MODES; i++) {
+    cpi->rd_thresh_mult[i] = 128;
+  }
+
+#ifdef ENTROPY_STATS
+  init_mv_ref_counts();
+#endif
+
+#define BFP(BT, SDF, VF, SVF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF) \
+    cpi->fn_ptr[BT].sdf            = SDF; \
+    cpi->fn_ptr[BT].vf             = VF; \
+    cpi->fn_ptr[BT].svf            = SVF; \
+    cpi->fn_ptr[BT].svf_halfpix_h  = SVFHH; \
+    cpi->fn_ptr[BT].svf_halfpix_v  = SVFHV; \
+    cpi->fn_ptr[BT].svf_halfpix_hv = SVFHHV; \
+    cpi->fn_ptr[BT].sdx3f          = SDX3F; \
+    cpi->fn_ptr[BT].sdx8f          = SDX8F; \
+    cpi->fn_ptr[BT].sdx4df         = SDX4DF;
+
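+// BFP wires the SAD, variance, sub-pixel variance, half-pel variance and
+// multi-candidate SAD kernels for one block size into cpi->fn_ptr[].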
+
+#if CONFIG_SUPERBLOCKS
+  BFP(BLOCK_32X32, vp9_sad32x32, vp9_variance32x32, vp9_sub_pixel_variance32x32,
+      vp9_variance_halfpixvar32x32_h, vp9_variance_halfpixvar32x32_v,
+      vp9_variance_halfpixvar32x32_hv, vp9_sad32x32x3, vp9_sad32x32x8,
+      vp9_sad32x32x4d)
+#endif
+
+  BFP(BLOCK_16X16, vp9_sad16x16, vp9_variance16x16, vp9_sub_pixel_variance16x16,
+       vp9_variance_halfpixvar16x16_h, vp9_variance_halfpixvar16x16_v,
+       vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8,
+       vp9_sad16x16x4d)
+
+  BFP(BLOCK_16X8, vp9_sad16x8, vp9_variance16x8, vp9_sub_pixel_variance16x8,
+      NULL, NULL, NULL, vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d)
+
+  BFP(BLOCK_8X16, vp9_sad8x16, vp9_variance8x16, vp9_sub_pixel_variance8x16,
+      NULL, NULL, NULL, vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d)
+
+  BFP(BLOCK_8X8, vp9_sad8x8, vp9_variance8x8, vp9_sub_pixel_variance8x8,
+      NULL, NULL, NULL, vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
+
+  BFP(BLOCK_4X4, vp9_sad4x4, vp9_variance4x4, vp9_sub_pixel_variance4x4,
+      NULL, NULL, NULL, vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
+
+#if ARCH_X86 || ARCH_X86_64
+  cpi->fn_ptr[BLOCK_16X16].copymem  = vp9_copy32xn;
+  cpi->fn_ptr[BLOCK_16X8].copymem   = vp9_copy32xn;
+  cpi->fn_ptr[BLOCK_8X16].copymem   = vp9_copy32xn;
+  cpi->fn_ptr[BLOCK_8X8].copymem    = vp9_copy32xn;
+  cpi->fn_ptr[BLOCK_4X4].copymem    = vp9_copy32xn;
+#endif
+
+  cpi->full_search_sad = SEARCH_INVOKE(&cpi->rtcd.search, full_search);
+  cpi->diamond_search_sad = SEARCH_INVOKE(&cpi->rtcd.search, diamond_search);
+  cpi->refining_search_sad = SEARCH_INVOKE(&cpi->rtcd.search, refining_search);
+
+  // make sure frame 1 is okay
+  cpi->error_bins[0] = cpi->common.MBs;
+
+  /* vp9_init_quantizer() is first called here. Add check in
+   * vp9_frame_init_quantizer() so that vp9_init_quantizer is only
+   * called later when needed. This will avoid unnecessary calls of
+   * vp9_init_quantizer() for every frame.
+   */
+  vp9_init_quantizer(cpi);
+
+  vp9_loop_filter_init(cm);
+
+  cpi->common.error.setjmp = 0;
+
+  vp9_zero(cpi->y_uv_mode_count)
+
+  return (VP9_PTR) cpi;
+}
+
+void vp9_remove_compressor(VP9_PTR *ptr) {
+  VP9_COMP *cpi = (VP9_COMP *)(*ptr);
+  int i;
+
+  if (!cpi)
+    return;
+
+  if (cpi->common.current_video_frame > 0) {
+    if (cpi->pass == 2) {
+      vp9_end_second_pass(cpi);
+    }
+
+#ifdef ENTROPY_STATS
+    if (cpi->pass != 1) {
+      print_context_counters();
+      print_tree_update_probs();
+      print_mode_context();
+    }
+#endif
+#ifdef NMV_STATS
+    if (cpi->pass != 1)
+      print_nmvstats();
+#endif
+
+#if CONFIG_INTERNAL_STATS
+
+    vp9_clear_system_state();
+
+    // printf("\n8x8-4x4:%d-%d\n", cpi->t8x8_count, cpi->t4x4_count);
+    if (cpi->pass != 1) {
+      FILE *f = fopen("opsnr.stt", "a");
+      double time_encoded = (cpi->last_end_time_stamp_seen
+                             - cpi->first_time_stamp_ever) / 10000000.000;
+      double total_encode_time = (cpi->time_receive_data + cpi->time_compress_data)   / 1000.000;
+      double dr = (double)cpi->bytes * (double) 8 / (double)1000  / time_encoded;
+#if defined(MODE_STATS)
+      print_mode_contexts(&cpi->common);
+#endif
+      if (cpi->b_calculate_psnr) {
+        YV12_BUFFER_CONFIG *lst_yv12 = &cpi->common.yv12_fb[cpi->common.lst_fb_idx];
+        double samples = 3.0 / 2 * cpi->count * lst_yv12->y_width * lst_yv12->y_height;
+        double total_psnr = vp9_mse2psnr(samples, 255.0, cpi->total_sq_error);
+        double total_psnr2 = vp9_mse2psnr(samples, 255.0, cpi->total_sq_error2);
+        double total_ssim = 100 * pow(cpi->summed_quality / cpi->summed_weights, 8.0);
+
+        fprintf(f, "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\tVPXSSIM\t  Time(ms)\n");
+        fprintf(f, "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%8.0f\n",
+                dr, cpi->total / cpi->count, total_psnr, cpi->totalp / cpi->count, total_psnr2, total_ssim,
+                total_encode_time);
+//                fprintf(f, "%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%8.0f %10ld\n",
+//                        dr, cpi->total / cpi->count, total_psnr, cpi->totalp / cpi->count, total_psnr2, total_ssim,
+//                        total_encode_time, cpi->tot_recode_hits);
+      }
+
+      if (cpi->b_calculate_ssimg) {
+        fprintf(f, "BitRate\tSSIM_Y\tSSIM_U\tSSIM_V\tSSIM_A\t  Time(ms)\n");
+        fprintf(f, "%7.2f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%8.0f\n", dr,
+                cpi->total_ssimg_y / cpi->count, cpi->total_ssimg_u / cpi->count,
+                cpi->total_ssimg_v / cpi->count, cpi->total_ssimg_all / cpi->count, total_encode_time);
+//                fprintf(f, "%7.3f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%8.0f  %10ld\n", dr,
+//                        cpi->total_ssimg_y / cpi->count, cpi->total_ssimg_u / cpi->count,
+//                        cpi->total_ssimg_v / cpi->count, cpi->total_ssimg_all / cpi->count, total_encode_time, cpi->tot_recode_hits);
+      }
+
+      fclose(f);
+    }
+
+#endif
+
+
+#ifdef MODE_STATS
+    {
+      extern int count_mb_seg[4];
+      char modes_stats_file[250];
+      FILE *f;
+      double dr = (double)cpi->oxcf.frame_rate * (double)cpi->bytes * (double)8 / (double)cpi->count / (double)1000;
+      sprintf(modes_stats_file, "modes_q%03d.stt", cpi->common.base_qindex);
+      f = fopen(modes_stats_file, "w");
+      fprintf(f, "intra_mode in Intra Frames:\n");
+      {
+        int i;
+        fprintf(f, "Y: ");
+        for (i = 0; i < VP9_YMODES; i++) fprintf(f, " %8d,", y_modes[i]);
+        fprintf(f, "\n");
+      }
+      {
+        int i;
+        fprintf(f, "I8: ");
+        for (i = 0; i < VP9_I8X8_MODES; i++) fprintf(f, " %8d,", i8x8_modes[i]);
+        fprintf(f, "\n");
+      }
+      {
+        int i;
+        fprintf(f, "UV: ");
+        for (i = 0; i < VP9_UV_MODES; i++) fprintf(f, " %8d,", uv_modes[i]);
+        fprintf(f, "\n");
+      }
+      {
+        int i, j;
+        fprintf(f, "KeyFrame Y-UV:\n");
+        for (i = 0; i < VP9_YMODES; i++) {
+          fprintf(f, "%2d:", i);
+          for (j = 0; j < VP9_UV_MODES; j++) fprintf(f, "%8d, ", uv_modes_y[i][j]);
+          fprintf(f, "\n");
+        }
+      }
+      {
+        int i, j;
+        fprintf(f, "Inter Y-UV:\n");
+        for (i = 0; i < VP9_YMODES; i++) {
+          fprintf(f, "%2d:", i);
+          for (j = 0; j < VP9_UV_MODES; j++) fprintf(f, "%8d, ", cpi->y_uv_mode_count[i][j]);
+          fprintf(f, "\n");
+        }
+      }
+      {
+        int i;
+
+        fprintf(f, "B: ");
+        for (i = 0; i < VP9_BINTRAMODES; i++)
+          fprintf(f, "%8d, ", b_modes[i]);
+
+        fprintf(f, "\n");
+
+      }
+
+      fprintf(f, "Modes in Inter Frames:\n");
+      {
+        int i;
+        fprintf(f, "Y: ");
+        for (i = 0; i < MB_MODE_COUNT; i++) fprintf(f, " %8d,", inter_y_modes[i]);
+        fprintf(f, "\n");
+      }
+      {
+        int i;
+        fprintf(f, "UV: ");
+        for (i = 0; i < VP9_UV_MODES; i++) fprintf(f, " %8d,", inter_uv_modes[i]);
+        fprintf(f, "\n");
+      }
+      {
+        int i;
+        fprintf(f, "B: ");
+        for (i = 0; i < B_MODE_COUNT; i++) fprintf(f, "%8d, ", inter_b_modes[i]);
+        fprintf(f, "\n");
+      }
+      fprintf(f, "P:%8d, %8d, %8d, %8d\n", count_mb_seg[0], count_mb_seg[1], count_mb_seg[2], count_mb_seg[3]);
+      fprintf(f, "PB:%8d, %8d, %8d, %8d\n", inter_b_modes[LEFT4X4], inter_b_modes[ABOVE4X4], inter_b_modes[ZERO4X4], inter_b_modes[NEW4X4]);
+      fclose(f);
+    }
+#endif
+
+#ifdef ENTROPY_STATS
+    {
+      int i, j, k;
+      FILE *fmode = fopen("modecontext.c", "w");
+
+      fprintf(fmode, "\n#include \"entropymode.h\"\n\n");
+      fprintf(fmode, "const unsigned int vp9_kf_default_bmode_counts ");
+      fprintf(fmode, "[VP9_BINTRAMODES] [VP9_BINTRAMODES] [VP9_BINTRAMODES] =\n{\n");
+
+      for (i = 0; i < VP9_BINTRAMODES; i++) {
+        fprintf(fmode, "    { // Above Mode :  %d\n", i);
+
+        for (j = 0; j < VP9_BINTRAMODES; j++) {
+          fprintf(fmode, "        {");
+
+          for (k = 0; k < VP9_BINTRAMODES; k++) {
+            if (!intra_mode_stats[i][j][k])
+              fprintf(fmode, " %5d, ", 1);
+            else
+              fprintf(fmode, " %5d, ", intra_mode_stats[i][j][k]);
+          }
+
+          fprintf(fmode, "}, // left_mode %d\n", j);
+        }
+
+        fprintf(fmode, "    },\n");
+      }
+
+      fprintf(fmode, "};\n");
+      fclose(fmode);
+    }
+#endif
+
+
+#if defined(SECTIONBITS_OUTPUT)
+
+    if (0) {
+      int i;
+      FILE *f = fopen("tokenbits.stt", "a");
+
+      for (i = 0; i < 28; i++)
+        fprintf(f, "%8d", (int)(Sectionbits[i] / 256));
+
+      fprintf(f, "\n");
+      fclose(f);
+    }
+
+#endif
+
+#if 0
+    {
+      printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000);
+      printf("\n_frames recive_data encod_mb_row compress_frame  Total\n");
+      printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame, cpi->time_receive_data / 1000, cpi->time_encode_mb_row / 1000, cpi->time_compress_data / 1000, (cpi->time_receive_data + cpi->time_compress_data) / 1000);
+    }
+#endif
+
+  }
+
+  dealloc_compressor_data(cpi);
+  vpx_free(cpi->mb.ss);
+  vpx_free(cpi->tok);
+
+  for (i = 0; i < sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0]); i++) {
+    vpx_free(cpi->mbgraph_stats[i].mb_stats);
+  }
+
+  vp9_remove_common(&cpi->common);
+  vpx_free(cpi);
+  *ptr = 0;
+
+#ifdef OUTPUT_YUV_SRC
+  fclose(yuv_file);
+#endif
+#ifdef OUTPUT_YUV_REC
+  fclose(yuv_rec_file);
+#endif
+
+#if 0
+
+  if (keyfile)
+    fclose(keyfile);
+
+  if (framepsnr)
+    fclose(framepsnr);
+
+  if (kf_list)
+    fclose(kf_list);
+
+#endif
+
+}
+
+
+static uint64_t calc_plane_error(unsigned char *orig, int orig_stride,
+                                 unsigned char *recon, int recon_stride,
+                                 unsigned int cols, unsigned int rows) {
+  unsigned int row, col;
+  uint64_t total_sse = 0;
+  int diff;
+
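+  /* Sum the SSE over 16x16 tiles with the optimized vp9_mse16x16() kernel;
+   * the odd-sized right and bottom edges are accumulated in plain C below. */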
+  for (row = 0; row + 16 <= rows; row += 16) {
+    for (col = 0; col + 16 <= cols; col += 16) {
+      unsigned int sse;
+
+      vp9_mse16x16(orig + col, orig_stride, recon + col, recon_stride, &sse);
+      total_sse += sse;
+    }
+
+    /* Handle odd-sized width */
+    if (col < cols) {
+      unsigned int   border_row, border_col;
+      unsigned char *border_orig = orig;
+      unsigned char *border_recon = recon;
+
+      for (border_row = 0; border_row < 16; border_row++) {
+        for (border_col = col; border_col < cols; border_col++) {
+          diff = border_orig[border_col] - border_recon[border_col];
+          total_sse += diff * diff;
+        }
+
+        border_orig += orig_stride;
+        border_recon += recon_stride;
+      }
+    }
+
+    orig += orig_stride * 16;
+    recon += recon_stride * 16;
+  }
+
+  /* Handle odd-sized height */
+  for (; row < rows; row++) {
+    for (col = 0; col < cols; col++) {
+      diff = orig[col] - recon[col];
+      total_sse += diff * diff;
+    }
+
+    orig += orig_stride;
+    recon += recon_stride;
+  }
+
+  return total_sse;
+}
+
+
+static void generate_psnr_packet(VP9_COMP *cpi) {
+  YV12_BUFFER_CONFIG      *orig = cpi->Source;
+  YV12_BUFFER_CONFIG      *recon = cpi->common.frame_to_show;
+  struct vpx_codec_cx_pkt  pkt;
+  uint64_t                 sse;
+  int                      i;
+  unsigned int             width = cpi->common.Width;
+  unsigned int             height = cpi->common.Height;
+
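+  // sse[0]/samples[0] accumulate totals for the whole frame; indices 1..3
+  // hold the Y, U and V planes. Chroma planes are (width + 1) / 2 by
+  // (height + 1) / 2 samples.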
+  pkt.kind = VPX_CODEC_PSNR_PKT;
+  sse = calc_plane_error(orig->y_buffer, orig->y_stride,
+                         recon->y_buffer, recon->y_stride,
+                         width, height);
+  pkt.data.psnr.sse[0] = sse;
+  pkt.data.psnr.sse[1] = sse;
+  pkt.data.psnr.samples[0] = width * height;
+  pkt.data.psnr.samples[1] = width * height;
+
+  width = (width + 1) / 2;
+  height = (height + 1) / 2;
+
+  sse = calc_plane_error(orig->u_buffer, orig->uv_stride,
+                         recon->u_buffer, recon->uv_stride,
+                         width, height);
+  pkt.data.psnr.sse[0] += sse;
+  pkt.data.psnr.sse[2] = sse;
+  pkt.data.psnr.samples[0] += width * height;
+  pkt.data.psnr.samples[2] = width * height;
+
+  sse = calc_plane_error(orig->v_buffer, orig->uv_stride,
+                         recon->v_buffer, recon->uv_stride,
+                         width, height);
+  pkt.data.psnr.sse[0] += sse;
+  pkt.data.psnr.sse[3] = sse;
+  pkt.data.psnr.samples[0] += width * height;
+  pkt.data.psnr.samples[3] = width * height;
+
+  for (i = 0; i < 4; i++)
+    pkt.data.psnr.psnr[i] = vp9_mse2psnr(pkt.data.psnr.samples[i], 255.0,
+                                         pkt.data.psnr.sse[i]);
+
+  vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
+}
+
+
+int vp9_use_as_reference(VP9_PTR ptr, int ref_frame_flags) {
+  VP9_COMP *cpi = (VP9_COMP *)(ptr);
+
+  if (ref_frame_flags > 7)
+    return -1;
+
+  cpi->ref_frame_flags = ref_frame_flags;
+  return 0;
+}
+
+int vp9_update_reference(VP9_PTR ptr, int ref_frame_flags) {
+  VP9_COMP *cpi = (VP9_COMP *)(ptr);
+
+  if (ref_frame_flags > 7)
+    return -1;
+
+  cpi->common.refresh_golden_frame = 0;
+  cpi->common.refresh_alt_ref_frame = 0;
+  cpi->common.refresh_last_frame   = 0;
+
+  if (ref_frame_flags & VP9_LAST_FLAG)
+    cpi->common.refresh_last_frame = 1;
+
+  if (ref_frame_flags & VP9_GOLD_FLAG)
+    cpi->common.refresh_golden_frame = 1;
+
+  if (ref_frame_flags & VP9_ALT_FLAG)
+    cpi->common.refresh_alt_ref_frame = 1;
+
+  return 0;
+}
+
+int vp9_get_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag,
+                          YV12_BUFFER_CONFIG *sd) {
+  VP9_COMP *cpi = (VP9_COMP *)(ptr);
+  VP9_COMMON *cm = &cpi->common;
+  int ref_fb_idx;
+
+  if (ref_frame_flag == VP9_LAST_FLAG)
+    ref_fb_idx = cm->lst_fb_idx;
+  else if (ref_frame_flag == VP9_GOLD_FLAG)
+    ref_fb_idx = cm->gld_fb_idx;
+  else if (ref_frame_flag == VP9_ALT_FLAG)
+    ref_fb_idx = cm->alt_fb_idx;
+  else
+    return -1;
+
+  vp8_yv12_copy_frame_ptr(&cm->yv12_fb[ref_fb_idx], sd);
+
+  return 0;
+}
+
+int vp9_set_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag,
+                          YV12_BUFFER_CONFIG *sd) {
+  VP9_COMP *cpi = (VP9_COMP *)(ptr);
+  VP9_COMMON *cm = &cpi->common;
+
+  int ref_fb_idx;
+
+  if (ref_frame_flag == VP9_LAST_FLAG)
+    ref_fb_idx = cm->lst_fb_idx;
+  else if (ref_frame_flag == VP9_GOLD_FLAG)
+    ref_fb_idx = cm->gld_fb_idx;
+  else if (ref_frame_flag == VP9_ALT_FLAG)
+    ref_fb_idx = cm->alt_fb_idx;
+  else
+    return -1;
+
+  vp8_yv12_copy_frame_ptr(sd, &cm->yv12_fb[ref_fb_idx]);
+
+  return 0;
+}
+int vp9_update_entropy(VP9_PTR comp, int update) {
+  VP9_COMP *cpi = (VP9_COMP *) comp;
+  VP9_COMMON *cm = &cpi->common;
+  cm->refresh_entropy_probs = update;
+
+  return 0;
+}
+
+
+#ifdef OUTPUT_YUV_SRC
+void vp9_write_yuv_frame(YV12_BUFFER_CONFIG *s) {
+  unsigned char *src = s->y_buffer;
+  int h = s->y_height;
+
+  do {
+    fwrite(src, s->y_width, 1,  yuv_file);
+    src += s->y_stride;
+  } while (--h);
+
+  src = s->u_buffer;
+  h = s->uv_height;
+
+  do {
+    fwrite(src, s->uv_width, 1,  yuv_file);
+    src += s->uv_stride;
+  } while (--h);
+
+  src = s->v_buffer;
+  h = s->uv_height;
+
+  do {
+    fwrite(src, s->uv_width, 1, yuv_file);
+    src += s->uv_stride;
+  } while (--h);
+}
+#endif
+
+#ifdef OUTPUT_YUV_REC
+void vp9_write_yuv_rec_frame(VP9_COMMON *cm) {
+  YV12_BUFFER_CONFIG *s = cm->frame_to_show;
+  unsigned char *src = s->y_buffer;
+  int h = cm->Height;
+
+  do {
+    fwrite(src, s->y_width, 1,  yuv_rec_file);
+    src += s->y_stride;
+  } while (--h);
+
+  src = s->u_buffer;
+  h = (cm->Height + 1) / 2;
+
+  do {
+    fwrite(src, s->uv_width, 1,  yuv_rec_file);
+    src += s->uv_stride;
+  } while (--h);
+
+  src = s->v_buffer;
+  h = (cm->Height + 1) / 2;
+
+  do {
+    fwrite(src, s->uv_width, 1, yuv_rec_file);
+    src += s->uv_stride;
+  } while (--h);
+}
+#endif
+
+static void update_alt_ref_frame_stats(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+
+  // Update data structure that monitors level of reference to last GF
+  vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
+  cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
+
+  // This frame's refresh means subsequent frames don't refresh unless the user requests it
+  cpi->common.frames_since_golden = 0;
+
+  // Clear the alternate reference update pending flag.
+  cpi->source_alt_ref_pending = FALSE;
+
+  // Set the alternate reference frame active flag
+  cpi->source_alt_ref_active = TRUE;
+}
+
+static void update_golden_frame_stats(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+
+  // Update the Golden frame usage counts.
+  if (cm->refresh_golden_frame) {
+    // Update data structure that monitors level of reference to last GF
+    vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
+    cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
+
+    // This frame's refresh means subsequent frames don't refresh unless the user requests it
+    cm->refresh_golden_frame = 0;
+    cpi->common.frames_since_golden = 0;
+
+    // if ( cm->frame_type == KEY_FRAME )
+    // {
+    cpi->recent_ref_frame_usage[INTRA_FRAME] = 1;
+    cpi->recent_ref_frame_usage[LAST_FRAME] = 1;
+    cpi->recent_ref_frame_usage[GOLDEN_FRAME] = 1;
+    cpi->recent_ref_frame_usage[ALTREF_FRAME] = 1;
+    // }
+    // else
+    // {
+    //  // Carry a portion of the count over to the beginning of the next gf sequence
+    //  cpi->recent_ref_frame_usage[INTRA_FRAME] >>= 5;
+    //  cpi->recent_ref_frame_usage[LAST_FRAME] >>= 5;
+    //  cpi->recent_ref_frame_usage[GOLDEN_FRAME] >>= 5;
+    //  cpi->recent_ref_frame_usage[ALTREF_FRAME] >>= 5;
+    // }
+
+    // ******** Fixed Q test code only ************
+    // If we are going to use the ALT reference for the next group of
+    // frames, set a flag to say so.
+    if (cpi->oxcf.fixed_q >= 0 &&
+        cpi->oxcf.play_alternate && !cpi->common.refresh_alt_ref_frame) {
+      cpi->source_alt_ref_pending = TRUE;
+      cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+    }
+
+    if (!cpi->source_alt_ref_pending)
+      cpi->source_alt_ref_active = FALSE;
+
+    // Decrement count down till next gf
+    if (cpi->frames_till_gf_update_due > 0)
+      cpi->frames_till_gf_update_due--;
+
+  } else if (!cpi->common.refresh_alt_ref_frame) {
+    // Decrement count down till next gf
+    if (cpi->frames_till_gf_update_due > 0)
+      cpi->frames_till_gf_update_due--;
+
+    if (cpi->common.frames_till_alt_ref_frame)
+      cpi->common.frames_till_alt_ref_frame--;
+
+    cpi->common.frames_since_golden++;
+
+    if (cpi->common.frames_since_golden > 1) {
+      cpi->recent_ref_frame_usage[INTRA_FRAME] += cpi->count_mb_ref_frame_usage[INTRA_FRAME];
+      cpi->recent_ref_frame_usage[LAST_FRAME] += cpi->count_mb_ref_frame_usage[LAST_FRAME];
+      cpi->recent_ref_frame_usage[GOLDEN_FRAME] += cpi->count_mb_ref_frame_usage[GOLDEN_FRAME];
+      cpi->recent_ref_frame_usage[ALTREF_FRAME] += cpi->count_mb_ref_frame_usage[ALTREF_FRAME];
+    }
+  }
+}
+
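+// Return the lowest qindex whose real Q value reaches 30.0 (clamped to the
+// valid range); used to fix the quantizer for the first pass.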
+static int find_fp_qindex(void) {
+  int i;
+
+  for (i = 0; i < QINDEX_RANGE; i++) {
+    if (vp9_convert_qindex_to_q(i) >= 30.0) {
+      break;
+    }
+  }
+
+  if (i == QINDEX_RANGE)
+    i--;
+
+  return i;
+}
+
+static void Pass1Encode(VP9_COMP *cpi, unsigned long *size,
+                        unsigned char *dest, unsigned int *frame_flags) {
+  (void) size;
+  (void) dest;
+  (void) frame_flags;
+
+  vp9_set_quantizer(cpi, find_fp_qindex());
+  vp9_first_pass(cpi);
+}
+
+#define WRITE_RECON_BUFFER 0
+#if WRITE_RECON_BUFFER
+void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
+  // Write the frame out as separate raw Y, U and V plane files
+  FILE *yframe;
+  int i;
+  char filename[255];
+
+  sprintf(filename, "cx\\y%04d.raw", this_frame);
+  yframe = fopen(filename, "wb");
+
+  for (i = 0; i < frame->y_height; i++)
+    fwrite(frame->y_buffer + i * frame->y_stride,
+           frame->y_width, 1, yframe);
+
+  fclose(yframe);
+  sprintf(filename, "cx\\u%04d.raw", this_frame);
+  yframe = fopen(filename, "wb");
+
+  for (i = 0; i < frame->uv_height; i++)
+    fwrite(frame->u_buffer + i * frame->uv_stride,
+           frame->uv_width, 1, yframe);
+
+  fclose(yframe);
+  sprintf(filename, "cx\\v%04d.raw", this_frame);
+  yframe = fopen(filename, "wb");
+
+  for (i = 0; i < frame->uv_height; i++)
+    fwrite(frame->v_buffer + i * frame->uv_stride,
+           frame->uv_width, 1, yframe);
+
+  fclose(yframe);
+}
+#endif
+
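+// Return the proportion of interior pixels whose Sobel gradient magnitude
+// exceeds EDGE_THRESH in either direction.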
+static double compute_edge_pixel_proportion(YV12_BUFFER_CONFIG *frame) {
+#define EDGE_THRESH 128
+  int i, j;
+  int num_edge_pels = 0;
+  int num_pels = (frame->y_height - 2) * (frame->y_width - 2);
+  unsigned char *prev = frame->y_buffer + 1;
+  unsigned char *curr = frame->y_buffer + 1 + frame->y_stride;
+  unsigned char *next = frame->y_buffer + 1 + 2 * frame->y_stride;
+  for (i = 1; i < frame->y_height - 1; i++) {
+    for (j = 1; j < frame->y_width - 1; j++) {
+      /* Sobel hor and ver gradients */
+      int v = 2 * (curr[1] - curr[-1]) + (prev[1] - prev[-1]) + (next[1] - next[-1]);
+      int h = 2 * (prev[0] - next[0]) + (prev[1] - next[1]) + (prev[-1] - next[-1]);
+      h = (h < 0 ? -h : h);
+      v = (v < 0 ? -v : v);
+      if (h > EDGE_THRESH || v > EDGE_THRESH) num_edge_pels++;
+      curr++;
+      prev++;
+      next++;
+    }
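+    /* Advance to the next row; the +2 accounts for the left/right border
+     * pixels skipped by the inner loop. */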
+    curr += frame->y_stride - frame->y_width + 2;
+    prev += frame->y_stride - frame->y_width + 2;
+    next += frame->y_stride - frame->y_width + 2;
+  }
+  return (double)num_edge_pels / (double)num_pels;
+}
+
+// Function to test for conditions that indicate we should loop
+// back and recode a frame.
+static BOOL recode_loop_test(VP9_COMP *cpi,
+                             int high_limit, int low_limit,
+                             int q, int maxq, int minq) {
+  BOOL    force_recode = FALSE;
+  VP9_COMMON *cm = &cpi->common;
+
+  // Is frame recode allowed at all?
+  // Yes if either recode mode 1 is selected, or mode 2 is selected
+  // and the frame is a key frame, golden frame or alt_ref_frame.
+  if ((cpi->sf.recode_loop == 1) ||
+      ((cpi->sf.recode_loop == 2) &&
+       ((cm->frame_type == KEY_FRAME) ||
+        cm->refresh_golden_frame ||
+        cm->refresh_alt_ref_frame))) {
+    // General over and under shoot tests
+    if (((cpi->projected_frame_size > high_limit) && (q < maxq)) ||
+        ((cpi->projected_frame_size < low_limit) && (q > minq))) {
+      force_recode = TRUE;
+    }
+    // Special Constrained quality tests
+    else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
+      // Undershoot and below auto cq level
+      if ((q > cpi->cq_target_quality) &&
+          (cpi->projected_frame_size <
+           ((cpi->this_frame_target * 7) >> 3))) {
+        force_recode = TRUE;
+      }
+      // Severe undershoot and between auto and user cq level
+      else if ((q > cpi->oxcf.cq_level) &&
+               (cpi->projected_frame_size < cpi->min_frame_bandwidth) &&
+               (cpi->active_best_quality > cpi->oxcf.cq_level)) {
+        force_recode = TRUE;
+        cpi->active_best_quality = cpi->oxcf.cq_level;
+      }
+    }
+  }
+
+  return force_recode;
+}
+
+static void update_reference_frames(VP9_COMMON *cm) {
+  YV12_BUFFER_CONFIG *yv12_fb = cm->yv12_fb;
+
+  // At this point the new frame has been encoded.
+  // If any buffer copy / swapping is signaled it should be done here.
+
+  if (cm->frame_type == KEY_FRAME) {
+    yv12_fb[cm->new_fb_idx].flags |= VP9_GOLD_FLAG | VP9_ALT_FLAG;
+
+    yv12_fb[cm->gld_fb_idx].flags &= ~VP9_GOLD_FLAG;
+    yv12_fb[cm->alt_fb_idx].flags &= ~VP9_ALT_FLAG;
+
+    cm->alt_fb_idx = cm->gld_fb_idx = cm->new_fb_idx;
+  } else { /* For non key frames */
+    if (cm->refresh_alt_ref_frame) {
+      assert(!cm->copy_buffer_to_arf);
+
+      cm->yv12_fb[cm->new_fb_idx].flags |= VP9_ALT_FLAG;
+      cm->yv12_fb[cm->alt_fb_idx].flags &= ~VP9_ALT_FLAG;
+      cm->alt_fb_idx = cm->new_fb_idx;
+    } else if (cm->copy_buffer_to_arf) {
+      assert(!(cm->copy_buffer_to_arf & ~0x3));
+
+      if (cm->copy_buffer_to_arf == 1) {
+        if (cm->alt_fb_idx != cm->lst_fb_idx) {
+          yv12_fb[cm->lst_fb_idx].flags |= VP9_ALT_FLAG;
+          yv12_fb[cm->alt_fb_idx].flags &= ~VP9_ALT_FLAG;
+          cm->alt_fb_idx = cm->lst_fb_idx;
+        }
+      } else { /* if (cm->copy_buffer_to_arf == 2) */
+        if (cm->alt_fb_idx != cm->gld_fb_idx) {
+          yv12_fb[cm->gld_fb_idx].flags |= VP9_ALT_FLAG;
+          yv12_fb[cm->alt_fb_idx].flags &= ~VP9_ALT_FLAG;
+          cm->alt_fb_idx = cm->gld_fb_idx;
+        }
+      }
+    }
+
+    if (cm->refresh_golden_frame) {
+      assert(!cm->copy_buffer_to_gf);
+
+      cm->yv12_fb[cm->new_fb_idx].flags |= VP9_GOLD_FLAG;
+      cm->yv12_fb[cm->gld_fb_idx].flags &= ~VP9_GOLD_FLAG;
+      cm->gld_fb_idx = cm->new_fb_idx;
+    } else if (cm->copy_buffer_to_gf) {
+      assert(!(cm->copy_buffer_to_gf & ~0x3));
+
+      if (cm->copy_buffer_to_gf == 1) {
+        if (cm->gld_fb_idx != cm->lst_fb_idx) {
+          yv12_fb[cm->lst_fb_idx].flags |= VP9_GOLD_FLAG;
+          yv12_fb[cm->gld_fb_idx].flags &= ~VP9_GOLD_FLAG;
+          cm->gld_fb_idx = cm->lst_fb_idx;
+        }
+      } else { /* if (cm->copy_buffer_to_gf == 2) */
+        if (cm->alt_fb_idx != cm->gld_fb_idx) {
+          yv12_fb[cm->alt_fb_idx].flags |= VP9_GOLD_FLAG;
+          yv12_fb[cm->gld_fb_idx].flags &= ~VP9_GOLD_FLAG;
+          cm->gld_fb_idx = cm->alt_fb_idx;
+        }
+      }
+    }
+  }
+
+  if (cm->refresh_last_frame) {
+    cm->yv12_fb[cm->new_fb_idx].flags |= VP9_LAST_FLAG;
+    cm->yv12_fb[cm->lst_fb_idx].flags &= ~VP9_LAST_FLAG;
+    cm->lst_fb_idx = cm->new_fb_idx;
+  }
+}
+
+static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
+  if (cm->no_lpf) {
+    cm->filter_level = 0;
+  }
+#if CONFIG_LOSSLESS
+  else if (cpi->oxcf.lossless) {
+    cm->filter_level = 0;
+  }
+#endif
+  else {
+    struct vpx_usec_timer timer;
+
+    vp9_clear_system_state();
+
+    vpx_usec_timer_start(&timer);
+    if (cpi->sf.auto_filter == 0)
+      vp9_pick_filter_level_fast(cpi->Source, cpi);
+    else
+      vp9_pick_filter_level(cpi->Source, cpi);
+
+    vpx_usec_timer_mark(&timer);
+    cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer);
+  }
+
+  if (cm->filter_level > 0) {
+    vp9_set_alt_lf_level(cpi, cm->filter_level);
+    vp9_loop_filter_frame(cm, &cpi->mb.e_mbd);
+  }
+
+  vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);
+}
+
+#if CONFIG_PRED_FILTER
+void select_pred_filter_mode(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+
+  int prob_pred_filter_off = cm->prob_pred_filter_off;
+
+  // Force filter on/off if probability is extreme
+  if (prob_pred_filter_off >= 255 * 0.95)
+    cm->pred_filter_mode = 0;   // Off at the frame level
+  else if (prob_pred_filter_off <= 255 * 0.05)
+    cm->pred_filter_mode = 1;   // On at the frame level
+  else
+    cm->pred_filter_mode = 2;   // Selectable at the MB level
+}
+
+void update_pred_filt_prob(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+  int prob_pred_filter_off;
+
+  // Based on the selection in the previous frame determine what mode
+  // to use for the current frame and work out the signaling probability
+  if (cpi->pred_filter_on_count + cpi->pred_filter_off_count) {
+    prob_pred_filter_off = cpi->pred_filter_off_count * 256 /
+                           (cpi->pred_filter_on_count + cpi->pred_filter_off_count);
+
+    if (prob_pred_filter_off < 1)
+      prob_pred_filter_off = 1;
+
+    if (prob_pred_filter_off > 255)
+      prob_pred_filter_off = 255;
+
+    cm->prob_pred_filter_off = prob_pred_filter_off;
+  } else
+    cm->prob_pred_filter_off = 128;
+  /*
+      {
+        FILE *fp = fopen("filt_use.txt", "a");
+        fprintf (fp, "%d %d prob=%d\n", cpi->pred_filter_off_count,
+                 cpi->pred_filter_on_count, cm->prob_pred_filter_off);
+        fclose(fp);
+      }
+  */
+}
+#endif
+
+static void encode_frame_to_data_rate(VP9_COMP *cpi,
+                                      unsigned long *size,
+                                      unsigned char *dest,
+                                      unsigned int *frame_flags) {
+  VP9_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
+
+  int Q;
+  int frame_over_shoot_limit;
+  int frame_under_shoot_limit;
+
+  int Loop = FALSE;
+  int loop_count;
+  int this_q;
+  int last_zbin_oq;
+
+  int q_low;
+  int q_high;
+  int zbin_oq_high;
+  int zbin_oq_low = 0;
+
+  int top_index;
+  int bottom_index;
+  int active_worst_qchanged = FALSE;
+
+  int overshoot_seen = FALSE;
+  int undershoot_seen = FALSE;
+
+  int loop_size_estimate = 0;
+
+  SPEED_FEATURES *sf = &cpi->sf;
+#if RESET_FOREACH_FILTER
+  int q_low0;
+  int q_high0;
+  int zbin_oq_high0;
+  int zbin_oq_low0 = 0;
+  int Q0;
+  int last_zbin_oq0;
+  int active_best_quality0;
+  int active_worst_quality0;
+  double rate_correction_factor0;
+  double gf_rate_correction_factor0;
+#endif
+
+  /* list of filters to search over */
+  int mcomp_filters_to_search[] = {
+    EIGHTTAP, EIGHTTAP_SHARP, SIXTAP, SWITCHABLE
+  };
+  int mcomp_filters = sizeof(mcomp_filters_to_search) /
+      sizeof(*mcomp_filters_to_search);
+  int mcomp_filter_index = 0;
+  INT64 mcomp_filter_cost[4];  // one RD cost per candidate filter above
+
+  // Clear down mmx registers to allow floating point in what follows
+  vp9_clear_system_state();
+
+
+  // For an alt ref frame in 2 pass we skip the call to the second
+  // pass function that sets the target bandwidth so must set it here
+  if (cpi->common.refresh_alt_ref_frame) {
+    // Per frame bit target for the alt ref frame
+    cpi->per_frame_bandwidth = cpi->twopass.gf_bits;
+    // Per second target bitrate
+    cpi->target_bandwidth = cpi->twopass.gf_bits * cpi->output_frame_rate;
+  }
+
+  // Default turn off buffer to buffer copying
+  cm->copy_buffer_to_gf = 0;
+  cm->copy_buffer_to_arf = 0;
+
+  // Clear zbin over-quant value and mode boost values.
+  cpi->zbin_over_quant = 0;
+  cpi->zbin_mode_boost = 0;
+
+  // Enable or disable mode based tweaking of the zbin
+  // For 2 Pass Only used where GF/ARF prediction quality
+  // is above a threshold
+  cpi->zbin_mode_boost = 0;
+#if CONFIG_LOSSLESS
+  cpi->zbin_mode_boost_enabled = FALSE;
+#else
+  cpi->zbin_mode_boost_enabled = TRUE;
+#endif
+  if (cpi->gfu_boost <= 400) {
+    cpi->zbin_mode_boost_enabled = FALSE;
+  }
+
+  // Current default encoder behaviour for the altref sign bias
+  if (cpi->source_alt_ref_active)
+    cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 1;
+  else
+    cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 0;
+
+  // Check to see if a key frame is signalled
+  // For two pass with auto key frame enabled, cm->frame_type may already
+  // be set, but not for one pass.
+  if ((cm->current_video_frame == 0) ||
+      (cm->frame_flags & FRAMEFLAGS_KEY) ||
+      (cpi->oxcf.auto_key && (cpi->frames_since_key % cpi->key_frame_frequency == 0))) {
+    // Key frame from VFW/auto-keyframe/first frame
+    cm->frame_type = KEY_FRAME;
+  }
+
+  // Set default state for segment based loop filter update flags
+  xd->mode_ref_lf_delta_update = 0;
+
+  // Set various flags etc to special state if it is a key frame
+  if (cm->frame_type == KEY_FRAME) {
+    int i;
+
+    // Reset the loop filter deltas and segmentation map
+    setup_features(cpi);
+
+    // If segmentation is enabled force a map update for key frames
+    if (xd->segmentation_enabled) {
+      xd->update_mb_segmentation_map = 1;
+      xd->update_mb_segmentation_data = 1;
+    }
+
+    // The alternate reference frame cannot be active for a key frame
+    cpi->source_alt_ref_active = FALSE;
+
+    // Reset the RD threshold multipliers to default of * 1 (128)
+    for (i = 0; i < MAX_MODES; i++) {
+      cpi->rd_thresh_mult[i] = 128;
+    }
+  }
+
+  // Test code for new segment features
+  init_seg_features(cpi);
+
+  // Decide how big to make the frame
+  vp9_pick_frame_size(cpi);
+
+  vp9_clear_system_state();
+
+  // Set an active best quality and if necessary active worst quality
+  Q = cpi->active_worst_quality;
+
+  if (cm->frame_type == KEY_FRAME) {
+    int high = 2000;
+    int low = 400;
+
+    if (cpi->kf_boost > high)
+      cpi->active_best_quality = kf_low_motion_minq[Q];
+    else if (cpi->kf_boost < low)
+      cpi->active_best_quality = kf_high_motion_minq[Q];
+    else {
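+      // Linearly interpolate between the low- and high-motion minq tables
+      // according to where kf_boost falls in the [low, high] range.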
+      int gap = high - low;
+      int offset = high - cpi->kf_boost;
+      int qdiff = kf_high_motion_minq[Q] - kf_low_motion_minq[Q];
+      int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
+
+      cpi->active_best_quality = kf_low_motion_minq[Q] + adjustment;
+    }
+
+    // Make an adjustment based on the % of the frame that is static.
+    // The main impact of this is at lower Q to prevent overly large key
+    // frames unless a lot of the image is static.
+    if (cpi->kf_zeromotion_pct < 64)
+      cpi->active_best_quality += 4 - (cpi->kf_zeromotion_pct >> 4);
+
+    // Special case for key frames forced because we have reached
+    // the maximum key frame interval. Here force the Q to a range
+    // based on the ambient Q to reduce the risk of popping
+    if (cpi->this_key_frame_forced) {
+      int delta_qindex;
+      int qindex = cpi->last_boosted_qindex;
+
+      delta_qindex = compute_qdelta(cpi, qindex,
+                                    (qindex * 0.75));
+
+      cpi->active_best_quality = qindex + delta_qindex;
+      if (cpi->active_best_quality < cpi->best_quality)
+        cpi->active_best_quality = cpi->best_quality;
+    }
+  }
+
+  else if (cm->refresh_golden_frame || cpi->common.refresh_alt_ref_frame) {
+    int high = 2000;
+    int low = 400;
+
+    // Use the lower of cpi->active_worst_quality and recent
+    // average Q as basis for GF/ARF Q limit unless last frame was
+    // a key frame.
+    if ((cpi->frames_since_key > 1) &&
+        (cpi->avg_frame_qindex < cpi->active_worst_quality)) {
+      Q = cpi->avg_frame_qindex;
+    }
+
+    // For constrained quality don't allow Q less than the cq level
+    if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
+        (Q < cpi->cq_target_quality)) {
+      Q = cpi->cq_target_quality;
+    }
+
+    if (cpi->gfu_boost > high)
+      cpi->active_best_quality = gf_low_motion_minq[Q];
+    else if (cpi->gfu_boost < low)
+      cpi->active_best_quality = gf_high_motion_minq[Q];
+    else {
+      int gap = high - low;
+      int offset = high - cpi->gfu_boost;
+      int qdiff = gf_high_motion_minq[Q] - gf_low_motion_minq[Q];
+      int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
+
+      cpi->active_best_quality = gf_low_motion_minq[Q] + adjustment;
+    }
+
+    // Constrained quality use slightly lower active best.
+    if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
+      cpi->active_best_quality =
+        cpi->active_best_quality * 15 / 16;
+    }
+  } else {
+    cpi->active_best_quality = inter_minq[Q];
+
+    // For the constant/constrained quality mode we don't want
+    // q to fall below the cq level.
+    if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
+        (cpi->active_best_quality < cpi->cq_target_quality)) {
+      // If we are strongly undershooting the target rate in the last
+      // frames then use the user passed in cq value not the auto
+      // cq value.
+      if (cpi->rolling_actual_bits < cpi->min_frame_bandwidth)
+        cpi->active_best_quality = cpi->oxcf.cq_level;
+      else
+        cpi->active_best_quality = cpi->cq_target_quality;
+    }
+  }
+
+  // Clip the active best and worst quality values to limits
+  if (cpi->active_worst_quality > cpi->worst_quality)
+    cpi->active_worst_quality = cpi->worst_quality;
+
+  if (cpi->active_best_quality < cpi->best_quality)
+    cpi->active_best_quality = cpi->best_quality;
+
+  if (cpi->active_best_quality > cpi->worst_quality)
+    cpi->active_best_quality = cpi->worst_quality;
+
+  if (cpi->active_worst_quality < cpi->active_best_quality)
+    cpi->active_worst_quality = cpi->active_best_quality;
+
+  // Special case code to try to match quality with forced key frames
+  if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
+    Q = cpi->last_boosted_qindex;
+  } else {
+    // Determine initial Q to try
+    Q = vp9_regulate_q(cpi, cpi->this_frame_target);
+  }
+  last_zbin_oq = cpi->zbin_over_quant;
+
+  // Set highest allowed value for Zbin over quant
+  if (cm->frame_type == KEY_FRAME)
+    zbin_oq_high = 0; // ZBIN_OQ_MAX/16
+  else if (cm->refresh_alt_ref_frame || (cm->refresh_golden_frame && !cpi->source_alt_ref_active))
+    zbin_oq_high = 16;
+  else
+    zbin_oq_high = ZBIN_OQ_MAX;
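+  // (zbin_over_quant widens the quantizer zero bin, giving extra rate
+  // control headroom once Q has already reached MAXQ)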
+
+  vp9_compute_frame_size_bounds(cpi, &frame_under_shoot_limit,
+                                &frame_over_shoot_limit);
+
+  // Limit Q range for the adaptive loop.
+  bottom_index = cpi->active_best_quality;
+  top_index    = cpi->active_worst_quality;
+  q_low  = cpi->active_best_quality;
+  q_high = cpi->active_worst_quality;
+
+  loop_count = 0;
+
+  if (cm->frame_type != KEY_FRAME) {
+    /* TODO: Decide this more intelligently */
+    if (sf->search_best_filter) {
+      cm->mcomp_filter_type = mcomp_filters_to_search[0];
+      mcomp_filter_index = 0;
+    } else {
+      cm->mcomp_filter_type = DEFAULT_INTERP_FILTER;
+    }
+    /* TODO: Decide this more intelligently */
+    xd->allow_high_precision_mv = (Q < HIGH_PRECISION_MV_QTHRESH);
+  }
+
+#if CONFIG_POSTPROC
+
+  if (cpi->oxcf.noise_sensitivity > 0) {
+    unsigned char *src;
+    int l = 0;
+
+    switch (cpi->oxcf.noise_sensitivity) {
+      case 1:
+        l = 20;
+        break;
+      case 2:
+        l = 40;
+        break;
+      case 3:
+        l = 60;
+        break;
+      case 4:
+        // fall through: sensitivity 4 uses the same strength as 5
+      case 5:
+        l = 100;
+        break;
+      case 6:
+        l = 150;
+        break;
+    }
+
+
+    if (cm->frame_type == KEY_FRAME) {
+      vp9_de_noise(cpi->Source, cpi->Source, l, 1,  0, RTCD(postproc));
+    } else {
+      vp9_de_noise(cpi->Source, cpi->Source, l, 1,  0, RTCD(postproc));
+
+      src = cpi->Source->y_buffer;
+
+      if (cpi->Source->y_stride < 0) {
+        src += cpi->Source->y_stride * (cpi->Source->y_height - 1);
+      }
+    }
+  }
+
+#endif
+
+#ifdef OUTPUT_YUV_SRC
+  vp9_write_yuv_frame(cpi->Source);
+#endif
+
+#if RESET_FOREACH_FILTER
+  if (sf->search_best_filter) {
+    q_low0 = q_low;
+    q_high0 = q_high;
+    Q0 = Q;
+    zbin_oq_low0 = zbin_oq_low;
+    zbin_oq_high0 = zbin_oq_high;
+    last_zbin_oq0 = last_zbin_oq;
+    rate_correction_factor0 = cpi->rate_correction_factor;
+    gf_rate_correction_factor0 = cpi->gf_rate_correction_factor;
+    active_best_quality0 = cpi->active_best_quality;
+    active_worst_quality0 = cpi->active_worst_quality;
+  }
+#endif
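+  // Recode loop: encode at the current Q, estimate the resulting frame
+  // size with a dummy bitstream pack, then adjust Q (and zbin_over_quant)
+  // and re-encode until the size is acceptable or recoding is disallowed.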
+  do {
+    vp9_clear_system_state();  // __asm emms;
+
+    vp9_set_quantizer(cpi, Q);
+    this_q = Q;
+
+    if (loop_count == 0) {
+
+      // setup skip prob for costing in mode/mv decision
+      if (cpi->common.mb_no_coeff_skip) {
+        int k;
+        for (k = 0; k < MBSKIP_CONTEXTS; k++)
+          cm->mbskip_pred_probs[k] = cpi->base_skip_false_prob[Q][k];
+
+        if (cm->frame_type != KEY_FRAME) {
+          if (cpi->common.refresh_alt_ref_frame) {
+            for (k = 0; k < MBSKIP_CONTEXTS; k++) {
+              if (cpi->last_skip_false_probs[2][k] != 0)
+                cm->mbskip_pred_probs[k] = cpi->last_skip_false_probs[2][k];
+            }
+          } else if (cpi->common.refresh_golden_frame) {
+            for (k = 0; k < MBSKIP_CONTEXTS; k++) {
+              if (cpi->last_skip_false_probs[1][k] != 0)
+                cm->mbskip_pred_probs[k] = cpi->last_skip_false_probs[1][k];
+            }
+          } else {
+            for (k = 0; k < MBSKIP_CONTEXTS; k++) {
+              if (cpi->last_skip_false_probs[0][k] != 0)
+                cm->mbskip_pred_probs[k] = cpi->last_skip_false_probs[0][k];
+            }
+          }
+
+          // as this is for cost estimate, let's make sure it does not
+          // get extreme either way
+          {
+            int k;
+            for (k = 0; k < MBSKIP_CONTEXTS; ++k) {
+              if (cm->mbskip_pred_probs[k] < 5)
+                cm->mbskip_pred_probs[k] = 5;
+
+              if (cm->mbskip_pred_probs[k] > 250)
+                cm->mbskip_pred_probs[k] = 250;
+
+              if (cpi->is_src_frame_alt_ref)
+                cm->mbskip_pred_probs[k] = 1;
+            }
+          }
+        }
+      }
+
+      // Set up entropy depending on frame type.
+      if (cm->frame_type == KEY_FRAME)
+        vp9_setup_key_frame(cpi);
+      else
+        vp9_setup_inter_frame(cpi);
+    }
+
+    // transform / motion compensation build reconstruction frame
+
+    vp9_encode_frame(cpi);
+
+    // Update the skip mb flag probabilities based on the distribution
+    // seen in the last encoder iteration.
+    update_base_skip_probs(cpi);
+
+    vp9_clear_system_state();  // __asm emms;
+
+#if CONFIG_PRED_FILTER
+    // Update prediction filter on/off probability based on
+    // selection made for the current frame
+    if (cm->frame_type != KEY_FRAME)
+      update_pred_filt_prob(cpi);
+#endif
+
+    // Dummy pack of the bitstream using up to date stats to get an
+    // accurate estimate of output frame size to determine if we need
+    // to recode.
+    vp9_save_coding_context(cpi);
+    cpi->dummy_packing = 1;
+    vp9_pack_bitstream(cpi, dest, size);
+    cpi->projected_frame_size = (*size) << 3;
+    vp9_restore_coding_context(cpi);
+
+    if (frame_over_shoot_limit == 0)
+      frame_over_shoot_limit = 1;
+    active_worst_qchanged = FALSE;
+
+    // Special case handling for forced key frames
+    if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
+      int last_q = Q;
+      int kf_err = vp9_calc_ss_err(cpi->Source,
+                                   &cm->yv12_fb[cm->new_fb_idx]);
+
+      int high_err_target = cpi->ambient_err;
+      int low_err_target = (cpi->ambient_err >> 1);
+
+      // Prevent possible divide by zero error below for perfect KF
+      kf_err += (!kf_err);
+
+      // The key frame is not good enough or we can afford
+      // to make it better without undue risk of popping.
+      if (((kf_err > high_err_target) &&
+           (cpi->projected_frame_size <= frame_over_shoot_limit)) ||
+          ((kf_err > low_err_target) &&
+           (cpi->projected_frame_size <= frame_under_shoot_limit))) {
+        // Lower q_high
+        q_high = (Q > q_low) ? (Q - 1) : q_low;
+
+        // Adjust Q
+        Q = (Q * high_err_target) / kf_err;
+        if (Q < ((q_high + q_low) >> 1))
+          Q = (q_high + q_low) >> 1;
+      }
+      // The key frame is much better than the previous frame
+      else if ((kf_err < low_err_target) &&
+               (cpi->projected_frame_size >= frame_under_shoot_limit)) {
+        // Raise q_low
+        q_low = (Q < q_high) ? (Q + 1) : q_high;
+
+        // Adjust Q
+        Q = (Q * low_err_target) / kf_err;
+        if (Q > ((q_high + q_low + 1) >> 1))
+          Q = (q_high + q_low + 1) >> 1;
+      }
+
+      // Clamp Q to upper and lower limits:
+      if (Q > q_high)
+        Q = q_high;
+      else if (Q < q_low)
+        Q = q_low;
+
+      Loop = (Q != last_q) ? TRUE : FALSE;
+    }
+
+    // Is the projected frame size out of range, and are we allowed to
+    // attempt a recode?
+    else if (recode_loop_test(cpi,
+                              frame_over_shoot_limit, frame_under_shoot_limit,
+                              Q, top_index, bottom_index)) {
+      int last_q = Q;
+      int Retries = 0;
+
+      // Frame size out of permitted range:
+      // Update correction factor & compute new Q to try...
+
+      // Frame is too large
+      if (cpi->projected_frame_size > cpi->this_frame_target) {
+        q_low = (Q < q_high) ? (Q + 1) : q_high; // Raise q_low to just above the current Q
+
+        if (cpi->zbin_over_quant > 0)            // If we are using over quant do the same for zbin_oq_low
+          zbin_oq_low = (cpi->zbin_over_quant < zbin_oq_high) ? (cpi->zbin_over_quant + 1) : zbin_oq_high;
+
+        if (undershoot_seen || (loop_count > 1)) {
+          // Update rate_correction_factor unless cpi->active_worst_quality has changed.
+          if (!active_worst_qchanged)
+            vp9_update_rate_correction_factors(cpi, 1);
+
+          Q = (q_high + q_low + 1) / 2;
+
+          // Adjust cpi->zbin_over_quant (only allowed when Q is max)
+          if (Q < MAXQ)
+            cpi->zbin_over_quant = 0;
+          else {
+            zbin_oq_low = (cpi->zbin_over_quant < zbin_oq_high) ? (cpi->zbin_over_quant + 1) : zbin_oq_high;
+            cpi->zbin_over_quant = (zbin_oq_high + zbin_oq_low) / 2;
+          }
+        } else {
+          // Update rate_correction_factor unless cpi->active_worst_quality has changed.
+          if (!active_worst_qchanged)
+            vp9_update_rate_correction_factors(cpi, 0);
+
+          Q = vp9_regulate_q(cpi, cpi->this_frame_target);
+
+          while (((Q < q_low) || (cpi->zbin_over_quant < zbin_oq_low)) && (Retries < 10)) {
+            vp9_update_rate_correction_factors(cpi, 0);
+            Q = vp9_regulate_q(cpi, cpi->this_frame_target);
+            Retries++;
+          }
+        }
+
+        overshoot_seen = TRUE;
+      }
+      // Frame is too small
+      else {
+        if (cpi->zbin_over_quant == 0)
+          q_high = (Q > q_low) ? (Q - 1) : q_low; // Lower q_high if not using over quant
+        else                                    // else lower zbin_oq_high
+          zbin_oq_high = (cpi->zbin_over_quant > zbin_oq_low) ? (cpi->zbin_over_quant - 1) : zbin_oq_low;
+
+        if (overshoot_seen || (loop_count > 1)) {
+          // Update rate_correction_factor unless cpi->active_worst_quality has changed.
+          if (!active_worst_qchanged)
+            vp9_update_rate_correction_factors(cpi, 1);
+
+          Q = (q_high + q_low) / 2;
+
+          // Adjust cpi->zbin_over_quant (only allowed when Q is max)
+          if (Q < MAXQ)
+            cpi->zbin_over_quant = 0;
+          else
+            cpi->zbin_over_quant = (zbin_oq_high + zbin_oq_low) / 2;
+        } else {
+          // Update rate_correction_factor unless cpi->active_worst_quality has changed.
+          if (!active_worst_qchanged)
+            vp9_update_rate_correction_factors(cpi, 0);
+
+          Q = vp9_regulate_q(cpi, cpi->this_frame_target);
+
+          // Special case reset for qlow for constrained quality.
+          // This should only trigger where there is very substantial
+          // undershoot on a frame and the auto cq level is above
+          // the user passed in value.
+          if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
+              (Q < q_low)) {
+            q_low = Q;
+          }
+
+          while (((Q > q_high) || (cpi->zbin_over_quant > zbin_oq_high)) && (Retries < 10)) {
+            vp9_update_rate_correction_factors(cpi, 0);
+            Q = vp9_regulate_q(cpi, cpi->this_frame_target);
+            Retries++;
+          }
+        }
+
+        undershoot_seen = TRUE;
+      }
+
+      // Clamp Q to upper and lower limits:
+      if (Q > q_high)
+        Q = q_high;
+      else if (Q < q_low)
+        Q = q_low;
+
+      // Clamp cpi->zbin_over_quant
+      cpi->zbin_over_quant = (cpi->zbin_over_quant < zbin_oq_low) ?
+          zbin_oq_low : (cpi->zbin_over_quant > zbin_oq_high) ?
+          zbin_oq_high : cpi->zbin_over_quant;
+
+      // Loop = ((Q != last_q) || (last_zbin_oq != cpi->zbin_over_quant)) ? TRUE : FALSE;
+      Loop = (Q != last_q) ? TRUE : FALSE;
+      last_zbin_oq = cpi->zbin_over_quant;
+    } else
+      Loop = FALSE;
+
+    if (cpi->is_src_frame_alt_ref)
+      Loop = FALSE;
+
+    if (cm->frame_type != KEY_FRAME &&
+        !sf->search_best_filter &&
+        cm->mcomp_filter_type == SWITCHABLE) {
+      int interp_factor = Q / 3;  /* denominator is 256 */
+      int count[VP9_SWITCHABLE_FILTERS];
+      int tot_count = 0, c = 0, thr;
+      int i, j;
+      for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
+        count[i] = 0;
+        for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) {
+          count[i] += cpi->switchable_interp_count[j][i];
+        }
+        tot_count += count[i];
+      }
+
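+      // A filter counts as "in use" if it received at least interp_factor
+      // out of 256 of all switchable filter selections; if exactly one
+      // filter qualifies, signal it at the frame level and re-encode.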
+      thr = ((tot_count * interp_factor + 128) >> 8);
+      for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
+        c += (count[i] >= thr);
+      }
+      if (c == 1) {
+        /* Mostly one filter is used. So set the filter at frame level */
+        for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
+          if (count[i]) {
+            cm->mcomp_filter_type = vp9_switchable_interp[i];
+            Loop = TRUE;  /* Make sure to loop since the filter changed */
+            break;
+          }
+        }
+      }
+    }
+
+    if (Loop == FALSE && cm->frame_type != KEY_FRAME && sf->search_best_filter) {
+      if (mcomp_filter_index < mcomp_filters) {
+        INT64 err = vp9_calc_ss_err(cpi->Source,
+                                    &cm->yv12_fb[cm->new_fb_idx]);
+        INT64 rate = cpi->projected_frame_size << 8;
+        mcomp_filter_cost[mcomp_filter_index] =
+          (RDCOST(cpi->RDMULT, cpi->RDDIV, rate, err));
+        mcomp_filter_index++;
+        if (mcomp_filter_index < mcomp_filters) {
+          cm->mcomp_filter_type = mcomp_filters_to_search[mcomp_filter_index];
+          loop_count = -1;
+          Loop = TRUE;
+        } else {
+          int f;
+          INT64 best_cost = mcomp_filter_cost[0];
+          int mcomp_best_filter = mcomp_filters_to_search[0];
+          for (f = 1; f < mcomp_filters; f++) {
+            if (mcomp_filter_cost[f] < best_cost) {
+              mcomp_best_filter = mcomp_filters_to_search[f];
+              best_cost = mcomp_filter_cost[f];
+            }
+          }
+          if (mcomp_best_filter != mcomp_filters_to_search[mcomp_filters - 1]) {
+            loop_count = -1;
+            Loop = TRUE;
+            cm->mcomp_filter_type = mcomp_best_filter;
+          }
+          /*
+          printf("  best filter = %d, ( ", mcomp_best_filter);
+          for (f=0;f<mcomp_filters; f++) printf("%d ",  mcomp_filter_cost[f]);
+          printf(")\n");
+          */
+        }
+#if RESET_FOREACH_FILTER
+        if (Loop == TRUE) {
+          overshoot_seen = FALSE;
+          undershoot_seen = FALSE;
+          zbin_oq_low = zbin_oq_low0;
+          zbin_oq_high = zbin_oq_high0;
+          q_low = q_low0;
+          q_high = q_high0;
+          Q = Q0;
+          cpi->zbin_over_quant = last_zbin_oq = last_zbin_oq0;
+          cpi->rate_correction_factor = rate_correction_factor0;
+          cpi->gf_rate_correction_factor = gf_rate_correction_factor0;
+          cpi->active_best_quality = active_best_quality0;
+          cpi->active_worst_quality = active_worst_quality0;
+        }
+#endif
+      }
+    }
+
+    if (Loop == TRUE) {
+      loop_count++;
+#if CONFIG_INTERNAL_STATS
+      cpi->tot_recode_hits++;
+#endif
+    }
+  } while (Loop == TRUE);
+
+  // Special case code to reduce pulsing when key frames are forced at a
+  // fixed interval. Note the reconstruction error if this is the frame
+  // before the forced key frame.
+  if (cpi->next_key_frame_forced && (cpi->twopass.frames_to_key == 0)) {
+    cpi->ambient_err = vp9_calc_ss_err(cpi->Source,
+                                       &cm->yv12_fb[cm->new_fb_idx]);
+  }
+
+  // This frame's MVs are saved and will be used in the next frame's MV
+  // prediction. Last frame has one more line (added at the bottom) and one
+  // more column (added at the right) than cm->mip. The edge elements are
+  // initialized to 0.
+  if (cm->show_frame) { // do not save for altref frame
+    int mb_row;
+    int mb_col;
+    MODE_INFO *tmp = cm->mip;
+
+    if (cm->frame_type != KEY_FRAME) {
+      for (mb_row = 0; mb_row < cm->mb_rows + 1; mb_row ++) {
+        for (mb_col = 0; mb_col < cm->mb_cols + 1; mb_col ++) {
+          if (tmp->mbmi.ref_frame != INTRA_FRAME)
+            cpi->lfmv[mb_col + mb_row * (cm->mode_info_stride + 1)].as_int = tmp->mbmi.mv[0].as_int;
+
+          cpi->lf_ref_frame_sign_bias[mb_col + mb_row * (cm->mode_info_stride + 1)] = cm->ref_frame_sign_bias[tmp->mbmi.ref_frame];
+          cpi->lf_ref_frame[mb_col + mb_row * (cm->mode_info_stride + 1)] = tmp->mbmi.ref_frame;
+          tmp++;
+        }
+      }
+    }
+  }
+
+  // Update the GF usage maps.
+  // This is done after completing the compression of a frame when all
+  // modes etc. are finalized, but before the loop filter.
+  vp9_update_gf_useage_maps(cpi, cm, &cpi->mb);
+
+  if (cm->frame_type == KEY_FRAME)
+    cm->refresh_last_frame = 1;
+
+#if 0
+  {
+    FILE *f = fopen("gfactive.stt", "a");
+    fprintf(f, "%8d %8d %8d %8d %8d\n", cm->current_video_frame, (100 * cpi->gf_active_count) / (cpi->common.mb_rows * cpi->common.mb_cols), cpi->this_iiratio, cpi->next_iiratio, cm->refresh_golden_frame);
+    fclose(f);
+  }
+#endif
+
+  cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];
+
+#if WRITE_RECON_BUFFER
+  if (cm->show_frame)
+    write_cx_frame_to_file(cm->frame_to_show,
+                           cm->current_video_frame);
+  else
+    write_cx_frame_to_file(cm->frame_to_show,
+                           cm->current_video_frame + 1000);
+#endif
+
+  // Pick the loop filter level for the frame.
+  loopfilter_frame(cpi, cm);
+
+  // build the bitstream
+  cpi->dummy_packing = 0;
+  vp9_pack_bitstream(cpi, dest, size);
+
+  if (cpi->mb.e_mbd.update_mb_segmentation_map) {
+    update_reference_segmentation_map(cpi);
+  }
+
+#if CONFIG_PRED_FILTER
+  // Select the prediction filtering mode to use for the
+  // next frame based on the current frame selections
+  if (cm->frame_type != KEY_FRAME)
+    select_pred_filter_mode(cpi);
+#endif
+
+  update_reference_frames(cm);
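+  // Fold this frame's symbol counts into the common frame context and
+  // adapt the entropy probabilities toward the observed statistics.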
+  vp9_copy(cpi->common.fc.coef_counts, cpi->coef_counts);
+  vp9_copy(cpi->common.fc.hybrid_coef_counts, cpi->hybrid_coef_counts);
+  vp9_copy(cpi->common.fc.coef_counts_8x8, cpi->coef_counts_8x8);
+  vp9_copy(cpi->common.fc.hybrid_coef_counts_8x8, cpi->hybrid_coef_counts_8x8);
+  vp9_copy(cpi->common.fc.coef_counts_16x16, cpi->coef_counts_16x16);
+  vp9_copy(cpi->common.fc.hybrid_coef_counts_16x16,
+           cpi->hybrid_coef_counts_16x16);
+  vp9_adapt_coef_probs(&cpi->common);
+  if (cpi->common.frame_type != KEY_FRAME) {
+    vp9_copy(cpi->common.fc.ymode_counts, cpi->ymode_count);
+    vp9_copy(cpi->common.fc.uv_mode_counts, cpi->y_uv_mode_count);
+    vp9_copy(cpi->common.fc.bmode_counts, cpi->bmode_count);
+    vp9_copy(cpi->common.fc.i8x8_mode_counts, cpi->i8x8_mode_count);
+    vp9_copy(cpi->common.fc.sub_mv_ref_counts, cpi->sub_mv_ref_count);
+    vp9_copy(cpi->common.fc.mbsplit_counts, cpi->mbsplit_count);
+    vp9_adapt_mode_probs(&cpi->common);
+
+    cpi->common.fc.NMVcount = cpi->NMVcount;
+    vp9_adapt_nmv_probs(&cpi->common, cpi->mb.e_mbd.allow_high_precision_mv);
+    vp9_update_mode_context(&cpi->common);
+  }
+
+  /* Move storing frame_type out of the above loop since it is also
+   * needed in motion search besides loopfilter */
+  cm->last_frame_type = cm->frame_type;
+
+  // Keep a copy of the size estimate used in the loop
+  loop_size_estimate = cpi->projected_frame_size;
+
+  // Update rate control heuristics
+  cpi->total_byte_count += (*size);
+  cpi->projected_frame_size = (*size) << 3;
+
+  if (!active_worst_qchanged)
+    vp9_update_rate_correction_factors(cpi, 2);
+
+  cpi->last_q[cm->frame_type] = cm->base_qindex;
+
+  // Keep a record of the last boosted (KF/GF/ARF) Q value.
+  // If the current frame is coded at a lower Q then we also update it.
+  // If all mbs in this group are skipped only update if the Q value is
+  // better than that already stored.
+  // This is used to help set quality in forced key frames to reduce popping
+  if ((cm->base_qindex < cpi->last_boosted_qindex) ||
+      ((cpi->static_mb_pct < 100) &&
+       ((cm->frame_type == KEY_FRAME) ||
+        cm->refresh_alt_ref_frame ||
+        (cm->refresh_golden_frame && !cpi->is_src_frame_alt_ref)))) {
+    cpi->last_boosted_qindex = cm->base_qindex;
+  }
+
+  if (cm->frame_type == KEY_FRAME) {
+    vp9_adjust_key_frame_context(cpi);
+  }
+
+  // Keep a record of ambient average Q.
+  if (cm->frame_type != KEY_FRAME)
+    cpi->avg_frame_qindex = (2 + 3 * cpi->avg_frame_qindex + cm->base_qindex) >> 2;
+
+  // Keep a record from which we can calculate the average Q excluding GF updates and key frames
+  if ((cm->frame_type != KEY_FRAME) && !cm->refresh_golden_frame && !cm->refresh_alt_ref_frame) {
+    cpi->ni_frames++;
+    cpi->tot_q += vp9_convert_qindex_to_q(Q);
+    cpi->avg_q = cpi->tot_q / (double)cpi->ni_frames;
+
+    // Calculate the average Q for normal inter frames (not key or GFU
+    // frames).
+    cpi->ni_tot_qi += Q;
+    cpi->ni_av_qi = (cpi->ni_tot_qi / cpi->ni_frames);
+  }
+
+  // Update the buffer level variable.
+  // Non-viewable frames are a special case and are treated as pure overhead.
+  if (!cm->show_frame)
+    cpi->bits_off_target -= cpi->projected_frame_size;
+  else
+    cpi->bits_off_target += cpi->av_per_frame_bandwidth - cpi->projected_frame_size;
+
+  // Clip the buffer level at the maximum buffer size
+  if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size)
+    cpi->bits_off_target = cpi->oxcf.maximum_buffer_size;
+
+  // Rolling monitors of whether we are over- or under-spending, used to
+  // help regulate min and max Q in two pass.
+  cpi->rolling_target_bits = ((cpi->rolling_target_bits * 3) + cpi->this_frame_target + 2) / 4;
+  cpi->rolling_actual_bits = ((cpi->rolling_actual_bits * 3) + cpi->projected_frame_size + 2) / 4;
+  cpi->long_rolling_target_bits = ((cpi->long_rolling_target_bits * 31) + cpi->this_frame_target + 16) / 32;
+  cpi->long_rolling_actual_bits = ((cpi->long_rolling_actual_bits * 31) + cpi->projected_frame_size + 16) / 32;
+
+  // Actual bits spent
+  cpi->total_actual_bits    += cpi->projected_frame_size;
+
+  // Debug stats
+  cpi->total_target_vs_actual += (cpi->this_frame_target - cpi->projected_frame_size);
+
+  cpi->buffer_level = cpi->bits_off_target;
+
+  // Update bits left to the kf and gf groups to account for overshoot or undershoot on these frames
+  if (cm->frame_type == KEY_FRAME) {
+    cpi->twopass.kf_group_bits += cpi->this_frame_target - cpi->projected_frame_size;
+
+    if (cpi->twopass.kf_group_bits < 0)
+      cpi->twopass.kf_group_bits = 0;
+  } else if (cm->refresh_golden_frame || cm->refresh_alt_ref_frame) {
+    cpi->twopass.gf_group_bits += cpi->this_frame_target - cpi->projected_frame_size;
+
+    if (cpi->twopass.gf_group_bits < 0)
+      cpi->twopass.gf_group_bits = 0;
+  }
+
+  // Update the skip mb flag probabilities based on the distribution seen
+  // in this frame.
+  update_base_skip_probs(cpi);
+
+#if 0  // CONFIG_NEW_MVREF && CONFIG_INTERNAL_STATS
+  {
+    FILE *f = fopen("mv_ref_dist.stt", "a");
+    unsigned int i;
+    for (i = 0; i < MAX_MV_REFS; ++i) {
+      fprintf(f, "%10d", cpi->best_ref_index_counts[0][i]);
+    }
+    fprintf(f, "\n" );
+
+    fclose(f);
+  }
+#endif
+
+#if 0  // 1 && CONFIG_INTERNAL_STATS
+  {
+    FILE *f = fopen("tmp.stt", "a");
+    int recon_err;
+
+    vp9_clear_system_state();  // __asm emms;
+
+    recon_err = vp9_calc_ss_err(cpi->Source,
+                                &cm->yv12_fb[cm->new_fb_idx]);
+
+    if (cpi->twopass.total_left_stats->coded_error != 0.0)
+      fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d"
+              "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"
+              "%6d %5d %5d %5d %8d %8.2f %10d %10.3f"
+              "%10.3f %8d %10d %10d %10d\n",
+              cpi->common.current_video_frame, cpi->this_frame_target,
+              cpi->projected_frame_size, loop_size_estimate,
+              (cpi->projected_frame_size - cpi->this_frame_target),
+              (int)cpi->total_target_vs_actual,
+              (cpi->oxcf.starting_buffer_level - cpi->bits_off_target),
+              (int)cpi->total_actual_bits,
+              vp9_convert_qindex_to_q(cm->base_qindex),
+              (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0,
+              vp9_convert_qindex_to_q(cpi->active_best_quality),
+              vp9_convert_qindex_to_q(cpi->active_worst_quality),
+              cpi->avg_q,
+              vp9_convert_qindex_to_q(cpi->ni_av_qi),
+              vp9_convert_qindex_to_q(cpi->cq_target_quality),
+              cpi->zbin_over_quant,
+              // cpi->avg_frame_qindex, cpi->zbin_over_quant,
+              cm->refresh_golden_frame, cm->refresh_alt_ref_frame,
+              cm->frame_type, cpi->gfu_boost,
+              cpi->twopass.est_max_qcorrection_factor,
+              (int)cpi->twopass.bits_left,
+              cpi->twopass.total_left_stats->coded_error,
+              (double)cpi->twopass.bits_left /
+              cpi->twopass.total_left_stats->coded_error,
+              cpi->tot_recode_hits, recon_err, cpi->kf_boost,
+              cpi->kf_zeromotion_pct);
+    else
+      fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d"
+              "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"
+              "%6d %5d %5d %5d %8d %8.2f %10d %10.3f"
+              "%8d %10d %10d %10d\n",
+              cpi->common.current_video_frame,
+              cpi->this_frame_target, cpi->projected_frame_size,
+              loop_size_estimate,
+              (cpi->projected_frame_size - cpi->this_frame_target),
+              (int)cpi->total_target_vs_actual,
+              (cpi->oxcf.starting_buffer_level - cpi->bits_off_target),
+              (int)cpi->total_actual_bits,
+              vp9_convert_qindex_to_q(cm->base_qindex),
+              (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0,
+              vp9_convert_qindex_to_q(cpi->active_best_quality),
+              vp9_convert_qindex_to_q(cpi->active_worst_quality),
+              cpi->avg_q,
+              vp9_convert_qindex_to_q(cpi->ni_av_qi),
+              vp9_convert_qindex_to_q(cpi->cq_target_quality),
+              cpi->zbin_over_quant,
+              // cpi->avg_frame_qindex, cpi->zbin_over_quant,
+              cm->refresh_golden_frame, cm->refresh_alt_ref_frame,
+              cm->frame_type, cpi->gfu_boost,
+              cpi->twopass.est_max_qcorrection_factor,
+              (int)cpi->twopass.bits_left,
+              cpi->twopass.total_left_stats->coded_error,
+              cpi->tot_recode_hits, recon_err, cpi->kf_boost,
+              cpi->kf_zeromotion_pct);
+
+    fclose(f);
+
+    if (0) {
+      FILE *fmodes = fopen("Modes.stt", "a");
+      int i;
+
+      fprintf(fmodes, "%6d:%1d:%1d:%1d ",
+              cpi->common.current_video_frame,
+              cm->frame_type, cm->refresh_golden_frame,
+              cm->refresh_alt_ref_frame);
+
+      for (i = 0; i < MAX_MODES; i++)
+        fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]);
+
+      fprintf(fmodes, "\n");
+
+      fclose(fmodes);
+    }
+  }
+
+#endif
+
+#if 0
+  // Debug stats for segment feature experiments.
+  print_seg_map(cpi);
+#endif
+
+  // If this was a kf or gf, note the Q
+  if ((cm->frame_type == KEY_FRAME) || cm->refresh_golden_frame || cm->refresh_alt_ref_frame)
+    cm->last_kf_gf_q = cm->base_qindex;
+
+  if (cm->refresh_golden_frame == 1)
+    cm->frame_flags = cm->frame_flags | FRAMEFLAGS_GOLDEN;
+  else
+    cm->frame_flags = cm->frame_flags & ~FRAMEFLAGS_GOLDEN;
+
+  if (cm->refresh_alt_ref_frame == 1)
+    cm->frame_flags = cm->frame_flags | FRAMEFLAGS_ALTREF;
+  else
+    cm->frame_flags = cm->frame_flags & ~FRAMEFLAGS_ALTREF;
+
+
+  if (cm->refresh_last_frame & cm->refresh_golden_frame) // both refreshed
+    cpi->gold_is_last = 1;
+  else if (cm->refresh_last_frame ^ cm->refresh_golden_frame) // 1 refreshed but not the other
+    cpi->gold_is_last = 0;
+
+  if (cm->refresh_last_frame & cm->refresh_alt_ref_frame) // both refreshed
+    cpi->alt_is_last = 1;
+  else if (cm->refresh_last_frame ^ cm->refresh_alt_ref_frame) // 1 refreshed but not the other
+    cpi->alt_is_last = 0;
+
+  if (cm->refresh_alt_ref_frame & cm->refresh_golden_frame) // both refreshed
+    cpi->gold_is_alt = 1;
+  else if (cm->refresh_alt_ref_frame ^ cm->refresh_golden_frame) // 1 refreshed but not the other
+    cpi->gold_is_alt = 0;
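+  // When neither buffer was refreshed, the previous relationship still
+  // holds, so the corresponding flag is deliberately left unchanged.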
+
+  cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
+
+  if (cpi->gold_is_last)
+    cpi->ref_frame_flags &= ~VP9_GOLD_FLAG;
+
+  if (cpi->alt_is_last)
+    cpi->ref_frame_flags &= ~VP9_ALT_FLAG;
+
+  if (cpi->gold_is_alt)
+    cpi->ref_frame_flags &= ~VP9_ALT_FLAG;
+
+  if (cpi->oxcf.play_alternate && cm->refresh_alt_ref_frame && (cm->frame_type != KEY_FRAME))
+    // Update the alternate reference frame stats as appropriate.
+    update_alt_ref_frame_stats(cpi);
+  else
+    // Update the Golden frame stats as appropriate.
+    update_golden_frame_stats(cpi);
+
+  if (cm->frame_type == KEY_FRAME) {
+    // Tell the caller that the frame was coded as a key frame
+    *frame_flags = cm->frame_flags | FRAMEFLAGS_KEY;
+
+    // As this frame is a key frame, the next defaults to an inter frame.
+    cm->frame_type = INTER_FRAME;
+  } else {
+    *frame_flags = cm->frame_flags & ~FRAMEFLAGS_KEY;
+  }
+
+  // Clear the one shot update flags for segmentation map and mode/ref loop filter deltas.
+  xd->update_mb_segmentation_map = 0;
+  xd->update_mb_segmentation_data = 0;
+  xd->mode_ref_lf_delta_update = 0;
+
+  // Don't increment frame counters if this was an altref buffer update,
+  // not a real frame
+  if (cm->show_frame) {
+    cm->current_video_frame++;
+    cpi->frames_since_key++;
+  }
+
+  // Reset to normal state now that we are done.
+
+#if 0
+  {
+    char filename[512];
+    FILE *recon_file;
+    sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame);
+    recon_file = fopen(filename, "wb");
+    fwrite(cm->yv12_fb[cm->lst_fb_idx].buffer_alloc,
+           cm->yv12_fb[cm->lst_fb_idx].frame_size, 1, recon_file);
+    fclose(recon_file);
+  }
+#endif
+#ifdef OUTPUT_YUV_REC
+  vp9_write_yuv_rec_frame(cm);
+#endif
+
+  if (cm->show_frame) {
+    vpx_memcpy(cm->prev_mip, cm->mip,
+               (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
+  } else {
+    vpx_memset(cm->prev_mip, 0,
+               (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
+  }
+}
+
+static void Pass2Encode(VP9_COMP *cpi, unsigned long *size,
+                        unsigned char *dest, unsigned int *frame_flags) {
+
+  if (!cpi->common.refresh_alt_ref_frame)
+    vp9_second_pass(cpi);
+
+  encode_frame_to_data_rate(cpi, size, dest, frame_flags);
+  cpi->twopass.bits_left -= 8 * *size;
+
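+  // Credit back the per-frame share of the guaranteed minimum rate so that
+  // bits_left reflects the two pass VBR minimum section setting.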
+  if (!cpi->common.refresh_alt_ref_frame) {
+    double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.frame_rate;
+    double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth
+                                        * cpi->oxcf.two_pass_vbrmin_section / 100);
+
+    if (two_pass_min_rate < lower_bounds_min_rate)
+      two_pass_min_rate = lower_bounds_min_rate;
+
+    cpi->twopass.bits_left += (int64_t)(two_pass_min_rate / cpi->oxcf.frame_rate);
+  }
+}
+
+// For ARM NEON, d8-d15 are callee-saved registers, and need to be saved by us.
+#if HAVE_ARMV7
+extern void vp9_push_neon(int64_t *store);
+extern void vp9_pop_neon(int64_t *store);
+#endif
+
+
+int vp9_receive_raw_frame(VP9_PTR ptr, unsigned int frame_flags,
+                          YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
+                          int64_t end_time) {
+#if HAVE_ARMV7
+  int64_t store_reg[8];
+#endif
+  VP9_COMP              *cpi = (VP9_COMP *) ptr;
+  VP9_COMMON            *cm = &cpi->common;
+  struct vpx_usec_timer  timer;
+  int                    res = 0;
+
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+  if (cm->rtcd.flags & HAS_NEON)
+#endif
+  {
+    vp9_push_neon(store_reg);
+  }
+#endif
+
+  vpx_usec_timer_start(&timer);
+  if (vp9_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, frame_flags,
+                         cpi->active_map_enabled ? cpi->active_map : NULL))
+    res = -1;
+  cm->clr_type = sd->clrtype;
+  vpx_usec_timer_mark(&timer);
+  cpi->time_receive_data += vpx_usec_timer_elapsed(&timer);
+
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+  if (cm->rtcd.flags & HAS_NEON)
+#endif
+  {
+    vp9_pop_neon(store_reg);
+  }
+#endif
+
+  return res;
+}
+
+
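+// A frame is a "reference" if it updates any reference buffer or any
+// persistent state a decoder must track (entropy probs, segmentation,
+// loop filter deltas).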
+static int frame_is_reference(const VP9_COMP *cpi) {
+  const VP9_COMMON *cm = &cpi->common;
+  const MACROBLOCKD *xd = &cpi->mb.e_mbd;
+
+  return cm->frame_type == KEY_FRAME || cm->refresh_last_frame
+         || cm->refresh_golden_frame || cm->refresh_alt_ref_frame
+         || cm->copy_buffer_to_gf || cm->copy_buffer_to_arf
+         || cm->refresh_entropy_probs
+         || xd->mode_ref_lf_delta_update
+         || xd->update_mb_segmentation_map || xd->update_mb_segmentation_data;
+}
+
+
+int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
+                            unsigned long *size, unsigned char *dest,
+                            int64_t *time_stamp, int64_t *time_end, int flush) {
+#if HAVE_ARMV7
+  int64_t store_reg[8];
+#endif
+  VP9_COMP *cpi = (VP9_COMP *) ptr;
+  VP9_COMMON *cm = &cpi->common;
+  struct vpx_usec_timer  cmptimer;
+  YV12_BUFFER_CONFIG    *force_src_buffer = NULL;
+
+  if (!cpi)
+    return -1;
+
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+  if (cm->rtcd.flags & HAS_NEON)
+#endif
+  {
+    vp9_push_neon(store_reg);
+  }
+#endif
+
+  vpx_usec_timer_start(&cmptimer);
+
+  cpi->source = NULL;
+
+  cpi->mb.e_mbd.allow_high_precision_mv = ALTREF_HIGH_PRECISION_MV;
+  // Should we code an alternate reference frame
+  if (cpi->oxcf.play_alternate &&
+      cpi->source_alt_ref_pending) {
+    if ((cpi->source = vp9_lookahead_peek(cpi->lookahead,
+                                          cpi->frames_till_gf_update_due))) {
+      cpi->alt_ref_source = cpi->source;
+      if (cpi->oxcf.arnr_max_frames > 0) {
+        vp9_temporal_filter_prepare_c(cpi,
+                                      cpi->frames_till_gf_update_due);
+        force_src_buffer = &cpi->alt_ref_buffer;
+      }
+      cm->frames_till_alt_ref_frame = cpi->frames_till_gf_update_due;
+      cm->refresh_alt_ref_frame = 1;
+      cm->refresh_golden_frame = 0;
+      cm->refresh_last_frame = 0;
+      cm->show_frame = 0;
+      cpi->source_alt_ref_pending = FALSE;   // Clear pending alt ref flag.
+      cpi->is_src_frame_alt_ref = 0;
+    }
+  }
+
+  if (!cpi->source) {
+    if ((cpi->source = vp9_lookahead_pop(cpi->lookahead, flush))) {
+      cm->show_frame = 1;
+
+      cpi->is_src_frame_alt_ref = cpi->alt_ref_source
+                                  && (cpi->source == cpi->alt_ref_source);
+
+      if (cpi->is_src_frame_alt_ref)
+        cpi->alt_ref_source = NULL;
+    }
+  }
+
+  if (cpi->source) {
+    cpi->un_scaled_source =
+      cpi->Source = force_src_buffer ? force_src_buffer : &cpi->source->img;
+    *time_stamp = cpi->source->ts_start;
+    *time_end = cpi->source->ts_end;
+    *frame_flags = cpi->source->flags;
+  } else {
+    *size = 0;
+    if (flush && cpi->pass == 1 && !cpi->twopass.first_pass_done) {
+      vp9_end_first_pass(cpi);    /* get last stats packet */
+      cpi->twopass.first_pass_done = 1;
+    }
+
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+    if (cm->rtcd.flags & HAS_NEON)
+#endif
+    {
+      vp9_pop_neon(store_reg);
+    }
+#endif
+    return -1;
+  }
+
+  if (cpi->source->ts_start < cpi->first_time_stamp_ever) {
+    cpi->first_time_stamp_ever = cpi->source->ts_start;
+    cpi->last_end_time_stamp_seen = cpi->source->ts_start;
+  }
+
+  // Adjust the frame rate based on the timestamps given.
+  if (!cm->refresh_alt_ref_frame) {
+    int64_t this_duration;
+    int step = 0;
+
+    if (cpi->source->ts_start == cpi->first_time_stamp_ever) {
+      this_duration = cpi->source->ts_end - cpi->source->ts_start;
+      step = 1;
+    } else {
+      int64_t last_duration;
+
+      this_duration = cpi->source->ts_end - cpi->last_end_time_stamp_seen;
+      last_duration = cpi->last_end_time_stamp_seen
+                      - cpi->last_time_stamp_seen;
+      // Do a step update if the duration changes by 10% or more.
+      if (last_duration)
+        step = ((this_duration - last_duration) * 10 / last_duration);
+    }
+
+    if (this_duration) {
+      if (step)
+        vp9_new_frame_rate(cpi, 10000000.0 / this_duration);
+      else {
+        double avg_duration, interval;
+
+        /* Average this frame's rate into the last second's average
+         * frame rate. If we haven't seen 1 second yet, then average
+         * over the whole interval seen.
+         */
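+        // Note: timestamps are assumed to be in 10 MHz ticks here, so
+        // 10,000,000 ticks correspond to one second of input.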
+        interval = cpi->source->ts_end - cpi->first_time_stamp_ever;
+        if (interval > 10000000.0)
+          interval = 10000000;
+
+        avg_duration = 10000000.0 / cpi->oxcf.frame_rate;
+        avg_duration *= (interval - avg_duration + this_duration);
+        avg_duration /= interval;
+
+        vp9_new_frame_rate(cpi, 10000000.0 / avg_duration);
+      }
+    }
+
+    cpi->last_time_stamp_seen = cpi->source->ts_start;
+    cpi->last_end_time_stamp_seen = cpi->source->ts_end;
+  }
+
+  // start with a 0 size frame
+  *size = 0;
+
+  // Clear down mmx registers
+  vp9_clear_system_state();  // __asm emms;
+
+  cm->frame_type = INTER_FRAME;
+  cm->frame_flags = *frame_flags;
+
+#if 0
+
+  if (cm->refresh_alt_ref_frame) {
+    // cm->refresh_golden_frame = 1;
+    cm->refresh_golden_frame = 0;
+    cm->refresh_last_frame = 0;
+  } else {
+    cm->refresh_golden_frame = 0;
+    cm->refresh_last_frame = 1;
+  }
+
+#endif
+  /* find a free buffer for the new frame */
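+  /* All NUM_YV12_BUFFERS buffers should never be in use at once; the
+     assert below guards that invariant. */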
+  {
+    int i = 0;
+    for (; i < NUM_YV12_BUFFERS; i++) {
+      if (!cm->yv12_fb[i].flags) {
+        cm->new_fb_idx = i;
+        break;
+      }
+    }
+
+    assert(i < NUM_YV12_BUFFERS);
+  }
+  if (cpi->pass == 1) {
+    Pass1Encode(cpi, size, dest, frame_flags);
+  } else if (cpi->pass == 2) {
+    Pass2Encode(cpi, size, dest, frame_flags);
+  } else {
+    encode_frame_to_data_rate(cpi, size, dest, frame_flags);
+  }
+
+  if (cm->refresh_entropy_probs) {
+    if (cm->refresh_alt_ref_frame)
+      vpx_memcpy(&cm->lfc_a, &cm->fc, sizeof(cm->fc));
+    else
+      vpx_memcpy(&cm->lfc, &cm->fc, sizeof(cm->fc));
+  }
+
+  // If it's a dropped frame, honor the refresh requests on subsequent frames.
+  if (*size > 0) {
+    cpi->droppable = !frame_is_reference(cpi);
+
+    // return to normal state
+    cm->refresh_entropy_probs = 1;
+    cm->refresh_alt_ref_frame = 0;
+    cm->refresh_golden_frame = 0;
+    cm->refresh_last_frame = 1;
+    cm->frame_type = INTER_FRAME;
+
+  }
+
+  vpx_usec_timer_mark(&cmptimer);
+  cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer);
+
+  if (cpi->b_calculate_psnr && cpi->pass != 1 && cm->show_frame) {
+    generate_psnr_packet(cpi);
+  }
+
+#if CONFIG_INTERNAL_STATS
+
+  if (cpi->pass != 1) {
+    cpi->bytes += *size;
+
+    if (cm->show_frame) {
+
+      cpi->count++;
+
+      if (cpi->b_calculate_psnr) {
+        double ye, ue, ve;
+        double frame_psnr;
+        YV12_BUFFER_CONFIG      *orig = cpi->Source;
+        YV12_BUFFER_CONFIG      *recon = cpi->common.frame_to_show;
+        YV12_BUFFER_CONFIG      *pp = &cm->post_proc_buffer;
+        int y_samples = orig->y_height * orig->y_width;
+        int uv_samples = orig->uv_height * orig->uv_width;
+        int t_samples = y_samples + 2 * uv_samples;
+        int64_t sq_error;
+
+        ye = calc_plane_error(orig->y_buffer, orig->y_stride,
+                              recon->y_buffer, recon->y_stride, orig->y_width,
+                              orig->y_height);
+
+        ue = calc_plane_error(orig->u_buffer, orig->uv_stride,
+                              recon->u_buffer, recon->uv_stride, orig->uv_width,
+                              orig->uv_height);
+
+        ve = calc_plane_error(orig->v_buffer, orig->uv_stride,
+                              recon->v_buffer, recon->uv_stride, orig->uv_width,
+                              orig->uv_height);
+
+        sq_error = ye + ue + ve;
+
+        frame_psnr = vp9_mse2psnr(t_samples, 255.0, sq_error);
+
+        cpi->total_y += vp9_mse2psnr(y_samples, 255.0, ye);
+        cpi->total_u += vp9_mse2psnr(uv_samples, 255.0, ue);
+        cpi->total_v += vp9_mse2psnr(uv_samples, 255.0, ve);
+        cpi->total_sq_error += sq_error;
+        cpi->total  += frame_psnr;
+        {
+          double frame_psnr2, frame_ssim2 = 0;
+          double weight = 0;
+#if CONFIG_POSTPROC
+          vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer, cm->filter_level * 10 / 6, 1, 0, IF_RTCD(&cm->rtcd.postproc));
+#endif
+          vp9_clear_system_state();
+
+          ye = calc_plane_error(orig->y_buffer, orig->y_stride,
+                                pp->y_buffer, pp->y_stride, orig->y_width,
+                                orig->y_height);
+
+          ue = calc_plane_error(orig->u_buffer, orig->uv_stride,
+                                pp->u_buffer, pp->uv_stride, orig->uv_width,
+                                orig->uv_height);
+
+          ve = calc_plane_error(orig->v_buffer, orig->uv_stride,
+                                pp->v_buffer, pp->uv_stride, orig->uv_width,
+                                orig->uv_height);
+
+          sq_error = ye + ue + ve;
+
+          frame_psnr2 = vp9_mse2psnr(t_samples, 255.0, sq_error);
+
+          cpi->totalp_y += vp9_mse2psnr(y_samples, 255.0, ye);
+          cpi->totalp_u += vp9_mse2psnr(uv_samples, 255.0, ue);
+          cpi->totalp_v += vp9_mse2psnr(uv_samples, 255.0, ve);
+          cpi->total_sq_error2 += sq_error;
+          cpi->totalp  += frame_psnr2;
+
+          frame_ssim2 = vp9_calc_ssim(cpi->Source,
+                                      &cm->post_proc_buffer, 1, &weight);
+
+          cpi->summed_quality += frame_ssim2 * weight;
+          cpi->summed_weights += weight;
+#if 0
+          {
+            FILE *f = fopen("q_used.stt", "a");
+            fprintf(f, "%5d : Y%f7.3:U%f7.3:V%f7.3:F%f7.3:S%7.3f\n",
+                    cpi->common.current_video_frame, y2, u2, v2,
+                    frame_psnr2, frame_ssim2);
+            fclose(f);
+          }
+#endif
+        }
+      }
+
+      if (cpi->b_calculate_ssimg) {
+        double y, u, v, frame_all;
+        frame_all =  vp9_calc_ssimg(cpi->Source, cm->frame_to_show,
+                                    &y, &u, &v);
+        cpi->total_ssimg_y += y;
+        cpi->total_ssimg_u += u;
+        cpi->total_ssimg_v += v;
+        cpi->total_ssimg_all += frame_all;
+      }
+
+    }
+  }
+
+#endif
+
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+  if (cm->rtcd.flags & HAS_NEON)
+#endif
+  {
+    vp9_pop_neon(store_reg);
+  }
+#endif
+
+  return 0;
+}
+
+int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest,
+                              vp9_ppflags_t *flags) {
+  VP9_COMP *cpi = (VP9_COMP *) comp;
+
+  if (cpi->common.refresh_alt_ref_frame)
+    return -1;
+  else {
+    int ret;
+#if CONFIG_POSTPROC
+    ret = vp9_post_proc_frame(&cpi->common, dest, flags);
+#else
+
+    if (cpi->common.frame_to_show) {
+      *dest = *cpi->common.frame_to_show;
+      dest->y_width = cpi->common.Width;
+      dest->y_height = cpi->common.Height;
+      dest->uv_height = cpi->common.Height / 2;
+      ret = 0;
+    } else {
+      ret = -1;
+    }
+
+#endif // !CONFIG_POSTPROC
+    vp9_clear_system_state();
+    return ret;
+  }
+}
+
+int vp9_set_roimap(VP9_PTR comp, unsigned char *map, unsigned int rows,
+                   unsigned int cols, int delta_q[4], int delta_lf[4],
+                   unsigned int threshold[4]) {
+  VP9_COMP *cpi = (VP9_COMP *) comp;
+  signed char feature_data[SEG_LVL_MAX][MAX_MB_SEGMENTS];
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
+  int i;
+
+  if (cpi->common.mb_rows != rows || cpi->common.mb_cols != cols)
+    return -1;
+
+  if (!map) {
+    vp9_disable_segmentation((VP9_PTR)cpi);
+    return 0;
+  }
+
+  // Set the segmentation map.
+  vp9_set_segmentation_map((VP9_PTR)cpi, map);
+
+  // Activate segmentation.
+  vp9_enable_segmentation((VP9_PTR)cpi);
+
+  // Set up the quant segment data
+  feature_data[SEG_LVL_ALT_Q][0] = delta_q[0];
+  feature_data[SEG_LVL_ALT_Q][1] = delta_q[1];
+  feature_data[SEG_LVL_ALT_Q][2] = delta_q[2];
+  feature_data[SEG_LVL_ALT_Q][3] = delta_q[3];
+
+  // Set up the loop filter segment data.
+  feature_data[SEG_LVL_ALT_LF][0] = delta_lf[0];
+  feature_data[SEG_LVL_ALT_LF][1] = delta_lf[1];
+  feature_data[SEG_LVL_ALT_LF][2] = delta_lf[2];
+  feature_data[SEG_LVL_ALT_LF][3] = delta_lf[3];
+
+  cpi->segment_encode_breakout[0] = threshold[0];
+  cpi->segment_encode_breakout[1] = threshold[1];
+  cpi->segment_encode_breakout[2] = threshold[2];
+  cpi->segment_encode_breakout[3] = threshold[3];
+
+  // Enable the loop and quant changes in the feature mask
+  for (i = 0; i < 4; i++) {
+    if (delta_q[i])
+      vp9_enable_segfeature(xd, i, SEG_LVL_ALT_Q);
+    else
+      vp9_disable_segfeature(xd, i, SEG_LVL_ALT_Q);
+
+    if (delta_lf[i])
+      vp9_enable_segfeature(xd, i, SEG_LVL_ALT_LF);
+    else
+      vp9_disable_segfeature(xd, i, SEG_LVL_ALT_LF);
+  }
+
+  // Initialise the feature data structure with delta data
+  // (SEGMENT_DELTADATA = 0, SEGMENT_ABSDATA = 1).
+  vp9_set_segment_data((VP9_PTR)cpi, &feature_data[0][0], SEGMENT_DELTADATA);
+
+  return 0;
+}
+
+int vp9_set_active_map(VP9_PTR comp, unsigned char *map,
+                       unsigned int rows, unsigned int cols) {
+  VP9_COMP *cpi = (VP9_COMP *) comp;
+
+  if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) {
+    if (map) {
+      vpx_memcpy(cpi->active_map, map, rows * cols);
+      cpi->active_map_enabled = 1;
+    } else
+      cpi->active_map_enabled = 0;
+
+    return 0;
+  } else {
+    // cpi->active_map_enabled = 0;
+    return -1;
+  }
+}
+
+int vp9_set_internal_size(VP9_PTR comp,
+                          VPX_SCALING horiz_mode, VPX_SCALING vert_mode) {
+  VP9_COMP *cpi = (VP9_COMP *) comp;
+
+  if (horiz_mode <= ONETWO)
+    cpi->common.horiz_scale = horiz_mode;
+  else
+    return -1;
+
+  if (vert_mode <= ONETWO)
+    cpi->common.vert_scale  = vert_mode;
+  else
+    return -1;
+
+  return 0;
+}
+
+
+
+int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest) {
+  int i, j;
+  int Total = 0;
+
+  unsigned char *src = source->y_buffer;
+  unsigned char *dst = dest->y_buffer;
+
+  // Loop through the raw and reconstructed Y-plane data, summing the
+  // squared differences.
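+  // Note: operates on whole 16x16 blocks, so plane dimensions are assumed
+  // to be padded to multiples of 16.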
+  for (i = 0; i < source->y_height; i += 16) {
+    for (j = 0; j < source->y_width; j += 16) {
+      unsigned int sse;
+      Total += vp9_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride,
+                            &sse);
+    }
+
+    src += 16 * source->y_stride;
+    dst += 16 * dest->y_stride;
+  }
+
+  return Total;
+}
+
+
+int vp9_get_quantizer(VP9_PTR c) {
+  VP9_COMP   *cpi = (VP9_COMP *) c;
+  return cpi->common.base_qindex;
+}
--- /dev/null
+++ b/vp9/encoder/onyx_int.h
@@ -1,0 +1,788 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ONYX_INT_H
+#define __INC_ONYX_INT_H
+
+#include <stdio.h>
+#include "vpx_ports/config.h"
+#include "vp9/common/onyx.h"
+#include "treewriter.h"
+#include "tokenize.h"
+#include "vp9/common/onyxc_int.h"
+#include "variance.h"
+#include "encodemb.h"
+#include "quantize.h"
+#include "vp9/common/entropy.h"
+#include "vp9/common/entropymode.h"
+#include "vpx_ports/mem.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "mcomp.h"
+#include "temporal_filter.h"
+#include "vp9/common/findnearmv.h"
+#include "lookahead.h"
+
+// #define SPEEDSTATS 1
+#define MIN_GF_INTERVAL             4
+#define DEFAULT_GF_INTERVAL         7
+
+#define KEY_FRAME_CONTEXT 5
+
+#define MAX_LAG_BUFFERS 25
+
+#define AF_THRESH   25
+#define AF_THRESH2  100
+#define ARF_DECAY_THRESH 12
+
+#if CONFIG_PRED_FILTER
+#define MAX_MODES 54
+#else  // CONFIG_PRED_FILTER
+#define MAX_MODES 42
+#endif  // CONFIG_PRED_FILTER
+
+#define MIN_THRESHMULT  32
+#define MAX_THRESHMULT  512
+
+#define GF_ZEROMV_ZBIN_BOOST 12
+#define LF_ZEROMV_ZBIN_BOOST 6
+#define MV_ZBIN_BOOST        4
+#define ZBIN_OQ_MAX 192
+
+#define VP9_TEMPORAL_ALT_REF 1
+
+typedef struct {
+  nmv_context nmvc;
+  int nmvjointcost[MV_JOINTS];
+  int nmvcosts[2][MV_VALS];
+  int nmvcosts_hp[2][MV_VALS];
+
+#ifdef MODE_STATS
+  // Stats
+  int y_modes[VP9_YMODES];
+  int uv_modes[VP9_UV_MODES];
+  int i8x8_modes[VP9_I8X8_MODES];
+  int b_modes[B_MODE_COUNT];
+  int inter_y_modes[MB_MODE_COUNT];
+  int inter_uv_modes[VP9_UV_MODES];
+  int inter_b_modes[B_MODE_COUNT];
+#endif
+
+  vp9_prob segment_pred_probs[PREDICTION_PROBS];
+  unsigned char ref_pred_probs_update[PREDICTION_PROBS];
+  vp9_prob ref_pred_probs[PREDICTION_PROBS];
+  vp9_prob prob_comppred[COMP_PRED_CONTEXTS];
+
+  unsigned char *last_frame_seg_map_copy;
+
+  // 0 = Intra, Last, GF, ARF
+  signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS];
+  // 0 = BPRED, ZERO_MV, MV, SPLIT
+  signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];
+
+  vp9_prob coef_probs[BLOCK_TYPES]
+      [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
+  vp9_prob hybrid_coef_probs[BLOCK_TYPES]
+      [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
+
+  vp9_prob coef_probs_8x8[BLOCK_TYPES_8X8]
+      [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
+  vp9_prob hybrid_coef_probs_8x8[BLOCK_TYPES_8X8]
+      [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
+
+  vp9_prob coef_probs_16x16[BLOCK_TYPES_16X16]
+      [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
+  vp9_prob hybrid_coef_probs_16x16[BLOCK_TYPES_16X16]
+      [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
+
+  vp9_prob ymode_prob [VP9_YMODES - 1]; /* interframe intra mode probs */
+  vp9_prob uv_mode_prob [VP9_YMODES][VP9_UV_MODES - 1];
+  vp9_prob bmode_prob [VP9_BINTRAMODES - 1];
+  vp9_prob i8x8_mode_prob [VP9_I8X8_MODES - 1];
+  vp9_prob sub_mv_ref_prob [SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
+  vp9_prob mbsplit_prob [VP9_NUMMBSPLITS - 1];
+
+  vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
+                                 [VP9_SWITCHABLE_FILTERS - 1];
+
+  int mv_ref_ct[6][4][2];
+  int mode_context[6][4];
+  int mv_ref_ct_a[6][4][2];
+  int mode_context_a[6][4];
+
+} CODING_CONTEXT;
+
+typedef struct {
+  double frame;
+  double intra_error;
+  double coded_error;
+  double sr_coded_error;
+  double ssim_weighted_pred_err;
+  double pcnt_inter;
+  double pcnt_motion;
+  double pcnt_second_ref;
+  double pcnt_neutral;
+  double MVr;
+  double mvr_abs;
+  double MVc;
+  double mvc_abs;
+  double MVrv;
+  double MVcv;
+  double mv_in_out_count;
+  double new_mv_count;
+  double duration;
+  double count;
+}
+FIRSTPASS_STATS;
+
+typedef struct {
+  int frames_so_far;
+  double frame_intra_error;
+  double frame_coded_error;
+  double frame_pcnt_inter;
+  double frame_pcnt_motion;
+  double frame_mvr;
+  double frame_mvr_abs;
+  double frame_mvc;
+  double frame_mvc_abs;
+
+} ONEPASS_FRAMESTATS;
+
+typedef struct {
+  struct {
+    int err;
+    union {
+      int_mv mv;
+      MB_PREDICTION_MODE mode;
+    } m;
+  } ref[MAX_REF_FRAMES];
+} MBGRAPH_MB_STATS;
+
+typedef struct {
+  MBGRAPH_MB_STATS *mb_stats;
+} MBGRAPH_FRAME_STATS;
+
+#if CONFIG_PRED_FILTER
+typedef enum {
+  THR_ZEROMV,
+  THR_ZEROMV_FILT,
+  THR_DC,
+
+  THR_NEARESTMV,
+  THR_NEARESTMV_FILT,
+  THR_NEARMV,
+  THR_NEARMV_FILT,
+
+  THR_ZEROG,
+  THR_ZEROG_FILT,
+  THR_NEARESTG,
+  THR_NEARESTG_FILT,
+
+  THR_ZEROA,
+  THR_ZEROA_FILT,
+  THR_NEARESTA,
+  THR_NEARESTA_FILT,
+
+  THR_NEARG,
+  THR_NEARG_FILT,
+  THR_NEARA,
+  THR_NEARA_FILT,
+
+  THR_V_PRED,
+  THR_H_PRED,
+  THR_D45_PRED,
+  THR_D135_PRED,
+  THR_D117_PRED,
+  THR_D153_PRED,
+  THR_D27_PRED,
+  THR_D63_PRED,
+  THR_TM,
+
+  THR_NEWMV,
+  THR_NEWMV_FILT,
+  THR_NEWG,
+  THR_NEWG_FILT,
+  THR_NEWA,
+  THR_NEWA_FILT,
+
+  THR_SPLITMV,
+  THR_SPLITG,
+  THR_SPLITA,
+
+  THR_B_PRED,
+  THR_I8X8_PRED,
+
+  THR_COMP_ZEROLG,
+  THR_COMP_NEARESTLG,
+  THR_COMP_NEARLG,
+
+  THR_COMP_ZEROLA,
+  THR_COMP_NEARESTLA,
+  THR_COMP_NEARLA,
+
+  THR_COMP_ZEROGA,
+  THR_COMP_NEARESTGA,
+  THR_COMP_NEARGA,
+
+  THR_COMP_NEWLG,
+  THR_COMP_NEWLA,
+  THR_COMP_NEWGA,
+
+  THR_COMP_SPLITLG,
+  THR_COMP_SPLITLA,
+  THR_COMP_SPLITGA,
+}
+THR_MODES;
+#else
+typedef enum {
+  THR_ZEROMV,
+  THR_DC,
+
+  THR_NEARESTMV,
+  THR_NEARMV,
+
+  THR_ZEROG,
+  THR_NEARESTG,
+
+  THR_ZEROA,
+  THR_NEARESTA,
+
+  THR_NEARG,
+  THR_NEARA,
+
+  THR_V_PRED,
+  THR_H_PRED,
+  THR_D45_PRED,
+  THR_D135_PRED,
+  THR_D117_PRED,
+  THR_D153_PRED,
+  THR_D27_PRED,
+  THR_D63_PRED,
+  THR_TM,
+
+  THR_NEWMV,
+  THR_NEWG,
+  THR_NEWA,
+
+  THR_SPLITMV,
+  THR_SPLITG,
+  THR_SPLITA,
+
+  THR_B_PRED,
+  THR_I8X8_PRED,
+
+  THR_COMP_ZEROLG,
+  THR_COMP_NEARESTLG,
+  THR_COMP_NEARLG,
+
+  THR_COMP_ZEROLA,
+  THR_COMP_NEARESTLA,
+  THR_COMP_NEARLA,
+
+  THR_COMP_ZEROGA,
+  THR_COMP_NEARESTGA,
+  THR_COMP_NEARGA,
+
+  THR_COMP_NEWLG,
+  THR_COMP_NEWLA,
+  THR_COMP_NEWGA,
+
+  THR_COMP_SPLITLG,
+  THR_COMP_SPLITLA,
+  THR_COMP_SPLITGA
+}
+THR_MODES;
+#endif
+
+typedef enum {
+  DIAMOND = 0,
+  NSTEP = 1,
+  HEX = 2
+} SEARCH_METHODS;
+
+typedef struct {
+  int RD;
+  SEARCH_METHODS search_method;
+  int improved_dct;
+  int auto_filter;
+  int recode_loop;
+  int iterative_sub_pixel;
+  int half_pixel_search;
+  int quarter_pixel_search;
+  int thresh_mult[MAX_MODES];
+  int max_step_search_steps;
+  int first_step;
+  int optimize_coefficients;
+  int no_skip_block4x4_search;
+  int improved_mv_pred;
+  int search_best_filter;
+
+} SPEED_FEATURES;
+
+typedef struct {
+  MACROBLOCK  mb;
+  int totalrate;
+} MB_ROW_COMP;
+
+typedef struct {
+  TOKENEXTRA *start;
+  TOKENEXTRA *stop;
+} TOKENLIST;
+
+typedef struct {
+  int ithread;
+  void *ptr1;
+  void *ptr2;
+} ENCODETHREAD_DATA;
+typedef struct {
+  int ithread;
+  void *ptr1;
+} LPFTHREAD_DATA;
+
+
+typedef struct VP9_ENCODER_RTCD {
+  VP9_COMMON_RTCD            *common;
+  vp9_search_rtcd_vtable_t    search;
+  vp9_temporal_rtcd_vtable_t  temporal;
+} VP9_ENCODER_RTCD;
+
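+// The first four entries alias the PARTITIONING_* values, so partition-sized
+// blocks and the larger 16x16/32x32 sizes share one index space (used, for
+// example, to index fn_ptr[] below).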
+enum BlockSize {
+  BLOCK_16X8 = PARTITIONING_16X8,
+  BLOCK_8X16 = PARTITIONING_8X16,
+  BLOCK_8X8 = PARTITIONING_8X8,
+  BLOCK_4X4 = PARTITIONING_4X4,
+  BLOCK_16X16,
+  BLOCK_MAX_SEGMENTS,
+  BLOCK_32X32 = BLOCK_MAX_SEGMENTS,
+  BLOCK_MAX_SB_SEGMENTS,
+};
+
+typedef struct VP9_COMP {
+
+  DECLARE_ALIGNED(16, short, Y1quant[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, unsigned char, Y1quant_shift[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, Y1zbin[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, Y1round[QINDEX_RANGE][16]);
+
+  DECLARE_ALIGNED(16, short, Y2quant[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, unsigned char, Y2quant_shift[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, Y2zbin[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, Y2round[QINDEX_RANGE][16]);
+
+  DECLARE_ALIGNED(16, short, UVquant[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, unsigned char, UVquant_shift[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, UVzbin[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, UVround[QINDEX_RANGE][16]);
+
+  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv[QINDEX_RANGE][16]);
+
+  DECLARE_ALIGNED(64, short, Y1zbin_8x8[QINDEX_RANGE][64]);
+  DECLARE_ALIGNED(64, short, Y2zbin_8x8[QINDEX_RANGE][64]);
+  DECLARE_ALIGNED(64, short, UVzbin_8x8[QINDEX_RANGE][64]);
+  DECLARE_ALIGNED(64, short, zrun_zbin_boost_y1_8x8[QINDEX_RANGE][64]);
+  DECLARE_ALIGNED(64, short, zrun_zbin_boost_y2_8x8[QINDEX_RANGE][64]);
+  DECLARE_ALIGNED(64, short, zrun_zbin_boost_uv_8x8[QINDEX_RANGE][64]);
+
+  DECLARE_ALIGNED(16, short, Y1zbin_16x16[QINDEX_RANGE][256]);
+  DECLARE_ALIGNED(16, short, Y2zbin_16x16[QINDEX_RANGE][256]);
+  DECLARE_ALIGNED(16, short, UVzbin_16x16[QINDEX_RANGE][256]);
+  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1_16x16[QINDEX_RANGE][256]);
+  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2_16x16[QINDEX_RANGE][256]);
+  DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv_16x16[QINDEX_RANGE][256]);
+
+  MACROBLOCK mb;
+  VP9_COMMON common;
+  VP9_CONFIG oxcf;
+
+  struct lookahead_ctx    *lookahead;
+  struct lookahead_entry  *source;
+  struct lookahead_entry  *alt_ref_source;
+
+  YV12_BUFFER_CONFIG *Source;
+  YV12_BUFFER_CONFIG *un_scaled_source;
+  YV12_BUFFER_CONFIG scaled_source;
+
+  int source_alt_ref_pending; // a frame in src_buffers has been identified to be encoded as an alt ref
+  int source_alt_ref_active;  // an alt ref frame has been encoded and is usable
+
+  int is_src_frame_alt_ref;   // source of the frame to encode is an exact copy of an alt ref frame
+
+  int gold_is_last; // golden frame same as last frame (short-circuit gold searches)
+  int alt_is_last;  // alt ref frame same as last frame (short-circuit alt ref searches)
+  int gold_is_alt;  // don't do both alt and gold searches (just do gold)
+
+  // int refresh_alt_ref_frame;
+  YV12_BUFFER_CONFIG last_frame_uf;
+
+  TOKENEXTRA *tok;
+  unsigned int tok_count;
+
+
+  unsigned int frames_since_key;
+  unsigned int key_frame_frequency;
+  unsigned int this_key_frame_forced;
+  unsigned int next_key_frame_forced;
+
+  // Ambient reconstruction error target for forced key frames
+  int ambient_err;
+
+  unsigned int mode_check_freq[MAX_MODES];
+  unsigned int mode_test_hit_counts[MAX_MODES];
+  unsigned int mode_chosen_counts[MAX_MODES];
+
+  int rd_thresh_mult[MAX_MODES];
+  int rd_baseline_thresh[MAX_MODES];
+  int rd_threshes[MAX_MODES];
+  int64_t rd_comp_pred_diff[NB_PREDICTION_TYPES];
+  int rd_prediction_type_threshes[4][NB_PREDICTION_TYPES];
+  int comp_pred_count[COMP_PRED_CONTEXTS];
+  int single_pred_count[COMP_PRED_CONTEXTS];
+  // FIXME contextualize
+  int txfm_count[TX_SIZE_MAX];
+  int txfm_count_8x8p[TX_SIZE_MAX - 1];
+  int64_t rd_tx_select_diff[NB_TXFM_MODES];
+  int rd_tx_select_threshes[4][NB_TXFM_MODES];
+
+  int RDMULT;
+  int RDDIV;
+
+  CODING_CONTEXT coding_context;
+
+  // Rate targeting variables
+  int64_t prediction_error;
+  int64_t last_prediction_error;
+  int64_t intra_error;
+  int64_t last_intra_error;
+
+  int this_frame_target;
+  int projected_frame_size;
+  int last_q[2];                   // Separate values for Intra/Inter
+  int last_boosted_qindex;         // Last boosted GF/KF/ARF q
+
+  double rate_correction_factor;
+  double key_frame_rate_correction_factor;
+  double gf_rate_correction_factor;
+
+  int frames_till_gf_update_due;      // Count down till next GF
+  int current_gf_interval;          // GF interval chosen when we coded the last GF
+
+  int gf_overspend_bits;            // Total bits overspent because of GF boost (cumulative)
+
+  int non_gf_bitrate_adjustment;     // Used in the few frames following a GF to recover the extra bits spent in that GF
+
+  int kf_overspend_bits;            // Extra bits spent on key frames that need to be recovered on inter frames
+  int kf_bitrate_adjustment;        // Current number of bits to try and recover on each inter frame.
+  int max_gf_interval;
+  int baseline_gf_interval;
+  int active_arnr_frames;           // <= cpi->oxcf.arnr_max_frames
+
+  int64_t key_frame_count;
+  int prior_key_frame_distance[KEY_FRAME_CONTEXT];
+  int per_frame_bandwidth;          // Current section per frame bandwidth target
+  int av_per_frame_bandwidth;        // Average frame size target for clip
+  int min_frame_bandwidth;          // Minimum allocation that should be used for any frame
+  int inter_frame_target;
+  double output_frame_rate;
+  int64_t last_time_stamp_seen;
+  int64_t last_end_time_stamp_seen;
+  int64_t first_time_stamp_ever;
+
+  int ni_av_qi;
+  int ni_tot_qi;
+  int ni_frames;
+  int avg_frame_qindex;
+  double tot_q;
+  double avg_q;
+
+  int zbin_over_quant;
+  int zbin_mode_boost;
+  int zbin_mode_boost_enabled;
+
+  int64_t total_byte_count;
+
+  int buffered_mode;
+
+  int buffer_level;
+  int bits_off_target;
+
+  int rolling_target_bits;
+  int rolling_actual_bits;
+
+  int long_rolling_target_bits;
+  int long_rolling_actual_bits;
+
+  int64_t total_actual_bits;
+  int total_target_vs_actual;        // debug stats
+
+  int worst_quality;
+  int active_worst_quality;
+  int best_quality;
+  int active_best_quality;
+
+  int cq_target_quality;
+
+#if CONFIG_SUPERBLOCKS
+  int sb_count;
+  int sb_ymode_count [VP9_I32X32_MODES];
+#endif
+  int ymode_count [VP9_YMODES];        /* intra MB type cts this frame */
+  int bmode_count [VP9_BINTRAMODES];
+  int i8x8_mode_count [VP9_I8X8_MODES];
+  int sub_mv_ref_count [SUBMVREF_COUNT][VP9_SUBMVREFS];
+  int mbsplit_count [VP9_NUMMBSPLITS];
+  // int uv_mode_count[VP9_UV_MODES];       /* intra MB type cts this frame */
+  int y_uv_mode_count[VP9_YMODES][VP9_UV_MODES];
+
+  nmv_context_counts NMVcount;
+
+  unsigned int coef_counts [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];  /* for this frame */
+  vp9_prob frame_coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+  unsigned int frame_branch_ct [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
+  unsigned int hybrid_coef_counts [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];  /* for this frame */
+  vp9_prob frame_hybrid_coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+  unsigned int frame_hybrid_branch_ct [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
+
+  unsigned int coef_counts_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];  /* for this frame */
+  vp9_prob frame_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+  unsigned int frame_branch_ct_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
+  unsigned int hybrid_coef_counts_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];  /* for this frame */
+  vp9_prob frame_hybrid_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+  unsigned int frame_hybrid_branch_ct_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
+
+  unsigned int coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];  /* for this frame */
+  vp9_prob frame_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+  unsigned int frame_branch_ct_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
+  unsigned int hybrid_coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];  /* for this frame */
+  vp9_prob frame_hybrid_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+  unsigned int frame_hybrid_branch_ct_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
+
+  int gfu_boost;
+  int last_boost;
+  int kf_boost;
+  int kf_zeromotion_pct;
+
+  int target_bandwidth;
+  struct vpx_codec_pkt_list  *output_pkt_list;
+
+#if 0
+  // Experimental code for lagged and one pass
+  ONEPASS_FRAMESTATS one_pass_frame_stats[MAX_LAG_BUFFERS];
+  int one_pass_frame_index;
+#endif
+  MBGRAPH_FRAME_STATS mbgraph_stats[MAX_LAG_BUFFERS];
+  int mbgraph_n_frames;             // number of frames filled in the above
+  int static_mb_pct;                // % forced skip mbs by segmentation
+  int seg0_progress, seg0_idx, seg0_cnt;
+  int ref_pred_count[3][2];
+
+  int decimation_factor;
+  int decimation_count;
+
+  // for real time encoding
+  int avg_encode_time;              // microseconds
+  int avg_pick_mode_time;           // microseconds
+  int Speed;
+  unsigned int cpu_freq;            // MHz
+  int compressor_speed;
+
+  int interquantizer;
+  int goldfreq;
+  int auto_worst_q;
+  int cpu_used;
+  int horiz_scale;
+  int vert_scale;
+  int pass;
+
+  vp9_prob last_skip_false_probs[3][MBSKIP_CONTEXTS];
+  int last_skip_probs_q[3];
+
+  int recent_ref_frame_usage[MAX_REF_FRAMES];
+  int count_mb_ref_frame_usage[MAX_REF_FRAMES];
+  int ref_frame_flags;
+
+  unsigned char ref_pred_probs_update[PREDICTION_PROBS];
+
+  SPEED_FEATURES sf;
+  int error_bins[1024];
+
+  // Data used in real-time conferencing mode to help decide whether to
+  // update the golden frame
+  int inter_zz_count;
+  int gf_bad_count;
+  int gf_update_recommended;
+  int skip_true_count[3];
+  int skip_false_count[3];
+
+  unsigned char *segmentation_map;
+
+  // segment thresholds for encode breakout
+  int  segment_encode_breakout[MAX_MB_SEGMENTS];
+
+  unsigned char *active_map;
+  unsigned int active_map_enabled;
+
+  TOKENLIST *tplist;
+
+  fractional_mv_step_fp *find_fractional_mv_step;
+  vp9_full_search_fn_t full_search_sad;
+  vp9_refining_search_fn_t refining_search_sad;
+  vp9_diamond_search_fn_t diamond_search_sad;
+  vp9_variance_fn_ptr_t fn_ptr[BLOCK_MAX_SB_SEGMENTS];
+  uint64_t time_receive_data;
+  uint64_t time_compress_data;
+  uint64_t time_pick_lpf;
+  uint64_t time_encode_mb_row;
+
+  int base_skip_false_prob[QINDEX_RANGE][3];
+
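+  // Two-pass rate-control state, derived from the first-pass stats packets.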
+  struct twopass_rc {
+    unsigned int section_intra_rating;
+    unsigned int next_iiratio;
+    unsigned int this_iiratio;
+    FIRSTPASS_STATS *total_stats;
+    FIRSTPASS_STATS *this_frame_stats;
+    FIRSTPASS_STATS *stats_in, *stats_in_end, *stats_in_start;
+    FIRSTPASS_STATS *total_left_stats;
+    int first_pass_done;
+    int64_t bits_left;
+    int64_t clip_bits_total;
+    double avg_iiratio;
+    double modified_error_total;
+    double modified_error_used;
+    double modified_error_left;
+    double kf_intra_err_min;
+    double gf_intra_err_min;
+    int frames_to_key;
+    int maxq_max_limit;
+    int maxq_min_limit;
+    int static_scene_max_gf_interval;
+    int kf_bits;
+    int gf_group_error_left;           // Remaining error from uncoded frames in a gf group. Two pass use only
+
+    // Projected total bits available for a key frame group of frames
+    int64_t kf_group_bits;
+
+    // Error score of frames still to be coded in kf group
+    int64_t kf_group_error_left;
+
+    int gf_group_bits;                // Projected Bits available for a group of frames including 1 GF or ARF
+    int gf_bits;                     // Bits for the golden frame or ARF - 2 pass only
+    int alt_extra_bits;
+
+    int sr_update_lag;
+    double est_max_qcorrection_factor;
+  } twopass;
+
+#if CONFIG_RUNTIME_CPU_DETECT
+  VP9_ENCODER_RTCD            rtcd;
+#endif
+#if VP9_TEMPORAL_ALT_REF
+  YV12_BUFFER_CONFIG alt_ref_buffer;
+  YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS];
+  int fixed_divide[512];
+#endif
+
+#if CONFIG_INTERNAL_STATS
+  int    count;
+  double total_y;
+  double total_u;
+  double total_v;
+  double total;
+  double total_sq_error;
+  double totalp_y;
+  double totalp_u;
+  double totalp_v;
+  double totalp;
+  double total_sq_error2;
+  int    bytes;
+  double summed_quality;
+  double summed_weights;
+  unsigned int tot_recode_hits;
+
+
+  double total_ssimg_y;
+  double total_ssimg_u;
+  double total_ssimg_v;
+  double total_ssimg_all;
+
+  int b_calculate_ssimg;
+#endif
+  int b_calculate_psnr;
+
+  // Per MB activity measurement
+  unsigned int activity_avg;
+  unsigned int *mb_activity_map;
+  int *mb_norm_activity_map;
+
+  // Record of which MBs still refer to last golden frame either
+  // directly or through 0,0
+  unsigned char *gf_active_flags;
+  int gf_active_count;
+
+  int output_partition;
+
+  // Store last frame's MV info for next frame MV prediction
+  int_mv *lfmv;
+  int *lf_ref_frame_sign_bias;
+  int *lf_ref_frame;
+
+  /* force next frame to intra when kf_auto says so */
+  int force_next_frame_intra;
+
+  int droppable;
+
+  // TODO Do we still need this??
+  int update_context;
+
+  int dummy_packing;    /* flag to indicate if packing is dummy */
+
+#if CONFIG_PRED_FILTER
+  int pred_filter_on_count;
+  int pred_filter_off_count;
+#endif
+  unsigned int switchable_interp_count[VP9_SWITCHABLE_FILTERS + 1]
+                                      [VP9_SWITCHABLE_FILTERS];
+
+#if CONFIG_NEW_MVREF
+  unsigned int best_ref_index_counts[MAX_REF_FRAMES][MAX_MV_REFS];
+#endif
+
+} VP9_COMP;
+
+void vp9_encode_frame(VP9_COMP *cpi);
+
+void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
+                        unsigned long *size);
+
+void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x);
+
+void vp9_tokenize_mb(VP9_COMP *, MACROBLOCKD *, TOKENEXTRA **, int dry_run);
+void vp9_stuff_mb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run);
+
+void vp9_set_speed_features(VP9_COMP *cpi);
+
+#if CONFIG_DEBUG
+#define CHECK_MEM_ERROR(lval,expr) do {\
+    lval = (expr); \
+    if(!lval) \
+      vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,\
+                         "Failed to allocate "#lval" at %s:%d", \
+                         __FILE__,__LINE__);\
+  } while(0)
+#else
+#define CHECK_MEM_ERROR(lval,expr) do {\
+    lval = (expr); \
+    if(!lval) \
+      vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,\
+                         "Failed to allocate "#lval);\
+  } while(0)
+#endif
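+
+// Illustrative use (hypothetical allocation site):
+//   CHECK_MEM_ERROR(cpi->tplist, vpx_calloc(cm->mb_rows, sizeof(TOKENLIST)));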
+#endif  // __INC_ONYX_INT_H
--- /dev/null
+++ b/vp9/encoder/picklpf.c
@@ -1,0 +1,420 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp9/common/onyxc_int.h"
+#include "onyx_int.h"
+#include "quantize.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_scale/yv12extend.h"
+#include "vpx_scale/vpxscale.h"
+#include "vp9/common/alloccommon.h"
+#include "vp9/common/loopfilter.h"
+#if ARCH_ARM
+#include "vpx_ports/arm.h"
+#endif
+
+extern int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source,
+                           YV12_BUFFER_CONFIG *dest);
+#if HAVE_ARMV7
+extern void vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+#endif
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define IF_RTCD(x) (x)
+#else
+#define IF_RTCD(x) NULL
+#endif
+
+extern void(*vp9_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc,
+                                              YV12_BUFFER_CONFIG *dst_ybc,
+                                              int fraction);
+
+void vp9_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc,
+                                 YV12_BUFFER_CONFIG *dst_ybc, int Fraction) {
+  unsigned char *src_y, *dst_y;
+  int yheight;
+  int ystride;
+  int border;
+  int yoffset;
+  int linestocopy;
+
+  border   = src_ybc->border;
+  yheight  = src_ybc->y_height;
+  ystride  = src_ybc->y_stride;
+
+  linestocopy = (yheight >> (Fraction + 4));
+
+  if (linestocopy < 1)
+    linestocopy = 1;
+
+  linestocopy <<= 4;
+
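+  // Copy a 16-row-aligned band from roughly the vertical centre of the frame
+  // (starting 8 rows above the midpoint) rather than the whole plane.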
+  yoffset  = ystride * ((yheight >> 5) * 16 - 8);
+  src_y = src_ybc->y_buffer + yoffset;
+  dst_y = dst_ybc->y_buffer + yoffset;
+
+  vpx_memcpy(dst_y, src_y, ystride * (linestocopy + 16));
+}
+
+static int calc_partial_ssl_err(YV12_BUFFER_CONFIG *source,
+                                YV12_BUFFER_CONFIG *dest, int Fraction) {
+  int i, j;
+  int Total = 0;
+  int srcoffset, dstoffset;
+  unsigned char *src = source->y_buffer;
+  unsigned char *dst = dest->y_buffer;
+
+  int linestocopy = (source->y_height >> (Fraction + 4));
+
+  if (linestocopy < 1)
+    linestocopy = 1;
+
+  linestocopy <<= 4;
+
+
+  srcoffset = source->y_stride   * (dest->y_height >> 5) * 16;
+  dstoffset = dest->y_stride     * (dest->y_height >> 5) * 16;
+
+  src += srcoffset;
+  dst += dstoffset;
+
+  // Loop through the raw and reconstructed Y-plane data, summing the
+  // squared differences.
+  for (i = 0; i < linestocopy; i += 16) {
+    for (j = 0; j < source->y_width; j += 16) {
+      unsigned int sse;
+      Total += vp9_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride,
+                            &sse);
+    }
+
+    src += 16 * source->y_stride;
+    dst += 16 * dest->y_stride;
+  }
+
+  return Total;
+}
+
+// Enforce a minimum filter level based upon baseline Q
+static int get_min_filter_level(VP9_COMP *cpi, int base_qindex) {
+  int min_filter_level;
+  /*int q = (int) vp9_convert_qindex_to_q(base_qindex);
+
+  if (cpi->source_alt_ref_active && cpi->common.refresh_golden_frame && !cpi->common.refresh_alt_ref_frame)
+      min_filter_level = 0;
+  else
+  {
+      if (q <= 10)
+          min_filter_level = 0;
+      else if (q <= 64)
+          min_filter_level = 1;
+      else
+          min_filter_level = (q >> 6);
+  }
+  */
+  min_filter_level = 0;
+
+  return min_filter_level;
+}
+
+// Enforce a maximum filter level based upon baseline Q
+static int get_max_filter_level(VP9_COMP *cpi, int base_qindex) {
+  // PGW August 2006: Highest filter values are almost always a bad idea.
+
+  // jbb chg: 20100118 - not so any more with this overquant stuff; allow
+  // high values when lots of intra is coming in.
+  int max_filter_level = MAX_LOOP_FILTER;// * 3 / 4;
+  (void)base_qindex;
+
+  if (cpi->twopass.section_intra_rating > 8)
+    max_filter_level = MAX_LOOP_FILTER * 3 / 4;
+
+  return max_filter_level;
+}
+
+void vp9_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+
+  int best_err = 0;
+  int filt_err = 0;
+  int min_filter_level = get_min_filter_level(cpi, cm->base_qindex);
+  int max_filter_level = get_max_filter_level(cpi, cm->base_qindex);
+  int filt_val;
+  int best_filt_val = cm->filter_level;
+
+  //  Make a copy of the unfiltered / processed recon buffer
+  vp9_yv12_copy_partial_frame_ptr(cm->frame_to_show, &cpi->last_frame_uf, 3);
+
+  if (cm->frame_type == KEY_FRAME)
+    cm->sharpness_level = 0;
+  else
+    cm->sharpness_level = cpi->oxcf.Sharpness;
+
+  if (cm->sharpness_level != cm->last_sharpness_level) {
+    vp9_loop_filter_update_sharpness(&cm->lf_info, cm->sharpness_level);
+    cm->last_sharpness_level = cm->sharpness_level;
+  }
+
+  // Start the search at the previous frame filter level unless it is now out of range.
+  if (cm->filter_level < min_filter_level)
+    cm->filter_level = min_filter_level;
+  else if (cm->filter_level > max_filter_level)
+    cm->filter_level = max_filter_level;
+
+  filt_val = cm->filter_level;
+  best_filt_val = filt_val;
+
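+  // Fast search: walk down from the starting level while the error keeps
+  // improving; only if no lower level wins, walk up instead.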
+  // Get the err using the previous frame's filter value.
+  vp9_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
+
+  best_err = calc_partial_ssl_err(sd, cm->frame_to_show, 3);
+
+  //  Re-instate the unfiltered frame
+  vp9_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show, 3);
+
+  filt_val -= (1 + ((filt_val > 10) ? 1 : 0));
+
+  // Search lower filter levels
+  while (filt_val >= min_filter_level) {
+    // Apply the loop filter
+    vp9_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
+
+    // Get the err for filtered frame
+    filt_err = calc_partial_ssl_err(sd, cm->frame_to_show, 3);
+
+    //  Re-instate the unfiltered frame
+    vp9_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show, 3);
+
+
+    // Update the best case record or exit loop.
+    if (filt_err < best_err) {
+      best_err = filt_err;
+      best_filt_val = filt_val;
+    } else
+      break;
+
+    // Adjust filter level
+    filt_val -= (1 + ((filt_val > 10) ? 1 : 0));
+  }
+
+  // Search up (note that we have already done filt_val = cm->filter_level)
+  filt_val = cm->filter_level + (1 + ((filt_val > 10) ? 1 : 0));
+
+  if (best_filt_val == cm->filter_level) {
+    // Resist raising filter level for very small gains
+    best_err -= (best_err >> 10);
+
+    while (filt_val < max_filter_level) {
+      // Apply the loop filter
+      vp9_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
+
+      // Get the err for filtered frame
+      filt_err = calc_partial_ssl_err(sd, cm->frame_to_show, 3);
+
+      //  Re-instate the unfiltered frame
+      vp9_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf,
+                                      cm->frame_to_show, 3);
+
+      // Update the best case record or exit loop.
+      if (filt_err < best_err) {
+        // Do not raise the filter level if the improvement is < 1 part in 1024
+        best_err = filt_err - (filt_err >> 10);
+
+        best_filt_val = filt_val;
+      } else
+        break;
+
+      // Adjust filter level
+      filt_val += (1 + ((filt_val > 10) ? 1 : 0));
+    }
+  }
+
+  cm->filter_level = best_filt_val;
+
+  if (cm->filter_level < min_filter_level)
+    cm->filter_level = min_filter_level;
+
+  if (cm->filter_level > max_filter_level)
+    cm->filter_level = max_filter_level;
+}
+
+// Stub function for now; the alternate loop filter level is not used.
+void vp9_set_alt_lf_level(VP9_COMP *cpi, int filt_val) {
+}
+
+void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+
+  int best_err = 0;
+  int filt_err = 0;
+  int min_filter_level = get_min_filter_level(cpi, cm->base_qindex);
+  int max_filter_level = get_max_filter_level(cpi, cm->base_qindex);
+
+  int filter_step;
+  int filt_high = 0;
+  int filt_mid = cm->filter_level;      // Start search at previous frame filter level
+  int filt_low = 0;
+  int filt_best;
+  int filt_direction = 0;
+
+  int Bias = 0;                       // Bias against raising loop filter and in favour of lowering it
+
+  //  Make a copy of the unfiltered / processed recon buffer
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+  if (cm->rtcd.flags & HAS_NEON)
+#endif
+  {
+    vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(cm->frame_to_show, &cpi->last_frame_uf);
+  }
+#if CONFIG_RUNTIME_CPU_DETECT
+  else
+#endif
+#endif
+#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT
+  {
+    vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cpi->last_frame_uf);
+  }
+#endif
+
+  if (cm->frame_type == KEY_FRAME)
+    cm->sharpness_level = 0;
+  else
+    cm->sharpness_level = cpi->oxcf.Sharpness;
+
+  // Start the search at the previous frame filter level unless it is now out of range.
+  filt_mid = cm->filter_level;
+
+  if (filt_mid < min_filter_level)
+    filt_mid = min_filter_level;
+  else if (filt_mid > max_filter_level)
+    filt_mid = max_filter_level;
+
+  // Define the initial step size
+  filter_step = (filt_mid < 16) ? 4 : filt_mid / 4;
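+  // The loop below is a coarse-to-fine search: probe one step above and below
+  // filt_mid, recentre on any improvement, and halve the step when neither
+  // direction helps.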
+
+  // Get baseline error score
+  vp9_set_alt_lf_level(cpi, filt_mid);
+  vp9_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_mid);
+
+  best_err = vp9_calc_ss_err(sd, cm->frame_to_show);
+  filt_best = filt_mid;
+
+  //  Re-instate the unfiltered frame
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+  if (cm->rtcd.flags & HAS_NEON)
+#endif
+  {
+    vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show);
+  }
+#if CONFIG_RUNTIME_CPU_DETECT
+  else
+#endif
+#endif
+#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT
+  {
+    vp8_yv12_copy_frame_yonly_ptr(&cpi->last_frame_uf, cm->frame_to_show);
+  }
+#endif
+
+  while (filter_step > 0) {
+    Bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; // PGW change 12/12/06 for small images
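+    // Bias is applied in favour of lower filter levels; it grows with the
+    // current best error and the step size.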
+
+    // jbb chg: 20100118 - in sections with lots of new material coming in,
+    // don't bias as much toward a low filter value
+    if (cpi->twopass.section_intra_rating < 20)
+      Bias = Bias * cpi->twopass.section_intra_rating / 20;
+
+    // yx: bias less for large block sizes
+    if (cpi->common.txfm_mode != ONLY_4X4)
+      Bias >>= 1;
+
+    filt_high = ((filt_mid + filter_step) > max_filter_level) ? max_filter_level : (filt_mid + filter_step);
+    filt_low = ((filt_mid - filter_step) < min_filter_level) ? min_filter_level : (filt_mid - filter_step);
+
+    if ((filt_direction <= 0) && (filt_low != filt_mid)) {
+      // Get Low filter error score
+      vp9_set_alt_lf_level(cpi, filt_low);
+      vp9_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_low);
+
+      filt_err = vp9_calc_ss_err(sd, cm->frame_to_show);
+
+      //  Re-instate the unfiltered frame
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+      if (cm->rtcd.flags & HAS_NEON)
+#endif
+      {
+        vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show);
+      }
+#if CONFIG_RUNTIME_CPU_DETECT
+      else
+#endif
+#endif
+#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT
+      {
+        vp8_yv12_copy_frame_yonly_ptr(&cpi->last_frame_uf, cm->frame_to_show);
+      }
+#endif
+
+      // If value is close to the best so far then bias towards a lower loop filter value.
+      if ((filt_err - Bias) < best_err) {
+        // Was it actually better than the previous best?
+        if (filt_err < best_err)
+          best_err = filt_err;
+
+        filt_best = filt_low;
+      }
+    }
+
+    // Now look at filt_high
+    if ((filt_direction >= 0) && (filt_high != filt_mid)) {
+      vp9_set_alt_lf_level(cpi, filt_high);
+      vp9_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_high);
+
+      filt_err = vp9_calc_ss_err(sd, cm->frame_to_show);
+
+      //  Re-instate the unfiltered frame
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+      if (cm->rtcd.flags & HAS_NEON)
+#endif
+      {
+        vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show);
+      }
+#if CONFIG_RUNTIME_CPU_DETECT
+      else
+#endif
+#endif
+#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT
+      {
+        vp8_yv12_copy_frame_yonly_ptr(&cpi->last_frame_uf, cm->frame_to_show);
+      }
+#endif
+
+      // Was it better than the previous best?
+      if (filt_err < (best_err - Bias)) {
+        best_err = filt_err;
+        filt_best = filt_high;
+      }
+    }
+
+    // Halve the step distance if the best filter value was the same as last time
+    if (filt_best == filt_mid) {
+      filter_step = filter_step / 2;
+      filt_direction = 0;
+    } else {
+      filt_direction = (filt_best < filt_mid) ? -1 : 1;
+      filt_mid = filt_best;
+    }
+  }
+
+  cm->filter_level = filt_best;
+}
+
--- /dev/null
+++ b/vp9/encoder/ppc/csystemdependent.c
@@ -1,0 +1,155 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp9/encoder/variance.h"
+#include "vp9/encoder/onyx_int.h"
+
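+// Function-pointer dispatch table for PPC: each pointer below is bound to
+// either a generic C routine or an AltiVec implementation in
+// vp9_cmachine_specific_config().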
+SADFunction *vp9_sad16x16;
+SADFunction *vp9_sad16x8;
+SADFunction *vp9_sad8x16;
+SADFunction *vp9_sad8x8;
+SADFunction *vp9_sad4x4;
+
+variance_function *vp9_variance4x4;
+variance_function *vp9_variance8x8;
+variance_function *vp9_variance8x16;
+variance_function *vp9_variance16x8;
+variance_function *vp9_variance16x16;
+
+variance_function *vp9_mse16x16;
+
+sub_pixel_variance_function *vp9_sub_pixel_variance4x4;
+sub_pixel_variance_function *vp9_sub_pixel_variance8x8;
+sub_pixel_variance_function *vp9_sub_pixel_variance8x16;
+sub_pixel_variance_function *vp9_sub_pixel_variance16x8;
+sub_pixel_variance_function *vp9_sub_pixel_variance16x16;
+
+int (*vp9_block_error)(short *coeff, short *dqcoeff);
+int (*vp9_mbblock_error)(MACROBLOCK *mb, int dc);
+
+int (*vp9_mbuverror)(MACROBLOCK *mb);
+unsigned int (*vp9_get_mb_ss)(short *);
+void (*vp9_short_fdct4x4)(short *input, short *output, int pitch);
+void (*vp9_short_fdct8x4)(short *input, short *output, int pitch);
+void (*vp8_fast_fdct4x4)(short *input, short *output, int pitch);
+void (*vp8_fast_fdct8x4)(short *input, short *output, int pitch);
+void (*short_walsh4x4)(short *input, short *output, int pitch);
+
+void (*vp9_subtract_b)(BLOCK *be, BLOCKD *bd, int pitch);
+void (*vp9_subtract_mby)(short *diff, unsigned char *src, unsigned char *pred, int stride);
+void (*vp9_subtract_mbuv)(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
+void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d);
+
+// c imports
+extern int block_error_c(short *coeff, short *dqcoeff);
+extern int vp9_mbblock_error_c(MACROBLOCK *mb, int dc);
+
+extern int vp9_mbuverror_c(MACROBLOCK *mb);
+extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
+extern void short_fdct4x4_c(short *input, short *output, int pitch);
+extern void short_fdct8x4_c(short *input, short *output, int pitch);
+extern void vp9_short_walsh4x4_c(short *input, short *output, int pitch);
+
+extern void vp9_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch);
+extern void subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride);
+extern void subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
+extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d);
+
+extern SADFunction sad16x16_c;
+extern SADFunction sad16x8_c;
+extern SADFunction sad8x16_c;
+extern SADFunction sad8x8_c;
+extern SADFunction sad4x4_c;
+
+extern variance_function variance16x16_c;
+extern variance_function variance8x16_c;
+extern variance_function variance16x8_c;
+extern variance_function variance8x8_c;
+extern variance_function variance4x4_c;
+extern variance_function mse16x16_c;
+
+extern sub_pixel_variance_function sub_pixel_variance4x4_c;
+extern sub_pixel_variance_function sub_pixel_variance8x8_c;
+extern sub_pixel_variance_function sub_pixel_variance8x16_c;
+extern sub_pixel_variance_function sub_pixel_variance16x8_c;
+extern sub_pixel_variance_function sub_pixel_variance16x16_c;
+
+extern unsigned int vp9_get_mb_ss_c(short *);
+
+// ppc
+extern int vp9_block_error_ppc(short *coeff, short *dqcoeff);
+
+extern void vp9_short_fdct4x4_ppc(short *input, short *output, int pitch);
+extern void vp9_short_fdct8x4_ppc(short *input, short *output, int pitch);
+
+extern void vp9_subtract_mby_ppc(short *diff, unsigned char *src, unsigned char *pred, int stride);
+extern void vp9_subtract_mbuv_ppc(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
+
+extern SADFunction vp9_sad16x16_ppc;
+extern SADFunction vp9_sad16x8_ppc;
+extern SADFunction vp9_sad8x16_ppc;
+extern SADFunction vp9_sad8x8_ppc;
+extern SADFunction vp9_sad4x4_ppc;
+
+extern variance_function vp9_variance16x16_ppc;
+extern variance_function vp9_variance8x16_ppc;
+extern variance_function vp9_variance16x8_ppc;
+extern variance_function vp9_variance8x8_ppc;
+extern variance_function vp9_variance4x4_ppc;
+extern variance_function vp9_mse16x16_ppc;
+
+extern sub_pixel_variance_function vp9_sub_pixel_variance4x4_ppc;
+extern sub_pixel_variance_function vp9_sub_pixel_variance8x8_ppc;
+extern sub_pixel_variance_function vp9_sub_pixel_variance8x16_ppc;
+extern sub_pixel_variance_function vp9_sub_pixel_variance16x8_ppc;
+extern sub_pixel_variance_function vp9_sub_pixel_variance16x16_ppc;
+
+extern unsigned int vp8_get8x8var_ppc(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
+extern unsigned int vp8_get16x16var_ppc(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
+
+void vp9_cmachine_specific_config(void) {
+  // Mix of generic C and AltiVec implementations:
+  vp9_mbuverror               = vp9_mbuverror_c;
+  vp8_fast_quantize_b           = vp8_fast_quantize_b_c;
+  vp9_short_fdct4x4            = vp9_short_fdct4x4_ppc;
+  vp9_short_fdct8x4            = vp9_short_fdct8x4_ppc;
+  vp8_fast_fdct4x4             = vp9_short_fdct4x4_ppc;
+  vp8_fast_fdct8x4             = vp9_short_fdct8x4_ppc;
+  short_walsh4x4               = vp9_short_walsh4x4_c;
+
+  vp9_variance4x4             = vp9_variance4x4_ppc;
+  vp9_variance8x8             = vp9_variance8x8_ppc;
+  vp9_variance8x16            = vp9_variance8x16_ppc;
+  vp9_variance16x8            = vp9_variance16x8_ppc;
+  vp9_variance16x16           = vp9_variance16x16_ppc;
+  vp9_mse16x16                = vp9_mse16x16_ppc;
+
+  vp9_sub_pixel_variance4x4     = vp9_sub_pixel_variance4x4_ppc;
+  vp9_sub_pixel_variance8x8     = vp9_sub_pixel_variance8x8_ppc;
+  vp9_sub_pixel_variance8x16    = vp9_sub_pixel_variance8x16_ppc;
+  vp9_sub_pixel_variance16x8    = vp9_sub_pixel_variance16x8_ppc;
+  vp9_sub_pixel_variance16x16   = vp9_sub_pixel_variance16x16_ppc;
+
+  vp9_get_mb_ss                 = vp9_get_mb_ss_c;
+
+  vp9_sad16x16                = vp9_sad16x16_ppc;
+  vp9_sad16x8                 = vp9_sad16x8_ppc;
+  vp9_sad8x16                 = vp9_sad8x16_ppc;
+  vp9_sad8x8                  = vp9_sad8x8_ppc;
+  vp9_sad4x4                  = vp9_sad4x4_ppc;
+
+  vp9_block_error              = vp9_block_error_ppc;
+  vp9_mbblock_error            = vp9_mbblock_error_c;
+
+  vp9_subtract_b               = vp9_subtract_b_c;
+  vp9_subtract_mby             = vp9_subtract_mby_ppc;
+  vp9_subtract_mbuv            = vp9_subtract_mbuv_ppc;
+}
--- /dev/null
+++ b/vp9/encoder/ppc/encodemb_altivec.asm
@@ -1,0 +1,153 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    .globl vp8_subtract_mbuv_ppc
+    .globl vp8_subtract_mby_ppc
+
+;# r3 short *diff
+;# r4 unsigned char *usrc
+;# r5 unsigned char *vsrc
+;# r6 unsigned char *pred
+;# r7 int stride
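+;# Computes diff = src - pred for the two 8x8 chroma planes (U, then V);
+;# each loop iteration below handles two rows of eight pixels.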
+vp8_subtract_mbuv_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xf000
+    mtspr   256, r12            ;# set VRSAVE
+
+    li      r9, 256
+    add     r3, r3, r9
+    add     r3, r3, r9
+    add     r6, r6, r9
+
+    li      r10, 16
+    li      r9,  4
+    mtctr   r9
+
+    vspltisw v0, 0
+
+mbu_loop:
+    lvsl    v5, 0, r4           ;# permute value for alignment
+    lvx     v1, 0, r4           ;# src
+    lvx     v2, 0, r6           ;# pred
+
+    add     r4, r4, r7
+    addi    r6, r6, 16
+
+    vperm   v1, v1, v0, v5
+
+    vmrghb  v3, v0, v1          ;# unpack high src  to short
+    vmrghb  v4, v0, v2          ;# unpack high pred to short
+
+    lvsl    v5, 0, r4           ;# permute value for alignment
+    lvx     v1, 0, r4           ;# src
+
+    add     r4, r4, r7
+
+    vsubshs v3, v3, v4
+
+    stvx    v3, 0, r3           ;# store out diff
+
+    vperm   v1, v1, v0, v5
+
+    vmrghb  v3, v0, v1          ;# unpack high src  to short
+    vmrglb  v4, v0, v2          ;# unpack low pred to short
+
+    vsubshs v3, v3, v4
+
+    stvx    v3, r10, r3         ;# store out diff
+
+    addi    r3, r3, 32
+
+    bdnz    mbu_loop
+
+    mtctr   r9
+
+mbv_loop:
+    lvsl    v5, 0, r5           ;# permute value for alignment
+    lvx     v1, 0, r5           ;# src
+    lvx     v2, 0, r6           ;# pred
+
+    add     r5, r5, r7
+    addi    r6, r6, 16
+
+    vperm   v1, v1, v0, v5
+
+    vmrghb  v3, v0, v1          ;# unpack high src  to short
+    vmrghb  v4, v0, v2          ;# unpack high pred to short
+
+    lvsl    v5, 0, r5           ;# permutate value for alignment
+    lvx     v1, 0, r5           ;# src
+
+    add     r5, r5, r7
+
+    vsubshs v3, v3, v4
+
+    stvx    v3, 0, r3           ;# store out diff
+
+    vperm   v1, v1, v0, v5
+
+    vmrghb  v3, v0, v1          ;# unpack high src  to short
+    vmrglb  v4, v0, v2          ;# unpack low pred to short
+
+    vsubshs v3, v3, v4
+
+    stvx    v3, r10, r3         ;# store out diff
+
+    addi    r3, r3, 32
+
+    bdnz    mbv_loop
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+;# r3 short *diff
+;# r4 unsigned char *src
+;# r5 unsigned char *pred
+;# r6 int stride
+vp8_subtract_mby_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xf800
+    mtspr   256, r12            ;# set VRSAVE
+
+    li      r10, 16
+    mtctr   r10
+
+    vspltisw v0, 0
+
+mby_loop:
+    lvx     v1, 0, r4           ;# src
+    lvx     v2, 0, r5           ;# pred
+
+    add     r4, r4, r6
+    addi    r5, r5, 16
+
+    vmrghb  v3, v0, v1          ;# unpack high src  to short
+    vmrghb  v4, v0, v2          ;# unpack high pred to short
+
+    vsubshs v3, v3, v4
+
+    stvx    v3, 0, r3           ;# store out diff
+
+    vmrglb  v3, v0, v1          ;# unpack low src  to short
+    vmrglb  v4, v0, v2          ;# unpack low pred to short
+
+    vsubshs v3, v3, v4
+
+    stvx    v3, r10, r3         ;# store out diff
+
+    addi    r3, r3, 32
+
+    bdnz    mby_loop
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
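
Both subtract routines above compute the per-pixel difference diff = src - pred into a 16-bit buffer. As a reference point, here is a minimal scalar sketch of the luma case; the helper name is illustrative, and the packed 16-byte prediction rows are inferred from the addi updates in the code:

    /* Scalar sketch of the luma subtract: 16x16 block, prediction rows
     * packed 16 bytes apart as the register updates above imply. */
    void subtract_mby_ref(short *diff, const unsigned char *src,
                          const unsigned char *pred, int stride) {
      int r, c;
      for (r = 0; r < 16; r++) {
        for (c = 0; c < 16; c++)
          diff[c] = (short)(src[c] - pred[c]);   /* fits in 16 bits */
        diff += 16;
        src += stride;
        pred += 16;
      }
    }
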
--- /dev/null
+++ b/vp9/encoder/ppc/fdct_altivec.asm
@@ -1,0 +1,205 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    .globl vp8_short_fdct4x4_ppc
+    .globl vp8_short_fdct8x4_ppc
+
+.macro load_c V, LABEL, OFF, R0, R1
+    lis     \R0, \LABEL@ha
+    la      \R1, \LABEL@l(\R0)
+    lvx     \V, \OFF, \R1
+.endm
+
+;# Forward and inverse DCTs are nearly identical; the only differences
+;#   are in normalization (fwd is twice unitary, inv is half unitary)
+;#   and that they are of course transposes of each other.
+;#
+;#   The following three macros accomplish most of the implementation
+;#   and are used only by ppc_idct.c and ppc_fdct.c.
+.macro prologue
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xfffc
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1,-32(r1)          ;# create space on the stack
+
+    li      r6, 16
+
+    load_c v0, dct_tab, 0, r9, r10
+    lvx     v1,   r6, r10
+    addi    r10, r10, 32
+    lvx     v2,    0, r10
+    lvx     v3,   r6, r10
+
+    load_c v4, ppc_dctperm_tab,  0, r9, r10
+    load_c v5, ppc_dctperm_tab, r6, r9, r10
+
+    load_c v6, round_tab, 0, r10, r9
+.endm
+
+.macro epilogue
+    addi    r1, r1, 32          ;# recover stack
+
+    mtspr   256, r11            ;# reset old VRSAVE
+.endm
+
+;# Do horiz xf on two rows of coeffs  v8 = a0 a1 a2 a3  b0 b1 b2 b3.
+;#   a/A are the even rows 0,2   b/B are the odd rows 1,3
+;#   For fwd transform, indices are horizontal positions, then frequencies.
+;#   For inverse transform, frequencies then positions.
+;#   The two resulting  A0..A3  B0..B3  are later combined
+;#   and vertically transformed.
+
+.macro two_rows_horiz Dst
+    vperm   v9, v8, v8, v4      ;# v9 = a2 a3 a0 a1  b2 b3 b0 b1
+
+    vmsumshm v10, v0, v8, v6
+    vmsumshm v10, v1, v9, v10
+    vsraw   v10, v10, v7        ;# v10 = A0 A1  B0 B1
+
+    vmsumshm v11, v2, v8, v6
+    vmsumshm v11, v3, v9, v11
+    vsraw   v11, v11, v7        ;# v11 = A2 A3  B2 B3
+
+    vpkuwum v10, v10, v11       ;# v10  = A0 A1  B0 B1  A2 A3  B2 B3
+    vperm   \Dst, v10, v10, v5  ;# Dest = A0 B0  A1 B1  A2 B2  A3 B3
+.endm
+
+;# Vertical xf on two rows. DCT values in comments are for inverse transform;
+;#   forward transform uses transpose.
+
+.macro two_rows_vert Ceven, Codd
+    vspltw  v8, \Ceven, 0       ;# v8 = c00 c10  or  c02 c12 four times
+    vspltw  v9, \Codd,  0       ;# v9 = c20 c30  or  c22 c32 ""
+    vmsumshm v8, v8, v12, v6
+    vmsumshm v8, v9, v13, v8
+    vsraw   v10, v8, v7
+
+    vspltw  v8, \Codd,  1       ;# v8 = c01 c11  or  c03 c13
+    vspltw  v9, \Ceven, 1       ;# v9 = c21 c31  or  c23 c33
+    vmsumshm v8, v8, v12, v6
+    vmsumshm v8, v9, v13, v8
+    vsraw   v8, v8, v7
+
+    vpkuwum v8, v10, v8         ;# v8 = rows 0,1  or 2,3
+.endm
+
+.macro two_rows_h Dest
+    stw     r0,  0(r8)
+    lwz     r0,  4(r3)
+    stw     r0,  4(r8)
+    lwzux   r0, r3,r5
+    stw     r0,  8(r8)
+    lwz     r0,  4(r3)
+    stw     r0, 12(r8)
+    lvx     v8,  0,r8
+    two_rows_horiz \Dest
+.endm
+
+    .align 2
+;# r3 short *input
+;# r4 short *output
+;# r5 int pitch
+vp8_short_fdct4x4_ppc:
+
+    prologue
+
+    vspltisw v7, 14             ;# == 14, fits in 5 signed bits
+    addi    r8, r1, 0
+
+
+    lwz     r0, 0(r3)
+    two_rows_h v12                ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13
+
+    lwzux   r0, r3, r5
+    two_rows_h v13                ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33
+
+    lvx     v6, r6, r9          ;# v6 = Vround
+    vspltisw v7, -16            ;# shift of 16 (-16 == 16 in the low 5 bits)
+
+    two_rows_vert v0, v1
+    stvx    v8, 0, r4
+    two_rows_vert v2, v3
+    stvx    v8, r6, r4
+
+    epilogue
+
+    blr
+
+    .align 2
+;# r3 short *input
+;# r4 short *output
+;# r5 int pitch
+vp8_short_fdct8x4_ppc:
+    prologue
+
+    vspltisw v7, 14             ;# == 14, fits in 5 signed bits
+    addi    r8,  r1, 0
+    addi    r10, r3, 0
+
+    lwz     r0, 0(r3)
+    two_rows_h v12                ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13
+
+    lwzux   r0, r3, r5
+    two_rows_h v13                ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33
+
+    lvx     v6, r6, r9          ;# v6 = Vround
+    vspltisw v7, -16            ;# shift of 16 (-16 == 16 in the low 5 bits)
+
+    two_rows_vert v0, v1
+    stvx    v8, 0, r4
+    two_rows_vert v2, v3
+    stvx    v8, r6, r4
+
+    ;# Next block
+    addi    r3, r10, 8
+    addi    r4, r4, 32
+    lvx     v6, 0, r9           ;# v6 = Hround
+
+    vspltisw v7, 14             ;# == 14, fits in 5 signed bits
+    addi    r8, r1, 0
+
+    lwz     r0, 0(r3)
+    two_rows_h v12                ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13
+
+    lwzux   r0, r3, r5
+    two_rows_h v13                ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33
+
+    lvx     v6, r6, r9          ;# v6 = Vround
+    vspltisw v7, -16            ;# shift of 16 (-16 == 16 in the low 5 bits)
+
+    two_rows_vert v0, v1
+    stvx    v8, 0, r4
+    two_rows_vert v2, v3
+    stvx    v8, r6, r4
+
+    epilogue
+
+    blr
+
+    .data
+    .align 4
+ppc_dctperm_tab:
+    .byte 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11
+    .byte 0,1,4,5, 2,3,6,7, 8,9,12,13, 10,11,14,15
+
+    .align 4
+dct_tab:
+    .short  23170, 23170,-12540,-30274, 23170, 23170,-12540,-30274
+    .short  23170, 23170, 30274, 12540, 23170, 23170, 30274, 12540
+
+    .short  23170,-23170, 30274,-12540, 23170,-23170, 30274,-12540
+    .short -23170, 23170, 12540,-30274,-23170, 23170, 12540,-30274
+
+    .align 4
+round_tab:
+    .long (1 << (14-1)), (1 << (14-1)), (1 << (14-1)), (1 << (14-1))
+    .long (1 << (16-1)), (1 << (16-1)), (1 << (16-1)), (1 << (16-1))
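
The Q15 constants in dct_tab are the standard 4-point DCT factors, and the two round_tab rows are the round-to-nearest biases that pair with the shift-by-14 horizontal pass and the shift-by-16 vertical pass above. A small sketch showing where the numbers come from (values computed here, not quoted from the patch):

    #include <math.h>
    #include <stdio.h>

    int main(void) {
      /* Q15 DCT factors appearing in dct_tab above */
      printf("%.0f\n", round(cos(M_PI / 4) * 32768));  /* 23170 */
      printf("%.0f\n", round(cos(M_PI / 8) * 32768));  /* 30274 */
      printf("%.0f\n", round(sin(M_PI / 8) * 32768));  /* 12540 */
      /* round_tab entries: 1 << (shift - 1) for shifts 14 and 16 */
      printf("%d %d\n", 1 << 13, 1 << 15);             /* 8192 32768 */
      return 0;
    }
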
--- /dev/null
+++ b/vp9/encoder/ppc/rdopt_altivec.asm
@@ -1,0 +1,51 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    .globl vp8_block_error_ppc
+
+    .align 2
+;# r3 short *Coeff
+;# r4 short *dqcoeff
+vp8_block_error_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xf800
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1,-32(r1)          ;# create space on the stack
+
+    stw     r5, 12(r1)          ;# transfer dc to vector register
+
+    lvx     v0, 0, r3           ;# Coeff
+    lvx     v1, 0, r4           ;# dqcoeff
+
+    li      r10, 16
+
+    vspltisw v3, 0
+
+    vsubshs v0, v0, v1
+
+    vmsumshm v2, v0, v0, v3     ;# multiply differences
+
+    lvx     v0, r10, r3         ;# Coeff
+    lvx     v1, r10, r4         ;# dqcoeff
+
+    vsubshs v0, v0, v1
+
+    vmsumshm v1, v0, v0, v2     ;# multiply differences
+    vsumsws v1, v1, v3          ;# sum up
+
+    stvx    v1, 0, r1
+    lwz     r3, 12(r1)          ;# return value
+
+    addi    r1, r1, 32          ;# recover stack
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
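
The routine above sums squared differences between the original and dequantized coefficients over one 4x4 block (two 8-short vectors). A scalar sketch; like this loop, the vector code accumulates the products in 32 bits:

    /* Scalar sketch of the block error: SSE over 16 coefficients. */
    int block_error_ref(const short *coeff, const short *dqcoeff) {
      int i, err = 0;
      for (i = 0; i < 16; i++) {
        int d = coeff[i] - dqcoeff[i];
        err += d * d;
      }
      return err;
    }
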
--- /dev/null
+++ b/vp9/encoder/ppc/sad_altivec.asm
@@ -1,0 +1,277 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    .globl vp8_sad16x16_ppc
+    .globl vp8_sad16x8_ppc
+    .globl vp8_sad8x16_ppc
+    .globl vp8_sad8x8_ppc
+    .globl vp8_sad4x4_ppc
+
+.macro load_aligned_16 V R O
+    lvsl    v3,  0, \R          ;# permutate value for alignment
+
+    lvx     v1,  0, \R
+    lvx     v2, \O, \R
+
+    vperm   \V, v1, v2, v3
+.endm
+
+.macro prologue
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffc0
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1, -32(r1)         ;# create space on the stack
+
+    li      r10, 16             ;# load offset and loop counter
+
+    vspltisw v8, 0              ;# zero out total to start
+.endm
+
+.macro epilogue
+    addi    r1, r1, 32          ;# recover stack
+
+    mtspr   256, r11            ;# reset old VRSAVE
+.endm
+
+.macro SAD_16
+    ;# v6 = abs (v4 - v5)
+    vsububs v6, v4, v5
+    vsububs v7, v5, v4
+    vor     v6, v6, v7
+
+    ;# v8 += abs (v4 - v5)
+    vsum4ubs v8, v6, v8
+.endm
+
+.macro sad_16_loop loop_label
+    lvsl    v3,  0, r5          ;# only needs to be done once per block
+
+    ;# preload a line of data before getting into the loop
+    lvx     v4, 0, r3
+    lvx     v1,  0, r5
+    lvx     v2, r10, r5
+
+    add     r5, r5, r6
+    add     r3, r3, r4
+
+    vperm   v5, v1, v2, v3
+
+    .align 4
+\loop_label:
+    ;# compute difference on first row
+    vsububs v6, v4, v5
+    vsububs v7, v5, v4
+
+    ;# load up next set of data
+    lvx     v9, 0, r3
+    lvx     v1,  0, r5
+    lvx     v2, r10, r5
+
+    ;# perform abs() of difference
+    vor     v6, v6, v7
+    add     r3, r3, r4
+
+    ;# add to the running tally
+    vsum4ubs v8, v6, v8
+
+    ;# now onto the next line
+    vperm   v5, v1, v2, v3
+    add     r5, r5, r6
+    lvx     v4, 0, r3
+
+    ;# compute difference on second row
+    vsububs v6, v9, v5
+    lvx     v1,  0, r5
+    vsububs v7, v5, v9
+    lvx     v2, r10, r5
+    vor     v6, v6, v7
+    add     r3, r3, r4
+    vsum4ubs v8, v6, v8
+    vperm   v5, v1, v2, v3
+    add     r5, r5, r6
+
+    bdnz    \loop_label
+
+    vspltisw v7, 0
+
+    vsumsws v8, v8, v7
+
+    stvx    v8, 0, r1
+    lwz     r3, 12(r1)
+.endm
+
+.macro sad_8_loop loop_label
+    .align 4
+\loop_label:
+    ;# only one of the inputs should need to be aligned.
+    load_aligned_16 v4, r3, r10
+    load_aligned_16 v5, r5, r10
+
+    ;# move onto the next line
+    add     r3, r3, r4
+    add     r5, r5, r6
+
+    ;# only one of the inputs should need to be aligned.
+    load_aligned_16 v6, r3, r10
+    load_aligned_16 v7, r5, r10
+
+    ;# move onto the next line
+    add     r3, r3, r4
+    add     r5, r5, r6
+
+    vmrghb  v4, v4, v6
+    vmrghb  v5, v5, v7
+
+    SAD_16
+
+    bdnz    \loop_label
+
+    vspltisw v7, 0
+
+    vsumsws v8, v8, v7
+
+    stvx    v8, 0, r1
+    lwz     r3, 12(r1)
+.endm
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int  src_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int  ref_stride
+;#
+;# r3 return value
+vp8_sad16x16_ppc:
+
+    prologue
+
+    li      r9, 8
+    mtctr   r9
+
+    sad_16_loop sad16x16_loop
+
+    epilogue
+
+    blr
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int  src_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int  ref_stride
+;#
+;# r3 return value
+vp8_sad16x8_ppc:
+
+    prologue
+
+    li      r9, 4
+    mtctr   r9
+
+    sad_16_loop sad16x8_loop
+
+    epilogue
+
+    blr
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int  src_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int  ref_stride
+;#
+;# r3 return value
+vp8_sad8x16_ppc:
+
+    prologue
+
+    li      r9, 8
+    mtctr   r9
+
+    sad_8_loop sad8x16_loop
+
+    epilogue
+
+    blr
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int  src_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int  ref_stride
+;#
+;# r3 return value
+vp8_sad8x8_ppc:
+
+    prologue
+
+    li      r9, 4
+    mtctr   r9
+
+    sad_8_loop sad8x8_loop
+
+    epilogue
+
+    blr
+
+.macro transfer_4x4 I P
+    lwz     r0, 0(\I)
+    add     \I, \I, \P
+
+    lwz     r7, 0(\I)
+    add     \I, \I, \P
+
+    lwz     r8, 0(\I)
+    add     \I, \I, \P
+
+    lwz     r9, 0(\I)
+
+    stw     r0,  0(r1)
+    stw     r7,  4(r1)
+    stw     r8,  8(r1)
+    stw     r9, 12(r1)
+.endm
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int  src_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int  ref_stride
+;#
+;# r3 return value
+vp8_sad4x4_ppc:
+
+    prologue
+
+    transfer_4x4 r3, r4
+    lvx     v4, 0, r1
+
+    transfer_4x4 r5, r6
+    lvx     v5, 0, r1
+
+    vspltisw v8, 0              ;# zero out total to start
+
+    ;# v6 = abs (v4 - v5)
+    vsububs v6, v4, v5
+    vsububs v7, v5, v4
+    vor     v6, v6, v7
+
+    ;# v8 += abs (v4 - v5)
+    vsum4ubs v7, v6, v8
+    vsumsws v7, v7, v8
+
+    stvx    v7, 0, r1
+    lwz     r3, 12(r1)
+
+    epilogue
+
+    blr
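
All five SAD entry points above compute the same quantity at different block sizes; only the loop counts and the 8- versus 16-wide inner step differ. A scalar sketch, with the block dimensions passed explicitly for illustration:

    /* Scalar sketch of the SAD routines: sum of absolute differences. */
    unsigned int sad_ref(const unsigned char *src, int src_stride,
                         const unsigned char *ref, int ref_stride,
                         int w, int h) {
      unsigned int sad = 0;
      int r, c;
      for (r = 0; r < h; r++) {
        for (c = 0; c < w; c++) {
          int d = src[c] - ref[c];
          sad += (d < 0) ? -d : d;
        }
        src += src_stride;
        ref += ref_stride;
      }
      return sad;
    }
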
--- /dev/null
+++ b/vp9/encoder/ppc/variance_altivec.asm
@@ -1,0 +1,375 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    .globl vp8_get8x8var_ppc
+    .globl vp8_get16x16var_ppc
+    .globl vp8_mse16x16_ppc
+    .globl vp9_variance16x16_ppc
+    .globl vp9_variance16x8_ppc
+    .globl vp9_variance8x16_ppc
+    .globl vp9_variance8x8_ppc
+    .globl vp9_variance4x4_ppc
+
+.macro load_aligned_16 V R O
+    lvsl    v3,  0, \R          ;# permutate value for alignment
+
+    lvx     v1,  0, \R
+    lvx     v2, \O, \R
+
+    vperm   \V, v1, v2, v3
+.endm
+
+.macro prologue
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffc0
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1, -32(r1)         ;# create space on the stack
+
+    li      r10, 16             ;# load offset and loop counter
+
+    vspltisw v7, 0              ;# zero for merging
+    vspltisw v8, 0              ;# zero out total to start
+    vspltisw v9, 0              ;# zero out total for dif^2
+.endm
+
+.macro epilogue
+    addi    r1, r1, 32          ;# recover stack
+
+    mtspr   256, r11            ;# reset old VRSAVE
+.endm
+
+.macro compute_sum_sse
+    ;# Compute sum first.  Unpack so a signed subtract
+    ;#  can be used; only a halfword signed subtract
+    ;#  is available.  Do high, then low.
+    vmrghb  v2, v7, v4
+    vmrghb  v3, v7, v5
+    vsubshs v2, v2, v3
+    vsum4shs v8, v2, v8
+
+    vmrglb  v2, v7, v4
+    vmrglb  v3, v7, v5
+    vsubshs v2, v2, v3
+    vsum4shs v8, v2, v8
+
+    ;# Now compute sse.
+    vsububs v2, v4, v5
+    vsububs v3, v5, v4
+    vor     v2, v2, v3
+
+    vmsumubm v9, v2, v2, v9
+.endm
+
+.macro variance_16 DS loop_label store_sum
+\loop_label:
+    ;# only one of the inputs should need to be aligned.
+    load_aligned_16 v4, r3, r10
+    load_aligned_16 v5, r5, r10
+
+    ;# move onto the next line
+    add     r3, r3, r4
+    add     r5, r5, r6
+
+    compute_sum_sse
+
+    bdnz    \loop_label
+
+    vsumsws v8, v8, v7
+    vsumsws v9, v9, v7
+
+    stvx    v8, 0, r1
+    lwz     r3, 12(r1)
+
+    stvx    v9, 0, r1
+    lwz     r4, 12(r1)
+
+.if \store_sum
+    stw     r3, 0(r8)           ;# sum
+.endif
+    stw     r4, 0(r7)           ;# sse
+
+    mullw   r3, r3, r3          ;# sum*sum
+    srawi   r3, r3, \DS         ;# (sum*sum) >> DS
+    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
+.endm
+
+.macro variance_8 DS loop_label store_sum
+\loop_label:
+    ;# only one of the inputs should need to be aligned.
+    load_aligned_16 v4, r3, r10
+    load_aligned_16 v5, r5, r10
+
+    ;# move onto the next line
+    add     r3, r3, r4
+    add     r5, r5, r6
+
+    ;# only one of the inputs should need to be aligned.
+    load_aligned_16 v6, r3, r10
+    load_aligned_16 v0, r5, r10
+
+    ;# move onto the next line
+    add     r3, r3, r4
+    add     r5, r5, r6
+
+    vmrghb  v4, v4, v6
+    vmrghb  v5, v5, v0
+
+    compute_sum_sse
+
+    bdnz    \loop_label
+
+    vsumsws v8, v8, v7
+    vsumsws v9, v9, v7
+
+    stvx    v8, 0, r1
+    lwz     r3, 12(r1)
+
+    stvx    v9, 0, r1
+    lwz     r4, 12(r1)
+
+.if \store_sum
+    stw     r3, 0(r8)           ;# sum
+.endif
+    stw     r4, 0(r7)           ;# sse
+
+    mullw   r3, r3, r3          ;# sum*sum
+    srawi   r3, r3, \DS         ;# (sum*sum) >> DS
+    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
+.endm
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int  source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int  recon_stride
+;# r7 unsigned int *SSE
+;# r8 int *Sum
+;#
+;# r3 return value
+vp8_get8x8var_ppc:
+
+    prologue
+
+    li      r9, 4
+    mtctr   r9
+
+    variance_8 6, get8x8var_loop, 1
+
+    epilogue
+
+    blr
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int  source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int  recon_stride
+;# r7 unsigned int *SSE
+;# r8 int *Sum
+;#
+;# r3 return value
+vp8_get16x16var_ppc:
+
+    prologue
+
+    mtctr   r10
+
+    variance_16 8, get16x16var_loop, 1
+
+    epilogue
+
+    blr
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int  source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int  recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp8_mse16x16_ppc:
+    prologue
+
+    mtctr   r10
+
+mse16x16_loop:
+    ;# only one of the inputs should need to be aligned.
+    load_aligned_16 v4, r3, r10
+    load_aligned_16 v5, r5, r10
+
+    ;# move onto the next line
+    add     r3, r3, r4
+    add     r5, r5, r6
+
+    ;# Now compute sse.
+    vsububs v2, v4, v5
+    vsububs v3, v5, v4
+    vor     v2, v2, v3
+
+    vmsumubm v9, v2, v2, v9
+
+    bdnz    mse16x16_loop
+
+    vsumsws v9, v9, v7
+
+    stvx    v9, 0, r1
+    lwz     r3, 12(r1)
+
+    stw     r3, 0(r7)           ;# sse
+
+    epilogue
+
+    blr
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int  source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int  recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp9_variance16x16_ppc:
+
+    prologue
+
+    mtctr   r10
+
+    variance_16 8, variance16x16_loop, 0
+
+    epilogue
+
+    blr
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int  source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int  recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp9_variance16x8_ppc:
+
+    prologue
+
+    li      r9, 8
+    mtctr   r9
+
+    variance_16 7, variance16x8_loop, 0
+
+    epilogue
+
+    blr
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int  source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int  recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp9_variance8x16_ppc:
+
+    prologue
+
+    li      r9, 8
+    mtctr   r9
+
+    variance_8 7, variance8x16_loop, 0
+
+    epilogue
+
+    blr
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int  source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int  recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp9_variance8x8_ppc:
+
+    prologue
+
+    li      r9, 4
+    mtctr   r9
+
+    variance_8 6, variance8x8_loop, 0
+
+    epilogue
+
+    blr
+
+.macro transfer_4x4 I P
+    lwz     r0, 0(\I)
+    add     \I, \I, \P
+
+    lwz     r10,0(\I)
+    add     \I, \I, \P
+
+    lwz     r8, 0(\I)
+    add     \I, \I, \P
+
+    lwz     r9, 0(\I)
+
+    stw     r0,  0(r1)
+    stw     r10, 4(r1)
+    stw     r8,  8(r1)
+    stw     r9, 12(r1)
+.endm
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int  source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int  recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp9_variance4x4_ppc:
+
+    prologue
+
+    transfer_4x4 r3, r4
+    lvx     v4, 0, r1
+
+    transfer_4x4 r5, r6
+    lvx     v5, 0, r1
+
+    compute_sum_sse
+
+    vsumsws v8, v8, v7
+    vsumsws v9, v9, v7
+
+    stvx    v8, 0, r1
+    lwz     r3, 12(r1)
+
+    stvx    v9, 0, r1
+    lwz     r4, 12(r1)
+
+    stw     r4, 0(r7)           ;# sse
+
+    mullw   r3, r3, r3          ;# sum*sum
+    srawi   r3, r3, 4           ;# (sum*sum) >> 4
+    subf    r3, r3, r4          ;# sse - ((sum*sum) >> 4)
+
+    epilogue
+
+    blr
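
The DS parameter threaded through the macros above is log2 of the block's pixel count (4 for 4x4, 6 for 8x8, 7 for 8x16 and 16x8, 8 for 16x16), so each routine returns sse - sum*sum/N. A scalar sketch of the same arithmetic:

    #include <stdint.h>

    /* Scalar sketch of the variance_* macros: returns SSE - sum^2/N,
     * with N = 1 << ds pixels and ds = log2(w * h).  (The AltiVec code
     * does the sum*sum multiply in 32 bits via mullw.) */
    unsigned int variance_ref(const unsigned char *src, int src_stride,
                              const unsigned char *ref, int ref_stride,
                              int w, int h, int ds, unsigned int *sse) {
      int sum = 0, r, c;
      unsigned int s2 = 0;
      for (r = 0; r < h; r++) {
        for (c = 0; c < w; c++) {
          int d = src[c] - ref[c];
          sum += d;
          s2 += d * d;
        }
        src += src_stride;
        ref += ref_stride;
      }
      *sse = s2;
      return s2 - (unsigned int)(((int64_t)sum * sum) >> ds);
    }
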
--- /dev/null
+++ b/vp9/encoder/ppc/variance_subpixel_altivec.asm
@@ -1,0 +1,865 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    .globl vp9_sub_pixel_variance4x4_ppc
+    .globl vp9_sub_pixel_variance8x8_ppc
+    .globl vp9_sub_pixel_variance8x16_ppc
+    .globl vp9_sub_pixel_variance16x8_ppc
+    .globl vp9_sub_pixel_variance16x16_ppc
+
+.macro load_c V, LABEL, OFF, R0, R1
+    lis     \R0, \LABEL@ha
+    la      \R1, \LABEL@l(\R0)
+    lvx     \V, \OFF, \R1
+.endm
+
+.macro load_vfilter V0, V1
+    load_c \V0, vfilter_b, r6, r12, r10
+
+    addi    r6,  r6, 16
+    lvx     \V1, r6, r10
+.endm
+
+.macro HProlog jump_label
+    ;# load up horizontal filter
+    slwi.   r5, r5, 4           ;# index into horizontal filter array
+
+    ;# index to the next set of vectors in the row.
+    li      r10, 16
+
+    ;# downshift by 7 (divide by 128) at the end
+    vspltish v19, 7
+
+    ;# If there isn't any filtering to be done for the horizontal, then
+    ;#  just skip to the second pass.
+    beq     \jump_label
+
+    load_c v20, hfilter_b, r5, r12, r0
+
+    ;# setup constants
+    ;# v28 permutation value for output ordering
+    load_c v28, b_hperm_b, 0, r12, r0
+
+    ;# index to the next set of vectors in the row.
+    li      r12, 32
+
+    ;# rounding added in on the multiply
+    vspltisw v21, 8
+    vspltisw v18, 3
+    vslw    v18, v21, v18       ;# 0x00000040000000400000004000000040
+
+    slwi.   r6, r6, 5           ;# index into vertical filter array
+.endm
+
+;# Filters a horizontal line
+;# expects:
+;#  r3  src_ptr
+;#  r4  pitch
+;#  r10 16
+;#  r12 32
+;#  v17 perm input
+;#  v18 rounding
+;#  v19 shift
+;#  v20 filter taps
+;#  v21 tmp
+;#  v22 tmp
+;#  v23 tmp
+;#  v24 tmp
+;#  v25 tmp
+;#  v26 tmp
+;#  v27 tmp
+;#  v28 perm output
+;#
+
+.macro hfilter_8 V, hp, lp, increment_counter
+    lvsl    v17,  0, r3         ;# permutate value for alignment
+
+    ;# input to filter is 9 bytes wide, output is 8 bytes.
+    lvx     v21,   0, r3
+    lvx     v22, r10, r3
+
+.if \increment_counter
+    add     r3, r3, r4
+.endif
+    vperm   v21, v21, v22, v17
+
+    vperm   v24, v21, v21, \hp  ;# v24 = 0123 1234 2345 3456
+    vperm   v25, v21, v21, \lp  ;# v25 = 4567 5678 6789 789A
+
+    vmsummbm v24, v20, v24, v18
+    vmsummbm v25, v20, v25, v18
+
+    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
+
+    vsrh    v24, v24, v19       ;# divide v24 by 128
+
+    vpkuhus \V, v24, v24        ;# \V = scrambled 8-bit result
+.endm
+
+.macro vfilter_16 P0 P1
+    vmuleub v22, \P0, v20       ;# 64 + 4 positive taps
+    vadduhm v22, v18, v22
+    vmuloub v23, \P0, v20
+    vadduhm v23, v18, v23
+
+    vmuleub v24, \P1, v21
+    vadduhm v22, v22, v24       ;# Re = evens, saturation unnecessary
+    vmuloub v25, \P1, v21
+    vadduhm v23, v23, v25       ;# Ro = odds
+
+    vsrh    v22, v22, v19       ;# divide by 128
+    vsrh    v23, v23, v19       ;# v22 v23 = evens, odds
+    vmrghh  \P0, v22, v23       ;# 16-bit result in order
+    vmrglh  v23, v22, v23
+    vpkuhus \P0, \P0, v23       ;# P0 = 8-bit result
+.endm
+
+.macro compute_sum_sse src, ref, sum, sse, t1, t2, z0
+    ;# Compute sum first.  Unpack so a signed subtract
+    ;#  can be used; only a halfword signed subtract
+    ;#  is available.  Do high, then low.
+    vmrghb  \t1, \z0, \src
+    vmrghb  \t2, \z0, \ref
+    vsubshs \t1, \t1, \t2
+    vsum4shs \sum, \t1, \sum
+
+    vmrglb  \t1, \z0, \src
+    vmrglb  \t2, \z0, \ref
+    vsubshs \t1, \t1, \t2
+    vsum4shs \sum, \t1, \sum
+
+    ;# Now compute sse.
+    vsububs \t1, \src, \ref
+    vsububs \t2, \ref, \src
+    vor     \t1, \t1, \t2
+
+    vmsumubm \sse, \t1, \t1, \sse
+.endm
+
+.macro variance_final sum, sse, z0, DS
+    vsumsws \sum, \sum, \z0
+    vsumsws \sse, \sse, \z0
+
+    stvx    \sum, 0, r1
+    lwz     r3, 12(r1)
+
+    stvx    \sse, 0, r1
+    lwz     r4, 12(r1)
+
+    stw     r4, 0(r9)           ;# sse
+
+    mullw   r3, r3, r3          ;# sum*sum
+    srawi   r3, r3, \DS         ;# (sum*sum) >> DS
+    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
+.endm
+
+.macro compute_sum_sse_16 V, increment_counter
+    load_and_align_16  v16, r7, r8, \increment_counter
+    compute_sum_sse \V, v16, v18, v19, v20, v21, v23
+.endm
+
+.macro load_and_align_16 V, R, P, increment_counter
+    lvsl    v17,  0, \R         ;# permutate value for alignment
+
+    ;# loads a 16-byte row; the input can span two vectors
+    ;#  if not aligned correctly.
+    lvx     v21,   0, \R
+    lvx     v22, r10, \R
+
+.if \increment_counter
+    add     \R, \R, \P
+.endif
+
+    vperm   \V, v21, v22, v17
+.endm
+
+    .align 2
+;# r3 unsigned char  *src_ptr
+;# r4 int  src_pixels_per_line
+;# r5 int  xoffset
+;# r6 int  yoffset
+;# r7 unsigned char *dst_ptr
+;# r8 int dst_pixels_per_line
+;# r9 unsigned int *sse
+;#
+;# r3 return value
+vp9_sub_pixel_variance4x4_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xf830
+    ori     r12, r12, 0xfff8
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1,-32(r1)          ;# create space on the stack
+
+    HProlog second_pass_4x4_pre_copy_b
+
+    ;# Load up permutation constants
+    load_c v10, b_0123_b, 0, r12, r0
+    load_c v11, b_4567_b, 0, r12, r0
+
+    hfilter_8 v0, v10, v11, 1
+    hfilter_8 v1, v10, v11, 1
+    hfilter_8 v2, v10, v11, 1
+    hfilter_8 v3, v10, v11, 1
+
+    ;# Finished filtering main horizontal block.  If there is no
+    ;#  vertical filtering, jump to storing the data.  Otherwise
+    ;#  load up and filter the additional line that is needed
+    ;#  for the vertical filter.
+    beq     compute_sum_sse_4x4_b
+
+    hfilter_8 v4, v10, v11, 0
+
+    b   second_pass_4x4_b
+
+second_pass_4x4_pre_copy_b:
+    slwi    r6, r6, 5           ;# index into vertical filter array
+
+    load_and_align_16 v0, r3, r4, 1
+    load_and_align_16 v1, r3, r4, 1
+    load_and_align_16 v2, r3, r4, 1
+    load_and_align_16 v3, r3, r4, 1
+    load_and_align_16 v4, r3, r4, 0
+
+second_pass_4x4_b:
+    vspltish v20, 8
+    vspltish v18, 3
+    vslh    v18, v20, v18       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+    load_vfilter v20, v21
+
+    vfilter_16 v0,  v1
+    vfilter_16 v1,  v2
+    vfilter_16 v2,  v3
+    vfilter_16 v3,  v4
+
+compute_sum_sse_4x4_b:
+    vspltish v18, 0             ;# sum
+    vspltish v19, 0             ;# sse
+    vspltish v23, 0             ;# unpack
+    li      r10, 16
+
+    load_and_align_16 v4, r7, r8, 1
+    load_and_align_16 v5, r7, r8, 1
+    load_and_align_16 v6, r7, r8, 1
+    load_and_align_16 v7, r7, r8, 1
+
+    vmrghb  v0, v0, v1
+    vmrghb  v1, v2, v3
+
+    vmrghb  v2, v4, v5
+    vmrghb  v3, v6, v7
+
+    load_c v10, b_hilo_b, 0, r12, r0
+
+    vperm   v0, v0, v1, v10
+    vperm   v1, v2, v3, v10
+
+    compute_sum_sse v0, v1, v18, v19, v20, v21, v23
+
+    variance_final v18, v19, v23, 4
+
+    addi    r1, r1, 32          ;# recover stack
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+    .align 2
+;# r3 unsigned char  *src_ptr
+;# r4 int  src_pixels_per_line
+;# r5 int  xoffset
+;# r6 int  yoffset
+;# r7 unsigned char *dst_ptr
+;# r8 int dst_pixels_per_line
+;# r9 unsigned int *sse
+;#
+;# r3 return value
+vp9_sub_pixel_variance8x8_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xfff0
+    ori     r12, r12, 0xffff
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1,-32(r1)          ;# create space on the stack
+
+    HProlog second_pass_8x8_pre_copy_b
+
+    ;# Load up permutation constants
+    load_c v10, b_0123_b, 0, r12, r0
+    load_c v11, b_4567_b, 0, r12, r0
+
+    hfilter_8 v0, v10, v11, 1
+    hfilter_8 v1, v10, v11, 1
+    hfilter_8 v2, v10, v11, 1
+    hfilter_8 v3, v10, v11, 1
+    hfilter_8 v4, v10, v11, 1
+    hfilter_8 v5, v10, v11, 1
+    hfilter_8 v6, v10, v11, 1
+    hfilter_8 v7, v10, v11, 1
+
+    ;# Finished filtering main horizontal block.  If there is no
+    ;#  vertical filtering, jump to storing the data.  Otherwise
+    ;#  load up and filter the additional line that is needed
+    ;#  for the vertical filter.
+    beq     compute_sum_sse_8x8_b
+
+    hfilter_8 v8, v10, v11, 0
+
+    b   second_pass_8x8_b
+
+second_pass_8x8_pre_copy_b:
+    slwi.   r6, r6, 5           ;# index into vertical filter array
+
+    load_and_align_16 v0, r3, r4, 1
+    load_and_align_16 v1, r3, r4, 1
+    load_and_align_16 v2, r3, r4, 1
+    load_and_align_16 v3, r3, r4, 1
+    load_and_align_16 v4, r3, r4, 1
+    load_and_align_16 v5, r3, r4, 1
+    load_and_align_16 v6, r3, r4, 1
+    load_and_align_16 v7, r3, r4, 1
+    load_and_align_16 v8, r3, r4, 0
+
+    beq     compute_sum_sse_8x8_b
+
+second_pass_8x8_b:
+    vspltish v20, 8
+    vspltish v18, 3
+    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+    load_vfilter v20, v21
+
+    vfilter_16 v0, v1
+    vfilter_16 v1, v2
+    vfilter_16 v2, v3
+    vfilter_16 v3, v4
+    vfilter_16 v4, v5
+    vfilter_16 v5, v6
+    vfilter_16 v6, v7
+    vfilter_16 v7, v8
+
+compute_sum_sse_8x8_b:
+    vspltish v18, 0             ;# sum
+    vspltish v19, 0             ;# sse
+    vspltish v23, 0             ;# unpack
+    li      r10, 16
+
+    vmrghb  v0, v0, v1
+    vmrghb  v1, v2, v3
+    vmrghb  v2, v4, v5
+    vmrghb  v3, v6, v7
+
+    load_and_align_16 v4,  r7, r8, 1
+    load_and_align_16 v5,  r7, r8, 1
+    load_and_align_16 v6,  r7, r8, 1
+    load_and_align_16 v7,  r7, r8, 1
+    load_and_align_16 v8,  r7, r8, 1
+    load_and_align_16 v9,  r7, r8, 1
+    load_and_align_16 v10, r7, r8, 1
+    load_and_align_16 v11, r7, r8, 0
+
+    vmrghb  v4, v4,  v5
+    vmrghb  v5, v6,  v7
+    vmrghb  v6, v8,  v9
+    vmrghb  v7, v10, v11
+
+    compute_sum_sse v0, v4, v18, v19, v20, v21, v23
+    compute_sum_sse v1, v5, v18, v19, v20, v21, v23
+    compute_sum_sse v2, v6, v18, v19, v20, v21, v23
+    compute_sum_sse v3, v7, v18, v19, v20, v21, v23
+
+    variance_final v18, v19, v23, 6
+
+    addi    r1, r1, 32          ;# recover stack
+    mtspr   256, r11            ;# reset old VRSAVE
+    blr
+
+    .align 2
+;# r3 unsigned char  *src_ptr
+;# r4 int  src_pixels_per_line
+;# r5 int  xoffset
+;# r6 int  yoffset
+;# r7 unsigned char *dst_ptr
+;# r8 int dst_pixels_per_line
+;# r9 unsigned int *sse
+;#
+;# r3 return value
+vp9_sub_pixel_variance8x16_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffff
+    ori     r12, r12, 0xfffc
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1,-32(r1)          ;# create space on the stack
+
+    HProlog second_pass_8x16_pre_copy_b
+
+    ;# Load up permutation constants
+    load_c v29, b_0123_b, 0, r12, r0
+    load_c v30, b_4567_b, 0, r12, r0
+
+    hfilter_8 v0,  v29, v30, 1
+    hfilter_8 v1,  v29, v30, 1
+    hfilter_8 v2,  v29, v30, 1
+    hfilter_8 v3,  v29, v30, 1
+    hfilter_8 v4,  v29, v30, 1
+    hfilter_8 v5,  v29, v30, 1
+    hfilter_8 v6,  v29, v30, 1
+    hfilter_8 v7,  v29, v30, 1
+    hfilter_8 v8,  v29, v30, 1
+    hfilter_8 v9,  v29, v30, 1
+    hfilter_8 v10, v29, v30, 1
+    hfilter_8 v11, v29, v30, 1
+    hfilter_8 v12, v29, v30, 1
+    hfilter_8 v13, v29, v30, 1
+    hfilter_8 v14, v29, v30, 1
+    hfilter_8 v15, v29, v30, 1
+
+    ;# Finished filtering main horizontal block.  If there is no
+    ;#  vertical filtering, jump to storing the data.  Otherwise
+    ;#  load up and filter the additional line that is needed
+    ;#  for the vertical filter.
+    beq     compute_sum_sse_8x16_b
+
+    hfilter_8 v16, v29, v30, 0
+
+    b   second_pass_8x16_b
+
+second_pass_8x16_pre_copy_b:
+    slwi.   r6, r6, 5           ;# index into vertical filter array
+
+    load_and_align_16 v0,  r3, r4, 1
+    load_and_align_16 v1,  r3, r4, 1
+    load_and_align_16 v2,  r3, r4, 1
+    load_and_align_16 v3,  r3, r4, 1
+    load_and_align_16 v4,  r3, r4, 1
+    load_and_align_16 v5,  r3, r4, 1
+    load_and_align_16 v6,  r3, r4, 1
+    load_and_align_16 v7,  r3, r4, 1
+    load_and_align_16 v8,  r3, r4, 1
+    load_and_align_16 v9,  r3, r4, 1
+    load_and_align_16 v10, r3, r4, 1
+    load_and_align_16 v11, r3, r4, 1
+    load_and_align_16 v12, r3, r4, 1
+    load_and_align_16 v13, r3, r4, 1
+    load_and_align_16 v14, r3, r4, 1
+    load_and_align_16 v15, r3, r4, 1
+    load_and_align_16 v16, r3, r4, 0
+
+    beq     compute_sum_sse_8x16_b
+
+second_pass_8x16_b:
+    vspltish v20, 8
+    vspltish v18, 3
+    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+    load_vfilter v20, v21
+
+    vfilter_16 v0,  v1
+    vfilter_16 v1,  v2
+    vfilter_16 v2,  v3
+    vfilter_16 v3,  v4
+    vfilter_16 v4,  v5
+    vfilter_16 v5,  v6
+    vfilter_16 v6,  v7
+    vfilter_16 v7,  v8
+    vfilter_16 v8,  v9
+    vfilter_16 v9,  v10
+    vfilter_16 v10, v11
+    vfilter_16 v11, v12
+    vfilter_16 v12, v13
+    vfilter_16 v13, v14
+    vfilter_16 v14, v15
+    vfilter_16 v15, v16
+
+compute_sum_sse_8x16_b:
+    vspltish v18, 0             ;# sum
+    vspltish v19, 0             ;# sse
+    vspltish v23, 0             ;# unpack
+    li      r10, 16
+
+    vmrghb  v0, v0,  v1
+    vmrghb  v1, v2,  v3
+    vmrghb  v2, v4,  v5
+    vmrghb  v3, v6,  v7
+    vmrghb  v4, v8,  v9
+    vmrghb  v5, v10, v11
+    vmrghb  v6, v12, v13
+    vmrghb  v7, v14, v15
+
+    load_and_align_16 v8,  r7, r8, 1
+    load_and_align_16 v9,  r7, r8, 1
+    load_and_align_16 v10, r7, r8, 1
+    load_and_align_16 v11, r7, r8, 1
+    load_and_align_16 v12, r7, r8, 1
+    load_and_align_16 v13, r7, r8, 1
+    load_and_align_16 v14, r7, r8, 1
+    load_and_align_16 v15, r7, r8, 1
+
+    vmrghb  v8,  v8,  v9
+    vmrghb  v9,  v10, v11
+    vmrghb  v10, v12, v13
+    vmrghb  v11, v14, v15
+
+    compute_sum_sse v0, v8,  v18, v19, v20, v21, v23
+    compute_sum_sse v1, v9,  v18, v19, v20, v21, v23
+    compute_sum_sse v2, v10, v18, v19, v20, v21, v23
+    compute_sum_sse v3, v11, v18, v19, v20, v21, v23
+
+    load_and_align_16 v8,  r7, r8, 1
+    load_and_align_16 v9,  r7, r8, 1
+    load_and_align_16 v10, r7, r8, 1
+    load_and_align_16 v11, r7, r8, 1
+    load_and_align_16 v12, r7, r8, 1
+    load_and_align_16 v13, r7, r8, 1
+    load_and_align_16 v14, r7, r8, 1
+    load_and_align_16 v15, r7, r8, 0
+
+    vmrghb  v8,  v8,  v9
+    vmrghb  v9,  v10, v11
+    vmrghb  v10, v12, v13
+    vmrghb  v11, v14, v15
+
+    compute_sum_sse v4, v8,  v18, v19, v20, v21, v23
+    compute_sum_sse v5, v9,  v18, v19, v20, v21, v23
+    compute_sum_sse v6, v10, v18, v19, v20, v21, v23
+    compute_sum_sse v7, v11, v18, v19, v20, v21, v23
+
+    variance_final v18, v19, v23, 7
+
+    addi    r1, r1, 32          ;# recover stack
+    mtspr   256, r11            ;# reset old VRSAVE
+    blr
+
+;# Filters a horizontal line
+;# expects:
+;#  r3  src_ptr
+;#  r4  pitch
+;#  r10 16
+;#  r12 32
+;#  v17 perm input
+;#  v18 rounding
+;#  v19 shift
+;#  v20 filter taps
+;#  v21 tmp
+;#  v22 tmp
+;#  v23 tmp
+;#  v24 tmp
+;#  v25 tmp
+;#  v26 tmp
+;#  v27 tmp
+;#  v28 perm output
+;#
+.macro hfilter_16 V, increment_counter
+
+    lvsl    v17,  0, r3         ;# permutate value for alignment
+
+    ;# input to filter is 21 bytes wide, output is 16 bytes.
+    ;#  input can span three vectors if not aligned correctly.
+    lvx     v21,   0, r3
+    lvx     v22, r10, r3
+    lvx     v23, r12, r3
+
+.if \increment_counter
+    add     r3, r3, r4
+.endif
+    vperm   v21, v21, v22, v17
+    vperm   v22, v22, v23, v17  ;# v21 v22 = 21 input pixels left-justified
+
+    ;# set 0
+    vmsummbm v24, v20, v21, v18 ;# taps times elements
+
+    ;# set 1
+    vsldoi  v23, v21, v22, 1
+    vmsummbm v25, v20, v23, v18
+
+    ;# set 2
+    vsldoi  v23, v21, v22, 2
+    vmsummbm v26, v20, v23, v18
+
+    ;# set 3
+    vsldoi  v23, v21, v22, 3
+    vmsummbm v27, v20, v23, v18
+
+    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
+    vpkswus v25, v26, v27       ;# v25 = 2 6 A E 3 7 B F
+
+    vsrh    v24, v24, v19       ;# divide v24, v25 by 128
+    vsrh    v25, v25, v19
+
+    vpkuhus \V, v24, v25        ;# \V = scrambled 8-bit result
+    vperm   \V, \V, v0, v28     ;# \V = correctly-ordered result
+.endm
+
+    .align 2
+;# r3 unsigned char  *src_ptr
+;# r4 int  src_pixels_per_line
+;# r5 int  xoffset
+;# r6 int  yoffset
+;# r7 unsigned char *dst_ptr
+;# r8 int dst_pixels_per_line
+;# r9 unsigned int *sse
+;#
+;# r3 return value
+vp9_sub_pixel_variance16x8_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffff
+    ori     r12, r12, 0xfff8
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1, -32(r1)         ;# create space on the stack
+
+    HProlog second_pass_16x8_pre_copy_b
+
+    hfilter_16 v0, 1
+    hfilter_16 v1, 1
+    hfilter_16 v2, 1
+    hfilter_16 v3, 1
+    hfilter_16 v4, 1
+    hfilter_16 v5, 1
+    hfilter_16 v6, 1
+    hfilter_16 v7, 1
+
+    ;# Finished filtering main horizontal block.  If there is no
+    ;#  vertical filtering, jump to storing the data.  Otherwise
+    ;#  load up and filter the additional line that is needed
+    ;#  for the vertical filter.
+    beq     compute_sum_sse_16x8_b
+
+    hfilter_16 v8, 0
+
+    b   second_pass_16x8_b
+
+second_pass_16x8_pre_copy_b:
+    slwi.   r6, r6, 5           ;# index into vertical filter array
+
+    load_and_align_16  v0,  r3, r4, 1
+    load_and_align_16  v1,  r3, r4, 1
+    load_and_align_16  v2,  r3, r4, 1
+    load_and_align_16  v3,  r3, r4, 1
+    load_and_align_16  v4,  r3, r4, 1
+    load_and_align_16  v5,  r3, r4, 1
+    load_and_align_16  v6,  r3, r4, 1
+    load_and_align_16  v7,  r3, r4, 1
+    load_and_align_16  v8,  r3, r4, 1
+
+    beq     compute_sum_sse_16x8_b
+
+second_pass_16x8_b:
+    vspltish v20, 8
+    vspltish v18, 3
+    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+    load_vfilter v20, v21
+
+    vfilter_16 v0,  v1
+    vfilter_16 v1,  v2
+    vfilter_16 v2,  v3
+    vfilter_16 v3,  v4
+    vfilter_16 v4,  v5
+    vfilter_16 v5,  v6
+    vfilter_16 v6,  v7
+    vfilter_16 v7,  v8
+
+compute_sum_sse_16x8_b:
+    vspltish v18, 0             ;# sum
+    vspltish v19, 0             ;# sse
+    vspltish v23, 0             ;# unpack
+    li      r10, 16
+
+    compute_sum_sse_16 v0, 1
+    compute_sum_sse_16 v1, 1
+    compute_sum_sse_16 v2, 1
+    compute_sum_sse_16 v3, 1
+    compute_sum_sse_16 v4, 1
+    compute_sum_sse_16 v5, 1
+    compute_sum_sse_16 v6, 1
+    compute_sum_sse_16 v7, 0
+
+    variance_final v18, v19, v23, 7
+
+    addi    r1, r1, 32          ;# recover stack
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+    .align 2
+;# r3 unsigned char  *src_ptr
+;# r4 int  src_pixels_per_line
+;# r5 int  xoffset
+;# r6 int  yoffset
+;# r7 unsigned char *dst_ptr
+;# r8 int dst_pixels_per_line
+;# r9 unsigned int *sse
+;#
+;# r3 return value
+vp9_sub_pixel_variance16x16_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffff
+    ori     r12, r12, 0xfff8
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1, -32(r1)         ;# create space on the stack
+
+    HProlog second_pass_16x16_pre_copy_b
+
+    hfilter_16 v0,  1
+    hfilter_16 v1,  1
+    hfilter_16 v2,  1
+    hfilter_16 v3,  1
+    hfilter_16 v4,  1
+    hfilter_16 v5,  1
+    hfilter_16 v6,  1
+    hfilter_16 v7,  1
+    hfilter_16 v8,  1
+    hfilter_16 v9,  1
+    hfilter_16 v10, 1
+    hfilter_16 v11, 1
+    hfilter_16 v12, 1
+    hfilter_16 v13, 1
+    hfilter_16 v14, 1
+    hfilter_16 v15, 1
+
+    ;# Finished filtering main horizontal block.  If there is no
+    ;#  vertical filtering, jump to storing the data.  Otherwise
+    ;#  load up and filter the additional line that is needed
+    ;#  for the vertical filter.
+    beq     compute_sum_sse_16x16_b
+
+    hfilter_16 v16, 0
+
+    b   second_pass_16x16_b
+
+second_pass_16x16_pre_copy_b:
+    slwi.   r6, r6, 5           ;# index into vertical filter array
+
+    load_and_align_16  v0,  r3, r4, 1
+    load_and_align_16  v1,  r3, r4, 1
+    load_and_align_16  v2,  r3, r4, 1
+    load_and_align_16  v3,  r3, r4, 1
+    load_and_align_16  v4,  r3, r4, 1
+    load_and_align_16  v5,  r3, r4, 1
+    load_and_align_16  v6,  r3, r4, 1
+    load_and_align_16  v7,  r3, r4, 1
+    load_and_align_16  v8,  r3, r4, 1
+    load_and_align_16  v9,  r3, r4, 1
+    load_and_align_16  v10, r3, r4, 1
+    load_and_align_16  v11, r3, r4, 1
+    load_and_align_16  v12, r3, r4, 1
+    load_and_align_16  v13, r3, r4, 1
+    load_and_align_16  v14, r3, r4, 1
+    load_and_align_16  v15, r3, r4, 1
+    load_and_align_16  v16, r3, r4, 0
+
+    beq     compute_sum_sse_16x16_b
+
+second_pass_16x16_b:
+    vspltish v20, 8
+    vspltish v18, 3
+    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+    load_vfilter v20, v21
+
+    vfilter_16 v0,  v1
+    vfilter_16 v1,  v2
+    vfilter_16 v2,  v3
+    vfilter_16 v3,  v4
+    vfilter_16 v4,  v5
+    vfilter_16 v5,  v6
+    vfilter_16 v6,  v7
+    vfilter_16 v7,  v8
+    vfilter_16 v8,  v9
+    vfilter_16 v9,  v10
+    vfilter_16 v10, v11
+    vfilter_16 v11, v12
+    vfilter_16 v12, v13
+    vfilter_16 v13, v14
+    vfilter_16 v14, v15
+    vfilter_16 v15, v16
+
+compute_sum_sse_16x16_b:
+    vspltish v18, 0             ;# sum
+    vspltish v19, 0             ;# sse
+    vspltish v23, 0             ;# unpack
+    li      r10, 16
+
+    compute_sum_sse_16 v0,  1
+    compute_sum_sse_16 v1,  1
+    compute_sum_sse_16 v2,  1
+    compute_sum_sse_16 v3,  1
+    compute_sum_sse_16 v4,  1
+    compute_sum_sse_16 v5,  1
+    compute_sum_sse_16 v6,  1
+    compute_sum_sse_16 v7,  1
+    compute_sum_sse_16 v8,  1
+    compute_sum_sse_16 v9,  1
+    compute_sum_sse_16 v10, 1
+    compute_sum_sse_16 v11, 1
+    compute_sum_sse_16 v12, 1
+    compute_sum_sse_16 v13, 1
+    compute_sum_sse_16 v14, 1
+    compute_sum_sse_16 v15, 0
+
+    variance_final v18, v19, v23, 8
+
+    addi    r1, r1, 32          ;# recover stack
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+    .data
+
+    .align 4
+hfilter_b:
+    .byte   128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0
+    .byte   112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0
+    .byte    96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0
+    .byte    80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0
+    .byte    64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0
+    .byte    48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0
+    .byte    32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0
+    .byte    16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0
+
+    .align 4
+vfilter_b:
+    .byte   128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
+    .byte     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
+    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
+    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
+    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
+    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
+    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
+    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
+    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
+    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
+    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
+    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
+
+    .align 4
+b_hperm_b:
+    .byte     0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
+
+    .align 4
+b_0123_b:
+    .byte     0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
+
+    .align 4
+b_4567_b:
+    .byte     4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
+
+b_hilo_b:
+    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
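
The tables above drive two-tap bilinear filtering: a 1/8-pel offset k in 0..7 selects taps {128 - 16k, 16k}, which sum to 128; the 0x40 rounding constant built in the code and the shift by 7 then divide by that sum. A one-pixel sketch of the same step (the helper name is illustrative):

    /* Sketch of the bilinear step encoded by hfilter_b/vfilter_b:
     * offset k selects taps {128 - 16k, 16k}; +64 rounds, >> 7
     * divides by the tap sum of 128. */
    unsigned char bilinear_ref(unsigned char a, unsigned char b, int k) {
      return (unsigned char)((a * (128 - 16 * k) + b * 16 * k + 64) >> 7);
    }
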
--- /dev/null
+++ b/vp9/encoder/psnr.c
@@ -1,0 +1,30 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_scale/yv12config.h"
+#include "math.h"
+#include "vp9/common/systemdependent.h" /* for vp9_clear_system_state() */
+
+#define MAX_PSNR 100
+
+double vp9_mse2psnr(double Samples, double Peak, double Mse) {
+  double psnr;
+
+  if (Mse > 0.0)
+    psnr = 10.0 * log10(Peak * Peak * Samples / Mse);
+  else
+    psnr = MAX_PSNR;      // limit to prevent division by zero
+
+  if (psnr > MAX_PSNR)
+    psnr = MAX_PSNR;
+
+  return psnr;
+}
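
Despite its name, the Mse argument here is a summed squared error; the Samples factor inside the log turns it into a true mean. A usage sketch (values computed for illustration, not taken from the patch):

    /* An 8-bit 16x16 block (256 samples) with total squared error 2560: */
    double psnr = vp9_mse2psnr(256.0, 255.0, 2560.0);
    /* = 10 * log10(255 * 255 * 256 / 2560) ~= 38.13 dB */
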
--- /dev/null
+++ b/vp9/encoder/psnr.h
@@ -1,0 +1,17 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_PSNR_H
+#define __INC_PSNR_H
+
+extern double vp9_mse2psnr(double Samples, double Peak, double Mse);
+
+#endif
--- /dev/null
+++ b/vp9/encoder/quantize.c
@@ -1,0 +1,716 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include "vpx_mem/vpx_mem.h"
+
+#include "onyx_int.h"
+#include "quantize.h"
+#include "vp9/common/quant_common.h"
+
+#include "vp9/common/seg_common.h"
+
+#ifdef ENC_DEBUG
+extern int enc_debug;
+#endif
+
+void vp9_ht_quantize_b_4x4(BLOCK *b, BLOCKD *d, TX_TYPE tx_type) {
+  int i, rc, eob;
+  int zbin;
+  int x, y, z, sz;
+  short *zbin_boost_ptr  = b->zrun_zbin_boost;
+  short *coeff_ptr       = b->coeff;
+  short *zbin_ptr        = b->zbin;
+  short *round_ptr       = b->round;
+  short *quant_ptr       = b->quant;
+  unsigned char *quant_shift_ptr = b->quant_shift;
+  short *qcoeff_ptr      = d->qcoeff;
+  short *dqcoeff_ptr     = d->dqcoeff;
+  short *dequant_ptr     = d->dequant;
+  short zbin_oq_value    = b->zbin_extra;
+
+  const int *pt_scan;
+
+  switch (tx_type) {
+    case ADST_DCT:
+      pt_scan = vp9_row_scan;
+      break;
+
+    case DCT_ADST:
+      pt_scan = vp9_col_scan;
+      break;
+
+    default:
+      pt_scan = vp9_default_zig_zag1d;
+      break;
+  }
+
+  vpx_memset(qcoeff_ptr, 0, 32);
+  vpx_memset(dqcoeff_ptr, 0, 32);
+
+  eob = -1;
+
+  for (i = 0; i < b->eob_max_offset; i++) {
+    rc   = pt_scan[i];
+    z    = coeff_ptr[rc];
+
+    zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;
+    zbin_boost_ptr++;
+
+    sz = (z >> 31);                                 // sign of z
+    x  = (z ^ sz) - sz;                             // x = abs(z)
+
+    if (x >= zbin) {
+      x += round_ptr[rc];
+      y  = (((x * quant_ptr[rc]) >> 16) + x)
+           >> quant_shift_ptr[rc];                // quantize (x)
+      x  = (y ^ sz) - sz;                         // get the sign back
+      qcoeff_ptr[rc]  = x;                        // write to destination
+      dqcoeff_ptr[rc] = x * dequant_ptr[rc];      // dequantized value
+
+      if (y) {
+        eob = i;                                // last nonzero coeffs
+        zbin_boost_ptr = b->zrun_zbin_boost;    // reset zero runlength
+      }
+    }
+  }
+
+  d->eob = eob + 1;
+}
+
+void vp9_regular_quantize_b_4x4(BLOCK *b, BLOCKD *d) {
+  int i, rc, eob;
+  int zbin;
+  int x, y, z, sz;
+  short *zbin_boost_ptr  = b->zrun_zbin_boost;
+  short *coeff_ptr       = b->coeff;
+  short *zbin_ptr        = b->zbin;
+  short *round_ptr       = b->round;
+  short *quant_ptr       = b->quant;
+  unsigned char *quant_shift_ptr = b->quant_shift;
+  short *qcoeff_ptr      = d->qcoeff;
+  short *dqcoeff_ptr     = d->dqcoeff;
+  short *dequant_ptr     = d->dequant;
+  short zbin_oq_value    = b->zbin_extra;
+
+  vpx_memset(qcoeff_ptr, 0, 32);
+  vpx_memset(dqcoeff_ptr, 0, 32);
+
+  eob = -1;
+
+  for (i = 0; i < b->eob_max_offset; i++) {
+    rc   = vp9_default_zig_zag1d[i];
+    z    = coeff_ptr[rc];
+
+    zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;
+    zbin_boost_ptr++;
+
+    sz = (z >> 31);                                 // sign of z
+    x  = (z ^ sz) - sz;                             // x = abs(z)
+
+    if (x >= zbin) {
+      x += round_ptr[rc];
+
+      y  = (((x * quant_ptr[rc]) >> 16) + x)
+           >> quant_shift_ptr[rc];                // quantize (x)
+      x  = (y ^ sz) - sz;                         // get the sign back
+      qcoeff_ptr[rc]  = x;                        // write to destination
+      dqcoeff_ptr[rc] = x * dequant_ptr[rc];      // dequantized value
+
+      if (y) {
+        eob = i;                                // last nonzero coeffs
+        zbin_boost_ptr = b->zrun_zbin_boost;    // reset zero runlength
+      }
+    }
+  }
+
+  d->eob = eob + 1;
+}
+
+void vp9_quantize_mby_4x4_c(MACROBLOCK *x) {
+  int i;
+  int has_2nd_order = x->e_mbd.mode_info_context->mbmi.mode != SPLITMV;
+
+  for (i = 0; i < 16; i++)
+    x->quantize_b_4x4(&x->block[i], &x->e_mbd.block[i]);
+
+  if (has_2nd_order)
+    x->quantize_b_4x4(&x->block[24], &x->e_mbd.block[24]);
+}
+
+void vp9_quantize_mbuv_4x4_c(MACROBLOCK *x) {
+  int i;
+
+  for (i = 16; i < 24; i++)
+    x->quantize_b_4x4(&x->block[i], &x->e_mbd.block[i]);
+}
+
+void vp9_quantize_mb_4x4_c(MACROBLOCK *x) {
+  vp9_quantize_mby_4x4_c(x);
+  vp9_quantize_mbuv_4x4_c(x);
+}
+
+void vp9_regular_quantize_b_2x2(BLOCK *b, BLOCKD *d) {
+  int i, rc, eob;
+  int zbin;
+  int x, y, z, sz;
+  short *zbin_boost_ptr = b->zrun_zbin_boost;
+  int zbin_zrun_index = 0;
+  short *coeff_ptr  = b->coeff;
+  short *zbin_ptr   = b->zbin;
+  short *round_ptr  = b->round;
+  short *quant_ptr  = b->quant;
+  unsigned char *quant_shift_ptr = b->quant_shift;
+  short *qcoeff_ptr = d->qcoeff;
+  short *dqcoeff_ptr = d->dqcoeff;
+  short *dequant_ptr = d->dequant;
+  short zbin_oq_value = b->zbin_extra;
+  // double q2nd = 4;
+  vpx_memset(qcoeff_ptr, 0, 32);
+  vpx_memset(dqcoeff_ptr, 0, 32);
+
+  eob = -1;
+
+  for (i = 0; i < b->eob_max_offset_8x8; i++) {
+    rc   = vp9_default_zig_zag1d[i];
+    z    = coeff_ptr[rc];
+
+    zbin_boost_ptr = &b->zrun_zbin_boost[zbin_zrun_index];
+    zbin_zrun_index += 4;
+    zbin = (zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value);
+
+    sz = (z >> 31);                               // sign of z
+    x  = (z ^ sz) - sz;                           // x = abs(z)
+
+    if (x >= zbin) {
+      x += (round_ptr[rc]);
+      y  = ((int)((int)(x * quant_ptr[rc]) >> 16) + x)
+           >> quant_shift_ptr[rc];                // quantize (x)
+      x  = (y ^ sz) - sz;                         // get the sign back
+      qcoeff_ptr[rc]  = x;                        // write to destination
+      dqcoeff_ptr[rc] = x * dequant_ptr[rc];      // dequantized value
+
+      if (y) {
+        eob = i;                                  // last nonzero coeffs
+        zbin_zrun_index = 0;
+      }
+    }
+  }
+
+  d->eob = eob + 1;
+}
+
+void vp9_regular_quantize_b_8x8(BLOCK *b, BLOCKD *d) {
+  int i, rc, eob;
+  int zbin;
+  int x, y, z, sz;
+  short *zbin_boost_ptr = b->zrun_zbin_boost_8x8;
+  short *coeff_ptr  = b->coeff;
+  short *zbin_ptr   = b->zbin_8x8;
+  short *round_ptr  = b->round;
+  short *quant_ptr  = b->quant;
+  unsigned char *quant_shift_ptr = b->quant_shift;
+  short *qcoeff_ptr = d->qcoeff;
+  short *dqcoeff_ptr = d->dqcoeff;
+  short *dequant_ptr = d->dequant;
+  short zbin_oq_value = b->zbin_extra;
+
+  vpx_memset(qcoeff_ptr, 0, 64 * sizeof(short));
+  vpx_memset(dqcoeff_ptr, 0, 64 * sizeof(short));
+
+  eob = -1;
+
+  for (i = 0; i < b->eob_max_offset_8x8; i++) {
+    rc   = vp9_default_zig_zag1d_8x8[i];
+    z    = coeff_ptr[rc];
+
+    zbin = (zbin_ptr[rc != 0] + *zbin_boost_ptr + zbin_oq_value);
+    zbin_boost_ptr++;
+
+    sz = (z >> 31);                               // sign of z
+    x  = (z ^ sz) - sz;                           // x = abs(z)
+
+    if (x >= zbin) {
+      x += (round_ptr[rc != 0]);
+      y  = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x))
+           >> quant_shift_ptr[rc != 0];            // quantize (x)
+      x  = (y ^ sz) - sz;                         // get the sign back
+      qcoeff_ptr[rc]  = x;                        // write to destination
+      dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0]; // dequantized value
+
+      if (y) {
+        eob = i;                                  // last nonzero coeffs
+        zbin_boost_ptr = b->zrun_zbin_boost_8x8;
+      }
+    }
+  }
+
+  d->eob = eob + 1;
+}
+
+void vp9_quantize_mby_8x8(MACROBLOCK *x) {
+  int i;
+  int has_2nd_order = x->e_mbd.mode_info_context->mbmi.mode != SPLITMV;
+
+  for (i = 0; i < 16; i++) {
+    x->e_mbd.block[i].eob = 0;
+  }
+  x->e_mbd.block[24].eob = 0;
+  for (i = 0; i < 16; i += 4)
+    x->quantize_b_8x8(&x->block[i], &x->e_mbd.block[i]);
+
+  if (has_2nd_order)
+    x->quantize_b_2x2(&x->block[24], &x->e_mbd.block[24]);
+}
+
+void vp9_quantize_mbuv_8x8(MACROBLOCK *x) {
+  int i;
+
+  for (i = 16; i < 24; i++)
+    x->e_mbd.block[i].eob = 0;
+  for (i = 16; i < 24; i += 4)
+    x->quantize_b_8x8(&x->block[i], &x->e_mbd.block[i]);
+}
+
+void vp9_quantize_mb_8x8(MACROBLOCK *x) {
+  vp9_quantize_mby_8x8(x);
+  vp9_quantize_mbuv_8x8(x);
+}
+
+void vp9_quantize_mby_16x16(MACROBLOCK *x) {
+  int i;
+
+  for (i = 0; i < 16; i++)
+    x->e_mbd.block[i].eob = 0;
+  x->e_mbd.block[24].eob = 0;
+  x->quantize_b_16x16(&x->block[0], &x->e_mbd.block[0]);
+}
+
+void vp9_quantize_mb_16x16(MACROBLOCK *x) {
+  vp9_quantize_mby_16x16(x);
+  vp9_quantize_mbuv_8x8(x);
+}
+
+void vp9_regular_quantize_b_16x16(BLOCK *b, BLOCKD *d) {
+  int i, rc, eob;
+  int zbin;
+  int x, y, z, sz;
+  short *zbin_boost_ptr = b->zrun_zbin_boost_16x16;
+  short *coeff_ptr  = b->coeff;
+  short *zbin_ptr   = b->zbin_16x16;
+  short *round_ptr  = b->round;
+  short *quant_ptr  = b->quant;
+  unsigned char *quant_shift_ptr = b->quant_shift;
+  short *qcoeff_ptr = d->qcoeff;
+  short *dqcoeff_ptr = d->dqcoeff;
+  short *dequant_ptr = d->dequant;
+  short zbin_oq_value = b->zbin_extra;
+
+  vpx_memset(qcoeff_ptr, 0, 256 * sizeof(short));
+  vpx_memset(dqcoeff_ptr, 0, 256 * sizeof(short));
+
+  eob = -1;
+  for (i = 0; i < b->eob_max_offset_16x16; i++) {
+    rc   = vp9_default_zig_zag1d_16x16[i];
+    z    = coeff_ptr[rc];
+
+    zbin = (zbin_ptr[rc != 0] + *zbin_boost_ptr + zbin_oq_value);
+    zbin_boost_ptr++;
+
+    sz = (z >> 31);                               // sign of z
+    x  = (z ^ sz) - sz;                           // x = abs(z)
+
+    if (x >= zbin) {
+      x += (round_ptr[rc != 0]);
+      y  = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x))
+           >> quant_shift_ptr[rc != 0];           // quantize (x)
+      x  = (y ^ sz) - sz;                         // get the sign back
+      qcoeff_ptr[rc]  = x;                        // write to destination
+      dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0]; // dequantized value
+
+      if (y) {
+        eob = i;                                  // last nonzero coeffs
+        zbin_boost_ptr = b->zrun_zbin_boost_16x16;
+      }
+    }
+  }
+
+  d->eob = eob + 1;
+}
+
+/* The quantize_b_pair function pointer in the MACROBLOCK structure is set to
+ * this C function if a corresponding optimized routine is not available. The
+ * NEON optimized version currently implements the fast quantization for a
+ * pair of blocks. */
+void vp9_regular_quantize_b_4x4_pair(BLOCK *b1, BLOCK *b2,
+                                     BLOCKD *d1, BLOCKD *d2) {
+  vp9_regular_quantize_b_4x4(b1, d1);
+  vp9_regular_quantize_b_4x4(b2, d2);
+}
+
+static void invert_quant(short *quant,
+                         unsigned char *shift, short d) {
+  unsigned t;
+  int l;
+  t = d;
+  for (l = 0; t > 1; l++)
+    t >>= 1;
+  t = 1 + (1 << (16 + l)) / d;
+  *quant = (short)(t - (1 << 16));
+  *shift = l;
+}
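+
+/* Illustration of invert_quant(), using d = 4 as an example: the loop leaves
+ * l = 2 (floor(log2(4))) and t = 1 + (1 << 18) / 4 = 65537, so *quant = 1
+ * and *shift = 2. The quantize loops above then compute
+ *   y = (((x * quant) >> 16) + x) >> shift  ~=  x / d,
+ * replacing a division by the dequant step with a multiply and two shifts.
+ */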
+
+void vp9_init_quantizer(VP9_COMP *cpi) {
+  int i;
+  int quant_val;
+  int Q;
+  static const int zbin_boost[16] = {  0,  0,  8, 10, 12, 14, 16, 20,
+                                      24, 28, 32, 36, 40, 44, 44, 44
+                                    };
+
+  static const int zbin_boost_8x8[64] = {  0,  0,  0,  8,  8,  8, 10, 12,
+                                          14, 16, 18, 20, 22, 24, 26, 28,
+                                          30, 32, 34, 36, 38, 40, 42, 44,
+                                          46, 48, 48, 48, 48, 48, 48, 48,
+                                          48, 48, 48, 48, 48, 48, 48, 48,
+                                          48, 48, 48, 48, 48, 48, 48, 48,
+                                          48, 48, 48, 48, 48, 48, 48, 48,
+                                          48, 48, 48, 48, 48, 48, 48, 48
+                                        };
+  static const int zbin_boost_16x16[256] = {
+     0,  0,  0,  8,  8,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28,
+    30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+  };
+  int qrounding_factor = 48;
+
+  for (Q = 0; Q < QINDEX_RANGE; Q++) {
+    int qzbin_factor = (vp9_dc_quant(Q, 0) < 148) ? 84 : 80;
+
+#if CONFIG_LOSSLESS
+    if (cpi->oxcf.lossless) {
+      if (Q == 0) {
+        qzbin_factor = 64;
+        qrounding_factor = 64;
+      }
+    }
+#endif
+
+    // dc values
+    quant_val = vp9_dc_quant(Q, cpi->common.y1dc_delta_q);
+    invert_quant(cpi->Y1quant[Q] + 0,
+                 cpi->Y1quant_shift[Q] + 0, quant_val);
+    cpi->Y1zbin[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
+    cpi->Y1zbin_8x8[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
+    cpi->Y1zbin_16x16[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
+    cpi->Y1round[Q][0] = (qrounding_factor * quant_val) >> 7;
+    cpi->common.Y1dequant[Q][0] = quant_val;
+    cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7;
+    cpi->zrun_zbin_boost_y1_8x8[Q][0] =
+      ((quant_val * zbin_boost_8x8[0]) + 64) >> 7;
+    cpi->zrun_zbin_boost_y1_16x16[Q][0] =
+      ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
+
+    quant_val = vp9_dc2quant(Q, cpi->common.y2dc_delta_q);
+    invert_quant(cpi->Y2quant[Q] + 0,
+                 cpi->Y2quant_shift[Q] + 0, quant_val);
+    cpi->Y2zbin[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
+    cpi->Y2zbin_8x8[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
+    cpi->Y2zbin_16x16[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
+    cpi->Y2round[Q][0] = (qrounding_factor * quant_val) >> 7;
+    cpi->common.Y2dequant[Q][0] = quant_val;
+    cpi->zrun_zbin_boost_y2[Q][0] = (quant_val * zbin_boost[0]) >> 7;
+    cpi->zrun_zbin_boost_y2_8x8[Q][0] =
+      ((quant_val * zbin_boost_8x8[0]) + 64) >> 7;
+    cpi->zrun_zbin_boost_y2_16x16[Q][0] =
+      ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
+
+    quant_val = vp9_dc_uv_quant(Q, cpi->common.uvdc_delta_q);
+    invert_quant(cpi->UVquant[Q] + 0,
+                 cpi->UVquant_shift[Q] + 0, quant_val);
+    cpi->UVzbin[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
+    cpi->UVzbin_8x8[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
+    cpi->UVzbin_16x16[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
+    cpi->UVround[Q][0] = (qrounding_factor * quant_val) >> 7;
+    cpi->common.UVdequant[Q][0] = quant_val;
+    cpi->zrun_zbin_boost_uv[Q][0] = (quant_val * zbin_boost[0]) >> 7;
+    cpi->zrun_zbin_boost_uv_8x8[Q][0] =
+      ((quant_val * zbin_boost_8x8[0]) + 64) >> 7;
+    cpi->zrun_zbin_boost_uv_16x16[Q][0] =
+      ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
+
+    // all the 4x4 ac values
+    for (i = 1; i < 16; i++) {
+      int rc = vp9_default_zig_zag1d[i];
+
+      quant_val = vp9_ac_yquant(Q);
+      invert_quant(cpi->Y1quant[Q] + rc,
+                   cpi->Y1quant_shift[Q] + rc, quant_val);
+      cpi->Y1zbin[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
+      cpi->Y1round[Q][rc] = (qrounding_factor * quant_val) >> 7;
+      cpi->common.Y1dequant[Q][rc] = quant_val;
+      cpi->zrun_zbin_boost_y1[Q][i] =
+        ((quant_val * zbin_boost[i]) + 64) >> 7;
+
+      quant_val = vp9_ac2quant(Q, cpi->common.y2ac_delta_q);
+      invert_quant(cpi->Y2quant[Q] + rc,
+                   cpi->Y2quant_shift[Q] + rc, quant_val);
+      cpi->Y2zbin[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
+      cpi->Y2round[Q][rc] = (qrounding_factor * quant_val) >> 7;
+      cpi->common.Y2dequant[Q][rc] = quant_val;
+      cpi->zrun_zbin_boost_y2[Q][i] =
+        ((quant_val * zbin_boost[i]) + 64) >> 7;
+
+      quant_val = vp9_ac_uv_quant(Q, cpi->common.uvac_delta_q);
+      invert_quant(cpi->UVquant[Q] + rc,
+                   cpi->UVquant_shift[Q] + rc, quant_val);
+      cpi->UVzbin[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
+      cpi->UVround[Q][rc] = (qrounding_factor * quant_val) >> 7;
+      cpi->common.UVdequant[Q][rc] = quant_val;
+      cpi->zrun_zbin_boost_uv[Q][i] =
+        ((quant_val * zbin_boost[i]) + 64) >> 7;
+    }
+
+    // 8x8 structures... only zbin separated out for now.
+    // This needs cleaning up for 8x8, especially if we are to add
+    // support for non-flat Q matrices.
+    for (i = 1; i < 64; i++) {
+      int rc = vp9_default_zig_zag1d_8x8[i];
+
+      quant_val = vp9_ac_yquant(Q);
+      cpi->Y1zbin_8x8[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
+      cpi->zrun_zbin_boost_y1_8x8[Q][i] =
+        ((quant_val * zbin_boost_8x8[i]) + 64) >> 7;
+
+      quant_val = vp9_ac2quant(Q, cpi->common.y2ac_delta_q);
+      cpi->Y2zbin_8x8[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
+      cpi->zrun_zbin_boost_y2_8x8[Q][i] =
+        ((quant_val * zbin_boost_8x8[i]) + 64) >> 7;
+
+      quant_val = vp9_ac_uv_quant(Q, cpi->common.uvac_delta_q);
+      cpi->UVzbin_8x8[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
+      cpi->zrun_zbin_boost_uv_8x8[Q][i] =
+        ((quant_val * zbin_boost_8x8[i]) + 64) >> 7;
+    }
+
+    // 16x16 structures. The same comment as above applies.
+    for (i = 1; i < 256; i++) {
+      int rc = vp9_default_zig_zag1d_16x16[i];
+
+      quant_val = vp9_ac_yquant(Q);
+      cpi->Y1zbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
+      cpi->zrun_zbin_boost_y1_16x16[Q][i] =
+        ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
+
+      quant_val = vp9_ac2quant(Q, cpi->common.y2ac_delta_q);
+      cpi->Y2zbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
+      cpi->zrun_zbin_boost_y2_16x16[Q][i] =
+        ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
+
+      quant_val = vp9_ac_uv_quant(Q, cpi->common.uvac_delta_q);
+      cpi->UVzbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
+      cpi->zrun_zbin_boost_uv_16x16[Q][i] =
+        ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
+    }
+  }
+}
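+
+/* The recurring ((factor * quant_val) + 64) >> 7 pattern above is a rounded
+ * fixed-point multiply by factor / 128. For example, with qzbin_factor = 84
+ * and quant_val = 40, (84 * 40 + 64) >> 7 = 3424 >> 7 = 26, i.e. a zero bin
+ * of roughly 0.66 of the quantizer step size at that Q.
+ */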
+
+void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
+  int i;
+  int QIndex;
+  MACROBLOCKD *xd = &x->e_mbd;
+  int zbin_extra;
+  int segment_id = xd->mode_info_context->mbmi.segment_id;
+
+  // Select the baseline MB Q index allowing for any segment level change.
+  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_ALT_Q)) {
+    // Abs Value
+    if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA)
+      QIndex = vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q);
+
+    // Delta Value
+    else {
+      QIndex = cpi->common.base_qindex +
+               vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q);
+
+      // Clamp to valid range
+      QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0;
+    }
+  } else
+    QIndex = cpi->common.base_qindex;
+
+  // Y
+  zbin_extra = (cpi->common.Y1dequant[QIndex][1] *
+                (cpi->zbin_over_quant +
+                 cpi->zbin_mode_boost +
+                 x->act_zbin_adj)) >> 7;
+
+  for (i = 0; i < 16; i++) {
+    x->block[i].quant = cpi->Y1quant[QIndex];
+    x->block[i].quant_shift = cpi->Y1quant_shift[QIndex];
+    x->block[i].zbin = cpi->Y1zbin[QIndex];
+    x->block[i].zbin_8x8 = cpi->Y1zbin_8x8[QIndex];
+    x->block[i].zbin_16x16 = cpi->Y1zbin_16x16[QIndex];
+    x->block[i].round = cpi->Y1round[QIndex];
+    x->e_mbd.block[i].dequant = cpi->common.Y1dequant[QIndex];
+    x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_y1[QIndex];
+    x->block[i].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_y1_8x8[QIndex];
+    x->block[i].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_y1_16x16[QIndex];
+    x->block[i].zbin_extra = (short)zbin_extra;
+
+    // Segment max eob offset feature.
+    if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB)) {
+      x->block[i].eob_max_offset =
+        vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+      x->block[i].eob_max_offset_8x8 =
+        vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+      x->block[i].eob_max_offset_16x16 =
+        vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+    } else {
+      x->block[i].eob_max_offset = 16;
+      x->block[i].eob_max_offset_8x8 = 64;
+      x->block[i].eob_max_offset_16x16 = 256;
+    }
+  }
+
+  // UV
+  zbin_extra = (cpi->common.UVdequant[QIndex][1] *
+                (cpi->zbin_over_quant +
+                 cpi->zbin_mode_boost +
+                 x->act_zbin_adj)) >> 7;
+
+  for (i = 16; i < 24; i++) {
+    x->block[i].quant = cpi->UVquant[QIndex];
+    x->block[i].quant_shift = cpi->UVquant_shift[QIndex];
+    x->block[i].zbin = cpi->UVzbin[QIndex];
+    x->block[i].zbin_8x8 = cpi->UVzbin_8x8[QIndex];
+    x->block[i].zbin_16x16 = cpi->UVzbin_16x16[QIndex];
+    x->block[i].round = cpi->UVround[QIndex];
+    x->e_mbd.block[i].dequant = cpi->common.UVdequant[QIndex];
+    x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_uv[QIndex];
+    x->block[i].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_uv_8x8[QIndex];
+    x->block[i].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_uv_16x16[QIndex];
+
+    x->block[i].zbin_extra = (short)zbin_extra;
+
+    // Segment max eob offset feature.
+    if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB)) {
+      x->block[i].eob_max_offset =
+        vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+      x->block[i].eob_max_offset_8x8 =
+        vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+    } else {
+      x->block[i].eob_max_offset = 16;
+      x->block[i].eob_max_offset_8x8 = 64;
+    }
+  }
+
+  // Y2
+  zbin_extra = (cpi->common.Y2dequant[QIndex][1] *
+                ((cpi->zbin_over_quant / 2) +
+                 cpi->zbin_mode_boost +
+                 x->act_zbin_adj)) >> 7;
+
+  x->block[24].quant = cpi->Y2quant[QIndex];
+  x->block[24].quant_shift = cpi->Y2quant_shift[QIndex];
+  x->block[24].zbin = cpi->Y2zbin[QIndex];
+  x->block[24].zbin_8x8 = cpi->Y2zbin_8x8[QIndex];
+  x->block[24].zbin_16x16 = cpi->Y2zbin_16x16[QIndex];
+  x->block[24].round = cpi->Y2round[QIndex];
+  x->e_mbd.block[24].dequant = cpi->common.Y2dequant[QIndex];
+  x->block[24].zrun_zbin_boost = cpi->zrun_zbin_boost_y2[QIndex];
+  x->block[24].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_y2_8x8[QIndex];
+  x->block[24].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_y2_16x16[QIndex];
+  x->block[24].zbin_extra = (short)zbin_extra;
+
+  // TBD perhaps not use for Y2
+  // Segment max eob offset feature.
+  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB)) {
+    x->block[24].eob_max_offset =
+      vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+    x->block[24].eob_max_offset_8x8 =
+      vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+  } else {
+    x->block[24].eob_max_offset = 16;
+    x->block[24].eob_max_offset_8x8 = 4;
+  }
+
+  /* save this macroblock QIndex for vp9_update_zbin_extra() */
+  x->e_mbd.q_index = QIndex;
+}
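+
+/* Example of the segment EOB feature set up above: if SEG_LVL_EOB is active
+ * with a value of 1, the quantize loops stop after the first zig-zag
+ * position, so at most the DC coefficient is coded for that segment's
+ * blocks; a value of 0 forces all coefficients in the block to zero.
+ */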
+
+void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x) {
+  int i;
+  int QIndex = x->e_mbd.q_index;
+  int zbin_extra;
+
+  // Y
+  zbin_extra = (cpi->common.Y1dequant[QIndex][1] *
+                (cpi->zbin_over_quant +
+                 cpi->zbin_mode_boost +
+                 x->act_zbin_adj)) >> 7;
+  for (i = 0; i < 16; i++) {
+    x->block[i].zbin_extra = (short)zbin_extra;
+  }
+
+  // UV
+  zbin_extra = (cpi->common.UVdequant[QIndex][1] *
+                (cpi->zbin_over_quant +
+                 cpi->zbin_mode_boost +
+                 x->act_zbin_adj)) >> 7;
+
+  for (i = 16; i < 24; i++) {
+    x->block[i].zbin_extra = (short)zbin_extra;
+  }
+
+  // Y2
+  zbin_extra = (cpi->common.Y2dequant[QIndex][1] *
+                ((cpi->zbin_over_quant / 2) +
+                 cpi->zbin_mode_boost +
+                 x->act_zbin_adj)) >> 7;
+
+  x->block[24].zbin_extra = (short)zbin_extra;
+}
+
+void vp9_frame_init_quantizer(VP9_COMP *cpi) {
+  // Clear Zbin mode boost for default case
+  cpi->zbin_mode_boost = 0;
+
+  // MB level quantizer setup
+  vp9_mb_init_quantizer(cpi, &cpi->mb);
+}
+
+void vp9_set_quantizer(struct VP9_COMP *cpi, int Q) {
+  VP9_COMMON *cm = &cpi->common;
+
+  cm->base_qindex = Q;
+
+  // if any of the delta_q values are changing update flag will
+  // have to be set.
+  cm->y1dc_delta_q = 0;
+  cm->y2ac_delta_q = 0;
+  cm->uvdc_delta_q = 0;
+  cm->uvac_delta_q = 0;
+  cm->y2dc_delta_q = 0;
+
+  // quantizer has to be reinitialized if any delta_q changes.
+  // As there are not any here for now this is inactive code.
+  // if(update)
+  //    vp9_init_quantizer(cpi);
+}
--- /dev/null
+++ b/vp9/encoder/quantize.h
@@ -1,0 +1,97 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef __INC_QUANTIZE_H
+#define __INC_QUANTIZE_H
+
+#include "block.h"
+
+#define prototype_quantize_block(sym) \
+  void (sym)(BLOCK *b,BLOCKD *d)
+
+#define prototype_quantize_block_pair(sym) \
+  void (sym)(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2)
+
+#define prototype_quantize_mb(sym) \
+  void (sym)(MACROBLOCK *x)
+
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/quantize_x86.h"
+#endif
+
+#if ARCH_ARM
+#include "arm/quantize_arm.h"
+#endif
+
+#define prototype_quantize_block_type(sym) \
+  void (sym)(BLOCK *b, BLOCKD *d, TX_TYPE type)
+extern prototype_quantize_block_type(vp9_ht_quantize_b_4x4);
+
+#ifndef vp9_quantize_quantb_4x4
+#define vp9_quantize_quantb_4x4 vp9_regular_quantize_b_4x4
+#endif
+extern prototype_quantize_block(vp9_quantize_quantb_4x4);
+
+#ifndef vp9_quantize_quantb_4x4_pair
+#define vp9_quantize_quantb_4x4_pair vp9_regular_quantize_b_4x4_pair
+#endif
+extern prototype_quantize_block_pair(vp9_quantize_quantb_4x4_pair);
+
+#ifndef vp9_quantize_quantb_8x8
+#define vp9_quantize_quantb_8x8 vp9_regular_quantize_b_8x8
+#endif
+extern prototype_quantize_block(vp9_quantize_quantb_8x8);
+
+#ifndef vp9_quantize_quantb_16x16
+#define vp9_quantize_quantb_16x16 vp9_regular_quantize_b_16x16
+#endif
+extern prototype_quantize_block(vp9_quantize_quantb_16x16);
+
+#ifndef vp9_quantize_quantb_2x2
+#define vp9_quantize_quantb_2x2 vp9_regular_quantize_b_2x2
+#endif
+extern prototype_quantize_block(vp9_quantize_quantb_2x2);
+
+#ifndef vp9_quantize_mb_4x4
+#define vp9_quantize_mb_4x4 vp9_quantize_mb_4x4_c
+#endif
+extern prototype_quantize_mb(vp9_quantize_mb_4x4);
+void vp9_quantize_mb_8x8(MACROBLOCK *x);
+
+#ifndef vp9_quantize_mbuv_4x4
+#define vp9_quantize_mbuv_4x4 vp9_quantize_mbuv_4x4_c
+#endif
+extern prototype_quantize_mb(vp9_quantize_mbuv_4x4);
+
+#ifndef vp9_quantize_mby_4x4
+#define vp9_quantize_mby_4x4 vp9_quantize_mby_4x4_c
+#endif
+extern prototype_quantize_mb(vp9_quantize_mby_4x4);
+
+extern prototype_quantize_mb(vp9_quantize_mby_8x8);
+extern prototype_quantize_mb(vp9_quantize_mbuv_8x8);
+
+void vp9_quantize_mb_16x16(MACROBLOCK *x);
+extern prototype_quantize_block(vp9_quantize_quantb_16x16);
+extern prototype_quantize_mb(vp9_quantize_mby_16x16);
+
+struct VP9_COMP;
+
+extern void vp9_set_quantizer(struct VP9_COMP *cpi, int Q);
+
+extern void vp9_frame_init_quantizer(struct VP9_COMP *cpi);
+
+extern void vp9_update_zbin_extra(struct VP9_COMP *cpi, MACROBLOCK *x);
+
+extern void vp9_mb_init_quantizer(struct VP9_COMP *cpi, MACROBLOCK *x);
+
+extern void vp9_init_quantizer(struct VP9_COMP *cpi);
+
+#endif
--- /dev/null
+++ b/vp9/encoder/ratectrl.c
@@ -1,0 +1,698 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <limits.h>
+#include <assert.h>
+
+#include "math.h"
+#include "vp9/common/alloccommon.h"
+#include "vp9/common/common.h"
+#include "ratectrl.h"
+#include "vp9/common/entropymode.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp9/common/systemdependent.h"
+#include "encodemv.h"
+#include "vp9/common/quant_common.h"
+
+#define MIN_BPB_FACTOR          0.005
+#define MAX_BPB_FACTOR          50
+
+#ifdef MODE_STATS
+extern unsigned int y_modes[VP9_YMODES];
+extern unsigned int uv_modes[VP9_UV_MODES];
+extern unsigned int b_modes[B_MODE_COUNT];
+
+extern unsigned int inter_y_modes[MB_MODE_COUNT];
+extern unsigned int inter_uv_modes[VP9_UV_MODES];
+extern unsigned int inter_b_modes[B_MODE_COUNT];
+#endif
+
+// Bits Per MB at different Q (Multiplied by 512)
+#define BPER_MB_NORMBITS    9
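+
+/* With 9 normalization bits, per-MB rate values carry 1/512-bit precision;
+ * e.g. a scaled value of 100000 corresponds to 100000 / 512 ~= 195 actual
+ * bits per macroblock.
+ */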
+
+// % adjustment to target kf size based on separation from previous frame
+static const int kf_boost_seperation_adjustment[16] = {
+  30,   40,   50,   55,   60,   65,   70,   75,
+  80,   85,   90,   95,  100,  100,  100,  100,
+};
+
+static const int gf_adjust_table[101] = {
+  100,
+  115, 130, 145, 160, 175, 190, 200, 210, 220, 230,
+  240, 260, 270, 280, 290, 300, 310, 320, 330, 340,
+  350, 360, 370, 380, 390, 400, 400, 400, 400, 400,
+  400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+  400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+  400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+  400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+  400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+  400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+  400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+};
+
+static const int gf_intra_usage_adjustment[20] = {
+  125, 120, 115, 110, 105, 100,  95,  85,  80,  75,
+  70,  65,  60,  55,  50,  50,  50,  50,  50,  50,
+};
+
+static const int gf_interval_table[101] = {
+  7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+};
+
+static const unsigned int prior_key_frame_weight[KEY_FRAME_CONTEXT] = { 1, 2, 3, 4, 5 };
+
+// These functions use formulaic calculations to make playing with the
+// quantizer tables easier. If necessary they can be replaced by lookup
+// tables if and when things settle down in the experimental bitstream.
+double vp9_convert_qindex_to_q(int qindex) {
+  // Convert the index to a real Q value (scaled down to match old Q values)
+  return (double)vp9_ac_yquant(qindex) / 4.0;
+}
+
+int vp9_gfboost_qadjust(int qindex) {
+  int retval;
+  double q;
+
+  q = vp9_convert_qindex_to_q(qindex);
+  retval = (int)((0.00000828 * q * q * q) +
+                 (-0.0055 * q * q) +
+                 (1.32 * q) + 79.3);
+  return retval;
+}
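+
+/* Worked example: for q = 32 the polynomial evaluates to
+ *   0.00000828 * 32768 - 0.0055 * 1024 + 1.32 * 32 + 79.3 ~= 116.2,
+ * so vp9_gfboost_qadjust() returns 116.
+ */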
+
+static int kfboost_qadjust(int qindex) {
+  int retval;
+  double q;
+
+  q = vp9_convert_qindex_to_q(qindex);
+  retval = (int)((0.00000973 * q * q * q) +
+                 (-0.00613 * q * q) +
+                 (1.316 * q) + 121.2);
+  return retval;
+}
+
+int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex) {
+  if (frame_type == KEY_FRAME)
+    return (int)(4500000 / vp9_convert_qindex_to_q(qindex));
+  else
+    return (int)(2850000 / vp9_convert_qindex_to_q(qindex));
+}
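+
+/* Example: at a qindex where vp9_convert_qindex_to_q() returns 45.0, a key
+ * frame is budgeted 4500000 / 45 = 100000 (in the 1/512-bit units noted
+ * above, ~195 bits/MB) and an inter frame 2850000 / 45 ~= 63333
+ * (~123 bits/MB).
+ */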
+
+
+void vp9_save_coding_context(VP9_COMP *cpi) {
+  CODING_CONTEXT *const cc = &cpi->coding_context;
+  VP9_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
+
+  // Stores a snapshot of key state variables which can subsequently be
+  // restored with a call to vp9_restore_coding_context. These functions are
+  // intended for use in a re-code loop in vp9_compress_frame where the
+  // quantizer value is adjusted between loop iterations.
+
+  cc->nmvc = cm->fc.nmvc;
+  vp9_copy(cc->nmvjointcost,  cpi->mb.nmvjointcost);
+  vp9_copy(cc->nmvcosts,  cpi->mb.nmvcosts);
+  vp9_copy(cc->nmvcosts_hp,  cpi->mb.nmvcosts_hp);
+
+  vp9_copy(cc->mv_ref_ct, cm->fc.mv_ref_ct);
+  vp9_copy(cc->mode_context, cm->fc.mode_context);
+  vp9_copy(cc->mv_ref_ct_a, cm->fc.mv_ref_ct_a);
+  vp9_copy(cc->mode_context_a, cm->fc.mode_context_a);
+
+  vp9_copy(cc->ymode_prob, cm->fc.ymode_prob);
+  vp9_copy(cc->bmode_prob, cm->fc.bmode_prob);
+  vp9_copy(cc->uv_mode_prob, cm->fc.uv_mode_prob);
+  vp9_copy(cc->i8x8_mode_prob, cm->fc.i8x8_mode_prob);
+  vp9_copy(cc->sub_mv_ref_prob, cm->fc.sub_mv_ref_prob);
+  vp9_copy(cc->mbsplit_prob, cm->fc.mbsplit_prob);
+
+  // Stats
+#ifdef MODE_STATS
+  vp9_copy(cc->y_modes,       y_modes);
+  vp9_copy(cc->uv_modes,      uv_modes);
+  vp9_copy(cc->b_modes,       b_modes);
+  vp9_copy(cc->inter_y_modes,  inter_y_modes);
+  vp9_copy(cc->inter_uv_modes, inter_uv_modes);
+  vp9_copy(cc->inter_b_modes,  inter_b_modes);
+#endif
+
+  vp9_copy(cc->segment_pred_probs, cm->segment_pred_probs);
+  vp9_copy(cc->ref_pred_probs_update, cpi->ref_pred_probs_update);
+  vp9_copy(cc->ref_pred_probs, cm->ref_pred_probs);
+  vp9_copy(cc->prob_comppred, cm->prob_comppred);
+
+  vpx_memcpy(cpi->coding_context.last_frame_seg_map_copy,
+             cm->last_frame_seg_map, (cm->mb_rows * cm->mb_cols));
+
+  vp9_copy(cc->last_ref_lf_deltas, xd->last_ref_lf_deltas);
+  vp9_copy(cc->last_mode_lf_deltas, xd->last_mode_lf_deltas);
+
+  vp9_copy(cc->coef_probs, cm->fc.coef_probs);
+  vp9_copy(cc->hybrid_coef_probs, cm->fc.hybrid_coef_probs);
+  vp9_copy(cc->coef_probs_8x8, cm->fc.coef_probs_8x8);
+  vp9_copy(cc->hybrid_coef_probs_8x8, cm->fc.hybrid_coef_probs_8x8);
+  vp9_copy(cc->coef_probs_16x16, cm->fc.coef_probs_16x16);
+  vp9_copy(cc->hybrid_coef_probs_16x16, cm->fc.hybrid_coef_probs_16x16);
+  vp9_copy(cc->switchable_interp_prob, cm->fc.switchable_interp_prob);
+}
+
+void vp9_restore_coding_context(VP9_COMP *cpi) {
+  CODING_CONTEXT *const cc = &cpi->coding_context;
+  VP9_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
+
+  // Restore key state variables to the snapshot state stored in the
+  // previous call to vp9_save_coding_context.
+
+  cm->fc.nmvc = cc->nmvc;
+  vp9_copy(cpi->mb.nmvjointcost, cc->nmvjointcost);
+  vp9_copy(cpi->mb.nmvcosts, cc->nmvcosts);
+  vp9_copy(cpi->mb.nmvcosts_hp, cc->nmvcosts_hp);
+
+  vp9_copy(cm->fc.mv_ref_ct, cc->mv_ref_ct);
+  vp9_copy(cm->fc.mode_context, cc->mode_context);
+  vp9_copy(cm->fc.mv_ref_ct_a, cc->mv_ref_ct_a);
+  vp9_copy(cm->fc.mode_context_a, cc->mode_context_a);
+
+  vp9_copy(cm->fc.ymode_prob, cc->ymode_prob);
+  vp9_copy(cm->fc.bmode_prob, cc->bmode_prob);
+  vp9_copy(cm->fc.i8x8_mode_prob, cc->i8x8_mode_prob);
+  vp9_copy(cm->fc.uv_mode_prob, cc->uv_mode_prob);
+  vp9_copy(cm->fc.sub_mv_ref_prob, cc->sub_mv_ref_prob);
+  vp9_copy(cm->fc.mbsplit_prob, cc->mbsplit_prob);
+
+  // Stats
+#ifdef MODE_STATS
+  vp9_copy(y_modes, cc->y_modes);
+  vp9_copy(uv_modes, cc->uv_modes);
+  vp9_copy(b_modes, cc->b_modes);
+  vp9_copy(inter_y_modes, cc->inter_y_modes);
+  vp9_copy(inter_uv_modes, cc->inter_uv_modes);
+  vp9_copy(inter_b_modes, cc->inter_b_modes);
+#endif
+
+  vp9_copy(cm->segment_pred_probs, cc->segment_pred_probs);
+  vp9_copy(cpi->ref_pred_probs_update, cc->ref_pred_probs_update);
+  vp9_copy(cm->ref_pred_probs, cc->ref_pred_probs);
+  vp9_copy(cm->prob_comppred, cc->prob_comppred);
+
+  vpx_memcpy(cm->last_frame_seg_map,
+             cpi->coding_context.last_frame_seg_map_copy,
+             (cm->mb_rows * cm->mb_cols));
+
+  vp9_copy(xd->last_ref_lf_deltas, cc->last_ref_lf_deltas);
+  vp9_copy(xd->last_mode_lf_deltas, cc->last_mode_lf_deltas);
+
+  vp9_copy(cm->fc.coef_probs, cc->coef_probs);
+  vp9_copy(cm->fc.hybrid_coef_probs, cc->hybrid_coef_probs);
+  vp9_copy(cm->fc.coef_probs_8x8, cc->coef_probs_8x8);
+  vp9_copy(cm->fc.hybrid_coef_probs_8x8, cc->hybrid_coef_probs_8x8);
+  vp9_copy(cm->fc.coef_probs_16x16, cc->coef_probs_16x16);
+  vp9_copy(cm->fc.hybrid_coef_probs_16x16, cc->hybrid_coef_probs_16x16);
+  vp9_copy(cm->fc.switchable_interp_prob, cc->switchable_interp_prob);
+}
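+
+/* A minimal sketch of the intended save/restore usage (the actual recode
+ * loop lives in the frame encoding path; this outline is illustrative
+ * only):
+ *
+ *   vp9_save_coding_context(cpi);
+ *   for (;;) {
+ *     // ... encode the frame at the current quantizer ...
+ *     if (projected size is acceptably close to the target)
+ *       break;
+ *     vp9_restore_coding_context(cpi);   // rewind the entropy state
+ *     // pick a new quantizer, e.g. via vp9_regulate_q()
+ *   }
+ */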
+
+
+void vp9_setup_key_frame(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+  // Setup for Key frame:
+  vp9_default_coef_probs(& cpi->common);
+  vp9_kf_default_bmode_probs(cpi->common.kf_bmode_prob);
+  vp9_init_mbmode_probs(& cpi->common);
+  vp9_default_bmode_probs(cm->fc.bmode_prob);
+
+  vp9_init_mv_probs(& cpi->common);
+
+  // cpi->common.filter_level = 0;      // Reset every key frame.
+  cpi->common.filter_level = cpi->common.base_qindex * 3 / 8;
+
+  // interval before next GF
+  cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+
+  cpi->common.refresh_golden_frame = TRUE;
+  cpi->common.refresh_alt_ref_frame = TRUE;
+
+  vp9_init_mode_contexts(&cpi->common);
+  vpx_memcpy(&cpi->common.lfc, &cpi->common.fc, sizeof(cpi->common.fc));
+  vpx_memcpy(&cpi->common.lfc_a, &cpi->common.fc, sizeof(cpi->common.fc));
+
+  vpx_memset(cm->prev_mip, 0,
+    (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
+  vpx_memset(cm->mip, 0,
+    (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
+
+  vp9_update_mode_info_border(cm, cm->mip);
+  vp9_update_mode_info_in_image(cm, cm->mi);
+}
+
+void vp9_setup_inter_frame(VP9_COMP *cpi) {
+  if (cpi->common.refresh_alt_ref_frame) {
+    vpx_memcpy(&cpi->common.fc,
+               &cpi->common.lfc_a,
+               sizeof(cpi->common.fc));
+    vpx_memcpy(cpi->common.fc.vp8_mode_contexts,
+               cpi->common.fc.mode_context_a,
+               sizeof(cpi->common.fc.vp8_mode_contexts));
+  } else {
+    vpx_memcpy(&cpi->common.fc,
+               &cpi->common.lfc,
+               sizeof(cpi->common.fc));
+    vpx_memcpy(cpi->common.fc.vp8_mode_contexts,
+               cpi->common.fc.mode_context,
+               sizeof(cpi->common.fc.vp8_mode_contexts));
+  }
+}
+
+
+static int estimate_bits_at_q(int frame_kind, int Q, int MBs,
+                              double correction_factor) {
+  int Bpm = (int)(.5 + correction_factor * vp9_bits_per_mb(frame_kind, Q));
+
+  /* Attempt to retain reasonable accuracy without overflow. The cutoff is
+   * chosen such that the maximum product of Bpm and MBs fits 31 bits. The
+   * largest Bpm takes 20 bits.
+   */
+  if (MBs > (1 << 11))
+    return (Bpm >> BPER_MB_NORMBITS) * MBs;
+  else
+    return (Bpm * MBs) >> BPER_MB_NORMBITS;
+}
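+
+/* Overflow sketch: Bpm can take up to 20 bits (see the comment above), so
+ * once MBs exceeds 2^11 the product Bpm * MBs could pass 2^31; pre-shifting
+ * Bpm trades the 1/512-bit precision for the needed headroom.
+ */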
+
+
+static void calc_iframe_target_size(VP9_COMP *cpi) {
+  // boost defaults to half second
+  int target;
+
+  // Clear down mmx registers to allow floating point in what follows
+  vp9_clear_system_state();  // __asm emms;
+
+  // New Two pass RC
+  target = cpi->per_frame_bandwidth;
+
+  if (cpi->oxcf.rc_max_intra_bitrate_pct) {
+    unsigned int max_rate = cpi->per_frame_bandwidth
+                            * cpi->oxcf.rc_max_intra_bitrate_pct / 100;
+
+    if (target > max_rate)
+      target = max_rate;
+  }
+
+  cpi->this_frame_target = target;
+}
+
+
+//  Do the best we can to define the parameters for the next GF based
+//  on what information we have available.
+//
+//  In this experimental code only two-pass encoding is supported,
+//  so we just use the interval determined in the two-pass code.
+static void calc_gf_params(VP9_COMP *cpi) {
+  // Set the gf interval
+  cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+}
+
+
+static void calc_pframe_target_size(VP9_COMP *cpi) {
+  int min_frame_target = cpi->min_frame_bandwidth;
+
+  if (min_frame_target < (cpi->av_per_frame_bandwidth >> 5))
+    min_frame_target = cpi->av_per_frame_bandwidth >> 5;
+
+  // Special alt reference frame case
+  if (cpi->common.refresh_alt_ref_frame) {
+    // Per frame bit target for the alt ref frame
+    cpi->per_frame_bandwidth = cpi->twopass.gf_bits;
+    cpi->this_frame_target = cpi->per_frame_bandwidth;
+  }
+
+  // Normal frames (gf,and inter)
+  else {
+    cpi->this_frame_target = cpi->per_frame_bandwidth;
+  }
+
+  // Sanity check that the total sum of adjustments is not above the maximum
+  // allowed. That is, having allowed for the KF and GF penalties, we have
+  // not pushed the current interframe target too low. If the adjustment we
+  // apply here cannot recover all the extra bits spent in the KF or GF, the
+  // remainder will have to be recovered over a longer time span via other
+  // buffer / rate control mechanisms.
+  if (cpi->this_frame_target < min_frame_target)
+    cpi->this_frame_target = min_frame_target;
+
+  if (!cpi->common.refresh_alt_ref_frame)
+    // Note the baseline target data rate for this inter frame.
+    cpi->inter_frame_target = cpi->this_frame_target;
+
+  // Adjust target frame size for Golden Frames:
+  if (cpi->frames_till_gf_update_due == 0) {
+    // int Boost = 0;
+    int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
+
+    cpi->common.refresh_golden_frame = TRUE;
+
+    calc_gf_params(cpi);
+
+    // If we are using an alternate ref instead of the gf, do not apply the
+    // boost here; it will instead be applied to the altref update.
+    // Jim's modified boost.
+    if (!cpi->source_alt_ref_active) {
+      if (cpi->oxcf.fixed_q < 0) {
+        // The spend on the GF is defined in the two pass code
+        // for two pass encodes
+        cpi->this_frame_target = cpi->per_frame_bandwidth;
+      } else
+        cpi->this_frame_target =
+          (estimate_bits_at_q(1, Q, cpi->common.MBs, 1.0)
+           * cpi->last_boost) / 100;
+
+    }
+    // If there is an active ARF at this location use the minimum
+    // bits on this frame even if it is a constructed ARF.
+    // The active maximum quantizer ensures that an appropriate
+    // number of bits will be spent if needed for constructed ARFs.
+    else {
+      cpi->this_frame_target = 0;
+    }
+
+    cpi->current_gf_interval = cpi->frames_till_gf_update_due;
+  }
+}
+
+
+void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
+  int    Q = cpi->common.base_qindex;
+  int    correction_factor = 100;
+  double rate_correction_factor;
+  double adjustment_limit;
+
+  int    projected_size_based_on_q = 0;
+
+  // Clear down mmx registers to allow floating point in what follows
+  vp9_clear_system_state();  // __asm emms;
+
+  if (cpi->common.frame_type == KEY_FRAME) {
+    rate_correction_factor = cpi->key_frame_rate_correction_factor;
+  } else {
+    if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
+      rate_correction_factor = cpi->gf_rate_correction_factor;
+    else
+      rate_correction_factor = cpi->rate_correction_factor;
+  }
+
+  // Work out how big we would have expected the frame to be at this Q given
+  // the current correction factor. Stay in double to avoid int overflow when
+  // values are large.
+  projected_size_based_on_q =
+    (int)(((.5 + rate_correction_factor *
+            vp9_bits_per_mb(cpi->common.frame_type, Q)) *
+           cpi->common.MBs) / (1 << BPER_MB_NORMBITS));
+
+  // Make some allowance for cpi->zbin_over_quant
+  if (cpi->zbin_over_quant > 0) {
+    int Z = cpi->zbin_over_quant;
+    double Factor = 0.99;
+    double factor_adjustment = 0.01 / 256.0; // (double)ZBIN_OQ_MAX;
+
+    while (Z > 0) {
+      Z--;
+      projected_size_based_on_q =
+        (int)(Factor * projected_size_based_on_q);
+      Factor += factor_adjustment;
+
+      if (Factor  >= 0.999)
+        Factor = 0.999;
+    }
+  }
+
+  // Work out a size correction factor.
+  // if ( cpi->this_frame_target > 0 )
+  //  correction_factor = (100 * cpi->projected_frame_size) / cpi->this_frame_target;
+  if (projected_size_based_on_q > 0)
+    correction_factor = (100 * cpi->projected_frame_size) / projected_size_based_on_q;
+
+  // A more heavily damped adjustment is used if we have been oscillating
+  // either side of the target.
+  switch (damp_var) {
+    case 0:
+      adjustment_limit = 0.75;
+      break;
+    case 1:
+      adjustment_limit = 0.375;
+      break;
+    case 2:
+    default:
+      adjustment_limit = 0.25;
+      break;
+  }
+
+  // if ( (correction_factor > 102) && (Q < cpi->active_worst_quality) )
+  if (correction_factor > 102) {
+    // We are not already at the worst allowable quality
+    correction_factor = (int)(100.5 + ((correction_factor - 100) * adjustment_limit));
+    rate_correction_factor = ((rate_correction_factor * correction_factor) / 100);
+
+    // Keep rate_correction_factor within limits
+    if (rate_correction_factor > MAX_BPB_FACTOR)
+      rate_correction_factor = MAX_BPB_FACTOR;
+  }
+  // else if ( (correction_factor < 99) && (Q > cpi->active_best_quality) )
+  else if (correction_factor < 99) {
+    // We are not already at the best allowable quality
+    correction_factor = (int)(100.5 - ((100 - correction_factor) * adjustment_limit));
+    rate_correction_factor = ((rate_correction_factor * correction_factor) / 100);
+
+    // Keep rate_correction_factor within limits
+    if (rate_correction_factor < MIN_BPB_FACTOR)
+      rate_correction_factor = MIN_BPB_FACTOR;
+  }
+
+  if (cpi->common.frame_type == KEY_FRAME)
+    cpi->key_frame_rate_correction_factor = rate_correction_factor;
+  else {
+    if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
+      cpi->gf_rate_correction_factor = rate_correction_factor;
+    else
+      cpi->rate_correction_factor = rate_correction_factor;
+  }
+}
+
+
+int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame) {
+  int Q = cpi->active_worst_quality;
+
+  int i;
+  int last_error = INT_MAX;
+  int target_bits_per_mb;
+  int bits_per_mb_at_this_q;
+  double correction_factor;
+
+  // Reset Zbin OQ value
+  cpi->zbin_over_quant = 0;
+
+  // Select the appropriate correction factor based upon type of frame.
+  if (cpi->common.frame_type == KEY_FRAME)
+    correction_factor = cpi->key_frame_rate_correction_factor;
+  else {
+    if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
+      correction_factor = cpi->gf_rate_correction_factor;
+    else
+      correction_factor = cpi->rate_correction_factor;
+  }
+
+  // Calculate the required scaling factor based on the target frame size
+  // and the size of the frame produced using the previous Q.
+  if (target_bits_per_frame >= (INT_MAX >> BPER_MB_NORMBITS))
+    // Case where we would overflow int
+    target_bits_per_mb =
+      (target_bits_per_frame / cpi->common.MBs) << BPER_MB_NORMBITS;
+  else
+    target_bits_per_mb =
+      (target_bits_per_frame << BPER_MB_NORMBITS) / cpi->common.MBs;
+
+  i = cpi->active_best_quality;
+
+  do {
+    bits_per_mb_at_this_q =
+      (int)(.5 + correction_factor *
+            vp9_bits_per_mb(cpi->common.frame_type, i));
+
+    if (bits_per_mb_at_this_q <= target_bits_per_mb) {
+      if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error)
+        Q = i;
+      else
+        Q = i - 1;
+
+      break;
+    } else
+      last_error = bits_per_mb_at_this_q - target_bits_per_mb;
+  } while (++i <= cpi->active_worst_quality);
+
+
+  // If we are at MAXQ then enable Q over-run, which seeks to claw back
+  // additional bits through things like the RD multiplier and zero bin size.
+  if (Q >= MAXQ) {
+    int zbin_oqmax;
+
+    double Factor = 0.99;
+    double factor_adjustment = 0.01 / 256.0; // (double)ZBIN_OQ_MAX;
+
+    if (cpi->common.frame_type == KEY_FRAME)
+      zbin_oqmax = 0; // ZBIN_OQ_MAX/16
+    else if (cpi->common.refresh_alt_ref_frame || (cpi->common.refresh_golden_frame && !cpi->source_alt_ref_active))
+      zbin_oqmax = 16;
+    else
+      zbin_oqmax = ZBIN_OQ_MAX;
+
+    // Each increment in the zbin is assumed to have a fixed effect on
+    // bitrate. This is of course not true: the effect will be highly clip
+    // dependent and may well have sudden steps. The idea here is to achieve
+    // higher effective quantizers than the normal maximum by expanding the
+    // zero bin and hence decreasing the number of low magnitude nonzero
+    // coefficients.
+    while (cpi->zbin_over_quant < zbin_oqmax) {
+      cpi->zbin_over_quant++;
+
+      if (cpi->zbin_over_quant > zbin_oqmax)
+        cpi->zbin_over_quant = zbin_oqmax;
+
+      // Adjust bits_per_mb_at_this_q estimate
+      bits_per_mb_at_this_q = (int)(Factor * bits_per_mb_at_this_q);
+      Factor += factor_adjustment;
+
+      if (Factor  >= 0.999)
+        Factor = 0.999;
+
+      if (bits_per_mb_at_this_q <= target_bits_per_mb)    // Break out if we get down to the target rate
+        break;
+    }
+
+  }
+
+  return Q;
+}
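+
+/* Example of the zbin over-run loop above: each step scales the per-MB rate
+ * estimate by Factor (0.99, creeping towards the 0.999 cap), so e.g. 16
+ * steps shrink the estimate to roughly 0.99^16 ~= 0.85 of its starting
+ * value, giving a gentle path to rates below what MAXQ alone can reach.
+ */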
+
+
+static int estimate_keyframe_frequency(VP9_COMP *cpi) {
+  int i;
+
+  // Average key frame frequency
+  int av_key_frame_frequency = 0;
+
+  /* First key frame at start of sequence is a special case. We have no
+   * frequency data.
+   */
+  if (cpi->key_frame_count == 1) {
+    /* Assume a default of 1 kf every 2 seconds, or the max kf interval,
+     * whichever is smaller.
+     */
+    int key_freq = cpi->oxcf.key_freq > 0 ? cpi->oxcf.key_freq : 1;
+    av_key_frame_frequency = (int)cpi->output_frame_rate * 2;
+
+    if (cpi->oxcf.auto_key && av_key_frame_frequency > key_freq)
+      av_key_frame_frequency = cpi->oxcf.key_freq;
+
+    cpi->prior_key_frame_distance[KEY_FRAME_CONTEXT - 1]
+      = av_key_frame_frequency;
+  } else {
+    unsigned int total_weight = 0;
+    int last_kf_interval =
+      (cpi->frames_since_key > 0) ? cpi->frames_since_key : 1;
+
+    /* reset keyframe context and calculate weighted average of last
+     * KEY_FRAME_CONTEXT keyframes
+     */
+    for (i = 0; i < KEY_FRAME_CONTEXT; i++) {
+      if (i < KEY_FRAME_CONTEXT - 1)
+        cpi->prior_key_frame_distance[i]
+          = cpi->prior_key_frame_distance[i + 1];
+      else
+        cpi->prior_key_frame_distance[i] = last_kf_interval;
+
+      av_key_frame_frequency += prior_key_frame_weight[i]
+                                * cpi->prior_key_frame_distance[i];
+      total_weight += prior_key_frame_weight[i];
+    }
+
+    av_key_frame_frequency /= total_weight;
+  }
+  return av_key_frame_frequency;
+}
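+
+/* Worked example of the weighted average: with prior_key_frame_weight =
+ * { 1, 2, 3, 4, 5 } and prior distances { 30, 30, 30, 30, 120 }, the
+ * estimate is (30 + 60 + 90 + 120 + 600) / 15 = 60 frames, so the most
+ * recent interval dominates.
+ */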
+
+
+void vp9_adjust_key_frame_context(VP9_COMP *cpi) {
+  // Clear down mmx registers to allow floating point in what follows
+  vp9_clear_system_state();
+
+  cpi->frames_since_key = 0;
+  cpi->key_frame_count++;
+}
+
+
+void vp9_compute_frame_size_bounds(VP9_COMP *cpi, int *frame_under_shoot_limit,
+                                   int *frame_over_shoot_limit) {
+  // Set-up bounds on acceptable frame size:
+  if (cpi->oxcf.fixed_q >= 0) {
+    // Fixed Q scenario: the frame size is unconstrained (there is no rate
+    // target).
+    *frame_under_shoot_limit = 0;
+    *frame_over_shoot_limit  = INT_MAX;
+  } else {
+    if (cpi->common.frame_type == KEY_FRAME) {
+      *frame_over_shoot_limit  = cpi->this_frame_target * 9 / 8;
+      *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8;
+    } else {
+      if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame) {
+        *frame_over_shoot_limit  = cpi->this_frame_target * 9 / 8;
+        *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8;
+      } else {
+        // Strong overshoot limit for constrained quality
+        if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
+          *frame_over_shoot_limit  = cpi->this_frame_target * 11 / 8;
+          *frame_under_shoot_limit = cpi->this_frame_target * 2 / 8;
+        } else {
+          *frame_over_shoot_limit  = cpi->this_frame_target * 11 / 8;
+          *frame_under_shoot_limit = cpi->this_frame_target * 5 / 8;
+        }
+      }
+    }
+
+    // For very small rate targets where the fractional adjustment
+    // (e.g. * 7/8) may be tiny, make sure there is at least a minimum
+    // range.
+    *frame_over_shoot_limit += 200;
+    *frame_under_shoot_limit -= 200;
+    if (*frame_under_shoot_limit < 0)
+      *frame_under_shoot_limit = 0;
+  }
+}
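+
+/* Example: a key frame with this_frame_target = 8000 bits gets an overshoot
+ * limit of 8000 * 9 / 8 + 200 = 9200 and an undershoot limit of
+ * 8000 * 7 / 8 - 200 = 6800.
+ */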
+
+
+// return of 0 means drop frame
+int vp9_pick_frame_size(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+
+  if (cm->frame_type == KEY_FRAME)
+    calc_iframe_target_size(cpi);
+  else
+    calc_pframe_target_size(cpi);
+
+  return 1;
+}
--- /dev/null
+++ b/vp9/encoder/ratectrl.h
@@ -1,0 +1,37 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#if !defined __INC_RATECTRL_H
+#define __INC_RATECTRL_H
+
+#include "onyx_int.h"
+
+#define FRAME_OVERHEAD_BITS 200
+
+extern void vp9_save_coding_context(VP9_COMP *cpi);
+extern void vp9_restore_coding_context(VP9_COMP *cpi);
+
+extern void vp9_setup_key_frame(VP9_COMP *cpi);
+extern void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var);
+extern int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame);
+extern void vp9_adjust_key_frame_context(VP9_COMP *cpi);
+extern void vp9_compute_frame_size_bounds(VP9_COMP *cpi,
+                                          int *frame_under_shoot_limit,
+                                          int *frame_over_shoot_limit);
+
+// return of 0 means drop frame
+extern int vp9_pick_frame_size(VP9_COMP *cpi);
+
+extern double vp9_convert_qindex_to_q(int qindex);
+extern int vp9_gfboost_qadjust(int qindex);
+extern int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex);
+void vp9_setup_inter_frame(VP9_COMP *cpi);
+
+#endif
--- /dev/null
+++ b/vp9/encoder/rdopt.c
@@ -1,0 +1,4854 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdio.h>
+#include <math.h>
+#include <limits.h>
+#include <assert.h>
+#include "vp9/common/pragmas.h"
+
+#include "tokenize.h"
+#include "treewriter.h"
+#include "onyx_int.h"
+#include "modecosts.h"
+#include "encodeintra.h"
+#include "vp9/common/entropymode.h"
+#include "vp9/common/reconinter.h"
+#include "vp9/common/reconintra.h"
+#include "vp9/common/reconintra4x4.h"
+#include "vp9/common/findnearmv.h"
+#include "vp9/common/quant_common.h"
+#include "encodemb.h"
+#include "quantize.h"
+#include "vp9/common/idct.h"
+#include "variance.h"
+#include "mcomp.h"
+#include "rdopt.h"
+#include "ratectrl.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp9/common/systemdependent.h"
+#include "vp9/encoder/encodemv.h"
+
+#include "vp9/common/seg_common.h"
+#include "vp9/common/pred_common.h"
+#include "vp9/common/entropy.h"
+#include "vpx_rtcd.h"
+#if CONFIG_NEWBESTREFMV
+#include "vp9/common/mvref_common.h"
+#endif
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define IF_RTCD(x)  (x)
+#else
+#define IF_RTCD(x)  NULL
+#endif
+
+extern void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x);
+extern void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x);
+
+#define MAXF(a,b)            (((a) > (b)) ? (a) : (b))
+
+#define INVALID_MV 0x80008000
+
+/* Factor to weigh the rate for switchable interp filters */
+#define SWITCHABLE_INTERP_RATE_FACTOR 1
+
+static const int auto_speed_thresh[17] = {
+  1000,
+  200,
+  150,
+  130,
+  150,
+  125,
+  120,
+  115,
+  115,
+  115,
+  115,
+  115,
+  115,
+  115,
+  115,
+  115,
+  105
+};
+
+#if CONFIG_PRED_FILTER
+const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
+  {ZEROMV,    LAST_FRAME,   0,  0},
+  {ZEROMV,    LAST_FRAME,   0,  1},
+  {DC_PRED,   INTRA_FRAME,  0,  0},
+
+  {NEARESTMV, LAST_FRAME,   0,  0},
+  {NEARESTMV, LAST_FRAME,   0,  1},
+  {NEARMV,    LAST_FRAME,   0,  0},
+  {NEARMV,    LAST_FRAME,   0,  1},
+
+  {ZEROMV,    GOLDEN_FRAME, 0,  0},
+  {ZEROMV,    GOLDEN_FRAME, 0,  1},
+  {NEARESTMV, GOLDEN_FRAME, 0,  0},
+  {NEARESTMV, GOLDEN_FRAME, 0,  1},
+
+  {ZEROMV,    ALTREF_FRAME, 0,  0},
+  {ZEROMV,    ALTREF_FRAME, 0,  1},
+  {NEARESTMV, ALTREF_FRAME, 0,  0},
+  {NEARESTMV, ALTREF_FRAME, 0,  1},
+
+  {NEARMV,    GOLDEN_FRAME, 0,  0},
+  {NEARMV,    GOLDEN_FRAME, 0,  1},
+  {NEARMV,    ALTREF_FRAME, 0,  0},
+  {NEARMV,    ALTREF_FRAME, 0,  1},
+
+  {V_PRED,    INTRA_FRAME,  0,  0},
+  {H_PRED,    INTRA_FRAME,  0,  0},
+  {D45_PRED,  INTRA_FRAME,  0,  0},
+  {D135_PRED, INTRA_FRAME,  0,  0},
+  {D117_PRED, INTRA_FRAME,  0,  0},
+  {D153_PRED, INTRA_FRAME,  0,  0},
+  {D27_PRED,  INTRA_FRAME,  0,  0},
+  {D63_PRED,  INTRA_FRAME,  0,  0},
+
+  {TM_PRED,   INTRA_FRAME,  0,  0},
+
+  {NEWMV,     LAST_FRAME,   0,  0},
+  {NEWMV,     LAST_FRAME,   0,  1},
+  {NEWMV,     GOLDEN_FRAME, 0,  0},
+  {NEWMV,     GOLDEN_FRAME, 0,  1},
+  {NEWMV,     ALTREF_FRAME, 0,  0},
+  {NEWMV,     ALTREF_FRAME, 0,  1},
+
+  {SPLITMV,   LAST_FRAME,   0,  0},
+  {SPLITMV,   GOLDEN_FRAME, 0,  0},
+  {SPLITMV,   ALTREF_FRAME, 0,  0},
+
+  {B_PRED,    INTRA_FRAME,  0,  0},
+  {I8X8_PRED, INTRA_FRAME,  0,  0},
+
+  /* compound prediction modes */
+  {ZEROMV,    LAST_FRAME,   GOLDEN_FRAME, 0},
+  {NEARESTMV, LAST_FRAME,   GOLDEN_FRAME, 0},
+  {NEARMV,    LAST_FRAME,   GOLDEN_FRAME, 0},
+
+  {ZEROMV,    ALTREF_FRAME, LAST_FRAME,   0},
+  {NEARESTMV, ALTREF_FRAME, LAST_FRAME,   0},
+  {NEARMV,    ALTREF_FRAME, LAST_FRAME,   0},
+
+  {ZEROMV,    GOLDEN_FRAME, ALTREF_FRAME, 0},
+  {NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME, 0},
+  {NEARMV,    GOLDEN_FRAME, ALTREF_FRAME, 0},
+
+  {NEWMV,     LAST_FRAME,   GOLDEN_FRAME, 0},
+  {NEWMV,     ALTREF_FRAME, LAST_FRAME,   0},
+  {NEWMV,     GOLDEN_FRAME, ALTREF_FRAME, 0},
+
+  {SPLITMV,   LAST_FRAME,   GOLDEN_FRAME, 0},
+  {SPLITMV,   ALTREF_FRAME, LAST_FRAME,   0},
+  {SPLITMV,   GOLDEN_FRAME, ALTREF_FRAME, 0}
+};
+#else
+const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
+  {ZEROMV,    LAST_FRAME,   0},
+  {DC_PRED,   INTRA_FRAME,  0},
+
+  {NEARESTMV, LAST_FRAME,   0},
+  {NEARMV,    LAST_FRAME,   0},
+
+  {ZEROMV,    GOLDEN_FRAME, 0},
+  {NEARESTMV, GOLDEN_FRAME, 0},
+
+  {ZEROMV,    ALTREF_FRAME, 0},
+  {NEARESTMV, ALTREF_FRAME, 0},
+
+  {NEARMV,    GOLDEN_FRAME, 0},
+  {NEARMV,    ALTREF_FRAME, 0},
+
+  {V_PRED,    INTRA_FRAME,  0},
+  {H_PRED,    INTRA_FRAME,  0},
+  {D45_PRED,  INTRA_FRAME,  0},
+  {D135_PRED, INTRA_FRAME,  0},
+  {D117_PRED, INTRA_FRAME,  0},
+  {D153_PRED, INTRA_FRAME,  0},
+  {D27_PRED,  INTRA_FRAME,  0},
+  {D63_PRED,  INTRA_FRAME,  0},
+
+  {TM_PRED,   INTRA_FRAME,  0},
+
+  {NEWMV,     LAST_FRAME,   0},
+  {NEWMV,     GOLDEN_FRAME, 0},
+  {NEWMV,     ALTREF_FRAME, 0},
+
+  {SPLITMV,   LAST_FRAME,   0},
+  {SPLITMV,   GOLDEN_FRAME, 0},
+  {SPLITMV,   ALTREF_FRAME, 0},
+
+  {B_PRED,    INTRA_FRAME,  0},
+  {I8X8_PRED, INTRA_FRAME,  0},
+
+  /* compound prediction modes */
+  {ZEROMV,    LAST_FRAME,   GOLDEN_FRAME},
+  {NEARESTMV, LAST_FRAME,   GOLDEN_FRAME},
+  {NEARMV,    LAST_FRAME,   GOLDEN_FRAME},
+
+  {ZEROMV,    ALTREF_FRAME, LAST_FRAME},
+  {NEARESTMV, ALTREF_FRAME, LAST_FRAME},
+  {NEARMV,    ALTREF_FRAME, LAST_FRAME},
+
+  {ZEROMV,    GOLDEN_FRAME, ALTREF_FRAME},
+  {NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME},
+  {NEARMV,    GOLDEN_FRAME, ALTREF_FRAME},
+
+  {NEWMV,     LAST_FRAME,   GOLDEN_FRAME},
+  {NEWMV,     ALTREF_FRAME, LAST_FRAME  },
+  {NEWMV,     GOLDEN_FRAME, ALTREF_FRAME},
+
+  {SPLITMV,   LAST_FRAME,   GOLDEN_FRAME},
+  {SPLITMV,   ALTREF_FRAME, LAST_FRAME  },
+  {SPLITMV,   GOLDEN_FRAME, ALTREF_FRAME}
+};
+#endif
+
+static void fill_token_costs(
+  unsigned int (*c)[COEF_BANDS][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS],
+  const vp9_prob(*p)[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES],
+  int block_type_counts) {
+  int i, j, k;
+
+  for (i = 0; i < block_type_counts; i++)
+    for (j = 0; j < COEF_BANDS; j++)
+      for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
+        if (k == 0 && ((j > 0 && i > 0) || (j > 1 && i == 0)))
+          vp9_cost_tokens_skip((int *)(c[i][j][k]),
+                               p[i][j][k],
+                               vp9_coef_tree);
+        else
+          vp9_cost_tokens((int *)(c[i][j][k]),
+                          p[i][j][k],
+                          vp9_coef_tree);
+      }
+}
+
+
+static int rd_iifactor[32] =  { 4, 4, 3, 2, 1, 0, 0, 0,
+                                0, 0, 0, 0, 0, 0, 0, 0,
+                                0, 0, 0, 0, 0, 0, 0, 0,
+                                0, 0, 0, 0, 0, 0, 0, 0, };
+
+// 3* dc_qlookup[Q]*dc_qlookup[Q];
+
+/* values are now correlated to quantizer */
+static int sad_per_bit16lut[QINDEX_RANGE];
+static int sad_per_bit4lut[QINDEX_RANGE];
+
+void vp9_init_me_luts() {
+  int i;
+
+  // Initialize the sad lut tables using a formulaic calculation for now
+  // This is to make it easier to resolve the impact of experimental changes
+  // to the quantizer tables.
+  for (i = 0; i < QINDEX_RANGE; i++) {
+    sad_per_bit16lut[i] =
+      (int)((0.0418 * vp9_convert_qindex_to_q(i)) + 2.4107);
+    sad_per_bit4lut[i] = (int)((0.063 * vp9_convert_qindex_to_q(i)) + 2.742);
+  }
+}
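+
+/* Example: at a qindex where vp9_convert_qindex_to_q() returns 20.0, the
+ * tables come out as sadperbit16 = (int)(0.0418 * 20 + 2.4107) = 3 and
+ * sadperbit4 = (int)(0.063 * 20 + 2.742) = 4.
+ */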
+
+static int compute_rd_mult(int qindex) {
+  int q;
+
+  q = vp9_dc_quant(qindex, 0);
+  return (11 * q * q) >> 6;
+}
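+
+/* Example: with a DC quantizer step of, say, 40, compute_rd_mult() returns
+ * (11 * 40 * 40) >> 6 = 17600 >> 6 = 275; the RD lambda thus grows roughly
+ * with the square of the quantizer step.
+ */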
+
+void vp9_initialize_me_consts(VP9_COMP *cpi, int QIndex) {
+  cpi->mb.sadperbit16 =  sad_per_bit16lut[QIndex];
+  cpi->mb.sadperbit4  =  sad_per_bit4lut[QIndex];
+}
+
+
+void vp9_initialize_rd_consts(VP9_COMP *cpi, int QIndex) {
+  int q, i;
+
+  vp9_clear_system_state();  // __asm emms;
+
+  // Further tests required to see if optimum is different
+  // for key frames, golden frames and arf frames.
+  // if (cpi->common.refresh_golden_frame ||
+  //     cpi->common.refresh_alt_ref_frame)
+  QIndex = (QIndex < 0) ? 0 : ((QIndex > MAXQ) ? MAXQ : QIndex);
+
+  cpi->RDMULT = compute_rd_mult(QIndex);
+
+  // Extend rate multiplier along side quantizer zbin increases
+  if (cpi->zbin_over_quant  > 0) {
+    double oq_factor;
+
+    // Experimental code using the same basic equation as used for Q above
+    // The units of cpi->zbin_over_quant are 1/128 of Q bin size
+    oq_factor = 1.0 + ((double)0.0015625 * cpi->zbin_over_quant);
+    cpi->RDMULT = (int)((double)cpi->RDMULT * oq_factor * oq_factor);
+  }
+
+  if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
+    if (cpi->twopass.next_iiratio > 31)
+      cpi->RDMULT += (cpi->RDMULT * rd_iifactor[31]) >> 4;
+    else
+      cpi->RDMULT +=
+        (cpi->RDMULT * rd_iifactor[cpi->twopass.next_iiratio]) >> 4;
+  }
+
+  if (cpi->RDMULT < 7)
+    cpi->RDMULT = 7;
+
+  cpi->mb.errorperbit = (cpi->RDMULT / 110);
+  cpi->mb.errorperbit += (cpi->mb.errorperbit == 0);
+
+  vp9_set_speed_features(cpi);
+
+  q = (int)pow(vp9_dc_quant(QIndex, 0) >> 2, 1.25);
+  q = q << 2;
+  cpi->RDMULT = cpi->RDMULT << 4;
+
+  if (q < 8)
+    q = 8;
+
+  if (cpi->RDMULT > 1000) {
+    cpi->RDDIV = 1;
+    cpi->RDMULT /= 100;
+
+    for (i = 0; i < MAX_MODES; i++) {
+      if (cpi->sf.thresh_mult[i] < INT_MAX) {
+        cpi->rd_threshes[i] = cpi->sf.thresh_mult[i] * q / 100;
+      } else {
+        cpi->rd_threshes[i] = INT_MAX;
+      }
+
+      cpi->rd_baseline_thresh[i] = cpi->rd_threshes[i];
+    }
+  } else {
+    cpi->RDDIV = 100;
+
+    for (i = 0; i < MAX_MODES; i++) {
+      if (cpi->sf.thresh_mult[i] < (INT_MAX / q)) {
+        cpi->rd_threshes[i] = cpi->sf.thresh_mult[i] * q;
+      } else {
+        cpi->rd_threshes[i] = INT_MAX;
+      }
+
+      cpi->rd_baseline_thresh[i] = cpi->rd_threshes[i];
+    }
+  }
+
+  fill_token_costs(
+    cpi->mb.token_costs[TX_4X4],
+    (const vp9_prob( *)[8][PREV_COEF_CONTEXTS][11]) cpi->common.fc.coef_probs,
+    BLOCK_TYPES);
+  fill_token_costs(
+    cpi->mb.hybrid_token_costs[TX_4X4],
+    (const vp9_prob( *)[8][PREV_COEF_CONTEXTS][11])
+    cpi->common.fc.hybrid_coef_probs,
+    BLOCK_TYPES);
+
+  fill_token_costs(
+    cpi->mb.token_costs[TX_8X8],
+    (const vp9_prob( *)[8][PREV_COEF_CONTEXTS][11]) cpi->common.fc.coef_probs_8x8,
+    BLOCK_TYPES_8X8);
+  fill_token_costs(
+    cpi->mb.hybrid_token_costs[TX_8X8],
+    (const vp9_prob( *)[8][PREV_COEF_CONTEXTS][11])
+    cpi->common.fc.hybrid_coef_probs_8x8,
+    BLOCK_TYPES_8X8);
+
+  fill_token_costs(
+    cpi->mb.token_costs[TX_16X16],
+    (const vp9_prob(*)[8][PREV_COEF_CONTEXTS][11]) cpi->common.fc.coef_probs_16x16,
+    BLOCK_TYPES_16X16);
+  fill_token_costs(
+    cpi->mb.hybrid_token_costs[TX_16X16],
+    (const vp9_prob(*)[8][PREV_COEF_CONTEXTS][11])
+    cpi->common.fc.hybrid_coef_probs_16x16,
+    BLOCK_TYPES_16X16);
+
+  /*rough estimate for costing*/
+  cpi->common.kf_ymode_probs_index = cpi->common.base_qindex >> 4;
+  vp9_init_mode_costs(cpi);
+
+  if (cpi->common.frame_type != KEY_FRAME) {
+    vp9_build_nmv_cost_table(
+        cpi->mb.nmvjointcost,
+        cpi->mb.e_mbd.allow_high_precision_mv ?
+        cpi->mb.nmvcost_hp : cpi->mb.nmvcost,
+        &cpi->common.fc.nmvc,
+        cpi->mb.e_mbd.allow_high_precision_mv, 1, 1);
+  }
+}
+
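+// Adapt the speed setting so that the measured per-frame encode and mode
+// pick times stay within the time budget derived from the frame rate and
+// the cpu_used setting.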
+void vp9_auto_select_speed(VP9_COMP *cpi) {
+  int milliseconds_for_compress = (int)(1000000 / cpi->oxcf.frame_rate);
+
+  milliseconds_for_compress =
+      milliseconds_for_compress * (16 - cpi->oxcf.cpu_used) / 16;
+
+  /*
+  // this is done during parameter valid check
+  if( cpi->oxcf.cpu_used > 16)
+      cpi->oxcf.cpu_used = 16;
+  if( cpi->oxcf.cpu_used < -16)
+      cpi->oxcf.cpu_used = -16;
+  */
+
+  if (cpi->avg_pick_mode_time < milliseconds_for_compress &&
+      (cpi->avg_encode_time - cpi->avg_pick_mode_time) <
+      milliseconds_for_compress) {
+    if (cpi->avg_pick_mode_time == 0) {
+      cpi->Speed = 4;
+    } else {
+      if (milliseconds_for_compress * 100 < cpi->avg_encode_time * 95) {
+        cpi->Speed          += 2;
+        cpi->avg_pick_mode_time = 0;
+        cpi->avg_encode_time = 0;
+
+        if (cpi->Speed > 16) {
+          cpi->Speed = 16;
+        }
+      }
+
+      if (milliseconds_for_compress * 100 >
+          cpi->avg_encode_time * auto_speed_thresh[cpi->Speed]) {
+        cpi->Speed          -= 1;
+        cpi->avg_pick_mode_time = 0;
+        cpi->avg_encode_time = 0;
+
+        // In real-time mode, cpi->Speed is in [4, 16].
+        if (cpi->Speed < 4) {
+          cpi->Speed = 4;
+        }
+      }
+    }
+  } else {
+    cpi->Speed += 4;
+
+    if (cpi->Speed > 16)
+      cpi->Speed = 16;
+
+    cpi->avg_pick_mode_time = 0;
+    cpi->avg_encode_time = 0;
+  }
+}
+
+int vp9_block_error_c(short *coeff, short *dqcoeff, int block_size) {
+  int i, error = 0;
+
+  for (i = 0; i < block_size; i++) {
+    int this_diff = coeff[i] - dqcoeff[i];
+    error += this_diff * this_diff;
+  }
+
+  return error;
+}
+
+int vp9_mbblock_error_c(MACROBLOCK *mb, int dc) {
+  BLOCK  *be;
+  BLOCKD *bd;
+  int i, j;
+  int berror, error = 0;
+
+  for (i = 0; i < 16; i++) {
+    be = &mb->block[i];
+    bd = &mb->e_mbd.block[i];
+
+    berror = 0;
+
+    for (j = dc; j < 16; j++) {
+      int this_diff = be->coeff[j] - bd->dqcoeff[j];
+      berror += this_diff * this_diff;
+    }
+
+    error += berror;
+  }
+
+  return error;
+}
+
+int vp9_mbuverror_c(MACROBLOCK *mb) {
+  BLOCK  *be;
+  BLOCKD *bd;
+
+  int i, error = 0;
+
+  for (i = 16; i < 24; i++) {
+    be = &mb->block[i];
+    bd = &mb->e_mbd.block[i];
+
+    error += vp9_block_error_c(be->coeff, bd->dqcoeff, 16);
+  }
+
+  return error;
+}
+
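+// Estimate the chroma (U/V) SSE for the current 16x16 motion vector: the
+// luma MV is scaled to half resolution, rounding away from zero, and the
+// two 8x8 chroma blocks are compared using (sub-pixel) variance.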
+int vp9_uvsse(MACROBLOCK *x) {
+  unsigned char *uptr, *vptr;
+  unsigned char *upred_ptr = (*(x->block[16].base_src) + x->block[16].src);
+  unsigned char *vpred_ptr = (*(x->block[20].base_src) + x->block[20].src);
+  int uv_stride = x->block[16].src_stride;
+
+  unsigned int sse1 = 0;
+  unsigned int sse2 = 0;
+  int mv_row = x->e_mbd.mode_info_context->mbmi.mv[0].as_mv.row;
+  int mv_col = x->e_mbd.mode_info_context->mbmi.mv[0].as_mv.col;
+  int offset;
+  int pre_stride = x->e_mbd.block[16].pre_stride;
+
+  if (mv_row < 0)
+    mv_row -= 1;
+  else
+    mv_row += 1;
+
+  if (mv_col < 0)
+    mv_col -= 1;
+  else
+    mv_col += 1;
+
+  mv_row /= 2;
+  mv_col /= 2;
+
+  offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
+  uptr = x->e_mbd.pre.u_buffer + offset;
+  vptr = x->e_mbd.pre.v_buffer + offset;
+
+  if ((mv_row | mv_col) & 7) {
+    vp9_sub_pixel_variance8x8(uptr, pre_stride, (mv_col & 7) << 1,
+                              (mv_row & 7) << 1, upred_ptr, uv_stride, &sse2);
+    vp9_sub_pixel_variance8x8(vptr, pre_stride, (mv_col & 7) << 1,
+                              (mv_row & 7) << 1, vpred_ptr, uv_stride, &sse1);
+    sse2 += sse1;
+  } else {
+    vp9_variance8x8(uptr, pre_stride, upred_ptr, uv_stride, &sse2);
+    vp9_variance8x8(vptr, pre_stride, vpred_ptr, uv_stride, &sse1);
+    sse2 += sse1;
+  }
+  return sse2;
+}
+
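+// Token cost of the 2x2 second-order (Y2) block; only four coefficients
+// exist, and the TX_8X8 token cost tables are reused for them.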
+static int cost_coeffs_2x2(MACROBLOCK *mb,
+                           BLOCKD *b, PLANE_TYPE type,
+                           ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) {
+  int c = (type == PLANE_TYPE_Y_NO_DC); /* start at coef 0, unless Y with Y2 */
+  int eob = b->eob;
+  int pt;    /* surrounding block/prev coef predictor */
+  int cost = 0;
+  short *qcoeff_ptr = b->qcoeff;
+
+  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+  assert(eob <= 4);
+
+  for (; c < eob; c++) {
+    int v = qcoeff_ptr[vp9_default_zig_zag1d[c]];
+    int t = vp9_dct_value_tokens_ptr[v].Token;
+    cost += mb->token_costs[TX_8X8][type][vp9_coef_bands[c]][pt][t];
+    cost += vp9_dct_value_cost_ptr[v];
+    pt = vp9_prev_token_class[t];
+  }
+
+  if (c < 4)
+    cost += mb->token_costs[TX_8X8][type][vp9_coef_bands[c]]
+            [pt][DCT_EOB_TOKEN];
+
+  pt = (c != !type); // 0 if the eob is at the first coefficient
+  *a = *l = pt;
+  return cost;
+}
+
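+// Generic coefficient token cost for a single block.  The scan order and
+// band tables depend on the transform size; for luma blocks a hybrid
+// (ADST-based) transform may switch to a row or column scan, and a
+// segment-level EOB limit, when active, caps where the EOB token is
+// charged.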
+static int cost_coeffs(MACROBLOCK *mb, BLOCKD *b, PLANE_TYPE type,
+                       ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
+                       int tx_size) {
+  const int eob = b->eob;
+  int c = (type == PLANE_TYPE_Y_NO_DC); /* start at coef 0, unless Y with Y2 */
+  int cost = 0, default_eob, seg_eob;
+  int pt;                     /* surrounding block/prev coef predictor */
+  int const *scan, *band;
+  short *qcoeff_ptr = b->qcoeff;
+  MACROBLOCKD *xd = &mb->e_mbd;
+  MB_MODE_INFO *mbmi = &mb->e_mbd.mode_info_context->mbmi;
+  TX_TYPE tx_type = DCT_DCT;
+  int segment_id = mbmi->segment_id;
+
+  switch (tx_size) {
+    case TX_4X4:
+      scan = vp9_default_zig_zag1d;
+      band = vp9_coef_bands;
+      default_eob = 16;
+      if (type == PLANE_TYPE_Y_WITH_DC) {
+        tx_type = get_tx_type_4x4(xd, b);
+        if (tx_type != DCT_DCT) {
+          switch (tx_type) {
+            case ADST_DCT:
+              scan = vp9_row_scan;
+              break;
+
+            case DCT_ADST:
+              scan = vp9_col_scan;
+              break;
+
+            default:
+              scan = vp9_default_zig_zag1d;
+              break;
+          }
+        }
+      }
+
+      break;
+    case TX_8X8:
+      scan = vp9_default_zig_zag1d_8x8;
+      band = vp9_coef_bands_8x8;
+      default_eob = 64;
+      if (type == PLANE_TYPE_Y_WITH_DC) {
+        BLOCKD *bb;
+        int ib = (b - xd->block);
+        if (ib < 16) {
+          ib = (ib & 8) + ((ib & 4) >> 1);
+          bb = xd->block + ib;
+          tx_type = get_tx_type_8x8(xd, bb);
+        }
+      }
+      break;
+    case TX_16X16:
+      scan = vp9_default_zig_zag1d_16x16;
+      band = vp9_coef_bands_16x16;
+      default_eob = 256;
+      if (type == PLANE_TYPE_Y_WITH_DC) {
+        tx_type = get_tx_type_16x16(xd, b);
+      }
+      break;
+    default:
+      break;
+  }
+  if (vp9_segfeature_active(&mb->e_mbd, segment_id, SEG_LVL_EOB))
+    seg_eob = vp9_get_segdata(&mb->e_mbd, segment_id, SEG_LVL_EOB);
+  else
+    seg_eob = default_eob;
+
+  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+
+  if (tx_type != DCT_DCT) {
+    for (; c < eob; c++) {
+      int v = qcoeff_ptr[scan[c]];
+      int t = vp9_dct_value_tokens_ptr[v].Token;
+      cost += mb->hybrid_token_costs[tx_size][type][band[c]][pt][t];
+      cost += vp9_dct_value_cost_ptr[v];
+      pt = vp9_prev_token_class[t];
+    }
+    if (c < seg_eob)
+      cost += mb->hybrid_token_costs[tx_size][type][band[c]]
+          [pt][DCT_EOB_TOKEN];
+  } else {
+    for (; c < eob; c++) {
+      int v = qcoeff_ptr[scan[c]];
+      int t = vp9_dct_value_tokens_ptr[v].Token;
+      cost += mb->token_costs[tx_size][type][band[c]][pt][t];
+      cost += vp9_dct_value_cost_ptr[v];
+      pt = vp9_prev_token_class[t];
+    }
+    if (c < seg_eob)
+      cost += mb->token_costs[tx_size][type][band[c]]
+          [pt][DCT_EOB_TOKEN];
+  }
+
+  pt = (c != !type); // 0 if the eob is at the first coefficient
+  *a = *l = pt;
+  return cost;
+}
+
+static int rdcost_mby_4x4(MACROBLOCK *mb) {
+  int cost = 0;
+  int b;
+  MACROBLOCKD *xd = &mb->e_mbd;
+  ENTROPY_CONTEXT_PLANES t_above, t_left;
+  ENTROPY_CONTEXT *ta;
+  ENTROPY_CONTEXT *tl;
+
+  vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+  ta = (ENTROPY_CONTEXT *)&t_above;
+  tl = (ENTROPY_CONTEXT *)&t_left;
+
+  for (b = 0; b < 16; b++)
+    cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_Y_NO_DC,
+                        ta + vp9_block2above[b], tl + vp9_block2left[b],
+                        TX_4X4);
+
+  cost += cost_coeffs(mb, xd->block + 24, PLANE_TYPE_Y2,
+                      ta + vp9_block2above[24], tl + vp9_block2left[24],
+                      TX_4X4);
+
+  return cost;
+}
+
+static void macro_block_yrd_4x4(MACROBLOCK *mb,
+                                int *Rate,
+                                int *Distortion,
+                                const VP9_ENCODER_RTCD *rtcd,
+                                int *skippable) {
+  int b;
+  MACROBLOCKD *const xd = &mb->e_mbd;
+  BLOCK   *const mb_y2 = mb->block + 24;
+  BLOCKD *const x_y2  = xd->block + 24;
+  short *Y2DCPtr = mb_y2->src_diff;
+  BLOCK *beptr;
+  int d;
+
+  vp9_subtract_mby(mb->src_diff, *(mb->block[0].base_src), xd->predictor,
+                   mb->block[0].src_stride);
+
+  // FDCT and build the 2nd order block
+  for (beptr = mb->block; beptr < mb->block + 16; beptr += 2) {
+    mb->vp9_short_fdct8x4(beptr->src_diff, beptr->coeff, 32);
+    *Y2DCPtr++ = beptr->coeff[0];
+    *Y2DCPtr++ = beptr->coeff[16];
+  }
+
+  // 2nd order fdct
+  mb->short_walsh4x4(mb_y2->src_diff, mb_y2->coeff, 8);
+
+  // Quantization
+  for (b = 0; b < 16; b++) {
+    mb->quantize_b_4x4(&mb->block[b], &xd->block[b]);
+  }
+
+  // DC prediction and quantization of the 2nd order block
+  mb->quantize_b_4x4(mb_y2, x_y2);
+
+  // Distortion
+  d = vp9_mbblock_error(mb, 1);
+
+  d += vp9_block_error(mb_y2->coeff, x_y2->dqcoeff, 16);
+
+  *Distortion = (d >> 2);
+  // rate
+  *Rate = rdcost_mby_4x4(mb);
+  *skippable = vp9_mby_is_skippable_4x4(&mb->e_mbd, 1);
+}
+
+static int rdcost_mby_8x8(MACROBLOCK *mb, int backup) {
+  int cost = 0;
+  int b;
+  MACROBLOCKD *xd = &mb->e_mbd;
+  ENTROPY_CONTEXT_PLANES t_above, t_left;
+  ENTROPY_CONTEXT *ta;
+  ENTROPY_CONTEXT *tl;
+
+  if (backup) {
+    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+    ta = (ENTROPY_CONTEXT *)&t_above;
+    tl = (ENTROPY_CONTEXT *)&t_left;
+  } else {
+    ta = (ENTROPY_CONTEXT *)mb->e_mbd.above_context;
+    tl = (ENTROPY_CONTEXT *)mb->e_mbd.left_context;
+  }
+
+  for (b = 0; b < 16; b += 4)
+    cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_Y_NO_DC,
+                        ta + vp9_block2above_8x8[b], tl + vp9_block2left_8x8[b],
+                        TX_8X8);
+
+  cost += cost_coeffs_2x2(mb, xd->block + 24, PLANE_TYPE_Y2,
+                          ta + vp9_block2above[24], tl + vp9_block2left[24]);
+  return cost;
+}
+
+static void macro_block_yrd_8x8(MACROBLOCK *mb,
+                                int *Rate,
+                                int *Distortion,
+                                const VP9_ENCODER_RTCD *rtcd,
+                                int *skippable) {
+  MACROBLOCKD *const xd = &mb->e_mbd;
+  BLOCK   *const mb_y2 = mb->block + 24;
+  BLOCKD *const x_y2  = xd->block + 24;
+  int d;
+
+  vp9_subtract_mby(mb->src_diff, *(mb->block[0].base_src), xd->predictor,
+                   mb->block[0].src_stride);
+
+  vp9_transform_mby_8x8(mb);
+  vp9_quantize_mby_8x8(mb);
+
+  /* remove 1st order dc to properly combine 1st/2nd order distortion */
+  mb->coeff[0] = 0;
+  mb->coeff[64] = 0;
+  mb->coeff[128] = 0;
+  mb->coeff[192] = 0;
+  xd->dqcoeff[0] = 0;
+  xd->dqcoeff[64] = 0;
+  xd->dqcoeff[128] = 0;
+  xd->dqcoeff[192] = 0;
+
+  d = vp9_mbblock_error(mb, 0);
+  d += vp9_block_error(mb_y2->coeff, x_y2->dqcoeff, 16);
+
+  *Distortion = (d >> 2);
+  // rate
+  *Rate = rdcost_mby_8x8(mb, 1);
+  *skippable = vp9_mby_is_skippable_8x8(&mb->e_mbd, 1);
+}
+
+static int rdcost_mby_16x16(MACROBLOCK *mb) {
+  int cost;
+  MACROBLOCKD *xd = &mb->e_mbd;
+  ENTROPY_CONTEXT_PLANES t_above, t_left;
+  ENTROPY_CONTEXT *ta, *tl;
+
+  vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+  ta = (ENTROPY_CONTEXT *)&t_above;
+  tl = (ENTROPY_CONTEXT *)&t_left;
+
+  cost = cost_coeffs(mb, xd->block, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_16X16);
+  return cost;
+}
+
+static void macro_block_yrd_16x16(MACROBLOCK *mb, int *Rate, int *Distortion,
+                                  const VP9_ENCODER_RTCD *rtcd, int *skippable) {
+  int d;
+  MACROBLOCKD *xd = &mb->e_mbd;
+  BLOCKD *b  = &mb->e_mbd.block[0];
+  BLOCK  *be = &mb->block[0];
+  TX_TYPE tx_type;
+
+  vp9_subtract_mby(mb->src_diff, *(mb->block[0].base_src), mb->e_mbd.predictor,
+                   mb->block[0].src_stride);
+
+  tx_type = get_tx_type_16x16(xd, b);
+  if (tx_type != DCT_DCT) {
+    vp9_fht(be->src_diff, 32, be->coeff, tx_type, 16);
+  } else {
+    vp9_transform_mby_16x16(mb);
+  }
+
+  vp9_quantize_mby_16x16(mb);
+  // TODO(jingning) is it possible to quickly determine whether to force
+  //                trailing coefficients to be zero, instead of running trellis
+  //                optimization in the rate-distortion optimization loop?
+  if (mb->e_mbd.mode_info_context->mbmi.mode < I8X8_PRED)
+    vp9_optimize_mby_16x16(mb, rtcd);
+
+  d = vp9_mbblock_error(mb, 0);
+
+  *Distortion = (d >> 2);
+  // rate
+  *Rate = rdcost_mby_16x16(mb);
+  *skippable = vp9_mby_is_skippable_16x16(&mb->e_mbd);
+}
+
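+// Evaluate the luma rate/distortion at all three transform sizes, select
+// the size dictated by txfm_mode (or the best of the three under
+// TX_MODE_SELECT), and record the per-mode RD costs in txfm_cache.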
+static void macro_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+                            int *distortion, int *skippable,
+                            int64_t txfm_cache[NB_TXFM_MODES]) {
+  VP9_COMMON *cm = &cpi->common;
+  MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi;
+
+  MACROBLOCKD *xd = &x->e_mbd;
+  int can_skip = cm->mb_no_coeff_skip;
+  vp9_prob skip_prob = can_skip ? vp9_get_pred_prob(cm, xd, PRED_MBSKIP) : 128;
+  int s0, s1;
+  int r4x4, r4x4s, r8x8, r8x8s, d4x4, d8x8, s4x4, s8x8;
+  int64_t rd4x4, rd8x8, rd4x4s, rd8x8s;
+  int d16x16, r16x16, r16x16s, s16x16;
+  int64_t rd16x16, rd16x16s;
+
+  // FIXME don't do sub x3
+  if (skip_prob == 0)
+    skip_prob = 1;
+  s0 = vp9_cost_bit(skip_prob, 0);
+  s1 = vp9_cost_bit(skip_prob, 1);
+  macro_block_yrd_16x16(x, &r16x16, &d16x16, IF_RTCD(&cpi->rtcd), &s16x16);
+  if (can_skip) {
+    if (s16x16) {
+      rd16x16 = RDCOST(x->rdmult, x->rddiv, s1, d16x16);
+    } else {
+      rd16x16 = RDCOST(x->rdmult, x->rddiv, r16x16 + s0, d16x16);
+    }
+  } else {
+    rd16x16 = RDCOST(x->rdmult, x->rddiv, r16x16, d16x16);
+  }
+  r16x16s = r16x16 + vp9_cost_one(cm->prob_tx[0]) +
+            vp9_cost_one(cm->prob_tx[1]);
+  if (can_skip) {
+    if (s16x16) {
+      rd16x16s = RDCOST(x->rdmult, x->rddiv, s1, d16x16);
+    } else {
+      rd16x16s = RDCOST(x->rdmult, x->rddiv, r16x16s + s0, d16x16);
+    }
+  } else {
+    rd16x16s = RDCOST(x->rdmult, x->rddiv, r16x16s, d16x16);
+  }
+  macro_block_yrd_8x8(x, &r8x8, &d8x8, IF_RTCD(&cpi->rtcd), &s8x8);
+  if (can_skip) {
+    if (s8x8) {
+      rd8x8 = RDCOST(x->rdmult, x->rddiv, s1, d8x8);
+    } else {
+      rd8x8 = RDCOST(x->rdmult, x->rddiv, r8x8 + s0, d8x8);
+    }
+  } else {
+    rd8x8 = RDCOST(x->rdmult, x->rddiv, r8x8, d8x8);
+  }
+  r8x8s = r8x8 + vp9_cost_one(cm->prob_tx[0]);
+  r8x8s += vp9_cost_zero(cm->prob_tx[1]);
+  if (can_skip) {
+    if (s8x8) {
+      rd8x8s = RDCOST(x->rdmult, x->rddiv, s1, d8x8);
+    } else {
+      rd8x8s = RDCOST(x->rdmult, x->rddiv, r8x8s + s0, d8x8);
+    }
+  } else {
+    rd8x8s = RDCOST(x->rdmult, x->rddiv, r8x8s, d8x8);
+  }
+  macro_block_yrd_4x4(x, &r4x4, &d4x4, IF_RTCD(&cpi->rtcd), &s4x4);
+  if (can_skip) {
+    if (s4x4) {
+      rd4x4 = RDCOST(x->rdmult, x->rddiv, s1, d4x4);
+    } else {
+      rd4x4 = RDCOST(x->rdmult, x->rddiv, r4x4 + s0, d4x4);
+    }
+  } else {
+    rd4x4 = RDCOST(x->rdmult, x->rddiv, r4x4, d4x4);
+  }
+  r4x4s = r4x4 + vp9_cost_zero(cm->prob_tx[0]);
+  if (can_skip) {
+    if (s4x4) {
+      rd4x4s = RDCOST(x->rdmult, x->rddiv, s1, d4x4);
+    } else {
+      rd4x4s = RDCOST(x->rdmult, x->rddiv, r4x4s + s0, d4x4);
+    }
+  } else {
+    rd4x4s = RDCOST(x->rdmult, x->rddiv, r4x4s, d4x4);
+  }
+
+  if (cpi->common.txfm_mode == ALLOW_16X16 ||
+      (cpi->common.txfm_mode == TX_MODE_SELECT &&
+       rd16x16s < rd8x8s && rd16x16s < rd4x4s)) {
+    mbmi->txfm_size = TX_16X16;
+    *skippable = s16x16;
+    *distortion = d16x16;
+    *rate = (cpi->common.txfm_mode == ALLOW_16X16) ? r16x16 : r16x16s;
+  } else if (cpi->common.txfm_mode == ALLOW_8X8 ||
+             (cpi->common.txfm_mode == TX_MODE_SELECT && rd8x8s < rd4x4s)) {
+    mbmi->txfm_size = TX_8X8;
+    *skippable = s8x8;
+    *distortion = d8x8;
+    *rate = (cpi->common.txfm_mode == ALLOW_8X8) ? r8x8 : r8x8s;
+  } else {
+    assert(cpi->common.txfm_mode == ONLY_4X4 ||
+           (cpi->common.txfm_mode == TX_MODE_SELECT && rd4x4s <= rd8x8s));
+    mbmi->txfm_size = TX_4X4;
+    *skippable = s4x4;
+    *distortion = d4x4;
+    *rate = (cpi->common.txfm_mode == ONLY_4X4) ? r4x4 : r4x4s;
+  }
+
+  txfm_cache[ONLY_4X4] = rd4x4;
+  txfm_cache[ALLOW_8X8] = rd8x8;
+  txfm_cache[ALLOW_16X16] = rd16x16;
+  if (rd16x16s < rd8x8s && rd16x16s < rd4x4s)
+    txfm_cache[TX_MODE_SELECT] = rd16x16s;
+  else
+    txfm_cache[TX_MODE_SELECT] = rd4x4s < rd8x8s ? rd4x4s : rd8x8s;
+}
+
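+// The predictor is a 2d buffer with a stride of 16; copy out the left
+// 4x4 block, one aligned 4-byte word per row.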
+static void copy_predictor(unsigned char *dst, const unsigned char *predictor) {
+  const unsigned int *p = (const unsigned int *)predictor;
+  unsigned int *d = (unsigned int *)dst;
+  d[0] = p[0];
+  d[4] = p[4];
+  d[8] = p[8];
+  d[12] = p[12];
+}
+
+#if CONFIG_SUPERBLOCKS
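+// Luma RD for a 32x32 superblock with the 8x8 transform: each of the four
+// 16x16 quadrants is subtracted, transformed, quantized and costed in
+// turn, advancing the entropy contexts per quadrant and restoring them
+// afterwards.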
+static void super_block_yrd_8x8(MACROBLOCK *x,
+                                int *rate,
+                                int *distortion,
+                                const VP9_ENCODER_RTCD *rtcd, int *skip) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  BLOCK *const by2 = x->block + 24;
+  BLOCKD *const bdy2  = xd->block + 24;
+  int d = 0, r = 0, n;
+  const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer;
+  int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
+  ENTROPY_CONTEXT_PLANES *ta = xd->above_context;
+  ENTROPY_CONTEXT_PLANES *tl = xd->left_context;
+  ENTROPY_CONTEXT_PLANES t_above[2];
+  ENTROPY_CONTEXT_PLANES t_left[2];
+  int skippable = 1;
+
+  vpx_memcpy(t_above, xd->above_context, sizeof(t_above));
+  vpx_memcpy(t_left, xd->left_context, sizeof(t_left));
+
+  for (n = 0; n < 4; n++) {
+    int x_idx = n & 1, y_idx = n >> 1;
+
+    vp9_subtract_mby_s_c(x->src_diff,
+                         src + x_idx * 16 + y_idx * 16 * src_y_stride,
+                         src_y_stride,
+                         dst + x_idx * 16 + y_idx * 16 * dst_y_stride,
+                         dst_y_stride);
+    vp9_transform_mby_8x8(x);
+    vp9_quantize_mby_8x8(x);
+
+    /* remove 1st order dc to properly combine 1st/2nd order distortion */
+    x->coeff[  0] = 0;
+    x->coeff[ 64] = 0;
+    x->coeff[128] = 0;
+    x->coeff[192] = 0;
+    xd->dqcoeff[  0] = 0;
+    xd->dqcoeff[ 64] = 0;
+    xd->dqcoeff[128] = 0;
+    xd->dqcoeff[192] = 0;
+
+    d += vp9_mbblock_error(x, 0);
+    d += vp9_block_error(by2->coeff, bdy2->dqcoeff, 16);
+    xd->above_context = ta + x_idx;
+    xd->left_context = tl + y_idx;
+    r += rdcost_mby_8x8(x, 0);
+    skippable = skippable && vp9_mby_is_skippable_8x8(xd, 1);
+  }
+
+  *distortion = (d >> 2);
+  *rate       = r;
+  if (skip) *skip = skippable;
+  xd->above_context = ta;
+  xd->left_context = tl;
+  vpx_memcpy(xd->above_context, &t_above, sizeof(t_above));
+  vpx_memcpy(xd->left_context, &t_left, sizeof(t_left));
+}
+#endif
+
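+// As copy_predictor(), but for the left 8x8 block: two 4-byte words per
+// row over eight rows of the 16-byte-stride predictor.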
+static void copy_predictor_8x8(unsigned char *dst, const unsigned char *predictor) {
+  const unsigned int *p = (const unsigned int *)predictor;
+  unsigned int *d = (unsigned int *)dst;
+  d[0] = p[0];
+  d[1] = p[1];
+  d[4] = p[4];
+  d[5] = p[5];
+  d[8] = p[8];
+  d[9] = p[9];
+  d[12] = p[12];
+  d[13] = p[13];
+  d[16] = p[16];
+  d[17] = p[17];
+  d[20] = p[20];
+  d[21] = p[21];
+  d[24] = p[24];
+  d[25] = p[25];
+  d[28] = p[28];
+  d[29] = p[29];
+}
+
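+// Exhaustively search the 4x4 intra prediction modes for one block.  The
+// chosen mode also determines the (hybrid) transform type, so forward
+// transform, quantization and token costing are redone for every mode.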
+static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *be,
+                                     BLOCKD *b, B_PREDICTION_MODE *best_mode,
+#if CONFIG_COMP_INTRA_PRED
+                                     B_PREDICTION_MODE *best_second_mode,
+                                     int allow_comp,
+#endif
+                                     int *bmode_costs,
+                                     ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
+                                     int *bestrate, int *bestratey,
+                                     int *bestdistortion) {
+  B_PREDICTION_MODE mode;
+  MACROBLOCKD *xd = &x->e_mbd;
+
+#if CONFIG_COMP_INTRA_PRED
+  B_PREDICTION_MODE mode2;
+#endif
+  int64_t best_rd = INT64_MAX;
+  int rate = 0;
+  int distortion;
+
+  ENTROPY_CONTEXT ta = *a, tempa = *a;
+  ENTROPY_CONTEXT tl = *l, templ = *l;
+  TX_TYPE tx_type = DCT_DCT;
+  TX_TYPE best_tx_type = DCT_DCT;
+  /*
+   * The predictor buffer is a 2d buffer with a stride of 16.  Create
+   * a temp buffer that meets the stride requirements, but we are only
+   * interested in the left 4x4 block
+   */
+  DECLARE_ALIGNED_ARRAY(16, unsigned char,  best_predictor, 16 * 4);
+  DECLARE_ALIGNED_ARRAY(16, short, best_dqcoeff, 16);
+
+  for (mode = B_DC_PRED; mode <= B_HU_PRED; mode++) {
+#if CONFIG_COMP_INTRA_PRED
+    for (mode2 = (allow_comp ? 0 : (B_DC_PRED - 1));
+         mode2 != (allow_comp ? (mode + 1) : 0); mode2++) {
+#endif
+      int64_t this_rd;
+      int ratey;
+
+      b->bmi.as_mode.first = mode;
+      rate = bmode_costs[mode];
+
+#if CONFIG_COMP_INTRA_PRED
+      if (mode2 == (B_PREDICTION_MODE)(B_DC_PRED - 1)) {
+#endif
+        vp9_intra4x4_predict(b, mode, b->predictor);
+#if CONFIG_COMP_INTRA_PRED
+      } else {
+        vp9_comp_intra4x4_predict(b, mode, mode2, b->predictor);
+        rate += bmode_costs[mode2];
+      }
+#endif
+      vp9_subtract_b(be, b, 16);
+
+      b->bmi.as_mode.first = mode;
+      tx_type = get_tx_type_4x4(xd, b);
+      if (tx_type != DCT_DCT) {
+        vp9_fht(be->src_diff, 32, be->coeff, tx_type, 4);
+        vp9_ht_quantize_b_4x4(be, b, tx_type);
+      } else {
+        x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
+        x->quantize_b_4x4(be, b);
+      }
+
+      tempa = ta;
+      templ = tl;
+
+      ratey = cost_coeffs(x, b, PLANE_TYPE_Y_WITH_DC, &tempa, &templ, TX_4X4);
+      rate += ratey;
+      distortion = vp9_block_error(be->coeff, b->dqcoeff, 16) >> 2;
+
+      this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+      if (this_rd < best_rd) {
+        *bestrate = rate;
+        *bestratey = ratey;
+        *bestdistortion = distortion;
+        best_rd = this_rd;
+        *best_mode = mode;
+        best_tx_type = tx_type;
+
+#if CONFIG_COMP_INTRA_PRED
+        *best_second_mode = mode2;
+#endif
+        *a = tempa;
+        *l = templ;
+        copy_predictor(best_predictor, b->predictor);
+        vpx_memcpy(best_dqcoeff, b->dqcoeff, 32);
+      }
+#if CONFIG_COMP_INTRA_PRED
+    }
+#endif
+  }
+  b->bmi.as_mode.first = (B_PREDICTION_MODE)(*best_mode);
+#if CONFIG_COMP_INTRA_PRED
+  b->bmi.as_mode.second = (B_PREDICTION_MODE)(*best_second_mode);
+#endif
+
+  // inverse transform
+  if (best_tx_type != DCT_DCT)
+    vp9_ihtllm_c(best_dqcoeff, b->diff, 32, best_tx_type, 4);
+  else
+    IDCT_INVOKE(IF_RTCD(&cpi->rtcd.common->idct), idct16)(
+        best_dqcoeff, b->diff, 32);
+
+  vp9_recon_b(best_predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+
+  return best_rd;
+}
+
+static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
+                                     int *Rate, int *rate_y,
+                                     int *Distortion, int64_t best_rd,
+#if CONFIG_COMP_INTRA_PRED
+                                     int allow_comp,
+#endif
+                                     int update_contexts) {
+  int i;
+  MACROBLOCKD *const xd = &mb->e_mbd;
+  int cost = mb->mbmode_cost[xd->frame_type][B_PRED];
+  int distortion = 0;
+  int tot_rate_y = 0;
+  int64_t total_rd = 0;
+  ENTROPY_CONTEXT_PLANES t_above, t_left;
+  ENTROPY_CONTEXT *ta, *tl;
+  int *bmode_costs;
+
+  if (update_contexts) {
+    ta = (ENTROPY_CONTEXT *)xd->above_context;
+    tl = (ENTROPY_CONTEXT *)xd->left_context;
+  } else {
+    vpx_memcpy(&t_above, xd->above_context,
+               sizeof(ENTROPY_CONTEXT_PLANES));
+    vpx_memcpy(&t_left, xd->left_context,
+               sizeof(ENTROPY_CONTEXT_PLANES));
+
+    ta = (ENTROPY_CONTEXT *)&t_above;
+    tl = (ENTROPY_CONTEXT *)&t_left;
+  }
+
+  xd->mode_info_context->mbmi.mode = B_PRED;
+  bmode_costs = mb->inter_bmode_costs;
+
+  for (i = 0; i < 16; i++) {
+    MODE_INFO *const mic = xd->mode_info_context;
+    const int mis = xd->mode_info_stride;
+    B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
+#if CONFIG_COMP_INTRA_PRED
+    B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_second_mode);
+#endif
+    int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry), UNINITIALIZED_IS_SAFE(d);
+
+    if (xd->frame_type == KEY_FRAME) {
+      const B_PREDICTION_MODE A = above_block_mode(mic, i, mis);
+      const B_PREDICTION_MODE L = left_block_mode(mic, i);
+
+      bmode_costs  = mb->bmode_costs[A][L];
+    }
+
+    total_rd += rd_pick_intra4x4block(
+                  cpi, mb, mb->block + i, xd->block + i, &best_mode,
+#if CONFIG_COMP_INTRA_PRED
+                  &best_second_mode, allow_comp,
+#endif
+                  bmode_costs, ta + vp9_block2above[i],
+                  tl + vp9_block2left[i], &r, &ry, &d);
+
+    cost += r;
+    distortion += d;
+    tot_rate_y += ry;
+
+    mic->bmi[i].as_mode.first = best_mode;
+#if CONFIG_COMP_INTRA_PRED
+    mic->bmi[i].as_mode.second = best_second_mode;
+#endif
+
+    if (total_rd >= best_rd)
+      break;
+  }
+
+  if (total_rd >= best_rd)
+    return INT64_MAX;
+
+#if CONFIG_COMP_INTRA_PRED
+  cost += vp9_cost_bit(128, allow_comp);
+#endif
+  *Rate = cost;
+  *rate_y += tot_rate_y;
+  *Distortion = distortion;
+
+  return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
+}
+
+#if CONFIG_SUPERBLOCKS
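+// Search the 16x16-style intra modes for a 32x32 superblock luma plane;
+// each candidate mode is evaluated with the 8x8 transform.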
+static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi,
+                                      MACROBLOCK *x,
+                                      int *rate,
+                                      int *rate_tokenonly,
+                                      int *distortion,
+                                      int *skippable) {
+  MB_PREDICTION_MODE mode;
+  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
+  int this_rate, this_rate_tokenonly;
+  int this_distortion, s;
+  int64_t best_rd = INT64_MAX, this_rd;
+
+  /* Y Search for 32x32 intra prediction mode */
+  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+    x->e_mbd.mode_info_context->mbmi.mode = mode;
+    vp9_build_intra_predictors_sby_s(&x->e_mbd);
+
+    super_block_yrd_8x8(x, &this_rate_tokenonly,
+                        &this_distortion, IF_RTCD(&cpi->rtcd), &s);
+    this_rate = this_rate_tokenonly +
+                x->mbmode_cost[x->e_mbd.frame_type]
+                              [x->e_mbd.mode_info_context->mbmi.mode];
+    this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+
+    if (this_rd < best_rd) {
+      mode_selected   = mode;
+      best_rd         = this_rd;
+      *rate           = this_rate;
+      *rate_tokenonly = this_rate_tokenonly;
+      *distortion     = this_distortion;
+      *skippable      = s;
+    }
+  }
+
+  x->e_mbd.mode_info_context->mbmi.mode = mode_selected;
+
+  return best_rd;
+}
+#endif
+
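+// Search the full 16x16 intra prediction modes; macro_block_yrd() also
+// selects the transform size for each mode, and the per-txfm-mode RD
+// costs are folded into txfm_cache (keeping the minimum).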
+static int64_t rd_pick_intra16x16mby_mode(VP9_COMP *cpi,
+                                          MACROBLOCK *x,
+                                          int *Rate,
+                                          int *rate_y,
+                                          int *Distortion,
+                                          int *skippable,
+                                          int64_t txfm_cache[NB_TXFM_MODES]) {
+  MB_PREDICTION_MODE mode;
+  TX_SIZE txfm_size;
+  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
+#if CONFIG_COMP_INTRA_PRED
+  MB_PREDICTION_MODE mode2;
+  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode2_selected);
+#endif
+  MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi;
+  int rate, ratey;
+  int distortion, skip;
+  int64_t best_rd = INT64_MAX;
+  int64_t this_rd;
+  MACROBLOCKD *xd = &x->e_mbd;
+
+  int i;
+  for (i = 0; i < NB_TXFM_MODES; i++)
+    txfm_cache[i] = INT64_MAX;
+
+  // Y Search for 16x16 intra prediction mode
+  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+    int64_t local_txfm_cache[NB_TXFM_MODES];
+
+    mbmi->mode = mode;
+
+#if CONFIG_COMP_INTRA_PRED
+    for (mode2 = DC_PRED - 1; mode2 != TM_PRED + 1; mode2++) {
+      mbmi->second_mode = mode2;
+      if (mode2 == (MB_PREDICTION_MODE)(DC_PRED - 1)) {
+#endif
+        vp9_build_intra_predictors_mby(&x->e_mbd);
+#if CONFIG_COMP_INTRA_PRED
+      } else {
+        continue; // i.e. disable for now
+        vp9_build_comp_intra_predictors_mby(&x->e_mbd);
+      }
+#endif
+
+      macro_block_yrd(cpi, x, &ratey, &distortion, &skip, local_txfm_cache);
+
+      // FIXME add compoundmode cost
+      // FIXME add rate for mode2
+      rate = ratey + x->mbmode_cost[x->e_mbd.frame_type][mbmi->mode];
+
+      this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+      if (this_rd < best_rd) {
+        mode_selected = mode;
+        txfm_size = mbmi->txfm_size;
+#if CONFIG_COMP_INTRA_PRED
+        mode2_selected = mode2;
+#endif
+        best_rd = this_rd;
+        *Rate = rate;
+        *rate_y = ratey;
+        *Distortion = distortion;
+        *skippable = skip;
+      }
+
+      for (i = 0; i < NB_TXFM_MODES; i++) {
+        int64_t adj_rd = this_rd + local_txfm_cache[i] -
+                          local_txfm_cache[cpi->common.txfm_mode];
+        if (adj_rd < txfm_cache[i]) {
+          txfm_cache[i] = adj_rd;
+        }
+      }
+
+#if CONFIG_COMP_INTRA_PRED
+    }
+#endif
+  }
+
+  mbmi->txfm_size = txfm_size;
+  mbmi->mode = mode_selected;
+
+#if CONFIG_COMP_INTRA_PRED
+  mbmi->second_mode = mode2_selected;
+#endif
+  return best_rd;
+}
+
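+// Search the intra modes for one 8x8 sub-block.  Depending on the
+// macroblock transform size the block is coded either as a single 8x8
+// transform or as four 4x4 transforms.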
+static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
+                                     B_PREDICTION_MODE *best_mode,
+#if CONFIG_COMP_INTRA_PRED
+                                     B_PREDICTION_MODE *best_second_mode,
+#endif
+                                     int *mode_costs,
+                                     ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
+                                     int *bestrate, int *bestratey,
+                                     int *bestdistortion) {
+  MB_PREDICTION_MODE mode;
+#if CONFIG_COMP_INTRA_PRED
+  MB_PREDICTION_MODE mode2;
+#endif
+  MACROBLOCKD *xd = &x->e_mbd;
+  int64_t best_rd = INT64_MAX;
+  int distortion, rate = 0;
+  BLOCK  *be = x->block + ib;
+  BLOCKD *b = xd->block + ib;
+  ENTROPY_CONTEXT ta0, ta1, besta0 = 0, besta1 = 0;
+  ENTROPY_CONTEXT tl0, tl1, bestl0 = 0, bestl1 = 0;
+
+  /*
+   * The predictor buffer is a 2d buffer with a stride of 16.  Create
+   * a temp buffer that meets the stride requirements, but we are only
+   * interested in the left 8x8 block
+   */
+  DECLARE_ALIGNED_ARRAY(16, unsigned char,  best_predictor, 16 * 8);
+  DECLARE_ALIGNED_ARRAY(16, short, best_dqcoeff, 16 * 4);
+
+  // The 8x8 transform uses a different coefficient-block indexing than the
+  // prediction blocks; map this 8x8 prediction block index (0, 2, 8, 10)
+  // to the corresponding coefficient block index (0, 4, 8, 12).
+  int idx = (ib & 0x02) ? (ib + 2) : ib;
+
+  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+#if CONFIG_COMP_INTRA_PRED
+    for (mode2 = DC_PRED - 1; mode2 != TM_PRED + 1; mode2++) {
+#endif
+      int64_t this_rd;
+      int rate_t;
+
+      // FIXME rate for compound mode and second intrapred mode
+      rate = mode_costs[mode];
+      b->bmi.as_mode.first = mode;
+
+#if CONFIG_COMP_INTRA_PRED
+      if (mode2 == (MB_PREDICTION_MODE)(DC_PRED - 1)) {
+#endif
+        vp9_intra8x8_predict(b, mode, b->predictor);
+#if CONFIG_COMP_INTRA_PRED
+      } else {
+        continue; // i.e. disable for now
+        vp9_comp_intra8x8_predict(b, mode, mode2, b->predictor);
+      }
+#endif
+
+      vp9_subtract_4b_c(be, b, 16);
+
+      if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
+        TX_TYPE tx_type = get_tx_type_8x8(xd, b);
+        if (tx_type != DCT_DCT)
+          vp9_fht(be->src_diff, 32, (x->block + idx)->coeff, tx_type, 8);
+        else
+          x->vp9_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32);
+        x->quantize_b_8x8(x->block + idx, xd->block + idx);
+
+        // compute quantization mse of 8x8 block
+        distortion = vp9_block_error_c((x->block + idx)->coeff,
+                                       (xd->block + idx)->dqcoeff, 64);
+        ta0 = a[vp9_block2above_8x8[idx]];
+        tl0 = l[vp9_block2left_8x8[idx]];
+
+        rate_t = cost_coeffs(x, xd->block + idx, PLANE_TYPE_Y_WITH_DC,
+                             &ta0, &tl0, TX_8X8);
+
+        rate += rate_t;
+        ta1 = ta0;
+        tl1 = tl0;
+      } else {
+        x->vp9_short_fdct8x4(be->src_diff, be->coeff, 32);
+        x->vp9_short_fdct8x4((be + 4)->src_diff, (be + 4)->coeff, 32);
+
+        x->quantize_b_4x4_pair(x->block + ib, x->block + ib + 1,
+                               xd->block + ib, xd->block + ib + 1);
+        x->quantize_b_4x4_pair(x->block + ib + 4, x->block + ib + 5,
+                               xd->block + ib + 4, xd->block + ib + 5);
+
+        distortion = vp9_block_error_c((x->block + ib)->coeff,
+                                       (xd->block + ib)->dqcoeff, 16);
+        distortion += vp9_block_error_c((x->block + ib + 1)->coeff,
+                                        (xd->block + ib + 1)->dqcoeff, 16);
+        distortion += vp9_block_error_c((x->block + ib + 4)->coeff,
+                                        (xd->block + ib + 4)->dqcoeff, 16);
+        distortion += vp9_block_error_c((x->block + ib + 5)->coeff,
+                                        (xd->block + ib + 5)->dqcoeff, 16);
+
+        ta0 = a[vp9_block2above[ib]];
+        ta1 = a[vp9_block2above[ib + 1]];
+        tl0 = l[vp9_block2left[ib]];
+        tl1 = l[vp9_block2left[ib + 4]];
+        rate_t = cost_coeffs(x, xd->block + ib, PLANE_TYPE_Y_WITH_DC,
+                             &ta0, &tl0, TX_4X4);
+        rate_t += cost_coeffs(x, xd->block + ib + 1, PLANE_TYPE_Y_WITH_DC,
+                              &ta1, &tl0, TX_4X4);
+        rate_t += cost_coeffs(x, xd->block + ib + 4, PLANE_TYPE_Y_WITH_DC,
+                              &ta0, &tl1, TX_4X4);
+        rate_t += cost_coeffs(x, xd->block + ib + 5, PLANE_TYPE_Y_WITH_DC,
+                              &ta1, &tl1, TX_4X4);
+        rate += rate_t;
+      }
+
+      distortion >>= 2;
+      this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+      if (this_rd < best_rd) {
+        *bestrate = rate;
+        *bestratey = rate_t;
+        *bestdistortion = distortion;
+        besta0 = ta0;
+        besta1 = ta1;
+        bestl0 = tl0;
+        bestl1 = tl1;
+        best_rd = this_rd;
+        *best_mode = mode;
+#if CONFIG_COMP_INTRA_PRED
+        *best_second_mode = mode2;
+#endif
+        copy_predictor_8x8(best_predictor, b->predictor);
+        vpx_memcpy(best_dqcoeff, b->dqcoeff, 64);
+        vpx_memcpy(best_dqcoeff + 32, b->dqcoeff + 64, 64);
+#if CONFIG_COMP_INTRA_PRED
+      }
+#endif
+    }
+  }
+  b->bmi.as_mode.first = (*best_mode);
+#if CONFIG_COMP_INTRA_PRED
+  b->bmi.as_mode.second = (*best_second_mode);
+#endif
+  vp9_encode_intra8x8(IF_RTCD(&cpi->rtcd), x, ib);
+
+  if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
+    a[vp9_block2above_8x8[idx]]     = besta0;
+    a[vp9_block2above_8x8[idx] + 1] = besta1;
+    l[vp9_block2left_8x8[idx]]      = bestl0;
+    l[vp9_block2left_8x8[idx] + 1]  = bestl1;
+  } else {
+    a[vp9_block2above[ib]]     = besta0;
+    a[vp9_block2above[ib + 1]] = besta1;
+    l[vp9_block2left[ib]]      = bestl0;
+    l[vp9_block2left[ib + 4]]  = bestl1;
+  }
+
+  return best_rd;
+}
+
+static int64_t rd_pick_intra8x8mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
+                                         int *Rate, int *rate_y,
+                                         int *Distortion, int64_t best_rd) {
+  MACROBLOCKD *const xd = &mb->e_mbd;
+  int i, ib;
+  int cost = mb->mbmode_cost[xd->frame_type][I8X8_PRED];
+  int distortion = 0;
+  int tot_rate_y = 0;
+  int64_t total_rd = 0;
+  ENTROPY_CONTEXT_PLANES t_above, t_left;
+  ENTROPY_CONTEXT *ta, *tl;
+  int *i8x8mode_costs;
+
+  vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+  ta = (ENTROPY_CONTEXT *)&t_above;
+  tl = (ENTROPY_CONTEXT *)&t_left;
+
+  xd->mode_info_context->mbmi.mode = I8X8_PRED;
+  i8x8mode_costs  = mb->i8x8_mode_costs;
+
+  for (i = 0; i < 4; i++) {
+    MODE_INFO *const mic = xd->mode_info_context;
+    B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
+#if CONFIG_COMP_INTRA_PRED
+    B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_second_mode);
+#endif
+    int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry), UNINITIALIZED_IS_SAFE(d);
+
+    ib = vp9_i8x8_block[i];
+    total_rd += rd_pick_intra8x8block(
+                  cpi, mb, ib, &best_mode,
+#if CONFIG_COMP_INTRA_PRED
+                  &best_second_mode,
+#endif
+                  i8x8mode_costs, ta, tl, &r, &ry, &d);
+    cost += r;
+    distortion += d;
+    tot_rate_y += ry;
+    mic->bmi[ib].as_mode.first = best_mode;
+#if CONFIG_COMP_INTRA_PRED
+    mic->bmi[ib].as_mode.second = best_second_mode;
+#endif
+  }
+  *Rate = cost;
+  *rate_y += tot_rate_y;
+  *Distortion = distortion;
+  return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
+}
+
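+// Token cost of the eight 4x4 chroma blocks (block indices 16..23).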
+static int rd_cost_mbuv(MACROBLOCK *mb) {
+  int b;
+  int cost = 0;
+  MACROBLOCKD *xd = &mb->e_mbd;
+  ENTROPY_CONTEXT_PLANES t_above, t_left;
+  ENTROPY_CONTEXT *ta, *tl;
+
+  vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+  ta = (ENTROPY_CONTEXT *)&t_above;
+  tl = (ENTROPY_CONTEXT *)&t_left;
+
+  for (b = 16; b < 24; b++)
+    cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_UV,
+                        ta + vp9_block2above[b], tl + vp9_block2left[b],
+                        TX_4X4);
+
+  return cost;
+}
+
+static int64_t rd_inter16x16_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+                                int *distortion, int fullpixel, int *skip) {
+  vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
+                    x->e_mbd.predictor, x->src.uv_stride);
+
+  vp9_transform_mbuv_4x4(x);
+  vp9_quantize_mbuv_4x4(x);
+
+  *rate       = rd_cost_mbuv(x);
+  *distortion = vp9_mbuverror(x) / 4;
+  *skip       = vp9_mbuv_is_skippable_4x4(&x->e_mbd);
+
+  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
+}
+
+static int rd_cost_mbuv_8x8(MACROBLOCK *mb, int backup) {
+  int b;
+  int cost = 0;
+  MACROBLOCKD *xd = &mb->e_mbd;
+  ENTROPY_CONTEXT_PLANES t_above, t_left;
+  ENTROPY_CONTEXT *ta, *tl;
+
+  if (backup) {
+    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+    ta = (ENTROPY_CONTEXT *)&t_above;
+    tl = (ENTROPY_CONTEXT *)&t_left;
+  } else {
+    ta = (ENTROPY_CONTEXT *)mb->e_mbd.above_context;
+    tl = (ENTROPY_CONTEXT *)mb->e_mbd.left_context;
+  }
+
+  for (b = 16; b < 24; b += 4)
+    cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_UV,
+                        ta + vp9_block2above_8x8[b],
+                        tl + vp9_block2left_8x8[b], TX_8X8);
+
+  return cost;
+}
+
+#if CONFIG_SUPERBLOCKS
+static int64_t rd_inter32x32_uv_8x8(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+                                    int *distortion, int fullpixel, int *skip) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  int n, r = 0, d = 0;
+  const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
+  const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
+  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
+  int skippable = 1;
+  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
+  ENTROPY_CONTEXT_PLANES *ta = xd->above_context;
+  ENTROPY_CONTEXT_PLANES *tl = xd->left_context;
+
+  memcpy(t_above, xd->above_context, sizeof(t_above));
+  memcpy(t_left, xd->left_context, sizeof(t_left));
+
+  for (n = 0; n < 4; n++) {
+    int x_idx = n & 1, y_idx = n >> 1;
+
+    vp9_subtract_mbuv_s_c(x->src_diff,
+                          usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                          vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                          src_uv_stride,
+                          udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                          vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                          dst_uv_stride);
+
+    vp9_transform_mbuv_8x8(x);
+    vp9_quantize_mbuv_8x8(x);
+
+    xd->above_context = ta + x_idx;
+    xd->left_context = tl + y_idx;
+    r += rd_cost_mbuv_8x8(x, 0);
+    d += vp9_mbuverror(x) / 4;
+    skippable = skippable && vp9_mbuv_is_skippable_8x8(xd);
+  }
+
+  *rate = r;
+  *distortion = d;
+  if (skip) *skip = skippable;
+  xd->left_context = tl;
+  xd->above_context = ta;
+  memcpy(xd->above_context, t_above, sizeof(t_above));
+  memcpy(xd->left_context, t_left, sizeof(t_left));
+
+  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
+}
+#endif
+
+static int64_t rd_inter16x16_uv_8x8(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+                                    int *distortion, int fullpixel, int *skip) {
+  vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
+                    x->e_mbd.predictor, x->src.uv_stride);
+
+  vp9_transform_mbuv_8x8(x);
+  vp9_quantize_mbuv_8x8(x);
+
+  *rate       = rd_cost_mbuv_8x8(x, 1);
+  *distortion = vp9_mbuverror(x) / 4;
+  *skip       = vp9_mbuv_is_skippable_8x8(&x->e_mbd);
+
+  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
+}
+
+static int64_t rd_inter4x4_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+                              int *distortion, int *skippable, int fullpixel) {
+  vp9_build_inter4x4_predictors_mbuv(&x->e_mbd);
+  vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
+                    x->e_mbd.predictor, x->src.uv_stride);
+
+  vp9_transform_mbuv_4x4(x);
+  vp9_quantize_mbuv_4x4(x);
+
+  *rate       = rd_cost_mbuv(x);
+  *distortion = vp9_mbuverror(x) / 4;
+  *skippable  = vp9_mbuv_is_skippable_4x4(&x->e_mbd);
+
+  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
+}
+
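+// Search the chroma intra prediction modes with 4x4 transforms and keep
+// the best mode by RD cost.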
+static void rd_pick_intra_mbuv_mode(VP9_COMP *cpi,
+                                    MACROBLOCK *x,
+                                    int *rate,
+                                    int *rate_tokenonly,
+                                    int *distortion,
+                                    int *skippable) {
+  MB_PREDICTION_MODE mode;
+  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
+#if CONFIG_COMP_INTRA_PRED
+  MB_PREDICTION_MODE mode2;
+  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode2_selected);
+#endif
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi;
+  int64_t best_rd = INT64_MAX;
+  int UNINITIALIZED_IS_SAFE(d), UNINITIALIZED_IS_SAFE(r);
+  int rate_to, UNINITIALIZED_IS_SAFE(skip);
+
+  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+#if CONFIG_COMP_INTRA_PRED
+    for (mode2 = DC_PRED - 1; mode2 != TM_PRED + 1; mode2++) {
+#endif
+      int rate;
+      int distortion;
+      int64_t this_rd;
+
+      mbmi->uv_mode = mode;
+#if CONFIG_COMP_INTRA_PRED
+      mbmi->second_uv_mode = mode2;
+      if (mode2 == (MB_PREDICTION_MODE)(DC_PRED - 1)) {
+#endif
+        vp9_build_intra_predictors_mbuv(&x->e_mbd);
+#if CONFIG_COMP_INTRA_PRED
+      } else {
+        continue;
+        vp9_build_comp_intra_predictors_mbuv(&x->e_mbd);
+      }
+#endif
+
+      vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
+                        x->e_mbd.predictor, x->src.uv_stride);
+      vp9_transform_mbuv_4x4(x);
+      vp9_quantize_mbuv_4x4(x);
+
+      rate_to = rd_cost_mbuv(x);
+      rate = rate_to
+             + x->intra_uv_mode_cost[x->e_mbd.frame_type][mbmi->uv_mode];
+
+      distortion = vp9_mbuverror(x) / 4;
+
+      this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+      if (this_rd < best_rd) {
+        skip = vp9_mbuv_is_skippable_4x4(xd);
+        best_rd = this_rd;
+        d = distortion;
+        r = rate;
+        *rate_tokenonly = rate_to;
+        mode_selected = mode;
+#if CONFIG_COMP_INTRA_PRED
+        mode2_selected = mode2;
+      }
+#endif
+    }
+  }
+
+  *rate = r;
+  *distortion = d;
+  *skippable = skip;
+
+  mbmi->uv_mode = mode_selected;
+#if CONFIG_COMP_INTRA_PRED
+  mbmi->second_uv_mode = mode2_selected;
+#endif
+}
+
+static void rd_pick_intra_mbuv_mode_8x8(VP9_COMP *cpi,
+                                        MACROBLOCK *x,
+                                        int *rate,
+                                        int *rate_tokenonly,
+                                        int *distortion,
+                                        int *skippable) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_PREDICTION_MODE mode;
+  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
+  MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi;
+  int64_t best_rd = INT64_MAX;
+  int UNINITIALIZED_IS_SAFE(d), UNINITIALIZED_IS_SAFE(r);
+  int rate_to, UNINITIALIZED_IS_SAFE(skip);
+
+  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+    int rate;
+    int distortion;
+    int64_t this_rd;
+
+    mbmi->uv_mode = mode;
+    vp9_build_intra_predictors_mbuv(&x->e_mbd);
+    vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
+                      x->e_mbd.predictor, x->src.uv_stride);
+    vp9_transform_mbuv_8x8(x);
+
+    vp9_quantize_mbuv_8x8(x);
+
+    rate_to = rd_cost_mbuv_8x8(x, 1);
+    rate = rate_to + x->intra_uv_mode_cost[x->e_mbd.frame_type][mbmi->uv_mode];
+
+    distortion = vp9_mbuverror(x) / 4;
+    this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+    if (this_rd < best_rd) {
+      skip = vp9_mbuv_is_skippable_8x8(xd);
+      best_rd = this_rd;
+      d = distortion;
+      r = rate;
+      *rate_tokenonly = rate_to;
+      mode_selected = mode;
+    }
+  }
+  *rate = r;
+  *distortion = d;
+  *skippable = skip;
+  mbmi->uv_mode = mode_selected;
+}
+
+#if CONFIG_SUPERBLOCKS
+static void super_block_uvrd_8x8(MACROBLOCK *x,
+                                 int *rate,
+                                 int *distortion,
+                                 const VP9_ENCODER_RTCD *rtcd,
+                                 int *skippable) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int d = 0, r = 0, n, s = 1;
+  const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
+  const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
+  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
+  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
+  ENTROPY_CONTEXT_PLANES *ta = xd->above_context;
+  ENTROPY_CONTEXT_PLANES *tl = xd->left_context;
+
+  memcpy(t_above, xd->above_context, sizeof(t_above));
+  memcpy(t_left,  xd->left_context,  sizeof(t_left));
+
+  for (n = 0; n < 4; n++) {
+    int x_idx = n & 1, y_idx = n >> 1;
+
+    vp9_subtract_mbuv_s_c(x->src_diff,
+                          usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                          vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                          src_uv_stride,
+                          udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                          vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                          dst_uv_stride);
+    vp9_transform_mbuv_8x8(x);
+    vp9_quantize_mbuv_8x8(x);
+    s &= vp9_mbuv_is_skippable_8x8(xd);
+
+    d += vp9_mbuverror(x) >> 2;
+    xd->above_context = ta + x_idx;
+    xd->left_context = tl + y_idx;
+    r += rd_cost_mbuv_8x8(x, 0);
+  }
+
+  xd->above_context = ta;
+  xd->left_context = tl;
+  *distortion = d;
+  *rate       = r;
+  *skippable  = s;
+
+  memcpy(xd->above_context, t_above, sizeof(t_above));
+  memcpy(xd->left_context,  t_left,  sizeof(t_left));
+}
+
+static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi,
+                                       MACROBLOCK *x,
+                                       int *rate,
+                                       int *rate_tokenonly,
+                                       int *distortion,
+                                       int *skippable) {
+  MB_PREDICTION_MODE mode;
+  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
+  int64_t best_rd = INT64_MAX, this_rd;
+  int this_rate_tokenonly, this_rate;
+  int this_distortion, s;
+
+  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+    x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
+    vp9_build_intra_predictors_sbuv_s(&x->e_mbd);
+
+    super_block_uvrd_8x8(x, &this_rate_tokenonly,
+                         &this_distortion, IF_RTCD(&cpi->rtcd), &s);
+    this_rate = this_rate_tokenonly +
+                x->intra_uv_mode_cost[x->e_mbd.frame_type]
+                                     [x->e_mbd.mode_info_context->mbmi.uv_mode];
+    this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+
+    if (this_rd < best_rd) {
+      mode_selected   = mode;
+      best_rd         = this_rd;
+      *rate           = this_rate;
+      *rate_tokenonly = this_rate_tokenonly;
+      *distortion     = this_distortion;
+      *skippable      = s;
+    }
+  }
+
+  x->e_mbd.mode_info_context->mbmi.uv_mode = mode_selected;
+
+  return best_rd;
+}
+#endif
+
+int vp9_cost_mv_ref(VP9_COMP *cpi,
+                    MB_PREDICTION_MODE m,
+                    const int near_mv_ref_ct[4]) {
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
+  int segment_id = xd->mode_info_context->mbmi.segment_id;
+
+  // If the mode coding is done entirely at the segment level
+  // we should not account for it at the per mb level in rd code.
+  // Note that if the segment level coding is expanded from single mode
+  // to multiple mode masks as per reference frame coding we will need
+  // to do something different here.
+  if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
+    VP9_COMMON *pc = &cpi->common;
+
+    vp9_prob p[VP9_MVREFS - 1];
+    assert(NEARESTMV <= m && m <= SPLITMV);
+    vp9_mv_ref_probs(pc, p, near_mv_ref_ct);
+    return cost_token(vp9_mv_ref_tree, p,
+                      vp9_mv_ref_encoding_array - NEARESTMV + m);
+  } else
+    return 0;
+}
+
+void vp9_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv) {
+  x->e_mbd.mode_info_context->mbmi.mode = mb;
+  x->e_mbd.mode_info_context->mbmi.mv[0].as_int = mv->as_int;
+}
+
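+// Assign the selected sub-block mode and motion vector(s) to every 4x4
+// block carrying the given label, returning the rate cost of the modes
+// plus any newly coded motion vectors.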
+static int labels2mode(
+  MACROBLOCK *x,
+  int const *labelings, int which_label,
+  B_PREDICTION_MODE this_mode,
+  int_mv *this_mv, int_mv *this_second_mv,
+  int_mv seg_mvs[MAX_REF_FRAMES - 1],
+  int_mv *best_ref_mv,
+  int_mv *second_best_ref_mv,
+  DEC_MVCOSTS) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *const mic = xd->mode_info_context;
+  MB_MODE_INFO * mbmi = &mic->mbmi;
+  const int mis = xd->mode_info_stride;
+
+  int i, cost = 0, thismvcost = 0;
+
+  /* We have to be careful retrieving previously-encoded motion vectors.
+     Ones from this macroblock have to be pulled from the BLOCKD array
+     as they have not yet made it to the bmi array in our MB_MODE_INFO. */
+  for (i = 0; i < 16; ++i) {
+    BLOCKD *const d = xd->block + i;
+    const int row = i >> 2,  col = i & 3;
+
+    B_PREDICTION_MODE m;
+
+    if (labelings[i] != which_label)
+      continue;
+
+    if (col && labelings[i] == labelings[i - 1])
+      m = LEFT4X4;
+    else if (row && labelings[i] == labelings[i - 4])
+      m = ABOVE4X4;
+    else {
+      // the only time we should do costing for new motion vector or mode
+      // is when we are on a new label  (jbb May 08, 2007)
+      switch (m = this_mode) {
+        case NEW4X4:
+          if (mbmi->second_ref_frame) {
+            this_mv->as_int = seg_mvs[mbmi->ref_frame - 1].as_int;
+            this_second_mv->as_int =
+              seg_mvs[mbmi->second_ref_frame - 1].as_int;
+          }
+
+          thismvcost  = vp9_mv_bit_cost(this_mv, best_ref_mv, MVCOSTS,
+                                        102, xd->allow_high_precision_mv);
+          if (mbmi->second_ref_frame) {
+            thismvcost += vp9_mv_bit_cost(this_second_mv, second_best_ref_mv,
+                                          MVCOSTS, 102,
+                                          xd->allow_high_precision_mv);
+          }
+          break;
+        case LEFT4X4:
+          this_mv->as_int = col ? d[-1].bmi.as_mv.first.as_int
+                                : left_block_mv(mic, i);
+          if (mbmi->second_ref_frame)
+            this_second_mv->as_int = col ? d[-1].bmi.as_mv.second.as_int
+                                         : left_block_second_mv(mic, i);
+          break;
+        case ABOVE4X4:
+          this_mv->as_int = row ? d[-4].bmi.as_mv.first.as_int
+                                : above_block_mv(mic, i, mis);
+          if (mbmi->second_ref_frame)
+            this_second_mv->as_int = row ? d[-4].bmi.as_mv.second.as_int
+                                         : above_block_second_mv(mic, i, mis);
+          break;
+        case ZERO4X4:
+          this_mv->as_int = 0;
+          if (mbmi->second_ref_frame)
+            this_second_mv->as_int = 0;
+          break;
+        default:
+          break;
+      }
+
+      if (m == ABOVE4X4) { // replace above with left if same
+        int_mv left_mv, left_second_mv;
+
+        left_second_mv.as_int = 0;
+        left_mv.as_int = col ? d[-1].bmi.as_mv.first.as_int :
+                         left_block_mv(mic, i);
+        if (mbmi->second_ref_frame)
+          left_second_mv.as_int = col ? d[-1].bmi.as_mv.second.as_int :
+                                  left_block_second_mv(mic, i);
+
+        if (left_mv.as_int == this_mv->as_int &&
+            (!mbmi->second_ref_frame ||
+             left_second_mv.as_int == this_second_mv->as_int))
+          m = LEFT4X4;
+      }
+
+      cost = x->inter_bmode_costs[m];
+    }
+
+    d->bmi.as_mv.first.as_int = this_mv->as_int;
+    if (mbmi->second_ref_frame)
+      d->bmi.as_mv.second.as_int = this_second_mv->as_int;
+
+    x->partition_info->bmi[i].mode = m;
+    x->partition_info->bmi[i].mv.as_int = this_mv->as_int;
+    if (mbmi->second_ref_frame)
+      x->partition_info->bmi[i].second_mv.as_int = this_second_mv->as_int;
+  }
+
+  cost += thismvcost;
+  return cost;
+}
+
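+// Predict, transform, quantize and cost the 4x4 blocks belonging to one
+// partition label, returning the combined RD cost with rate and
+// distortion also reported separately.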
+static int64_t encode_inter_mb_segment(MACROBLOCK *x,
+                                       int const *labels,
+                                       int which_label,
+                                       int *labelyrate,
+                                       int *distortion,
+                                       ENTROPY_CONTEXT *ta,
+                                       ENTROPY_CONTEXT *tl,
+                                       const VP9_ENCODER_RTCD *rtcd) {
+  int i;
+  MACROBLOCKD *xd = &x->e_mbd;
+
+  *labelyrate = 0;
+  *distortion = 0;
+  for (i = 0; i < 16; i++) {
+    if (labels[i] == which_label) {
+      BLOCKD *bd = &x->e_mbd.block[i];
+      BLOCK *be = &x->block[i];
+      int thisdistortion;
+
+      vp9_build_inter_predictors_b(bd, 16, xd->subpixel_predict);
+      if (xd->mode_info_context->mbmi.second_ref_frame)
+        vp9_build_2nd_inter_predictors_b(bd, 16, xd->subpixel_predict_avg);
+      vp9_subtract_b(be, bd, 16);
+      x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
+      x->quantize_b_4x4(be, bd);
+      thisdistortion = vp9_block_error(be->coeff, bd->dqcoeff, 16);
+      *distortion += thisdistortion;
+      *labelyrate += cost_coeffs(x, bd, PLANE_TYPE_Y_WITH_DC,
+                                 ta + vp9_block2above[i],
+                                 tl + vp9_block2left[i], TX_4X4);
+    }
+  }
+  *distortion >>= 2;
+  return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
+}
+
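+// As encode_inter_mb_segment(), but for 8x8 partitions: each labeled 8x8
+// block is coded with either one 8x8 transform or four 4x4 transforms, and
+// when otherrd is given the cost of the other transform size is
+// accumulated as well so the two can be compared.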
+static int64_t encode_inter_mb_segment_8x8(MACROBLOCK *x,
+                                           int const *labels,
+                                           int which_label,
+                                           int *labelyrate,
+                                           int *distortion,
+                                           int64_t *otherrd,
+                                           ENTROPY_CONTEXT *ta,
+                                           ENTROPY_CONTEXT *tl,
+                                           const VP9_ENCODER_RTCD *rtcd) {
+  int i, j;
+  MACROBLOCKD *xd = &x->e_mbd;
+  const int iblock[4] = { 0, 1, 4, 5 };
+  int othercost = 0, otherdist = 0;
+  ENTROPY_CONTEXT_PLANES tac, tlc;
+  ENTROPY_CONTEXT *tacp = (ENTROPY_CONTEXT *) &tac,
+                  *tlcp = (ENTROPY_CONTEXT *) &tlc;
+
+  if (otherrd) {
+    memcpy(&tac, ta, sizeof(ENTROPY_CONTEXT_PLANES));
+    memcpy(&tlc, tl, sizeof(ENTROPY_CONTEXT_PLANES));
+  }
+
+  *distortion = 0;
+  *labelyrate = 0;
+  for (i = 0; i < 4; i++) {
+    int ib = vp9_i8x8_block[i];
+
+    if (labels[ib] == which_label) {
+      int idx = (ib & 8) + ((ib & 2) << 1);
+      BLOCKD *bd = &xd->block[ib], *bd2 = &xd->block[idx];
+      BLOCK *be = &x->block[ib], *be2 = &x->block[idx];
+      int thisdistortion;
+
+      vp9_build_inter_predictors4b(xd, bd, 16);
+      if (xd->mode_info_context->mbmi.second_ref_frame)
+        vp9_build_2nd_inter_predictors4b(xd, bd, 16);
+      vp9_subtract_4b_c(be, bd, 16);
+
+      if (xd->mode_info_context->mbmi.txfm_size == TX_4X4) {
+        if (otherrd) {
+          x->vp9_short_fdct8x8(be->src_diff, be2->coeff, 32);
+          x->quantize_b_8x8(be2, bd2);
+          thisdistortion = vp9_block_error_c(be2->coeff, bd2->dqcoeff, 64);
+          otherdist += thisdistortion;
+          othercost += cost_coeffs(x, bd2, PLANE_TYPE_Y_WITH_DC,
+                                     tacp + vp9_block2above_8x8[idx],
+                                     tlcp + vp9_block2left_8x8[idx], TX_8X8);
+        }
+        for (j = 0; j < 4; j += 2) {
+          bd = &xd->block[ib + iblock[j]];
+          be = &x->block[ib + iblock[j]];
+          x->vp9_short_fdct8x4(be->src_diff, be->coeff, 32);
+          x->quantize_b_4x4_pair(be, be + 1, bd, bd + 1);
+          thisdistortion = vp9_block_error_c(be->coeff, bd->dqcoeff, 32);
+          *distortion += thisdistortion;
+          *labelyrate += cost_coeffs(x, bd, PLANE_TYPE_Y_WITH_DC,
+                                     ta + vp9_block2above[ib + iblock[j]],
+                                     tl + vp9_block2left[ib + iblock[j]],
+                                     TX_4X4);
+          *labelyrate += cost_coeffs(x, bd + 1, PLANE_TYPE_Y_WITH_DC,
+                                     ta + vp9_block2above[ib + iblock[j] + 1],
+                                     tl + vp9_block2left[ib + iblock[j]],
+                                     TX_4X4);
+        }
+      } else /* 8x8 */ {
+        if (otherrd) {
+          for (j = 0; j < 4; j += 2) {
+            BLOCKD *bd3 = &xd->block[ib + iblock[j]];
+            BLOCK *be3 = &x->block[ib + iblock[j]];
+            x->vp9_short_fdct8x4(be3->src_diff, be3->coeff, 32);
+            x->quantize_b_4x4_pair(be3, be3 + 1, bd3, bd3 + 1);
+            thisdistortion = vp9_block_error_c(be3->coeff, bd3->dqcoeff, 32);
+            otherdist += thisdistortion;
+            othercost += cost_coeffs(x, bd3, PLANE_TYPE_Y_WITH_DC,
+                                     tacp + vp9_block2above[ib + iblock[j]],
+                                     tlcp + vp9_block2left[ib + iblock[j]],
+                                     TX_4X4);
+            othercost += cost_coeffs(x, bd3 + 1, PLANE_TYPE_Y_WITH_DC,
+                                     tacp + vp9_block2above[ib + iblock[j] + 1],
+                                     tlcp + vp9_block2left[ib + iblock[j]],
+                                     TX_4X4);
+          }
+        }
+        x->vp9_short_fdct8x8(be->src_diff, be2->coeff, 32);
+        x->quantize_b_8x8(be2, bd2);
+        thisdistortion = vp9_block_error_c(be2->coeff, bd2->dqcoeff, 64);
+        *distortion += thisdistortion;
+        *labelyrate += cost_coeffs(x, bd2, PLANE_TYPE_Y_WITH_DC,
+                                   ta + vp9_block2above_8x8[idx],
+                                   tl + vp9_block2left_8x8[idx], TX_8X8);
+      }
+    }
+  }
+  *distortion >>= 2;
+  if (otherrd) {
+    otherdist >>= 2;
+    *otherrd = RDCOST(x->rdmult, x->rddiv, othercost, otherdist);
+  }
+  return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
+}
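+/* In encode_inter_mb_segment_8x8(), a non-NULL otherrd additionally
+ * evaluates each label with the complementary transform size (8x8
+ * coefficients while coding 4x4 and vice versa) against the scratch
+ * contexts tac/tlc, so the caller can compare both transform choices in a
+ * single pass. */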
+
+static const unsigned int segmentation_to_sseshift[4] = {3, 3, 2, 0};
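+/* The table above is indexed by SPLITMV_PARTITIONING_TYPE. Shifting a SAD
+ * right by these amounts roughly normalizes it to per-4x4-block units: a
+ * 16x8 or 8x16 partition spans eight 4x4 blocks (>> 3), an 8x8 partition
+ * four (>> 2) and a 4x4 partition one (>> 0). */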
+
+
+typedef struct {
+  int_mv *ref_mv, *second_ref_mv;
+  int_mv mvp;
+
+  int64_t segment_rd;
+  SPLITMV_PARTITIONING_TYPE segment_num;
+  TX_SIZE txfm_size;
+  int r;
+  int d;
+  int segment_yrate;
+  B_PREDICTION_MODE modes[16];
+  int_mv mvs[16], second_mvs[16];
+  int eobs[16];
+
+  int mvthresh;
+  int *mdcounts;
+
+  int_mv sv_mvp[4];     // save 4 mvp from 8x8
+  int sv_istep[2];  // save 2 initial step_param for 16x8/8x16
+
+} BEST_SEG_INFO;
+
+static __inline int mv_check_bounds(MACROBLOCK *x, int_mv *mv) {
+  int r = 0;
+  r |= (mv->as_mv.row >> 3) < x->mv_row_min;
+  r |= (mv->as_mv.row >> 3) > x->mv_row_max;
+  r |= (mv->as_mv.col >> 3) < x->mv_col_min;
+  r |= (mv->as_mv.col >> 3) > x->mv_col_max;
+  return r;
+}
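+/* mv_check_bounds(): motion vectors are held in 1/8-pel units, so >> 3
+ * converts to full pel before comparing against the search bounds; a
+ * nonzero return means the vector falls outside the allowed window. */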
+
+static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
+                                    BEST_SEG_INFO *bsi,
+                                    SPLITMV_PARTITIONING_TYPE segmentation,
+                                    TX_SIZE tx_size, int64_t *otherrds,
+                                    int64_t *rds, int *completed,
+                                    /* 16 = n_blocks */
+                                    int_mv seg_mvs[16 /* n_blocks */]
+                                                  [MAX_REF_FRAMES - 1]) {
+  int i, j;
+  int const *labels;
+  int br = 0, bd = 0;
+  B_PREDICTION_MODE this_mode;
+  MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi;
+
+  int label_count;
+  int64_t this_segment_rd = 0, other_segment_rd;
+  int label_mv_thresh;
+  int rate = 0;
+  int sbr = 0, sbd = 0;
+  int segmentyrate = 0;
+  int best_eobs[16] = { 0 };
+
+  vp9_variance_fn_ptr_t *v_fn_ptr;
+
+  ENTROPY_CONTEXT_PLANES t_above, t_left;
+  ENTROPY_CONTEXT *ta, *tl;
+  ENTROPY_CONTEXT_PLANES t_above_b, t_left_b;
+  ENTROPY_CONTEXT *ta_b, *tl_b;
+
+  vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+  ta = (ENTROPY_CONTEXT *)&t_above;
+  tl = (ENTROPY_CONTEXT *)&t_left;
+  ta_b = (ENTROPY_CONTEXT *)&t_above_b;
+  tl_b = (ENTROPY_CONTEXT *)&t_left_b;
+
+  v_fn_ptr = &cpi->fn_ptr[segmentation];
+  labels = vp9_mbsplits[segmentation];
+  label_count = vp9_mbsplit_count[segmentation];
+
+  // A multiplier of 64 here would make this threshold so large that MVs on
+  // segments would very rarely be checked; the current multiplier of 1
+  // makes the segment MV threshold roughly equal to the macroblock one.
+  label_mv_thresh = 1 * bsi->mvthresh / label_count;
+
+  // Segmentation method overheads
+  rate = cost_token(vp9_mbsplit_tree, vp9_mbsplit_probs,
+                    vp9_mbsplit_encodings + segmentation);
+  rate += vp9_cost_mv_ref(cpi, SPLITMV, bsi->mdcounts);
+  this_segment_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
+  br += rate;
+  other_segment_rd = this_segment_rd;
+
+  mbmi->txfm_size = tx_size;
+  for (i = 0; i < label_count && this_segment_rd < bsi->segment_rd; i++) {
+    int_mv mode_mv[B_MODE_COUNT], second_mode_mv[B_MODE_COUNT];
+    int64_t best_label_rd = INT64_MAX, best_other_rd = INT64_MAX;
+    B_PREDICTION_MODE mode_selected = ZERO4X4;
+    int bestlabelyrate = 0;
+
+    // search for the best motion vector on this segment
+    for (this_mode = LEFT4X4; this_mode <= NEW4X4; this_mode++) {
+      int64_t this_rd, other_rd;
+      int distortion;
+      int labelyrate;
+      ENTROPY_CONTEXT_PLANES t_above_s, t_left_s;
+      ENTROPY_CONTEXT *ta_s;
+      ENTROPY_CONTEXT *tl_s;
+
+      vpx_memcpy(&t_above_s, &t_above, sizeof(ENTROPY_CONTEXT_PLANES));
+      vpx_memcpy(&t_left_s, &t_left, sizeof(ENTROPY_CONTEXT_PLANES));
+
+      ta_s = (ENTROPY_CONTEXT *)&t_above_s;
+      tl_s = (ENTROPY_CONTEXT *)&t_left_s;
+
+      // motion search for newmv (single predictor case only)
+      if (!mbmi->second_ref_frame && this_mode == NEW4X4) {
+        int sseshift, n;
+        int step_param = 0;
+        int further_steps;
+        int thissme, bestsme = INT_MAX;
+        BLOCK *c;
+        BLOCKD *e;
+
+        /* Is the best so far sufficiently good that we can't justify doing
+         * a new motion search. */
+        if (best_label_rd < label_mv_thresh)
+          break;
+
+        if (cpi->compressor_speed) {
+          if (segmentation == PARTITIONING_8X16 ||
+              segmentation == PARTITIONING_16X8) {
+            bsi->mvp.as_int = bsi->sv_mvp[i].as_int;
+            if (i == 1 && segmentation == PARTITIONING_16X8)
+              bsi->mvp.as_int = bsi->sv_mvp[2].as_int;
+
+            step_param = bsi->sv_istep[i];
+          }
+
+          // use previous block's result as next block's MV predictor.
+          if (segmentation == PARTITIONING_4X4 && i > 0) {
+            bsi->mvp.as_int = x->e_mbd.block[i - 1].bmi.as_mv.first.as_int;
+            if (i == 4 || i == 8 || i == 12)
+              bsi->mvp.as_int = x->e_mbd.block[i - 4].bmi.as_mv.first.as_int;
+            step_param = 2;
+          }
+        }
+
+        further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
+
+        {
+          int sadpb = x->sadperbit4;
+          int_mv mvp_full;
+
+          mvp_full.as_mv.row = bsi->mvp.as_mv.row >> 3;
+          mvp_full.as_mv.col = bsi->mvp.as_mv.col >> 3;
+
+          // find first label
+          n = vp9_mbsplit_offset[segmentation][i];
+
+          c = &x->block[n];
+          e = &x->e_mbd.block[n];
+
+          bestsme = vp9_full_pixel_diamond(cpi, x, c, e, &mvp_full, step_param,
+                                           sadpb, further_steps, 0, v_fn_ptr,
+                                           bsi->ref_mv, &mode_mv[NEW4X4]);
+
+          sseshift = segmentation_to_sseshift[segmentation];
+
+          // Should we do a full search (best quality only)
+          if ((cpi->compressor_speed == 0) && (bestsme >> sseshift) > 4000) {
+            /* Check if mvp_full is within the range. */
+            clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
+                     x->mv_row_min, x->mv_row_max);
+
+            thissme = cpi->full_search_sad(x, c, e, &mvp_full,
+                                           sadpb, 16, v_fn_ptr,
+                                           XMVCOST, bsi->ref_mv);
+
+            if (thissme < bestsme) {
+              bestsme = thissme;
+              mode_mv[NEW4X4].as_int = e->bmi.as_mv.first.as_int;
+            } else {
+              /* The full search result is actually worse so re-instate the
+               * previous best vector */
+              e->bmi.as_mv.first.as_int = mode_mv[NEW4X4].as_int;
+            }
+          }
+        }
+
+        if (bestsme < INT_MAX) {
+          int distortion;
+          unsigned int sse;
+          cpi->find_fractional_mv_step(x, c, e, &mode_mv[NEW4X4],
+                                       bsi->ref_mv, x->errorperbit, v_fn_ptr,
+                                       XMVCOST, &distortion, &sse);
+
+          // save motion search result for use in compound prediction
+          seg_mvs[i][mbmi->ref_frame - 1].as_int = mode_mv[NEW4X4].as_int;
+        }
+      } /* NEW4X4 */
+      else if (mbmi->second_ref_frame && this_mode == NEW4X4) {
+        /* If the motion search was not completed, skip NEWMV for this
+         * block under compound prediction. */
+        if (seg_mvs[i][mbmi->second_ref_frame - 1].as_int == INVALID_MV ||
+            seg_mvs[i][mbmi->ref_frame        - 1].as_int == INVALID_MV) {
+          continue;
+        }
+      }
+
+      rate = labels2mode(x, labels, i, this_mode, &mode_mv[this_mode],
+                         &second_mode_mv[this_mode], seg_mvs[i],
+                         bsi->ref_mv, bsi->second_ref_mv, XMVCOST);
+
+      // Trap vectors that reach beyond the UMV borders
+      if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) ||
+          ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) ||
+          ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) ||
+          ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max)) {
+        continue;
+      }
+      if (mbmi->second_ref_frame &&
+          mv_check_bounds(x, &second_mode_mv[this_mode]))
+        continue;
+
+      if (segmentation == PARTITIONING_4X4) {
+        this_rd = encode_inter_mb_segment(x, labels, i, &labelyrate,
+                                          &distortion,
+                                          ta_s, tl_s, IF_RTCD(&cpi->rtcd));
+        other_rd = this_rd;
+      } else {
+        this_rd = encode_inter_mb_segment_8x8(x, labels, i, &labelyrate,
+                                              &distortion, &other_rd,
+                                              ta_s, tl_s, IF_RTCD(&cpi->rtcd));
+      }
+      this_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
+      rate += labelyrate;
+
+      if (this_rd < best_label_rd) {
+        sbr = rate;
+        sbd = distortion;
+        bestlabelyrate = labelyrate;
+        mode_selected = this_mode;
+        best_label_rd = this_rd;
+        if (x->e_mbd.mode_info_context->mbmi.txfm_size == TX_4X4) {
+          for (j = 0; j < 16; j++)
+            if (labels[j] == i)
+              best_eobs[j] = x->e_mbd.block[j].eob;
+        } else {
+          for (j = 0; j < 4; j++) {
+            int ib = vp9_i8x8_block[j], idx = j * 4;
+
+            if (labels[ib] == i)
+              best_eobs[idx] = x->e_mbd.block[idx].eob;
+          }
+        }
+        if (other_rd < best_other_rd)
+          best_other_rd = other_rd;
+
+        vpx_memcpy(ta_b, ta_s, sizeof(ENTROPY_CONTEXT_PLANES));
+        vpx_memcpy(tl_b, tl_s, sizeof(ENTROPY_CONTEXT_PLANES));
+
+      }
+    } /*for each 4x4 mode*/
+
+    vpx_memcpy(ta, ta_b, sizeof(ENTROPY_CONTEXT_PLANES));
+    vpx_memcpy(tl, tl_b, sizeof(ENTROPY_CONTEXT_PLANES));
+
+    labels2mode(x, labels, i, mode_selected, &mode_mv[mode_selected],
+                &second_mode_mv[mode_selected], seg_mvs[i],
+                bsi->ref_mv, bsi->second_ref_mv, XMVCOST);
+
+    br += sbr;
+    bd += sbd;
+    segmentyrate += bestlabelyrate;
+    this_segment_rd += best_label_rd;
+    other_segment_rd += best_other_rd;
+    if (rds)
+      rds[i] = this_segment_rd;
+    if (otherrds)
+      otherrds[i] = other_segment_rd;
+  } /* for each label */
+
+  if (this_segment_rd < bsi->segment_rd) {
+    bsi->r = br;
+    bsi->d = bd;
+    bsi->segment_yrate = segmentyrate;
+    bsi->segment_rd = this_segment_rd;
+    bsi->segment_num = segmentation;
+    bsi->txfm_size = mbmi->txfm_size;
+
+    // store everything needed to revisit this partitioning
+    for (i = 0; i < 16; i++) {
+      BLOCKD *bd = &x->e_mbd.block[i];
+
+      bsi->mvs[i].as_mv = x->partition_info->bmi[i].mv.as_mv;
+      if (mbmi->second_ref_frame)
+        bsi->second_mvs[i].as_mv = x->partition_info->bmi[i].second_mv.as_mv;
+      bsi->modes[i] = x->partition_info->bmi[i].mode;
+      bsi->eobs[i] = best_eobs[i];
+    }
+  }
+
+  if (completed) {
+    *completed = i;
+  }
+}
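+/* rd_check_segment_txsize() reports via *completed how many labels were
+ * fully searched before the running RD cost exceeded bsi->segment_rd;
+ * callers compare it against the label count to decide whether the
+ * per-label rd[] totals are usable. */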
+
+static void rd_check_segment(VP9_COMP *cpi, MACROBLOCK *x,
+                             BEST_SEG_INFO *bsi,
+                             unsigned int segmentation,
+                             /* 16 = n_blocks */
+                             int_mv seg_mvs[16][MAX_REF_FRAMES - 1],
+                             int64_t txfm_cache[NB_TXFM_MODES]) {
+  int i, n, c = vp9_mbsplit_count[segmentation];
+
+  if (segmentation == PARTITIONING_4X4) {
+    int64_t rd[16];
+
+    rd_check_segment_txsize(cpi, x, bsi, segmentation, TX_4X4, NULL,
+                            rd, &n, seg_mvs);
+    if (n == c) {
+      for (i = 0; i < NB_TXFM_MODES; i++) {
+        if (rd[c - 1] < txfm_cache[i])
+          txfm_cache[i] = rd[c - 1];
+      }
+    }
+  } else {
+    int64_t diff, base_rd;
+    int cost4x4 = vp9_cost_bit(cpi->common.prob_tx[0], 0);
+    int cost8x8 = vp9_cost_bit(cpi->common.prob_tx[0], 1);
+
+    if (cpi->common.txfm_mode == TX_MODE_SELECT) {
+      int64_t rd4x4[4], rd8x8[4];
+      int n4x4, n8x8, nmin;
+      BEST_SEG_INFO bsi4x4, bsi8x8;
+
+      /* factor in cost of cost4x4/8x8 in decision */
+      vpx_memcpy(&bsi4x4, bsi, sizeof(*bsi));
+      vpx_memcpy(&bsi8x8, bsi, sizeof(*bsi));
+      rd_check_segment_txsize(cpi, x, &bsi4x4, segmentation,
+                              TX_4X4, NULL, rd4x4, &n4x4, seg_mvs);
+      rd_check_segment_txsize(cpi, x, &bsi8x8, segmentation,
+                              TX_8X8, NULL, rd8x8, &n8x8, seg_mvs);
+      if (bsi4x4.segment_num == segmentation) {
+        bsi4x4.segment_rd += RDCOST(x->rdmult, x->rddiv, cost4x4, 0);
+        if (bsi4x4.segment_rd < bsi->segment_rd)
+          vpx_memcpy(bsi, &bsi4x4, sizeof(*bsi));
+      }
+      if (bsi8x8.segment_num == segmentation) {
+        bsi8x8.segment_rd += RDCOST(x->rdmult, x->rddiv, cost8x8, 0);
+        if (bsi8x8.segment_rd < bsi->segment_rd)
+          vpx_memcpy(bsi, &bsi8x8, sizeof(*bsi));
+      }
+      n = n4x4 > n8x8 ? n4x4 : n8x8;
+      if (n == c) {
+        nmin = n4x4 < n8x8 ? n4x4 : n8x8;
+        diff = rd8x8[nmin - 1] - rd4x4[nmin - 1];
+        if (n == n4x4) {
+          base_rd = rd4x4[c - 1];
+        } else {
+          base_rd = rd8x8[c - 1] - diff;
+        }
+      }
+    } else {
+      int64_t rd[4], otherrd[4];
+
+      if (cpi->common.txfm_mode == ONLY_4X4) {
+        rd_check_segment_txsize(cpi, x, bsi, segmentation, TX_4X4, otherrd,
+                                rd, &n, seg_mvs);
+        if (n == c) {
+          base_rd = rd[c - 1];
+          diff = otherrd[c - 1] - rd[c - 1];
+        }
+      } else /* use 8x8 transform */ {
+        rd_check_segment_txsize(cpi, x, bsi, segmentation, TX_8X8, otherrd,
+                                rd, &n, seg_mvs);
+        if (n == c) {
+          diff = rd[c - 1] - otherrd[c - 1];
+          base_rd = otherrd[c - 1];
+        }
+      }
+    }
+
+    if (n == c) {
+      if (base_rd < txfm_cache[ONLY_4X4]) {
+        txfm_cache[ONLY_4X4] = base_rd;
+      }
+      if (base_rd + diff < txfm_cache[ALLOW_8X8]) {
+        txfm_cache[ALLOW_8X8] = txfm_cache[ALLOW_16X16] = base_rd + diff;
+      }
+      if (diff < 0) {
+        base_rd += diff + RDCOST(x->rdmult, x->rddiv, cost8x8, 0);
+      } else {
+        base_rd += RDCOST(x->rdmult, x->rddiv, cost4x4, 0);
+      }
+      if (base_rd < txfm_cache[TX_MODE_SELECT]) {
+        txfm_cache[TX_MODE_SELECT] = base_rd;
+      }
+    }
+  }
+}
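+/* In the split search above, base_rd ends up tracking the total RD of the
+ * full split with the 4x4 transform and diff the 8x8-minus-4x4 delta, so
+ * each txfm_cache[] entry can be updated without re-running the segment
+ * search once per transform mode. */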
+
+static __inline void cal_step_param(int sr, int *sp) {
+  int step = 0;
+
+  if (sr > MAX_FIRST_STEP) sr = MAX_FIRST_STEP;
+  else if (sr < 1) sr = 1;
+
+  while (sr >>= 1)
+    step++;
+
+  *sp = MAX_MVSEARCH_STEPS - 1 - step;
+}
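+/* cal_step_param() maps a search range sr to a first-step parameter: step
+ * is floor(log2(sr)), so e.g. sr == 8 gives step == 3 and (assuming
+ * MAX_MVSEARCH_STEPS == 8) *sp == 4. A wider spread between the 8x8 MVs
+ * thus yields a smaller step parameter and a larger diamond search. */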
+
+static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
+                                       int_mv *best_ref_mv,
+                                       int_mv *second_best_ref_mv,
+                                       int64_t best_rd,
+                                       int *mdcounts,
+                                       int *returntotrate,
+                                       int *returnyrate,
+                                       int *returndistortion,
+                                       int *skippable, int mvthresh,
+                                       int_mv seg_mvs[NB_PARTITIONINGS]
+                                                     [16 /* n_blocks */]
+                                                     [MAX_REF_FRAMES - 1],
+                                       int64_t txfm_cache[NB_TXFM_MODES]) {
+  int i;
+  BEST_SEG_INFO bsi;
+  MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi;
+
+  vpx_memset(&bsi, 0, sizeof(bsi));
+  for (i = 0; i < NB_TXFM_MODES; i++)
+    txfm_cache[i] = INT64_MAX;
+
+  bsi.segment_rd = best_rd;
+  bsi.ref_mv = best_ref_mv;
+  bsi.second_ref_mv = second_best_ref_mv;
+  bsi.mvp.as_int = best_ref_mv->as_int;
+  bsi.mvthresh = mvthresh;
+  bsi.mdcounts = mdcounts;
+  bsi.txfm_size = TX_4X4;
+
+  for (i = 0; i < 16; i++)
+    bsi.modes[i] = ZERO4X4;
+
+  if (cpi->compressor_speed == 0) {
+    /* for now, we will keep the original segmentation order
+       when in best quality mode */
+    rd_check_segment(cpi, x, &bsi, PARTITIONING_16X8,
+                     seg_mvs[PARTITIONING_16X8], txfm_cache);
+    rd_check_segment(cpi, x, &bsi, PARTITIONING_8X16,
+                     seg_mvs[PARTITIONING_8X16], txfm_cache);
+    rd_check_segment(cpi, x, &bsi, PARTITIONING_8X8,
+                     seg_mvs[PARTITIONING_8X8], txfm_cache);
+    rd_check_segment(cpi, x, &bsi, PARTITIONING_4X4,
+                     seg_mvs[PARTITIONING_4X4], txfm_cache);
+  } else {
+    int sr;
+
+    rd_check_segment(cpi, x, &bsi, PARTITIONING_8X8,
+                     seg_mvs[PARTITIONING_8X8], txfm_cache);
+
+    if (bsi.segment_rd < best_rd) {
+      int tmp_col_min = x->mv_col_min;
+      int tmp_col_max = x->mv_col_max;
+      int tmp_row_min = x->mv_row_min;
+      int tmp_row_max = x->mv_row_max;
+
+      vp9_clamp_mv_min_max(x, best_ref_mv);
+
+      /* Get 8x8 result */
+      bsi.sv_mvp[0].as_int = bsi.mvs[0].as_int;
+      bsi.sv_mvp[1].as_int = bsi.mvs[2].as_int;
+      bsi.sv_mvp[2].as_int = bsi.mvs[8].as_int;
+      bsi.sv_mvp[3].as_int = bsi.mvs[10].as_int;
+
+      /* Use 8x8 result as 16x8/8x16's predictor MV. Adjust search range
+       * according to the closeness of 2 MV. */
+      /* block 8X16 */
+      sr = MAXF((abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[2].as_mv.row)) >> 3,
+                (abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[2].as_mv.col)) >> 3);
+      cal_step_param(sr, &bsi.sv_istep[0]);
+
+      sr = MAXF((abs(bsi.sv_mvp[1].as_mv.row - bsi.sv_mvp[3].as_mv.row)) >> 3,
+                (abs(bsi.sv_mvp[1].as_mv.col - bsi.sv_mvp[3].as_mv.col)) >> 3);
+      cal_step_param(sr, &bsi.sv_istep[1]);
+
+      rd_check_segment(cpi, x, &bsi, PARTITIONING_8X16,
+                       seg_mvs[PARTITIONING_8X16], txfm_cache);
+
+      /* block 16X8 */
+      sr = MAXF((abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[1].as_mv.row)) >> 3,
+                (abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[1].as_mv.col)) >> 3);
+      cal_step_param(sr, &bsi.sv_istep[0]);
+
+      sr = MAXF((abs(bsi.sv_mvp[2].as_mv.row - bsi.sv_mvp[3].as_mv.row)) >> 3,
+                (abs(bsi.sv_mvp[2].as_mv.col - bsi.sv_mvp[3].as_mv.col)) >> 3);
+      cal_step_param(sr, &bsi.sv_istep[1]);
+
+      rd_check_segment(cpi, x, &bsi, PARTITIONING_16X8,
+                       seg_mvs[PARTITIONING_16X8], txfm_cache);
+
+      /* If 8x8 is better than 16x8/8x16, then do 4x4 search */
+      /* Do not skip the 4x4 search when speed == 0 (good quality) */
+      if (cpi->sf.no_skip_block4x4_search ||
+          bsi.segment_num == PARTITIONING_8X8) {
+        /* || (sv_segment_rd8x8-bsi.segment_rd) < sv_segment_rd8x8>>5) */
+        bsi.mvp.as_int = bsi.sv_mvp[0].as_int;
+        rd_check_segment(cpi, x, &bsi, PARTITIONING_4X4,
+                         seg_mvs[PARTITIONING_4X4], txfm_cache);
+      }
+
+      /* restore UMV window */
+      x->mv_col_min = tmp_col_min;
+      x->mv_col_max = tmp_col_max;
+      x->mv_row_min = tmp_row_min;
+      x->mv_row_max = tmp_row_max;
+    }
+  }
+
+  /* set it to the best */
+  for (i = 0; i < 16; i++) {
+    BLOCKD *bd = &x->e_mbd.block[i];
+
+    bd->bmi.as_mv.first.as_int = bsi.mvs[i].as_int;
+    if (mbmi->second_ref_frame)
+      bd->bmi.as_mv.second.as_int = bsi.second_mvs[i].as_int;
+    bd->eob = bsi.eobs[i];
+  }
+
+  *returntotrate = bsi.r;
+  *returndistortion = bsi.d;
+  *returnyrate = bsi.segment_yrate;
+  *skippable = bsi.txfm_size == TX_4X4 ?
+                    vp9_mby_is_skippable_4x4(&x->e_mbd, 0) :
+                    vp9_mby_is_skippable_8x8(&x->e_mbd, 0);
+
+  /* save partitions */
+  mbmi->txfm_size = bsi.txfm_size;
+  mbmi->partitioning = bsi.segment_num;
+  x->partition_info->count = vp9_mbsplit_count[bsi.segment_num];
+
+  for (i = 0; i < x->partition_info->count; i++) {
+    int j;
+
+    j = vp9_mbsplit_offset[bsi.segment_num][i];
+
+    x->partition_info->bmi[i].mode = bsi.modes[j];
+    x->partition_info->bmi[i].mv.as_mv = bsi.mvs[j].as_mv;
+    if (mbmi->second_ref_frame)
+      x->partition_info->bmi[i].second_mv.as_mv = bsi.second_mvs[j].as_mv;
+  }
+  /*
+   * used to set mbmi->mv.as_int
+   */
+  x->partition_info->bmi[15].mv.as_int = bsi.mvs[15].as_int;
+  if (mbmi->second_ref_frame)
+    x->partition_info->bmi[15].second_mv.as_int = bsi.second_mvs[15].as_int;
+
+  return bsi.segment_rd;
+}
+
+/* Sort arr into increasing order */
+static void insertsortmv(int arr[], int len) {
+  int i, j, k;
+
+  for (i = 1; i <= len - 1; i++) {
+    for (j = 0; j < i; j++) {
+      if (arr[j] > arr[i]) {
+        int temp;
+
+        temp = arr[i];
+
+        for (k = i; k > j; k--)
+          arr[k] = arr[k - 1];
+
+        arr[j] = temp;
+      }
+    }
+  }
+}
+
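+/* As above, but keeps idx[] in step with arr[] so the caller can recover
+ * each element's original position (used to rank the near_sad candidates). */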
+static void insertsortsad(int arr[], int idx[], int len) {
+  int i, j, k;
+
+  for (i = 1; i <= len - 1; i++) {
+    for (j = 0; j < i; j++) {
+      if (arr[j] > arr[i]) {
+        int temp, tempi;
+
+        temp = arr[i];
+        tempi = idx[i];
+
+        for (k = i; k > j; k--) {
+          arr[k] = arr[k - 1];
+          idx[k] = idx[k - 1];
+        }
+
+        arr[j] = temp;
+        idx[j] = tempi;
+      }
+    }
+  }
+}
+
+// The improved MV prediction
+void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCKD *xd, const MODE_INFO *here,
+                 int_mv *mvp, int refframe, int *ref_frame_sign_bias,
+                 int *sr, int near_sadidx[]) {
+  const MODE_INFO *above = here - xd->mode_info_stride;
+  const MODE_INFO *left = here - 1;
+  const MODE_INFO *aboveleft = above - 1;
+  int_mv           near_mvs[8];
+  int              near_ref[8];
+  int_mv           mv;
+  int              vcnt = 0;
+  int              find = 0;
+  int              mb_offset;
+
+  int              mvx[8];
+  int              mvy[8];
+  int              i;
+
+  mv.as_int = 0;
+
+  if (here->mbmi.ref_frame != INTRA_FRAME) {
+    near_mvs[0].as_int = near_mvs[1].as_int = near_mvs[2].as_int =
+        near_mvs[3].as_int = near_mvs[4].as_int = near_mvs[5].as_int =
+        near_mvs[6].as_int = near_mvs[7].as_int = 0;
+    near_ref[0] = near_ref[1] = near_ref[2] = near_ref[3] =
+        near_ref[4] = near_ref[5] = near_ref[6] = near_ref[7] = 0;
+
+    // read in 3 nearby blocks' MVs from the current frame as prediction
+    // candidates.
+    if (above->mbmi.ref_frame != INTRA_FRAME) {
+      near_mvs[vcnt].as_int = above->mbmi.mv[0].as_int;
+      mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame], refframe,
+              &near_mvs[vcnt], ref_frame_sign_bias);
+      near_ref[vcnt] = above->mbmi.ref_frame;
+    }
+    vcnt++;
+    if (left->mbmi.ref_frame != INTRA_FRAME) {
+      near_mvs[vcnt].as_int = left->mbmi.mv[0].as_int;
+      mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame], refframe,
+              &near_mvs[vcnt], ref_frame_sign_bias);
+      near_ref[vcnt] = left->mbmi.ref_frame;
+    }
+    vcnt++;
+    if (aboveleft->mbmi.ref_frame != INTRA_FRAME) {
+      near_mvs[vcnt].as_int = aboveleft->mbmi.mv[0].as_int;
+      mv_bias(ref_frame_sign_bias[aboveleft->mbmi.ref_frame], refframe,
+              &near_mvs[vcnt], ref_frame_sign_bias);
+      near_ref[vcnt] = aboveleft->mbmi.ref_frame;
+    }
+    vcnt++;
+
+    // read in 5 nearby blocks' MVs from the last frame.
+    if (cpi->common.last_frame_type != KEY_FRAME) {
+      mb_offset = (-xd->mb_to_top_edge / 128 + 1) * (xd->mode_info_stride + 1) +
+                  (-xd->mb_to_left_edge / 128 + 1);
+
+      // current in last frame
+      if (cpi->lf_ref_frame[mb_offset] != INTRA_FRAME) {
+        near_mvs[vcnt].as_int = cpi->lfmv[mb_offset].as_int;
+        mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset], refframe,
+                &near_mvs[vcnt], ref_frame_sign_bias);
+        near_ref[vcnt] = cpi->lf_ref_frame[mb_offset];
+      }
+      vcnt++;
+
+      // above in last frame
+      if (cpi->lf_ref_frame[mb_offset - xd->mode_info_stride - 1] !=
+          INTRA_FRAME) {
+        near_mvs[vcnt].as_int =
+            cpi->lfmv[mb_offset - xd->mode_info_stride - 1].as_int;
+        mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset - xd->mode_info_stride - 1],
+                refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+        near_ref[vcnt] = cpi->lf_ref_frame[mb_offset - xd->mode_info_stride - 1];
+      }
+      vcnt++;
+
+      // left in last frame
+      if (cpi->lf_ref_frame[mb_offset - 1] != INTRA_FRAME) {
+        near_mvs[vcnt].as_int = cpi->lfmv[mb_offset - 1].as_int;
+        mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset - 1], refframe,
+                &near_mvs[vcnt], ref_frame_sign_bias);
+        near_ref[vcnt] = cpi->lf_ref_frame[mb_offset - 1];
+      }
+      vcnt++;
+
+      // right in last frame
+      if (cpi->lf_ref_frame[mb_offset + 1] != INTRA_FRAME) {
+        near_mvs[vcnt].as_int = cpi->lfmv[mb_offset + 1].as_int;
+        mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset + 1], refframe,
+                &near_mvs[vcnt], ref_frame_sign_bias);
+        near_ref[vcnt] = cpi->lf_ref_frame[mb_offset + 1];
+      }
+      vcnt++;
+
+      // below in last frame
+      if (cpi->lf_ref_frame[mb_offset + xd->mode_info_stride + 1] !=
+          INTRA_FRAME) {
+        near_mvs[vcnt].as_int =
+            cpi->lfmv[mb_offset + xd->mode_info_stride + 1].as_int;
+        mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset + xd->mode_info_stride + 1],
+                refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+        near_ref[vcnt] =
+            cpi->lf_ref_frame[mb_offset + xd->mode_info_stride + 1];
+      }
+      vcnt++;
+    }
+
+    for (i = 0; i < vcnt; i++) {
+      if (near_ref[near_sadidx[i]] != INTRA_FRAME) {
+        if (here->mbmi.ref_frame == near_ref[near_sadidx[i]]) {
+          mv.as_int = near_mvs[near_sadidx[i]].as_int;
+          find = 1;
+          if (i < 3)
+            *sr = 3;
+          else
+            *sr = 2;
+          break;
+        }
+      }
+    }
+
+    if (!find) {
+      for (i = 0; i < vcnt; i++) {
+        mvx[i] = near_mvs[i].as_mv.row;
+        mvy[i] = near_mvs[i].as_mv.col;
+      }
+
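+      // No candidate shares this block's reference frame, so fall back to
+      // the component-wise median of all candidates (note that mvx[] holds
+      // the row components and mvy[] the columns).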
+      insertsortmv(mvx, vcnt);
+      insertsortmv(mvy, vcnt);
+      mv.as_mv.row = mvx[vcnt / 2];
+      mv.as_mv.col = mvy[vcnt / 2];
+
+      find = 1;
+      // sr is set to 0 to allow calling function to decide the search range.
+      *sr = 0;
+    }
+  }
+
+  /* Set up return values */
+  mvp->as_int = mv.as_int;
+  clamp_mv2(mvp, xd);
+}
+
+static void cal_sad(VP9_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x,
+                    int recon_yoffset, int near_sadidx[],
+                    enum BlockSize block_size) {
+  /* 0-cf above, 1-cf left, 2-cf aboveleft, 3-lf current, 4-lf above,
+   * 5-lf left, 6-lf right, 7-lf below */
+  int near_sad[8] = {0};
+  BLOCK *b = &x->block[0];
+  unsigned char *src_y_ptr = *(b->base_src);
+  const unsigned char *dst_y_ptr = xd->dst.y_buffer;
+  const int bs = (block_size == BLOCK_16X16) ? 16 : 32;
+  const int dst_y_str = xd->dst.y_stride;
+
+  // calculate sad for current frame 3 nearby MBs.
+  if (xd->mb_to_top_edge == 0 && xd->mb_to_left_edge == 0) {
+    near_sad[0] = near_sad[1] = near_sad[2] = INT_MAX;
+  } else if (xd->mb_to_top_edge == 0) {
+    // only has left MB for sad calculation.
+    near_sad[0] = near_sad[2] = INT_MAX;
+    near_sad[1] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
+                                              dst_y_ptr - bs,
+                                              dst_y_str, 0x7fffffff);
+  } else if (xd->mb_to_left_edge == 0) {
+    // only has above MB for sad calculation.
+    near_sad[1] = near_sad[2] = INT_MAX;
+    near_sad[0] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
+                                              dst_y_ptr - dst_y_str * bs,
+                                              dst_y_str, 0x7fffffff);
+  } else {
+    near_sad[0] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
+                                              dst_y_ptr - dst_y_str * bs,
+                                              dst_y_str, 0x7fffffff);
+    near_sad[1] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
+                                              dst_y_ptr - bs,
+                                              dst_y_str, 0x7fffffff);
+    near_sad[2] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
+                                              dst_y_ptr - dst_y_str * bs - bs,
+                                              dst_y_str, 0x7fffffff);
+  }
+
+  if (cpi->common.last_frame_type != KEY_FRAME) {
+    // calculate sad for last frame 5 nearby MBs.
+    unsigned char *pre_y_buffer =
+        cpi->common.yv12_fb[cpi->common.lst_fb_idx].y_buffer + recon_yoffset;
+    const int pre_y_str = cpi->common.yv12_fb[cpi->common.lst_fb_idx].y_stride;
+
+    if (xd->mb_to_top_edge == 0) near_sad[4] = INT_MAX;
+    if (xd->mb_to_left_edge == 0) near_sad[5] = INT_MAX;
+    if (xd->mb_to_right_edge == 0) near_sad[6] = INT_MAX;
+    if (xd->mb_to_bottom_edge == 0) near_sad[7] = INT_MAX;
+
+    near_sad[3] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
+                                              pre_y_buffer,
+                                              pre_y_str, 0x7fffffff);
+    if (near_sad[4] != INT_MAX)
+      near_sad[4] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
+                                                pre_y_buffer - pre_y_str * bs,
+                                                pre_y_str, 0x7fffffff);
+    if (near_sad[5] != INT_MAX)
+      near_sad[5] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
+                                                pre_y_buffer - bs,
+                                                pre_y_str, 0x7fffffff);
+    if (near_sad[6] != INT_MAX)
+      near_sad[6] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
+                                                pre_y_buffer + bs,
+                                                pre_y_str, 0x7fffffff);
+    if (near_sad[7] != INT_MAX)
+      near_sad[7] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
+                                                pre_y_buffer + pre_y_str * bs,
+                                                pre_y_str, 0x7fffffff);
+  }
+
+  if (cpi->common.last_frame_type != KEY_FRAME) {
+    insertsortsad(near_sad, near_sadidx, 8);
+  } else {
+    insertsortsad(near_sad, near_sadidx, 3);
+  }
+}
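+/* After cal_sad(), near_sadidx[] ranks the neighbouring blocks from lowest
+ * to highest SAD against the current block; vp9_mv_pred() walks that
+ * ranking when picking its MV predictor. */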
+
+static void set_i8x8_block_modes(MACROBLOCK *x, int modes[2][4]) {
+  int i;
+  MACROBLOCKD *xd = &x->e_mbd;
+  for (i = 0; i < 4; i++) {
+    int ib = vp9_i8x8_block[i];
+    xd->mode_info_context->bmi[ib + 0].as_mode.first = modes[0][i];
+    xd->mode_info_context->bmi[ib + 1].as_mode.first = modes[0][i];
+    xd->mode_info_context->bmi[ib + 4].as_mode.first = modes[0][i];
+    xd->mode_info_context->bmi[ib + 5].as_mode.first = modes[0][i];
+#if CONFIG_COMP_INTRA_PRED
+    xd->mode_info_context->bmi[ib + 0].as_mode.second = modes[1][i];
+    xd->mode_info_context->bmi[ib + 1].as_mode.second = modes[1][i];
+    xd->mode_info_context->bmi[ib + 4].as_mode.second = modes[1][i];
+    xd->mode_info_context->bmi[ib + 5].as_mode.second = modes[1][i];
+#endif
+    // printf("%d,%d,%d,%d %d,%d,%d,%d\n",
+    //       modes[0][0], modes[0][1], modes[0][2], modes[0][3],
+    //       modes[1][0], modes[1][1], modes[1][2], modes[1][3]);
+  }
+
+  for (i = 0; i < 16; i++) {
+    xd->block[i].bmi = xd->mode_info_context->bmi[i];
+  }
+}
+
+extern void vp9_calc_ref_probs(int *count, vp9_prob *probs);
+static void estimate_curframe_refprobs(VP9_COMP *cpi,
+                                       vp9_prob mod_refprobs[3],
+                                       int pred_ref) {
+  int norm_cnt[MAX_REF_FRAMES];
+  const int *const rfct = cpi->count_mb_ref_frame_usage;
+  int intra_count = rfct[INTRA_FRAME];
+  int last_count  = rfct[LAST_FRAME];
+  int gf_count    = rfct[GOLDEN_FRAME];
+  int arf_count   = rfct[ALTREF_FRAME];
+
+  // Work out modified reference frame probabilities to use where prediction
+  // of the reference frame fails
+  if (pred_ref == INTRA_FRAME) {
+    norm_cnt[0] = 0;
+    norm_cnt[1] = last_count;
+    norm_cnt[2] = gf_count;
+    norm_cnt[3] = arf_count;
+    vp9_calc_ref_probs(norm_cnt, mod_refprobs);
+    mod_refprobs[0] = 0;    // This branch implicit
+  } else if (pred_ref == LAST_FRAME) {
+    norm_cnt[0] = intra_count;
+    norm_cnt[1] = 0;
+    norm_cnt[2] = gf_count;
+    norm_cnt[3] = arf_count;
+    vp9_calc_ref_probs(norm_cnt, mod_refprobs);
+    mod_refprobs[1] = 0;    // This branch implicit
+  } else if (pred_ref == GOLDEN_FRAME) {
+    norm_cnt[0] = intra_count;
+    norm_cnt[1] = last_count;
+    norm_cnt[2] = 0;
+    norm_cnt[3] = arf_count;
+    vp9_calc_ref_probs(norm_cnt, mod_refprobs);
+    mod_refprobs[2] = 0;  // This branch implicit
+  } else {
+    norm_cnt[0] = intra_count;
+    norm_cnt[1] = last_count;
+    norm_cnt[2] = gf_count;
+    norm_cnt[3] = 0;
+    vp9_calc_ref_probs(norm_cnt, mod_refprobs);
+    mod_refprobs[2] = 0;  // This branch implicit
+  }
+}
+
+static __inline unsigned weighted_cost(vp9_prob *tab0, vp9_prob *tab1,
+                                       int idx, int val, int weight) {
+  unsigned cost0 = tab0[idx] ? vp9_cost_bit(tab0[idx], val) : 0;
+  unsigned cost1 = tab1[idx] ? vp9_cost_bit(tab1[idx], val) : 0;
+  // weight is 16-bit fixed point, so this basically calculates:
+  // 0.5 + weight * cost1 + (1.0 - weight) * cost0
+  return (0x8000 + weight * cost1 + (0x10000 - weight) * cost0) >> 16;
+}
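+/* weighted_cost() example: weight == 0x8000 (0.5 in 16-bit fixed point)
+ * yields the rounded average of the two bit costs, weight == 0 selects
+ * cost0 and weight == 0x10000 selects cost1. */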
+
+static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id,
+                                     unsigned int *ref_costs) {
+  VP9_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
+  vp9_prob *mod_refprobs;
+
+  unsigned int cost;
+  int pred_ref;
+  int pred_flag;
+  int pred_ctx;
+  int i;
+  int tot_count;
+
+  vp9_prob pred_prob, new_pred_prob;
+  int seg_ref_active;
+  int seg_ref_count = 0;
+  seg_ref_active = vp9_segfeature_active(xd,
+                                         segment_id,
+                                         SEG_LVL_REF_FRAME);
+
+  if (seg_ref_active) {
+    seg_ref_count = vp9_check_segref(xd, segment_id, INTRA_FRAME)  +
+                    vp9_check_segref(xd, segment_id, LAST_FRAME)   +
+                    vp9_check_segref(xd, segment_id, GOLDEN_FRAME) +
+                    vp9_check_segref(xd, segment_id, ALTREF_FRAME);
+  }
+
+  // Get the predicted reference for this mb
+  pred_ref = vp9_get_pred_ref(cm, xd);
+
+  // Get the context probability for the prediction flag (based on last frame)
+  pred_prob = vp9_get_pred_prob(cm, xd, PRED_REF);
+
+  // Predict probability for current frame based on stats so far
+  pred_ctx = vp9_get_pred_context(cm, xd, PRED_REF);
+  tot_count = cpi->ref_pred_count[pred_ctx][0] +
+              cpi->ref_pred_count[pred_ctx][1];
+  if (tot_count) {
+    new_pred_prob =
+      (cpi->ref_pred_count[pred_ctx][0] * 255 + (tot_count >> 1)) / tot_count;
+    new_pred_prob += !new_pred_prob;
+  } else
+    new_pred_prob = 128;
+
+  // Get the set of probabilities to use if prediction fails
+  mod_refprobs = cm->mod_refprobs[pred_ref];
+
+  // For each possible selected reference frame work out a cost.
+  for (i = 0; i < MAX_REF_FRAMES; i++) {
+    if (seg_ref_active && seg_ref_count == 1) {
+      cost = 0;
+    } else {
+      pred_flag = (i == pred_ref);
+
+      // Get the prediction for the current mb
+      cost = weighted_cost(&pred_prob, &new_pred_prob, 0,
+                           pred_flag, cpi->seg0_progress);
+      if (cost > 1024) cost = 768; // i.e. account for 4 bits max.
+
+      // for incorrectly predicted cases
+      if (!pred_flag) {
+        vp9_prob curframe_mod_refprobs[3];
+
+        if (cpi->seg0_progress) {
+          estimate_curframe_refprobs(cpi, curframe_mod_refprobs, pred_ref);
+        } else {
+          vpx_memset(curframe_mod_refprobs, 0, sizeof(curframe_mod_refprobs));
+        }
+
+        cost += weighted_cost(mod_refprobs, curframe_mod_refprobs, 0,
+                              (i != INTRA_FRAME), cpi->seg0_progress);
+        if (i != INTRA_FRAME) {
+          cost += weighted_cost(mod_refprobs, curframe_mod_refprobs, 1,
+                                (i != LAST_FRAME), cpi->seg0_progress);
+          if (i != LAST_FRAME) {
+            cost += weighted_cost(mod_refprobs, curframe_mod_refprobs, 2,
+                                  (i != GOLDEN_FRAME), cpi->seg0_progress);
+          }
+        }
+      }
+    }
+
+    ref_costs[i] = cost;
+  }
+}
+
+static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
+                                 int mode_index,
+                                 PARTITION_INFO *partition,
+                                 int_mv *ref_mv,
+                                 int_mv *second_ref_mv,
+                                 int single_pred_diff,
+                                 int comp_pred_diff,
+                                 int hybrid_pred_diff,
+                                 int64_t txfm_size_diff[NB_TXFM_MODES]) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+
+  // Take a snapshot of the coding context so it can be
+  // restored if we decide to encode this way
+  ctx->best_mode_index = mode_index;
+  vpx_memcpy(&ctx->mic, xd->mode_info_context,
+             sizeof(MODE_INFO));
+  if (partition)
+    vpx_memcpy(&ctx->partition_info, partition,
+               sizeof(PARTITION_INFO));
+  ctx->best_ref_mv.as_int = ref_mv->as_int;
+  ctx->second_best_ref_mv.as_int = second_ref_mv->as_int;
+
+  // ctx[mb_index].rddiv = x->rddiv;
+  // ctx[mb_index].rdmult = x->rdmult;
+
+  ctx->single_pred_diff = single_pred_diff;
+  ctx->comp_pred_diff   = comp_pred_diff;
+  ctx->hybrid_pred_diff = hybrid_pred_diff;
+
+  if (txfm_size_diff) {
+    memcpy(ctx->txfm_rd_diff, txfm_size_diff, sizeof(ctx->txfm_rd_diff));
+  } else {
+    memset(ctx->txfm_rd_diff, 0, sizeof(ctx->txfm_rd_diff));
+  }
+}
+
+static void inter_mode_cost(VP9_COMP *cpi, MACROBLOCK *x, int this_mode,
+                            int *rate2, int *distortion2, int *rate_y,
+                            int *distortion, int* rate_uv, int *distortion_uv,
+                            int *skippable, int64_t txfm_cache[NB_TXFM_MODES]) {
+  int y_skippable, uv_skippable;
+
+  // Y cost and distortion
+  macro_block_yrd(cpi, x, rate_y, distortion, &y_skippable, txfm_cache);
+
+  *rate2 += *rate_y;
+  *distortion2 += *distortion;
+
+  // UV cost and distortion
+  if (x->e_mbd.mode_info_context->mbmi.txfm_size != TX_4X4)
+    rd_inter16x16_uv_8x8(cpi, x, rate_uv, distortion_uv,
+                         cpi->common.full_pixel, &uv_skippable);
+  else
+    rd_inter16x16_uv(cpi, x, rate_uv, distortion_uv, cpi->common.full_pixel,
+                     &uv_skippable);
+  *rate2 += *rate_uv;
+  *distortion2 += *distortion_uv;
+  *skippable = y_skippable && uv_skippable;
+}
+
+#define MIN(x,y) (((x)<(y))?(x):(y))
+#define MAX(x,y) (((x)>(y))?(x):(y))
+static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
+                               int idx, int frame_type,
+                               int recon_yoffset, int recon_uvoffset,
+                               int_mv frame_nearest_mv[4],
+                               int_mv frame_near_mv[4],
+                               int_mv frame_best_ref_mv[4],
+                               int frame_mdcounts[4][4],
+                               unsigned char *y_buffer[4],
+                               unsigned char *u_buffer[4],
+                               unsigned char *v_buffer[4]) {
+  YV12_BUFFER_CONFIG *yv12 = &cpi->common.yv12_fb[idx];
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+
+  vp9_find_near_mvs(xd, xd->mode_info_context,
+                    xd->prev_mode_info_context,
+                    &frame_nearest_mv[frame_type], &frame_near_mv[frame_type],
+                    &frame_best_ref_mv[frame_type], frame_mdcounts[frame_type],
+                    frame_type, cpi->common.ref_frame_sign_bias);
+
+  y_buffer[frame_type] = yv12->y_buffer + recon_yoffset;
+  u_buffer[frame_type] = yv12->u_buffer + recon_uvoffset;
+  v_buffer[frame_type] = yv12->v_buffer + recon_uvoffset;
+
+#if CONFIG_NEWBESTREFMV
+  vp9_find_mv_refs(xd, xd->mode_info_context,
+                   xd->prev_mode_info_context,
+                   frame_type,
+                   mbmi->ref_mvs[frame_type],
+                   cpi->common.ref_frame_sign_bias);
+
+  vp9_find_best_ref_mvs(xd, y_buffer[frame_type],
+                        yv12->y_stride,
+                        mbmi->ref_mvs[frame_type],
+                        &frame_best_ref_mv[frame_type],
+                        &frame_nearest_mv[frame_type],
+                        &frame_near_mv[frame_type]);
+#endif
+}
+
+static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
+                                 enum BlockSize block_size,
+                                 int *saddone, int near_sadidx[],
+                                 int mdcounts[4], int64_t txfm_cache[],
+                                 int *rate2, int *distortion, int *skippable,
+                                 int *compmode_cost,
+                                 int *rate_y, int *distortion_y,
+                                 int *rate_uv, int *distortion_uv,
+                                 int *mode_excluded, int *disable_skip,
+                                 int recon_yoffset, int mode_index,
+                                 int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
+                                 int_mv frame_best_ref_mv[4]) {
+  VP9_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+  BLOCK *b = &x->block[0];
+  BLOCKD *d = &xd->block[0];
+  const int is_comp_pred = (mbmi->second_ref_frame != 0);
+  const int num_refs = is_comp_pred ? 2 : 1;
+  const int this_mode = mbmi->mode;
+  int i;
+  int refs[2] = { mbmi->ref_frame, mbmi->second_ref_frame };
+  int_mv cur_mv[2];
+  int_mv mvp;
+  int64_t this_rd = 0;
+
+  switch (this_mode) {
+    case NEWMV:
+      if (is_comp_pred) {
+        if (frame_mv[NEWMV][refs[0]].as_int == INVALID_MV ||
+            frame_mv[NEWMV][refs[1]].as_int == INVALID_MV)
+          return INT64_MAX;
+        *rate2 += vp9_mv_bit_cost(&frame_mv[NEWMV][refs[0]],
+                                  &frame_best_ref_mv[refs[0]],
+                                  XMVCOST, 96,
+                                  x->e_mbd.allow_high_precision_mv);
+        *rate2 += vp9_mv_bit_cost(&frame_mv[NEWMV][refs[1]],
+                                  &frame_best_ref_mv[refs[1]],
+                                  XMVCOST, 96,
+                                  x->e_mbd.allow_high_precision_mv);
+      } else {
+        int bestsme = INT_MAX;
+        int further_steps, step_param = cpi->sf.first_step;
+        int sadpb = x->sadperbit16;
+        int_mv mvp_full, tmp_mv;
+        // search range obtained from mv_pred(), expressed in step_param
+        // levels (0-7)
+        int sr = 0;
+
+        int tmp_col_min = x->mv_col_min;
+        int tmp_col_max = x->mv_col_max;
+        int tmp_row_min = x->mv_row_min;
+        int tmp_row_max = x->mv_row_max;
+
+        vp9_clamp_mv_min_max(x, &frame_best_ref_mv[refs[0]]);
+
+        if (!*saddone) {
+          cal_sad(cpi, xd, x, recon_yoffset, &near_sadidx[0], block_size);
+          *saddone = 1;
+        }
+
+        vp9_mv_pred(cpi, &x->e_mbd, x->e_mbd.mode_info_context, &mvp,
+                    mbmi->ref_frame, cpi->common.ref_frame_sign_bias,
+                    &sr, &near_sadidx[0]);
+
+        mvp_full.as_mv.col = mvp.as_mv.col >> 3;
+        mvp_full.as_mv.row = mvp.as_mv.row >> 3;
+
+        // adjust search range according to sr from mv prediction
+        step_param = MAX(step_param, sr);
+
+        // Further step/diamond searches as necessary
+        further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
+
+        bestsme = vp9_full_pixel_diamond(cpi, x, b, d, &mvp_full, step_param,
+                                         sadpb, further_steps, 1,
+                                         &cpi->fn_ptr[block_size],
+                                         &frame_best_ref_mv[refs[0]], &tmp_mv);
+
+        x->mv_col_min = tmp_col_min;
+        x->mv_col_max = tmp_col_max;
+        x->mv_row_min = tmp_row_min;
+        x->mv_row_max = tmp_row_max;
+
+        if (bestsme < INT_MAX) {
+          int dis; /* TODO: use dis in distortion calculation later. */
+          unsigned int sse;
+          cpi->find_fractional_mv_step(x, b, d, &tmp_mv,
+                                       &frame_best_ref_mv[refs[0]],
+                                       x->errorperbit,
+                                       &cpi->fn_ptr[block_size],
+                                       XMVCOST, &dis, &sse);
+        }
+        d->bmi.as_mv.first.as_int = tmp_mv.as_int;
+        frame_mv[NEWMV][refs[0]].as_int = d->bmi.as_mv.first.as_int;
+
+        // Add the new motion vector cost to our rolling cost variable
+        *rate2 += vp9_mv_bit_cost(&tmp_mv, &frame_best_ref_mv[refs[0]],
+                                  XMVCOST, 96, xd->allow_high_precision_mv);
+      }
+      break;
+    case NEARESTMV:
+    case NEARMV:
+      // Do not bother proceeding if the vector (from newmv, nearest or
+      // near) is 0,0 as this should then be coded using the zeromv mode.
+      for (i = 0; i < num_refs; ++i)
+        if (frame_mv[this_mode][refs[i]].as_int == 0)
+          return INT64_MAX;
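+      // fall through: NEARESTMV/NEARMV share the break below with ZEROMV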
+    case ZEROMV:
+    default:
+      break;
+  }
+  for (i = 0; i < num_refs; ++i) {
+    cur_mv[i] = frame_mv[this_mode][refs[i]];
+    // Clip "next_nearest" so that it does not extend to far out of image
+    clamp_mv2(&cur_mv[i], xd);
+    if (mv_check_bounds(x, &cur_mv[i]))
+      return INT64_MAX;
+    mbmi->mv[i].as_int = cur_mv[i].as_int;
+  }
+
+#if CONFIG_PRED_FILTER
+  // Filtered prediction:
+  mbmi->pred_filter_enabled = vp9_mode_order[mode_index].pred_filter_flag;
+  *rate2 += vp9_cost_bit(cpi->common.prob_pred_filter_off,
+                         mbmi->pred_filter_enabled);
+#endif
+  if (cpi->common.mcomp_filter_type == SWITCHABLE) {
+    const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP);
+    const int m = vp9_switchable_interp_map[mbmi->interp_filter];
+    *rate2 += SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m];
+  }
+
+  /* We don't include the cost of the second reference here, because there
+   * are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
+   * words if you present them in that order, the second one is always known
+   * if the first is known */
+  *compmode_cost = vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_COMP),
+                                is_comp_pred);
+  *rate2 += vp9_cost_mv_ref(cpi, this_mode, mdcounts);
+
+  if (block_size == BLOCK_16X16) {
+    vp9_build_1st_inter16x16_predictors_mby(xd, xd->predictor, 16, 0);
+    if (is_comp_pred)
+      vp9_build_2nd_inter16x16_predictors_mby(xd, xd->predictor, 16);
+  } else {
+#if CONFIG_SUPERBLOCKS
+    vp9_build_inter32x32_predictors_sb(xd,
+                                       xd->dst.y_buffer,
+                                       xd->dst.u_buffer,
+                                       xd->dst.v_buffer,
+                                       xd->dst.y_stride,
+                                       xd->dst.uv_stride);
+#endif
+  }
+
+  if (cpi->active_map_enabled && x->active_ptr[0] == 0)
+    x->skip = 1;
+  else if (x->encode_breakout) {
+    unsigned int sse, var;
+    int threshold = (xd->block[0].dequant[1]
+                     * xd->block[0].dequant[1] >> 4);
+
+    if (threshold < x->encode_breakout)
+      threshold = x->encode_breakout;
+
+    if (block_size == BLOCK_16X16) {
+      var = vp9_variance16x16(*(b->base_src), b->src_stride,
+                              xd->predictor, 16, &sse);
+    } else {
+#if CONFIG_SUPERBLOCKS
+      var = vp9_variance32x32(*(b->base_src), b->src_stride,
+                              xd->dst.y_buffer, xd->dst.y_stride, &sse);
+#endif
+    }
+
+    if (sse < threshold) {
+      unsigned int q2dc = xd->block[24].dequant[0];
+      /* If there is no codeable 2nd order dc
+         or a very small uniform pixel change */
+      if ((sse - var < q2dc * q2dc >> 4) ||
+          (sse / 2 > var && sse - var < 64)) {
+        // Check u and v to make sure skip is ok
+        int sse2;
+
+        if (block_size == BLOCK_16X16) {
+          sse2 = vp9_uvsse(x);
+        } else {
+          unsigned int sse2u, sse2v;
+          var = vp9_variance16x16(x->src.u_buffer, x->src.uv_stride,
+                                  xd->dst.u_buffer, xd->dst.uv_stride, &sse2u);
+          var = vp9_variance16x16(x->src.v_buffer, x->src.uv_stride,
+                                  xd->dst.v_buffer, xd->dst.uv_stride, &sse2v);
+          sse2 = sse2u + sse2v;
+        }
+
+        if (sse2 * 2 < threshold) {
+          x->skip = 1;
+          *distortion = sse + sse2;
+          *rate2 = 500;
+
+          /* for best_yrd calculation */
+          *rate_uv = 0;
+          *distortion_uv = sse2;
+
+          *disable_skip = 1;
+          this_rd = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
+        }
+      }
+    }
+  }
+
+  if (!x->skip) {
+    if (block_size == BLOCK_16X16) {
+      vp9_build_1st_inter16x16_predictors_mbuv(xd, &xd->predictor[256],
+                                               &xd->predictor[320], 8);
+      if (is_comp_pred)
+        vp9_build_2nd_inter16x16_predictors_mbuv(xd, &xd->predictor[256],
+                                                 &xd->predictor[320], 8);
+      inter_mode_cost(cpi, x, this_mode, rate2, distortion,
+                      rate_y, distortion_y, rate_uv, distortion_uv,
+                      skippable, txfm_cache);
+    } else {
+#if CONFIG_SUPERBLOCKS
+      int skippable_y, skippable_uv;
+
+      // Y cost and distortion - FIXME support other transform sizes
+      super_block_yrd_8x8(x, rate_y, distortion_y,
+                          IF_RTCD(&cpi->rtcd), &skippable_y);
+      *rate2 += *rate_y;
+      *distortion += *distortion_y;
+
+      rd_inter32x32_uv_8x8(cpi, x, rate_uv, distortion_uv,
+                           cm->full_pixel, &skippable_uv);
+
+      *rate2 += *rate_uv;
+      *distortion += *distortion_uv;
+      *skippable = skippable_y && skippable_uv;
+#endif
+    }
+  }
+  if (is_comp_pred) {
+    *mode_excluded = (cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY);
+  } else {
+    *mode_excluded = (cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY);
+  }
+
+  return this_rd;  // if 0, this will be re-calculated by caller
+}
+
+void vp9_rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
+                            int recon_yoffset, int recon_uvoffset,
+                            int *returnrate, int *returndistortion,
+                            int64_t *returnintra) {
+  VP9_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  union b_mode_info best_bmodes[16];
+  MB_MODE_INFO best_mbmode;
+  PARTITION_INFO best_partition;
+  int_mv best_ref_mv, second_best_ref_mv;
+  MB_PREDICTION_MODE this_mode;
+  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+  int i, best_mode_index = 0;
+  int mode8x8[2][4];
+  unsigned char segment_id = mbmi->segment_id;
+
+  int mode_index;
+  int mdcounts[4];
+  int rate, distortion;
+  int rate2, distortion2;
+  int64_t best_txfm_rd[NB_TXFM_MODES];
+  int64_t best_txfm_diff[NB_TXFM_MODES];
+  int64_t best_pred_diff[NB_PREDICTION_TYPES];
+  int64_t best_pred_rd[NB_PREDICTION_TYPES];
+  int64_t best_rd = INT64_MAX, best_intra_rd = INT64_MAX;
+#if CONFIG_PRED_FILTER
+  int64_t best_overall_rd = INT64_MAX;
+#endif
+  int uv_intra_rate, uv_intra_distortion, uv_intra_rate_tokenonly;
+  int uv_intra_skippable = 0;
+  int uv_intra_rate_8x8 = 0, uv_intra_distortion_8x8 = 0;
+  int uv_intra_rate_tokenonly_8x8 = 0;
+  int uv_intra_skippable_8x8 = 0;
+  int rate_y, UNINITIALIZED_IS_SAFE(rate_uv);
+  int distortion_uv = INT_MAX;
+  int64_t best_yrd = INT64_MAX;
+#if CONFIG_PRED_FILTER
+  int best_filter_state;
+#endif
+  int switchable_filter_index = 0;
+
+  MB_PREDICTION_MODE uv_intra_mode;
+  MB_PREDICTION_MODE uv_intra_mode_8x8 = 0;
+
+  int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+  int saddone = 0;
+
+  int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
+  int_mv frame_best_ref_mv[4];
+  int frame_mdcounts[4][4];
+  unsigned char *y_buffer[4], *u_buffer[4], *v_buffer[4];
+
+  unsigned int ref_costs[MAX_REF_FRAMES];
+  int_mv seg_mvs[NB_PARTITIONINGS][16 /* n_blocks */][MAX_REF_FRAMES - 1];
+
+  vpx_memset(mode8x8, 0, sizeof(mode8x8));
+  vpx_memset(&frame_mv, 0, sizeof(frame_mv));
+  vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
+  vpx_memset(&best_bmodes, 0, sizeof(best_bmodes));
+  vpx_memset(&x->mb_context[xd->mb_index], 0, sizeof(PICK_MODE_CONTEXT));
+
+  for (i = 0; i < MAX_REF_FRAMES; i++)
+    frame_mv[NEWMV][i].as_int = INVALID_MV;
+  for (i = 0; i < NB_PREDICTION_TYPES; ++i)
+    best_pred_rd[i] = INT64_MAX;
+  for (i = 0; i < NB_TXFM_MODES; i++)
+    best_txfm_rd[i] = INT64_MAX;
+
+  for (i = 0; i < NB_PARTITIONINGS; i++) {
+    int j, k;
+
+    for (j = 0; j < 16; j++)
+      for (k = 0; k < MAX_REF_FRAMES - 1; k++)
+        seg_mvs[i][j][k].as_int = INVALID_MV;
+  }
+
+  if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
+    setup_buffer_inter(cpi, x, cpi->common.lst_fb_idx, LAST_FRAME,
+                       recon_yoffset, recon_uvoffset, frame_mv[NEARESTMV],
+                       frame_mv[NEARMV], frame_best_ref_mv,
+                       frame_mdcounts, y_buffer, u_buffer, v_buffer);
+  }
+
+  if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
+    setup_buffer_inter(cpi, x, cpi->common.gld_fb_idx, GOLDEN_FRAME,
+                       recon_yoffset, recon_uvoffset, frame_mv[NEARESTMV],
+                       frame_mv[NEARMV], frame_best_ref_mv,
+                       frame_mdcounts, y_buffer, u_buffer, v_buffer);
+  }
+
+  if (cpi->ref_frame_flags & VP9_ALT_FLAG) {
+    setup_buffer_inter(cpi, x, cpi->common.alt_fb_idx, ALTREF_FRAME,
+                       recon_yoffset, recon_uvoffset, frame_mv[NEARESTMV],
+                       frame_mv[NEARMV], frame_best_ref_mv,
+                       frame_mdcounts, y_buffer, u_buffer, v_buffer);
+  }
+
+  *returnintra = INT64_MAX;
+
+  x->skip = 0;
+
+  mbmi->ref_frame = INTRA_FRAME;
+
+  /* Initialize zbin mode boost for uv costing */
+  cpi->zbin_mode_boost = 0;
+  vp9_update_zbin_extra(cpi, x);
+
+  rd_pick_intra_mbuv_mode(cpi, x, &uv_intra_rate,
+                          &uv_intra_rate_tokenonly, &uv_intra_distortion,
+                          &uv_intra_skippable);
+  uv_intra_mode = mbmi->uv_mode;
+
+  /* rough estimate for now */
+  if (cpi->common.txfm_mode != ONLY_4X4) {
+    rd_pick_intra_mbuv_mode_8x8(cpi, x, &uv_intra_rate_8x8,
+                                &uv_intra_rate_tokenonly_8x8,
+                                &uv_intra_distortion_8x8,
+                                &uv_intra_skippable_8x8);
+    uv_intra_mode_8x8 = mbmi->uv_mode;
+  }
+
+  // Get estimates of the signaling cost of each reference frame;
+  // these depend on the current prediction context etc.
+  estimate_ref_frame_costs(cpi, segment_id, ref_costs);
+
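+  // Main mode loop. When the switchable interpolation filter is in use,
+  // each inter mode is evaluated once per filter: mode_index only
+  // advances after switchable_filter_index has wrapped back to zero.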
+  for (mode_index = 0; mode_index < MAX_MODES;
+       mode_index += (!switchable_filter_index)) {
+    int64_t this_rd = INT64_MAX;
+    int disable_skip = 0, skippable = 0;
+    int other_cost = 0;
+    int compmode_cost = 0;
+    int mode_excluded = 0;
+    int64_t txfm_cache[NB_TXFM_MODES] = { 0 };
+
+    // These variables hold the rolling total cost and distortion for this mode
+    rate2 = 0;
+    distortion2 = 0;
+    rate_y = 0;
+    rate_uv = 0;
+
+    this_mode = vp9_mode_order[mode_index].mode;
+    mbmi->mode = this_mode;
+    mbmi->uv_mode = DC_PRED;
+    mbmi->ref_frame = vp9_mode_order[mode_index].ref_frame;
+    mbmi->second_ref_frame = vp9_mode_order[mode_index].second_ref_frame;
+#if CONFIG_PRED_FILTER
+    mbmi->pred_filter_enabled = 0;
+#endif
+    if (cpi->common.mcomp_filter_type == SWITCHABLE &&
+        this_mode >= NEARESTMV && this_mode <= SPLITMV) {
+      mbmi->interp_filter =
+          vp9_switchable_interp[switchable_filter_index++];
+      if (switchable_filter_index == VP9_SWITCHABLE_FILTERS)
+        switchable_filter_index = 0;
+    } else {
+      mbmi->interp_filter = cpi->common.mcomp_filter_type;
+    }
+    vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
+
+    // Test best rd so far against threshold for trying this mode.
+    if (best_rd <= cpi->rd_threshes[mode_index])
+      continue;
+
+    // current coding mode under rate-distortion optimization test loop
+#if CONFIG_COMP_INTRA_PRED
+    mbmi->second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
+    mbmi->second_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
+#endif
+
+    // If the segment reference frame feature is enabled,
+    // skip this mode if the current ref frame is not allowed.
+    if (vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
+        !vp9_check_segref(xd, segment_id, mbmi->ref_frame)) {
+      continue;
+    // If the segment mode feature is enabled,
+    // skip this mode if the current mode is not allowed.
+    } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE) &&
+               (this_mode !=
+                vp9_get_segdata(xd, segment_id, SEG_LVL_MODE))) {
+      continue;
+    // Disable this drop-out case if either the mode or ref frame
+    // segment level feature is enabled for this segment. This is to
+    // prevent the possibility that we end up unable to pick any mode.
+    } else if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
+               !vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
+      // Only consider ZEROMV/ALTREF_FRAME for alt ref frames,
+      // unless ARNR filtering is enabled, in which case we want
+      // an unfiltered alternative.
+      if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
+        if (this_mode != ZEROMV ||
+            mbmi->ref_frame != ALTREF_FRAME) {
+          continue;
+        }
+      }
+    }
+
+    /* everything but intra */
+    if (mbmi->ref_frame) {
+      int ref = mbmi->ref_frame;
+
+      xd->pre.y_buffer = y_buffer[ref];
+      xd->pre.u_buffer = u_buffer[ref];
+      xd->pre.v_buffer = v_buffer[ref];
+      best_ref_mv = frame_best_ref_mv[ref];
+      vpx_memcpy(mdcounts, frame_mdcounts[ref], sizeof(mdcounts));
+    }
+
+    if (mbmi->second_ref_frame) {
+      int ref = mbmi->second_ref_frame;
+
+      xd->second_pre.y_buffer = y_buffer[ref];
+      xd->second_pre.u_buffer = u_buffer[ref];
+      xd->second_pre.v_buffer = v_buffer[ref];
+      second_best_ref_mv  = frame_best_ref_mv[ref];
+    }
+
+    // Experimental code. Special case for gf and arf zeromv modes.
+    // Increase zbin size to suppress noise
+    if (cpi->zbin_mode_boost_enabled) {
+      if (vp9_mode_order[mode_index].ref_frame == INTRA_FRAME)
+        cpi->zbin_mode_boost = 0;
+      else {
+        if (vp9_mode_order[mode_index].mode == ZEROMV) {
+          if (vp9_mode_order[mode_index].ref_frame != LAST_FRAME)
+            cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
+          else
+            cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
+        } else if (vp9_mode_order[mode_index].mode == SPLITMV)
+          cpi->zbin_mode_boost = 0;
+        else
+          cpi->zbin_mode_boost = MV_ZBIN_BOOST;
+      }
+
+      vp9_update_zbin_extra(cpi, x);
+    }
+
+    // Intra
+    if (!mbmi->ref_frame) {
+      switch (this_mode) {
+        default:
+        case DC_PRED:
+        case V_PRED:
+        case H_PRED:
+        case TM_PRED:
+        case D45_PRED:
+        case D135_PRED:
+        case D117_PRED:
+        case D153_PRED:
+        case D27_PRED:
+        case D63_PRED:
+          mbmi->ref_frame = INTRA_FRAME;
+          // FIXME compound intra prediction
+          vp9_build_intra_predictors_mby(&x->e_mbd);
+          macro_block_yrd(cpi, x, &rate_y, &distortion, &skippable, txfm_cache);
+          rate2 += rate_y;
+          distortion2 += distortion;
+          rate2 += x->mbmode_cost[xd->frame_type][mbmi->mode];
+          if (mbmi->txfm_size != TX_4X4) {
+            rate2 += uv_intra_rate_8x8;
+            rate_uv = uv_intra_rate_tokenonly_8x8;
+            distortion2 += uv_intra_distortion_8x8;
+            distortion_uv = uv_intra_distortion_8x8;
+            skippable = skippable && uv_intra_skippable_8x8;
+          } else {
+            rate2 += uv_intra_rate;
+            rate_uv = uv_intra_rate_tokenonly;
+            distortion2 += uv_intra_distortion;
+            distortion_uv = uv_intra_distortion;
+            skippable = skippable && uv_intra_skippable;
+          }
+          break;
+        case B_PRED: {
+          int64_t tmp_rd;
+
+          // Note: the rate value returned here includes the cost of coding
+          // the B_PRED mode: x->mbmode_cost[xd->frame_type][B_PRED];
+          mbmi->txfm_size = TX_4X4;
+          tmp_rd = rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y, &distortion, best_yrd,
+#if CONFIG_COMP_INTRA_PRED
+                                             0,
+#endif
+                                             0);
+          rate2 += rate;
+          distortion2 += distortion;
+
+          if (tmp_rd < best_yrd) {
+            rate2 += uv_intra_rate;
+            rate_uv = uv_intra_rate_tokenonly;
+            distortion2 += uv_intra_distortion;
+            distortion_uv = uv_intra_distortion;
+          } else {
+            this_rd = INT64_MAX;
+            disable_skip = 1;
+          }
+        }
+        break;
+        case I8X8_PRED: {
+          int cost0 = vp9_cost_bit(cm->prob_tx[0], 0);
+          int cost1 = vp9_cost_bit(cm->prob_tx[0], 1);
+          int64_t tmp_rd_4x4s, tmp_rd_8x8s;
+          int64_t tmp_rd_4x4, tmp_rd_8x8, tmp_rd;
+          int r4x4, tok4x4, d4x4, r8x8, tok8x8, d8x8;
+          mbmi->txfm_size = TX_4X4;
+          tmp_rd_4x4 = rd_pick_intra8x8mby_modes(cpi, x, &r4x4, &tok4x4,
+                                                 &d4x4, best_yrd);
+          mode8x8[0][0] = xd->mode_info_context->bmi[0].as_mode.first;
+          mode8x8[0][1] = xd->mode_info_context->bmi[2].as_mode.first;
+          mode8x8[0][2] = xd->mode_info_context->bmi[8].as_mode.first;
+          mode8x8[0][3] = xd->mode_info_context->bmi[10].as_mode.first;
+#if CONFIG_COMP_INTRA_PRED
+          mode8x8[1][0] = xd->mode_info_context->bmi[0].as_mode.second;
+          mode8x8[1][1] = xd->mode_info_context->bmi[2].as_mode.second;
+          mode8x8[1][2] = xd->mode_info_context->bmi[8].as_mode.second;
+          mode8x8[1][3] = xd->mode_info_context->bmi[10].as_mode.second;
+#endif
+          mbmi->txfm_size = TX_8X8;
+          tmp_rd_8x8 = rd_pick_intra8x8mby_modes(cpi, x, &r8x8, &tok8x8,
+                                                 &d8x8, best_yrd);
+          txfm_cache[ONLY_4X4]  = tmp_rd_4x4;
+          txfm_cache[ALLOW_8X8] = tmp_rd_8x8;
+          txfm_cache[ALLOW_16X16] = tmp_rd_8x8;
+          tmp_rd_4x4s = tmp_rd_4x4 + RDCOST(x->rdmult, x->rddiv, cost0, 0);
+          tmp_rd_8x8s = tmp_rd_8x8 + RDCOST(x->rdmult, x->rddiv, cost1, 0);
+          txfm_cache[TX_MODE_SELECT] = tmp_rd_4x4s < tmp_rd_8x8s ?
+                                       tmp_rd_4x4s : tmp_rd_8x8s;
+          if (cm->txfm_mode == TX_MODE_SELECT) {
+            if (tmp_rd_4x4s < tmp_rd_8x8s) {
+              rate = r4x4 + cost0;
+              rate_y = tok4x4 + cost0;
+              distortion = d4x4;
+              mbmi->txfm_size = TX_4X4;
+              tmp_rd = tmp_rd_4x4s;
+            } else {
+              rate = r8x8 + cost1;
+              rate_y = tok8x8 + cost1;
+              distortion = d8x8;
+              mbmi->txfm_size = TX_8X8;
+              tmp_rd = tmp_rd_8x8s;
+
+              mode8x8[0][0] = xd->mode_info_context->bmi[0].as_mode.first;
+              mode8x8[0][1] = xd->mode_info_context->bmi[2].as_mode.first;
+              mode8x8[0][2] = xd->mode_info_context->bmi[8].as_mode.first;
+              mode8x8[0][3] = xd->mode_info_context->bmi[10].as_mode.first;
+#if CONFIG_COMP_INTRA_PRED
+              mode8x8[1][0] = xd->mode_info_context->bmi[0].as_mode.second;
+              mode8x8[1][1] = xd->mode_info_context->bmi[2].as_mode.second;
+              mode8x8[1][2] = xd->mode_info_context->bmi[8].as_mode.second;
+              mode8x8[1][3] = xd->mode_info_context->bmi[10].as_mode.second;
+#endif
+            }
+          } else if (cm->txfm_mode == ONLY_4X4) {
+            rate = r4x4;
+            rate_y = tok4x4;
+            distortion = d4x4;
+            mbmi->txfm_size = TX_4X4;
+            tmp_rd = tmp_rd_4x4;
+          } else {
+            rate = r8x8;
+            rate_y = tok8x8;
+            distortion = d8x8;
+            mbmi->txfm_size = TX_8X8;
+            tmp_rd = tmp_rd_8x8;
+
+            mode8x8[0][0] = xd->mode_info_context->bmi[0].as_mode.first;
+            mode8x8[0][1] = xd->mode_info_context->bmi[2].as_mode.first;
+            mode8x8[0][2] = xd->mode_info_context->bmi[8].as_mode.first;
+            mode8x8[0][3] = xd->mode_info_context->bmi[10].as_mode.first;
+#if CONFIG_COMP_INTRA_PRED
+            mode8x8[1][0] = xd->mode_info_context->bmi[0].as_mode.second;
+            mode8x8[1][1] = xd->mode_info_context->bmi[2].as_mode.second;
+            mode8x8[1][2] = xd->mode_info_context->bmi[8].as_mode.second;
+            mode8x8[1][3] = xd->mode_info_context->bmi[10].as_mode.second;
+#endif
+          }
+
+          rate2 += rate;
+          distortion2 += distortion;
+
+          /* TODO: UV rate may be over-estimated here since a UV intra
+                   mode is coded as part of I8X8_PRED prediction */
+          if (tmp_rd < best_yrd) {
+            rate2 += uv_intra_rate;
+            rate_uv = uv_intra_rate_tokenonly;
+            distortion2 += uv_intra_distortion;
+            distortion_uv = uv_intra_distortion;
+          } else {
+            this_rd = INT64_MAX;
+            disable_skip = 1;
+          }
+        }
+        break;
+      }
+    }
+    // Split MV. The code is very different from the other inter modes so
+    // special case it.
+    else if (this_mode == SPLITMV) {
+      const int is_comp_pred = mbmi->second_ref_frame != 0;
+      int64_t tmp_rd, this_rd_thresh;
+      int_mv *second_ref = is_comp_pred ? &second_best_ref_mv : NULL;
+
+      this_rd_thresh = (mbmi->ref_frame == LAST_FRAME) ?
+          cpi->rd_threshes[THR_NEWMV] : cpi->rd_threshes[THR_NEWA];
+      this_rd_thresh = (mbmi->ref_frame == GOLDEN_FRAME) ?
+          cpi->rd_threshes[THR_NEWG] : this_rd_thresh;
+
+      tmp_rd = rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv,
+                                           second_ref, best_yrd, mdcounts,
+                                           &rate, &rate_y, &distortion,
+                                           &skippable,
+                                           this_rd_thresh, seg_mvs,
+                                           txfm_cache);
+      rate2 += rate;
+      distortion2 += distortion;
+
+      if (cpi->common.mcomp_filter_type == SWITCHABLE)
+        rate2 += SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs
+            [vp9_get_pred_context(&cpi->common, xd, PRED_SWITCHABLE_INTERP)]
+                [vp9_switchable_interp_map[mbmi->interp_filter]];
+      // If even the 'Y' rd value of split is higher than the best so far,
+      // then don't bother looking at UV.
+      if (tmp_rd < best_yrd) {
+        int uv_skippable;
+
+        rd_inter4x4_uv(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
+                       cpi->common.full_pixel);
+        rate2 += rate_uv;
+        distortion2 += distortion_uv;
+        skippable = skippable && uv_skippable;
+      } else {
+        this_rd = INT64_MAX;
+        disable_skip = 1;
+      }
+
+      if (is_comp_pred)
+        mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY;
+      else
+        mode_excluded = cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY;
+
+      compmode_cost =
+        vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_COMP), is_comp_pred);
+      mbmi->mode = this_mode;
+    }
+    else {
+      this_rd = handle_inter_mode(cpi, x, BLOCK_16X16,
+                                  &saddone, near_sadidx, mdcounts, txfm_cache,
+                                  &rate2, &distortion2, &skippable,
+                                  &compmode_cost, &rate_y, &distortion,
+                                  &rate_uv, &distortion_uv,
+                                  &mode_excluded, &disable_skip, recon_yoffset,
+                                  mode_index, frame_mv, frame_best_ref_mv);
+      if (this_rd == INT64_MAX)
+        continue;
+    }
+
+    if (cpi->common.comp_pred_mode == HYBRID_PREDICTION)
+      rate2 += compmode_cost;
+
+    // Estimate the reference frame signaling cost and add it
+    // to the rolling cost variable.
+    rate2 += ref_costs[mbmi->ref_frame];
+
+    if (!disable_skip) {
+      // Test for the condition where skip block will be activated
+      // because there are no non-zero coefficients and make any
+      // necessary adjustment for rate. Ignore if skip is coded at
+      // segment level as the cost won't have been added in.
+      if (cpi->common.mb_no_coeff_skip) {
+        int mb_skip_allowed;
+
+        // Is MB-level skip allowed for this MB?
+        mb_skip_allowed =
+          !vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
+          vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+
+        if (skippable) {
+          mbmi->mb_skip_coeff = 1;
+
+          // Back out the coefficient coding costs
+          rate2 -= (rate_y + rate_uv);
+          // for best_yrd calculation
+          rate_uv = 0;
+
+          if (mb_skip_allowed) {
+            int prob_skip_cost;
+
+            // Cost the skip mb case
+            vp9_prob skip_prob =
+              vp9_get_pred_prob(cm, &x->e_mbd, PRED_MBSKIP);
+
+            if (skip_prob) {
+              prob_skip_cost = vp9_cost_bit(skip_prob, 1);
+              rate2 += prob_skip_cost;
+              other_cost += prob_skip_cost;
+            }
+          }
+        }
+        // Add in the cost of the no-skip flag.
+        else {
+          mbmi->mb_skip_coeff = 0;
+          if (mb_skip_allowed) {
+            int prob_skip_cost = vp9_cost_bit(
+                   vp9_get_pred_prob(cm, &x->e_mbd, PRED_MBSKIP), 0);
+            rate2 += prob_skip_cost;
+            other_cost += prob_skip_cost;
+          }
+        }
+      }
+
+      // Calculate the final RD estimate for this mode.
+      this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+    }
+
+    // Keep record of best intra distortion
+    if ((mbmi->ref_frame == INTRA_FRAME) &&
+        (this_rd < best_intra_rd)) {
+      best_intra_rd = this_rd;
+      *returnintra = distortion2;
+    }
+
+    if (!disable_skip && mbmi->ref_frame == INTRA_FRAME)
+      for (i = 0; i < NB_PREDICTION_TYPES; ++i)
+        best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
+
+#if CONFIG_PRED_FILTER
+    // Keep track of the best mode irrespective of prediction filter state
+    if (this_rd < best_overall_rd) {
+      best_overall_rd = this_rd;
+      best_filter_state = mbmi->pred_filter_enabled;
+    }
+
+    // Ignore modes where the prediction filter state doesn't
+    // match the state signaled at the frame level
+    if ((cm->pred_filter_mode == 2) ||
+        (cm->pred_filter_mode ==
+         mbmi->pred_filter_enabled)) {
+#endif
+      // Did this mode help, i.e. is it the new best mode?
+      if (this_rd < best_rd || x->skip) {
+        if (!mode_excluded) {
+          // Note index of best mode so far
+          best_mode_index = mode_index;
+
+          if (this_mode <= B_PRED) {
+            if (mbmi->txfm_size != TX_4X4
+                && this_mode != B_PRED
+                && this_mode != I8X8_PRED)
+              mbmi->uv_mode = uv_intra_mode_8x8;
+            else
+              mbmi->uv_mode = uv_intra_mode;
+            /* required for left and above block mv */
+            mbmi->mv[0].as_int = 0;
+          }
+
+          other_cost += ref_costs[mbmi->ref_frame];
+
+          /* Calculate the final y RD estimate for this mode */
+          best_yrd = RDCOST(x->rdmult, x->rddiv, (rate2 - rate_uv - other_cost),
+                            (distortion2 - distortion_uv));
+
+          *returnrate = rate2;
+          *returndistortion = distortion2;
+          best_rd = this_rd;
+          vpx_memcpy(&best_mbmode, mbmi, sizeof(MB_MODE_INFO));
+          vpx_memcpy(&best_partition, x->partition_info, sizeof(PARTITION_INFO));
+
+          if ((this_mode == B_PRED)
+              || (this_mode == I8X8_PRED)
+              || (this_mode == SPLITMV))
+            for (i = 0; i < 16; i++) {
+              best_bmodes[i] = xd->block[i].bmi;
+            }
+        }
+
+        // Testing this mode gave rise to an improvement in best error score.
+        // Lower the threshold a bit for next time.
+        cpi->rd_thresh_mult[mode_index] =
+            (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ?
+            cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
+        cpi->rd_threshes[mode_index] =
+            (cpi->rd_baseline_thresh[mode_index] >> 7) *
+            cpi->rd_thresh_mult[mode_index];
+      }
+      // If the mode did not help improve the best error case then raise the
+      // threshold for testing that mode next time around.
+      else {
+        cpi->rd_thresh_mult[mode_index] += 4;
+
+        if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
+          cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
+
+        cpi->rd_threshes[mode_index] =
+            (cpi->rd_baseline_thresh[mode_index] >> 7) *
+            cpi->rd_thresh_mult[mode_index];
+      }
+
+      /* keep record of best compound/single-only prediction */
+      if (!disable_skip &&
+          mbmi->ref_frame != INTRA_FRAME) {
+        int64_t single_rd, hybrid_rd;
+        int single_rate, hybrid_rate;
+
+        if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
+          single_rate = rate2 - compmode_cost;
+          hybrid_rate = rate2;
+        } else {
+          single_rate = rate2;
+          hybrid_rate = rate2 + compmode_cost;
+        }
+
+        single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
+        hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
+
+        if (mbmi->second_ref_frame == INTRA_FRAME &&
+            single_rd < best_pred_rd[SINGLE_PREDICTION_ONLY]) {
+          best_pred_rd[SINGLE_PREDICTION_ONLY] = single_rd;
+        } else if (mbmi->second_ref_frame != INTRA_FRAME &&
+                   single_rd < best_pred_rd[COMP_PREDICTION_ONLY]) {
+          best_pred_rd[COMP_PREDICTION_ONLY] = single_rd;
+        }
+        if (hybrid_rd < best_pred_rd[HYBRID_PREDICTION])
+          best_pred_rd[HYBRID_PREDICTION] = hybrid_rd;
+      }
+
+      /* keep record of best txfm size */
+      if (!mode_excluded && this_rd != INT64_MAX) {
+        for (i = 0; i < NB_TXFM_MODES; i++) {
+          int64_t adj_rd;
+          if (this_mode != B_PRED) {
+            adj_rd = this_rd + txfm_cache[i] - txfm_cache[cm->txfm_mode];
+          } else {
+            adj_rd = this_rd;
+          }
+          if (adj_rd < best_txfm_rd[i])
+            best_txfm_rd[i] = adj_rd;
+        }
+      }
+#if CONFIG_PRED_FILTER
+    }
+#endif
+
+    if (x->skip && !mode_excluded)
+      break;
+  }
+
+#if CONFIG_PRED_FILTER
+  // Update counts for prediction filter usage
+  if (best_filter_state != 0)
+    ++cpi->pred_filter_on_count;
+  else
+    ++cpi->pred_filter_off_count;
+#endif
+  if (cpi->common.mcomp_filter_type == SWITCHABLE &&
+      best_mbmode.mode >= NEARESTMV &&
+      best_mbmode.mode <= SPLITMV) {
+    ++cpi->switchable_interp_count
+        [vp9_get_pred_context(&cpi->common, xd, PRED_SWITCHABLE_INTERP)]
+        [vp9_switchable_interp_map[best_mbmode.interp_filter]];
+  }
+
+  // Reduce the activation RD thresholds for the best choice mode
+  if ((cpi->rd_baseline_thresh[best_mode_index] > 0) &&
+      (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2))) {
+    int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 2);
+
+    cpi->rd_thresh_mult[best_mode_index] =
+        (cpi->rd_thresh_mult[best_mode_index] >=
+         (MIN_THRESHMULT + best_adjustment)) ?
+        cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT;
+    cpi->rd_threshes[best_mode_index] =
+        (cpi->rd_baseline_thresh[best_mode_index] >> 7) *
+        cpi->rd_thresh_mult[best_mode_index];
+  }
+
+  // This code forces (ALTREF, 0, 0) and skip for the frame that overlays
+  // an altref, unless the altref is filtered. However, this is unsafe if
+  // segment-level coding of ref frame or mode is enabled for this
+  // segment.
+  if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
+      !vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE) &&
+      cpi->is_src_frame_alt_ref &&
+      (cpi->oxcf.arnr_max_frames == 0) &&
+      (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) {
+    mbmi->mode = ZEROMV;
+    if (cm->txfm_mode != TX_MODE_SELECT)
+      mbmi->txfm_size = cm->txfm_mode;
+    else
+      mbmi->txfm_size = TX_16X16;
+    mbmi->ref_frame = ALTREF_FRAME;
+    mbmi->mv[0].as_int = 0;
+    mbmi->uv_mode = DC_PRED;
+    mbmi->mb_skip_coeff =
+      (cpi->common.mb_no_coeff_skip) ? 1 : 0;
+    mbmi->partitioning = 0;
+
+    vpx_memset(best_pred_diff, 0, sizeof(best_pred_diff));
+    vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
+    goto end;
+  }
+
+  // macroblock modes
+  vpx_memcpy(mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
+  if (best_mbmode.mode == B_PRED) {
+    for (i = 0; i < 16; i++) {
+      xd->mode_info_context->bmi[i].as_mode = best_bmodes[i].as_mode;
+      xd->block[i].bmi.as_mode = xd->mode_info_context->bmi[i].as_mode;
+    }
+  }
+
+  if (best_mbmode.mode == I8X8_PRED)
+    set_i8x8_block_modes(x, mode8x8);
+
+  if (best_mbmode.mode == SPLITMV) {
+    for (i = 0; i < 16; i++)
+      xd->mode_info_context->bmi[i].as_mv.first.as_int =
+          best_bmodes[i].as_mv.first.as_int;
+    if (mbmi->second_ref_frame)
+      for (i = 0; i < 16; i++)
+        xd->mode_info_context->bmi[i].as_mv.second.as_int =
+            best_bmodes[i].as_mv.second.as_int;
+
+    vpx_memcpy(x->partition_info, &best_partition, sizeof(PARTITION_INFO));
+
+    mbmi->mv[0].as_int = x->partition_info->bmi[15].mv.as_int;
+    mbmi->mv[1].as_int = x->partition_info->bmi[15].second_mv.as_int;
+  }
+
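+  // Record how the best RD cost of each prediction type compares with
+  // the overall best mode; INT_MIN marks a prediction type that was
+  // never evaluated.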
+  for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
+    if (best_pred_rd[i] == INT64_MAX)
+      best_pred_diff[i] = INT_MIN;
+    else
+      best_pred_diff[i] = best_rd - best_pred_rd[i];
+  }
+
+  if (!x->skip) {
+    for (i = 0; i < NB_TXFM_MODES; i++) {
+      if (best_txfm_rd[i] == INT64_MAX)
+        best_txfm_diff[i] = INT_MIN;
+      else
+        best_txfm_diff[i] = best_rd - best_txfm_rd[i];
+    }
+  } else {
+    vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
+  }
+
+end:
+  store_coding_context(x, &x->mb_context[xd->mb_index], best_mode_index,
+                       &best_partition,
+                       &frame_best_ref_mv[xd->mode_info_context->mbmi.ref_frame],
+                       &frame_best_ref_mv[xd->mode_info_context->mbmi.second_ref_frame],
+                       best_pred_diff[0], best_pred_diff[1], best_pred_diff[2],
+                       best_txfm_diff);
+}
+
+#if CONFIG_SUPERBLOCKS
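+// Superblock (32x32) intra mode selection: picks the best luma and chroma
+// intra modes and returns the combined rate and distortion (the chroma
+// distortion is weighted by 1/4).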
+void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
+                               int *returnrate,
+                               int *returndist) {
+  VP9_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  int rate_y, rate_uv;
+  int rate_y_tokenonly, rate_uv_tokenonly;
+  int error_y, error_uv;
+  int dist_y, dist_uv;
+  int y_skip, uv_skip;
+
+  xd->mode_info_context->mbmi.txfm_size = TX_8X8;
+
+  error_uv = rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
+                                     &dist_uv, &uv_skip);
+  error_y = rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
+                                   &dist_y, &y_skip);
+
+  if (cpi->common.mb_no_coeff_skip && y_skip && uv_skip) {
+    *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
+                  vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1);
+    *returndist = dist_y + (dist_uv >> 2);
+  } else {
+    *returnrate = rate_y + rate_uv;
+    if (cpi->common.mb_no_coeff_skip)
+      *returnrate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
+    *returndist = dist_y + (dist_uv >> 2);
+  }
+}
+#endif
+
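+// Macroblock intra mode selection: compares the best 16x16, 8x8
+// (I8X8_PRED) and 4x4 (B_PRED) luma alternatives, pairs each with a
+// suitable chroma mode, and returns the winner's rate and distortion.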
+void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
+                            int *returnrate, int *returndist) {
+  VP9_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
+  int64_t error4x4, error16x16;
+#if CONFIG_COMP_INTRA_PRED
+  int64_t error4x4d;
+  int rate4x4d, dist4x4d;
+#endif
+  int rate4x4, rate16x16 = 0, rateuv, rateuv8x8;
+  int dist4x4, dist16x16, distuv, distuv8x8;
+  int rate;
+  int rate4x4_tokenonly = 0;
+  int rate16x16_tokenonly = 0;
+  int rateuv_tokenonly = 0, rateuv8x8_tokenonly = 0;
+  int64_t error8x8;
+  int rate8x8_tokenonly = 0;
+  int rate8x8, dist8x8;
+  int mode16x16;
+  int mode8x8[2][4];
+  int dist;
+  int modeuv, modeuv8x8, uv_intra_skippable, uv_intra_skippable_8x8;
+  int y_intra16x16_skippable;
+  int64_t txfm_cache[NB_TXFM_MODES];
+  TX_SIZE txfm_size_16x16;
+  int i;
+
+  mbmi->ref_frame = INTRA_FRAME;
+  rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv,
+                          &uv_intra_skippable);
+  modeuv = mbmi->uv_mode;
+  if (cpi->common.txfm_mode != ONLY_4X4) {
+    rd_pick_intra_mbuv_mode_8x8(cpi, x, &rateuv8x8, &rateuv8x8_tokenonly,
+                                &distuv8x8, &uv_intra_skippable_8x8);
+    modeuv8x8 = mbmi->uv_mode;
+  } else {
+    uv_intra_skippable_8x8 = uv_intra_skippable;
+    rateuv8x8 = rateuv;
+    distuv8x8 = distuv;
+    rateuv8x8_tokenonly = rateuv_tokenonly;
+    modeuv8x8 = modeuv;
+  }
+
+  // current macroblock under rate-distortion optimization test loop
+  error16x16 = rd_pick_intra16x16mby_mode(cpi, x, &rate16x16,
+                                          &rate16x16_tokenonly, &dist16x16,
+                                          &y_intra16x16_skippable, txfm_cache);
+  mode16x16 = mbmi->mode;
+  txfm_size_16x16 = mbmi->txfm_size;
+
+  // FIXME(rbultje) support transform-size selection
+  mbmi->txfm_size = (cm->txfm_mode == ONLY_4X4) ? TX_4X4 : TX_8X8;
+  error8x8 = rd_pick_intra8x8mby_modes(cpi, x, &rate8x8, &rate8x8_tokenonly,
+                                       &dist8x8, error16x16);
+  mode8x8[0][0] = xd->mode_info_context->bmi[0].as_mode.first;
+  mode8x8[0][1] = xd->mode_info_context->bmi[2].as_mode.first;
+  mode8x8[0][2] = xd->mode_info_context->bmi[8].as_mode.first;
+  mode8x8[0][3] = xd->mode_info_context->bmi[10].as_mode.first;
+#if CONFIG_COMP_INTRA_PRED
+  mode8x8[1][0] = xd->mode_info_context->bmi[0].as_mode.second;
+  mode8x8[1][1] = xd->mode_info_context->bmi[2].as_mode.second;
+  mode8x8[1][2] = xd->mode_info_context->bmi[8].as_mode.second;
+  mode8x8[1][3] = xd->mode_info_context->bmi[10].as_mode.second;
+#endif
+
+  error4x4 = rd_pick_intra4x4mby_modes(cpi, x,
+                                       &rate4x4, &rate4x4_tokenonly,
+                                       &dist4x4, error16x16,
+#if CONFIG_COMP_INTRA_PRED
+                                       0,
+#endif
+                                       0);
+#if CONFIG_COMP_INTRA_PRED
+  error4x4d = rd_pick_intra4x4mby_modes(cpi, x,
+                                        &rate4x4d, &rate4x4_tokenonly,
+                                        &dist4x4d, error16x16, 1, 0);
+#endif
+
+  mbmi->mb_skip_coeff = 0;
+  if (cpi->common.mb_no_coeff_skip &&
+      y_intra16x16_skippable && uv_intra_skippable_8x8) {
+    mbmi->mb_skip_coeff = 1;
+    mbmi->mode = mode16x16;
+    mbmi->uv_mode = modeuv;
+    rate = rateuv8x8 + rate16x16 - rateuv8x8_tokenonly - rate16x16_tokenonly +
+           vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1);
+    dist = dist16x16 + (distuv8x8 >> 2);
+    mbmi->txfm_size = txfm_size_16x16;
+    memset(x->mb_context[xd->mb_index].txfm_rd_diff, 0,
+           sizeof(x->mb_context[xd->mb_index].txfm_rd_diff));
+  } else if (error8x8 > error16x16) {
+    if (error4x4 < error16x16) {
+      rate = rateuv;
+#if CONFIG_COMP_INTRA_PRED
+      rate += (error4x4d < error4x4) ? rate4x4d : rate4x4;
+      if (error4x4d >= error4x4) // FIXME save original modes etc.
+        error4x4 = rd_pick_intra4x4mby_modes(cpi, x, &rate4x4,
+                                             &rate4x4_tokenonly,
+                                             &dist4x4, error16x16, 0,
+                                             cpi->update_context);
+#else
+      rate += rate4x4;
+#endif
+      mbmi->mode = B_PRED;
+      mbmi->txfm_size = TX_4X4;
+      dist = dist4x4 + (distuv >> 2);
+      memset(x->mb_context[xd->mb_index].txfm_rd_diff, 0,
+             sizeof(x->mb_context[xd->mb_index].txfm_rd_diff));
+    } else {
+      mbmi->txfm_size = txfm_size_16x16;
+      mbmi->mode = mode16x16;
+      rate = rate16x16 + rateuv8x8;
+      dist = dist16x16 + (distuv8x8 >> 2);
+      for (i = 0; i < NB_TXFM_MODES; i++) {
+        x->mb_context[xd->mb_index].txfm_rd_diff[i] = error16x16 - txfm_cache[i];
+      }
+    }
+    if (cpi->common.mb_no_coeff_skip)
+      rate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
+  } else {
+    if (error4x4 < error8x8) {
+      rate = rateuv;
+#if CONFIG_COMP_INTRA_PRED
+      rate += (error4x4d < error4x4) ? rate4x4d : rate4x4;
+      if (error4x4d >= error4x4) // FIXME save original modes etc.
+        error4x4 = rd_pick_intra4x4mby_modes(cpi, x, &rate4x4,
+                                             &rate4x4_tokenonly,
+                                             &dist4x4, error16x16, 0,
+                                             cpi->update_context);
+#else
+      rate += rate4x4;
+#endif
+      mbmi->mode = B_PRED;
+      mbmi->txfm_size = TX_4X4;
+      dist = dist4x4 + (distuv >> 2);
+      memset(x->mb_context[xd->mb_index].txfm_rd_diff, 0,
+             sizeof(x->mb_context[xd->mb_index].txfm_rd_diff));
+    } else {
+      // FIXME(rbultje) support transform-size selection
+      mbmi->mode = I8X8_PRED;
+      mbmi->txfm_size = (cm->txfm_mode == ONLY_4X4) ? TX_4X4 : TX_8X8;
+      set_i8x8_block_modes(x, mode8x8);
+      rate = rate8x8 + rateuv;
+      dist = dist8x8 + (distuv >> 2);
+      memset(x->mb_context[xd->mb_index].txfm_rd_diff, 0,
+             sizeof(x->mb_context[xd->mb_index].txfm_rd_diff));
+    }
+    if (cpi->common.mb_no_coeff_skip)
+      rate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
+  }
+
+  *returnrate = rate;
+  *returndist = dist;
+}
+
+#if CONFIG_SUPERBLOCKS
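+// Inter mode selection for 32x32 superblocks. This mirrors
+// vp9_rd_pick_inter_mode(), except that intra and SPLITMV modes are not
+// yet supported here and prediction is done on BLOCK_32X32 via
+// handle_inter_mode().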
+int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
+                                  int recon_yoffset, int recon_uvoffset,
+                                  int *returnrate, int *returndistortion) {
+  VP9_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+  MB_PREDICTION_MODE this_mode;
+  MV_REFERENCE_FRAME ref_frame;
+  unsigned char segment_id = xd->mode_info_context->mbmi.segment_id;
+  int comp_pred;
+  int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
+  int_mv frame_best_ref_mv[4];
+  int frame_mdcounts[4][4];
+  unsigned char *y_buffer[4];
+  unsigned char *u_buffer[4];
+  unsigned char *v_buffer[4];
+  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
+                                    VP9_ALT_FLAG };
+  int idx_list[4] = { 0, cpi->common.lst_fb_idx, cpi->common.gld_fb_idx,
+                      cpi->common.alt_fb_idx };
+  int mdcounts[4];
+  int near_sadidx[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
+  int saddone = 0;
+  int64_t best_rd = INT64_MAX;
+  int64_t best_comp_rd = INT64_MAX;
+  int64_t best_single_rd = INT64_MAX;
+  int64_t best_hybrid_rd = INT64_MAX;
+  int64_t best_yrd = INT64_MAX;
+  MB_MODE_INFO best_mbmode;
+  int mode_index, best_mode_index;
+  unsigned int ref_costs[MAX_REF_FRAMES];
+
+  x->skip = 0;
+  xd->mode_info_context->mbmi.segment_id = segment_id;
+  estimate_ref_frame_costs(cpi, segment_id, ref_costs);
+  vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
+
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+    if (cpi->ref_frame_flags & flag_list[ref_frame]) {
+      setup_buffer_inter(cpi, x, idx_list[ref_frame], ref_frame,
+                         recon_yoffset, recon_uvoffset, frame_mv[NEARESTMV],
+                         frame_mv[NEARMV], frame_best_ref_mv,
+                         frame_mdcounts, y_buffer, u_buffer, v_buffer);
+    }
+    frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
+    frame_mv[ZEROMV][ref_frame].as_int = 0;
+  }
+
+  for (mode_index = 0; mode_index < MAX_MODES; mode_index++) {
+    int mode_excluded;
+    int64_t this_rd = INT64_MAX;
+    int disable_skip = 0;
+    int other_cost = 0;
+    int compmode_cost = 0;
+    int rate2 = 0, rate_y = 0, rate_uv = 0;
+    int distortion2 = 0, distortion_y = 0, distortion_uv = 0;
+    int skippable;
+    int64_t txfm_cache[NB_TXFM_MODES];
+
+    // Test best rd so far against threshold for trying this mode.
+    if (best_rd <= cpi->rd_threshes[mode_index]) {
+      continue;
+    }
+
+    this_mode = vp9_mode_order[mode_index].mode;
+    ref_frame = vp9_mode_order[mode_index].ref_frame;
+    mbmi->ref_frame = ref_frame;
+    comp_pred = vp9_mode_order[mode_index].second_ref_frame != INTRA_FRAME;
+    mbmi->mode = this_mode;
+    mbmi->uv_mode = DC_PRED;
+#if CONFIG_COMP_INTRA_PRED
+    mbmi->second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
+    mbmi->second_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
+#endif
+
+    if (!(cpi->ref_frame_flags & flag_list[ref_frame]))
+      continue;
+
+    // Not yet supported, or not applicable to superblocks.
+    // TODO(rbultje): support intra coding
+    if (ref_frame == INTRA_FRAME || this_mode == SPLITMV)
+      continue;
+
+    if (comp_pred) {
+      int second_ref;
+
+      if (ref_frame == ALTREF_FRAME) {
+        second_ref = LAST_FRAME;
+      } else {
+        second_ref = ref_frame + 1;
+      }
+      if (!(cpi->ref_frame_flags & flag_list[second_ref]))
+        continue;
+      mbmi->second_ref_frame = second_ref;
+
+      xd->second_pre.y_buffer = y_buffer[second_ref];
+      xd->second_pre.u_buffer = u_buffer[second_ref];
+      xd->second_pre.v_buffer = v_buffer[second_ref];
+      mode_excluded = cm->comp_pred_mode == SINGLE_PREDICTION_ONLY;
+    } else {
+      mbmi->second_ref_frame = INTRA_FRAME;
+      mode_excluded = cm->comp_pred_mode == COMP_PREDICTION_ONLY;
+    }
+
+    xd->pre.y_buffer = y_buffer[ref_frame];
+    xd->pre.u_buffer = u_buffer[ref_frame];
+    xd->pre.v_buffer = v_buffer[ref_frame];
+    vpx_memcpy(mdcounts, frame_mdcounts[ref_frame], sizeof(mdcounts));
+
+    // If the segment reference frame feature is enabled,
+    // skip this mode if the current ref frame is not allowed.
+    if (vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
+        !vp9_check_segref(xd, segment_id, ref_frame)) {
+      continue;
+    // If the segment mode feature is enabled,
+    // skip this mode if the current mode is not allowed.
+    } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE) &&
+               (this_mode != vp9_get_segdata(xd, segment_id, SEG_LVL_MODE))) {
+      continue;
+    // Disable this drop-out case if either the mode or ref frame
+    // segment level feature is enabled for this segment. This is to
+    // prevent the possibility that we end up unable to pick any mode.
+    } else if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
+               !vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
+      // Only consider ZEROMV/ALTREF_FRAME for alt ref frames,
+      // unless ARNR filtering is enabled, in which case we want
+      // an unfiltered alternative.
+      if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
+        if (this_mode != ZEROMV || ref_frame != ALTREF_FRAME) {
+          continue;
+        }
+      }
+    }
+
+    this_rd = handle_inter_mode(cpi, x, BLOCK_32X32,
+                                &saddone, near_sadidx, mdcounts, txfm_cache,
+                                &rate2, &distortion2, &skippable,
+                                &compmode_cost, &rate_y, &distortion_y,
+                                &rate_uv, &distortion_uv,
+                                &mode_excluded, &disable_skip, recon_yoffset,
+                                mode_index, frame_mv, frame_best_ref_mv);
+    if (this_rd == INT64_MAX)
+      continue;
+
+    if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
+      rate2 += compmode_cost;
+    }
+
+    // Estimate the reference frame signaling cost and add it
+    // to the rolling cost variable.
+    rate2 += ref_costs[xd->mode_info_context->mbmi.ref_frame];
+
+    if (!disable_skip) {
+      // Test for the condition where skip block will be activated
+      // because there are no non-zero coefficients and make any
+      // necessary adjustment for rate. Ignore if skip is coded at
+      // segment level as the cost won't have been added in.
+      if (cpi->common.mb_no_coeff_skip) {
+        int mb_skip_allowed;
+
+        // Is MB-level skip allowed for this MB?
+        mb_skip_allowed =
+          !vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
+          vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+
+        if (skippable) {
+          // Back out the coefficient coding costs
+          rate2 -= (rate_y + rate_uv);
+          // for best_yrd calculation
+          rate_uv = 0;
+
+          if (mb_skip_allowed) {
+            int prob_skip_cost;
+
+            // Cost the skip mb case
+            vp9_prob skip_prob =
+              vp9_get_pred_prob(cm, xd, PRED_MBSKIP);
+
+            if (skip_prob) {
+              prob_skip_cost = vp9_cost_bit(skip_prob, 1);
+              rate2 += prob_skip_cost;
+              other_cost += prob_skip_cost;
+            }
+          }
+        }
+        // Add in the cost of the no-skip flag.
+        else if (mb_skip_allowed) {
+          int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob(cm, xd,
+                                                          PRED_MBSKIP), 0);
+          rate2 += prob_skip_cost;
+          other_cost += prob_skip_cost;
+        }
+      }
+
+      // Calculate the final RD estimate for this mode.
+      this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+    }
+
+#if 0
+    // Keep record of best intra distortion
+    if ((xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) &&
+        (this_rd < best_intra_rd)) {
+      best_intra_rd = this_rd;
+      *returnintra = distortion2;
+    }
+#endif
+
+    if (!disable_skip && xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
+      if (this_rd < best_comp_rd)
+        best_comp_rd = this_rd;
+      if (this_rd < best_single_rd)
+        best_single_rd = this_rd;
+      if (this_rd < best_hybrid_rd)
+        best_hybrid_rd = this_rd;
+    }
+
+    // Did this mode help, i.e. is it the new best mode?
+    if (this_rd < best_rd || x->skip) {
+      if (!mode_excluded) {
+        // Note index of best mode so far
+        best_mode_index = mode_index;
+
+#if 0
+        if (this_mode <= B_PRED) {
+          xd->mode_info_context->mbmi.uv_mode = uv_intra_mode_8x8;
+          /* required for left and above block mv */
+          xd->mode_info_context->mbmi.mv.as_int = 0;
+        }
+#endif
+
+        other_cost += ref_costs[xd->mode_info_context->mbmi.ref_frame];
+
+        /* Calculate the final y RD estimate for this mode */
+        best_yrd = RDCOST(x->rdmult, x->rddiv, (rate2 - rate_uv - other_cost),
+                          (distortion2 - distortion_uv));
+
+        *returnrate = rate2;
+        *returndistortion = distortion2;
+        best_rd = this_rd;
+        vpx_memcpy(&best_mbmode, mbmi, sizeof(MB_MODE_INFO));
+      }
+#if 0
+      // Testing this mode gave rise to an improvement in best error score.
+      // Lower the threshold a bit for next time.
+      cpi->rd_thresh_mult[mode_index] =
+          (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ?
+          cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
+      cpi->rd_threshes[mode_index] =
+          (cpi->rd_baseline_thresh[mode_index] >> 7) *
+          cpi->rd_thresh_mult[mode_index];
+#endif
+    }
+    // If the mode did not help improve the best error case then raise
+    // the threshold for testing that mode next time around.
+    else {
+#if 0
+      cpi->rd_thresh_mult[mode_index] += 4;
+
+      if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
+        cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
+
+      cpi->rd_threshes[mode_index] =
+          (cpi->rd_baseline_thresh[mode_index] >> 7) *
+          cpi->rd_thresh_mult[mode_index];
+#endif
+    }
+
+    /* keep record of best compound/single-only prediction */
+    if (!disable_skip && mbmi->ref_frame != INTRA_FRAME) {
+      int single_rd, hybrid_rd, single_rate, hybrid_rate;
+
+      if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
+        single_rate = rate2 - compmode_cost;
+        hybrid_rate = rate2;
+      } else {
+        single_rate = rate2;
+        hybrid_rate = rate2 + compmode_cost;
+      }
+
+      single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
+      hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
+
+      if (mbmi->second_ref_frame == INTRA_FRAME && single_rd < best_single_rd) {
+        best_single_rd = single_rd;
+      } else if (mbmi->second_ref_frame != INTRA_FRAME &&
+                 single_rd < best_comp_rd) {
+        best_comp_rd = single_rd;
+      }
+      if (hybrid_rd < best_hybrid_rd) {
+        best_hybrid_rd = hybrid_rd;
+      }
+    }
+
+    if (x->skip && !mode_excluded)
+      break;
+  }
+
+  // TODO(rbultje) integrate with RD thresholding
+#if 0
+  // Reduce the activation RD thresholds for the best choice mode
+  if ((cpi->rd_baseline_thresh[best_mode_index] > 0) &&
+      (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2))) {
+    int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 2);
+
+    cpi->rd_thresh_mult[best_mode_index] =
+      (cpi->rd_thresh_mult[best_mode_index] >= (MIN_THRESHMULT + best_adjustment)) ?
+      cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT;
+    cpi->rd_threshes[best_mode_index] =
+      (cpi->rd_baseline_thresh[best_mode_index] >> 7) * cpi->rd_thresh_mult[best_mode_index];
+  }
+#endif
+
+  // This code forces (ALTREF, 0, 0) and skip for the frame that overlays
+  // an altref, unless the altref is filtered. However, this is unsafe if
+  // segment-level coding of ref frame or mode is enabled for this
+  // segment.
+  if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
+      !vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE) &&
+      cpi->is_src_frame_alt_ref &&
+      (cpi->oxcf.arnr_max_frames == 0) &&
+      (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) {
+    mbmi->mode = ZEROMV;
+    mbmi->ref_frame = ALTREF_FRAME;
+    mbmi->second_ref_frame = 0;
+    mbmi->mv[0].as_int = 0;
+    mbmi->uv_mode = DC_PRED;
+    mbmi->mb_skip_coeff = (cpi->common.mb_no_coeff_skip) ? 1 : 0;
+    mbmi->partitioning = 0;
+    mbmi->txfm_size = TX_8X8;
+
+    if (best_rd != INT64_MAX)
+      store_coding_context(x, &x->sb_context[0], best_mode_index, NULL,
+                           &frame_best_ref_mv[mbmi->ref_frame],
+                           &frame_best_ref_mv[mbmi->second_ref_frame],
+                           0, 0, 0, NULL);
+    return best_rd;
+  }
+
+  // macroblock modes
+  vpx_memcpy(mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
+  mbmi->txfm_size = TX_8X8;
+
+  if (best_rd != INT64_MAX)
+    store_coding_context(x, &x->sb_context[0], best_mode_index, NULL,
+                         &frame_best_ref_mv[mbmi->ref_frame],
+                         &frame_best_ref_mv[mbmi->second_ref_frame],
+                         (best_single_rd == INT64_MAX) ? INT_MIN :
+                                        (best_rd - best_single_rd),
+                         (best_comp_rd   == INT64_MAX) ? INT_MIN :
+                                        (best_rd - best_comp_rd),
+                         (best_hybrid_rd == INT64_MAX) ? INT_MIN :
+                                        (best_rd - best_hybrid_rd),
+                         NULL);
+
+  return best_rd;
+}
+#endif
+
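+// Per-macroblock entry point for inter-frame mode selection: applies the
+// segment's encode-breakout setting, runs the rate-distortion mode search
+// and stores the resulting distortion and intra error for the caller.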
+void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
+                                    int recon_yoffset,
+                                    int recon_uvoffset,
+                                    int *totalrate, int *totaldist) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
+  int rate, distortion;
+  int64_t intra_error = 0;
+  unsigned char *segment_id = &mbmi->segment_id;
+
+  if (xd->segmentation_enabled)
+    x->encode_breakout = cpi->segment_encode_breakout[*segment_id];
+  else
+    x->encode_breakout = cpi->oxcf.encode_breakout;
+
+  // if (cpi->sf.RD)
+  // For now this codebase is limited to a single RD encode path.
+  {
+    int zbin_mode_boost_enabled = cpi->zbin_mode_boost_enabled;
+
+    vp9_rd_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate,
+                           &distortion, &intra_error);
+
+    /* restore cpi->zbin_mode_boost_enabled */
+    cpi->zbin_mode_boost_enabled = zbin_mode_boost_enabled;
+  }
+  // else
+  // The non-RD encode path has been deleted from this codebase
+  // to simplify development:
+  //    vp9_pick_inter_mode
+
+  // Store metrics so they can be added into the totals if this mode is picked.
+  x->mb_context[xd->mb_index].distortion  = distortion;
+  x->mb_context[xd->mb_index].intra_error = intra_error;
+
+  *totalrate = rate;
+  *totaldist = distortion;
+}
--- /dev/null
+++ b/vp9/encoder/rdopt.h
@@ -1,0 +1,41 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_RDOPT_H
+#define __INC_RDOPT_H
+
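+/* Rate-distortion cost in fixed point: the rate term is weighted by the
+ * rate multiplier RM with 8 fractional bits (the 128 provides rounding),
+ * and the distortion term is weighted by the distortion multiplier DM:
+ *   cost = ((R * RM + 128) >> 8) + D * DM */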
+#define RDCOST(RM, DM, R, D) \
+  (((128 + ((int64_t)(R)) * (RM)) >> 8) + ((int64_t)(DM)) * (D))
+#define RDCOST_8x8(RM, DM, R, D) \
+  (((128 + ((int64_t)(R)) * (RM)) >> 8) + ((int64_t)(DM)) * (D))
+
+extern void vp9_initialize_rd_consts(VP9_COMP *cpi, int Qvalue);
+
+extern void vp9_rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
+                                   int recon_yoffset, int recon_uvoffset,
+                                   int *returnrate, int *returndistortion,
+                                   int64_t *returnintra);
+
+extern void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
+                                   int *r, int *d);
+
+extern void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
+                                      int *r, int *d);
+
+extern void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCKD *xd,
+                        const MODE_INFO *here, int_mv *mvp,
+                        int refframe, int *ref_frame_sign_bias,
+                        int *sr, int near_sadidx[]);
+
+extern void vp9_init_me_luts(void);
+
+extern void vp9_set_mbmode_and_mvs(MACROBLOCK *x,
+                                   MB_PREDICTION_MODE mb, int_mv *mv);
+
+#endif
--- /dev/null
+++ b/vp9/encoder/sad_c.c
@@ -1,0 +1,480 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include "vp9/common/sadmxn.h"
+#include "vpx_ports/config.h"
+#include "vpx/vpx_integer.h"
+
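+/* Plain C reference implementations of the sum of absolute differences
+ * (SAD). The max_sad argument exists for signature compatibility with
+ * optimized versions that may exit early once the threshold is exceeded;
+ * the C versions ignore it and always compute the full SAD. */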
+unsigned int vp9_sad32x32_c(const unsigned char *src_ptr,
+                            int  src_stride,
+                            const unsigned char *ref_ptr,
+                            int  ref_stride,
+                            int max_sad) {
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 32);
+}
+
+unsigned int vp9_sad16x16_c(const unsigned char *src_ptr,
+                            int  src_stride,
+                            const unsigned char *ref_ptr,
+                            int  ref_stride,
+                            int max_sad) {
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16);
+}
+
+unsigned int vp9_sad8x8_c(const unsigned char *src_ptr,
+                          int  src_stride,
+                          const unsigned char *ref_ptr,
+                          int  ref_stride,
+                          int max_sad) {
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 8);
+}
+
+
+unsigned int vp9_sad16x8_c(const unsigned char *src_ptr,
+                           int  src_stride,
+                           const unsigned char *ref_ptr,
+                           int  ref_stride,
+                           int max_sad) {
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 8);
+}
+
+unsigned int vp9_sad8x16_c(const unsigned char *src_ptr,
+                           int  src_stride,
+                           const unsigned char *ref_ptr,
+                           int  ref_stride,
+                           int max_sad) {
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 16);
+}
+
+
+unsigned int vp9_sad4x4_c(const unsigned char *src_ptr,
+                          int  src_stride,
+                          const unsigned char *ref_ptr,
+                          int  ref_stride,
+                          int max_sad) {
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 4);
+}
+
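+/* The x3/x8 variants return the SADs at 3 or 8 consecutive horizontal
+ * offsets of the reference pointer, letting the motion search evaluate a
+ * run of candidate positions in one call. */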
+void vp9_sad32x32x3_c(const unsigned char *src_ptr,
+                      int  src_stride,
+                      const unsigned char *ref_ptr,
+                      int  ref_stride,
+                      unsigned int *sad_array) {
+  sad_array[0] = vp9_sad32x32_c(src_ptr, src_stride,
+                                ref_ptr, ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad32x32_c(src_ptr, src_stride,
+                                ref_ptr + 1, ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad32x32_c(src_ptr, src_stride,
+                                ref_ptr + 2, ref_stride, 0x7fffffff);
+}
+
+void vp9_sad32x32x8_c(const unsigned char *src_ptr,
+                      int  src_stride,
+                      const unsigned char *ref_ptr,
+                      int  ref_stride,
+                      unsigned short *sad_array) {
+  sad_array[0] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
+                                                ref_ptr, ref_stride,
+                                                0x7fffffff);
+  sad_array[1] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
+                                                ref_ptr + 1, ref_stride,
+                                                0x7fffffff);
+  sad_array[2] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
+                                                ref_ptr + 2, ref_stride,
+                                                0x7fffffff);
+  sad_array[3] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
+                                                ref_ptr + 3, ref_stride,
+                                                0x7fffffff);
+  sad_array[4] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
+                                                ref_ptr + 4, ref_stride,
+                                                0x7fffffff);
+  sad_array[5] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
+                                                ref_ptr + 5, ref_stride,
+                                                0x7fffffff);
+  sad_array[6] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
+                                                ref_ptr + 6, ref_stride,
+                                                0x7fffffff);
+  sad_array[7] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
+                                                ref_ptr + 7, ref_stride,
+                                                0x7fffffff);
+}
+
+void vp9_sad16x16x3_c(const unsigned char *src_ptr,
+                      int  src_stride,
+                      const unsigned char *ref_ptr,
+                      int  ref_stride,
+                      unsigned int *sad_array) {
+  sad_array[0] = vp9_sad16x16_c(src_ptr, src_stride,
+                                ref_ptr, ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad16x16_c(src_ptr, src_stride,
+                                ref_ptr + 1, ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad16x16_c(src_ptr, src_stride,
+                                ref_ptr + 2, ref_stride, 0x7fffffff);
+}
+
+void vp9_sad16x16x8_c(const unsigned char *src_ptr,
+                      int  src_stride,
+                      const unsigned char *ref_ptr,
+                      int  ref_stride,
+                      unsigned short *sad_array) {
+  sad_array[0] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
+                                                ref_ptr, ref_stride,
+                                                0x7fffffff);
+  sad_array[1] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
+                                                ref_ptr + 1, ref_stride,
+                                                0x7fffffff);
+  sad_array[2] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
+                                                ref_ptr + 2, ref_stride,
+                                                0x7fffffff);
+  sad_array[3] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
+                                                ref_ptr + 3, ref_stride,
+                                                0x7fffffff);
+  sad_array[4] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
+                                                ref_ptr + 4, ref_stride,
+                                                0x7fffffff);
+  sad_array[5] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
+                                                ref_ptr + 5, ref_stride,
+                                                0x7fffffff);
+  sad_array[6] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
+                                                ref_ptr + 6, ref_stride,
+                                                0x7fffffff);
+  sad_array[7] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
+                                                ref_ptr + 7, ref_stride,
+                                                0x7fffffff);
+}
+
+void vp9_sad16x8x3_c(const unsigned char *src_ptr,
+                     int  src_stride,
+                     const unsigned char *ref_ptr,
+                     int  ref_stride,
+                     unsigned int *sad_array) {
+  sad_array[0] = vp9_sad16x8_c(src_ptr, src_stride,
+                               ref_ptr, ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad16x8_c(src_ptr, src_stride,
+                               ref_ptr + 1, ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad16x8_c(src_ptr, src_stride,
+                               ref_ptr + 2, ref_stride, 0x7fffffff);
+}
+
+void vp9_sad16x8x8_c(const unsigned char *src_ptr,
+                     int  src_stride,
+                     const unsigned char *ref_ptr,
+                     int  ref_stride,
+                     unsigned short *sad_array) {
+  sad_array[0] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
+                                               ref_ptr, ref_stride,
+                                               0x7fffffff);
+  sad_array[1] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
+                                               ref_ptr + 1, ref_stride,
+                                               0x7fffffff);
+  sad_array[2] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
+                                               ref_ptr + 2, ref_stride,
+                                               0x7fffffff);
+  sad_array[3] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
+                                               ref_ptr + 3, ref_stride,
+                                               0x7fffffff);
+  sad_array[4] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
+                                               ref_ptr + 4, ref_stride,
+                                               0x7fffffff);
+  sad_array[5] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
+                                               ref_ptr + 5, ref_stride,
+                                               0x7fffffff);
+  sad_array[6] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
+                                               ref_ptr + 6, ref_stride,
+                                               0x7fffffff);
+  sad_array[7] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
+                                               ref_ptr + 7, ref_stride,
+                                               0x7fffffff);
+}
+
+void vp9_sad8x8x3_c(const unsigned char *src_ptr,
+                    int  src_stride,
+                    const unsigned char *ref_ptr,
+                    int  ref_stride,
+                    unsigned int *sad_array) {
+  sad_array[0] = vp9_sad8x8_c(src_ptr, src_stride,
+                              ref_ptr, ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad8x8_c(src_ptr, src_stride,
+                              ref_ptr + 1, ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad8x8_c(src_ptr, src_stride,
+                              ref_ptr + 2, ref_stride, 0x7fffffff);
+}
+
+void vp9_sad8x8x8_c(const unsigned char *src_ptr,
+                    int  src_stride,
+                    const unsigned char *ref_ptr,
+                    int  ref_stride,
+                    unsigned short *sad_array) {
+  sad_array[0] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
+                                              ref_ptr, ref_stride,
+                                              0x7fffffff);
+  sad_array[1] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
+                                              ref_ptr + 1, ref_stride,
+                                              0x7fffffff);
+  sad_array[2] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
+                                              ref_ptr + 2, ref_stride,
+                                              0x7fffffff);
+  sad_array[3] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
+                                              ref_ptr + 3, ref_stride,
+                                              0x7fffffff);
+  sad_array[4] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
+                                              ref_ptr + 4, ref_stride,
+                                              0x7fffffff);
+  sad_array[5] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
+                                              ref_ptr + 5, ref_stride,
+                                              0x7fffffff);
+  sad_array[6] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
+                                              ref_ptr + 6, ref_stride,
+                                              0x7fffffff);
+  sad_array[7] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
+                                              ref_ptr + 7, ref_stride,
+                                              0x7fffffff);
+}
+
+void vp9_sad8x16x3_c(const unsigned char *src_ptr,
+                     int  src_stride,
+                     const unsigned char *ref_ptr,
+                     int  ref_stride,
+                     unsigned int *sad_array) {
+  sad_array[0] = vp9_sad8x16_c(src_ptr, src_stride,
+                               ref_ptr, ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad8x16_c(src_ptr, src_stride,
+                               ref_ptr + 1, ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad8x16_c(src_ptr, src_stride,
+                               ref_ptr + 2, ref_stride, 0x7fffffff);
+}
+
+void vp9_sad8x16x8_c(const unsigned char *src_ptr,
+                     int  src_stride,
+                     const unsigned char *ref_ptr,
+                     int  ref_stride,
+                     unsigned short *sad_array) {
+  sad_array[0] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
+                                               ref_ptr, ref_stride,
+                                               0x7fffffff);
+  sad_array[1] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
+                                               ref_ptr + 1, ref_stride,
+                                               0x7fffffff);
+  sad_array[2] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
+                                               ref_ptr + 2, ref_stride,
+                                               0x7fffffff);
+  sad_array[3] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
+                                               ref_ptr + 3, ref_stride,
+                                               0x7fffffff);
+  sad_array[4] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
+                                               ref_ptr + 4, ref_stride,
+                                               0x7fffffff);
+  sad_array[5] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
+                                               ref_ptr + 5, ref_stride,
+                                               0x7fffffff);
+  sad_array[6] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
+                                               ref_ptr + 6, ref_stride,
+                                               0x7fffffff);
+  sad_array[7] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
+                                               ref_ptr + 7, ref_stride,
+                                               0x7fffffff);
+}
+
+void vp9_sad4x4x3_c(const unsigned char *src_ptr,
+                    int  src_stride,
+                    const unsigned char *ref_ptr,
+                    int  ref_stride,
+                    unsigned int *sad_array) {
+  sad_array[0] = vp9_sad4x4_c(src_ptr, src_stride,
+                              ref_ptr, ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad4x4_c(src_ptr, src_stride,
+                              ref_ptr + 1, ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad4x4_c(src_ptr, src_stride,
+                              ref_ptr + 2, ref_stride, 0x7fffffff);
+}
+
+void vp9_sad4x4x8_c(const unsigned char *src_ptr,
+                    int  src_stride,
+                    const unsigned char *ref_ptr,
+                    int  ref_stride,
+                    unsigned short *sad_array) {
+  sad_array[0] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
+                                              ref_ptr, ref_stride,
+                                              0x7fffffff);
+  sad_array[1] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
+                                              ref_ptr + 1, ref_stride,
+                                              0x7fffffff);
+  sad_array[2] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
+                                              ref_ptr + 2, ref_stride,
+                                              0x7fffffff);
+  sad_array[3] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
+                                              ref_ptr + 3, ref_stride,
+                                              0x7fffffff);
+  sad_array[4] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
+                                              ref_ptr + 4, ref_stride,
+                                              0x7fffffff);
+  sad_array[5] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
+                                              ref_ptr + 5, ref_stride,
+                                              0x7fffffff);
+  sad_array[6] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
+                                              ref_ptr + 6, ref_stride,
+                                              0x7fffffff);
+  sad_array[7] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
+                                              ref_ptr + 7, ref_stride,
+                                              0x7fffffff);
+}
+
+void vp9_sad32x32x4d_c(const unsigned char *src_ptr,
+                       int  src_stride,
+                       unsigned char *ref_ptr[],
+                       int  ref_stride,
+                       unsigned int *sad_array) {
+  sad_array[0] = vp9_sad32x32_c(src_ptr, src_stride,
+                                ref_ptr[0], ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad32x32_c(src_ptr, src_stride,
+                                ref_ptr[1], ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad32x32_c(src_ptr, src_stride,
+                                ref_ptr[2], ref_stride, 0x7fffffff);
+  sad_array[3] = vp9_sad32x32_c(src_ptr, src_stride,
+                                ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
+void vp9_sad16x16x4d_c(const unsigned char *src_ptr,
+                       int  src_stride,
+                       unsigned char *ref_ptr[],
+                       int  ref_stride,
+                       unsigned int *sad_array) {
+  sad_array[0] = vp9_sad16x16_c(src_ptr, src_stride,
+                                ref_ptr[0], ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad16x16_c(src_ptr, src_stride,
+                                ref_ptr[1], ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad16x16_c(src_ptr, src_stride,
+                                ref_ptr[2], ref_stride, 0x7fffffff);
+  sad_array[3] = vp9_sad16x16_c(src_ptr, src_stride,
+                                ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
+void vp9_sad16x8x4d_c(const unsigned char *src_ptr,
+                      int  src_stride,
+                      unsigned char *ref_ptr[],
+                      int  ref_stride,
+                      unsigned int *sad_array) {
+  sad_array[0] = vp9_sad16x8_c(src_ptr, src_stride,
+                               ref_ptr[0], ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad16x8_c(src_ptr, src_stride,
+                               ref_ptr[1], ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad16x8_c(src_ptr, src_stride,
+                               ref_ptr[2], ref_stride, 0x7fffffff);
+  sad_array[3] = vp9_sad16x8_c(src_ptr, src_stride,
+                               ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
+void vp9_sad8x8x4d_c(const unsigned char *src_ptr,
+                     int  src_stride,
+                     unsigned char *ref_ptr[],
+                     int  ref_stride,
+                     unsigned int *sad_array) {
+  sad_array[0] = vp9_sad8x8_c(src_ptr, src_stride,
+                              ref_ptr[0], ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad8x8_c(src_ptr, src_stride,
+                              ref_ptr[1], ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad8x8_c(src_ptr, src_stride,
+                              ref_ptr[2], ref_stride, 0x7fffffff);
+  sad_array[3] = vp9_sad8x8_c(src_ptr, src_stride,
+                              ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
+void vp9_sad8x16x4d_c(const unsigned char *src_ptr,
+                      int  src_stride,
+                      unsigned char *ref_ptr[],
+                      int  ref_stride,
+                      unsigned int *sad_array) {
+  sad_array[0] = vp9_sad8x16_c(src_ptr, src_stride,
+                               ref_ptr[0], ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad8x16_c(src_ptr, src_stride,
+                               ref_ptr[1], ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad8x16_c(src_ptr, src_stride,
+                               ref_ptr[2], ref_stride, 0x7fffffff);
+  sad_array[3] = vp9_sad8x16_c(src_ptr, src_stride,
+                               ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
+void vp9_sad4x4x4d_c(const unsigned char *src_ptr,
+                     int  src_stride,
+                     unsigned char *ref_ptr[],
+                     int  ref_stride,
+                     unsigned int *sad_array) {
+  sad_array[0] = vp9_sad4x4_c(src_ptr, src_stride,
+                              ref_ptr[0], ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad4x4_c(src_ptr, src_stride,
+                              ref_ptr[1], ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad4x4_c(src_ptr, src_stride,
+                              ref_ptr[2], ref_stride, 0x7fffffff);
+  sad_array[3] = vp9_sad4x4_c(src_ptr, src_stride,
+                              ref_ptr[3], ref_stride, 0x7fffffff);
+}
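The x3/x8 variants above score three or eight horizontally adjacent candidates (ref_ptr + 0, + 1, ...) so that vectorized versions can share loads; the x4d variants score four arbitrary candidates passed as an array, which is the shape diamond-pattern motion searches want. A hedged usage sketch (the candidate positions and the helper name are hypothetical):

/* Illustrative only: pick the best of four hypothetical candidate
   positions using the 4-pointer SAD variant. */
static int best_of_four_sketch(const unsigned char *src, int src_stride,
                               unsigned char *ref, int ref_stride) {
  unsigned int sad[4];
  unsigned char *cand[4];
  int i, best = 0;

  cand[0] = ref;                /* hypothetical candidate set */
  cand[1] = ref + 1;
  cand[2] = ref + ref_stride;
  cand[3] = ref + ref_stride + 1;

  vp9_sad16x16x4d_c(src, src_stride, cand, ref_stride, sad);
  for (i = 1; i < 4; i++)
    if (sad[i] < sad[best])
      best = i;
  return best;                  /* index of the lowest-SAD candidate */
}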
+
+/* Copy a 32-pixel-wide strip (two macroblocks) to a buffer */
+void vp9_copy32xn_c(unsigned char *src_ptr,
+                    int  src_stride,
+                    unsigned char *dst_ptr,
+                    int  dst_stride,
+                    int height) {
+  int r;
+
+  for (r = 0; r < height; r++) {
+#if !(CONFIG_FAST_UNALIGNED)
+    dst_ptr[0] = src_ptr[0];
+    dst_ptr[1] = src_ptr[1];
+    dst_ptr[2] = src_ptr[2];
+    dst_ptr[3] = src_ptr[3];
+    dst_ptr[4] = src_ptr[4];
+    dst_ptr[5] = src_ptr[5];
+    dst_ptr[6] = src_ptr[6];
+    dst_ptr[7] = src_ptr[7];
+    dst_ptr[8] = src_ptr[8];
+    dst_ptr[9] = src_ptr[9];
+    dst_ptr[10] = src_ptr[10];
+    dst_ptr[11] = src_ptr[11];
+    dst_ptr[12] = src_ptr[12];
+    dst_ptr[13] = src_ptr[13];
+    dst_ptr[14] = src_ptr[14];
+    dst_ptr[15] = src_ptr[15];
+    dst_ptr[16] = src_ptr[16];
+    dst_ptr[17] = src_ptr[17];
+    dst_ptr[18] = src_ptr[18];
+    dst_ptr[19] = src_ptr[19];
+    dst_ptr[20] = src_ptr[20];
+    dst_ptr[21] = src_ptr[21];
+    dst_ptr[22] = src_ptr[22];
+    dst_ptr[23] = src_ptr[23];
+    dst_ptr[24] = src_ptr[24];
+    dst_ptr[25] = src_ptr[25];
+    dst_ptr[26] = src_ptr[26];
+    dst_ptr[27] = src_ptr[27];
+    dst_ptr[28] = src_ptr[28];
+    dst_ptr[29] = src_ptr[29];
+    dst_ptr[30] = src_ptr[30];
+    dst_ptr[31] = src_ptr[31];
+#else
+    ((uint32_t *)dst_ptr)[0] = ((uint32_t *)src_ptr)[0];
+    ((uint32_t *)dst_ptr)[1] = ((uint32_t *)src_ptr)[1];
+    ((uint32_t *)dst_ptr)[2] = ((uint32_t *)src_ptr)[2];
+    ((uint32_t *)dst_ptr)[3] = ((uint32_t *)src_ptr)[3];
+    ((uint32_t *)dst_ptr)[4] = ((uint32_t *)src_ptr)[4];
+    ((uint32_t *)dst_ptr)[5] = ((uint32_t *)src_ptr)[5];
+    ((uint32_t *)dst_ptr)[6] = ((uint32_t *)src_ptr)[6];
+    ((uint32_t *)dst_ptr)[7] = ((uint32_t *)src_ptr)[7];
+#endif
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  }
+}
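Both branches of vp9_copy32xn_c copy the same 32 bytes per row; the CONFIG_FAST_UNALIGNED path simply moves eight 32-bit words at a time on targets whose loads and stores tolerate misalignment. A portable per-row equivalent (a sketch, not the committed code) is just:

#include <string.h>

/* Sketch: per-row equivalent of either branch of vp9_copy32xn_c. */
static void copy32_row_sketch(unsigned char *dst, const unsigned char *src) {
  memcpy(dst, src, 32);  /* the same 32 bytes either branch copies */
}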
--- /dev/null
+++ b/vp9/encoder/satd_c.c
@@ -1,0 +1,47 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include "vpx_ports/mem.h"
+#include "./vpx_rtcd.h"
+unsigned int vp9_satd16x16_c(const unsigned char *src_ptr,
+                             int  src_stride,
+                             const unsigned char *ref_ptr,
+                             int  ref_stride,
+                             unsigned int *psatd) {
+  int r, c, i;
+  unsigned int satd = 0;
+  DECLARE_ALIGNED(16, short, diff_in[256]);
+  DECLARE_ALIGNED(16, short, diff_out[16]);
+  short *in;
+
+  for (r = 0; r < 16; r++) {
+    for (c = 0; c < 16; c++) {
+      diff_in[r * 16 + c] = src_ptr[c] - ref_ptr[c];
+    }
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  }
+
+  in = diff_in;
+  for (r = 0; r < 16; r += 4) {
+    for (c = 0; c < 16; c += 4) {
+      vp9_short_walsh4x4_c(in + c, diff_out, 32);
+      for (i = 0; i < 16; i++)
+        satd += abs(diff_out[i]);
+    }
+    in += 64;
+  }
+
+  if (psatd)
+    *psatd = satd;
+
+  return satd;
+}
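vp9_satd16x16_c computes a sum of absolute transformed differences: it forms the 16x16 residual, runs a 4x4 Walsh-Hadamard transform over each of the sixteen 4x4 sub-blocks, and accumulates the absolute transform coefficients. For intuition only, an unscaled 4x4 Hadamard (butterflies along rows, then columns) looks like the sketch below; the committed vp9_short_walsh4x4_c lives elsewhere in the tree and applies its own pitch handling and scaling:

/* Sketch of an unscaled 4x4 Hadamard transform; illustrative only. */
static void hadamard4x4_sketch(const short in[16], short out[16]) {
  short tmp[16];
  int i;

  for (i = 0; i < 4; i++) {  /* transform each row */
    short a = in[i * 4 + 0] + in[i * 4 + 2];
    short b = in[i * 4 + 1] + in[i * 4 + 3];
    short c = in[i * 4 + 0] - in[i * 4 + 2];
    short d = in[i * 4 + 1] - in[i * 4 + 3];
    tmp[i * 4 + 0] = a + b;
    tmp[i * 4 + 1] = a - b;
    tmp[i * 4 + 2] = c + d;
    tmp[i * 4 + 3] = c - d;
  }
  for (i = 0; i < 4; i++) {  /* then each column */
    short a = tmp[0 * 4 + i] + tmp[2 * 4 + i];
    short b = tmp[1 * 4 + i] + tmp[3 * 4 + i];
    short c = tmp[0 * 4 + i] - tmp[2 * 4 + i];
    short d = tmp[1 * 4 + i] - tmp[3 * 4 + i];
    out[0 * 4 + i] = a + b;
    out[1 * 4 + i] = a - b;
    out[2 * 4 + i] = c + d;
    out[3 * 4 + i] = c - d;
  }
}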
--- /dev/null
+++ b/vp9/encoder/segmentation.c
@@ -1,0 +1,327 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "limits.h"
+#include "vpx_mem/vpx_mem.h"
+#include "segmentation.h"
+#include "vp9/common/pred_common.h"
+
+void vp9_update_gf_useage_maps(VP9_COMP *cpi, VP9_COMMON *cm, MACROBLOCK *x) {
+  int mb_row, mb_col;
+
+  MODE_INFO *this_mb_mode_info = cm->mi;
+
+  x->gf_active_ptr = (signed char *)cpi->gf_active_flags;
+
+  if ((cm->frame_type == KEY_FRAME) || (cm->refresh_golden_frame)) {
+    // Reset GF usage monitors
+    vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
+    cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
+  } else {
+    // for each macroblock row in image
+    for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
+      // for each macroblock col in image
+      for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+
+        // If this MB references the golden or altref frame, set the GF
+        // active flag if it is not already set. If it uses last-frame
+        // 0,0 mode, leave the flag as-is; otherwise (non-0,0 motion or
+        // intra modes), clear the flag if it is currently set.
+        if ((this_mb_mode_info->mbmi.ref_frame == GOLDEN_FRAME) ||
+            (this_mb_mode_info->mbmi.ref_frame == ALTREF_FRAME)) {
+          if (*(x->gf_active_ptr) == 0) {
+            *(x->gf_active_ptr) = 1;
+            cpi->gf_active_count++;
+          }
+        } else if ((this_mb_mode_info->mbmi.mode != ZEROMV) &&
+                   *(x->gf_active_ptr)) {
+          *(x->gf_active_ptr) = 0;
+          cpi->gf_active_count--;
+        }
+
+        x->gf_active_ptr++;          // Step onto next entry
+        this_mb_mode_info++;         // skip to next mb
+
+      }
+
+      // this is to account for the border
+      this_mb_mode_info++;
+    }
+  }
+}
+
+void vp9_enable_segmentation(VP9_PTR ptr) {
+  VP9_COMP *cpi = (VP9_COMP *)(ptr);
+
+  // Set the appropriate feature bit
+  cpi->mb.e_mbd.segmentation_enabled = 1;
+  cpi->mb.e_mbd.update_mb_segmentation_map = 1;
+  cpi->mb.e_mbd.update_mb_segmentation_data = 1;
+}
+
+void vp9_disable_segmentation(VP9_PTR ptr) {
+  VP9_COMP *cpi = (VP9_COMP *)(ptr);
+
+  // Clear the appropriate feature bit
+  cpi->mb.e_mbd.segmentation_enabled = 0;
+}
+
+void vp9_set_segmentation_map(VP9_PTR ptr,
+                              unsigned char *segmentation_map) {
+  VP9_COMP *cpi = (VP9_COMP *)(ptr);
+
+  // Copy in the new segmentation map
+  vpx_memcpy(cpi->segmentation_map, segmentation_map,
+             (cpi->common.mb_rows * cpi->common.mb_cols));
+
+  // Signal that the map should be updated.
+  cpi->mb.e_mbd.update_mb_segmentation_map = 1;
+  cpi->mb.e_mbd.update_mb_segmentation_data = 1;
+}
+
+void vp9_set_segment_data(VP9_PTR ptr,
+                          signed char *feature_data,
+                          unsigned char abs_delta) {
+  VP9_COMP *cpi = (VP9_COMP *)(ptr);
+
+  cpi->mb.e_mbd.mb_segment_abs_delta = abs_delta;
+
+  vpx_memcpy(cpi->mb.e_mbd.segment_feature_data, feature_data,
+             sizeof(cpi->mb.e_mbd.segment_feature_data));
+
+  // TBD ?? Set the feature mask
+  // vpx_memcpy(cpi->mb.e_mbd.segment_feature_mask, 0,
+  //            sizeof(cpi->mb.e_mbd.segment_feature_mask));
+}
+
+// Based on set of segment counts calculate a probability tree
+static void calc_segtree_probs(MACROBLOCKD *xd,
+                               int *segcounts,
+                               vp9_prob *segment_tree_probs) {
+  int count1, count2;
+  int tot_count;
+  int i;
+
+  // Blank the structure to start with
+  vpx_memset(segment_tree_probs, 0,
+             MB_FEATURE_TREE_PROBS * sizeof(*segment_tree_probs));
+
+  // Total count for all segments
+  count1 = segcounts[0] + segcounts[1];
+  count2 = segcounts[2] + segcounts[3];
+  tot_count = count1 + count2;
+
+  // Work out probabilities of each segment
+  if (tot_count)
+    segment_tree_probs[0] = (count1 * 255) / tot_count;
+  if (count1 > 0)
+    segment_tree_probs[1] = (segcounts[0] * 255) / count1;
+  if (count2 > 0)
+    segment_tree_probs[2] = (segcounts[2] * 255) / count2;
+
+  // Clamp probabilities to minimum allowed value
+  for (i = 0; i < MB_FEATURE_TREE_PROBS; i++) {
+    if (segment_tree_probs[i] == 0)
+      segment_tree_probs[i] = 1;
+  }
+}
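The tree has three internal nodes: segment_tree_probs[0] splits segments {0,1} from {2,3}, [1] splits 0 from 1, and [2] splits 2 from 3, each scaled into the 1..255 range. A worked example with hypothetical counts:

/* Hypothetical counts: segcounts = {10, 20, 30, 40}
 *   count1 = 10 + 20 = 30, count2 = 30 + 40 = 70, tot_count = 100
 *   segment_tree_probs[0] = (30 * 255) / 100 = 76   P(segment in {0,1})
 *   segment_tree_probs[1] = (10 * 255) / 30  = 85   P(segment 0 | {0,1})
 *   segment_tree_probs[2] = (30 * 255) / 70  = 109  P(segment 2 | {2,3})
 * Any entry that comes out 0 is then clamped up to 1. */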
+
+// Based on set of segment counts and probabilities calculate a cost estimate
+static int cost_segmap(MACROBLOCKD *xd,
+                       int *segcounts,
+                       vp9_prob *probs) {
+  int cost;
+  int count1, count2;
+
+  // Cost the top node of the tree
+  count1 = segcounts[0] + segcounts[1];
+  count2 = segcounts[2] + segcounts[3];
+  cost = count1 * vp9_cost_zero(probs[0]) +
+         count2 * vp9_cost_one(probs[0]);
+
+  // Now add the cost of each individual segment branch
+  if (count1 > 0)
+    cost += segcounts[0] * vp9_cost_zero(probs[1]) +
+            segcounts[1] * vp9_cost_one(probs[1]);
+
+  if (count2 > 0)
+    cost += segcounts[2] * vp9_cost_zero(probs[2]) +
+            segcounts[3] * vp9_cost_one(probs[2]);
+
+  return cost;
+}
+
+void vp9_choose_segmap_coding_method(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+
+  const int mis = cm->mode_info_stride;
+  int i;
+  int tot_count;
+  int no_pred_cost;
+  int t_pred_cost = INT_MAX;
+  int pred_context;
+
+  int mb_row, mb_col;
+  int segmap_index = 0;
+  unsigned char segment_id;
+
+  int temporal_predictor_count[PREDICTION_PROBS][2];
+  int no_pred_segcounts[MAX_MB_SEGMENTS];
+  int t_unpred_seg_counts[MAX_MB_SEGMENTS];
+
+  vp9_prob no_pred_tree[MB_FEATURE_TREE_PROBS];
+  vp9_prob t_pred_tree[MB_FEATURE_TREE_PROBS];
+  vp9_prob t_nopred_prob[PREDICTION_PROBS];
+
+  // Set default state for the segment tree probabilities and the
+  // temporal coding probabilities
+  vpx_memset(xd->mb_segment_tree_probs, 255,
+             sizeof(xd->mb_segment_tree_probs));
+  vpx_memset(cm->segment_pred_probs, 255,
+             sizeof(cm->segment_pred_probs));
+
+  vpx_memset(no_pred_segcounts, 0, sizeof(no_pred_segcounts));
+  vpx_memset(t_unpred_seg_counts, 0, sizeof(t_unpred_seg_counts));
+  vpx_memset(temporal_predictor_count, 0, sizeof(temporal_predictor_count));
+
+  // First of all generate stats regarding how well the last segment map
+  // predicts this one
+
+  // Initialize macroblock decoder mode info context for the first mb
+  // in the frame
+  xd->mode_info_context = cm->mi;
+
+  for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 2) {
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col += 2) {
+      for (i = 0; i < 4; i++) {
+        static const int dx[4] = { +1, -1, +1, +1 };
+        static const int dy[4] = {  0, +1,  0, -1 };
+        int x_idx = i & 1, y_idx = i >> 1;
+
+        if (mb_col + x_idx >= cm->mb_cols ||
+            mb_row + y_idx >= cm->mb_rows) {
+          goto end;
+        }
+
+        xd->mb_to_top_edge = -((mb_row * 16) << 3);
+        xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
+        xd->mb_to_left_edge = -((mb_col * 16) << 3);
+        xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
+
+        segmap_index = (mb_row + y_idx) * cm->mb_cols + mb_col + x_idx;
+        segment_id = xd->mode_info_context->mbmi.segment_id;
+#if CONFIG_SUPERBLOCKS
+        if (xd->mode_info_context->mbmi.encoded_as_sb) {
+          if (mb_col + 1 < cm->mb_cols)
+            segment_id = segment_id &&
+                         xd->mode_info_context[1].mbmi.segment_id;
+          if (mb_row + 1 < cm->mb_rows) {
+            segment_id = segment_id &&
+                         xd->mode_info_context[mis].mbmi.segment_id;
+            if (mb_col + 1 < cm->mb_cols)
+              segment_id = segment_id &&
+                           xd->mode_info_context[mis + 1].mbmi.segment_id;
+          }
+        }
+#endif
+
+        // Count the number of hits on each segment with no prediction
+        no_pred_segcounts[segment_id]++;
+
+        // Temporal prediction not allowed on key frames
+        if (cm->frame_type != KEY_FRAME) {
+          // Test to see if the segment id matches the predicted value.
+          int seg_predicted =
+            (segment_id == vp9_get_pred_mb_segid(cm, xd, segmap_index));
+
+          // Get the segment id prediction context
+          pred_context =
+            vp9_get_pred_context(cm, xd, PRED_SEG_ID);
+
+          // Store the prediction status for this mb and update counts
+          // as appropriate
+          vp9_set_pred_flag(xd, PRED_SEG_ID, seg_predicted);
+          temporal_predictor_count[pred_context][seg_predicted]++;
+
+          if (!seg_predicted)
+            // Update the "unpredicted" segment count
+            t_unpred_seg_counts[segment_id]++;
+        }
+
+#if CONFIG_SUPERBLOCKS
+        if (xd->mode_info_context->mbmi.encoded_as_sb) {
+          assert(!i);
+          xd->mode_info_context += 2;
+          break;
+        }
+#endif
+      end:
+        xd->mode_info_context += dx[i] + dy[i] * cm->mode_info_stride;
+      }
+    }
+
+    // this is to account for the border in mode_info_context
+    xd->mode_info_context -= mb_col;
+    xd->mode_info_context += cm->mode_info_stride * 2;
+  }
+
+  // Work out probability tree for coding segments without prediction
+  // and the cost.
+  calc_segtree_probs(xd, no_pred_segcounts, no_pred_tree);
+  no_pred_cost = cost_segmap(xd, no_pred_segcounts, no_pred_tree);
+
+  // Key frames cannot use temporal prediction
+  if (cm->frame_type != KEY_FRAME) {
+    // Work out probability tree for coding those segments not
+    // predicted using the temporal method and the cost.
+    calc_segtree_probs(xd, t_unpred_seg_counts, t_pred_tree);
+    t_pred_cost = cost_segmap(xd, t_unpred_seg_counts, t_pred_tree);
+
+    // Add in the cost of the signalling for each prediction context
+    for (i = 0; i < PREDICTION_PROBS; i++) {
+      tot_count = temporal_predictor_count[i][0] +
+                  temporal_predictor_count[i][1];
+
+      // Work out the context probabilities for the segment
+      // prediction flag
+      if (tot_count) {
+        t_nopred_prob[i] = (temporal_predictor_count[i][0] * 255) /
+                           tot_count;
+
+        // Clamp to minimum allowed value
+        if (t_nopred_prob[i] < 1)
+          t_nopred_prob[i] = 1;
+      } else
+        t_nopred_prob[i] = 1;
+
+      // Add in the predictor signaling cost
+      t_pred_cost += (temporal_predictor_count[i][0] *
+                      vp9_cost_zero(t_nopred_prob[i])) +
+                     (temporal_predictor_count[i][1] *
+                      vp9_cost_one(t_nopred_prob[i]));
+    }
+  }
+
+  // Now choose which coding method to use.
+  if (t_pred_cost < no_pred_cost) {
+    cm->temporal_update = 1;
+    vpx_memcpy(xd->mb_segment_tree_probs,
+               t_pred_tree, sizeof(t_pred_tree));
+    vpx_memcpy(&cm->segment_pred_probs,
+               t_nopred_prob, sizeof(t_nopred_prob));
+  } else {
+    cm->temporal_update = 0;
+    vpx_memcpy(xd->mb_segment_tree_probs,
+               no_pred_tree, sizeof(no_pred_tree));
+  }
+}
--- /dev/null
+++ b/vp9/encoder/segmentation.h
@@ -1,0 +1,46 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "string.h"
+#include "vp9/common/blockd.h"
+#include "onyx_int.h"
+
+#ifndef __INC_SEGMENTATION_H__
+#define __INC_SEGMENTATION_H__ 1
+
+extern void vp9_update_gf_useage_maps(VP9_COMP *cpi, VP9_COMMON *cm,
+                                      MACROBLOCK *x);
+
+extern void vp9_enable_segmentation(VP9_PTR ptr);
+extern void vp9_disable_segmentation(VP9_PTR ptr);
+
+// Valid values for a segment are 0 to 3
+// Segmentation map is arranged as [Rows][Columns]
+extern void vp9_set_segmentation_map(VP9_PTR ptr,
+                                     unsigned char *segmentation_map);
+
+// The values given for each segment can be either deltas (from the default
+// value chosen for the frame) or absolute values.
+//
+// Valid range for abs values is (0-127 for MB_LVL_ALT_Q), (0-63 for
+// SEGMENT_ALT_LF)
+// Valid range for delta values are (+/-127 for MB_LVL_ALT_Q), (+/-63 for
+// SEGMENT_ALT_LF)
+//
+// Pass abs_delta = SEGMENT_DELTADATA to interpret the values as deltas,
+// or abs_delta = SEGMENT_ABSDATA to use the absolute values given.
+//
+extern void vp9_set_segment_data(VP9_PTR ptr, signed char *feature_data,
+                                 unsigned char abs_delta);
+
+extern void vp9_choose_segmap_coding_method(VP9_COMP *cpi);
+
+#endif /* __INC_SEGMENTATION_H__ */
--- /dev/null
+++ b/vp9/encoder/ssim.c
@@ -1,0 +1,147 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "onyx_int.h"
+
+void vp9_ssim_parms_16x16_c(unsigned char *s, int sp, unsigned char *r,
+                            int rp, unsigned long *sum_s, unsigned long *sum_r,
+                            unsigned long *sum_sq_s, unsigned long *sum_sq_r,
+                            unsigned long *sum_sxr) {
+  int i, j;
+  for (i = 0; i < 16; i++, s += sp, r += rp) {
+    for (j = 0; j < 16; j++) {
+      *sum_s += s[j];
+      *sum_r += r[j];
+      *sum_sq_s += s[j] * s[j];
+      *sum_sq_r += r[j] * r[j];
+      *sum_sxr += s[j] * r[j];
+    }
+  }
+}
+void vp9_ssim_parms_8x8_c(unsigned char *s, int sp, unsigned char *r, int rp,
+                          unsigned long *sum_s, unsigned long *sum_r,
+                          unsigned long *sum_sq_s, unsigned long *sum_sq_r,
+                          unsigned long *sum_sxr) {
+  int i, j;
+  for (i = 0; i < 8; i++, s += sp, r += rp) {
+    for (j = 0; j < 8; j++) {
+      *sum_s += s[j];
+      *sum_r += r[j];
+      *sum_sq_s += s[j] * s[j];
+      *sum_sq_r += r[j] * r[j];
+      *sum_sxr += s[j] * r[j];
+    }
+  }
+}
+
+static const int64_t cc1 =  26634; // 64^2*(.01*255)^2
+static const int64_t cc2 = 239708; // 64^2*(.03*255)^2
+
+static double similarity(unsigned long sum_s, unsigned long sum_r,
+                         unsigned long sum_sq_s, unsigned long sum_sq_r,
+                         unsigned long sum_sxr, int count) {
+  int64_t ssim_n, ssim_d;
+  int64_t c1, c2;
+
+  // scale the constants by number of pixels
+  c1 = (cc1 * count * count) >> 12;
+  c2 = (cc2 * count * count) >> 12;
+
+  ssim_n = (2 * sum_s * sum_r + c1) * ((int64_t) 2 * count * sum_sxr -
+                                       (int64_t) 2 * sum_s * sum_r + c2);
+
+  ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) *
+           ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s +
+            (int64_t)count * sum_sq_r - (int64_t) sum_r * sum_r + c2);
+
+  return ssim_n * 1.0 / ssim_d;
+}
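similarity() is the standard SSIM comparison evaluated directly from raw pixel sums; the >> 12 pre-scales the constants by count^2 (4096 = 64^2, matching the 64^2 factor baked into cc1/cc2) so that means and variances never have to be divided out. In the usual notation:

  \mathrm{SSIM}(x,y) =
    \frac{(2\mu_x\mu_y + C_1)(2\sigma_{xy} + C_2)}
         {(\mu_x^2 + \mu_y^2 + C_1)(\sigma_x^2 + \sigma_y^2 + C_2)},
  \qquad C_1 = (0.01 \cdot 255)^2, \quad C_2 = (0.03 \cdot 255)^2

ssim_n and ssim_d in the code are that numerator and denominator multiplied through by count^2.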
+
+static double ssim_16x16(unsigned char *s, int sp, unsigned char *r, int rp) {
+  unsigned long sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
+  vp9_ssim_parms_16x16(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
+                       &sum_sxr);
+  return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 256);
+}
+static double ssim_8x8(unsigned char *s, int sp, unsigned char *r, int rp) {
+  unsigned long sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
+  vp9_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
+                     &sum_sxr);
+  return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64);
+}
+
+// We use an 8x8 moving window whose starting location steps along the
+// 4x4 pixel grid. This arrangement lets the windows overlap block
+// boundaries, penalizing blocking artifacts.
+double vp9_ssim2(unsigned char *img1, unsigned char *img2, int stride_img1,
+                 int stride_img2, int width, int height) {
+  int i, j;
+  int samples = 0;
+  double ssim_total = 0;
+
+  // sample points start at each 4x4 location
+  for (i = 0; i < height - 8;
+       i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
+    for (j = 0; j < width - 8; j += 4) {
+      double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2);
+      ssim_total += v;
+      samples++;
+    }
+  }
+  ssim_total /= samples;
+  return ssim_total;
+}
+double vp9_calc_ssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
+                     int lumamask, double *weight) {
+  double a, b, c;
+  double ssimv;
+
+  a = vp9_ssim2(source->y_buffer, dest->y_buffer,
+                source->y_stride, dest->y_stride, source->y_width,
+                source->y_height);
+
+  b = vp9_ssim2(source->u_buffer, dest->u_buffer,
+                source->uv_stride, dest->uv_stride, source->uv_width,
+                source->uv_height);
+
+  c = vp9_ssim2(source->v_buffer, dest->v_buffer,
+                source->uv_stride, dest->uv_stride, source->uv_width,
+                source->uv_height);
+
+  ssimv = a * .8 + .1 * (b + c);
+
+  *weight = 1;
+
+  return ssimv;
+}
+
+double vp9_calc_ssimg(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
+                      double *ssim_y, double *ssim_u, double *ssim_v) {
+  double ssim_all = 0;
+  double a, b, c;
+
+  a = vp9_ssim2(source->y_buffer, dest->y_buffer,
+                source->y_stride, dest->y_stride, source->y_width,
+                source->y_height);
+
+  b = vp9_ssim2(source->u_buffer, dest->u_buffer,
+                source->uv_stride, dest->uv_stride, source->uv_width,
+                source->uv_height);
+
+  c = vp9_ssim2(source->v_buffer, dest->v_buffer,
+                source->uv_stride, dest->uv_stride, source->uv_width,
+                source->uv_height);
+  *ssim_y = a;
+  *ssim_u = b;
+  *ssim_v = c;
+  ssim_all = (a * 4 + b + c) / 6;
+
+  return ssim_all;
+}
--- /dev/null
+++ b/vp9/encoder/temporal_filter.c
@@ -1,0 +1,516 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp9/common/onyxc_int.h"
+#include "onyx_int.h"
+#include "vp9/common/systemdependent.h"
+#include "quantize.h"
+#include "vp9/common/alloccommon.h"
+#include "mcomp.h"
+#include "firstpass.h"
+#include "psnr.h"
+#include "vpx_scale/vpxscale.h"
+#include "vp9/common/extend.h"
+#include "ratectrl.h"
+#include "vp9/common/quant_common.h"
+#include "segmentation.h"
+#include "vpx_scale/yv12extend.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp9/common/swapyv12buffer.h"
+#include "vpx_ports/vpx_timer.h"
+
+#include <math.h>
+#include <limits.h>
+
+#define ALT_REF_MC_ENABLED 1     // enable/disable MC in AltRef filtering
+#define ALT_REF_SUBPEL_ENABLED 1 // enable/disable subpel in MC AltRef filtering
+
+#if VP9_TEMPORAL_ALT_REF
+
+
+static void temporal_filter_predictors_mb_c
+(
+  MACROBLOCKD *xd,
+  unsigned char *y_mb_ptr,
+  unsigned char *u_mb_ptr,
+  unsigned char *v_mb_ptr,
+  int stride,
+  int mv_row,
+  int mv_col,
+  unsigned char *pred
+) {
+  int offset;
+  unsigned char *yptr, *uptr, *vptr;
+  int omv_row, omv_col;
+
+  // Y
+  yptr = y_mb_ptr + (mv_row >> 3) * stride + (mv_col >> 3);
+
+  if ((mv_row | mv_col) & 7) {
+    xd->subpixel_predict16x16(yptr, stride,
+                             (mv_col & 7) << 1, (mv_row & 7) << 1, &pred[0], 16);
+  } else {
+    vp9_copy_mem16x16(yptr, stride, &pred[0], 16);
+  }
+
+  // U & V
+  omv_row = mv_row;
+  omv_col = mv_col;
+  mv_row >>= 1;
+  mv_col >>= 1;
+  stride = (stride + 1) >> 1;
+  offset = (mv_row >> 3) * stride + (mv_col >> 3);
+  uptr = u_mb_ptr + offset;
+  vptr = v_mb_ptr + offset;
+
+  if ((omv_row | omv_col) & 15) {
+    xd->subpixel_predict8x8(uptr, stride,
+                           (omv_col & 15), (omv_row & 15), &pred[256], 8);
+    xd->subpixel_predict8x8(vptr, stride,
+                           (omv_col & 15), (omv_row & 15), &pred[320], 8);
+  } else {
+    vp9_copy_mem8x8(uptr, stride, &pred[256], 8);
+    vp9_copy_mem8x8(vptr, stride, &pred[320], 8);
+  }
+}
+void vp9_temporal_filter_apply_c
+(
+  unsigned char *frame1,
+  unsigned int stride,
+  unsigned char *frame2,
+  unsigned int block_size,
+  int strength,
+  int filter_weight,
+  unsigned int *accumulator,
+  unsigned short *count
+) {
+  unsigned int i, j, k;
+  int modifier;
+  int byte = 0;
+
+  for (i = 0, k = 0; i < block_size; i++) {
+    for (j = 0; j < block_size; j++, k++) {
+
+      int src_byte = frame1[byte];
+      int pixel_value = *frame2++;
+
+      modifier   = src_byte - pixel_value;
+      // This is an integer approximation of:
+      // float coeff = (3.0 * modifier * modifier) / pow(2, strength);
+      // modifier = (int)roundf(coeff > 16 ? 0 : 16 - coeff);
+      modifier  *= modifier;
+      modifier  *= 3;
+      modifier  += 1 << (strength - 1);
+      modifier >>= strength;
+
+      if (modifier > 16)
+        modifier = 16;
+
+      modifier = 16 - modifier;
+      modifier *= filter_weight;
+
+      count[k] += modifier;
+      accumulator[k] += modifier * pixel_value;
+
+      byte++;
+    }
+
+    byte += stride - block_size;
+  }
+}
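The modifier computation maps a per-pixel difference d to a blending weight of roughly 16 - 3*d^2 / 2^strength, rounded and clamped to [0, 16], so identical pixels get the full weight and large differences contribute nothing. A worked example with hypothetical inputs:

/* Hypothetical: strength = 6, src_byte = 104, pixel_value = 100
 *   d          = 104 - 100 = 4
 *   3 * d^2    = 48
 *   + (1 << 5) = 80    rounding term, 1 << (strength - 1)
 *   >> 6       = 1     i.e. 80 / 64, truncated
 *   clamp      = min(1, 16) = 1
 *   weight     = 16 - 1 = 15, then scaled by filter_weight */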
+
+#if ALT_REF_MC_ENABLED
+
+static int temporal_filter_find_matching_mb_c
+(
+  VP9_COMP *cpi,
+  YV12_BUFFER_CONFIG *arf_frame,
+  YV12_BUFFER_CONFIG *frame_ptr,
+  int mb_offset,
+  int error_thresh
+) {
+  MACROBLOCK *x = &cpi->mb;
+  int step_param;
+  int further_steps;
+  int sadpb = x->sadperbit16;
+  int bestsme = INT_MAX;
+
+  BLOCK *b = &x->block[0];
+  BLOCKD *d = &x->e_mbd.block[0];
+  int_mv best_ref_mv1;
+  int_mv best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
+
+  // Save input state
+  unsigned char **base_src = b->base_src;
+  int src = b->src;
+  int src_stride = b->src_stride;
+  unsigned char **base_pre = d->base_pre;
+  int pre = d->pre;
+  int pre_stride = d->pre_stride;
+
+  best_ref_mv1.as_int = 0;
+  best_ref_mv1_full.as_mv.col = best_ref_mv1.as_mv.col >> 3;
+  best_ref_mv1_full.as_mv.row = best_ref_mv1.as_mv.row >> 3;
+
+  // Setup frame pointers
+  b->base_src = &arf_frame->y_buffer;
+  b->src_stride = arf_frame->y_stride;
+  b->src = mb_offset;
+
+  d->base_pre = &frame_ptr->y_buffer;
+  d->pre_stride = frame_ptr->y_stride;
+  d->pre = mb_offset;
+
+  // Further step/diamond searches as necessary
+  if (cpi->Speed < 8) {
+    step_param = cpi->sf.first_step +
+                 ((cpi->Speed > 5) ? 1 : 0);
+    further_steps =
+      (cpi->sf.max_step_search_steps - 1) - step_param;
+  } else {
+    step_param = cpi->sf.first_step + 2;
+    further_steps = 0;
+  }
+
+  /*cpi->sf.search_method == HEX*/
+  // TODO Check that the 16x16 vf & sdf are selected here
+  // Ignore mv costing by sending NULL pointer instead of cost arrays
+  bestsme = vp9_hex_search(x, b, d, &best_ref_mv1_full, &d->bmi.as_mv.first,
+                           step_param, sadpb, &cpi->fn_ptr[BLOCK_16X16],
+                           NULLMVCOST, NULLMVCOST,
+                           &best_ref_mv1);
+
+#if ALT_REF_SUBPEL_ENABLED
+  // Try sub-pixel MC?
+  // if (bestsme > error_thresh && bestsme < INT_MAX)
+  {
+    int distortion;
+    unsigned int sse;
+    // Ignore mv costing by sending NULL pointer instead of cost array
+    bestsme = cpi->find_fractional_mv_step(x, b, d, &d->bmi.as_mv.first,
+                                           &best_ref_mv1,
+                                           x->errorperbit,
+                                           &cpi->fn_ptr[BLOCK_16X16],
+                                           NULLMVCOST,
+                                           &distortion, &sse);
+  }
+#endif
+
+  // Restore input state
+  b->base_src = base_src;
+  b->src = src;
+  b->src_stride = src_stride;
+  d->base_pre = base_pre;
+  d->pre = pre;
+  d->pre_stride = pre_stride;
+
+  return bestsme;
+}
+#endif
+
+static void temporal_filter_iterate_c
+(
+  VP9_COMP *cpi,
+  int frame_count,
+  int alt_ref_index,
+  int strength
+) {
+  int byte;
+  int frame;
+  int mb_col, mb_row;
+  unsigned int filter_weight;
+  int mb_cols = cpi->common.mb_cols;
+  int mb_rows = cpi->common.mb_rows;
+  int mb_y_offset = 0;
+  int mb_uv_offset = 0;
+  DECLARE_ALIGNED_ARRAY(16, unsigned int, accumulator, 16 * 16 + 8 * 8 + 8 * 8);
+  DECLARE_ALIGNED_ARRAY(16, unsigned short, count, 16 * 16 + 8 * 8 + 8 * 8);
+  MACROBLOCKD *mbd = &cpi->mb.e_mbd;
+  YV12_BUFFER_CONFIG *f = cpi->frames[alt_ref_index];
+  unsigned char *dst1, *dst2;
+  DECLARE_ALIGNED_ARRAY(16, unsigned char,  predictor, 16 * 16 + 8 * 8 + 8 * 8);
+
+  // Save input state
+  unsigned char *y_buffer = mbd->pre.y_buffer;
+  unsigned char *u_buffer = mbd->pre.u_buffer;
+  unsigned char *v_buffer = mbd->pre.v_buffer;
+
+  for (mb_row = 0; mb_row < mb_rows; mb_row++) {
+#if ALT_REF_MC_ENABLED
+    // Source frames are extended to 16 pixels.  This is different than
+    //  L/A/G reference frames that have a border of 32 (VP8BORDERINPIXELS)
+    // A 6/8 tap filter is used for motion search.  This requires 2 pixels
+    //  before and 3 pixels after.  So the largest Y mv on a border would
+    //  then be 16 - INTERP_EXTEND. The UV blocks are half the size of the Y and
+    //  therefore only extended by 8.  The largest mv that a UV block
+    //  can support is 8 - INTERP_EXTEND.  A UV mv is half of a Y mv, i.e.
+    //  (16 - INTERP_EXTEND) >> 1, which is greater than 8 - INTERP_EXTEND.
+    // To keep the mv in play for both Y and UV planes the max that it
+    //  can be on a border is therefore 16 - (2*INTERP_EXTEND+1).
+    cpi->mb.mv_row_min = -((mb_row * 16) + (17 - 2 * INTERP_EXTEND));
+    cpi->mb.mv_row_max = ((cpi->common.mb_rows - 1 - mb_row) * 16)
+                         + (17 - 2 * INTERP_EXTEND);
+#endif
+
+    for (mb_col = 0; mb_col < mb_cols; mb_col++) {
+      int i, j, k;
+      int stride;
+
+      vpx_memset(accumulator, 0, 384 * sizeof(unsigned int));
+      vpx_memset(count, 0, 384 * sizeof(unsigned short));
+
+#if ALT_REF_MC_ENABLED
+      cpi->mb.mv_col_min = -((mb_col * 16) + (17 - 2 * INTERP_EXTEND));
+      cpi->mb.mv_col_max = ((cpi->common.mb_cols - 1 - mb_col) * 16)
+                           + (17 - 2 * INTERP_EXTEND);
+#endif
+
+      for (frame = 0; frame < frame_count; frame++) {
+        if (cpi->frames[frame] == NULL)
+          continue;
+
+        mbd->block[0].bmi.as_mv.first.as_mv.row = 0;
+        mbd->block[0].bmi.as_mv.first.as_mv.col = 0;
+
+        if (frame == alt_ref_index) {
+          filter_weight = 2;
+        } else {
+          int err = 0;
+#if ALT_REF_MC_ENABLED
+#define THRESH_LOW   10000
+#define THRESH_HIGH  20000
+
+          // Find best match in this frame by MC
+          err = temporal_filter_find_matching_mb_c
+                (cpi,
+                 cpi->frames[alt_ref_index],
+                 cpi->frames[frame],
+                 mb_y_offset,
+                 THRESH_LOW);
+#endif
+          // Assign a higher weight to the matching MB if its error
+          // score is lower. Without MC, the default behavior is to
+          // weight all MBs equally.
+          filter_weight = err < THRESH_LOW
+                          ? 2 : err < THRESH_HIGH ? 1 : 0;
+        }
+
+        if (filter_weight != 0) {
+          // Construct the predictors
+          temporal_filter_predictors_mb_c
+          (mbd,
+           cpi->frames[frame]->y_buffer + mb_y_offset,
+           cpi->frames[frame]->u_buffer + mb_uv_offset,
+           cpi->frames[frame]->v_buffer + mb_uv_offset,
+           cpi->frames[frame]->y_stride,
+           mbd->block[0].bmi.as_mv.first.as_mv.row,
+           mbd->block[0].bmi.as_mv.first.as_mv.col,
+           predictor);
+
+          // Apply the filter (YUV)
+          TEMPORAL_INVOKE(&cpi->rtcd.temporal, apply)
+          (f->y_buffer + mb_y_offset,
+           f->y_stride,
+           predictor,
+           16,
+           strength,
+           filter_weight,
+           accumulator,
+           count);
+
+          TEMPORAL_INVOKE(&cpi->rtcd.temporal, apply)
+          (f->u_buffer + mb_uv_offset,
+           f->uv_stride,
+           predictor + 256,
+           8,
+           strength,
+           filter_weight,
+           accumulator + 256,
+           count + 256);
+
+          TEMPORAL_INVOKE(&cpi->rtcd.temporal, apply)
+          (f->v_buffer + mb_uv_offset,
+           f->uv_stride,
+           predictor + 320,
+           8,
+           strength,
+           filter_weight,
+           accumulator + 320,
+           count + 320);
+        }
+      }
+
+      // Normalize filter output to produce AltRef frame
+      dst1 = cpi->alt_ref_buffer.y_buffer;
+      stride = cpi->alt_ref_buffer.y_stride;
+      byte = mb_y_offset;
+      for (i = 0, k = 0; i < 16; i++) {
+        for (j = 0; j < 16; j++, k++) {
+          unsigned int pval = accumulator[k] + (count[k] >> 1);
+          pval *= cpi->fixed_divide[count[k]];
+          pval >>= 19;
+
+          dst1[byte] = (unsigned char)pval;
+
+          // move to next pixel
+          byte++;
+        }
+
+        byte += stride - 16;
+      }
+
+      dst1 = cpi->alt_ref_buffer.u_buffer;
+      dst2 = cpi->alt_ref_buffer.v_buffer;
+      stride = cpi->alt_ref_buffer.uv_stride;
+      byte = mb_uv_offset;
+      for (i = 0, k = 256; i < 8; i++) {
+        for (j = 0; j < 8; j++, k++) {
+          int m = k + 64;
+
+          // U
+          unsigned int pval = accumulator[k] + (count[k] >> 1);
+          pval *= cpi->fixed_divide[count[k]];
+          pval >>= 19;
+          dst1[byte] = (unsigned char)pval;
+
+          // V
+          pval = accumulator[m] + (count[m] >> 1);
+          pval *= cpi->fixed_divide[count[m]];
+          pval >>= 19;
+          dst2[byte] = (unsigned char)pval;
+
+          // move to next pixel
+          byte++;
+        }
+
+        byte += stride - 8;
+      }
+
+      mb_y_offset += 16;
+      mb_uv_offset += 8;
+    }
+
+    mb_y_offset += 16 * (f->y_stride - mb_cols);
+    mb_uv_offset += 8 * (f->uv_stride - mb_cols);
+  }
+
+  // Restore input state
+  mbd->pre.y_buffer = y_buffer;
+  mbd->pre.u_buffer = u_buffer;
+  mbd->pre.v_buffer = v_buffer;
+}
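The normalization step divides each accumulator by its count without an integer division. Assuming cpi->fixed_divide[n] holds (1 << 19) / n (an assumption about a table initialized elsewhere in the encoder), (acc + count/2) * fixed_divide[count] >> 19 is a rounded acc / count:

/* Assuming fixed_divide[n] == (1 << 19) / n:
 *   count = 6, accumulator = 300
 *   fixed_divide[6]   = 524288 / 6 = 87381
 *   (300 + 3) * 87381 = 26476443
 *   26476443 >> 19    = 50          exact: 300 / 6 = 50
 * The + (count >> 1) term gives round-to-nearest behavior. */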
+
+void vp9_temporal_filter_prepare_c
+(
+  VP9_COMP *cpi,
+  int distance
+) {
+  int frame = 0;
+
+  int num_frames_backward = 0;
+  int num_frames_forward = 0;
+  int frames_to_blur_backward = 0;
+  int frames_to_blur_forward = 0;
+  int frames_to_blur = 0;
+  int start_frame = 0;
+
+  int strength = cpi->oxcf.arnr_strength;
+
+  int blur_type = cpi->oxcf.arnr_type;
+
+  int max_frames = cpi->active_arnr_frames;
+
+  num_frames_backward = distance;
+  num_frames_forward = vp9_lookahead_depth(cpi->lookahead)
+                       - (num_frames_backward + 1);
+
+  switch (blur_type) {
+    case 1:
+      /////////////////////////////////////////
+      // Backward Blur
+
+      frames_to_blur_backward = num_frames_backward;
+
+      if (frames_to_blur_backward >= max_frames)
+        frames_to_blur_backward = max_frames - 1;
+
+      frames_to_blur = frames_to_blur_backward + 1;
+      break;
+
+    case 2:
+      /////////////////////////////////////////
+      // Forward Blur
+
+      frames_to_blur_forward = num_frames_forward;
+
+      if (frames_to_blur_forward >= max_frames)
+        frames_to_blur_forward = max_frames - 1;
+
+      frames_to_blur = frames_to_blur_forward + 1;
+      break;
+
+    case 3:
+    default:
+      /////////////////////////////////////////
+      // Center Blur
+      frames_to_blur_forward = num_frames_forward;
+      frames_to_blur_backward = num_frames_backward;
+
+      if (frames_to_blur_forward > frames_to_blur_backward)
+        frames_to_blur_forward = frames_to_blur_backward;
+
+      if (frames_to_blur_backward > frames_to_blur_forward)
+        frames_to_blur_backward = frames_to_blur_forward;
+
+      // When max_frames is even we have 1 more frame backward than forward
+      if (frames_to_blur_forward > (max_frames - 1) / 2)
+        frames_to_blur_forward = ((max_frames - 1) / 2);
+
+      if (frames_to_blur_backward > (max_frames / 2))
+        frames_to_blur_backward = (max_frames / 2);
+
+      frames_to_blur = frames_to_blur_backward + frames_to_blur_forward + 1;
+      break;
+  }
+
+  start_frame = distance + frames_to_blur_forward;
+
+#ifdef DEBUGFWG
+  // DEBUG FWG
+  printf("max:%d FBCK:%d FFWD:%d ftb:%d ftbbck:%d ftbfwd:%d sei:%d lasei:%d start:%d"
+, max_frames
+, num_frames_backward
+, num_frames_forward
+, frames_to_blur
+, frames_to_blur_backward
+, frames_to_blur_forward
+, cpi->source_encode_index
+, cpi->last_alt_ref_sei
+, start_frame);
+#endif
+
+  // Set up frame pointers; NULL indicates a frame not included in the filter
+  vpx_memset(cpi->frames, 0, max_frames * sizeof(YV12_BUFFER_CONFIG *));
+  for (frame = 0; frame < frames_to_blur; frame++) {
+    int which_buffer =  start_frame - frame;
+    struct lookahead_entry *buf = vp9_lookahead_peek(cpi->lookahead,
+                                                     which_buffer);
+    cpi->frames[frames_to_blur - 1 - frame] = &buf->img;
+  }
+
+  temporal_filter_iterate_c(
+    cpi,
+    frames_to_blur,
+    frames_to_blur_backward,
+    strength);
+}
+#endif
--- /dev/null
+++ b/vp9/encoder/temporal_filter.h
@@ -1,0 +1,47 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_TEMPORAL_FILTER_H
+#define __INC_TEMPORAL_FILTER_H
+
+#define prototype_apply(sym)\
+  void (sym) \
+  ( \
+    unsigned char *frame1, \
+    unsigned int stride, \
+    unsigned char *frame2, \
+    unsigned int block_size, \
+    int strength, \
+    int filter_weight, \
+    unsigned int *accumulator, \
+    unsigned short *count \
+  )
+
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/temporal_filter_x86.h"
+#endif
+
+#ifndef vp9_temporal_filter_apply
+#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
+#endif
+extern prototype_apply(vp9_temporal_filter_apply);
+
+typedef struct {
+  prototype_apply(*apply);
+} vp9_temporal_rtcd_vtable_t;
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define TEMPORAL_INVOKE(ctx,fn) (ctx)->fn
+#else
+#define TEMPORAL_INVOKE(ctx,fn) vp9_temporal_filter_##fn
+#endif
+
+#endif // __INC_TEMPORAL_FILTER_H
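TEMPORAL_INVOKE follows the dispatch pattern used throughout this tree: with CONFIG_RUNTIME_CPU_DETECT the call goes through a vtable filled in at init time, otherwise it compiles to a direct call. Both expansions of the invocation used in temporal_filter.c:

/* With CONFIG_RUNTIME_CPU_DETECT:
 *   TEMPORAL_INVOKE(&cpi->rtcd.temporal, apply)(args...)
 *     -> (&cpi->rtcd.temporal)->apply(args...)   function-pointer call
 * Without it:
 *     -> vp9_temporal_filter_apply(args...)      direct call, which the
 *        #define above resolves to vp9_temporal_filter_apply_c (or an
 *        arch-specific override pulled in via temporal_filter_x86.h). */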
--- /dev/null
+++ b/vp9/encoder/tokenize.c
@@ -1,0 +1,868 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include "onyx_int.h"
+#include "tokenize.h"
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp9/common/pred_common.h"
+#include "vp9/common/seg_common.h"
+#include "vp9/common/entropy.h"
+
+/* Global event counters used for accumulating statistics across several
+   compressions, then generating context.c = initial stats. */
+
+#ifdef ENTROPY_STATS
+INT64 context_counters[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+INT64 hybrid_context_counters[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+
+INT64 context_counters_8x8[BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+INT64 hybrid_context_counters_8x8[BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+
+INT64 context_counters_16x16[BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+INT64 hybrid_context_counters_16x16[BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+
+extern unsigned int tree_update_hist[BLOCK_TYPES][COEF_BANDS]
+                    [PREV_COEF_CONTEXTS][ENTROPY_NODES][2];
+extern unsigned int hybrid_tree_update_hist[BLOCK_TYPES][COEF_BANDS]
+                    [PREV_COEF_CONTEXTS][ENTROPY_NODES][2];
+extern unsigned int tree_update_hist_8x8[BLOCK_TYPES_8X8][COEF_BANDS]
+                    [PREV_COEF_CONTEXTS][ENTROPY_NODES] [2];
+extern unsigned int hybrid_tree_update_hist_8x8[BLOCK_TYPES_8X8][COEF_BANDS]
+                    [PREV_COEF_CONTEXTS][ENTROPY_NODES] [2];
+extern unsigned int tree_update_hist_16x16[BLOCK_TYPES_16X16][COEF_BANDS]
+                    [PREV_COEF_CONTEXTS][ENTROPY_NODES] [2];
+extern unsigned int hybrid_tree_update_hist_16x16[BLOCK_TYPES_16X16][COEF_BANDS]
+                    [PREV_COEF_CONTEXTS][ENTROPY_NODES] [2];
+#endif  /* ENTROPY_STATS */
+
+void vp9_stuff_mb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run);
+void vp9_fix_contexts(MACROBLOCKD *xd);
+
+static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE * 2];
+const TOKENVALUE *vp9_dct_value_tokens_ptr;
+static int dct_value_cost[DCT_MAX_VALUE * 2];
+const int *vp9_dct_value_cost_ptr;
+
+static void fill_value_tokens(void) {
+
+  TOKENVALUE *const t = dct_value_tokens + DCT_MAX_VALUE;
+  vp9_extra_bit_struct *const e = vp9_extra_bits;
+
+  int i = -DCT_MAX_VALUE;
+  int sign = 1;
+
+  do {
+    if (!i)
+      sign = 0;
+
+    {
+      const int a = sign ? -i : i;
+      int eb = sign;
+
+      if (a > 4) {
+        int j = 4;
+
+        while (++j < 11  &&  e[j].base_val <= a) {}
+
+        t[i].Token = --j;
+        eb |= (a - e[j].base_val) << 1;
+      } else
+        t[i].Token = a;
+
+      t[i].Extra = eb;
+    }
+
+    // Initialize the cost for extra bits for all possible coefficient values.
+    {
+      int cost = 0;
+      vp9_extra_bit_struct *p = vp9_extra_bits + t[i].Token;
+
+      if (p->base_val) {
+        const int extra = t[i].Extra;
+        const int Length = p->Len;
+
+        if (Length)
+          cost += treed_cost(p->tree, p->prob, extra >> 1, Length);
+
+        cost += vp9_cost_bit(vp9_prob_half, extra & 1); /* sign */
+        dct_value_cost[i + DCT_MAX_VALUE] = cost;
+      }
+
+    }
+
+  } while (++i < DCT_MAX_VALUE);
+
+  vp9_dct_value_tokens_ptr = dct_value_tokens + DCT_MAX_VALUE;
+  vp9_dct_value_cost_ptr   = dct_value_cost + DCT_MAX_VALUE;
+}
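+
+/* Worked example (assuming the usual token category base values
+ * 5, 7, 11, 19, 35, 67): a coefficient of -10 has magnitude 10, which falls
+ * in category 2 (base_val 7), so Token = DCT_VAL_CATEGORY2 and
+ * Extra = ((10 - 7) << 1) | 1 = 7, the low bit carrying the sign. */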
+
+static void tokenize_b(VP9_COMP *cpi,
+                       MACROBLOCKD *xd,
+                       const BLOCKD * const b,
+                       TOKENEXTRA **tp,
+                       PLANE_TYPE type,
+                       ENTROPY_CONTEXT *a,
+                       ENTROPY_CONTEXT *l,
+                       TX_SIZE tx_size,
+                       int dry_run) {
+  int pt; /* near block/prev token context index */
+  int c = (type == PLANE_TYPE_Y_NO_DC) ? 1 : 0;
+  const int eob = b->eob;     /* one beyond last nonzero coeff */
+  TOKENEXTRA *t = *tp;        /* store tokens starting here */
+  const short *qcoeff_ptr = b->qcoeff;
+  int seg_eob;
+  int segment_id = xd->mode_info_context->mbmi.segment_id;
+  const int *bands, *scan;
+  unsigned int (*counts)[COEF_BANDS][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
+  vp9_prob (*probs)[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
+  const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
+                          get_tx_type(xd, b) : DCT_DCT;
+
+  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+  switch (tx_size) {
+    default:
+    case TX_4X4:
+      seg_eob = 16;
+      bands = vp9_coef_bands;
+      scan = vp9_default_zig_zag1d;
+      if (tx_type != DCT_DCT) {
+        counts = cpi->hybrid_coef_counts;
+        probs = cpi->common.fc.hybrid_coef_probs;
+        if (tx_type == ADST_DCT) {
+          scan = vp9_row_scan;
+        } else if (tx_type == DCT_ADST) {
+          scan = vp9_col_scan;
+        }
+      } else {
+        counts = cpi->coef_counts;
+        probs = cpi->common.fc.coef_probs;
+      }
+      break;
+    case TX_8X8:
+      if (type == PLANE_TYPE_Y2) {
+        seg_eob = 4;
+        bands = vp9_coef_bands;
+        scan = vp9_default_zig_zag1d;
+      } else {
+        seg_eob = 64;
+        bands = vp9_coef_bands_8x8;
+        scan = vp9_default_zig_zag1d_8x8;
+      }
+      if (tx_type != DCT_DCT) {
+        counts = cpi->hybrid_coef_counts_8x8;
+        probs = cpi->common.fc.hybrid_coef_probs_8x8;
+      } else {
+        counts = cpi->coef_counts_8x8;
+        probs = cpi->common.fc.coef_probs_8x8;
+      }
+      break;
+    case TX_16X16:
+      seg_eob = 256;
+      bands = vp9_coef_bands_16x16;
+      scan = vp9_default_zig_zag1d_16x16;
+      if (tx_type != DCT_DCT) {
+        counts = cpi->hybrid_coef_counts_16x16;
+        probs = cpi->common.fc.hybrid_coef_probs_16x16;
+      } else {
+        counts = cpi->coef_counts_16x16;
+        probs = cpi->common.fc.coef_probs_16x16;
+      }
+      break;
+  }
+
+  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB))
+    seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+
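+  /* SEG_LVL_EOB lets a segment cap how many coefficients may be coded;
+     the tokenization loop below never scans past that limit. */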
+  do {
+    const int band = bands[c];
+    int token;
+
+    if (c < eob) {
+      const int rc = scan[c];
+      const int v = qcoeff_ptr[rc];
+
+      assert(-DCT_MAX_VALUE <= v  &&  v < DCT_MAX_VALUE);
+
+      t->Extra = vp9_dct_value_tokens_ptr[v].Extra;
+      token    = vp9_dct_value_tokens_ptr[v].Token;
+    } else {
+      token = DCT_EOB_TOKEN;
+    }
+
+    t->Token = token;
+    t->context_tree = probs[type][band][pt];
+    t->skip_eob_node = (pt == 0) && ((band > 0 && type != PLANE_TYPE_Y_NO_DC) ||
+                                     (band > 1 && type == PLANE_TYPE_Y_NO_DC));
+    assert(vp9_coef_encodings[t->Token].Len - t->skip_eob_node > 0);
+    if (!dry_run) {
+      ++counts[type][band][pt][token];
+    }
+    pt = vp9_prev_token_class[token];
+    ++t;
+  } while (c < eob && ++c < seg_eob);
+
+  *tp = t;
+  *a = *l = (c != !type); /* 0 <-> all coeff data is zero */
+}
+
+int vp9_mby_is_skippable_4x4(MACROBLOCKD *xd, int has_y2_block) {
+  int skip = 1;
+  int i = 0;
+
+  if (has_y2_block) {
+    for (i = 0; i < 16; i++)
+      skip &= (xd->block[i].eob < 2);
+    skip &= (!xd->block[24].eob);
+  } else {
+    for (i = 0; i < 16; i++)
+      skip &= (!xd->block[i].eob);
+  }
+  return skip;
+}
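+
+/* With a Y2 block the per-block DC coefficient is carried by block 24, so a
+ * luma block whose eob is below 2 contributes no transmitted coefficients
+ * and the macroblock may still be skippable. */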
+
+int vp9_mbuv_is_skippable_4x4(MACROBLOCKD *xd) {
+  int skip = 1;
+  int i;
+
+  for (i = 16; i < 24; i++)
+    skip &= (!xd->block[i].eob);
+  return skip;
+}
+
+static int mb_is_skippable_4x4(MACROBLOCKD *xd, int has_y2_block) {
+  return (vp9_mby_is_skippable_4x4(xd, has_y2_block) &
+          vp9_mbuv_is_skippable_4x4(xd));
+}
+
+int vp9_mby_is_skippable_8x8(MACROBLOCKD *xd, int has_y2_block) {
+  int skip = 1;
+  int i = 0;
+
+  if (has_y2_block) {
+    for (i = 0; i < 16; i += 4)
+      skip &= (xd->block[i].eob < 2);
+    skip &= (!xd->block[24].eob);
+  } else {
+    for (i = 0; i < 16; i += 4)
+      skip &= (!xd->block[i].eob);
+  }
+  return skip;
+}
+
+int vp9_mbuv_is_skippable_8x8(MACROBLOCKD *xd) {
+  return (!xd->block[16].eob) & (!xd->block[20].eob);
+}
+
+static int mb_is_skippable_8x8(MACROBLOCKD *xd, int has_y2_block) {
+  return (vp9_mby_is_skippable_8x8(xd, has_y2_block) &
+          vp9_mbuv_is_skippable_8x8(xd));
+}
+
+static int mb_is_skippable_8x8_4x4uv(MACROBLOCKD *xd, int has_y2_block) {
+  return (vp9_mby_is_skippable_8x8(xd, has_y2_block) &
+          vp9_mbuv_is_skippable_4x4(xd));
+}
+
+int vp9_mby_is_skippable_16x16(MACROBLOCKD *xd) {
+  int skip = 1;
+  skip &= !xd->block[0].eob;
+  return skip;
+}
+
+static int mb_is_skippable_16x16(MACROBLOCKD *xd) {
+  return (vp9_mby_is_skippable_16x16(xd) & vp9_mbuv_is_skippable_8x8(xd));
+}
+
+void vp9_tokenize_mb(VP9_COMP *cpi,
+                     MACROBLOCKD *xd,
+                     TOKENEXTRA **t,
+                     int dry_run) {
+  PLANE_TYPE plane_type;
+  int has_y2_block;
+  int b;
+  int tx_size = xd->mode_info_context->mbmi.txfm_size;
+  int mb_skip_context = vp9_get_pred_context(&cpi->common, xd, PRED_MBSKIP);
+  TOKENEXTRA *t_backup = *t;
+  ENTROPY_CONTEXT * A = (ENTROPY_CONTEXT *) xd->above_context;
+  ENTROPY_CONTEXT * L = (ENTROPY_CONTEXT *) xd->left_context;
+
+  // If the MB is going to be skipped because of a segment level flag,
+  // exclude this from the skip count stats used to calculate the
+  // transmitted skip probability.
+  int skip_inc;
+  int segment_id = xd->mode_info_context->mbmi.segment_id;
+
+  if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
+      (vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) != 0)) {
+    skip_inc = 1;
+  } else
+    skip_inc = 0;
+
+  has_y2_block = (tx_size != TX_16X16
+                  && xd->mode_info_context->mbmi.mode != B_PRED
+                  && xd->mode_info_context->mbmi.mode != I8X8_PRED
+                  && xd->mode_info_context->mbmi.mode != SPLITMV);
+
+  switch (tx_size) {
+    case TX_16X16:
+      xd->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable_16x16(xd);
+      break;
+    case TX_8X8:
+      if (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
+          xd->mode_info_context->mbmi.mode == SPLITMV)
+        xd->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable_8x8_4x4uv(xd, 0);
+      else
+        xd->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable_8x8(xd, has_y2_block);
+      break;
+
+    default:
+      xd->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable_4x4(xd, has_y2_block);
+      break;
+  }
+
+  if (xd->mode_info_context->mbmi.mb_skip_coeff) {
+    if (!dry_run)
+      cpi->skip_true_count[mb_skip_context] += skip_inc;
+    if (!cpi->common.mb_no_coeff_skip) {
+      vp9_stuff_mb(cpi, xd, t, dry_run);
+    } else {
+      vp9_fix_contexts(xd);
+    }
+    if (dry_run)
+      *t = t_backup;
+    return;
+  }
+
+  if (!dry_run)
+    cpi->skip_false_count[mb_skip_context] += skip_inc;
+
+  if (has_y2_block) {
+    if (tx_size == TX_8X8) {
+      tokenize_b(cpi, xd, xd->block + 24, t, PLANE_TYPE_Y2,
+                 A + vp9_block2above_8x8[24], L + vp9_block2left_8x8[24],
+                 TX_8X8, dry_run);
+    } else {
+      tokenize_b(cpi, xd, xd->block + 24, t, PLANE_TYPE_Y2,
+                 A + vp9_block2above[24], L + vp9_block2left[24],
+                 TX_4X4, dry_run);
+    }
+
+    plane_type = PLANE_TYPE_Y_NO_DC;
+  } else
+    plane_type = PLANE_TYPE_Y_WITH_DC;
+
+  if (tx_size == TX_16X16) {
+    tokenize_b(cpi, xd, xd->block, t, PLANE_TYPE_Y_WITH_DC,
+               A, L, TX_16X16, dry_run);
+    A[1] = A[2] = A[3] = A[0];
+    L[1] = L[2] = L[3] = L[0];
+
+    for (b = 16; b < 24; b += 4) {
+      tokenize_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
+                 A + vp9_block2above_8x8[b], L + vp9_block2left_8x8[b],
+                 TX_8X8, dry_run);
+      A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
+      L[vp9_block2left_8x8[b] + 1]  = L[vp9_block2left_8x8[b]];
+    }
+    vpx_memset(&A[8], 0, sizeof(A[8]));
+    vpx_memset(&L[8], 0, sizeof(L[8]));
+  } else if (tx_size == TX_8X8) {
+    for (b = 0; b < 16; b += 4) {
+      tokenize_b(cpi, xd, xd->block + b, t, plane_type,
+                 A + vp9_block2above_8x8[b], L + vp9_block2left_8x8[b],
+                 TX_8X8, dry_run);
+      A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
+      L[vp9_block2left_8x8[b] + 1]  = L[vp9_block2left_8x8[b]];
+    }
+    if (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
+        xd->mode_info_context->mbmi.mode == SPLITMV) {
+      for (b = 16; b < 24; b++) {
+        tokenize_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
+                   A + vp9_block2above[b], L + vp9_block2left[b],
+                   TX_4X4, dry_run);
+      }
+    } else {
+      for (b = 16; b < 24; b += 4) {
+        tokenize_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
+                   A + vp9_block2above_8x8[b], L + vp9_block2left_8x8[b],
+                   TX_8X8, dry_run);
+        A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
+        L[vp9_block2left_8x8[b] + 1]  = L[vp9_block2left_8x8[b]];
+      }
+    }
+  } else {
+    for (b = 0; b < 16; b++) {
+      tokenize_b(cpi, xd, xd->block + b, t, plane_type,
+                 A + vp9_block2above[b], L + vp9_block2left[b],
+                 TX_4X4, dry_run);
+    }
+
+    for (b = 16; b < 24; b++) {
+      tokenize_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
+                 A + vp9_block2above[b], L + vp9_block2left[b],
+                 TX_4X4, dry_run);
+    }
+  }
+  if (dry_run)
+    *t = t_backup;
+}
+
+
+#ifdef ENTROPY_STATS
+void init_context_counters(void) {
+  FILE *f = fopen("context.bin", "rb");
+  if (!f) {
+    vpx_memset(context_counters, 0, sizeof(context_counters));
+    vpx_memset(context_counters_8x8, 0, sizeof(context_counters_8x8));
+    vpx_memset(context_counters_16x16, 0, sizeof(context_counters_16x16));
+  } else {
+    fread(context_counters, sizeof(context_counters), 1, f);
+    fread(context_counters_8x8, sizeof(context_counters_8x8), 1, f);
+    fread(context_counters_16x16, sizeof(context_counters_16x16), 1, f);
+    fclose(f);
+  }
+
+  f = fopen("treeupdate.bin", "rb");
+  if (!f) {
+    vpx_memset(tree_update_hist, 0, sizeof(tree_update_hist));
+    vpx_memset(tree_update_hist_8x8, 0, sizeof(tree_update_hist_8x8));
+    vpx_memset(tree_update_hist_16x16, 0, sizeof(tree_update_hist_16x16));
+  } else {
+    fread(tree_update_hist, sizeof(tree_update_hist), 1, f);
+    fread(tree_update_hist_8x8, sizeof(tree_update_hist_8x8), 1, f);
+    fread(tree_update_hist_16x16, sizeof(tree_update_hist_16x16), 1, f);
+    fclose(f);
+  }
+}
+
+void print_context_counters(void) {
+  int type, band, pt, t;
+  FILE *f = fopen("context.c", "w");
+
+  fprintf(f, "#include \"entropy.h\"\n");
+  fprintf(f, "\n/* *** GENERATED FILE: DO NOT EDIT *** */\n\n");
+  fprintf(f, "static const unsigned int\n"
+          "vp9_default_coef_counts[BLOCK_TYPES]\n"
+          "                      [COEF_BANDS]\n"
+          "                      [PREV_COEF_CONTEXTS]\n"
+          "                      [MAX_ENTROPY_TOKENS]={\n");
+
+#define Comma(X) ((X) ? "," : "")
+  type = 0;
+  do {
+    fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);
+    band = 0;
+    do {
+      fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);
+      pt = 0;
+      do {
+        fprintf(f, "%s\n      {", Comma(pt));
+
+        t = 0;
+        do {
+          const INT64 x = context_counters [type] [band] [pt] [t];
+          const int y = (int) x;
+          assert(x == (INT64) y);  /* no overflow handling yet */
+          fprintf(f, "%s %d", Comma(t), y);
+        } while (++t < MAX_ENTROPY_TOKENS);
+        fprintf(f, "}");
+      } while (++pt < PREV_COEF_CONTEXTS);
+      fprintf(f, "\n    }");
+    } while (++band < COEF_BANDS);
+    fprintf(f, "\n  }");
+  } while (++type < BLOCK_TYPES);
+  fprintf(f, "\n};\n");
+
+  fprintf(f, "static const unsigned int\nvp9_default_coef_counts_8x8"
+          "[BLOCK_TYPES_8X8] [COEF_BANDS]"
+          "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {");
+  type = 0;
+  do {
+    fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);
+    band = 0;
+    do {
+      fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);
+      pt = 0;
+      do {
+        fprintf(f, "%s\n      {", Comma(pt));
+        t = 0;
+        do {
+          const INT64 x = context_counters_8x8 [type] [band] [pt] [t];
+          const int y = (int) x;
+
+          assert(x == (INT64) y);  /* no overflow handling yet */
+          fprintf(f, "%s %d", Comma(t), y);
+
+        } while (++t < MAX_ENTROPY_TOKENS);
+
+        fprintf(f, "}");
+      } while (++pt < PREV_COEF_CONTEXTS);
+
+      fprintf(f, "\n    }");
+
+    } while (++band < COEF_BANDS);
+
+    fprintf(f, "\n  }");
+  } while (++type < BLOCK_TYPES_8X8);
+  fprintf(f, "\n};\n");
+
+  fprintf(f, "static const unsigned int\nvp9_default_coef_counts_16x16"
+          "[BLOCK_TYPES_16X16] [COEF_BANDS]"
+          "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {");
+  type = 0;
+  do {
+    fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);
+    band = 0;
+    do {
+      fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);
+      pt = 0;
+      do {
+        fprintf(f, "%s\n      {", Comma(pt));
+        t = 0;
+        do {
+          const INT64 x = context_counters_16x16 [type] [band] [pt] [t];
+          const int y = (int) x;
+
+          assert(x == (INT64) y);  /* no overflow handling yet */
+          fprintf(f, "%s %d", Comma(t), y);
+
+        } while (++t < MAX_ENTROPY_TOKENS);
+
+        fprintf(f, "}");
+      } while (++pt < PREV_COEF_CONTEXTS);
+
+      fprintf(f, "\n    }");
+
+    } while (++band < COEF_BANDS);
+
+    fprintf(f, "\n  }");
+  } while (++type < BLOCK_TYPES_16X16);
+  fprintf(f, "\n};\n");
+
+  fprintf(f, "static const vp9_prob\n"
+          "vp9_default_coef_probs[BLOCK_TYPES] [COEF_BANDS] \n"
+          "[PREV_COEF_CONTEXTS] [ENTROPY_NODES] = {");
+  type = 0;
+  do {
+    fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);
+    band = 0;
+    do {
+      fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);
+      pt = 0;
+      do {
+        unsigned int branch_ct [ENTROPY_NODES] [2];
+        unsigned int coef_counts[MAX_ENTROPY_TOKENS];
+        vp9_prob coef_probs[ENTROPY_NODES];
+        for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+          coef_counts[t] = context_counters [type] [band] [pt] [t];
+        vp9_tree_probs_from_distribution(
+          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+          coef_probs, branch_ct, coef_counts, 256, 1);
+        fprintf(f, "%s\n      {", Comma(pt));
+
+        t = 0;
+        do {
+          fprintf(f, "%s %d", Comma(t), coef_probs[t]);
+
+        } while (++t < ENTROPY_NODES);
+
+        fprintf(f, "}");
+      } while (++pt < PREV_COEF_CONTEXTS);
+      fprintf(f, "\n    }");
+    } while (++band < COEF_BANDS);
+    fprintf(f, "\n  }");
+  } while (++type < BLOCK_TYPES);
+  fprintf(f, "\n};\n");
+
+  fprintf(f, "static const vp9_prob\n"
+          "vp9_default_coef_probs_8x8[BLOCK_TYPES_8X8] [COEF_BANDS]\n"
+          "[PREV_COEF_CONTEXTS] [ENTROPY_NODES] = {");
+  type = 0;
+  do {
+    fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);
+    band = 0;
+    do {
+      fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);
+      pt = 0;
+      do {
+        unsigned int branch_ct [ENTROPY_NODES] [2];
+        unsigned int coef_counts[MAX_ENTROPY_TOKENS];
+        vp9_prob coef_probs[ENTROPY_NODES];
+        for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+          coef_counts[t] = context_counters_8x8[type] [band] [pt] [t];
+        vp9_tree_probs_from_distribution(
+          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+          coef_probs, branch_ct, coef_counts, 256, 1);
+        fprintf(f, "%s\n      {", Comma(pt));
+
+        t = 0;
+        do {
+          fprintf(f, "%s %d", Comma(t), coef_probs[t]);
+        } while (++t < ENTROPY_NODES);
+        fprintf(f, "}");
+      } while (++pt < PREV_COEF_CONTEXTS);
+      fprintf(f, "\n    }");
+    } while (++band < COEF_BANDS);
+    fprintf(f, "\n  }");
+  } while (++type < BLOCK_TYPES_8X8);
+  fprintf(f, "\n};\n");
+
+  fprintf(f, "static const vp9_prob\n"
+          "vp9_default_coef_probs_16x16[BLOCK_TYPES_16X16] [COEF_BANDS]\n"
+          "[PREV_COEF_CONTEXTS] [ENTROPY_NODES] = {");
+  type = 0;
+  do {
+    fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);
+    band = 0;
+    do {
+      fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);
+      pt = 0;
+      do {
+        unsigned int branch_ct [ENTROPY_NODES] [2];
+        unsigned int coef_counts[MAX_ENTROPY_TOKENS];
+        vp9_prob coef_probs[ENTROPY_NODES];
+        for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+          coef_counts[t] = context_counters_16x16[type] [band] [pt] [t];
+        vp9_tree_probs_from_distribution(
+          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+          coef_probs, branch_ct, coef_counts, 256, 1);
+        fprintf(f, "%s\n      {", Comma(pt));
+
+        t = 0;
+        do {
+          fprintf(f, "%s %d", Comma(t), coef_probs[t]);
+        } while (++t < ENTROPY_NODES);
+        fprintf(f, "}");
+      } while (++pt < PREV_COEF_CONTEXTS);
+      fprintf(f, "\n    }");
+    } while (++band < COEF_BANDS);
+    fprintf(f, "\n  }");
+  } while (++type < BLOCK_TYPES_16X16);
+  fprintf(f, "\n};\n");
+
+  fclose(f);
+
+  f = fopen("context.bin", "wb");
+  fwrite(context_counters, sizeof(context_counters), 1, f);
+  fwrite(context_counters_8x8, sizeof(context_counters_8x8), 1, f);
+  fwrite(context_counters_16x16, sizeof(context_counters_16x16), 1, f);
+  fclose(f);
+}
+#endif
+
+void vp9_tokenize_initialize(void) {
+  fill_value_tokens();
+}
+
+static __inline void stuff_b(VP9_COMP *cpi,
+                             MACROBLOCKD *xd,
+                             const BLOCKD * const b,
+                             TOKENEXTRA **tp,
+                             PLANE_TYPE type,
+                             ENTROPY_CONTEXT *a,
+                             ENTROPY_CONTEXT *l,
+                             TX_SIZE tx_size,
+                             int dry_run) {
+  const int *bands;
+  unsigned int (*counts)[COEF_BANDS][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
+  vp9_prob (*probs)[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
+  int pt, band;
+  TOKENEXTRA *t = *tp;
+  const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
+                          get_tx_type(xd, b) : DCT_DCT;
+  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+
+  switch (tx_size) {
+    default:
+    case TX_4X4:
+      bands = vp9_coef_bands;
+      if (tx_type != DCT_DCT) {
+        counts = cpi->hybrid_coef_counts;
+        probs = cpi->common.fc.hybrid_coef_probs;
+      } else {
+        counts = cpi->coef_counts;
+        probs = cpi->common.fc.coef_probs;
+      }
+      break;
+    case TX_8X8:
+      bands = vp9_coef_bands_8x8;
+      if (tx_type != DCT_DCT) {
+        counts = cpi->hybrid_coef_counts_8x8;
+        probs = cpi->common.fc.hybrid_coef_probs_8x8;
+      } else {
+        counts = cpi->coef_counts_8x8;
+        probs = cpi->common.fc.coef_probs_8x8;
+      }
+      break;
+    case TX_16X16:
+      bands = vp9_coef_bands_16x16;
+      if (tx_type != DCT_DCT) {
+        counts = cpi->hybrid_coef_counts_16x16;
+        probs = cpi->common.fc.hybrid_coef_probs_16x16;
+      } else {
+        counts = cpi->coef_counts_16x16;
+        probs = cpi->common.fc.coef_probs_16x16;
+      }
+      break;
+  }
+  band = bands[(type == PLANE_TYPE_Y_NO_DC) ? 1 : 0];
+  t->Token = DCT_EOB_TOKEN;
+  t->context_tree = probs[type][band][pt];
+  t->skip_eob_node = 0;
+  ++t;
+  *tp = t;
+  *a = *l = 0;
+  if (!dry_run) {
+    ++counts[type][band][pt][DCT_EOB_TOKEN];
+  }
+}
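+
+/* Stuffing emits exactly one DCT_EOB_TOKEN per block so the entropy contexts
+ * (and, outside dry runs, the token counts) stay consistent even though the
+ * macroblock codes no residual. */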
+
+static void stuff_mb_8x8(VP9_COMP *cpi, MACROBLOCKD *xd,
+                         TOKENEXTRA **t, int dry_run) {
+  ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)xd->above_context;
+  ENTROPY_CONTEXT *L = (ENTROPY_CONTEXT *)xd->left_context;
+  PLANE_TYPE plane_type;
+  int b;
+  const int has_y2_block = (xd->mode_info_context->mbmi.mode != B_PRED &&
+                            xd->mode_info_context->mbmi.mode != I8X8_PRED &&
+                            xd->mode_info_context->mbmi.mode != SPLITMV);
+
+  if (has_y2_block) {
+    stuff_b(cpi, xd, xd->block + 24, t, PLANE_TYPE_Y2,
+            A + vp9_block2above_8x8[24], L + vp9_block2left_8x8[24],
+            TX_8X8, dry_run);
+    plane_type = PLANE_TYPE_Y_NO_DC;
+  } else {
+    plane_type = PLANE_TYPE_Y_WITH_DC;
+  }
+
+  for (b = 0; b < 16; b += 4) {
+    stuff_b(cpi, xd, xd->block + b, t, plane_type, A + vp9_block2above_8x8[b],
+            L + vp9_block2left_8x8[b], TX_8X8, dry_run);
+    A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
+    L[vp9_block2left_8x8[b] + 1]  = L[vp9_block2left_8x8[b]];
+  }
+
+  for (b = 16; b < 24; b += 4) {
+    stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
+            A + vp9_block2above_8x8[b], L + vp9_block2left_8x8[b],
+            TX_8X8, dry_run);
+    A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
+    L[vp9_block2left_8x8[b] + 1]  = L[vp9_block2left_8x8[b]];
+  }
+}
+
+static void stuff_mb_16x16(VP9_COMP *cpi, MACROBLOCKD *xd,
+                           TOKENEXTRA **t, int dry_run) {
+  ENTROPY_CONTEXT * A = (ENTROPY_CONTEXT *)xd->above_context;
+  ENTROPY_CONTEXT * L = (ENTROPY_CONTEXT *)xd->left_context;
+  int b;
+
+  stuff_b(cpi, xd, xd->block, t, PLANE_TYPE_Y_WITH_DC, A, L, TX_16X16, dry_run);
+  A[1] = A[2] = A[3] = A[0];
+  L[1] = L[2] = L[3] = L[0];
+  for (b = 16; b < 24; b += 4) {
+    stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
+            A + vp9_block2above_8x8[b], L + vp9_block2left_8x8[b],
+            TX_8X8, dry_run);
+    A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
+    L[vp9_block2left_8x8[b] + 1]  = L[vp9_block2left_8x8[b]];
+  }
+  vpx_memset(&A[8], 0, sizeof(A[8]));
+  vpx_memset(&L[8], 0, sizeof(L[8]));
+}
+
+static void stuff_mb_4x4(VP9_COMP *cpi, MACROBLOCKD *xd,
+                         TOKENEXTRA **t, int dry_run) {
+  ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)xd->above_context;
+  ENTROPY_CONTEXT *L = (ENTROPY_CONTEXT *)xd->left_context;
+  int b;
+  PLANE_TYPE plane_type;
+  const int has_y2_block = (xd->mode_info_context->mbmi.mode != B_PRED &&
+                            xd->mode_info_context->mbmi.mode != I8X8_PRED &&
+                            xd->mode_info_context->mbmi.mode != SPLITMV);
+
+  if (has_y2_block) {
+    stuff_b(cpi, xd, xd->block + 24, t, PLANE_TYPE_Y2, A + vp9_block2above[24],
+            L + vp9_block2left[24], TX_4X4, dry_run);
+    plane_type = PLANE_TYPE_Y_NO_DC;
+  } else {
+    plane_type = PLANE_TYPE_Y_WITH_DC;
+  }
+
+  for (b = 0; b < 16; b++)
+    stuff_b(cpi, xd, xd->block + b, t, plane_type, A + vp9_block2above[b],
+            L + vp9_block2left[b], TX_4X4, dry_run);
+
+  for (b = 16; b < 24; b++)
+    stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV, A + vp9_block2above[b],
+            L + vp9_block2left[b], TX_4X4, dry_run);
+}
+
+static void stuff_mb_8x8_4x4uv(VP9_COMP *cpi, MACROBLOCKD *xd,
+                               TOKENEXTRA **t, int dry_run) {
+  ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)xd->above_context;
+  ENTROPY_CONTEXT *L = (ENTROPY_CONTEXT *)xd->left_context;
+  int b;
+
+  for (b = 0; b < 16; b += 4) {
+    stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_Y_WITH_DC,
+            A + vp9_block2above_8x8[b], L + vp9_block2left_8x8[b],
+            TX_8X8, dry_run);
+    A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
+    L[vp9_block2left_8x8[b] + 1]  = L[vp9_block2left_8x8[b]];
+  }
+
+  for (b = 16; b < 24; b++)
+    stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV, A + vp9_block2above[b],
+            L + vp9_block2left[b], TX_4X4, dry_run);
+}
+
+void vp9_stuff_mb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run) {
+  TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
+  TOKENEXTRA * const t_backup = *t;
+
+  if (tx_size == TX_16X16) {
+    stuff_mb_16x16(cpi, xd, t, dry_run);
+  } else if (tx_size == TX_8X8) {
+    if (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
+        xd->mode_info_context->mbmi.mode == SPLITMV) {
+      stuff_mb_8x8_4x4uv(cpi, xd, t, dry_run);
+    } else {
+      stuff_mb_8x8(cpi, xd, t, dry_run);
+    }
+  } else {
+    stuff_mb_4x4(cpi, xd, t, dry_run);
+  }
+
+  if (dry_run) {
+    *t = t_backup;
+  }
+}
+
+void vp9_fix_contexts(MACROBLOCKD *xd) {
+  /* Clear the coefficient entropy contexts. When the mode has no Y2 block
+     and the transform is not 16x16, the final (Y2) entry is left intact. */
+  if ((xd->mode_info_context->mbmi.mode != B_PRED
+      && xd->mode_info_context->mbmi.mode != I8X8_PRED
+      && xd->mode_info_context->mbmi.mode != SPLITMV)
+      || xd->mode_info_context->mbmi.txfm_size == TX_16X16
+      ) {
+    vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+    vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+  } else {
+    vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) - 1);
+    vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) - 1);
+  }
+}
--- /dev/null
+++ b/vp9/encoder/tokenize.h
@@ -1,0 +1,59 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef tokenize_h
+#define tokenize_h
+
+#include "vp9/common/entropy.h"
+#include "block.h"
+
+void vp9_tokenize_initialize(void);
+
+typedef struct {
+  short Token;
+  short Extra;
+} TOKENVALUE;
+
+typedef struct {
+  const vp9_prob *context_tree;
+  short           Extra;
+  unsigned char   Token;
+  unsigned char   skip_eob_node;
+} TOKENEXTRA;
+
+int rd_cost_mby(MACROBLOCKD *);
+
+extern int vp9_mby_is_skippable_4x4(MACROBLOCKD *xd, int has_y2_block);
+extern int vp9_mbuv_is_skippable_4x4(MACROBLOCKD *xd);
+extern int vp9_mby_is_skippable_8x8(MACROBLOCKD *xd, int has_y2_block);
+extern int vp9_mbuv_is_skippable_8x8(MACROBLOCKD *xd);
+extern int vp9_mby_is_skippable_16x16(MACROBLOCKD *xd);
+
+#ifdef ENTROPY_STATS
+void init_context_counters(void);
+void print_context_counters(void);
+
+extern INT64 context_counters[BLOCK_TYPES][COEF_BANDS]
+                             [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
+extern INT64 context_counters_8x8[BLOCK_TYPES_8X8][COEF_BANDS]
+                                 [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
+extern INT64 context_counters_16x16[BLOCK_TYPES_16X16][COEF_BANDS]
+                                   [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
+#endif
+
+extern const int *vp9_dct_value_cost_ptr;
+/* TODO: The Token field should be broken out into a separate char array to
+ *  improve cache locality, since it's needed for costing when the rest of the
+ *  fields are not.
+ */
+extern const TOKENVALUE *vp9_dct_value_tokens_ptr;
+
+#endif  /* tokenize_h */
--- /dev/null
+++ b/vp9/encoder/treewriter.c
@@ -1,0 +1,39 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "treewriter.h"
+
+static void cost(
+  int *const C,
+  vp9_tree T,
+  const vp9_prob *const P,
+  int i,
+  int c
+) {
+  const vp9_prob p = P [i >> 1];
+
+  do {
+    const vp9_tree_index j = T[i];
+    const int d = c + vp9_cost_bit(p, i & 1);
+
+    if (j <= 0)
+      C[-j] = d;
+    else
+      cost(C, T, P, j, d);
+  } while (++i & 1);
+}
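+
+/* cost() recursively walks every path from node i, accumulating per-bit
+ * costs; each leaf (j <= 0) stores the total for token -j, so a single call
+ * fills the whole cost array. Starting at index 2, as vp9_cost_tokens_skip()
+ * does, prices tokens as if the first tree bit were already decided,
+ * presumably matching the skip_eob_node case in tokenize.c. */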
+void vp9_cost_tokens(int *c, const vp9_prob *p, vp9_tree t) {
+  cost(c, t, p, 0, 0);
+}
+
+void vp9_cost_tokens_skip(int *c, const vp9_prob *p, vp9_tree t) {
+  cost(c, t, p, 2, 0);
+}
--- /dev/null
+++ b/vp9/encoder/treewriter.h
@@ -1,0 +1,108 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_TREEWRITER_H
+#define __INC_TREEWRITER_H
+
+/* Trees map alphabets into Huffman-like codes suitable for an arithmetic
+   bit coder.  Timothy S Murphy  11 October 2004 */
+
+#include "vp9/common/treecoder.h"
+
+#include "boolhuff.h"       /* for now */
+
+typedef BOOL_CODER vp9_writer;
+
+#define vp9_write encode_bool
+#define vp9_write_literal vp9_encode_value
+#define vp9_write_bit(W, V) vp9_write(W, V, vp9_prob_half)
+
+/* Approximate length of an encoded bool in 256ths of a bit at given prob */
+
+#define vp9_cost_zero(x) (vp9_prob_cost[x])
+#define vp9_cost_one(x) vp9_cost_zero(vp9_complement(x))
+
+#define vp9_cost_bit(x, b) vp9_cost_zero((b) ? vp9_complement(x) : (x))
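+
+/* For example, assuming vp9_prob_cost[] is the usual -256*log2(p/256) table,
+ * a coin-flip bit costs vp9_cost_zero(128) == 256, i.e. exactly one bit at
+ * this 1/256th-of-a-bit scale. */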
+
+/* VP8BC version is scaled by 2^20 rather than 2^8; see bool_coder.h */
+
+
+/* cost_branch() returns whole bits; cost_branch256() keeps the
+   1/256th-of-a-bit scale. */
+
+static __inline unsigned int cost_branch(const unsigned int ct[2],
+                                         vp9_prob p) {
+  /* Imitate existing calculation */
+  return ((ct[0] * vp9_cost_zero(p))
+          + (ct[1] * vp9_cost_one(p))) >> 8;
+}
+
+static __inline unsigned int cost_branch256(const unsigned int ct[2],
+                                            vp9_prob p) {
+  /* Imitate existing calculation */
+  return ((ct[0] * vp9_cost_zero(p))
+          + (ct[1] * vp9_cost_one(p)));
+}
+
+/* Small functions to write explicit values and tokens, as well as
+   estimate their lengths. */
+
+static __inline void treed_write(vp9_writer *const w,
+                                 vp9_tree t,
+                                 const vp9_prob *const p,
+                                 int v,
+                                 /* number of bits in v, assumed nonzero */
+                                 int n) {
+  vp9_tree_index i = 0;
+
+  do {
+    const int b = (v >> --n) & 1;
+    vp9_write(w, b, p[i >> 1]);
+    i = t[i + b];
+  } while (n);
+}
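+
+/* Traversal sketch: encoding a token with { value = 2, Len = 2 } writes bit 1
+ * with probability p[0], steps to i = t[1], then writes bit 0 with p[i >> 1].
+ * treed_cost() below walks the identical path, summing vp9_cost_bit(). */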
+
+static __inline void write_token(vp9_writer *const w,
+                                 vp9_tree t,
+                                 const vp9_prob *const p,
+                                 vp9_token *const x) {
+  treed_write(w, t, p, x->value, x->Len);
+}
+
+static __inline int treed_cost(vp9_tree t,
+                               const vp9_prob *const p,
+                               int v,
+                               /* number of bits in v, assumed nonzero */
+                               int n) {
+  int c = 0;
+  vp9_tree_index i = 0;
+
+  do {
+    const int b = (v >> --n) & 1;
+    c += vp9_cost_bit(p[i >> 1], b);
+    i = t[i + b];
+  } while (n);
+
+  return c;
+}
+
+static __inline int cost_token(vp9_tree t,
+                               const vp9_prob *const p,
+                               vp9_token *const x) {
+  return treed_cost(t, p, x->value, x->Len);
+}
+
+/* Fill array of costs for all possible token values. */
+
+void vp9_cost_tokens(int *Costs, const vp9_prob *, vp9_tree);
+
+void vp9_cost_tokens_skip(int *c, const vp9_prob *p, vp9_tree t);
+
+#endif
--- /dev/null
+++ b/vp9/encoder/variance.h
@@ -1,0 +1,84 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VARIANCE_H
+#define VARIANCE_H
+
+typedef unsigned int (*vp9_sad_fn_t)(const unsigned char *src_ptr,
+                                    int source_stride,
+                                    const unsigned char *ref_ptr,
+                                    int ref_stride,
+                                    unsigned int max_sad);
+
+typedef void (*vp9_copy32xn_fn_t)(const unsigned char *src_ptr,
+                                  int source_stride,
+                                  const unsigned char *ref_ptr,
+                                  int ref_stride,
+                                  int n);
+
+typedef void (*vp9_sad_multi_fn_t)(const unsigned char *src_ptr,
+                                   int source_stride,
+                                   const unsigned char *ref_ptr,
+                                   int  ref_stride,
+                                   unsigned int *sad_array);
+
+typedef void (*vp9_sad_multi1_fn_t)(const unsigned char *src_ptr,
+                                    int source_stride,
+                                    const unsigned char *ref_ptr,
+                                    int  ref_stride,
+                                    unsigned short *sad_array);
+
+typedef void (*vp9_sad_multi_d_fn_t)(const unsigned char *src_ptr,
+                                     int source_stride,
+                                     const unsigned char * const ref_ptr[],
+                                     int  ref_stride, unsigned int *sad_array);
+
+typedef unsigned int (*vp9_variance_fn_t)(const unsigned char *src_ptr,
+                                          int source_stride,
+                                          const unsigned char *ref_ptr,
+                                          int ref_stride,
+                                          unsigned int *sse);
+
+typedef unsigned int (*vp9_subpixvariance_fn_t)(const unsigned char *src_ptr,
+                                                int source_stride,
+                                                int xoffset,
+                                                int yoffset,
+                                                const unsigned char *ref_ptr,
+                                                int ref_stride,
+                                                unsigned int *sse);
+
+typedef void (*vp9_ssimpf_fn_t)(unsigned char *s, int sp, unsigned char *r,
+                                int rp, unsigned long *sum_s,
+                                unsigned long *sum_r, unsigned long *sum_sq_s,
+                                unsigned long *sum_sq_r,
+                                unsigned long *sum_sxr);
+
+typedef unsigned int (*vp9_getmbss_fn_t)(const short *);
+
+typedef unsigned int (*vp9_get16x16prederror_fn_t)(const unsigned char *src_ptr,
+                                                   int source_stride,
+                                                   const unsigned char *ref_ptr,
+                                                   int  ref_stride);
+
+typedef struct variance_vtable {
+  vp9_sad_fn_t            sdf;
+  vp9_variance_fn_t       vf;
+  vp9_subpixvariance_fn_t svf;
+  vp9_variance_fn_t       svf_halfpix_h;
+  vp9_variance_fn_t       svf_halfpix_v;
+  vp9_variance_fn_t       svf_halfpix_hv;
+  vp9_sad_multi_fn_t      sdx3f;
+  vp9_sad_multi1_fn_t     sdx8f;
+  vp9_sad_multi_d_fn_t    sdx4df;
+  vp9_copy32xn_fn_t       copymem;
+} vp9_variance_fn_ptr_t;
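+
+/* Illustrative wiring (a sketch; the encoder fills one table per block size
+ * at init, and the SAD function name below is an assumption):
+ *
+ *   vp9_variance_fn_ptr_t fn;
+ *   fn.sdf = vp9_sad16x16_c;       // best-match search metric
+ *   fn.vf  = vp9_variance16x16_c;  // true variance for RD decisions
+ *   unsigned int sse;
+ *   unsigned int v = fn.vf(src, src_stride, ref, ref_stride, &sse);
+ */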
+
+#endif
--- /dev/null
+++ b/vp9/encoder/variance_c.c
@@ -1,0 +1,540 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "variance.h"
+#include "vp9/common/filter.h"
+
+
+unsigned int vp9_get_mb_ss_c(const short *src_ptr) {
+  unsigned int i, sum = 0;
+
+  for (i = 0; i < 256; i++) {
+    sum += (src_ptr[i] * src_ptr[i]);
+  }
+
+  return sum;
+}
+
+
+static void variance(const unsigned char *src_ptr,
+                     int  source_stride,
+                     const unsigned char *ref_ptr,
+                     int  recon_stride,
+                     int  w,
+                     int  h,
+                     unsigned int *sse,
+                     int *sum) {
+  int i, j;
+  int diff;
+
+  *sum = 0;
+  *sse = 0;
+
+  for (i = 0; i < h; i++) {
+    for (j = 0; j < w; j++) {
+      diff = src_ptr[j] - ref_ptr[j];
+      *sum += diff;
+      *sse += diff * diff;
+    }
+
+    src_ptr += source_stride;
+    ref_ptr += recon_stride;
+  }
+}
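+
+/* The vp9_variance*_c() wrappers below apply the textbook identity
+ *   variance = sse - sum * sum / N
+ * where N is the pixel count of the block, so the shift is log2(N)
+ * (e.g. >> 8 for 16x16, N = 256). */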
+
+#if CONFIG_SUPERBLOCKS
+unsigned int vp9_variance32x32_c(const unsigned char *src_ptr,
+                                 int  source_stride,
+                                 const unsigned char *ref_ptr,
+                                 int  recon_stride,
+                                 unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32, &var, &avg);
+  *sse = var;
+  return (var - ((avg * avg) >> 10));
+}
+#endif
+
+unsigned int vp9_variance16x16_c(const unsigned char *src_ptr,
+                                 int  source_stride,
+                                 const unsigned char *ref_ptr,
+                                 int  recon_stride,
+                                 unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
+  *sse = var;
+  return (var - ((avg * avg) >> 8));
+}
+
+unsigned int vp9_variance8x16_c(const unsigned char *src_ptr,
+                                int  source_stride,
+                                const unsigned char *ref_ptr,
+                                int  recon_stride,
+                                unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg);
+  *sse = var;
+  return (var - ((avg * avg) >> 7));
+}
+
+unsigned int vp9_variance16x8_c(const unsigned char *src_ptr,
+                                int  source_stride,
+                                const unsigned char *ref_ptr,
+                                int  recon_stride,
+                                unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg);
+  *sse = var;
+  return (var - ((avg * avg) >> 7));
+}
+
+
+unsigned int vp9_variance8x8_c(const unsigned char *src_ptr,
+                               int  source_stride,
+                               const unsigned char *ref_ptr,
+                               int  recon_stride,
+                               unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg);
+  *sse = var;
+  return (var - ((avg * avg) >> 6));
+}
+
+unsigned int vp9_variance4x4_c(const unsigned char *src_ptr,
+                               int  source_stride,
+                               const unsigned char *ref_ptr,
+                               int  recon_stride,
+                               unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, &var, &avg);
+  *sse = var;
+  return (var - ((avg * avg) >> 4));
+}
+
+
+unsigned int vp9_mse16x16_c(const unsigned char *src_ptr,
+                            int  source_stride,
+                            const unsigned char *ref_ptr,
+                            int  recon_stride,
+                            unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
+  *sse = var;
+  return var;
+}
+
+
+/****************************************************************************
+ *
+ *  ROUTINE       : var_filter_block2d_bil_first_pass
+ *
+ *  INPUTS        : UINT8  *src_ptr          : Pointer to source block.
+ *                  UINT32 src_pixels_per_line : Stride of input block.
+ *                  UINT32 pixel_step        : Offset between filter input samples (see notes).
+ *                  UINT32 output_height     : Input block height.
+ *                  UINT32 output_width      : Input block width.
+ *                  INT16  *vp9_filter       : Array of 2 bi-linear filter taps.
+ *
+ *  OUTPUTS       : UINT16 *output_ptr       : Pointer to filtered block.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in
+ *                  either horizontal or vertical direction to produce the
+ *                  filtered output block. Used to implement first-pass
+ *                  of 2-D separable filter.
+ *
+ *  SPECIAL NOTES : Produces UINT16 output to retain precision for the next pass.
+ *                  Two filter taps should sum to VP9_FILTER_WEIGHT.
+ *                  pixel_step defines whether the filter is applied
+ *                  horizontally (pixel_step=1) or vertically (pixel_step=stride).
+ *                  It defines the offset required to move from one input
+ *                  to the next.
+ *
+ ****************************************************************************/
+static void var_filter_block2d_bil_first_pass(const unsigned char *src_ptr,
+                                              unsigned short *output_ptr,
+                                              unsigned int src_pixels_per_line,
+                                              int pixel_step,
+                                              unsigned int output_height,
+                                              unsigned int output_width,
+                                              const short *vp9_filter) {
+  unsigned int i, j;
+
+  for (i = 0; i < output_height; i++) {
+    for (j = 0; j < output_width; j++) {
+      // Apply bilinear filter
+      output_ptr[j] = (((int)src_ptr[0]          * vp9_filter[0]) +
+                       ((int)src_ptr[pixel_step] * vp9_filter[1]) +
+                       (VP9_FILTER_WEIGHT / 2)) >> VP9_FILTER_SHIFT;
+      src_ptr++;
+    }
+
+    // Next row...
+    src_ptr    += src_pixels_per_line - output_width;
+    output_ptr += output_width;
+  }
+}
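+
+/* Worked half-pel example (taking VP9_FILTER_WEIGHT == 128,
+ * VP9_FILTER_SHIFT == 7 and half-pel taps { 64, 64 } as assumptions drawn
+ * from vp9/common/filter.h): neighbours 100 and 104 filter to
+ * (100 * 64 + 104 * 64 + 64) >> 7 = 102. */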
+
+/****************************************************************************
+ *
+ *  ROUTINE       : var_filter_block2d_bil_second_pass
+ *
+ *  INPUTS        : UINT16 *src_ptr          : Pointer to source block.
+ *                  UINT32 src_pixels_per_line : Stride of input block.
+ *                  UINT32 pixel_step        : Offset between filter input samples (see notes).
+ *                  UINT32 output_height     : Input block height.
+ *                  UINT32 output_width      : Input block width.
+ *                  INT16  *vp9_filter       : Array of 2 bi-linear filter taps.
+ *
+ *  OUTPUTS       : UINT8  *output_ptr       : Pointer to filtered block.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in
+ *                  either horizontal or vertical direction to produce the
+ *                  filtered output block. Used to implement second-pass
+ *                  of 2-D separable filter.
+ *
+ *  SPECIAL NOTES : Requires 16-bit input as produced by var_filter_block2d_bil_first_pass.
+ *                  Two filter taps should sum to VP9_FILTER_WEIGHT.
+ *                  pixel_step defines whether the filter is applied
+ *                  horizontally (pixel_step=1) or vertically (pixel_step=stride).
+ *                  It defines the offset required to move from one input
+ *                  to the next.
+ *
+ ****************************************************************************/
+static void var_filter_block2d_bil_second_pass(const unsigned short *src_ptr,
+                                               unsigned char *output_ptr,
+                                               unsigned int src_pixels_per_line,
+                                               unsigned int pixel_step,
+                                               unsigned int output_height,
+                                               unsigned int output_width,
+                                               const short *vp9_filter) {
+  unsigned int  i, j;
+  int  Temp;
+
+  for (i = 0; i < output_height; i++) {
+    for (j = 0; j < output_width; j++) {
+      // Apply filter
+      Temp = ((int)src_ptr[0]          * vp9_filter[0]) +
+             ((int)src_ptr[pixel_step] * vp9_filter[1]) +
+             (VP9_FILTER_WEIGHT / 2);
+      output_ptr[j] = (unsigned char)(Temp >> VP9_FILTER_SHIFT);
+      src_ptr++;
+    }
+
+    // Next row...
+    src_ptr    += src_pixels_per_line - output_width;
+    output_ptr += output_width;
+  }
+}
+
+
+unsigned int vp9_sub_pixel_variance4x4_c(const unsigned char  *src_ptr,
+                                         int  src_pixels_per_line,
+                                         int  xoffset,
+                                         int  yoffset,
+                                         const unsigned char *dst_ptr,
+                                         int dst_pixels_per_line,
+                                         unsigned int *sse) {
+  unsigned char  temp2[20 * 16];
+  const short *HFilter, *VFilter;
+  unsigned short FData3[5 * 4]; // Temp data buffer used in filtering
+
+  HFilter = vp9_bilinear_filters[xoffset];
+  VFilter = vp9_bilinear_filters[yoffset];
+
+  // First filter 1-D horizontally
+  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 5, 4, HFilter);
+
+  // Now filter vertically
+  var_filter_block2d_bil_second_pass(FData3, temp2, 4,  4,  4,  4, VFilter);
+
+  return vp9_variance4x4_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse);
+}
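+
+/* The first pass produces one extra row (5 for a 4-tall block) because the
+ * vertical second pass reads src_ptr[pixel_step], one full row below each
+ * output pixel. */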
+
+
+unsigned int vp9_sub_pixel_variance8x8_c(const unsigned char  *src_ptr,
+                                         int  src_pixels_per_line,
+                                         int  xoffset,
+                                         int  yoffset,
+                                         const unsigned char *dst_ptr,
+                                         int dst_pixels_per_line,
+                                         unsigned int *sse) {
+  unsigned short FData3[9 * 8]; // Temp data buffer used in filtering
+  unsigned char  temp2[20 * 16];
+  const short *HFilter, *VFilter;
+
+  HFilter = vp9_bilinear_filters[xoffset];
+  VFilter = vp9_bilinear_filters[yoffset];
+
+  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter);
+  var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter);
+
+  return vp9_variance8x8_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp9_sub_pixel_variance16x16_c(const unsigned char  *src_ptr,
+                                           int  src_pixels_per_line,
+                                           int  xoffset,
+                                           int  yoffset,
+                                           const unsigned char *dst_ptr,
+                                           int dst_pixels_per_line,
+                                           unsigned int *sse) {
+  unsigned short FData3[17 * 16]; // Temp data buffer used in filtering
+  unsigned char  temp2[20 * 16];
+  const short *HFilter, *VFilter;
+
+  HFilter = vp9_bilinear_filters[xoffset];
+  VFilter = vp9_bilinear_filters[yoffset];
+
+  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter);
+  var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter);
+
+  return vp9_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
+#if CONFIG_SUPERBLOCKS
+unsigned int vp9_sub_pixel_variance32x32_c(const unsigned char  *src_ptr,
+                                           int  src_pixels_per_line,
+                                           int  xoffset,
+                                           int  yoffset,
+                                           const unsigned char *dst_ptr,
+                                           int dst_pixels_per_line,
+                                           unsigned int *sse) {
+  unsigned short FData3[33 * 32]; // Temp data buffer used in filtering
+  unsigned char  temp2[36 * 32];
+  const short *HFilter, *VFilter;
+
+  HFilter = vp9_bilinear_filters[xoffset];
+  VFilter = vp9_bilinear_filters[yoffset];
+
+  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 33, 32, HFilter);
+  var_filter_block2d_bil_second_pass(FData3, temp2, 32, 32, 32, 32, VFilter);
+
+  return vp9_variance32x32_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
+}
+#endif
+
+unsigned int vp9_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr,
+                                              int  source_stride,
+                                              const unsigned char *ref_ptr,
+                                              int  recon_stride,
+                                              unsigned int *sse) {
+  return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, 8, 0,
+                                       ref_ptr, recon_stride, sse);
+}
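+
+/* Offset 8, assuming the 16-step subpel grid used by vp9_bilinear_filters,
+ * is the half-pixel position, so these halfpix wrappers simply reuse the
+ * generic subpel path with fixed offsets. */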
+
+#if CONFIG_SUPERBLOCKS
+unsigned int vp9_variance_halfpixvar32x32_h_c(const unsigned char *src_ptr,
+                                              int  source_stride,
+                                              const unsigned char *ref_ptr,
+                                              int  recon_stride,
+                                              unsigned int *sse) {
+  return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 8, 0,
+                                       ref_ptr, recon_stride, sse);
+}
+#endif
+
+
+unsigned int vp9_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr,
+                                              int  source_stride,
+                                              const unsigned char *ref_ptr,
+                                              int  recon_stride,
+                                              unsigned int *sse) {
+  return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, 0, 8,
+                                       ref_ptr, recon_stride, sse);
+}
+
+#if CONFIG_SUPERBLOCKS
+unsigned int vp9_variance_halfpixvar32x32_v_c(const unsigned char *src_ptr,
+                                              int  source_stride,
+                                              const unsigned char *ref_ptr,
+                                              int  recon_stride,
+                                              unsigned int *sse) {
+  return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 0, 8,
+                                       ref_ptr, recon_stride, sse);
+}
+#endif
+
+unsigned int vp9_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr,
+                                               int  source_stride,
+                                               const unsigned char *ref_ptr,
+                                               int  recon_stride,
+                                               unsigned int *sse) {
+  return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, 8, 8,
+                                       ref_ptr, recon_stride, sse);
+}
+
+#if CONFIG_SUPERBLOCKS
+unsigned int vp9_variance_halfpixvar32x32_hv_c(const unsigned char *src_ptr,
+                                               int  source_stride,
+                                               const unsigned char *ref_ptr,
+                                               int  recon_stride,
+                                               unsigned int *sse) {
+  return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 8, 8,
+                                       ref_ptr, recon_stride, sse);
+}
+#endif
+
+unsigned int vp9_sub_pixel_mse16x16_c(const unsigned char  *src_ptr,
+                                      int  src_pixels_per_line,
+                                      int  xoffset,
+                                      int  yoffset,
+                                      const unsigned char *dst_ptr,
+                                      int dst_pixels_per_line,
+                                      unsigned int *sse) {
+  vp9_sub_pixel_variance16x16_c(src_ptr, src_pixels_per_line,
+                                xoffset, yoffset, dst_ptr,
+                                dst_pixels_per_line, sse);
+  return *sse;
+}
+
+#if CONFIG_SUPERBLOCKS
+unsigned int vp9_sub_pixel_mse32x32_c(const unsigned char  *src_ptr,
+                                      int  src_pixels_per_line,
+                                      int  xoffset,
+                                      int  yoffset,
+                                      const unsigned char *dst_ptr,
+                                      int dst_pixels_per_line,
+                                      unsigned int *sse) {
+  vp9_sub_pixel_variance32x32_c(src_ptr, src_pixels_per_line,
+                                xoffset, yoffset, dst_ptr,
+                                dst_pixels_per_line, sse);
+  return *sse;
+}
+#endif
+
+unsigned int vp9_sub_pixel_variance16x8_c(const unsigned char  *src_ptr,
+                                          int  src_pixels_per_line,
+                                          int  xoffset,
+                                          int  yoffset,
+                                          const unsigned char *dst_ptr,
+                                          int dst_pixels_per_line,
+                                          unsigned int *sse) {
+  unsigned short FData3[16 * 9];  // Temp data buffer used in filtering
+  unsigned char  temp2[20 * 16];
+  const short *HFilter, *VFilter;
+
+  HFilter = vp9_bilinear_filters[xoffset];
+  VFilter = vp9_bilinear_filters[yoffset];
+
+  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter);
+  var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter);
+
+  return vp9_variance16x8_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp9_sub_pixel_variance8x16_c(const unsigned char  *src_ptr,
+                                          int  src_pixels_per_line,
+                                          int  xoffset,
+                                          int  yoffset,
+                                          const unsigned char *dst_ptr,
+                                          int dst_pixels_per_line,
+                                          unsigned int *sse) {
+  unsigned short FData3[9 * 16];  // Temp data buffer used in filtering
+  unsigned char  temp2[20 * 16];
+  const short *HFilter, *VFilter;
+
+  HFilter = vp9_bilinear_filters[xoffset];
+  VFilter = vp9_bilinear_filters[yoffset];
+
+  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line,
+                                    1, 17, 8, HFilter);
+  var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 16, 8, VFilter);
+
+  return vp9_variance8x16_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
+}
+
+#if CONFIG_NEWBESTREFMV
+unsigned int vp9_variance2x16_c(const unsigned char *src_ptr,
+                                const int  source_stride,
+                                const unsigned char *ref_ptr,
+                                const int  recon_stride,
+                                unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 2, 16, &var, &avg);
+  *sse = var;
+  return (var - ((avg * avg) >> 5));
+}
+
+unsigned int vp9_variance16x2_c(const unsigned char *src_ptr,
+                                const int  source_stride,
+                                const unsigned char *ref_ptr,
+                                const int  recon_stride,
+                                unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 2, &var, &avg);
+  *sse = var;
+  return (var - ((avg * avg) >> 5));
+}
+
+unsigned int vp9_sub_pixel_variance16x2_c(const unsigned char  *src_ptr,
+                                          const int  src_pixels_per_line,
+                                          const int  xoffset,
+                                          const int  yoffset,
+                                          const unsigned char *dst_ptr,
+                                          const int dst_pixels_per_line,
+                                          unsigned int *sse) {
+  unsigned short FData3[16 * 3];  // Temp data buffer used in filtering
+  unsigned char  temp2[20 * 16];
+  const short *HFilter, *VFilter;
+
+  HFilter = vp9_bilinear_filters[xoffset];
+  VFilter = vp9_bilinear_filters[yoffset];
+
+  var_filter_block2d_bil_first_pass(src_ptr, FData3,
+                                    src_pixels_per_line, 1, 3, 16, HFilter);
+  var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 2, 16, VFilter);
+
+  return vp9_variance16x2_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp9_sub_pixel_variance2x16_c(const unsigned char  *src_ptr,
+                                          const int  src_pixels_per_line,
+                                          const int  xoffset,
+                                          const int  yoffset,
+                                          const unsigned char *dst_ptr,
+                                          const int dst_pixels_per_line,
+                                          unsigned int *sse) {
+  unsigned short FData3[2 * 17];  // Temp data buffer used in filtering
+  unsigned char  temp2[2 * 16];
+  const short *HFilter, *VFilter;
+
+  HFilter = vp9_bilinear_filters[xoffset];
+  VFilter = vp9_bilinear_filters[yoffset];
+
+  var_filter_block2d_bil_first_pass(src_ptr, FData3,
+                                    src_pixels_per_line, 1, 17, 2, HFilter);
+  var_filter_block2d_bil_second_pass(FData3, temp2, 2, 2, 16, 2, VFilter);
+
+  return vp9_variance2x16_c(temp2, 2, dst_ptr, dst_pixels_per_line, sse);
+}
+#endif
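
The C variance routines in this file all reduce to one integer identity:
accumulate the sum of differences and the sum of squared differences over
the block, then return sse - sum*sum/N, where N is the pixel count (hence
the >> 5 above for the 32-pixel 2x16 and 16x2 blocks). The variance()
helper is defined earlier in the file and not visible in this hunk; the
following is a minimal C sketch with the signature taken from the call
sites and the body inferred, not copied from the patched source:

    static void variance(const unsigned char *src_ptr, int source_stride,
                         const unsigned char *ref_ptr, int recon_stride,
                         int w, int h, unsigned int *sse, int *sum) {
      int i, j;
      *sum = 0;
      *sse = 0;
      for (i = 0; i < h; i++) {
        for (j = 0; j < w; j++) {
          /* accumulate per-pixel difference and squared difference */
          int diff = src_ptr[j] - ref_ptr[j];
          *sum += diff;
          *sse += diff * diff;
        }
        src_ptr += source_stride;
        ref_ptr += recon_stride;
      }
    }

With this, vp9_variance2x16_c returns var - ((avg * avg) >> 5) because its
block holds 2 * 16 = 32 = 1 << 5 pixels.
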
--- /dev/null
+++ b/vp9/encoder/x86/dct_mmx.asm
@@ -1,0 +1,241 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp9_short_fdct4x4_mmx(short *input, short *output, int pitch)
+global sym(vp9_short_fdct4x4_mmx)
+sym(vp9_short_fdct4x4_mmx):
+    push        rbp
+    mov         rbp,        rsp
+    SHADOW_ARGS_TO_STACK 3
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rsi,        arg(0)      ; input
+        mov         rdi,        arg(1)      ; output
+
+        movsxd      rax,        dword ptr arg(2) ;pitch
+
+        lea         rcx,        [rsi + rax*2]
+        ; read the input data
+        movq        mm0,        [rsi]
+        movq        mm1,        [rsi + rax]
+
+        movq        mm2,        [rcx]
+        movq        mm4,        [rcx + rax]
+
+        ; transpose for the first stage
+        movq        mm3,        mm0         ; 00 01 02 03
+        movq        mm5,        mm2         ; 20 21 22 23
+
+        punpcklwd   mm0,        mm1         ; 00 10 01 11
+        punpckhwd   mm3,        mm1         ; 02 12 03 13
+
+        punpcklwd   mm2,        mm4         ; 20 30 21 31
+        punpckhwd   mm5,        mm4         ; 22 32 23 33
+
+        movq        mm1,        mm0         ; 00 10 01 11
+        punpckldq   mm0,        mm2         ; 00 10 20 30
+
+        punpckhdq   mm1,        mm2         ; 01 11 21 31
+
+        movq        mm2,        mm3         ; 02 12 03 13
+        punpckldq   mm2,        mm5         ; 02 12 22 32
+
+        punpckhdq   mm3,        mm5         ; 03 13 23 33
+
+        ; mm0 0
+        ; mm1 1
+        ; mm2 2
+        ; mm3 3
+
+        ; first stage
+        movq        mm5,        mm0
+        movq        mm4,        mm1
+
+        paddw       mm0,        mm3         ; a1 = 0 + 3
+        paddw       mm1,        mm2         ; b1 = 1 + 2
+
+        psubw       mm4,        mm2         ; c1 = 1 - 2
+        psubw       mm5,        mm3         ; d1 = 0 - 3
+
+        psllw       mm5,        3
+        psllw       mm4,        3
+
+        psllw       mm0,        3
+        psllw       mm1,        3
+
+        ; output 0 and 2
+        movq        mm2,        mm0         ; a1
+
+        paddw       mm0,        mm1         ; op[0] = a1 + b1
+        psubw       mm2,        mm1         ; op[2] = a1 - b1
+
+        ; output 1 and 3
+        ; interleave c1, d1
+        movq        mm1,        mm5         ; d1
+        punpcklwd   mm1,        mm4         ; c1 d1
+        punpckhwd   mm5,        mm4         ; c1 d1
+
+        movq        mm3,        mm1
+        movq        mm4,        mm5
+
+        pmaddwd     mm1,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+        pmaddwd     mm4,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+
+        pmaddwd     mm3,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+        pmaddwd     mm5,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+
+        paddd       mm1,        MMWORD PTR[GLOBAL(_14500)]
+        paddd       mm4,        MMWORD PTR[GLOBAL(_14500)]
+        paddd       mm3,        MMWORD PTR[GLOBAL(_7500)]
+        paddd       mm5,        MMWORD PTR[GLOBAL(_7500)]
+
+        psrad       mm1,        12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
+        psrad       mm4,        12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
+        psrad       mm3,        12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
+        psrad       mm5,        12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
+
+        packssdw    mm1,        mm4         ; op[1]
+        packssdw    mm3,        mm5         ; op[3]
+
+        ; done with vertical
+        ; transpose for the second stage
+        movq        mm4,        mm0         ; 00 10 20 30
+        movq        mm5,        mm2         ; 02 12 22 32
+
+        punpcklwd   mm0,        mm1         ; 00 01 10 11
+        punpckhwd   mm4,        mm1         ; 20 21 30 31
+
+        punpcklwd   mm2,        mm3         ; 02 03 12 13
+        punpckhwd   mm5,        mm3         ; 22 23 32 33
+
+        movq        mm1,        mm0         ; 00 01 10 11
+        punpckldq   mm0,        mm2         ; 00 01 02 03
+
+        punpckhdq   mm1,        mm2         ; 10 11 12 13
+
+        movq        mm2,        mm4         ; 20 21 30 31
+        punpckldq   mm2,        mm5         ; 20 21 22 23
+
+        punpckhdq   mm4,        mm5         ; 30 31 32 33
+
+        ; mm0 0
+        ; mm1 1
+        ; mm2 2
+        ; mm4 3
+
+        movq        mm5,        mm0
+        movq        mm3,        mm1
+
+        paddw       mm0,        mm4         ; a1 = 0 + 3
+        paddw       mm1,        mm2         ; b1 = 1 + 2
+
+        psubw       mm3,        mm2         ; c1 = 1 - 2
+        psubw       mm5,        mm4         ; d1 = 0 - 3
+
+        pxor        mm6,        mm6         ; zero out for compare
+
+        pcmpeqw     mm6,        mm5         ; d1 != 0
+
+        pandn       mm6,        MMWORD PTR[GLOBAL(_cmp_mask)]   ; clear upper,
+                                                                ; and keep bit 0 of lower
+
+        ; output 0 and 2
+        movq        mm2,        mm0         ; a1
+
+        paddw       mm0,        mm1         ; a1 + b1
+        psubw       mm2,        mm1         ; a1 - b1
+
+        paddw       mm0,        MMWORD PTR[GLOBAL(_7w)]
+        paddw       mm2,        MMWORD PTR[GLOBAL(_7w)]
+
+        psraw       mm0,        4           ; op[0] = (a1 + b1 + 7)>>4
+        psraw       mm2,        4           ; op[8] = (a1 - b1 + 7)>>4
+
+        movq        MMWORD PTR[rdi + 0 ],  mm0
+        movq        MMWORD PTR[rdi + 16],  mm2
+
+        ; output 1 and 3
+        ; interleave c1, d1
+        movq        mm1,        mm5         ; d1
+        punpcklwd   mm1,        mm3         ; c1 d1
+        punpckhwd   mm5,        mm3         ; c1 d1
+
+        movq        mm3,        mm1
+        movq        mm4,        mm5
+
+        pmaddwd     mm1,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+        pmaddwd     mm4,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+
+        pmaddwd     mm3,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+        pmaddwd     mm5,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+
+        paddd       mm1,        MMWORD PTR[GLOBAL(_12000)]
+        paddd       mm4,        MMWORD PTR[GLOBAL(_12000)]
+        paddd       mm3,        MMWORD PTR[GLOBAL(_51000)]
+        paddd       mm5,        MMWORD PTR[GLOBAL(_51000)]
+
+        psrad       mm1,        12          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
+        psrad       mm4,        12          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
+        psrad       mm3,        12          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
+        psrad       mm5,        12          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
+
+        packssdw    mm1,        mm4         ; op[4]
+        packssdw    mm3,        mm5         ; op[12]
+
+        paddw       mm1,        mm6         ; op[4] += (d1!=0)
+
+        movq        MMWORD PTR[rdi + 8 ],  mm1
+        movq        MMWORD PTR[rdi + 24],  mm3
+
+     ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 8
+_5352_2217:
+    dw 5352
+    dw 2217
+    dw 5352
+    dw 2217
+align 8
+_2217_neg5352:
+    dw 2217
+    dw -5352
+    dw 2217
+    dw -5352
+align 8
+_cmp_mask:
+    times 4 dw 1
+align 8
+_7w:
+    times 4 dw 7
+align 8
+_14500:
+    times 2 dd 14500
+align 8
+_7500:
+    times 2 dd 7500
+align 8
+_12000:
+    times 2 dd 12000
+align 8
+_51000:
+    times 2 dd 51000
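
dct_mmx.asm vectorizes the 4x4 forward DCT, processing four 16-bit elements
per MMX register and transposing between the two passes. A scalar C sketch
of the same arithmetic (the function name is hypothetical; the constants
and rounding match the assembly above, including the (d1 != 0) correction
that the pcmpeqw/pandn mask adds to op[4]):

    static void short_fdct4x4_sketch(short *input, short *output, int pitch) {
      int i, a1, b1, c1, d1;
      short *ip = input;
      short *op = output;

      for (i = 0; i < 4; i++) {        /* first pass: transform each row */
        a1 = (ip[0] + ip[3]) << 3;
        b1 = (ip[1] + ip[2]) << 3;
        c1 = (ip[1] - ip[2]) << 3;
        d1 = (ip[0] - ip[3]) << 3;

        op[0] = a1 + b1;
        op[2] = a1 - b1;
        op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12;
        op[3] = (d1 * 2217 - c1 * 5352 +  7500) >> 12;

        ip += pitch / 2;               /* pitch is in bytes, data in shorts */
        op += 4;
      }

      ip = output;
      op = output;
      for (i = 0; i < 4; i++) {        /* second pass: transform each column */
        a1 = ip[0] + ip[12];
        b1 = ip[4] + ip[8];
        c1 = ip[4] - ip[8];
        d1 = ip[0] - ip[12];

        op[0]  = (a1 + b1 + 7) >> 4;
        op[8]  = (a1 - b1 + 7) >> 4;
        op[4]  = ((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0);
        op[12] =  (d1 * 2217 - c1 * 5352 + 51000) >> 16;

        ip++;
        op++;
      }
    }
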
--- /dev/null
+++ b/vp9/encoder/x86/dct_sse2.asm
@@ -1,0 +1,432 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro STACK_FRAME_CREATE 0
+%if ABI_IS_32BIT
+  %define       input       rsi
+  %define       output      rdi
+  %define       pitch       rax
+    push        rbp
+    mov         rbp, rsp
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov         rsi, arg(0)
+    mov         rdi, arg(1)
+
+    movsxd      rax, dword ptr arg(2)
+    lea         rcx, [rsi + rax*2]
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    %define     input       rcx
+    %define     output      rdx
+    %define     pitch       r8
+    SAVE_XMM 7, u
+  %else
+    %define     input       rdi
+    %define     output      rsi
+    %define     pitch       rdx
+  %endif
+%endif
+%endmacro
+
+%macro STACK_FRAME_DESTROY 0
+  %define     input
+  %define     output
+  %define     pitch
+
+%if ABI_IS_32BIT
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    pop         rbp
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    RESTORE_XMM
+  %endif
+%endif
+    ret
+%endmacro
+
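+; The two macros above hide the calling-convention differences between
+; targets: 32-bit builds read the arguments from the stack (hence the
+; explicit loads into rsi/rdi/rax), Win64 passes them in rcx/rdx/r8, and
+; SysV x86-64 in rdi/rsi/rdx, so the transform bodies below can refer to
+; input/output/pitch uniformly.
+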
+;void vp9_short_fdct4x4_sse2(short *input, short *output, int pitch)
+global sym(vp9_short_fdct4x4_sse2)
+sym(vp9_short_fdct4x4_sse2):
+
+    STACK_FRAME_CREATE
+
+    movq        xmm0, MMWORD PTR[input        ] ;03 02 01 00
+    movq        xmm2, MMWORD PTR[input+  pitch] ;13 12 11 10
+    lea         input,          [input+2*pitch]
+    movq        xmm1, MMWORD PTR[input        ] ;23 22 21 20
+    movq        xmm3, MMWORD PTR[input+  pitch] ;33 32 31 30
+
+    punpcklqdq  xmm0, xmm2                      ;13 12 11 10 03 02 01 00
+    punpcklqdq  xmm1, xmm3                      ;33 32 31 30 23 22 21 20
+
+    movdqa      xmm2, xmm0
+    punpckldq   xmm0, xmm1                      ;23 22 03 02 21 20 01 00
+    punpckhdq   xmm2, xmm1                      ;33 32 13 12 31 30 11 10
+    movdqa      xmm1, xmm0
+    punpckldq   xmm0, xmm2                      ;31 30 21 20 11 10 01 00
+    pshufhw     xmm1, xmm1, 0b1h                ;22 23 02 03 xx xx xx xx
+    pshufhw     xmm2, xmm2, 0b1h                ;32 33 12 13 xx xx xx xx
+
+    punpckhdq   xmm1, xmm2                      ;32 33 22 23 12 13 02 03
+    movdqa      xmm3, xmm0
+    paddw       xmm0, xmm1                      ;b1 a1 b1 a1 b1 a1 b1 a1
+    psubw       xmm3, xmm1                      ;c1 d1 c1 d1 c1 d1 c1 d1
+    psllw       xmm0, 3                         ;b1 <<= 3 a1 <<= 3
+    psllw       xmm3, 3                         ;c1 <<= 3 d1 <<= 3
+
+    movdqa      xmm1, xmm0
+    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)]    ;a1 + b1
+    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)]    ;a1 - b1
+    movdqa      xmm4, xmm3
+    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]   ;c1*2217 + d1*5352
+    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)];d1*2217 - c1*5352
+
+    paddd       xmm3, XMMWORD PTR[GLOBAL(_14500)]
+    paddd       xmm4, XMMWORD PTR[GLOBAL(_7500)]
+    psrad       xmm3, 12            ;(c1 * 2217 + d1 * 5352 +  14500)>>12
+    psrad       xmm4, 12            ;(d1 * 2217 - c1 * 5352 +   7500)>>12
+
+    packssdw    xmm0, xmm1                      ;op[2] op[0]
+    packssdw    xmm3, xmm4                      ;op[3] op[1]
+    ; 23 22 21 20 03 02 01 00
+    ;
+    ; 33 32 31 30 13 12 11 10
+    ;
+    movdqa      xmm2, xmm0
+    punpcklqdq  xmm0, xmm3                      ;13 12 11 10 03 02 01 00
+    punpckhqdq  xmm2, xmm3                      ;23 22 21 20 33 32 31 30
+
+    movdqa      xmm3, xmm0
+    punpcklwd   xmm0, xmm2                      ;32 30 22 20 12 10 02 00
+    punpckhwd   xmm3, xmm2                      ;33 31 23 21 13 11 03 01
+    movdqa      xmm2, xmm0
+    punpcklwd   xmm0, xmm3                      ;13 12 11 10 03 02 01 00
+    punpckhwd   xmm2, xmm3                      ;33 32 31 30 23 22 21 20
+
+    movdqa      xmm5, XMMWORD PTR[GLOBAL(_7)]
+    pshufd      xmm2, xmm2, 04eh
+    movdqa      xmm3, xmm0
+    paddw       xmm0, xmm2                      ;b1 b1 b1 b1 a1 a1 a1 a1
+    psubw       xmm3, xmm2                      ;c1 c1 c1 c1 d1 d1 d1 d1
+
+    pshufd      xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 b1 a1 a1
+    movdqa      xmm2, xmm3                      ;save d1 for compare
+    pshufd      xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 c1 d1 d1
+    pshuflw     xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 a1 b1 a1
+    pshuflw     xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 d1 c1 d1
+    pshufhw     xmm0, xmm0, 0d8h                ;b1 a1 b1 a1 b1 a1 b1 a1
+    pshufhw     xmm3, xmm3, 0d8h                ;c1 d1 c1 d1 c1 d1 c1 d1
+    movdqa      xmm1, xmm0
+    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1
+    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1
+
+    pxor        xmm4, xmm4                      ;zero out for compare
+    paddd       xmm0, xmm5
+    paddd       xmm1, xmm5
+    pcmpeqw     xmm2, xmm4
+    psrad       xmm0, 4                         ;(a1 + b1 + 7)>>4
+    psrad       xmm1, 4                         ;(a1 - b1 + 7)>>4
+    pandn       xmm2, XMMWORD PTR[GLOBAL(_cmp_mask)] ;clear upper,
+                                                     ;and keep bit 0 of lower
+
+    movdqa      xmm4, xmm3
+    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]    ;c1*2217 + d1*5352
+    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352
+    paddd       xmm3, XMMWORD PTR[GLOBAL(_12000)]
+    paddd       xmm4, XMMWORD PTR[GLOBAL(_51000)]
+    packssdw    xmm0, xmm1                      ;op[8] op[0]
+    psrad       xmm3, 16                ;(c1 * 2217 + d1 * 5352 +  12000)>>16
+    psrad       xmm4, 16                ;(d1 * 2217 - c1 * 5352 +  51000)>>16
+
+    packssdw    xmm3, xmm4                      ;op[12] op[4]
+    movdqa      xmm1, xmm0
+    paddw       xmm3, xmm2                      ;op[4] += (d1!=0)
+    punpcklqdq  xmm0, xmm3                      ;op[4] op[0]
+    punpckhqdq  xmm1, xmm3                      ;op[12] op[8]
+
+    movdqa      XMMWORD PTR[output +  0], xmm0
+    movdqa      XMMWORD PTR[output + 16], xmm1
+
+    STACK_FRAME_DESTROY
+
+;void vp9_short_fdct8x4_sse2(short *input, short *output, int pitch)
+global sym(vp9_short_fdct8x4_sse2)
+sym(vp9_short_fdct8x4_sse2):
+
+    STACK_FRAME_CREATE
+
+        ; read the input data
+        movdqa      xmm0,       [input        ]
+        movdqa      xmm2,       [input+  pitch]
+        lea         input,      [input+2*pitch]
+        movdqa      xmm4,       [input        ]
+        movdqa      xmm3,       [input+  pitch]
+
+        ; transpose for the first stage
+        movdqa      xmm1,       xmm0        ; 00 01 02 03 04 05 06 07
+        movdqa      xmm5,       xmm4        ; 20 21 22 23 24 25 26 27
+
+        punpcklwd   xmm0,       xmm2        ; 00 10 01 11 02 12 03 13
+        punpckhwd   xmm1,       xmm2        ; 04 14 05 15 06 16 07 17
+
+        punpcklwd   xmm4,       xmm3        ; 20 30 21 31 22 32 23 33
+        punpckhwd   xmm5,       xmm3        ; 24 34 25 35 26 36 27 37
+
+        movdqa      xmm2,       xmm0        ; 00 10 01 11 02 12 03 13
+        punpckldq   xmm0,       xmm4        ; 00 10 20 30 01 11 21 31
+
+        punpckhdq   xmm2,       xmm4        ; 02 12 22 32 03 13 23 33
+
+        movdqa      xmm4,       xmm1        ; 04 14 05 15 06 16 07 17
+        punpckldq   xmm4,       xmm5        ; 04 14 24 34 05 15 25 35
+
+        punpckhdq   xmm1,       xmm5        ; 06 16 26 36 07 17 27 37
+        movdqa      xmm3,       xmm2        ; 02 12 22 32 03 13 23 33
+
+        punpckhqdq  xmm3,       xmm1        ; 03 13 23 33 07 17 27 37
+        punpcklqdq  xmm2,       xmm1        ; 02 12 22 32 06 16 26 36
+
+        movdqa      xmm1,       xmm0        ; 00 10 20 30 01 11 21 31
+        punpcklqdq  xmm0,       xmm4        ; 00 10 20 30 04 14 24 34
+
+        punpckhqdq  xmm1,       xmm4        ; 01 11 21 31 05 15 25 35
+
+        ; xmm0 0
+        ; xmm1 1
+        ; xmm2 2
+        ; xmm3 3
+
+        ; first stage
+        movdqa      xmm5,       xmm0
+        movdqa      xmm4,       xmm1
+
+        paddw       xmm0,       xmm3        ; a1 = 0 + 3
+        paddw       xmm1,       xmm2        ; b1 = 1 + 2
+
+        psubw       xmm4,       xmm2        ; c1 = 1 - 2
+        psubw       xmm5,       xmm3        ; d1 = 0 - 3
+
+        psllw       xmm5,        3
+        psllw       xmm4,        3
+
+        psllw       xmm0,        3
+        psllw       xmm1,        3
+
+        ; output 0 and 2
+        movdqa      xmm2,       xmm0        ; a1
+
+        paddw       xmm0,       xmm1        ; op[0] = a1 + b1
+        psubw       xmm2,       xmm1        ; op[2] = a1 - b1
+
+        ; output 1 and 3
+        ; interleave c1, d1
+        movdqa      xmm1,       xmm5        ; d1
+        punpcklwd   xmm1,       xmm4        ; c1 d1
+        punpckhwd   xmm5,       xmm4        ; c1 d1
+
+        movdqa      xmm3,       xmm1
+        movdqa      xmm4,       xmm5
+
+        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+
+        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+
+        paddd       xmm1,       XMMWORD PTR[GLOBAL(_14500)]
+        paddd       xmm4,       XMMWORD PTR[GLOBAL(_14500)]
+        paddd       xmm3,       XMMWORD PTR[GLOBAL(_7500)]
+        paddd       xmm5,       XMMWORD PTR[GLOBAL(_7500)]
+
+        psrad       xmm1,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
+        psrad       xmm4,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
+        psrad       xmm3,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
+        psrad       xmm5,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
+
+        packssdw    xmm1,       xmm4        ; op[1]
+        packssdw    xmm3,       xmm5        ; op[3]
+
+        ; done with vertical
+        ; transpose for the second stage
+        movdqa      xmm4,       xmm0         ; 00 10 20 30 04 14 24 34
+        movdqa      xmm5,       xmm2         ; 02 12 22 32 06 16 26 36
+
+        punpcklwd   xmm0,       xmm1         ; 00 01 10 11 20 21 30 31
+        punpckhwd   xmm4,       xmm1         ; 04 05 14 15 24 25 34 35
+
+        punpcklwd   xmm2,       xmm3         ; 02 03 12 13 22 23 32 33
+        punpckhwd   xmm5,       xmm3         ; 06 07 16 17 26 27 36 37
+
+        movdqa      xmm1,       xmm0         ; 00 01 10 11 20 21 30 31
+        punpckldq   xmm0,       xmm2         ; 00 01 02 03 10 11 12 13
+
+        punpckhdq   xmm1,       xmm2         ; 20 21 22 23 30 31 32 33
+
+        movdqa      xmm2,       xmm4         ; 04 05 14 15 24 25 34 35
+        punpckldq   xmm2,       xmm5         ; 04 05 06 07 14 15 16 17
+
+        punpckhdq   xmm4,       xmm5         ; 24 25 26 27 34 35 36 37
+        movdqa      xmm3,       xmm1         ; 20 21 22 23 30 31 32 33
+
+        punpckhqdq  xmm3,       xmm4         ; 30 31 32 33 34 35 36 37
+        punpcklqdq  xmm1,       xmm4         ; 20 21 22 23 24 25 26 27
+
+        movdqa      xmm4,       xmm0         ; 00 01 02 03 10 11 12 13
+        punpcklqdq  xmm0,       xmm2         ; 00 01 02 03 04 05 06 07
+
+        punpckhqdq  xmm4,       xmm2         ; 10 11 12 13 14 15 16 17
+
+        ; xmm0 0
+        ; xmm4 1
+        ; xmm1 2
+        ; xmm3 3
+
+        movdqa      xmm5,       xmm0
+        movdqa      xmm2,       xmm1
+
+        paddw       xmm0,       xmm3        ; a1 = 0 + 3
+        paddw       xmm1,       xmm4        ; b1 = 1 + 2
+
+        psubw       xmm4,       xmm2        ; c1 = 1 - 2
+        psubw       xmm5,       xmm3        ; d1 = 0 - 3
+
+        pxor        xmm6,       xmm6        ; zero out for compare
+
+        pcmpeqw     xmm6,       xmm5        ; d1 != 0
+
+        pandn       xmm6,       XMMWORD PTR[GLOBAL(_cmp_mask8x4)]   ; clear upper,
+                                                                    ; and keep bit 0 of lower
+
+        ; output 0 and 2
+        movdqa      xmm2,       xmm0        ; a1
+
+        paddw       xmm0,       xmm1        ; a1 + b1
+        psubw       xmm2,       xmm1        ; a1 - b1
+
+        paddw       xmm0,       XMMWORD PTR[GLOBAL(_7w)]
+        paddw       xmm2,       XMMWORD PTR[GLOBAL(_7w)]
+
+        psraw       xmm0,       4           ; op[0] = (a1 + b1 + 7)>>4
+        psraw       xmm2,       4           ; op[8] = (a1 - b1 + 7)>>4
+
+        ; output 1 and 3
+        ; interleave c1, d1
+        movdqa      xmm1,       xmm5        ; d1
+        punpcklwd   xmm1,       xmm4        ; c1 d1
+        punpckhwd   xmm5,       xmm4        ; c1 d1
+
+        movdqa      xmm3,       xmm1
+        movdqa      xmm4,       xmm5
+
+        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+
+        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+
+        paddd       xmm1,       XMMWORD PTR[GLOBAL(_12000)]
+        paddd       xmm4,       XMMWORD PTR[GLOBAL(_12000)]
+        paddd       xmm3,       XMMWORD PTR[GLOBAL(_51000)]
+        paddd       xmm5,       XMMWORD PTR[GLOBAL(_51000)]
+
+        psrad       xmm1,       16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
+        psrad       xmm4,       16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
+        psrad       xmm3,       16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
+        psrad       xmm5,       16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
+
+        packssdw    xmm1,       xmm4        ; op[4]
+        packssdw    xmm3,       xmm5        ; op[12]
+
+        paddw       xmm1,       xmm6        ; op[4] += (d1!=0)
+
+        movdqa      xmm4,       xmm0
+        movdqa      xmm5,       xmm2
+
+        punpcklqdq  xmm0,       xmm1
+        punpckhqdq  xmm4,       xmm1
+
+        punpcklqdq  xmm2,       xmm3
+        punpckhqdq  xmm5,       xmm3
+
+        movdqa      XMMWORD PTR[output + 0 ],  xmm0
+        movdqa      XMMWORD PTR[output + 16],  xmm2
+        movdqa      XMMWORD PTR[output + 32],  xmm4
+        movdqa      XMMWORD PTR[output + 48],  xmm5
+
+    STACK_FRAME_DESTROY
+
+SECTION_RODATA
+align 16
+_5352_2217:
+    dw 5352
+    dw 2217
+    dw 5352
+    dw 2217
+    dw 5352
+    dw 2217
+    dw 5352
+    dw 2217
+align 16
+_2217_neg5352:
+    dw 2217
+    dw -5352
+    dw 2217
+    dw -5352
+    dw 2217
+    dw -5352
+    dw 2217
+    dw -5352
+align 16
+_mult_add:
+    times 8 dw 1
+align 16
+_cmp_mask:
+    times 4 dw 1
+    times 4 dw 0
+align 16
+_cmp_mask8x4:
+    times 8 dw 1
+align 16
+_mult_sub:
+    dw 1
+    dw -1
+    dw 1
+    dw -1
+    dw 1
+    dw -1
+    dw 1
+    dw -1
+align 16
+_7:
+    times 4 dd 7
+align 16
+_7w:
+    times 8 dw 7
+align 16
+_14500:
+    times 4 dd 14500
+align 16
+_7500:
+    times 4 dd 7500
+align 16
+_12000:
+    times 4 dd 12000
+align 16
+_51000:
+    times 4 dd 51000
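
vp9_short_fdct8x4_sse2 behaves like the 4x4 transform applied to two
horizontally adjacent blocks, with each 128-bit register holding one row of
both blocks. In terms of the scalar sketch given after dct_mmx.asm above
(names hypothetical):

    static void short_fdct8x4_sketch(short *input, short *output, int pitch) {
      short_fdct4x4_sketch(input,     output,      pitch);  /* left block */
      short_fdct4x4_sketch(input + 4, output + 16, pitch);  /* right block */
    }
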
--- /dev/null
+++ b/vp9/encoder/x86/encodeopt.asm
@@ -1,0 +1,386 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;int vp9_block_error_xmm(short *coeff_ptr,  short *dcoef_ptr)
+global sym(vp9_block_error_xmm)
+sym(vp9_block_error_xmm):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 2
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov         rsi,        arg(0) ;coeff_ptr
+        mov         rdi,        arg(1) ;dcoef_ptr
+
+        movdqa      xmm0,       [rsi]
+        movdqa      xmm1,       [rdi]
+
+        movdqa      xmm2,       [rsi+16]
+        movdqa      xmm3,       [rdi+16]
+
+        psubw       xmm0,       xmm1
+        psubw       xmm2,       xmm3
+
+        pmaddwd     xmm0,       xmm0
+        pmaddwd     xmm2,       xmm2
+
+        paddd       xmm0,       xmm2
+
+        pxor        xmm5,       xmm5
+        movdqa      xmm1,       xmm0
+
+        punpckldq   xmm0,       xmm5
+        punpckhdq   xmm1,       xmm5
+
+        paddd       xmm0,       xmm1
+        movdqa      xmm1,       xmm0
+
+        psrldq      xmm0,       8
+        paddd       xmm0,       xmm1
+
+        movq        rax,        xmm0
+
+    pop rdi
+    pop rsi
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;int vp9_block_error_mmx(short *coeff_ptr,  short *dcoef_ptr)
+global sym(vp9_block_error_mmx)
+sym(vp9_block_error_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 2
+    push rsi
+    push rdi
+    ; end prolog
+
+
+        mov         rsi,        arg(0) ;coeff_ptr
+        pxor        mm7,        mm7
+
+        mov         rdi,        arg(1) ;dcoef_ptr
+        movq        mm3,        [rsi]
+
+        movq        mm4,        [rdi]
+        movq        mm5,        [rsi+8]
+
+        movq        mm6,        [rdi+8]
+        pxor        mm1,        mm1 ; dc == 0 here (was: movd mm1, dc)
+
+        movq        mm2,        mm7
+        psubw       mm5,        mm6
+
+        por         mm1,        mm2
+        pmaddwd     mm5,        mm5
+
+        pcmpeqw     mm1,        mm7
+        psubw       mm3,        mm4
+
+        pand        mm1,        mm3
+        pmaddwd     mm1,        mm1
+
+        paddd       mm1,        mm5
+        movq        mm3,        [rsi+16]
+
+        movq        mm4,        [rdi+16]
+        movq        mm5,        [rsi+24]
+
+        movq        mm6,        [rdi+24]
+        psubw       mm5,        mm6
+
+        pmaddwd     mm5,        mm5
+        psubw       mm3,        mm4
+
+        pmaddwd     mm3,        mm3
+        paddd       mm3,        mm5
+
+        paddd       mm1,        mm3
+        movq        mm0,        mm1
+
+        psrlq       mm1,        32
+        paddd       mm0,        mm1
+
+        movq        rax,        mm0
+
+    pop rdi
+    pop rsi
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;int vp9_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+global sym(vp9_mbblock_error_mmx_impl)
+sym(vp9_mbblock_error_mmx_impl):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 3
+    push rsi
+    push rdi
+    ; end prolog
+
+
+        mov         rsi,        arg(0) ;coeff_ptr
+        pxor        mm7,        mm7
+
+        mov         rdi,        arg(1) ;dcoef_ptr
+        pxor        mm2,        mm2
+
+        movd        mm1,        dword ptr arg(2) ;dc
+        por         mm1,        mm2
+
+        pcmpeqw     mm1,        mm7
+        mov         rcx,        16
+
+.mberror_loop_mmx:
+        movq        mm3,       [rsi]
+        movq        mm4,       [rdi]
+
+        movq        mm5,       [rsi+8]
+        movq        mm6,       [rdi+8]
+
+
+        psubw       mm5,        mm6
+        pmaddwd     mm5,        mm5
+
+        psubw       mm3,        mm4
+        pand        mm3,        mm1
+
+        pmaddwd     mm3,        mm3
+        paddd       mm2,        mm5
+
+        paddd       mm2,        mm3
+        movq        mm3,       [rsi+16]
+
+        movq        mm4,       [rdi+16]
+        movq        mm5,       [rsi+24]
+
+        movq        mm6,       [rdi+24]
+        psubw       mm5,        mm6
+
+        pmaddwd     mm5,        mm5
+        psubw       mm3,        mm4
+
+        pmaddwd     mm3,        mm3
+        paddd       mm2,        mm5
+
+        paddd       mm2,        mm3
+        add         rsi,        32
+
+        add         rdi,        32
+        sub         rcx,        1
+
+        jnz         .mberror_loop_mmx
+
+        movq        mm0,        mm2
+        psrlq       mm2,        32
+
+        paddd       mm0,        mm2
+        movq        rax,        mm0
+
+    pop rdi
+    pop rsi
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;int vp9_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+global sym(vp9_mbblock_error_xmm_impl)
+sym(vp9_mbblock_error_xmm_impl):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 3
+    SAVE_XMM 6
+    push rsi
+    push rdi
+    ; end prolog
+
+
+        mov         rsi,        arg(0) ;coeff_ptr
+        pxor        xmm6,       xmm6
+
+        mov         rdi,        arg(1) ;dcoef_ptr
+        pxor        xmm4,       xmm4
+
+        movd        xmm5,       dword ptr arg(2) ;dc
+        por         xmm5,       xmm4
+
+        pcmpeqw     xmm5,       xmm6
+        mov         rcx,        16
+
+.mberror_loop:
+        movdqa      xmm0,       [rsi]
+        movdqa      xmm1,       [rdi]
+
+        movdqa      xmm2,       [rsi+16]
+        movdqa      xmm3,       [rdi+16]
+
+
+        psubw       xmm2,       xmm3
+        pmaddwd     xmm2,       xmm2
+
+        psubw       xmm0,       xmm1
+        pand        xmm0,       xmm5
+
+        pmaddwd     xmm0,       xmm0
+        add         rsi,        32
+
+        add         rdi,        32
+
+        sub         rcx,        1
+        paddd       xmm4,       xmm2
+
+        paddd       xmm4,       xmm0
+        jnz         .mberror_loop
+
+        movdqa      xmm0,       xmm4
+        punpckldq   xmm0,       xmm6
+
+        punpckhdq   xmm4,       xmm6
+        paddd       xmm0,       xmm4
+
+        movdqa      xmm1,       xmm0
+        psrldq      xmm0,       8
+
+        paddd       xmm0,       xmm1
+        movq        rax,        xmm0
+
+    pop rdi
+    pop rsi
+    ; begin epilog
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;int vp9_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
+global sym(vp9_mbuverror_mmx_impl)
+sym(vp9_mbuverror_mmx_impl):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 2
+    push rsi
+    push rdi
+    ; end prolog
+
+
+        mov             rsi,        arg(0) ;s_ptr
+        mov             rdi,        arg(1) ;d_ptr
+
+        mov             rcx,        16
+        pxor            mm7,        mm7
+
+.mbuverror_loop_mmx:
+
+        movq            mm1,        [rsi]
+        movq            mm2,        [rdi]
+
+        psubw           mm1,        mm2
+        pmaddwd         mm1,        mm1
+
+
+        movq            mm3,        [rsi+8]
+        movq            mm4,        [rdi+8]
+
+        psubw           mm3,        mm4
+        pmaddwd         mm3,        mm3
+
+
+        paddd           mm7,        mm1
+        paddd           mm7,        mm3
+
+
+        add             rsi,        16
+        add             rdi,        16
+
+        dec             rcx
+        jnz             .mbuverror_loop_mmx
+
+        movq            mm0,        mm7
+        psrlq           mm7,        32
+
+        paddd           mm0,        mm7
+        movq            rax,        mm0
+
+    pop rdi
+    pop rsi
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;int vp9_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
+global sym(vp9_mbuverror_xmm_impl)
+sym(vp9_mbuverror_xmm_impl):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 2
+    push rsi
+    push rdi
+    ; end prolog
+
+
+        mov             rsi,        arg(0) ;s_ptr
+        mov             rdi,        arg(1) ;d_ptr
+
+        mov             rcx,        16
+        pxor            xmm3,       xmm3
+
+.mbuverror_loop:
+
+        movdqa          xmm1,       [rsi]
+        movdqa          xmm2,       [rdi]
+
+        psubw           xmm1,       xmm2
+        pmaddwd         xmm1,       xmm1
+
+        paddd           xmm3,       xmm1
+
+        add             rsi,        16
+        add             rdi,        16
+
+        dec             rcx
+        jnz             .mbuverror_loop
+
+        pxor        xmm0,           xmm0
+        movdqa      xmm1,           xmm3
+
+        movdqa      xmm2,           xmm1
+        punpckldq   xmm1,           xmm0
+
+        punpckhdq   xmm2,           xmm0
+        paddd       xmm1,           xmm2
+
+        movdqa      xmm2,           xmm1
+
+        psrldq      xmm1,           8
+        paddd       xmm1,           xmm2
+
+        movq            rax,            xmm1
+
+    pop rdi
+    pop rsi
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
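
All the routines in encodeopt.asm compute the same distortion measure: the
sum of squared differences between the original transform coefficients and
their dequantized counterparts. A minimal C sketch (the name is
hypothetical; the dc argument of the *_impl variants controls whether the
DC coefficient of each block is excluded, which is what the pcmpeqw/pand
masking implements):

    static int block_error_sketch(const short *coeff, const short *dqcoeff,
                                  int n, int skip_dc) {
      int i, error = 0;
      for (i = skip_dc ? 1 : 0; i < n; i++) {
        int diff = coeff[i] - dqcoeff[i];
        error += diff * diff;
      }
      return error;
    }

vp9_block_error_* use n = 16 for a single block, the mbblock variants
iterate this over the 16 luma blocks of a macroblock, and the mbuverror
variants over the eight 4x4 chroma blocks.
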
--- /dev/null
+++ b/vp9/encoder/x86/fwalsh_sse2.asm
@@ -1,0 +1,164 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp9_short_walsh4x4_sse2(short *input, short *output, int pitch)
+global sym(vp9_short_walsh4x4_sse2)
+sym(vp9_short_walsh4x4_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 3
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov     rsi, arg(0)           ; input
+    mov     rdi, arg(1)           ; output
+    movsxd  rdx, dword ptr arg(2) ; pitch
+
+    ; first for loop
+    movq    xmm0, MMWORD PTR [rsi]           ; load input
+    movq    xmm1, MMWORD PTR [rsi + rdx]
+    lea     rsi,  [rsi + rdx*2]
+    movq    xmm2, MMWORD PTR [rsi]
+    movq    xmm3, MMWORD PTR [rsi + rdx]
+
+    punpcklwd xmm0,  xmm1
+    punpcklwd xmm2,  xmm3
+
+    movdqa    xmm1, xmm0
+    punpckldq xmm0, xmm2           ; ip[1] ip[0]
+    punpckhdq xmm1, xmm2           ; ip[3] ip[2]
+
+    movdqa    xmm2, xmm0
+    paddw     xmm0, xmm1
+    psubw     xmm2, xmm1
+
+    psllw     xmm0, 2              ; d1  a1
+    psllw     xmm2, 2              ; c1  b1
+
+    movdqa    xmm1, xmm0
+    punpcklqdq xmm0, xmm2          ; b1  a1
+    punpckhqdq xmm1, xmm2          ; c1  d1
+
+    pxor      xmm6, xmm6
+    movq      xmm6, xmm0
+    pxor      xmm7, xmm7
+    pcmpeqw   xmm7, xmm6
+    paddw     xmm7, [GLOBAL(c1)]
+
+    movdqa    xmm2, xmm0
+    paddw     xmm0, xmm1           ; b1+c1  a1+d1
+    psubw     xmm2, xmm1           ; b1-c1  a1-d1
+    paddw     xmm0, xmm7           ; b1+c1  a1+d1+(a1!=0)
+
+    ; second for loop
+    ; input: 13  9  5  1 12  8  4  0 (xmm0)
+    ;        14 10  6  2 15 11  7  3 (xmm2)
+    ; after shuffle:
+    ;        13  5  9  1 12  4  8  0 (xmm0)
+    ;        14  6 10  2 15  7 11  3 (xmm1)
+    pshuflw   xmm3, xmm0, 0xd8
+    pshufhw   xmm0, xmm3, 0xd8
+    pshuflw   xmm3, xmm2, 0xd8
+    pshufhw   xmm1, xmm3, 0xd8
+
+    movdqa    xmm2, xmm0
+    pmaddwd   xmm0, [GLOBAL(c1)]    ; d11 a11 d10 a10
+    pmaddwd   xmm2, [GLOBAL(cn1)]   ; c11 b11 c10 b10
+    movdqa    xmm3, xmm1
+    pmaddwd   xmm1, [GLOBAL(c1)]    ; d12 a12 d13 a13
+    pmaddwd   xmm3, [GLOBAL(cn1)]   ; c12 b12 c13 b13
+
+    pshufd    xmm4, xmm0, 0xd8      ; d11 d10 a11 a10
+    pshufd    xmm5, xmm2, 0xd8      ; c11 c10 b11 b10
+    pshufd    xmm6, xmm1, 0x72      ; d13 d12 a13 a12
+    pshufd    xmm7, xmm3, 0x72      ; c13 c12 b13 b12
+
+    movdqa    xmm0, xmm4
+    punpcklqdq xmm0, xmm5           ; b11 b10 a11 a10
+    punpckhqdq xmm4, xmm5           ; c11 c10 d11 d10
+    movdqa    xmm1, xmm6
+    punpcklqdq xmm1, xmm7           ; b13 b12 a13 a12
+    punpckhqdq xmm6, xmm7           ; c13 c12 d13 d12
+
+    movdqa    xmm2, xmm0
+    paddd     xmm0, xmm4            ; b21 b20 a21 a20
+    psubd     xmm2, xmm4            ; c21 c20 d21 d20
+    movdqa    xmm3, xmm1
+    paddd     xmm1, xmm6            ; b23 b22 a23 a22
+    psubd     xmm3, xmm6            ; c23 c22 d23 d22
+
+    pxor      xmm4, xmm4
+    movdqa    xmm5, xmm4
+    pcmpgtd   xmm4, xmm0
+    pcmpgtd   xmm5, xmm2
+    pand      xmm4, [GLOBAL(cd1)]
+    pand      xmm5, [GLOBAL(cd1)]
+
+    pxor      xmm6, xmm6
+    movdqa    xmm7, xmm6
+    pcmpgtd   xmm6, xmm1
+    pcmpgtd   xmm7, xmm3
+    pand      xmm6, [GLOBAL(cd1)]
+    pand      xmm7, [GLOBAL(cd1)]
+
+    paddd     xmm0, xmm4
+    paddd     xmm2, xmm5
+    paddd     xmm0, [GLOBAL(cd3)]
+    paddd     xmm2, [GLOBAL(cd3)]
+    paddd     xmm1, xmm6
+    paddd     xmm3, xmm7
+    paddd     xmm1, [GLOBAL(cd3)]
+    paddd     xmm3, [GLOBAL(cd3)]
+
+    psrad     xmm0, 3
+    psrad     xmm1, 3
+    psrad     xmm2, 3
+    psrad     xmm3, 3
+    movdqa    xmm4, xmm0
+    punpcklqdq xmm0, xmm1           ; a23 a22 a21 a20
+    punpckhqdq xmm4, xmm1           ; b23 b22 b21 b20
+    movdqa    xmm5, xmm2
+    punpckhqdq xmm2, xmm3           ; c23 c22 c21 c20
+    punpcklqdq xmm5, xmm3           ; d23 d22 d21 d20
+
+    packssdw  xmm0, xmm4            ; b23 b22 b21 b20 a23 a22 a21 a20
+    packssdw  xmm2, xmm5            ; d23 d22 d21 d20 c23 c22 c21 c20
+
+    movdqa  XMMWORD PTR [rdi], xmm0
+    movdqa  XMMWORD PTR [rdi + 16], xmm2
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+c1:
+    dw 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001
+align 16
+cn1:
+    dw 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff
+align 16
+cd1:
+    dd 0x00000001, 0x00000001, 0x00000001, 0x00000001
+align 16
+cd3:
+    dd 0x00000003, 0x00000003, 0x00000003, 0x00000003
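
vp9_short_walsh4x4_sse2 is the 4x4 Walsh-Hadamard transform applied to the
second-order (Y2) DC block. A scalar C sketch of the same arithmetic (name
hypothetical; it mirrors the rounding in the assembly: inputs scaled by 4,
the (a1 != 0) correction from the pcmpeqw/paddw pair in pass one, and the
sign bias plus (x + 3) >> 3 rounding from pcmpgtd and the cd1/cd3 constants
in pass two):

    static void short_walsh4x4_sketch(short *input, short *output, int pitch) {
      int i, a1, b1, c1, d1, a2, b2, c2, d2;
      short *ip = input;
      short *op = output;

      for (i = 0; i < 4; i++) {              /* first pass: rows */
        a1 = (ip[0] + ip[2]) << 2;
        d1 = (ip[1] + ip[3]) << 2;
        c1 = (ip[1] - ip[3]) << 2;
        b1 = (ip[0] - ip[2]) << 2;

        op[0] = a1 + d1 + (a1 != 0);
        op[1] = b1 + c1;
        op[2] = b1 - c1;
        op[3] = a1 - d1;

        ip += pitch / 2;
        op += 4;
      }

      ip = output;
      op = output;
      for (i = 0; i < 4; i++) {              /* second pass: columns */
        a1 = ip[0] + ip[8];
        d1 = ip[4] + ip[12];
        c1 = ip[4] - ip[12];
        b1 = ip[0] - ip[8];

        a2 = a1 + d1;
        b2 = b1 + c1;
        c2 = b1 - c1;
        d2 = a1 - d1;

        a2 += a2 < 0;                        /* bias negatives before the shift */
        b2 += b2 < 0;
        c2 += c2 < 0;
        d2 += d2 < 0;

        op[0]  = (a2 + 3) >> 3;
        op[4]  = (b2 + 3) >> 3;
        op[8]  = (c2 + 3) >> 3;
        op[12] = (d2 + 3) >> 3;

        ip++;
        op++;
      }
    }
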
--- /dev/null
+++ b/vp9/encoder/x86/mcomp_x86.h
@@ -1,0 +1,40 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef MCOMP_X86_H
+#define MCOMP_X86_H
+
+#if HAVE_SSE3
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef  vp9_search_full_search
+#define vp9_search_full_search vp9_full_search_sadx3
+
+#undef  vp9_search_refining_search
+#define vp9_search_refining_search vp9_refining_search_sadx4
+
+#undef  vp9_search_diamond_search
+#define vp9_search_diamond_search vp9_diamond_search_sadx4
+
+#endif
+#endif
+
+#if HAVE_SSE4_1
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef  vp9_search_full_search
+#define vp9_search_full_search vp9_full_search_sadx8
+
+#endif
+#endif
+
+#endif
+
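The header works by compile-time rebinding: with runtime CPU detection
disabled, redefining the generic search names makes every call site resolve
directly to the SIMD routine, with no function-pointer indirection. The
pattern, reduced to a hypothetical example:

    /* Illustration only; generic_search and simd_search are made-up names. */
    int generic_search(int arg);          /* C reference implementation */
    int simd_search(int arg);             /* SIMD version, same signature */

    #undef  generic_search
    #define generic_search simd_search    /* callers now bind to the SIMD one */
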
--- /dev/null
+++ b/vp9/encoder/x86/quantize_mmx.asm
@@ -1,0 +1,286 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;int vp9_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
+;                           short *qcoeff_ptr,short *dequant_ptr,
+;                           short *scan_mask, short *round_ptr,
+;                           short *quant_ptr, short *dqcoeff_ptr);
+global sym(vp9_fast_quantize_b_impl_mmx)
+sym(vp9_fast_quantize_b_impl_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    push rsi
+    push rdi
+    ; end prolog
+
+
+        mov             rsi,        arg(0) ;coeff_ptr
+        movq            mm0,        [rsi]
+
+        mov             rax,        arg(1) ;zbin_ptr
+        movq            mm1,        [rax]
+
+        movq            mm3,        mm0
+        psraw           mm0,        15
+
+        pxor            mm3,        mm0
+        psubw           mm3,        mm0         ; abs
+
+        movq            mm2,        mm3
+        pcmpgtw         mm1,        mm2
+
+        pandn           mm1,        mm2
+        movq            mm3,        mm1
+
+        mov             rdx,        arg(6) ;quant_ptr
+        movq            mm1,        [rdx]
+
+        mov             rcx,        arg(5) ;round_ptr
+        movq            mm2,        [rcx]
+
+        paddw           mm3,        mm2
+        pmulhuw         mm3,        mm1
+
+        pxor            mm3,        mm0
+        psubw           mm3,        mm0     ;restore the sign
+
+        mov             rdi,        arg(2) ;qcoeff_ptr
+        movq            mm0,        mm3
+
+        movq            [rdi],      mm3
+
+        mov             rax,        arg(3) ;dequant_ptr
+        movq            mm2,        [rax]
+
+        pmullw          mm3,        mm2
+        mov             rax,        arg(7) ;dqcoeff_ptr
+
+        movq            [rax],      mm3
+
+        ; next 8
+        movq            mm4,        [rsi+8]
+
+        mov             rax,        arg(1) ;zbin_ptr
+        movq            mm5,        [rax+8]
+
+        movq            mm7,        mm4
+        psraw           mm4,        15
+
+        pxor            mm7,        mm4
+        psubw           mm7,        mm4         ; abs
+
+        movq            mm6,        mm7
+        pcmpgtw         mm5,        mm6
+
+        pandn           mm5,        mm6
+        movq            mm7,        mm5
+
+        movq            mm5,        [rdx+8]
+        movq            mm6,        [rcx+8]
+
+        paddw           mm7,        mm6
+        pmulhuw         mm7,        mm5
+
+        pxor            mm7,        mm4
+        psubw           mm7,        mm4     ;restore the sign
+
+        mov             rdi,        arg(2) ;qcoeff_ptr
+
+        movq            mm1,        mm7
+        movq            [rdi+8],    mm7
+
+        mov             rax,        arg(3) ;dequant_ptr
+        movq            mm6,        [rax+8]
+
+        pmullw          mm7,        mm6
+        mov             rax,        arg(7) ;dqcoeff_ptr
+
+        movq            [rax+8],    mm7
+
+
+                ; next 8
+        movq            mm4,        [rsi+16]
+
+        mov             rax,        arg(1) ;zbin_ptr
+        movq            mm5,        [rax+16]
+
+        movq            mm7,        mm4
+        psraw           mm4,        15
+
+        pxor            mm7,        mm4
+        psubw           mm7,        mm4         ; abs
+
+        movq            mm6,        mm7
+        pcmpgtw         mm5,        mm6
+
+        pandn           mm5,        mm6
+        movq            mm7,        mm5
+
+        movq            mm5,        [rdx+16]
+        movq            mm6,        [rcx+16]
+
+        paddw           mm7,        mm6
+        pmulhuw         mm7,        mm5
+
+        pxor            mm7,        mm4
+        psubw           mm7,        mm4     ;restore the sign
+
+        mov             rdi,        arg(2) ;qcoeff_ptr
+
+        movq            mm1,        mm7
+        movq            [rdi+16],   mm7
+
+        mov             rax,        arg(3) ;dequant_ptr
+        movq            mm6,        [rax+16]
+
+        pmullw          mm7,        mm6
+        mov             rax,        arg(7) ;dqcoeff_ptr
+
+        movq            [rax+16],   mm7
+
+
+                ; next 8
+        movq            mm4,        [rsi+24]
+
+        mov             rax,        arg(1) ;zbin_ptr
+        movq            mm5,        [rax+24]
+
+        movq            mm7,        mm4
+        psraw           mm4,        15
+
+        pxor            mm7,        mm4
+        psubw           mm7,        mm4         ; abs
+
+        movq            mm6,        mm7
+        pcmpgtw         mm5,        mm6
+
+        pandn           mm5,        mm6
+        movq            mm7,        mm5
+
+        movq            mm5,        [rdx+24]
+        movq            mm6,        [rcx+24]
+
+        paddw           mm7,        mm6
+        pmulhuw         mm7,        mm5
+
+        pxor            mm7,        mm4
+        psubw           mm7,        mm4     ;restore the sign
+
+        mov             rdi,        arg(2) ;qcoeff_ptr
+
+        movq            mm1,        mm7
+        movq            [rdi+24],   mm7
+
+        mov             rax,        arg(3) ;dequant_ptr
+        movq            mm6,        [rax+24]
+
+        pmullw          mm7,        mm6
+        mov             rax,        arg(7) ;dqcoeff_ptr
+
+        movq            [rax+24],   mm7
+
+
+
+        mov             rdi,        arg(4) ;scan_mask
+        mov             rsi,        arg(2) ;qcoeff_ptr
+
+        pxor            mm5,        mm5
+        pxor            mm7,        mm7
+
+        movq            mm0,        [rsi]
+        movq            mm1,        [rsi+8]
+
+        movq            mm2,        [rdi]
+        movq            mm3,        [rdi+8]
+
+        pcmpeqw         mm0,        mm7
+        pcmpeqw         mm1,        mm7
+
+        pcmpeqw         mm6,        mm6
+        pxor            mm0,        mm6
+
+        pxor            mm1,        mm6
+        psrlw           mm0,        15
+
+        psrlw           mm1,        15
+        pmaddwd         mm0,        mm2
+
+        pmaddwd         mm1,        mm3
+        movq            mm5,        mm0
+
+        paddd           mm5,        mm1
+
+        movq            mm0,        [rsi+16]
+        movq            mm1,        [rsi+24]
+
+        movq            mm2,        [rdi+16]
+        movq            mm3,        [rdi+24]
+
+        pcmpeqw         mm0,        mm7
+        pcmpeqw         mm1,        mm7
+
+        pcmpeqw         mm6,        mm6
+        pxor            mm0,        mm6
+
+        pxor            mm1,        mm6
+        psrlw           mm0,        15
+
+        psrlw           mm1,        15
+        pmaddwd         mm0,        mm2
+
+        pmaddwd         mm1,        mm3
+        paddd           mm5,        mm0
+
+        paddd           mm5,        mm1
+        movq            mm0,        mm5
+
+        psrlq           mm5,        32
+        paddd           mm0,        mm5
+
+        ; eob adjustment begins here
+        movq            rcx,        mm0
+        and             rcx,        0xffff
+
+        xor             rdx,        rdx
+        sub             rdx,        rcx ; rdx=-rcx
+
+        bsr             rax,        rcx
+        inc             rax
+
+        sar             rdx,        31
+        and             rax,        rdx
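+        ; What the sequence above computes: rcx accumulates one scan_mask
+        ; bit per nonzero quantized coefficient, so bsr finds the zig-zag
+        ; index of the last nonzero coefficient and inc turns that into a
+        ; count. bsr leaves rax undefined when rcx == 0, so rdx = -rcx is
+        ; arithmetically shifted into an all-ones mask only when rcx != 0,
+        ; forcing eob to 0 for an all-zero block.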
+        ; This replaces the old mixed MMX assembly/C implementation; the
+        ; original logic is kept below for reference.
+        ;    movq            rcx,        mm0
+        ;    bsr             rax,        rcx
+        ;
+        ;    mov             eob,        rax
+        ;    mov             eee,        rcx
+        ;
+        ;if(eee==0)
+        ;{
+        ;    eob=-1;
+        ;}
+        ;else if(eee<0)
+        ;{
+        ;    eob=15;
+        ;}
+        ;d->eob = eob+1;
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
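
Per coefficient, the fast quantizer above applies a dead zone, quantizes,
restores the sign, and dequantizes, eight coefficients per MMX pass. A
scalar C sketch (name hypothetical; pmulhuw supplies the unsigned
high-word multiply, i.e. the >> 16):

    static void fast_quantize_sketch(const short *coeff, const short *zbin,
                                     const short *round,
                                     const unsigned short *quant,
                                     const short *dequant,
                                     short *qcoeff, short *dqcoeff) {
      int i;
      for (i = 0; i < 16; i++) {
        int z  = coeff[i];
        int sz = z >> 31;                /* 0 or -1: sign of z */
        int x  = (z ^ sz) - sz;          /* abs(z) */
        if (x < zbin[i])                 /* dead zone: below zbin drops to 0 */
          x = 0;
        x = ((x + round[i]) * quant[i]) >> 16;
        x = (x ^ sz) - sz;               /* restore the sign */
        qcoeff[i]  = (short)x;
        dqcoeff[i] = (short)(x * dequant[i]);
      }
    }

The scan_mask/bsr sequence at the end of the routine then derives eob from
which coefficients survived quantization.
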
--- /dev/null
+++ b/vp9/encoder/x86/quantize_sse2.asm
@@ -1,0 +1,380 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+%include "asm_enc_offsets.asm"
+
+
+; void vp9_regular_quantize_b_sse2 | arg
+;  (BLOCK  *b,                     |  0
+;   BLOCKD *d)                     |  1
+
+global sym(vp9_regular_quantize_b_sse2)
+sym(vp9_regular_quantize_b_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SAVE_XMM 7
+    GET_GOT     rbx
+
+%if ABI_IS_32BIT
+    push        rdi
+    push        rsi
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    push        rdi
+    push        rsi
+  %endif
+%endif
+
+    ALIGN_STACK 16, rax
+    %define zrun_zbin_boost   0  ;  8
+    %define abs_minus_zbin    8  ; 32
+    %define temp_qcoeff       40 ; 32
+    %define qcoeff            72 ; 32
+    %define stack_size        104
+    sub         rsp, stack_size
+    ; end prolog
+
+%if ABI_IS_32BIT
+    mov         rdi, arg(0)                 ; BLOCK *b
+    mov         rsi, arg(1)                 ; BLOCKD *d
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    mov         rdi, rcx                    ; BLOCK *b
+    mov         rsi, rdx                    ; BLOCKD *d
+  %else
+    ;mov         rdi, rdi                    ; BLOCK *b
+    ;mov         rsi, rsi                    ; BLOCKD *d
+  %endif
+%endif
+
+    mov         rdx, [rdi + vp9_block_coeff] ; coeff_ptr
+    mov         rcx, [rdi + vp9_block_zbin] ; zbin_ptr
+    movd        xmm7, [rdi + vp9_block_zbin_extra] ; zbin_oq_value
+
+    ; z
+    movdqa      xmm0, [rdx]
+    movdqa      xmm4, [rdx + 16]
+    mov         rdx, [rdi + vp9_block_round] ; round_ptr
+
+    pshuflw     xmm7, xmm7, 0
+    punpcklwd   xmm7, xmm7                  ; duplicated zbin_oq_value
+
+    movdqa      xmm1, xmm0
+    movdqa      xmm5, xmm4
+
+    ; sz
+    psraw       xmm0, 15
+    psraw       xmm4, 15
+
+    ; (z ^ sz)
+    pxor        xmm1, xmm0
+    pxor        xmm5, xmm4
+
+    ; x = abs(z)
+    psubw       xmm1, xmm0
+    psubw       xmm5, xmm4
+
+    movdqa      xmm2, [rcx]
+    movdqa      xmm3, [rcx + 16]
+    mov         rcx, [rdi + vp9_block_quant] ; quant_ptr
+
+    ; *zbin_ptr + zbin_oq_value
+    paddw       xmm2, xmm7
+    paddw       xmm3, xmm7
+
+    ; x - (*zbin_ptr + zbin_oq_value)
+    psubw       xmm1, xmm2
+    psubw       xmm5, xmm3
+    movdqa      [rsp + abs_minus_zbin], xmm1
+    movdqa      [rsp + abs_minus_zbin + 16], xmm5
+
+    ; add (zbin_ptr + zbin_oq_value) back
+    paddw       xmm1, xmm2
+    paddw       xmm5, xmm3
+
+    movdqa      xmm2, [rdx]
+    movdqa      xmm6, [rdx + 16]
+
+    movdqa      xmm3, [rcx]
+    movdqa      xmm7, [rcx + 16]
+
+    ; x + round
+    paddw       xmm1, xmm2
+    paddw       xmm5, xmm6
+
+    ; y = x * quant_ptr >> 16
+    pmulhw      xmm3, xmm1
+    pmulhw      xmm7, xmm5
+
+    ; y += x
+    paddw       xmm1, xmm3
+    paddw       xmm5, xmm7
+
+    movdqa      [rsp + temp_qcoeff], xmm1
+    movdqa      [rsp + temp_qcoeff + 16], xmm5
+
+    pxor        xmm6, xmm6
+    ; zero qcoeff
+    movdqa      [rsp + qcoeff], xmm6
+    movdqa      [rsp + qcoeff + 16], xmm6
+
+    mov         rdx, [rdi + vp9_block_zrun_zbin_boost] ; zbin_boost_ptr
+    mov         rax, [rdi + vp9_block_quant_shift] ; quant_shift_ptr
+    mov         [rsp + zrun_zbin_boost], rdx
+
+%macro ZIGZAG_LOOP 1
+    ; x
+    movsx       ecx, WORD PTR[rsp + abs_minus_zbin + %1 * 2]
+
+    ; if (x >= zbin)
+    sub         cx, WORD PTR[rdx]           ; x - zbin
+    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++
+    jl          .rq_zigzag_loop_%1           ; x < zbin
+
+    movsx       edi, WORD PTR[rsp + temp_qcoeff + %1 * 2]
+
+    ; downshift by quant_shift[rc]
+    movsx       cx, BYTE PTR[rax + %1]      ; quant_shift_ptr[rc]
+    sar         edi, cl                     ; also sets Z bit
+    je          .rq_zigzag_loop_%1           ; !y
+    mov         WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc]
+    mov         rdx, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost
+.rq_zigzag_loop_%1:
+%endmacro
+; in vp9_default_zig_zag1d order: see vp9/common/entropy.c
+ZIGZAG_LOOP  0
+ZIGZAG_LOOP  1
+ZIGZAG_LOOP  4
+ZIGZAG_LOOP  8
+ZIGZAG_LOOP  5
+ZIGZAG_LOOP  2
+ZIGZAG_LOOP  3
+ZIGZAG_LOOP  6
+ZIGZAG_LOOP  9
+ZIGZAG_LOOP 12
+ZIGZAG_LOOP 13
+ZIGZAG_LOOP 10
+ZIGZAG_LOOP  7
+ZIGZAG_LOOP 11
+ZIGZAG_LOOP 14
+ZIGZAG_LOOP 15
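+; The sixteen expansions above visit the coefficients in zig-zag order. The
+; effective zero bin grows with the distance from the last nonzero
+; coefficient: zbin_boost_ptr advances one entry per coefficient and is
+; reset to the start of the boost table whenever a coefficient quantizes to
+; a nonzero value.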
+
+    movdqa      xmm2, [rsp + qcoeff]
+    movdqa      xmm3, [rsp + qcoeff + 16]
+
+    mov         rcx, [rsi + vp9_blockd_dequant] ; dequant_ptr
+    mov         rdi, [rsi + vp9_blockd_dqcoeff] ; dqcoeff_ptr
+
+    ; y ^ sz
+    pxor        xmm2, xmm0
+    pxor        xmm3, xmm4
+    ; x = (y ^ sz) - sz
+    psubw       xmm2, xmm0
+    psubw       xmm3, xmm4
+
+    ; dequant
+    movdqa      xmm0, [rcx]
+    movdqa      xmm1, [rcx + 16]
+
+    mov         rcx, [rsi + vp9_blockd_qcoeff] ; qcoeff_ptr
+
+    pmullw      xmm0, xmm2
+    pmullw      xmm1, xmm3
+
+    movdqa      [rcx], xmm2        ; store qcoeff
+    movdqa      [rcx + 16], xmm3
+    movdqa      [rdi], xmm0        ; store dqcoeff
+    movdqa      [rdi + 16], xmm1
+
+    ; select the last value (in zig_zag order) for EOB
+    pcmpeqw     xmm2, xmm6
+    pcmpeqw     xmm3, xmm6
+    ; invert: 0xffff where qcoeff != 0
+    pcmpeqw     xmm6, xmm6
+    pxor        xmm2, xmm6
+    pxor        xmm3, xmm6
+    ; mask inv_zig_zag
+    pand        xmm2, [GLOBAL(inv_zig_zag)]
+    pand        xmm3, [GLOBAL(inv_zig_zag + 16)]
+    ; select the max value
+    pmaxsw      xmm2, xmm3
+    pshufd      xmm3, xmm2, 00001110b
+    pmaxsw      xmm2, xmm3
+    pshuflw     xmm3, xmm2, 00001110b
+    pmaxsw      xmm2, xmm3
+    pshuflw     xmm3, xmm2, 00000001b
+    pmaxsw      xmm2, xmm3
+    movd        eax, xmm2
+    and         eax, 0xff
+    mov         [rsi + vp9_blockd_eob], eax
+
+    ; begin epilog
+    add         rsp, stack_size
+    pop         rsp
+%if ABI_IS_32BIT
+    pop         rsi
+    pop         rdi
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    pop         rsi
+    pop         rdi
+  %endif
+%endif
+    RESTORE_GOT
+    RESTORE_XMM
+    pop         rbp
+    ret
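+
+; For reference, a C sketch of the scalar quantizer the code above
+; vectorizes (field names are assumed from the BLOCK/BLOCKD offsets used
+; here; qcoeff/dqcoeff are taken as already zeroed, as done above):
+;
+;   int eob = -1;
+;   for (int i = 0; i < 16; i++) {
+;     int rc   = vp9_default_zig_zag1d[i];
+;     int z    = coeff_ptr[rc];
+;     int zbin = zbin_ptr[rc] + *zbin_boost_ptr++ + zbin_oq_value;
+;     int sz   = z >> 31;                     /* all ones if z < 0 */
+;     int x    = (z ^ sz) - sz;               /* abs(z) */
+;     if (x >= zbin) {
+;       int y;
+;       x += round_ptr[rc];
+;       y  = (((x * quant_ptr[rc]) >> 16) + x) >> quant_shift_ptr[rc];
+;       x  = (y ^ sz) - sz;                   /* restore the sign */
+;       qcoeff_ptr[rc]  = x;
+;       dqcoeff_ptr[rc] = x * dequant_ptr[rc];
+;       if (y) {
+;         eob = i;                            /* last nonzero, zig-zag order */
+;         zbin_boost_ptr = zrun_zbin_boost;   /* reset the zero-run boost */
+;       }
+;     }
+;   }
+;   d->eob = eob + 1;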
+
+; void vp9_fast_quantize_b_sse2 | arg
+;  (BLOCK  *b,                  |  0
+;   BLOCKD *d)                  |  1
+
+global sym(vp9_fast_quantize_b_sse2)
+sym(vp9_fast_quantize_b_sse2):
+    push        rbp
+    mov         rbp, rsp
+    GET_GOT     rbx
+
+%if ABI_IS_32BIT
+    push        rdi
+    push        rsi
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    push        rdi
+    push        rsi
+  %else
+    ; these registers are used for passing arguments
+  %endif
+%endif
+
+    ; end prolog
+
+%if ABI_IS_32BIT
+    mov         rdi, arg(0)                 ; BLOCK *b
+    mov         rsi, arg(1)                 ; BLOCKD *d
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    mov         rdi, rcx                    ; BLOCK *b
+    mov         rsi, rdx                    ; BLOCKD *d
+  %else
+    ;mov         rdi, rdi                    ; BLOCK *b
+    ;mov         rsi, rsi                    ; BLOCKD *d
+  %endif
+%endif
+
+    mov         rax, [rdi + vp9_block_coeff]
+    mov         rcx, [rdi + vp9_block_round]
+    mov         rdx, [rdi + vp9_block_quant_fast]
+
+    ; z = coeff
+    movdqa      xmm0, [rax]
+    movdqa      xmm4, [rax + 16]
+
+    ; dup z so we can save sz
+    movdqa      xmm1, xmm0
+    movdqa      xmm5, xmm4
+
+    ; sz = z >> 15
+    psraw       xmm0, 15
+    psraw       xmm4, 15
+
+    ; x = abs(z) = (z ^ sz) - sz
+    pxor        xmm1, xmm0
+    pxor        xmm5, xmm4
+    psubw       xmm1, xmm0
+    psubw       xmm5, xmm4
+
+    ; x += round
+    paddw       xmm1, [rcx]
+    paddw       xmm5, [rcx + 16]
+
+    mov         rax, [rsi + vp9_blockd_qcoeff]
+    mov         rcx, [rsi + vp9_blockd_dequant]
+    mov         rdi, [rsi + vp9_blockd_dqcoeff]
+
+    ; y = x * quant >> 16
+    pmulhw      xmm1, [rdx]
+    pmulhw      xmm5, [rdx + 16]
+
+    ; x = (y ^ sz) - sz
+    pxor        xmm1, xmm0
+    pxor        xmm5, xmm4
+    psubw       xmm1, xmm0
+    psubw       xmm5, xmm4
+
+    ; qcoeff = x
+    movdqa      [rax], xmm1
+    movdqa      [rax + 16], xmm5
+
+    ; x * dequant
+    movdqa      xmm2, xmm1
+    movdqa      xmm3, xmm5
+    pmullw      xmm2, [rcx]
+    pmullw      xmm3, [rcx + 16]
+
+    ; dqcoeff = x * dequant
+    movdqa      [rdi], xmm2
+    movdqa      [rdi + 16], xmm3
+
+    pxor        xmm4, xmm4                  ;clear all bits
+    pcmpeqw     xmm1, xmm4
+    pcmpeqw     xmm5, xmm4
+
+    pcmpeqw     xmm4, xmm4                  ;set all bits
+    pxor        xmm1, xmm4
+    pxor        xmm5, xmm4
+
+    pand        xmm1, [GLOBAL(inv_zig_zag)]
+    pand        xmm5, [GLOBAL(inv_zig_zag + 16)]
+
+    pmaxsw      xmm1, xmm5
+
+    ; now down to 8
+    pshufd      xmm5, xmm1, 00001110b
+
+    pmaxsw      xmm1, xmm5
+
+    ; only 4 left
+    pshuflw     xmm5, xmm1, 00001110b
+
+    pmaxsw      xmm1, xmm5
+
+    ; okay, just 2!
+    pshuflw     xmm5, xmm1, 00000001b
+
+    pmaxsw      xmm1, xmm5
+
+    movd        eax, xmm1
+    and         eax, 0xff
+    mov         [rsi + vp9_blockd_eob], eax
+
+    ; begin epilog
+%if ABI_IS_32BIT
+    pop         rsi
+    pop         rdi
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    pop         rsi
+    pop         rdi
+  %endif
+%endif
+
+    RESTORE_GOT
+    pop         rbp
+    ret
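+
+; A C sketch of this fast path (names assumed as in the regular version):
+; no zero bin and no zero-run boost, just round, multiply-high and sign
+; restore, with the EOB taken as the largest inv_zig_zag[] entry among
+; nonzero coefficients:
+;
+;   int eob = 0;
+;   for (int rc = 0; rc < 16; rc++) {
+;     int z  = coeff_ptr[rc];
+;     int sz = z >> 31;
+;     int x  = (z ^ sz) - sz;                          /* abs(z) */
+;     int y  = ((x + round_ptr[rc]) * quant_fast_ptr[rc]) >> 16;
+;     x = (y ^ sz) - sz;
+;     qcoeff_ptr[rc]  = x;
+;     dqcoeff_ptr[rc] = x * dequant_ptr[rc];
+;     if (x && inv_zig_zag[rc] > eob)
+;       eob = inv_zig_zag[rc];                         /* 1-based position */
+;   }
+;   d->eob = eob;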
+
+SECTION_RODATA
+align 16
+inv_zig_zag:
+  dw 0x0001, 0x0002, 0x0006, 0x0007
+  dw 0x0003, 0x0005, 0x0008, 0x000d
+  dw 0x0004, 0x0009, 0x000c, 0x000e
+  dw 0x000a, 0x000b, 0x000f, 0x0010
--- /dev/null
+++ b/vp9/encoder/x86/quantize_sse4.asm
@@ -1,0 +1,254 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+%include "asm_enc_offsets.asm"
+
+
+; void vp9_regular_quantize_b_sse4 | arg
+;  (BLOCK  *b,                     |  0
+;   BLOCKD *d)                     |  1
+
+global sym(vp9_regular_quantize_b_sse4)
+sym(vp9_regular_quantize_b_sse4):
+
+%if ABI_IS_32BIT
+    push        rbp
+    mov         rbp, rsp
+    GET_GOT     rbx
+    push        rdi
+    push        rsi
+
+    ALIGN_STACK 16, rax
+    %define qcoeff      0 ; 32
+    %define stack_size 32
+    sub         rsp, stack_size
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    SAVE_XMM 8, u
+    push        rdi
+    push        rsi
+  %endif
+%endif
+    ; end prolog
+
+%if ABI_IS_32BIT
+    mov         rdi, arg(0)                 ; BLOCK *b
+    mov         rsi, arg(1)                 ; BLOCKD *d
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    mov         rdi, rcx                    ; BLOCK *b
+    mov         rsi, rdx                    ; BLOCKD *d
+  %else
+    ;mov         rdi, rdi                    ; BLOCK *b
+    ;mov         rsi, rsi                    ; BLOCKD *d
+  %endif
+%endif
+
+    mov         rax, [rdi + vp9_block_coeff]
+    mov         rcx, [rdi + vp9_block_zbin]
+    mov         rdx, [rdi + vp9_block_round]
+    movd        xmm7, [rdi + vp9_block_zbin_extra]
+
+    ; z
+    movdqa      xmm0, [rax]
+    movdqa      xmm1, [rax + 16]
+
+    ; duplicate zbin_oq_value
+    pshuflw     xmm7, xmm7, 0
+    punpcklwd   xmm7, xmm7
+
+    movdqa      xmm2, xmm0
+    movdqa      xmm3, xmm1
+
+    ; sz
+    psraw       xmm0, 15
+    psraw       xmm1, 15
+
+    ; (z ^ sz)
+    pxor        xmm2, xmm0
+    pxor        xmm3, xmm1
+
+    ; x = abs(z)
+    psubw       xmm2, xmm0
+    psubw       xmm3, xmm1
+
+    ; zbin
+    movdqa      xmm4, [rcx]
+    movdqa      xmm5, [rcx + 16]
+
+    ; *zbin_ptr + zbin_oq_value
+    paddw       xmm4, xmm7
+    paddw       xmm5, xmm7
+
+    movdqa      xmm6, xmm2
+    movdqa      xmm7, xmm3
+
+    ; x - (*zbin_ptr + zbin_oq_value)
+    psubw       xmm6, xmm4
+    psubw       xmm7, xmm5
+
+    ; round
+    movdqa      xmm4, [rdx]
+    movdqa      xmm5, [rdx + 16]
+
+    mov         rax, [rdi + vp9_block_quant_shift]
+    mov         rcx, [rdi + vp9_block_quant]
+    mov         rdx, [rdi + vp9_block_zrun_zbin_boost]
+
+    ; x + round
+    paddw       xmm2, xmm4
+    paddw       xmm3, xmm5
+
+    ; quant
+    movdqa      xmm4, [rcx]
+    movdqa      xmm5, [rcx + 16]
+
+    ; y = x * quant_ptr >> 16
+    pmulhw      xmm4, xmm2
+    pmulhw      xmm5, xmm3
+
+    ; y += x
+    paddw       xmm2, xmm4
+    paddw       xmm3, xmm5
+
+    pxor        xmm4, xmm4
+%if ABI_IS_32BIT
+    movdqa      [rsp + qcoeff], xmm4
+    movdqa      [rsp + qcoeff + 16], xmm4
+%else
+    pxor        xmm8, xmm8
+%endif
+
+    ; quant_shift
+    movdqa      xmm5, [rax]
+
+    ; save the zrun_zbin_boost base pointer so the loop can reset to it
+    mov         rax, rdx
+
+%macro ZIGZAG_LOOP 5
+    ; x
+    pextrw      ecx, %4, %2
+
+    ; if (x >= zbin)
+    sub         cx, WORD PTR[rdx]           ; x - zbin
+    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++
+    jl          .rq_zigzag_loop_%1          ; x < zbin
+
+    pextrw      edi, %3, %2                 ; y
+
+    ; downshift by quant_shift[rc]
+    pextrb      ecx, xmm5, %1               ; quant_shift[rc]
+    sar         edi, cl                     ; also sets Z bit
+    je          .rq_zigzag_loop_%1          ; !y
+%if ABI_IS_32BIT
+    mov         WORD PTR[rsp + qcoeff + %1 * 2], di
+%else
+    pinsrw      %5, edi, %2                 ; qcoeff[rc]
+%endif
+    mov         rdx, rax                    ; reset to b->zrun_zbin_boost
+.rq_zigzag_loop_%1:
+%endmacro
+; in vp9_default_zig_zag1d order: see vp9/common/entropy.c
+ZIGZAG_LOOP  0, 0, xmm2, xmm6, xmm4
+ZIGZAG_LOOP  1, 1, xmm2, xmm6, xmm4
+ZIGZAG_LOOP  4, 4, xmm2, xmm6, xmm4
+ZIGZAG_LOOP  8, 0, xmm3, xmm7, xmm8
+ZIGZAG_LOOP  5, 5, xmm2, xmm6, xmm4
+ZIGZAG_LOOP  2, 2, xmm2, xmm6, xmm4
+ZIGZAG_LOOP  3, 3, xmm2, xmm6, xmm4
+ZIGZAG_LOOP  6, 6, xmm2, xmm6, xmm4
+ZIGZAG_LOOP  9, 1, xmm3, xmm7, xmm8
+ZIGZAG_LOOP 12, 4, xmm3, xmm7, xmm8
+ZIGZAG_LOOP 13, 5, xmm3, xmm7, xmm8
+ZIGZAG_LOOP 10, 2, xmm3, xmm7, xmm8
+ZIGZAG_LOOP  7, 7, xmm2, xmm6, xmm4
+ZIGZAG_LOOP 11, 3, xmm3, xmm7, xmm8
+ZIGZAG_LOOP 14, 6, xmm3, xmm7, xmm8
+ZIGZAG_LOOP 15, 7, xmm3, xmm7, xmm8
+
+    mov         rcx, [rsi + vp9_blockd_dequant]
+    mov         rdi, [rsi + vp9_blockd_dqcoeff]
+
+%if ABI_IS_32BIT
+    movdqa      xmm4, [rsp + qcoeff]
+    movdqa      xmm5, [rsp + qcoeff + 16]
+%else
+    %define     xmm5 xmm8
+%endif
+
+    ; y ^ sz
+    pxor        xmm4, xmm0
+    pxor        xmm5, xmm1
+    ; x = (y ^ sz) - sz
+    psubw       xmm4, xmm0
+    psubw       xmm5, xmm1
+
+    ; dequant
+    movdqa      xmm0, [rcx]
+    movdqa      xmm1, [rcx + 16]
+
+    mov         rcx, [rsi + vp9_blockd_qcoeff]
+
+    pmullw      xmm0, xmm4
+    pmullw      xmm1, xmm5
+
+    ; store qcoeff
+    movdqa      [rcx], xmm4
+    movdqa      [rcx + 16], xmm5
+
+    ; store dqcoeff
+    movdqa      [rdi], xmm0
+    movdqa      [rdi + 16], xmm1
+
+    ; select the last value (in zig_zag order) for EOB
+    pxor        xmm6, xmm6
+    pcmpeqw     xmm4, xmm6
+    pcmpeqw     xmm5, xmm6
+
+    packsswb    xmm4, xmm5
+    pshufb      xmm4, [GLOBAL(zig_zag1d)]
+    pmovmskb    edx, xmm4
+    xor         rdi, rdi
+    mov         eax, -1
+    xor         dx, ax                      ;flip the bits for bsr
+    bsr         eax, edx
+    sub         edi, edx                    ;check for all zeros in bit mask
+    sar         edi, 31                     ;0 or -1
+    add         eax, 1
+    and         eax, edi                    ;if the bit mask was all zero,
+                                            ;then eob = 0
+
+    mov         [rsi + vp9_blockd_eob], eax
+
+    ; begin epilog
+%if ABI_IS_32BIT
+    add         rsp, stack_size
+    pop         rsp
+
+    pop         rsi
+    pop         rdi
+    RESTORE_GOT
+    pop         rbp
+%else
+  %undef xmm5
+  %ifidn __OUTPUT_FORMAT__,x64
+    pop         rsi
+    pop         rdi
+    RESTORE_XMM
+  %endif
+%endif
+
+    ret
+
+SECTION_RODATA
+align 16
+; vp9/common/entropy.c: vp9_default_zig_zag1d
+zig_zag1d:
+    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
--- /dev/null
+++ b/vp9/encoder/x86/quantize_ssse3.asm
@@ -1,0 +1,138 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+%include "asm_enc_offsets.asm"
+
+
+; void vp9_fast_quantize_b_ssse3 | arg
+;  (BLOCK  *b,                   |  0
+;   BLOCKD *d)                   |  1
+;
+
+global sym(vp9_fast_quantize_b_ssse3)
+sym(vp9_fast_quantize_b_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    GET_GOT     rbx
+
+%if ABI_IS_32BIT
+    push        rdi
+    push        rsi
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    push        rdi
+    push        rsi
+  %endif
+%endif
+    ; end prolog
+
+%if ABI_IS_32BIT
+    mov         rdi, arg(0)                 ; BLOCK *b
+    mov         rsi, arg(1)                 ; BLOCKD *d
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    mov         rdi, rcx                    ; BLOCK *b
+    mov         rsi, rdx                    ; BLOCKD *d
+  %else
+    ;mov         rdi, rdi                    ; BLOCK *b
+    ;mov         rsi, rsi                    ; BLOCKD *d
+  %endif
+%endif
+
+    mov         rax, [rdi + vp9_block_coeff]
+    mov         rcx, [rdi + vp9_block_round]
+    mov         rdx, [rdi + vp9_block_quant_fast]
+
+    ; coeff
+    movdqa      xmm0, [rax]
+    movdqa      xmm4, [rax + 16]
+
+    ; round
+    movdqa      xmm2, [rcx]
+    movdqa      xmm3, [rcx + 16]
+
+    movdqa      xmm1, xmm0
+    movdqa      xmm5, xmm4
+
+    ; sz = z >> 15
+    psraw       xmm0, 15
+    psraw       xmm4, 15
+
+    pabsw       xmm1, xmm1
+    pabsw       xmm5, xmm5
+
+    paddw       xmm1, xmm2
+    paddw       xmm5, xmm3
+
+    ; quant_fast
+    pmulhw      xmm1, [rdx]
+    pmulhw      xmm5, [rdx + 16]
+
+    mov         rax, [rsi + vp9_blockd_qcoeff]
+    mov         rdi, [rsi + vp9_blockd_dequant]
+    mov         rcx, [rsi + vp9_blockd_dqcoeff]
+
+    pxor        xmm1, xmm0
+    pxor        xmm5, xmm4
+    psubw       xmm1, xmm0
+    psubw       xmm5, xmm4
+
+    movdqa      [rax], xmm1
+    movdqa      [rax + 16], xmm5
+
+    movdqa      xmm2, [rdi]
+    movdqa      xmm3, [rdi + 16]
+
+    pxor        xmm4, xmm4
+    pmullw      xmm2, xmm1
+    pmullw      xmm3, xmm5
+
+    pcmpeqw     xmm1, xmm4                  ;zero mask: 0xffff where qcoeff == 0
+    pcmpeqw     xmm5, xmm4                  ;zero mask
+    packsswb    xmm1, xmm5
+    pshufb      xmm1, [GLOBAL(zz_shuf)]
+
+    pmovmskb    edx, xmm1
+
+    xor         rdi, rdi
+    mov         eax, -1
+    xor         dx, ax                      ;flip the bits for bsr
+    bsr         eax, edx
+
+    movdqa      [rcx], xmm2                 ;store dqcoeff
+    movdqa      [rcx + 16], xmm3            ;store dqcoeff
+
+    sub         edi, edx                    ;check for all zeros in bit mask
+    sar         edi, 31                     ;0 or -1
+    add         eax, 1
+    and         eax, edi                    ;if the bit mask was all zero,
+                                            ;then eob = 0
+    mov         [rsi + vp9_blockd_eob], eax
+
+    ; begin epilog
+%if ABI_IS_32BIT
+    pop         rsi
+    pop         rdi
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    pop         rsi
+    pop         rdi
+  %endif
+%endif
+
+    RESTORE_GOT
+    pop         rbp
+    ret
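+
+; The EOB here is computed branch-free from a 16-bit nonzero mask whose
+; bit i (after the pshufb with zz_shuf) corresponds to zig-zag position i;
+; a C sketch, with bsr() standing in for the BSR instruction (whose result
+; is simply masked away when no bit is set):
+;
+;   mask ^= 0xffff;                 /* pcmpeqw built a "zero" mask; invert */
+;   eob   = bsr(mask) + 1;          /* highest set bit, 1-based            */
+;   eob  &= (0 - mask) >> 31;       /* arithmetic shift: 0 if mask == 0    */
+;   d->eob = eob;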
+
+SECTION_RODATA
+align 16
+zz_shuf:
+    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
--- /dev/null
+++ b/vp9/encoder/x86/quantize_x86.h
@@ -1,0 +1,48 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+#ifndef QUANTIZE_X86_H
+#define QUANTIZE_X86_H
+
+
+/* Note:
+ *
+ * This platform is commonly built for runtime CPU detection. If you modify
+ * any of the function mappings present in this file, be sure to also update
+ * them in the function pointer initialization code.
+ */
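+
+/* For example, a runtime-detection build would instead pick the mapping at
+ * initialization; a hypothetical sketch (the rtcd field names here are
+ * illustrative only):
+ *
+ *   if (cpu_flags & HAS_SSE2)
+ *     rtcd->quantize.quantb = vp9_regular_quantize_b_sse2;
+ *   if (cpu_flags & HAS_SSE4_1)
+ *     rtcd->quantize.quantb = vp9_regular_quantize_b_sse4;
+ */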
+#if HAVE_MMX
+
+#endif /* HAVE_MMX */
+
+
+#if HAVE_SSE2
+extern prototype_quantize_block(vp9_regular_quantize_b_sse2);
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef vp9_quantize_quantb
+#define vp9_quantize_quantb vp9_regular_quantize_b_sse2
+#endif /* !CONFIG_RUNTIME_CPU_DETECT */
+
+#endif /* HAVE_SSE2 */
+
+
+#if HAVE_SSE4_1
+extern prototype_quantize_block(vp9_regular_quantize_b_sse4);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef vp9_quantize_quantb
+#define vp9_quantize_quantb vp9_regular_quantize_b_sse4
+
+#endif /* !CONFIG_RUNTIME_CPU_DETECT */
+
+#endif /* HAVE_SSE4_1 */
+
+#endif /* QUANTIZE_X86_H */
--- /dev/null
+++ b/vp9/encoder/x86/sad_mmx.asm
@@ -1,0 +1,427 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+global sym(vp9_sad16x16_mmx)
+global sym(vp9_sad8x16_mmx)
+global sym(vp9_sad8x8_mmx)
+global sym(vp9_sad4x4_mmx)
+global sym(vp9_sad16x8_mmx)
+
+;unsigned int vp9_sad16x16_mmx(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+sym(vp9_sad16x16_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        lea             rcx,        [rsi+rax*8]
+
+        lea             rcx,        [rcx+rax*8]
+        pxor            mm7,        mm7
+
+        pxor            mm6,        mm6
+
+.x16x16sad_mmx_loop:
+
+        movq            mm0,        QWORD PTR [rsi]
+        movq            mm2,        QWORD PTR [rsi+8]
+
+        movq            mm1,        QWORD PTR [rdi]
+        movq            mm3,        QWORD PTR [rdi+8]
+
+        movq            mm4,        mm0
+        movq            mm5,        mm2
+
+        psubusb         mm0,        mm1
+        psubusb         mm1,        mm4
+
+        psubusb         mm2,        mm3
+        psubusb         mm3,        mm5
+
+        por             mm0,        mm1
+        por             mm2,        mm3
+
+        movq            mm1,        mm0
+        movq            mm3,        mm2
+
+        punpcklbw       mm0,        mm6
+        punpcklbw       mm2,        mm6
+
+        punpckhbw       mm1,        mm6
+        punpckhbw       mm3,        mm6
+
+        paddw           mm0,        mm2
+        paddw           mm1,        mm3
+
+
+        lea             rsi,        [rsi+rax]
+        add             rdi,        rdx
+
+        paddw           mm7,        mm0
+        paddw           mm7,        mm1
+
+        cmp             rsi,        rcx
+        jne             .x16x16sad_mmx_loop
+
+
+        movq            mm0,        mm7
+
+        punpcklwd       mm0,        mm6
+        punpckhwd       mm7,        mm6
+
+        paddw           mm0,        mm7
+        movq            mm7,        mm0
+
+
+        psrlq           mm0,        32
+        paddw           mm7,        mm0
+
+        movq            rax,        mm7
+
+    pop rdi
+    pop rsi
+    mov rsp, rbp
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
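+
+; For reference, each vp9_sadWxH_mmx above computes the plain C sum
+;
+;   unsigned int sad = 0;
+;   for (r = 0; r < H; r++, src_ptr += src_stride, ref_ptr += ref_stride)
+;     for (c = 0; c < W; c++)
+;       sad += abs(src_ptr[c] - ref_ptr[c]);
+;
+; psadbw is an SSE extension, so these MMX-only versions take |a-b| per
+; byte with paired psubusb/por, widen bytes to words, and fold the word
+; accumulator down to a scalar at the end.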
+
+
+;unsigned int vp9_sad8x16_mmx(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+sym(vp9_sad8x16_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        lea             rcx,        [rsi+rax*8]
+
+        lea             rcx,        [rcx+rax*8]
+        pxor            mm7,        mm7
+
+        pxor            mm6,        mm6
+
+.x8x16sad_mmx_loop:
+
+        movq            mm0,        QWORD PTR [rsi]
+        movq            mm1,        QWORD PTR [rdi]
+
+        movq            mm2,        mm0
+        psubusb         mm0,        mm1
+
+        psubusb         mm1,        mm2
+        por             mm0,        mm1
+
+        movq            mm2,        mm0
+        punpcklbw       mm0,        mm6
+
+        punpckhbw       mm2,        mm6
+        lea             rsi,        [rsi+rax]
+
+        add             rdi,        rdx
+        paddw           mm7,        mm0
+
+        paddw           mm7,        mm2
+        cmp             rsi,        rcx
+
+        jne             .x8x16sad_mmx_loop
+
+        movq            mm0,        mm7
+        punpcklwd       mm0,        mm6
+
+        punpckhwd       mm7,        mm6
+        paddw           mm0,        mm7
+
+        movq            mm7,        mm0
+        psrlq           mm0,        32
+
+        paddw           mm7,        mm0
+        movq            rax,        mm7
+
+    pop rdi
+    pop rsi
+    mov rsp, rbp
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;unsigned int vp9_sad8x8_mmx(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+sym(vp9_sad8x8_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        lea             rcx,        [rsi+rax*8]
+        pxor            mm7,        mm7
+
+        pxor            mm6,        mm6
+
+.x8x8sad_mmx_loop:
+
+        movq            mm0,        QWORD PTR [rsi]
+        movq            mm1,        QWORD PTR [rdi]
+
+        movq            mm2,        mm0
+        psubusb         mm0,        mm1
+
+        psubusb         mm1,        mm2
+        por             mm0,        mm1
+
+        movq            mm2,        mm0
+        punpcklbw       mm0,        mm6
+
+        punpckhbw       mm2,        mm6
+        paddw           mm0,        mm2
+
+        lea             rsi,        [rsi+rax]
+        add             rdi,        rdx
+
+        paddw           mm7,        mm0
+        cmp             rsi,        rcx
+
+        jne             .x8x8sad_mmx_loop
+
+        movq            mm0,        mm7
+        punpcklwd       mm0,        mm6
+
+        punpckhwd       mm7,        mm6
+        paddw           mm0,        mm7
+
+        movq            mm7,        mm0
+        psrlq           mm0,        32
+
+        paddw           mm7,        mm0
+        movq            rax,        mm7
+
+    pop rdi
+    pop rsi
+    mov rsp, rbp
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;unsigned int vp9_sad4x4_mmx(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+sym(vp9_sad4x4_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        movd            mm0,        DWORD PTR [rsi]
+        movd            mm1,        DWORD PTR [rdi]
+
+        movd            mm2,        DWORD PTR [rsi+rax]
+        movd            mm3,        DWORD PTR [rdi+rdx]
+
+        punpcklbw       mm0,        mm2
+        punpcklbw       mm1,        mm3
+
+        movq            mm2,        mm0
+        psubusb         mm0,        mm1
+
+        psubusb         mm1,        mm2
+        por             mm0,        mm1
+
+        movq            mm2,        mm0
+        pxor            mm3,        mm3
+
+        punpcklbw       mm0,        mm3
+        punpckhbw       mm2,        mm3
+
+        paddw           mm0,        mm2
+
+        lea             rsi,        [rsi+rax*2]
+        lea             rdi,        [rdi+rdx*2]
+
+        movd            mm4,        DWORD PTR [rsi]
+        movd            mm5,        DWORD PTR [rdi]
+
+        movd            mm6,        DWORD PTR [rsi+rax]
+        movd            mm7,        DWORD PTR [rdi+rdx]
+
+        punpcklbw       mm4,        mm6
+        punpcklbw       mm5,        mm7
+
+        movq            mm6,        mm4
+        psubusb         mm4,        mm5
+
+        psubusb         mm5,        mm6
+        por             mm4,        mm5
+
+        movq            mm5,        mm4
+        punpcklbw       mm4,        mm3
+
+        punpckhbw       mm5,        mm3
+        paddw           mm4,        mm5
+
+        paddw           mm0,        mm4
+        movq            mm1,        mm0
+
+        punpcklwd       mm0,        mm3
+        punpckhwd       mm1,        mm3
+
+        paddw           mm0,        mm1
+        movq            mm1,        mm0
+
+        psrlq           mm0,        32
+        paddw           mm0,        mm1
+
+        movq            rax,        mm0
+
+    pop rdi
+    pop rsi
+    mov rsp, rbp
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;unsigned int vp9_sad16x8_mmx(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+sym(vp9_sad16x8_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        lea             rcx,        [rsi+rax*8]
+        pxor            mm7,        mm7
+
+        pxor            mm6,        mm6
+
+.x16x8sad_mmx_loop:
+
+        movq            mm0,        QWORD PTR [rsi]
+        movq            mm1,        QWORD PTR [rdi]
+
+        movq            mm2,        QWORD PTR [rsi+8]
+        movq            mm3,        QWORD PTR [rdi+8]
+
+        movq            mm4,        mm0
+        movq            mm5,        mm2
+
+        psubusb         mm0,        mm1
+        psubusb         mm1,        mm4
+
+        psubusb         mm2,        mm3
+        psubusb         mm3,        mm5
+
+        por             mm0,        mm1
+        por             mm2,        mm3
+
+        movq            mm1,        mm0
+        movq            mm3,        mm2
+
+        punpcklbw       mm0,        mm6
+        punpckhbw       mm1,        mm6
+
+        punpcklbw       mm2,        mm6
+        punpckhbw       mm3,        mm6
+
+
+        paddw           mm0,        mm2
+        paddw           mm1,        mm3
+
+        paddw           mm0,        mm1
+        lea             rsi,        [rsi+rax]
+
+        add             rdi,        rdx
+        paddw           mm7,        mm0
+
+        cmp             rsi,        rcx
+        jne             .x16x8sad_mmx_loop
+
+        movq            mm0,        mm7
+        punpcklwd       mm0,        mm6
+
+        punpckhwd       mm7,        mm6
+        paddw           mm0,        mm7
+
+        movq            mm7,        mm0
+        psrlq           mm0,        32
+
+        paddw           mm7,        mm0
+        movq            rax,        mm7
+
+    pop rdi
+    pop rsi
+    mov rsp, rbp
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
--- /dev/null
+++ b/vp9/encoder/x86/sad_sse2.asm
@@ -1,0 +1,410 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;unsigned int vp9_sad16x16_wmt(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+global sym(vp9_sad16x16_wmt)
+sym(vp9_sad16x16_wmt):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    SAVE_XMM 6
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        lea             rcx,        [rsi+rax*8]
+
+        lea             rcx,        [rcx+rax*8]
+        pxor            xmm6,       xmm6
+
+.x16x16sad_wmt_loop:
+
+        movq            xmm0,       QWORD PTR [rsi]
+        movq            xmm2,       QWORD PTR [rsi+8]
+
+        movq            xmm1,       QWORD PTR [rdi]
+        movq            xmm3,       QWORD PTR [rdi+8]
+
+        movq            xmm4,       QWORD PTR [rsi+rax]
+        movq            xmm5,       QWORD PTR [rdi+rdx]
+
+
+        punpcklbw       xmm0,       xmm2
+        punpcklbw       xmm1,       xmm3
+
+        psadbw          xmm0,       xmm1
+        movq            xmm2,       QWORD PTR [rsi+rax+8]
+
+        movq            xmm3,       QWORD PTR [rdi+rdx+8]
+        lea             rsi,        [rsi+rax*2]
+
+        lea             rdi,        [rdi+rdx*2]
+        punpcklbw       xmm4,       xmm2
+
+        punpcklbw       xmm5,       xmm3
+        psadbw          xmm4,       xmm5
+
+        paddw           xmm6,       xmm0
+        paddw           xmm6,       xmm4
+
+        cmp             rsi,        rcx
+        jne             .x16x16sad_wmt_loop
+
+        movq            xmm0,       xmm6
+        psrldq          xmm6,       8
+
+        paddw           xmm0,       xmm6
+        movq            rax,        xmm0
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
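+
+; The _wmt versions lean on psadbw, which per 64-bit lane behaves like the
+; C model below (two lanes per xmm register, one per mm register):
+;
+;   uint64_t psadbw_lane(const uint8_t a[8], const uint8_t b[8]) {
+;     uint64_t s = 0;
+;     for (int i = 0; i < 8; i++)
+;       s += (a[i] > b[i]) ? a[i] - b[i] : b[i] - a[i];
+;     return s;
+;   }
+;
+; so the epilog only needs one psrldq/paddw to add the high lane into the low.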
+
+;unsigned int vp9_sad8x16_wmt(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  max_err)
+global sym(vp9_sad8x16_wmt)
+sym(vp9_sad8x16_wmt):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        movsxd          rbx,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        lea             rcx,        [rsi+rbx*8]
+
+        lea             rcx,        [rcx+rbx*8]
+        pxor            mm7,        mm7
+
+.x8x16sad_wmt_loop:
+
+        movq            rax,        mm7
+        cmp             eax,        arg(4)
+        jg              .x8x16sad_wmt_early_exit
+
+        movq            mm0,        QWORD PTR [rsi]
+        movq            mm1,        QWORD PTR [rdi]
+
+        movq            mm2,        QWORD PTR [rsi+rbx]
+        movq            mm3,        QWORD PTR [rdi+rdx]
+
+        psadbw          mm0,        mm1
+        psadbw          mm2,        mm3
+
+        lea             rsi,        [rsi+rbx*2]
+        lea             rdi,        [rdi+rdx*2]
+
+        paddw           mm7,        mm0
+        paddw           mm7,        mm2
+
+        cmp             rsi,        rcx
+        jne             .x8x16sad_wmt_loop
+
+        movq            rax,        mm7
+
+.x8x16sad_wmt_early_exit:
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    pop         rbx
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
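+
+; The arg(4) check at the top of the loop is an early exit, roughly
+;
+;   if (sad > max_err)
+;     return sad;    /* already worse than the best candidate so far */
+;
+; letting motion search abandon a hopeless candidate after every two rows;
+; vp9_sad8x8_wmt and vp9_sad16x8_wmt below use the same pattern.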
+
+
+;unsigned int vp9_sad8x8_wmt(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  max_err)
+global sym(vp9_sad8x8_wmt)
+sym(vp9_sad8x8_wmt):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        movsxd          rbx,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        lea             rcx,        [rsi+rbx*8]
+        pxor            mm7,        mm7
+
+.x8x8sad_wmt_loop:
+
+        movq            rax,        mm7
+        cmp             eax,        arg(4)
+        jg              .x8x8sad_wmt_early_exit
+
+        movq            mm0,        QWORD PTR [rsi]
+        movq            mm1,        QWORD PTR [rdi]
+
+        psadbw          mm0,        mm1
+        lea             rsi,        [rsi+rbx]
+
+        add             rdi,        rdx
+        paddw           mm7,        mm0
+
+        cmp             rsi,        rcx
+        jne             .x8x8sad_wmt_loop
+
+        movq            rax,        mm7
+.x8x8sad_wmt_early_exit:
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    pop         rbx
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;unsigned int vp9_sad4x4_wmt(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+global sym(vp9_sad4x4_wmt)
+sym(vp9_sad4x4_wmt):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        movd            mm0,        DWORD PTR [rsi]
+        movd            mm1,        DWORD PTR [rdi]
+
+        movd            mm2,        DWORD PTR [rsi+rax]
+        movd            mm3,        DWORD PTR [rdi+rdx]
+
+        punpcklbw       mm0,        mm2
+        punpcklbw       mm1,        mm3
+
+        psadbw          mm0,        mm1
+        lea             rsi,        [rsi+rax*2]
+
+        lea             rdi,        [rdi+rdx*2]
+        movd            mm4,        DWORD PTR [rsi]
+
+        movd            mm5,        DWORD PTR [rdi]
+        movd            mm6,        DWORD PTR [rsi+rax]
+
+        movd            mm7,        DWORD PTR [rdi+rdx]
+        punpcklbw       mm4,        mm6
+
+        punpcklbw       mm5,        mm7
+        psadbw          mm4,        mm5
+
+        paddw           mm0,        mm4
+        movq            rax,        mm0
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;unsigned int vp9_sad16x8_wmt(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  max_err)
+global sym(vp9_sad16x8_wmt)
+sym(vp9_sad16x8_wmt):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        movsxd          rbx,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        lea             rcx,        [rsi+rbx*8]
+        pxor            mm7,        mm7
+
+.x16x8sad_wmt_loop:
+
+        movq            rax,        mm7
+        cmp             eax,        arg(4)
+        jg              .x16x8sad_wmt_early_exit
+
+        movq            mm0,        QWORD PTR [rsi]
+        movq            mm2,        QWORD PTR [rsi+8]
+
+        movq            mm1,        QWORD PTR [rdi]
+        movq            mm3,        QWORD PTR [rdi+8]
+
+        movq            mm4,        QWORD PTR [rsi+rbx]
+        movq            mm5,        QWORD PTR [rdi+rdx]
+
+        psadbw          mm0,        mm1
+        psadbw          mm2,        mm3
+
+        movq            mm1,        QWORD PTR [rsi+rbx+8]
+        movq            mm3,        QWORD PTR [rdi+rdx+8]
+
+        psadbw          mm4,        mm5
+        psadbw          mm1,        mm3
+
+        lea             rsi,        [rsi+rbx*2]
+        lea             rdi,        [rdi+rdx*2]
+
+        paddw           mm0,        mm2
+        paddw           mm4,        mm1
+
+        paddw           mm7,        mm0
+        paddw           mm7,        mm4
+
+        cmp             rsi,        rcx
+        jne             .x16x8sad_wmt_loop
+
+        movq            rax,        mm7
+
+.x16x8sad_wmt_early_exit:
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    pop         rbx
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_copy32xn_sse2(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *dst_ptr,
+;    int  dst_stride,
+;    int height);
+global sym(vp9_copy32xn_sse2)
+sym(vp9_copy32xn_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;dst_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;dst_stride
+        movsxd          rcx,        dword ptr arg(4) ;height
+
+.block_copy_sse2_loopx4:
+        movdqu          xmm0,       XMMWORD PTR [rsi]
+        movdqu          xmm1,       XMMWORD PTR [rsi + 16]
+        movdqu          xmm2,       XMMWORD PTR [rsi + rax]
+        movdqu          xmm3,       XMMWORD PTR [rsi + rax + 16]
+
+        lea             rsi,        [rsi+rax*2]
+
+        movdqu          xmm4,       XMMWORD PTR [rsi]
+        movdqu          xmm5,       XMMWORD PTR [rsi + 16]
+        movdqu          xmm6,       XMMWORD PTR [rsi + rax]
+        movdqu          xmm7,       XMMWORD PTR [rsi + rax + 16]
+
+        lea             rsi,    [rsi+rax*2]
+
+        movdqa          XMMWORD PTR [rdi], xmm0
+        movdqa          XMMWORD PTR [rdi + 16], xmm1
+        movdqa          XMMWORD PTR [rdi + rdx], xmm2
+        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm3
+
+        lea             rdi,    [rdi+rdx*2]
+
+        movdqa          XMMWORD PTR [rdi], xmm4
+        movdqa          XMMWORD PTR [rdi + 16], xmm5
+        movdqa          XMMWORD PTR [rdi + rdx], xmm6
+        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm7
+
+        lea             rdi,    [rdi+rdx*2]
+
+        sub             rcx,     4
+        cmp             rcx,     4
+        jge             .block_copy_sse2_loopx4
+
+        cmp             rcx, 0
+        je              .copy_is_done
+
+.block_copy_sse2_loop:
+        movdqu          xmm0,       XMMWORD PTR [rsi]
+        movdqu          xmm1,       XMMWORD PTR [rsi + 16]
+        lea             rsi,    [rsi+rax]
+
+        movdqa          XMMWORD PTR [rdi], xmm0
+        movdqa          XMMWORD PTR [rdi + 16], xmm1
+        lea             rdi,    [rdi+rdx]
+
+        sub             rcx,     1
+        jne             .block_copy_sse2_loop
+
+.copy_is_done:
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
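+
+; Behaviorally this is just the C sketch below, unrolled four rows at a
+; time with a single-row loop for the remainder (movdqu loads because the
+; source may be unaligned; the movdqa stores assume a 16-byte-aligned dst):
+;
+;   for (r = 0; r < height; r++, src_ptr += src_stride, dst_ptr += dst_stride)
+;     memcpy(dst_ptr, src_ptr, 32);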
--- /dev/null
+++ b/vp9/encoder/x86/sad_sse3.asm
@@ -1,0 +1,960 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro STACK_FRAME_CREATE_X3 0
+%if ABI_IS_32BIT
+  %define     src_ptr       rsi
+  %define     src_stride    rax
+  %define     ref_ptr       rdi
+  %define     ref_stride    rdx
+  %define     end_ptr       rcx
+  %define     ret_var       rbx
+  %define     result_ptr    arg(4)
+  %define     max_err       arg(4)
+  %define     height        dword ptr arg(4)
+    push        rbp
+    mov         rbp,        rsp
+    push        rsi
+    push        rdi
+    push        rbx
+
+    mov         rsi,        arg(0)              ; src_ptr
+    mov         rdi,        arg(2)              ; ref_ptr
+
+    movsxd      rax,        dword ptr arg(1)    ; src_stride
+    movsxd      rdx,        dword ptr arg(3)    ; ref_stride
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    SAVE_XMM 7, u
+    %define     src_ptr     rcx
+    %define     src_stride  rdx
+    %define     ref_ptr     r8
+    %define     ref_stride  r9
+    %define     end_ptr     r10
+    %define     ret_var     r11
+    %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
+    %define     max_err     [rsp+xmm_stack_space+8+4*8]
+    %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]
+  %else
+    %define     src_ptr     rdi
+    %define     src_stride  rsi
+    %define     ref_ptr     rdx
+    %define     ref_stride  rcx
+    %define     end_ptr     r9
+    %define     ret_var     r10
+    %define     result_ptr  r8
+    %define     max_err     r8
+    %define     height      r8
+  %endif
+%endif
+
+%endmacro
+
+%macro STACK_FRAME_DESTROY_X3 0
+  %define     src_ptr
+  %define     src_stride
+  %define     ref_ptr
+  %define     ref_stride
+  %define     end_ptr
+  %define     ret_var
+  %define     result_ptr
+  %define     max_err
+  %define     height
+
+%if ABI_IS_32BIT
+    pop         rbx
+    pop         rdi
+    pop         rsi
+    pop         rbp
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    RESTORE_XMM
+  %endif
+%endif
+    ret
+%endmacro
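+
+; These STACK_FRAME_* macros paper over the three calling conventions this
+; file is built for: on 32-bit, arguments are read off the stack via arg();
+; on Win64 the first four arrive in rcx/rdx/r8/r9; on 64-bit SysV they
+; arrive in rdi/rsi/rdx/rcx (then r8/r9). Each macro binds the same
+; symbolic names (src_ptr, ref_stride, ...) to wherever that ABI put them.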
+
+%macro STACK_FRAME_CREATE_X4 0
+%if ABI_IS_32BIT
+  %define     src_ptr       rsi
+  %define     src_stride    rax
+  %define     r0_ptr        rcx
+  %define     r1_ptr        rdx
+  %define     r2_ptr        rbx
+  %define     r3_ptr        rdi
+  %define     ref_stride    rbp
+  %define     result_ptr    arg(4)
+    push        rbp
+    mov         rbp,        rsp
+    push        rsi
+    push        rdi
+    push        rbx
+
+    push        rbp
+    mov         rdi,        arg(2)              ; ref_ptr_base
+
+    LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
+
+    mov         rsi,        arg(0)              ; src_ptr
+
+    movsxd      rbx,        dword ptr arg(1)    ; src_stride
+    movsxd      rbp,        dword ptr arg(3)    ; ref_stride
+
+    xchg        rbx,        rax
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    SAVE_XMM 7, u
+    %define     src_ptr     rcx
+    %define     src_stride  rdx
+    %define     r0_ptr      rsi
+    %define     r1_ptr      r10
+    %define     r2_ptr      r11
+    %define     r3_ptr      r8
+    %define     ref_stride  r9
+    %define     result_ptr  [rsp+xmm_stack_space+16+4*8]
+    push        rsi
+
+    LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr
+  %else
+    %define     src_ptr     rdi
+    %define     src_stride  rsi
+    %define     r0_ptr      r9
+    %define     r1_ptr      r10
+    %define     r2_ptr      r11
+    %define     r3_ptr      rdx
+    %define     ref_stride  rcx
+    %define     result_ptr  r8
+
+    LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr
+
+  %endif
+%endif
+%endmacro
+
+%macro STACK_FRAME_DESTROY_X4 0
+  %define     src_ptr
+  %define     src_stride
+  %define     r0_ptr
+  %define     r1_ptr
+  %define     r2_ptr
+  %define     r3_ptr
+  %define     ref_stride
+  %define     result_ptr
+
+%if ABI_IS_32BIT
+    pop         rbx
+    pop         rdi
+    pop         rsi
+    pop         rbp
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    pop         rsi
+    RESTORE_XMM
+  %endif
+%endif
+    ret
+%endmacro
+
+%macro PROCESS_16X2X3 5
+%if %1==0
+        movdqa          xmm0,       XMMWORD PTR [%2]
+        lddqu           xmm5,       XMMWORD PTR [%3]
+        lddqu           xmm6,       XMMWORD PTR [%3+1]
+        lddqu           xmm7,       XMMWORD PTR [%3+2]
+
+        psadbw          xmm5,       xmm0
+        psadbw          xmm6,       xmm0
+        psadbw          xmm7,       xmm0
+%else
+        movdqa          xmm0,       XMMWORD PTR [%2]
+        lddqu           xmm1,       XMMWORD PTR [%3]
+        lddqu           xmm2,       XMMWORD PTR [%3+1]
+        lddqu           xmm3,       XMMWORD PTR [%3+2]
+
+        psadbw          xmm1,       xmm0
+        psadbw          xmm2,       xmm0
+        psadbw          xmm3,       xmm0
+
+        paddw           xmm5,       xmm1
+        paddw           xmm6,       xmm2
+        paddw           xmm7,       xmm3
+%endif
+        movdqa          xmm0,       XMMWORD PTR [%2+%4]
+        lddqu           xmm1,       XMMWORD PTR [%3+%5]
+        lddqu           xmm2,       XMMWORD PTR [%3+%5+1]
+        lddqu           xmm3,       XMMWORD PTR [%3+%5+2]
+
+%if %1==0 || %1==1
+        lea             %2,         [%2+%4*2]
+        lea             %3,         [%3+%5*2]
+%endif
+
+        psadbw          xmm1,       xmm0
+        psadbw          xmm2,       xmm0
+        psadbw          xmm3,       xmm0
+
+        paddw           xmm5,       xmm1
+        paddw           xmm6,       xmm2
+        paddw           xmm7,       xmm3
+%endmacro
+
+%macro PROCESS_8X2X3 5
+%if %1==0
+        movq            mm0,       QWORD PTR [%2]
+        movq            mm5,       QWORD PTR [%3]
+        movq            mm6,       QWORD PTR [%3+1]
+        movq            mm7,       QWORD PTR [%3+2]
+
+        psadbw          mm5,       mm0
+        psadbw          mm6,       mm0
+        psadbw          mm7,       mm0
+%else
+        movq            mm0,       QWORD PTR [%2]
+        movq            mm1,       QWORD PTR [%3]
+        movq            mm2,       QWORD PTR [%3+1]
+        movq            mm3,       QWORD PTR [%3+2]
+
+        psadbw          mm1,       mm0
+        psadbw          mm2,       mm0
+        psadbw          mm3,       mm0
+
+        paddw           mm5,       mm1
+        paddw           mm6,       mm2
+        paddw           mm7,       mm3
+%endif
+        movq            mm0,       QWORD PTR [%2+%4]
+        movq            mm1,       QWORD PTR [%3+%5]
+        movq            mm2,       QWORD PTR [%3+%5+1]
+        movq            mm3,       QWORD PTR [%3+%5+2]
+
+%if %1==0 || %1==1
+        lea             %2,        [%2+%4*2]
+        lea             %3,        [%3+%5*2]
+%endif
+
+        psadbw          mm1,       mm0
+        psadbw          mm2,       mm0
+        psadbw          mm3,       mm0
+
+        paddw           mm5,       mm1
+        paddw           mm6,       mm2
+        paddw           mm7,       mm3
+%endmacro
+
+%macro LOAD_X4_ADDRESSES 5
+        mov             %2,         [%1+REG_SZ_BYTES*0]
+        mov             %3,         [%1+REG_SZ_BYTES*1]
+
+        mov             %4,         [%1+REG_SZ_BYTES*2]
+        mov             %5,         [%1+REG_SZ_BYTES*3]
+%endmacro
+
+%macro PROCESS_16X2X4 8
+%if %1==0
+        movdqa          xmm0,       XMMWORD PTR [%2]
+        lddqu           xmm4,       XMMWORD PTR [%3]
+        lddqu           xmm5,       XMMWORD PTR [%4]
+        lddqu           xmm6,       XMMWORD PTR [%5]
+        lddqu           xmm7,       XMMWORD PTR [%6]
+
+        psadbw          xmm4,       xmm0
+        psadbw          xmm5,       xmm0
+        psadbw          xmm6,       xmm0
+        psadbw          xmm7,       xmm0
+%else
+        movdqa          xmm0,       XMMWORD PTR [%2]
+        lddqu           xmm1,       XMMWORD PTR [%3]
+        lddqu           xmm2,       XMMWORD PTR [%4]
+        lddqu           xmm3,       XMMWORD PTR [%5]
+
+        psadbw          xmm1,       xmm0
+        psadbw          xmm2,       xmm0
+        psadbw          xmm3,       xmm0
+
+        paddw           xmm4,       xmm1
+        lddqu           xmm1,       XMMWORD PTR [%6]
+        paddw           xmm5,       xmm2
+        paddw           xmm6,       xmm3
+
+        psadbw          xmm1,       xmm0
+        paddw           xmm7,       xmm1
+%endif
+        movdqa          xmm0,       XMMWORD PTR [%2+%7]
+        lddqu           xmm1,       XMMWORD PTR [%3+%8]
+        lddqu           xmm2,       XMMWORD PTR [%4+%8]
+        lddqu           xmm3,       XMMWORD PTR [%5+%8]
+
+        psadbw          xmm1,       xmm0
+        psadbw          xmm2,       xmm0
+        psadbw          xmm3,       xmm0
+
+        paddw           xmm4,       xmm1
+        lddqu           xmm1,       XMMWORD PTR [%6+%8]
+        paddw           xmm5,       xmm2
+        paddw           xmm6,       xmm3
+
+%if %1==0 || %1==1
+        lea             %2,         [%2+%7*2]
+        lea             %3,         [%3+%8*2]
+
+        lea             %4,         [%4+%8*2]
+        lea             %5,         [%5+%8*2]
+
+        lea             %6,         [%6+%8*2]
+%endif
+        psadbw          xmm1,       xmm0
+        paddw           xmm7,       xmm1
+
+%endmacro
+
+%macro PROCESS_8X2X4 8
+%if %1==0
+        movq            mm0,        QWORD PTR [%2]
+        movq            mm4,        QWORD PTR [%3]
+        movq            mm5,        QWORD PTR [%4]
+        movq            mm6,        QWORD PTR [%5]
+        movq            mm7,        QWORD PTR [%6]
+
+        psadbw          mm4,        mm0
+        psadbw          mm5,        mm0
+        psadbw          mm6,        mm0
+        psadbw          mm7,        mm0
+%else
+        movq            mm0,        QWORD PTR [%2]
+        movq            mm1,        QWORD PTR [%3]
+        movq            mm2,        QWORD PTR [%4]
+        movq            mm3,        QWORD PTR [%5]
+
+        psadbw          mm1,        mm0
+        psadbw          mm2,        mm0
+        psadbw          mm3,        mm0
+
+        paddw           mm4,        mm1
+        movq            mm1,        QWORD PTR [%6]
+        paddw           mm5,        mm2
+        paddw           mm6,        mm3
+
+        psadbw          mm1,        mm0
+        paddw           mm7,        mm1
+%endif
+        movq            mm0,        QWORD PTR [%2+%7]
+        movq            mm1,        QWORD PTR [%3+%8]
+        movq            mm2,        QWORD PTR [%4+%8]
+        movq            mm3,        QWORD PTR [%5+%8]
+
+        psadbw          mm1,        mm0
+        psadbw          mm2,        mm0
+        psadbw          mm3,        mm0
+
+        paddw           mm4,        mm1
+        movq            mm1,        QWORD PTR [%6+%8]
+        paddw           mm5,        mm2
+        paddw           mm6,        mm3
+
+%if %1==0 || %1==1
+        lea             %2,         [%2+%7*2]
+        lea             %3,         [%3+%8*2]
+
+        lea             %4,         [%4+%8*2]
+        lea             %5,         [%5+%8*2]
+
+        lea             %6,         [%6+%8*2]
+%endif
+        psadbw          mm1,        mm0
+        paddw           mm7,        mm1
+
+%endmacro
+
+;void vp9_sad16x16x3_sse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  *results)
+global sym(vp9_sad16x16x3_sse3)
+sym(vp9_sad16x16x3_sse3):
+
+    STACK_FRAME_CREATE_X3
+
+        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
+
+        mov             rcx,        result_ptr
+
+        movq            xmm0,       xmm5
+        psrldq          xmm5,       8
+
+        paddw           xmm0,       xmm5
+        movd            [rcx],      xmm0
+;-
+        movq            xmm0,       xmm6
+        psrldq          xmm6,       8
+
+        paddw           xmm0,       xmm6
+        movd            [rcx+4],    xmm0
+;-
+        movq            xmm0,       xmm7
+        psrldq          xmm7,       8
+
+        paddw           xmm0,       xmm7
+        movd            [rcx+8],    xmm0
+
+    STACK_FRAME_DESTROY_X3
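+
+; Each x3 routine scores three horizontally adjacent candidates in a single
+; pass, equivalent to this C sketch (sad16x16() standing for the scalar
+; reference SAD):
+;
+;   for (i = 0; i < 3; i++)
+;     results[i] = sad16x16(src_ptr, src_stride, ref_ptr + i, ref_stride);
+;
+; the unaligned lddqu loads at [ref_ptr], [ref_ptr+1] and [ref_ptr+2] feed
+; three separate psadbw accumulators (xmm5/xmm6/xmm7).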
+
+;void vp9_sad16x8x3_sse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  *results)
+global sym(vp9_sad16x8x3_sse3)
+sym(vp9_sad16x8x3_sse3):
+
+    STACK_FRAME_CREATE_X3
+
+        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
+
+        mov             rcx,        result_ptr
+
+        movq            xmm0,       xmm5
+        psrldq          xmm5,       8
+
+        paddw           xmm0,       xmm5
+        movd            [rcx],      xmm0
+;-
+        movq            xmm0,       xmm6
+        psrldq          xmm6,       8
+
+        paddw           xmm0,       xmm6
+        movd            [rcx+4],    xmm0
+;-
+        movq            xmm0,       xmm7
+        psrldq          xmm7,       8
+
+        paddw           xmm0,       xmm7
+        movd            [rcx+8],    xmm0
+
+    STACK_FRAME_DESTROY_X3
+
+;void vp9_sad8x16x3_sse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  *results)
+global sym(vp9_sad8x16x3_sse3)
+sym(vp9_sad8x16x3_sse3):
+
+    STACK_FRAME_CREATE_X3
+
+        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
+
+        mov             rcx,        result_ptr
+
+        punpckldq       mm5,        mm6
+
+        movq            [rcx],      mm5
+        movd            [rcx+8],    mm7
+
+    STACK_FRAME_DESTROY_X3
+
+;void vp9_sad8x8x3_sse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  *results)
+global sym(vp9_sad8x8x3_sse3)
+sym(vp9_sad8x8x3_sse3):
+
+    STACK_FRAME_CREATE_X3
+
+        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
+
+        mov             rcx,        result_ptr
+
+        punpckldq       mm5,        mm6
+
+        movq            [rcx],      mm5
+        movd            [rcx+8],    mm7
+
+    STACK_FRAME_DESTROY_X3
+
+;void vp9_sad4x4x3_sse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  *results)
+global sym(vp9_sad4x4x3_sse3)
+sym(vp9_sad4x4x3_sse3):
+
+    STACK_FRAME_CREATE_X3
+
+        movd            mm0,        DWORD PTR [src_ptr]
+        movd            mm1,        DWORD PTR [ref_ptr]
+
+        movd            mm2,        DWORD PTR [src_ptr+src_stride]
+        movd            mm3,        DWORD PTR [ref_ptr+ref_stride]
+
+        punpcklbw       mm0,        mm2
+        punpcklbw       mm1,        mm3
+
+        movd            mm4,        DWORD PTR [ref_ptr+1]
+        movd            mm5,        DWORD PTR [ref_ptr+2]
+
+        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
+        movd            mm3,        DWORD PTR [ref_ptr+ref_stride+2]
+
+        psadbw          mm1,        mm0
+
+        punpcklbw       mm4,        mm2
+        punpcklbw       mm5,        mm3
+
+        psadbw          mm4,        mm0
+        psadbw          mm5,        mm0
+
+        lea             src_ptr,    [src_ptr+src_stride*2]
+        lea             ref_ptr,    [ref_ptr+ref_stride*2]
+
+        movd            mm0,        DWORD PTR [src_ptr]
+        movd            mm2,        DWORD PTR [ref_ptr]
+
+        movd            mm3,        DWORD PTR [src_ptr+src_stride]
+        movd            mm6,        DWORD PTR [ref_ptr+ref_stride]
+
+        punpcklbw       mm0,        mm3
+        punpcklbw       mm2,        mm6
+
+        movd            mm3,        DWORD PTR [ref_ptr+1]
+        movd            mm7,        DWORD PTR [ref_ptr+2]
+
+        psadbw          mm2,        mm0
+
+        paddw           mm1,        mm2
+
+        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
+        movd            mm6,        DWORD PTR [ref_ptr+ref_stride+2]
+
+        punpcklbw       mm3,        mm2
+        punpcklbw       mm7,        mm6
+
+        psadbw          mm3,        mm0
+        psadbw          mm7,        mm0
+
+        paddw           mm3,        mm4
+        paddw           mm7,        mm5
+
+        mov             rcx,        result_ptr
+
+        punpckldq       mm1,        mm3
+
+        movq            [rcx],      mm1
+        movd            [rcx+8],    mm7
+
+    STACK_FRAME_DESTROY_X3
+
+;unsigned int vp9_sad16x16_sse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  max_err)
+;%define lddqu movdqu
+global sym(vp9_sad16x16_sse3)
+sym(vp9_sad16x16_sse3):
+
+    STACK_FRAME_CREATE_X3
+
+        mov             end_ptr,    4
+        pxor            xmm7,        xmm7
+
+.vp9_sad16x16_sse3_loop:
+        movdqa          xmm0,       XMMWORD PTR [src_ptr]
+        movdqu          xmm1,       XMMWORD PTR [ref_ptr]
+        movdqa          xmm2,       XMMWORD PTR [src_ptr+src_stride]
+        movdqu          xmm3,       XMMWORD PTR [ref_ptr+ref_stride]
+
+        lea             src_ptr,    [src_ptr+src_stride*2]
+        lea             ref_ptr,    [ref_ptr+ref_stride*2]
+
+        movdqa          xmm4,       XMMWORD PTR [src_ptr]
+        movdqu          xmm5,       XMMWORD PTR [ref_ptr]
+        movdqa          xmm6,       XMMWORD PTR [src_ptr+src_stride]
+
+        psadbw          xmm0,       xmm1
+
+        movdqu          xmm1,       XMMWORD PTR [ref_ptr+ref_stride]
+
+        psadbw          xmm2,       xmm3
+        psadbw          xmm4,       xmm5
+        psadbw          xmm6,       xmm1
+
+        lea             src_ptr,    [src_ptr+src_stride*2]
+        lea             ref_ptr,    [ref_ptr+ref_stride*2]
+
+        paddw           xmm7,        xmm0
+        paddw           xmm7,        xmm2
+        paddw           xmm7,        xmm4
+        paddw           xmm7,        xmm6
+
+        sub             end_ptr,     1
+        jne             .vp9_sad16x16_sse3_loop
+
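+        ; psadbw left two 64-bit partial sums in the low and high qwords of
+        ; xmm7; fold the high half onto the low and return the total in rax.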
+        movq            xmm0,       xmm7
+        psrldq          xmm7,       8
+        paddw           xmm0,       xmm7
+        movq            rax,        xmm0
+
+    STACK_FRAME_DESTROY_X3
+
+;void vp9_copy32xn_sse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *dst_ptr,
+;    int  dst_stride,
+;    int height);
+global sym(vp9_copy32xn_sse3)
+sym(vp9_copy32xn_sse3):
+
+    STACK_FRAME_CREATE_X3
+
+.block_copy_sse3_loopx4:
+        lea             end_ptr,    [src_ptr+src_stride*2]
+
+        movdqu          xmm0,       XMMWORD PTR [src_ptr]
+        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
+        movdqu          xmm2,       XMMWORD PTR [src_ptr + src_stride]
+        movdqu          xmm3,       XMMWORD PTR [src_ptr + src_stride + 16]
+        movdqu          xmm4,       XMMWORD PTR [end_ptr]
+        movdqu          xmm5,       XMMWORD PTR [end_ptr + 16]
+        movdqu          xmm6,       XMMWORD PTR [end_ptr + src_stride]
+        movdqu          xmm7,       XMMWORD PTR [end_ptr + src_stride + 16]
+
+        lea             src_ptr,    [src_ptr+src_stride*4]
+
+        lea             end_ptr,    [ref_ptr+ref_stride*2]
+
+        movdqa          XMMWORD PTR [ref_ptr], xmm0
+        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
+        movdqa          XMMWORD PTR [ref_ptr + ref_stride], xmm2
+        movdqa          XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
+        movdqa          XMMWORD PTR [end_ptr], xmm4
+        movdqa          XMMWORD PTR [end_ptr + 16], xmm5
+        movdqa          XMMWORD PTR [end_ptr + ref_stride], xmm6
+        movdqa          XMMWORD PTR [end_ptr + ref_stride + 16], xmm7
+
+        lea             ref_ptr,    [ref_ptr+ref_stride*4]
+
+        sub             height,     4
+        cmp             height,     4
+        jge             .block_copy_sse3_loopx4
+
+        ;Check to see if there are more rows that need to be copied.
+        cmp             height, 0
+        je              .copy_is_done
+
+.block_copy_sse3_loop:
+        movdqu          xmm0,       XMMWORD PTR [src_ptr]
+        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
+        lea             src_ptr,    [src_ptr+src_stride]
+
+        movdqa          XMMWORD PTR [ref_ptr], xmm0
+        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
+        lea             ref_ptr,    [ref_ptr+ref_stride]
+
+        sub             height,     1
+        jne             .block_copy_sse3_loop
+
+.copy_is_done:
+    STACK_FRAME_DESTROY_X3
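+
+; Note: despite reusing the ref_ptr/ref_stride register names from the SAD
+; stack frame, this is a plain 32-byte-wide block copy; roughly (illustrative
+; C only):
+;
+;   for (int r = 0; r < height; r++)
+;     memcpy(dst_ptr + r * dst_stride, src_ptr + r * src_stride, 32);
+;
+; with the main loop handling four rows per iteration and the tail loop
+; finishing the remainder one row at a time.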
+
+;void vp9_sad16x16x4d_sse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr_base,
+;    int  ref_stride,
+;    int  *results)
+global sym(vp9_sad16x16x4d_sse3)
+sym(vp9_sad16x16x4d_sse3):
+
+    STACK_FRAME_CREATE_X4
+
+        PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+
+%if ABI_IS_32BIT
+        pop             rbp
+%endif
+        mov             rcx,        result_ptr
+
+        movq            xmm0,       xmm4
+        psrldq          xmm4,       8
+
+        paddw           xmm0,       xmm4
+        movd            [rcx],      xmm0
+;-
+        movq            xmm0,       xmm5
+        psrldq          xmm5,       8
+
+        paddw           xmm0,       xmm5
+        movd            [rcx+4],    xmm0
+;-
+        movq            xmm0,       xmm6
+        psrldq          xmm6,       8
+
+        paddw           xmm0,       xmm6
+        movd            [rcx+8],    xmm0
+;-
+        movq            xmm0,       xmm7
+        psrldq          xmm7,       8
+
+        paddw           xmm0,       xmm7
+        movd            [rcx+12],   xmm0
+
+    STACK_FRAME_DESTROY_X4
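+
+; Note: the x4d kernels differ from the x3 kernels in that the four reference
+; blocks are independent candidate pointers (r0_ptr..r3_ptr, set up by
+; STACK_FRAME_CREATE_X4) rather than 1-byte offsets of one pointer; roughly
+; (illustrative C only, sad16x16 being a hypothetical scalar helper):
+;
+;   for (int i = 0; i < 4; i++)
+;     results[i] = sad16x16(src_ptr, src_stride, ref[i], ref_stride);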
+
+;void vp9_sad16x8x4d_sse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr_base,
+;    int  ref_stride,
+;    int  *results)
+global sym(vp9_sad16x8x4d_sse3)
+sym(vp9_sad16x8x4d_sse3):
+
+    STACK_FRAME_CREATE_X4
+
+        PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+
+%if ABI_IS_32BIT
+        pop             rbp
+%endif
+        mov             rcx,        result_ptr
+
+        movq            xmm0,       xmm4
+        psrldq          xmm4,       8
+
+        paddw           xmm0,       xmm4
+        movd            [rcx],      xmm0
+;-
+        movq            xmm0,       xmm5
+        psrldq          xmm5,       8
+
+        paddw           xmm0,       xmm5
+        movd            [rcx+4],    xmm0
+;-
+        movq            xmm0,       xmm6
+        psrldq          xmm6,       8
+
+        paddw           xmm0,       xmm6
+        movd            [rcx+8],    xmm0
+;-
+        movq            xmm0,       xmm7
+        psrldq          xmm7,       8
+
+        paddw           xmm0,       xmm7
+        movd            [rcx+12],   xmm0
+
+    STACK_FRAME_DESTROY_X4
+
+;void vp9_sad8x16x4d_sse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  *results)
+global sym(vp9_sad8x16x4d_sse3)
+sym(vp9_sad8x16x4d_sse3):
+
+    STACK_FRAME_CREATE_X4
+
+        PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+
+%if ABI_IS_32BIT
+        pop             rbp
+%endif
+        mov             rcx,        result_ptr
+
+        punpckldq       mm4,        mm5
+        punpckldq       mm6,        mm7
+
+        movq            [rcx],      mm4
+        movq            [rcx+8],    mm6
+
+    STACK_FRAME_DESTROY_X4
+
+;void vp9_sad8x8x4d_sse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  *results)
+global sym(vp9_sad8x8x4d_sse3)
+sym(vp9_sad8x8x4d_sse3):
+
+    STACK_FRAME_CREATE_X4
+
+        PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+
+%if ABI_IS_32BIT
+        pop             rbp
+%endif
+        mov             rcx,        result_ptr
+
+        punpckldq       mm4,        mm5
+        punpckldq       mm6,        mm7
+
+        movq            [rcx],      mm4
+        movq            [rcx+8],    mm6
+
+    STACK_FRAME_DESTROY_X4
+
+;void vp9_sad4x4x4d_sse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  *results)
+global sym(vp9_sad4x4x4d_sse3)
+sym(vp9_sad4x4x4d_sse3):
+
+    STACK_FRAME_CREATE_X4
+
+        movd            mm0,        DWORD PTR [src_ptr]
+        movd            mm1,        DWORD PTR [r0_ptr]
+
+        movd            mm2,        DWORD PTR [src_ptr+src_stride]
+        movd            mm3,        DWORD PTR [r0_ptr+ref_stride]
+
+        punpcklbw       mm0,        mm2
+        punpcklbw       mm1,        mm3
+
+        movd            mm4,        DWORD PTR [r1_ptr]
+        movd            mm5,        DWORD PTR [r2_ptr]
+
+        movd            mm6,        DWORD PTR [r3_ptr]
+        movd            mm2,        DWORD PTR [r1_ptr+ref_stride]
+
+        movd            mm3,        DWORD PTR [r2_ptr+ref_stride]
+        movd            mm7,        DWORD PTR [r3_ptr+ref_stride]
+
+        psadbw          mm1,        mm0
+
+        punpcklbw       mm4,        mm2
+        punpcklbw       mm5,        mm3
+
+        punpcklbw       mm6,        mm7
+        psadbw          mm4,        mm0
+
+        psadbw          mm5,        mm0
+        psadbw          mm6,        mm0
+
+
+
+        lea             src_ptr,    [src_ptr+src_stride*2]
+        lea             r0_ptr,     [r0_ptr+ref_stride*2]
+
+        lea             r1_ptr,     [r1_ptr+ref_stride*2]
+        lea             r2_ptr,     [r2_ptr+ref_stride*2]
+
+        lea             r3_ptr,     [r3_ptr+ref_stride*2]
+
+        movd            mm0,        DWORD PTR [src_ptr]
+        movd            mm2,        DWORD PTR [r0_ptr]
+
+        movd            mm3,        DWORD PTR [src_ptr+src_stride]
+        movd            mm7,        DWORD PTR [r0_ptr+ref_stride]
+
+        punpcklbw       mm0,        mm3
+        punpcklbw       mm2,        mm7
+
+        movd            mm3,        DWORD PTR [r1_ptr]
+        movd            mm7,        DWORD PTR [r2_ptr]
+
+        psadbw          mm2,        mm0
+%if ABI_IS_32BIT
+        mov             rax,        rbp
+
+        pop             rbp
+%define     ref_stride    rax
+%endif
+        mov             rsi,        result_ptr
+
+        paddw           mm1,        mm2
+        movd            [rsi],      mm1
+
+        movd            mm2,        DWORD PTR [r1_ptr+ref_stride]
+        movd            mm1,        DWORD PTR [r2_ptr+ref_stride]
+
+        punpcklbw       mm3,        mm2
+        punpcklbw       mm7,        mm1
+
+        psadbw          mm3,        mm0
+        psadbw          mm7,        mm0
+
+        movd            mm2,        DWORD PTR [r3_ptr]
+        movd            mm1,        DWORD PTR [r3_ptr+ref_stride]
+
+        paddw           mm3,        mm4
+        paddw           mm7,        mm5
+
+        movd            [rsi+4],    mm3
+        punpcklbw       mm2,        mm1
+
+        movd            [rsi+8],    mm7
+        psadbw          mm2,        mm0
+
+        paddw           mm2,        mm6
+        movd            [rsi+12],   mm2
+
+
+    STACK_FRAME_DESTROY_X4
+
--- /dev/null
+++ b/vp9/encoder/x86/sad_sse4.asm
@@ -1,0 +1,353 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro PROCESS_16X2X8 1
+%if %1
+        movdqa          xmm0,       XMMWORD PTR [rsi]
+        movq            xmm1,       MMWORD PTR [rdi]
+        movq            xmm3,       MMWORD PTR [rdi+8]
+        movq            xmm2,       MMWORD PTR [rdi+16]
+        punpcklqdq      xmm1,       xmm3
+        punpcklqdq      xmm3,       xmm2
+
+        movdqa          xmm2,       xmm1
+        mpsadbw         xmm1,       xmm0,  0x0
+        mpsadbw         xmm2,       xmm0,  0x5
+
+        psrldq          xmm0,       8
+
+        movdqa          xmm4,       xmm3
+        mpsadbw         xmm3,       xmm0,  0x0
+        mpsadbw         xmm4,       xmm0,  0x5
+
+        paddw           xmm1,       xmm2
+        paddw           xmm1,       xmm3
+        paddw           xmm1,       xmm4
+%else
+        movdqa          xmm0,       XMMWORD PTR [rsi]
+        movq            xmm5,       MMWORD PTR [rdi]
+        movq            xmm3,       MMWORD PTR [rdi+8]
+        movq            xmm2,       MMWORD PTR [rdi+16]
+        punpcklqdq      xmm5,       xmm3
+        punpcklqdq      xmm3,       xmm2
+
+        movdqa          xmm2,       xmm5
+        mpsadbw         xmm5,       xmm0,  0x0
+        mpsadbw         xmm2,       xmm0,  0x5
+
+        psrldq          xmm0,       8
+
+        movdqa          xmm4,       xmm3
+        mpsadbw         xmm3,       xmm0,  0x0
+        mpsadbw         xmm4,       xmm0,  0x5
+
+        paddw           xmm5,       xmm2
+        paddw           xmm5,       xmm3
+        paddw           xmm5,       xmm4
+
+        paddw           xmm1,       xmm5
+%endif
+        movdqa          xmm0,       XMMWORD PTR [rsi + rax]
+        movq            xmm5,       MMWORD PTR [rdi+ rdx]
+        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
+        movq            xmm2,       MMWORD PTR [rdi+ rdx+16]
+        punpcklqdq      xmm5,       xmm3
+        punpcklqdq      xmm3,       xmm2
+
+        lea             rsi,        [rsi+rax*2]
+        lea             rdi,        [rdi+rdx*2]
+
+        movdqa          xmm2,       xmm5
+        mpsadbw         xmm5,       xmm0,  0x0
+        mpsadbw         xmm2,       xmm0,  0x5
+
+        psrldq          xmm0,       8
+        movdqa          xmm4,       xmm3
+        mpsadbw         xmm3,       xmm0,  0x0
+        mpsadbw         xmm4,       xmm0,  0x5
+
+        paddw           xmm5,       xmm2
+        paddw           xmm5,       xmm3
+        paddw           xmm5,       xmm4
+
+        paddw           xmm1,       xmm5
+%endmacro
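+
+; Note on the mpsadbw pairs above: each mpsadbw produces eight 16-bit SADs of
+; one fixed 4-byte source group (second operand, selected by imm[1:0])
+; against eight overlapping 4-byte reference groups that slide one byte at a
+; time through the first operand. Pairing immediates 0x0 and 0x5, and
+; repeating on the psrldq-shifted halves, therefore accumulates in xmm1 the
+; full 16-byte row SAD at each of eight successive horizontal offsets.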
+
+%macro PROCESS_8X2X8 1
+%if %1
+        movq            xmm0,       MMWORD PTR [rsi]
+        movq            xmm1,       MMWORD PTR [rdi]
+        movq            xmm3,       MMWORD PTR [rdi+8]
+        punpcklqdq      xmm1,       xmm3
+
+        movdqa          xmm2,       xmm1
+        mpsadbw         xmm1,       xmm0,  0x0
+        mpsadbw         xmm2,       xmm0,  0x5
+        paddw           xmm1,       xmm2
+%else
+        movq            xmm0,       MMWORD PTR [rsi]
+        movq            xmm5,       MMWORD PTR [rdi]
+        movq            xmm3,       MMWORD PTR [rdi+8]
+        punpcklqdq      xmm5,       xmm3
+
+        movdqa          xmm2,       xmm5
+        mpsadbw         xmm5,       xmm0,  0x0
+        mpsadbw         xmm2,       xmm0,  0x5
+        paddw           xmm5,       xmm2
+
+        paddw           xmm1,       xmm5
+%endif
+        movq            xmm0,       MMWORD PTR [rsi + rax]
+        movq            xmm5,       MMWORD PTR [rdi+ rdx]
+        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
+        punpcklqdq      xmm5,       xmm3
+
+        lea             rsi,        [rsi+rax*2]
+        lea             rdi,        [rdi+rdx*2]
+
+        movdqa          xmm2,       xmm5
+        mpsadbw         xmm5,       xmm0,  0x0
+        mpsadbw         xmm2,       xmm0,  0x5
+        paddw           xmm5,       xmm2
+
+        paddw           xmm1,       xmm5
+%endmacro
+
+%macro PROCESS_4X2X8 1
+%if %1
+        movd            xmm0,       [rsi]
+        movq            xmm1,       MMWORD PTR [rdi]
+        movq            xmm3,       MMWORD PTR [rdi+8]
+        punpcklqdq      xmm1,       xmm3
+
+        mpsadbw         xmm1,       xmm0,  0x0
+%else
+        movd            xmm0,       [rsi]
+        movq            xmm5,       MMWORD PTR [rdi]
+        movq            xmm3,       MMWORD PTR [rdi+8]
+        punpcklqdq      xmm5,       xmm3
+
+        mpsadbw         xmm5,       xmm0,  0x0
+
+        paddw           xmm1,       xmm5
+%endif
+        movd            xmm0,       [rsi + rax]
+        movq            xmm5,       MMWORD PTR [rdi+ rdx]
+        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
+        punpcklqdq      xmm5,       xmm3
+
+        lea             rsi,        [rsi+rax*2]
+        lea             rdi,        [rdi+rdx*2]
+
+        mpsadbw         xmm5,       xmm0,  0x0
+
+        paddw           xmm1,       xmm5
+%endmacro
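+
+; For the 4-wide blocks the source row is a single 4-byte group, so one
+; mpsadbw per row already yields all eight offset SADs and no immediate
+; pairing is needed.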
+
+
+;void vp9_sad16x16x8_sse4(
+;    const unsigned char *src_ptr,
+;    int  src_stride,
+;    const unsigned char *ref_ptr,
+;    int  ref_stride,
+;    unsigned short *sad_array);
+global sym(vp9_sad16x16x8_sse4)
+sym(vp9_sad16x16x8_sse4):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0)           ;src_ptr
+        mov             rdi,        arg(2)           ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        PROCESS_16X2X8 1
+        PROCESS_16X2X8 0
+        PROCESS_16X2X8 0
+        PROCESS_16X2X8 0
+        PROCESS_16X2X8 0
+        PROCESS_16X2X8 0
+        PROCESS_16X2X8 0
+        PROCESS_16X2X8 0
+
+        mov             rdi,        arg(4)           ;Results
+        movdqa          XMMWORD PTR [rdi],    xmm1
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_sad16x8x8_sse4(
+;    const unsigned char *src_ptr,
+;    int  src_stride,
+;    const unsigned char *ref_ptr,
+;    int  ref_stride,
+;    unsigned short *sad_array
+;);
+global sym(vp9_sad16x8x8_sse4)
+sym(vp9_sad16x8x8_sse4):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0)           ;src_ptr
+        mov             rdi,        arg(2)           ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        PROCESS_16X2X8 1
+        PROCESS_16X2X8 0
+        PROCESS_16X2X8 0
+        PROCESS_16X2X8 0
+
+        mov             rdi,        arg(4)           ;Results
+        movdqa          XMMWORD PTR [rdi],    xmm1
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_sad8x8x8_sse4(
+;    const unsigned char *src_ptr,
+;    int  src_stride,
+;    const unsigned char *ref_ptr,
+;    int  ref_stride,
+;    unsigned short *sad_array
+;);
+global sym(vp9_sad8x8x8_sse4)
+sym(vp9_sad8x8x8_sse4):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0)           ;src_ptr
+        mov             rdi,        arg(2)           ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        PROCESS_8X2X8 1
+        PROCESS_8X2X8 0
+        PROCESS_8X2X8 0
+        PROCESS_8X2X8 0
+
+        mov             rdi,        arg(4)           ;Results
+        movdqa          XMMWORD PTR [rdi],    xmm1
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_sad8x16x8_sse4(
+;    const unsigned char *src_ptr,
+;    int  src_stride,
+;    const unsigned char *ref_ptr,
+;    int  ref_stride,
+;    unsigned short *sad_array
+;);
+global sym(vp9_sad8x16x8_sse4)
+sym(vp9_sad8x16x8_sse4):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0)           ;src_ptr
+        mov             rdi,        arg(2)           ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        PROCESS_8X2X8 1
+        PROCESS_8X2X8 0
+        PROCESS_8X2X8 0
+        PROCESS_8X2X8 0
+        PROCESS_8X2X8 0
+        PROCESS_8X2X8 0
+        PROCESS_8X2X8 0
+        PROCESS_8X2X8 0
+        mov             rdi,        arg(4)           ;Results
+        movdqa          XMMWORD PTR [rdi],    xmm1
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_sad4x4x8_sse4(
+;    const unsigned char *src_ptr,
+;    int  src_stride,
+;    const unsigned char *ref_ptr,
+;    int  ref_stride,
+;    unsigned short *sad_array
+;);
+global sym(vp9_sad4x4x8_sse4)
+sym(vp9_sad4x4x8_sse4):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0)           ;src_ptr
+        mov             rdi,        arg(2)           ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        PROCESS_4X2X8 1
+        PROCESS_4X2X8 0
+
+        mov             rdi,        arg(4)           ;Results
+        movdqa          XMMWORD PTR [rdi],    xmm1
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+
+
--- /dev/null
+++ b/vp9/encoder/x86/sad_ssse3.asm
@@ -1,0 +1,370 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro PROCESS_16X2X3 1
+%if %1
+        movdqa          xmm0,       XMMWORD PTR [rsi]
+        lddqu           xmm5,       XMMWORD PTR [rdi]
+        lddqu           xmm6,       XMMWORD PTR [rdi+1]
+        lddqu           xmm7,       XMMWORD PTR [rdi+2]
+
+        psadbw          xmm5,       xmm0
+        psadbw          xmm6,       xmm0
+        psadbw          xmm7,       xmm0
+%else
+        movdqa          xmm0,       XMMWORD PTR [rsi]
+        lddqu           xmm1,       XMMWORD PTR [rdi]
+        lddqu           xmm2,       XMMWORD PTR [rdi+1]
+        lddqu           xmm3,       XMMWORD PTR [rdi+2]
+
+        psadbw          xmm1,       xmm0
+        psadbw          xmm2,       xmm0
+        psadbw          xmm3,       xmm0
+
+        paddw           xmm5,       xmm1
+        paddw           xmm6,       xmm2
+        paddw           xmm7,       xmm3
+%endif
+        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
+        lddqu           xmm1,       XMMWORD PTR [rdi+rdx]
+        lddqu           xmm2,       XMMWORD PTR [rdi+rdx+1]
+        lddqu           xmm3,       XMMWORD PTR [rdi+rdx+2]
+
+        lea             rsi,        [rsi+rax*2]
+        lea             rdi,        [rdi+rdx*2]
+
+        psadbw          xmm1,       xmm0
+        psadbw          xmm2,       xmm0
+        psadbw          xmm3,       xmm0
+
+        paddw           xmm5,       xmm1
+        paddw           xmm6,       xmm2
+        paddw           xmm7,       xmm3
+%endmacro
+
+%macro PROCESS_16X2X3_OFFSET 2
+%if %1
+        movdqa          xmm0,       XMMWORD PTR [rsi]
+        movdqa          xmm4,       XMMWORD PTR [rdi]
+        movdqa          xmm7,       XMMWORD PTR [rdi+16]
+
+        movdqa          xmm5,       xmm7
+        palignr         xmm5,       xmm4,       %2
+
+        movdqa          xmm6,       xmm7
+        palignr         xmm6,       xmm4,       (%2+1)
+
+        palignr         xmm7,       xmm4,       (%2+2)
+
+        psadbw          xmm5,       xmm0
+        psadbw          xmm6,       xmm0
+        psadbw          xmm7,       xmm0
+%else
+        movdqa          xmm0,       XMMWORD PTR [rsi]
+        movdqa          xmm4,       XMMWORD PTR [rdi]
+        movdqa          xmm3,       XMMWORD PTR [rdi+16]
+
+        movdqa          xmm1,       xmm3
+        palignr         xmm1,       xmm4,       %2
+
+        movdqa          xmm2,       xmm3
+        palignr         xmm2,       xmm4,       (%2+1)
+
+        palignr         xmm3,       xmm4,       (%2+2)
+
+        psadbw          xmm1,       xmm0
+        psadbw          xmm2,       xmm0
+        psadbw          xmm3,       xmm0
+
+        paddw           xmm5,       xmm1
+        paddw           xmm6,       xmm2
+        paddw           xmm7,       xmm3
+%endif
+        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
+        movdqa          xmm4,       XMMWORD PTR [rdi+rdx]
+        movdqa          xmm3,       XMMWORD PTR [rdi+rdx+16]
+
+        movdqa          xmm1,       xmm3
+        palignr         xmm1,       xmm4,       %2
+
+        movdqa          xmm2,       xmm3
+        palignr         xmm2,       xmm4,       (%2+1)
+
+        palignr         xmm3,       xmm4,       (%2+2)
+
+        lea             rsi,        [rsi+rax*2]
+        lea             rdi,        [rdi+rdx*2]
+
+        psadbw          xmm1,       xmm0
+        psadbw          xmm2,       xmm0
+        psadbw          xmm3,       xmm0
+
+        paddw           xmm5,       xmm1
+        paddw           xmm6,       xmm2
+        paddw           xmm7,       xmm3
+%endmacro
+
+%macro PROCESS_16X16X3_OFFSET 2
+%2_aligned_by_%1:
+
+        sub             rdi,        %1
+
+        PROCESS_16X2X3_OFFSET 1, %1
+        PROCESS_16X2X3_OFFSET 0, %1
+        PROCESS_16X2X3_OFFSET 0, %1
+        PROCESS_16X2X3_OFFSET 0, %1
+        PROCESS_16X2X3_OFFSET 0, %1
+        PROCESS_16X2X3_OFFSET 0, %1
+        PROCESS_16X2X3_OFFSET 0, %1
+        PROCESS_16X2X3_OFFSET 0, %1
+
+        jmp             %2_store_off
+
+%endmacro
+
+%macro PROCESS_16X8X3_OFFSET 2
+%2_aligned_by_%1:
+
+        sub             rdi,        %1
+
+        PROCESS_16X2X3_OFFSET 1, %1
+        PROCESS_16X2X3_OFFSET 0, %1
+        PROCESS_16X2X3_OFFSET 0, %1
+        PROCESS_16X2X3_OFFSET 0, %1
+
+        jmp             %2_store_off
+
+%endmacro
+
+;void vp9_sad16x16x3_ssse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  *results)
+global sym(vp9_sad16x16x3_ssse3)
+sym(vp9_sad16x16x3_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rcx
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        mov             rdx,        0xf
+        and             rdx,        rdi
+
+        jmp .vp9_sad16x16x3_ssse3_skiptable
+.vp9_sad16x16x3_ssse3_jumptable:
+        dd .vp9_sad16x16x3_ssse3_aligned_by_0  - .vp9_sad16x16x3_ssse3_do_jump
+        dd .vp9_sad16x16x3_ssse3_aligned_by_1  - .vp9_sad16x16x3_ssse3_do_jump
+        dd .vp9_sad16x16x3_ssse3_aligned_by_2  - .vp9_sad16x16x3_ssse3_do_jump
+        dd .vp9_sad16x16x3_ssse3_aligned_by_3  - .vp9_sad16x16x3_ssse3_do_jump
+        dd .vp9_sad16x16x3_ssse3_aligned_by_4  - .vp9_sad16x16x3_ssse3_do_jump
+        dd .vp9_sad16x16x3_ssse3_aligned_by_5  - .vp9_sad16x16x3_ssse3_do_jump
+        dd .vp9_sad16x16x3_ssse3_aligned_by_6  - .vp9_sad16x16x3_ssse3_do_jump
+        dd .vp9_sad16x16x3_ssse3_aligned_by_7  - .vp9_sad16x16x3_ssse3_do_jump
+        dd .vp9_sad16x16x3_ssse3_aligned_by_8  - .vp9_sad16x16x3_ssse3_do_jump
+        dd .vp9_sad16x16x3_ssse3_aligned_by_9  - .vp9_sad16x16x3_ssse3_do_jump
+        dd .vp9_sad16x16x3_ssse3_aligned_by_10 - .vp9_sad16x16x3_ssse3_do_jump
+        dd .vp9_sad16x16x3_ssse3_aligned_by_11 - .vp9_sad16x16x3_ssse3_do_jump
+        dd .vp9_sad16x16x3_ssse3_aligned_by_12 - .vp9_sad16x16x3_ssse3_do_jump
+        dd .vp9_sad16x16x3_ssse3_aligned_by_13 - .vp9_sad16x16x3_ssse3_do_jump
+        dd .vp9_sad16x16x3_ssse3_aligned_by_14 - .vp9_sad16x16x3_ssse3_do_jump
+        dd .vp9_sad16x16x3_ssse3_aligned_by_15 - .vp9_sad16x16x3_ssse3_do_jump
+.vp9_sad16x16x3_ssse3_skiptable:
+
+        call .vp9_sad16x16x3_ssse3_do_jump
+.vp9_sad16x16x3_ssse3_do_jump:
+        pop             rcx                         ; get the address of do_jump
+        mov             rax,  .vp9_sad16x16x3_ssse3_jumptable - .vp9_sad16x16x3_ssse3_do_jump
+        add             rax,  rcx  ; get the absolute address of vp9_sad16x16x3_ssse3_jumptable
+
+        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
+        add             rcx,        rax
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        jmp             rcx
+
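+; The call/pop pair above is the classic position-independent way to take the
+; address of a label: the return address pushed by call is exactly
+; .vp9_sad16x16x3_ssse3_do_jump. Adding the jump table's 32-bit relative
+; entry, indexed by the low four bits of ref_ptr, then dispatches to the
+; palignr variant specialized for that misalignment; vp9_sad16x8x3_ssse3
+; below uses the same pattern.
+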
+        PROCESS_16X16X3_OFFSET 0,  .vp9_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 1,  .vp9_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 2,  .vp9_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 3,  .vp9_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 4,  .vp9_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 5,  .vp9_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 6,  .vp9_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 7,  .vp9_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 8,  .vp9_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 9,  .vp9_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 10, .vp9_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 11, .vp9_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 12, .vp9_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 13, .vp9_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 14, .vp9_sad16x16x3_ssse3
+
+.vp9_sad16x16x3_ssse3_aligned_by_15:
+        PROCESS_16X2X3 1
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+
+.vp9_sad16x16x3_ssse3_store_off:
+        mov             rdi,        arg(4) ;Results
+
+        movq            xmm0,       xmm5
+        psrldq          xmm5,       8
+
+        paddw           xmm0,       xmm5
+        movd            [rdi],      xmm0
+;-
+        movq            xmm0,       xmm6
+        psrldq          xmm6,       8
+
+        paddw           xmm0,       xmm6
+        movd            [rdi+4],    xmm0
+;-
+        movq            xmm0,       xmm7
+        psrldq          xmm7,       8
+
+        paddw           xmm0,       xmm7
+        movd            [rdi+8],    xmm0
+
+    ; begin epilog
+    pop         rcx
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_sad16x8x3_ssse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  *results)
+global sym(vp9_sad16x8x3_ssse3)
+sym(vp9_sad16x8x3_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rcx
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        mov             rdx,        0xf
+        and             rdx,        rdi
+
+        jmp .vp9_sad16x8x3_ssse3_skiptable
+.vp9_sad16x8x3_ssse3_jumptable:
+        dd .vp9_sad16x8x3_ssse3_aligned_by_0  - .vp9_sad16x8x3_ssse3_do_jump
+        dd .vp9_sad16x8x3_ssse3_aligned_by_1  - .vp9_sad16x8x3_ssse3_do_jump
+        dd .vp9_sad16x8x3_ssse3_aligned_by_2  - .vp9_sad16x8x3_ssse3_do_jump
+        dd .vp9_sad16x8x3_ssse3_aligned_by_3  - .vp9_sad16x8x3_ssse3_do_jump
+        dd .vp9_sad16x8x3_ssse3_aligned_by_4  - .vp9_sad16x8x3_ssse3_do_jump
+        dd .vp9_sad16x8x3_ssse3_aligned_by_5  - .vp9_sad16x8x3_ssse3_do_jump
+        dd .vp9_sad16x8x3_ssse3_aligned_by_6  - .vp9_sad16x8x3_ssse3_do_jump
+        dd .vp9_sad16x8x3_ssse3_aligned_by_7  - .vp9_sad16x8x3_ssse3_do_jump
+        dd .vp9_sad16x8x3_ssse3_aligned_by_8  - .vp9_sad16x8x3_ssse3_do_jump
+        dd .vp9_sad16x8x3_ssse3_aligned_by_9  - .vp9_sad16x8x3_ssse3_do_jump
+        dd .vp9_sad16x8x3_ssse3_aligned_by_10 - .vp9_sad16x8x3_ssse3_do_jump
+        dd .vp9_sad16x8x3_ssse3_aligned_by_11 - .vp9_sad16x8x3_ssse3_do_jump
+        dd .vp9_sad16x8x3_ssse3_aligned_by_12 - .vp9_sad16x8x3_ssse3_do_jump
+        dd .vp9_sad16x8x3_ssse3_aligned_by_13 - .vp9_sad16x8x3_ssse3_do_jump
+        dd .vp9_sad16x8x3_ssse3_aligned_by_14 - .vp9_sad16x8x3_ssse3_do_jump
+        dd .vp9_sad16x8x3_ssse3_aligned_by_15 - .vp9_sad16x8x3_ssse3_do_jump
+.vp9_sad16x8x3_ssse3_skiptable:
+
+        call .vp9_sad16x8x3_ssse3_do_jump
+.vp9_sad16x8x3_ssse3_do_jump:
+        pop             rcx                         ; get the address of do_jump
+        mov             rax,  .vp9_sad16x8x3_ssse3_jumptable - .vp9_sad16x8x3_ssse3_do_jump
+        add             rax,  rcx  ; get the absolute address of vp9_sad16x8x3_ssse3_jumptable
+
+        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
+        add             rcx,        rax
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        jmp             rcx
+
+        PROCESS_16X8X3_OFFSET 0,  .vp9_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 1,  .vp9_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 2,  .vp9_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 3,  .vp9_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 4,  .vp9_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 5,  .vp9_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 6,  .vp9_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 7,  .vp9_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 8,  .vp9_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 9,  .vp9_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 10, .vp9_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 11, .vp9_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 12, .vp9_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 13, .vp9_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 14, .vp9_sad16x8x3_ssse3
+
+.vp9_sad16x8x3_ssse3_aligned_by_15:
+
+        PROCESS_16X2X3 1
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+
+.vp9_sad16x8x3_ssse3_store_off:
+        mov             rdi,        arg(4) ;Results
+
+        movq            xmm0,       xmm5
+        psrldq          xmm5,       8
+
+        paddw           xmm0,       xmm5
+        movd            [rdi],      xmm0
+;-
+        movq            xmm0,       xmm6
+        psrldq          xmm6,       8
+
+        paddw           xmm0,       xmm6
+        movd            [rdi+4],    xmm0
+;-
+        movq            xmm0,       xmm7
+        psrldq          xmm7,       8
+
+        paddw           xmm0,       xmm7
+        movd            [rdi+8],    xmm0
+
+    ; begin epilog
+    pop         rcx
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
--- /dev/null
+++ b/vp9/encoder/x86/ssim_opt.asm
@@ -1,0 +1,216 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; TABULATE_SSIM - accumulates sum_s, sum_r, sum_sq_s, sum_sq_r and sum_sxr
+%macro TABULATE_SSIM 0
+        paddusw         xmm15, xmm3  ; sum_s
+        paddusw         xmm14, xmm4  ; sum_r
+        movdqa          xmm1, xmm3
+        pmaddwd         xmm1, xmm1
+        paddd           xmm13, xmm1 ; sum_sq_s
+        movdqa          xmm2, xmm4
+        pmaddwd         xmm2, xmm2
+        paddd           xmm12, xmm2 ; sum_sq_r
+        pmaddwd         xmm3, xmm4
+        paddd           xmm11, xmm3  ; sum_sxr
+%endmacro
+
+; Sum across the register %1 starting with q words
+%macro SUM_ACROSS_Q 1
+        movdqa          xmm2,%1
+        punpckldq       %1,xmm0
+        punpckhdq       xmm2,xmm0
+        paddq           %1,xmm2
+        movdqa          xmm2,%1
+        punpcklqdq      %1,xmm0
+        punpckhqdq      xmm2,xmm0
+        paddq           %1,xmm2
+%endmacro
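+
+; SUM_ACROSS_Q reduces the four dwords of %1 to a single total in its low
+; qword: punpckldq/punpckhdq against the zeroed xmm0 widen the dwords to
+; qwords, paddq folds the two halves, and the punpcklqdq/punpckhqdq pair with
+; a final paddq folds the remaining two qwords together.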
+
+; Sum across the register %1 starting with words
+%macro SUM_ACROSS_W 1
+        movdqa          xmm1, %1
+        punpcklwd       %1,xmm0
+        punpckhwd       xmm1,xmm0
+        paddd           %1, xmm1
+        SUM_ACROSS_Q    %1
+%endmacro
+;void vp9_ssim_parms_16x16_sse2(
+;    unsigned char *s,
+;    int sp,
+;    unsigned char *r,
+;    int rp,
+;    unsigned long *sum_s,
+;    unsigned long *sum_r,
+;    unsigned long *sum_sq_s,
+;    unsigned long *sum_sq_r,
+;    unsigned long *sum_sxr);
+;
+; TODO: Use parameter passing through a structure; we probably don't need the
+; pxors (the calling app will initialize to 0), could easily fit everything
+; in sse2 without too much hassle, and can probably do better estimates with
+; psadbw or pavgb. At this point this is just meant to be a first pass for
+; calculating all the parms needed for 16x16 ssim so we can play with dssim
+; as distortion in the mode selection code.
+global sym(vp9_ssim_parms_16x16_sse2)
+sym(vp9_ssim_parms_16x16_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 9
+    SAVE_XMM 15
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov             rsi,        arg(0) ;s
+    mov             rcx,        arg(1) ;sp
+    mov             rdi,        arg(2) ;r
+    mov             rax,        arg(3) ;rp
+
+    pxor            xmm0, xmm0
+    pxor            xmm15,xmm15  ;sum_s
+    pxor            xmm14,xmm14  ;sum_r
+    pxor            xmm13,xmm13  ;sum_sq_s
+    pxor            xmm12,xmm12  ;sum_sq_r
+    pxor            xmm11,xmm11  ;sum_sxr
+
+    mov             rdx, 16      ;row counter
+.NextRow:
+
+    ;grab source and reference pixels
+    movdqu          xmm5, [rsi]
+    movdqu          xmm6, [rdi]
+    movdqa          xmm3, xmm5
+    movdqa          xmm4, xmm6
+    punpckhbw       xmm3, xmm0 ; high_s
+    punpckhbw       xmm4, xmm0 ; high_r
+
+    TABULATE_SSIM
+
+    movdqa          xmm3, xmm5
+    movdqa          xmm4, xmm6
+    punpcklbw       xmm3, xmm0 ; low_s
+    punpcklbw       xmm4, xmm0 ; low_r
+
+    TABULATE_SSIM
+
+    add             rsi, rcx   ; next s row
+    add             rdi, rax   ; next r row
+
+    dec             rdx        ; counter
+    jnz .NextRow
+
+    SUM_ACROSS_W    xmm15
+    SUM_ACROSS_W    xmm14
+    SUM_ACROSS_Q    xmm13
+    SUM_ACROSS_Q    xmm12
+    SUM_ACROSS_Q    xmm11
+
+    mov             rdi,arg(4)
+    movd            [rdi], xmm15;
+    mov             rdi,arg(5)
+    movd            [rdi], xmm14;
+    mov             rdi,arg(6)
+    movd            [rdi], xmm13;
+    mov             rdi,arg(7)
+    movd            [rdi], xmm12;
+    mov             rdi,arg(8)
+    movd            [rdi], xmm11;
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
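+
+; Note: the five accumulated sums are exactly the per-block statistics SSIM
+; needs. With n pixels, mu_s = sum_s/n, var_s = sum_sq_s/n - mu_s^2 (likewise
+; for r) and cov = sum_sxr/n - mu_s*mu_r, giving
+;   ssim = ((2*mu_s*mu_r + C1) * (2*cov + C2)) /
+;          ((mu_s^2 + mu_r^2 + C1) * (var_s + var_r + C2))
+; for the usual stabilizing constants C1 and C2.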
+
+;void vp9_ssim_parms_8x8_sse2(
+;    unsigned char *s,
+;    int sp,
+;    unsigned char *r,
+;    int rp,
+;    unsigned long *sum_s,
+;    unsigned long *sum_r,
+;    unsigned long *sum_sq_s,
+;    unsigned long *sum_sq_r,
+;    unsigned long *sum_sxr);
+;
+; TODO: Use parameter passing through a structure; we probably don't need the
+; pxors (the calling app will initialize to 0), could easily fit everything
+; in sse2 without too much hassle, and can probably do better estimates with
+; psadbw or pavgb. At this point this is just meant to be a first pass for
+; calculating all the parms needed for 8x8 ssim so we can play with dssim
+; as distortion in the mode selection code.
+global sym(vp9_ssim_parms_8x8_sse2)
+sym(vp9_ssim_parms_8x8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 9
+    SAVE_XMM 15
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov             rsi,        arg(0) ;s
+    mov             rcx,        arg(1) ;sp
+    mov             rdi,        arg(2) ;r
+    mov             rax,        arg(3) ;rp
+
+    pxor            xmm0, xmm0
+    pxor            xmm15,xmm15  ;sum_s
+    pxor            xmm14,xmm14  ;sum_r
+    pxor            xmm13,xmm13  ;sum_sq_s
+    pxor            xmm12,xmm12  ;sum_sq_r
+    pxor            xmm11,xmm11  ;sum_sxr
+
+    mov             rdx, 8      ;row counter
+.NextRow:
+
+    ;grab source and reference pixels
+    movq            xmm3, [rsi]
+    movq            xmm4, [rdi]
+    punpcklbw       xmm3, xmm0 ; low_s
+    punpcklbw       xmm4, xmm0 ; low_r
+
+    TABULATE_SSIM
+
+    add             rsi, rcx   ; next s row
+    add             rdi, rax   ; next r row
+
+    dec             rdx        ; counter
+    jnz .NextRow
+
+    SUM_ACROSS_W    xmm15
+    SUM_ACROSS_W    xmm14
+    SUM_ACROSS_Q    xmm13
+    SUM_ACROSS_Q    xmm12
+    SUM_ACROSS_Q    xmm11
+
+    mov             rdi,arg(4)
+    movd            [rdi], xmm15;
+    mov             rdi,arg(5)
+    movd            [rdi], xmm14;
+    mov             rdi,arg(6)
+    movd            [rdi], xmm13;
+    mov             rdi,arg(7)
+    movd            [rdi], xmm12;
+    mov             rdi,arg(8)
+    movd            [rdi], xmm11;
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
--- /dev/null
+++ b/vp9/encoder/x86/subtract_mmx.asm
@@ -1,0 +1,432 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp9_subtract_b_mmx_impl(unsigned char *z,  int src_stride,
+;                            short *diff, unsigned char *Predictor,
+;                            int pitch);
+global sym(vp9_subtract_b_mmx_impl)
+sym(vp9_subtract_b_mmx_impl):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push rsi
+    push rdi
+    ; end prolog
+
+
+        mov     rdi,        arg(2) ;diff
+        mov     rax,        arg(3) ;Predictor
+        mov     rsi,        arg(0) ;z
+        movsxd  rdx,        dword ptr arg(1);src_stride;
+        movsxd  rcx,        dword ptr arg(4);pitch
+        pxor    mm7,        mm7
+
+        movd    mm0,        [rsi]
+        movd    mm1,        [rax]
+        punpcklbw   mm0,    mm7
+        punpcklbw   mm1,    mm7
+        psubw   mm0,        mm1
+        movq    [rdi],      mm0
+
+
+        movd    mm0,        [rsi+rdx]
+        movd    mm1,        [rax+rcx]
+        punpcklbw   mm0,    mm7
+        punpcklbw   mm1,    mm7
+        psubw   mm0,        mm1
+        movq    [rdi+rcx*2],mm0
+
+
+        movd    mm0,        [rsi+rdx*2]
+        movd    mm1,        [rax+rcx*2]
+        punpcklbw   mm0,    mm7
+        punpcklbw   mm1,    mm7
+        psubw   mm0,        mm1
+        movq    [rdi+rcx*4],        mm0
+
+        lea     rsi,        [rsi+rdx*2]
+        lea     rcx,        [rcx+rcx*2]
+
+
+
+        movd    mm0,        [rsi+rdx]
+        movd    mm1,        [rax+rcx]
+        punpcklbw   mm0,    mm7
+        punpcklbw   mm1,    mm7
+        psubw   mm0,        mm1
+        movq    [rdi+rcx*2],        mm0
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
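+
+; Roughly (illustrative C only), for the 4x4 block:
+;
+;   for (int r = 0; r < 4; r++)
+;     for (int c = 0; c < 4; c++)
+;       diff[r * pitch + c] = z[r * src_stride + c] - predictor[r * pitch + c];
+;
+; with each row widened to 16 bits before the subtract so results can go
+; negative.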
+
+;void vp9_subtract_mby_mmx(short *diff, unsigned char *src, unsigned char *pred, int stride)
+global sym(vp9_subtract_mby_mmx)
+sym(vp9_subtract_mby_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push rsi
+    push rdi
+    ; end prolog
+
+
+            mov         rsi,            arg(1) ;src
+            mov         rdi,            arg(0) ;diff
+
+            mov         rax,            arg(2) ;pred
+            movsxd      rdx,            dword ptr arg(3) ;stride
+
+            mov         rcx,            16
+            pxor        mm0,            mm0
+
+.submby_loop:
+
+            movq        mm1,            [rsi]
+            movq        mm3,            [rax]
+
+            movq        mm2,            mm1
+            movq        mm4,            mm3
+
+            punpcklbw   mm1,            mm0
+            punpcklbw   mm3,            mm0
+
+            punpckhbw   mm2,            mm0
+            punpckhbw   mm4,            mm0
+
+            psubw       mm1,            mm3
+            psubw       mm2,            mm4
+
+            movq        [rdi],          mm1
+            movq        [rdi+8],        mm2
+
+
+            movq        mm1,            [rsi+8]
+            movq        mm3,            [rax+8]
+
+            movq        mm2,            mm1
+            movq        mm4,            mm3
+
+            punpcklbw   mm1,            mm0
+            punpcklbw   mm3,            mm0
+
+            punpckhbw   mm2,            mm0
+            punpckhbw   mm4,            mm0
+
+            psubw       mm1,            mm3
+            psubw       mm2,            mm4
+
+            movq        [rdi+16],       mm1
+            movq        [rdi+24],       mm2
+
+
+            add         rdi,            32
+            add         rax,            16
+
+            lea         rsi,            [rsi+rdx]
+
+            sub         rcx,            1
+            jnz         .submby_loop
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
+global sym(vp9_subtract_mbuv_mmx)
+sym(vp9_subtract_mbuv_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push rsi
+    push rdi
+    ; end prolog
+
+    ;short *udiff = diff + 256;
+    ;short *vdiff = diff + 320;
+    ;unsigned char *upred = pred + 256;
+    ;unsigned char *vpred = pred + 320;
+
+        ;unsigned char  *z    = usrc;
+        ;unsigned short *diff = udiff;
+        ;unsigned char  *Predictor= upred;
+
+            mov     rdi,        arg(0) ;diff
+            mov     rax,        arg(3) ;pred
+            mov     rsi,        arg(1) ;z = usrc
+            add     rdi,        256*2  ;diff = diff + 256 (shorts)
+            add     rax,        256    ;Predictor = pred + 256
+            movsxd  rdx,        dword ptr arg(4) ;stride;
+            pxor    mm7,        mm7
+
+            movq    mm0,        [rsi]
+            movq    mm1,        [rax]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+            movq    [rdi],      mm0
+            movq    [rdi+8],    mm3
+
+
+            movq    mm0,        [rsi+rdx]
+            movq    mm1,        [rax+8]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+            movq    [rdi+16],   mm0
+            movq    [rdi+24],   mm3
+
+            movq    mm0,        [rsi+rdx*2]
+            movq    mm1,        [rax+16]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+            movq    [rdi+32],   mm0
+            movq    [rdi+40],   mm3
+            lea     rsi,        [rsi+rdx*2]
+
+
+            movq    mm0,        [rsi+rdx]
+            movq    mm1,        [rax+24]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+
+            movq    [rdi+48],   mm0
+            movq    [rdi+56],   mm3
+
+
+            add     rdi,        64
+            add     rax,        32
+            lea     rsi,        [rsi+rdx*2]
+
+
+            movq    mm0,        [rsi]
+            movq    mm1,        [rax]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+            movq    [rdi],      mm0
+            movq    [rdi+8],    mm3
+
+
+            movq    mm0,        [rsi+rdx]
+            movq    mm1,        [rax+8]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+            movq    [rdi+16],   mm0
+            movq    [rdi+24],   mm3
+
+            movq    mm0,        [rsi+rdx*2]
+            movq    mm1,        [rax+16]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+            movq    [rdi+32],   mm0
+            movq    [rdi+40],   mm3
+            lea     rsi,        [rsi+rdx*2]
+
+
+            movq    mm0,        [rsi+rdx]
+            movq    mm1,        [rax+24]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+
+            movq    [rdi+48],   mm0
+            movq    [rdi+56],   mm3
+
+        ;unsigned char  *z    = vsrc;
+        ;unsigned short *diff = vdiff;
+        ;unsigned char  *Predictor= vpred;
+
+            mov     rdi,        arg(0) ;diff
+            mov     rax,        arg(3) ;pred
+            mov     rsi,        arg(2) ;z = vsrc
+            add     rdi,        320*2  ;diff = diff + 320 (shorts)
+            add     rax,        320    ;Predictor = pred + 320
+            movsxd  rdx,        dword ptr arg(4) ;stride;
+            pxor    mm7,        mm7
+
+            movq    mm0,        [rsi]
+            movq    mm1,        [rax]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+            movq    [rdi],      mm0
+            movq    [rdi+8],    mm3
+
+
+            movq    mm0,        [rsi+rdx]
+            movq    mm1,        [rax+8]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+            movq    [rdi+16],   mm0
+            movq    [rdi+24],   mm3
+
+            movq    mm0,        [rsi+rdx*2]
+            movq    mm1,        [rax+16]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+            movq    [rdi+32],   mm0
+            movq    [rdi+40],   mm3
+            lea     rsi,        [rsi+rdx*2]
+
+
+            movq    mm0,        [rsi+rdx]
+            movq    mm1,        [rax+24]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+
+            movq    [rdi+48],   mm0
+            movq    [rdi+56],   mm3
+
+
+            add     rdi,        64
+            add     rax,        32
+            lea     rsi,        [rsi+rdx*2]
+
+
+            movq    mm0,        [rsi]
+            movq    mm1,        [rax]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+            movq    [rdi],      mm0
+            movq    [rdi+8],    mm3
+
+
+            movq    mm0,        [rsi+rdx]
+            movq    mm1,        [rax+8]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+            movq    [rdi+16],   mm0
+            movq    [rdi+24],   mm3
+
+            movq    mm0,        [rsi+rdx*2]
+            movq    mm1,        [rax+16]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+            movq    [rdi+32],   mm0
+            movq    [rdi+40],   mm3
+            lea     rsi,        [rsi+rdx*2]
+
+
+            movq    mm0,        [rsi+rdx]
+            movq    mm1,        [rax+24]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+
+            movq    [rdi+48],   mm0
+            movq    [rdi+56],   mm3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
--- /dev/null
+++ b/vp9/encoder/x86/subtract_sse2.asm
@@ -1,0 +1,356 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp9_subtract_b_sse2_impl(unsigned char *z,  int src_stride,
+;                            short *diff, unsigned char *Predictor,
+;                            int pitch);
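+;
+; Rough C model of what this routine computes, for reference only (names
+; follow the prototype above; the 4x4 block size and the use of pitch for
+; both diff and Predictor are read off the address math below):
+;
+;   for (int r = 0; r < 4; r++) {
+;     for (int c = 0; c < 4; c++)
+;       diff[c] = (short)(z[c] - Predictor[c]);
+;     diff += pitch; Predictor += pitch; z += src_stride;
+;   }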
+global sym(vp9_subtract_b_sse2_impl)
+sym(vp9_subtract_b_sse2_impl):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov     rdi,        arg(2) ;diff
+        mov     rax,        arg(3) ;Predictor
+        mov     rsi,        arg(0) ;z
+        movsxd  rdx,        dword ptr arg(1) ;src_stride
+        movsxd  rcx,        dword ptr arg(4) ;pitch
+        pxor    mm7,        mm7
+
+        movd    mm0,        [rsi]
+        movd    mm1,        [rax]
+        punpcklbw   mm0,    mm7
+        punpcklbw   mm1,    mm7
+        psubw   mm0,        mm1
+        movq    MMWORD PTR [rdi],      mm0
+
+        movd    mm0,        [rsi+rdx]
+        movd    mm1,        [rax+rcx]
+        punpcklbw   mm0,    mm7
+        punpcklbw   mm1,    mm7
+        psubw   mm0,        mm1
+        movq    MMWORD PTR [rdi+rcx*2], mm0
+
+        movd    mm0,        [rsi+rdx*2]
+        movd    mm1,        [rax+rcx*2]
+        punpcklbw   mm0,    mm7
+        punpcklbw   mm1,    mm7
+        psubw   mm0,        mm1
+        movq    MMWORD PTR [rdi+rcx*4], mm0
+
+        lea     rsi,        [rsi+rdx*2]
+        lea     rcx,        [rcx+rcx*2]
+
+        movd    mm0,        [rsi+rdx]
+        movd    mm1,        [rax+rcx]
+        punpcklbw   mm0,    mm7
+        punpcklbw   mm1,    mm7
+        psubw   mm0,        mm1
+        movq    MMWORD PTR [rdi+rcx*2], mm0
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_subtract_mby_sse2(short *diff, unsigned char *src, unsigned char *pred, int stride)
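+;
+; Rough C model, for reference only (pred is laid out 16 bytes per row,
+; as implied by the fixed +16/+32 offsets below):
+;
+;   for (int r = 0; r < 16; r++)
+;     for (int c = 0; c < 16; c++)
+;       diff[r * 16 + c] = (short)(src[r * stride + c] - pred[r * 16 + c]);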
+global sym(vp9_subtract_mby_sse2)
+sym(vp9_subtract_mby_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+            mov         rsi,            arg(1) ;src
+            mov         rdi,            arg(0) ;diff
+
+            mov         rax,            arg(2) ;pred
+            movsxd      rdx,            dword ptr arg(3) ;stride
+
+            mov         rcx,            8      ; 16 rows, two lines per iteration
+
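+            ; The loop widens src - pred to 16 bits without unpacking the
+            ; inputs first: psubb keeps the low byte of each difference,
+            ; and pcmpgtb on the inputs biased by 0x80 (t80) yields 0xFF
+            ; exactly where pred > src, i.e. where the true difference is
+            ; negative. Interleaving those sign bytes above the low bytes
+            ; (punpcklbw/punpckhbw) produces the signed 16-bit results.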
+.submby_loop:
+            movdqa      xmm0,           XMMWORD PTR [rsi]   ; src
+            movdqa      xmm1,           XMMWORD PTR [rax]   ; pred
+
+            movdqa      xmm2,           xmm0
+            psubb       xmm0,           xmm1
+
+            pxor        xmm1,           [GLOBAL(t80)]   ;convert to signed values
+            pxor        xmm2,           [GLOBAL(t80)]
+            pcmpgtb     xmm1,           xmm2            ; obtain sign information
+
+            movdqa      xmm2,    xmm0
+            movdqa      xmm3,    xmm1
+            punpcklbw   xmm0,    xmm1            ; put sign back to subtraction
+            punpckhbw   xmm2,    xmm3            ; put sign back to subtraction
+
+            movdqa      XMMWORD PTR [rdi],   xmm0
+            movdqa      XMMWORD PTR [rdi +16], xmm2
+
+            movdqa      xmm4,           XMMWORD PTR [rsi + rdx]
+            movdqa      xmm5,           XMMWORD PTR [rax + 16]
+
+            movdqa      xmm6,           xmm4
+            psubb       xmm4,           xmm5
+
+            pxor        xmm5,           [GLOBAL(t80)]   ;convert to signed values
+            pxor        xmm6,           [GLOBAL(t80)]
+            pcmpgtb     xmm5,           xmm6            ; obtain sign information
+
+            movdqa      xmm6,    xmm4
+            movdqa      xmm7,    xmm5
+            punpcklbw   xmm4,    xmm5            ; put sign back to subtraction
+            punpckhbw   xmm6,    xmm7            ; put sign back to subtraction
+
+            movdqa      XMMWORD PTR [rdi +32], xmm4
+            movdqa      XMMWORD PTR [rdi +48], xmm6
+
+            add         rdi,            64
+            add         rax,            32
+            lea         rsi,            [rsi+rdx*2]
+
+            sub         rcx,            1
+            jnz         .submby_loop
+
+    pop rdi
+    pop rsi
+    ; begin epilog
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
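+;
+; Rough C model, for reference only (the U and V planes sit at fixed
+; offsets inside diff and pred, per the +256/+320 adjustments below):
+;
+;   for (int r = 0; r < 8; r++)
+;     for (int c = 0; c < 8; c++) {
+;       diff[256 + r * 8 + c] = (short)(usrc[r * stride + c] - pred[256 + r * 8 + c]);
+;       diff[320 + r * 8 + c] = (short)(vsrc[r * stride + c] - pred[320 + r * 8 + c]);
+;     }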
+global sym(vp9_subtract_mbuv_sse2)
+sym(vp9_subtract_mbuv_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+            mov     rdi,        arg(0) ;diff
+            mov     rax,        arg(3) ;pred
+            mov     rsi,        arg(1) ;z = usrc
+            add     rdi,        256*2  ;diff = diff + 256 (shorts)
+            add     rax,        256    ;Predictor = pred + 256
+            movsxd  rdx,        dword ptr arg(4) ;stride
+            lea     rcx,        [rdx + rdx*2]
+
+            ;u
+            ;line 0 1
+            movq       xmm0,    MMWORD PTR [rsi]  ; src
+            movq       xmm2,    MMWORD PTR [rsi+rdx]
+            movdqa     xmm1,    XMMWORD PTR [rax]  ; pred
+            punpcklqdq xmm0,    xmm2
+
+            movdqa     xmm2,    xmm0
+            psubb      xmm0,    xmm1            ; subtraction with sign missed
+
+            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
+            pxor       xmm2,    [GLOBAL(t80)]
+            pcmpgtb    xmm1,    xmm2            ; obtain sign information
+
+            movdqa     xmm2,    xmm0
+            movdqa     xmm3,    xmm1
+            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
+            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
+
+            movdqa     XMMWORD PTR [rdi],   xmm0
+            movdqa     XMMWORD PTR [rdi +16],   xmm2
+
+            ;line 2 3
+            movq       xmm0,    MMWORD PTR [rsi+rdx*2]  ; src
+            movq       xmm2,    MMWORD PTR [rsi+rcx]
+            movdqa     xmm1,    XMMWORD PTR [rax+16]  ; pred
+            punpcklqdq xmm0,    xmm2
+
+            movdqa     xmm2,    xmm0
+            psubb      xmm0,    xmm1            ; subtraction with sign missed
+
+            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
+            pxor       xmm2,    [GLOBAL(t80)]
+            pcmpgtb    xmm1,    xmm2            ; obtain sign information
+
+            movdqa     xmm2,    xmm0
+            movdqa     xmm3,    xmm1
+            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
+            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
+
+            movdqa     XMMWORD PTR [rdi + 32],   xmm0
+            movdqa     XMMWORD PTR [rdi + 48],   xmm2
+
+            ;line 4 5
+            lea        rsi,     [rsi + rdx*4]
+
+            movq       xmm0,    MMWORD PTR [rsi]  ; src
+            movq       xmm2,    MMWORD PTR [rsi+rdx]
+            movdqa     xmm1,    XMMWORD PTR [rax + 32]  ; pred
+            punpcklqdq xmm0,    xmm2
+
+            movdqa     xmm2,    xmm0
+            psubb      xmm0,    xmm1            ; subtraction with sign missed
+
+            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
+            pxor       xmm2,    [GLOBAL(t80)]
+            pcmpgtb    xmm1,    xmm2            ; obtain sign information
+
+            movdqa     xmm2,    xmm0
+            movdqa     xmm3,    xmm1
+            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
+            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
+
+            movdqa     XMMWORD PTR [rdi + 64],   xmm0
+            movdqa     XMMWORD PTR [rdi + 80],   xmm2
+
+            ;line 6 7
+            movq       xmm0,    MMWORD PTR [rsi+rdx*2]  ; src
+            movq       xmm2,    MMWORD PTR [rsi+rcx]
+            movdqa     xmm1,    XMMWORD PTR [rax+ 48]  ; pred
+            punpcklqdq xmm0,    xmm2
+
+            movdqa     xmm2,    xmm0
+            psubb      xmm0,    xmm1            ; subtraction with sign missed
+
+            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
+            pxor       xmm2,    [GLOBAL(t80)]
+            pcmpgtb    xmm1,    xmm2            ; obtain sign information
+
+            movdqa     xmm2,    xmm0
+            movdqa     xmm3,    xmm1
+            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
+            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
+
+            movdqa     XMMWORD PTR [rdi + 96],   xmm0
+            movdqa     XMMWORD PTR [rdi + 112],  xmm2
+
+            ;v
+            mov     rsi,        arg(2) ;z = vsrc
+            add     rdi,        64*2  ;diff now points at diff + 320 (shorts)
+            add     rax,        64    ;Predictor now points at pred + 320
+
+            ;line 0 1
+            movq       xmm0,    MMWORD PTR [rsi]  ; src
+            movq       xmm2,    MMWORD PTR [rsi+rdx]
+            movdqa     xmm1,    XMMWORD PTR [rax]  ; pred
+            punpcklqdq xmm0,    xmm2
+
+            movdqa     xmm2,    xmm0
+            psubb      xmm0,    xmm1            ; subtraction with sign missed
+
+            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
+            pxor       xmm2,    [GLOBAL(t80)]
+            pcmpgtb    xmm1,    xmm2            ; obtain sign information
+
+            movdqa     xmm2,    xmm0
+            movdqa     xmm3,    xmm1
+            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
+            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
+
+            movdqa     XMMWORD PTR [rdi],   xmm0
+            movdqa     XMMWORD PTR [rdi +16],   xmm2
+
+            ;line 2 3
+            movq       xmm0,    MMWORD PTR [rsi+rdx*2]  ; src
+            movq       xmm2,    MMWORD PTR [rsi+rcx]
+            movdqa     xmm1,    XMMWORD PTR [rax+16]  ; pred
+            punpcklqdq xmm0,    xmm2
+
+            movdqa     xmm2,    xmm0
+            psubb      xmm0,    xmm1            ; subtraction with sign missed
+
+            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
+            pxor       xmm2,    [GLOBAL(t80)]
+            pcmpgtb    xmm1,    xmm2            ; obtain sign information
+
+            movdqa     xmm2,    xmm0
+            movdqa     xmm3,    xmm1
+            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
+            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
+
+            movdqa     XMMWORD PTR [rdi + 32],   xmm0
+            movdqa     XMMWORD PTR [rdi + 48],   xmm2
+
+            ;line 4 5
+            lea        rsi,     [rsi + rdx*4]
+
+            movq       xmm0,    MMWORD PTR [rsi]  ; src
+            movq       xmm2,    MMWORD PTR [rsi+rdx]
+            movdqa     xmm1,    XMMWORD PTR [rax + 32]  ; pred
+            punpcklqdq xmm0,    xmm2
+
+            movdqa     xmm2,    xmm0
+            psubb      xmm0,    xmm1            ; subtraction with sign missed
+
+            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
+            pxor       xmm2,    [GLOBAL(t80)]
+            pcmpgtb    xmm1,    xmm2            ; obtain sign information
+
+            movdqa     xmm2,    xmm0
+            movdqa     xmm3,    xmm1
+            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
+            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
+
+            movdqa     XMMWORD PTR [rdi + 64],   xmm0
+            movdqa     XMMWORD PTR [rdi + 80],   xmm2
+
+            ;line 6 7
+            movq       xmm0,    MMWORD PTR [rsi+rdx*2]  ; src
+            movq       xmm2,    MMWORD PTR [rsi+rcx]
+            movdqa     xmm1,    XMMWORD PTR [rax+ 48]  ; pred
+            punpcklqdq xmm0,    xmm2
+
+            movdqa     xmm2,    xmm0
+            psubb      xmm0,    xmm1            ; subtraction with sign missed
+
+            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
+            pxor       xmm2,    [GLOBAL(t80)]
+            pcmpgtb    xmm1,    xmm2            ; obtain sign information
+
+            movdqa     xmm2,    xmm0
+            movdqa     xmm3,    xmm1
+            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
+            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
+
+            movdqa     XMMWORD PTR [rdi + 96],   xmm0
+            movdqa     XMMWORD PTR [rdi + 112],  xmm2
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+t80:
+    times 16 db 0x80
--- /dev/null
+++ b/vp9/encoder/x86/temporal_filter_apply_sse2.asm
@@ -1,0 +1,207 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; void vp9_temporal_filter_apply_sse2 | arg
+;  (unsigned char  *frame1,           |  0
+;   unsigned int    stride,           |  1
+;   unsigned char  *frame2,           |  2
+;   unsigned int    block_size,       |  3
+;   int             strength,         |  4
+;   int             filter_weight,    |  5
+;   unsigned int   *accumulator,      |  6
+;   unsigned short *count)            |  7
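+;
+; Rough C model of the per-pixel update, for reference only (assumes
+; strength >= 1, matching the 0x8000 >> (16 - strength) rounding bit
+; computed below; i runs over the pixels of the block):
+;
+;   int diff = frame1[i] - frame2[i];
+;   int mod  = (3 * diff * diff + (1 << (strength - 1))) >> strength;
+;   if (mod > 16) mod = 16;               /* psubusw saturation below */
+;   mod = (16 - mod) * filter_weight;
+;   count[i]       += mod;
+;   accumulator[i] += mod * frame2[i];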
+global sym(vp9_temporal_filter_apply_sse2)
+sym(vp9_temporal_filter_apply_sse2):
+
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ALIGN_STACK 16, rax
+    %define block_size    0
+    %define strength      16
+    %define filter_weight 32
+    %define rounding_bit  48
+    %define rbp_backup    64
+    %define stack_size    80
+    sub         rsp,           stack_size
+    mov         [rsp + rbp_backup], rbp
+    ; end prolog
+
+        mov         rdx,            arg(3)
+        mov         [rsp + block_size], rdx
+        movd        xmm6,            arg(4)
+        movdqa      [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
+
+        ; calculate the rounding bit outside the loop
+        ; 0x8000 >> (16 - strength)
+        mov         rdx,            16
+        sub         rdx,            arg(4) ; 16 - strength
+        movd        xmm4,           rdx    ; can't use rdx w/ shift
+        movdqa      xmm5,           [GLOBAL(_const_top_bit)]
+        psrlw       xmm5,           xmm4
+        movdqa      [rsp + rounding_bit], xmm5
+
+        mov         rsi,            arg(0) ; src/frame1
+        mov         rdx,            arg(2) ; predictor frame
+        mov         rdi,            arg(6) ; accumulator
+        mov         rax,            arg(7) ; count
+
+        ; dup the filter weight and store for later
+        movd        xmm0,           arg(5) ; filter_weight
+        pshuflw     xmm0,           xmm0, 0
+        punpcklwd   xmm0,           xmm0
+        movdqa      [rsp + filter_weight], xmm0
+
+        mov         rbp,            arg(1) ; stride
+        pxor        xmm7,           xmm7   ; zero for extraction
+
+        lea         rcx,            [rdx + 16*16*1] ; end of pred for a 16x16 block
+        cmp         dword ptr [rsp + block_size], 8
+        jne         .temporal_filter_apply_load_16
+        lea         rcx,            [rdx + 8*8*1]   ; end of pred for an 8x8 block
+
+.temporal_filter_apply_load_8:
+        movq        xmm0,           [rsi]  ; first row
+        lea         rsi,            [rsi + rbp] ; += stride
+        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
+        movq        xmm1,           [rsi]  ; second row
+        lea         rsi,            [rsi + rbp] ; += stride
+        punpcklbw   xmm1,           xmm7   ; src[ 8-15]
+        jmp         .temporal_filter_apply_load_finished
+
+.temporal_filter_apply_load_16:
+        movdqa      xmm0,           [rsi]  ; src (frame1)
+        lea         rsi,            [rsi + rbp] ; += stride
+        movdqa      xmm1,           xmm0
+        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
+        punpckhbw   xmm1,           xmm7   ; src[ 8-15]
+
+.temporal_filter_apply_load_finished:
+        movdqa      xmm2,           [rdx]  ; predictor (frame2)
+        movdqa      xmm3,           xmm2
+        punpcklbw   xmm2,           xmm7   ; pred[ 0- 7]
+        punpckhbw   xmm3,           xmm7   ; pred[ 8-15]
+
+        ; modifier = src_byte - pixel_value
+        psubw       xmm0,           xmm2   ; src - pred[ 0- 7]
+        psubw       xmm1,           xmm3   ; src - pred[ 8-15]
+
+        ; modifier *= modifier
+        pmullw      xmm0,           xmm0   ; modifier[ 0- 7]^2
+        pmullw      xmm1,           xmm1   ; modifier[ 8-15]^2
+
+        ; modifier *= 3
+        pmullw      xmm0,           [GLOBAL(_const_3w)]
+        pmullw      xmm1,           [GLOBAL(_const_3w)]
+
+        ; modifier += 0x8000 >> (16 - strength)
+        paddw       xmm0,           [rsp + rounding_bit]
+        paddw       xmm1,           [rsp + rounding_bit]
+
+        ; modifier >>= strength
+        psrlw       xmm0,           [rsp + strength]
+        psrlw       xmm1,           [rsp + strength]
+
+        ; modifier = 16 - modifier
+        ; saturation takes care of modifier > 16
+        movdqa      xmm3,           [GLOBAL(_const_16w)]
+        movdqa      xmm2,           [GLOBAL(_const_16w)]
+        psubusw     xmm3,           xmm1
+        psubusw     xmm2,           xmm0
+
+        ; modifier *= filter_weight
+        pmullw      xmm2,           [rsp + filter_weight]
+        pmullw      xmm3,           [rsp + filter_weight]
+
+        ; count
+        movdqa      xmm4,           [rax]
+        movdqa      xmm5,           [rax+16]
+        ; += modifier
+        paddw       xmm4,           xmm2
+        paddw       xmm5,           xmm3
+        ; write back
+        movdqa      [rax],          xmm4
+        movdqa      [rax+16],       xmm5
+        lea         rax,            [rax + 16*2] ; count += 16*(sizeof(short))
+
+        ; load and extract the predictor up to shorts
+        pxor        xmm7,           xmm7
+        movdqa      xmm0,           [rdx]
+        lea         rdx,            [rdx + 16*1] ; pred += 16*(sizeof(char))
+        movdqa      xmm1,           xmm0
+        punpcklbw   xmm0,           xmm7   ; pred[ 0- 7]
+        punpckhbw   xmm1,           xmm7   ; pred[ 8-15]
+
+        ; modifier *= pixel_value
+        pmullw      xmm0,           xmm2
+        pmullw      xmm1,           xmm3
+
+        ; expand to double words
+        movdqa      xmm2,           xmm0
+        punpcklwd   xmm0,           xmm7   ; [ 0- 3]
+        punpckhwd   xmm2,           xmm7   ; [ 4- 7]
+        movdqa      xmm3,           xmm1
+        punpcklwd   xmm1,           xmm7   ; [ 8-11]
+        punpckhwd   xmm3,           xmm7   ; [12-15]
+
+        ; accumulator
+        movdqa      xmm4,           [rdi]
+        movdqa      xmm5,           [rdi+16]
+        movdqa      xmm6,           [rdi+32]
+        movdqa      xmm7,           [rdi+48]
+        ; += modifier
+        paddd       xmm4,           xmm0
+        paddd       xmm5,           xmm2
+        paddd       xmm6,           xmm1
+        paddd       xmm7,           xmm3
+        ; write back
+        movdqa      [rdi],          xmm4
+        movdqa      [rdi+16],       xmm5
+        movdqa      [rdi+32],       xmm6
+        movdqa      [rdi+48],       xmm7
+        lea         rdi,            [rdi + 16*4] ; accumulator += 16*(sizeof(int))
+
+        cmp         rdx,            rcx
+        je          .temporal_filter_apply_epilog
+        pxor        xmm7,           xmm7   ; zero for extraction
+        cmp         dword ptr [rsp + block_size], 16
+        je          .temporal_filter_apply_load_16
+        jmp         .temporal_filter_apply_load_8
+
+.temporal_filter_apply_epilog:
+    ; begin epilog
+    mov         rbp,            [rsp + rbp_backup]
+    add         rsp,            stack_size
+    pop         rsp                    ; restore stack pointer saved by ALIGN_STACK
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+_const_3w:
+    times 8 dw 3
+align 16
+_const_top_bit:
+    times 8 dw 1<<15
+align 16
+_const_16w:
+    times 8 dw 16
--- /dev/null
+++ b/vp9/encoder/x86/temporal_filter_x86.h
@@ -1,0 +1,27 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_TEMPORAL_FILTER_X86_H
+#define __INC_TEMPORAL_FILTER_X86_H
+
+#if HAVE_SSE2
+extern prototype_apply(vp9_temporal_filter_apply_sse2);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef  vp9_temporal_filter_apply
+#define vp9_temporal_filter_apply vp9_temporal_filter_apply_sse2
+
+#endif // !CONFIG_RUNTIME_CPU_DETECT
+
+#endif // HAVE_SSE2
+
+#endif // __INC_TEMPORAL_FILTER_X86_H
--- /dev/null
+++ b/vp9/encoder/x86/variance_impl_mmx.asm
@@ -1,0 +1,851 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;unsigned int vp9_get_mb_ss_mmx( short *src_ptr )
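+;
+; Rough C model, for reference only: the sum of squares over the 256
+; 16-bit values of a macroblock.
+;
+;   unsigned int ss = 0;
+;   for (int i = 0; i < 256; i++)
+;     ss += src_ptr[i] * src_ptr[i];
+;   return ss;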
+global sym(vp9_get_mb_ss_mmx)
+sym(vp9_get_mb_ss_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    sub         rsp, 8
+    ; end prolog
+
+        mov         rax, arg(0) ;src_ptr
+        mov         rcx, 16
+        pxor        mm4, mm4
+
+.NEXTROW:
+        movq        mm0, [rax]
+        movq        mm1, [rax+8]
+        movq        mm2, [rax+16]
+        movq        mm3, [rax+24]
+        pmaddwd     mm0, mm0
+        pmaddwd     mm1, mm1
+        pmaddwd     mm2, mm2
+        pmaddwd     mm3, mm3
+
+        paddd       mm4, mm0
+        paddd       mm4, mm1
+        paddd       mm4, mm2
+        paddd       mm4, mm3
+
+        add         rax, 32
+        dec         rcx
+        ja          .NEXTROW
+        movq        QWORD PTR [rsp], mm4
+
+        ;return sum[0]+sum[1];
+        movsxd      rax, dword ptr [rsp]
+        movsxd      rcx, dword ptr [rsp+4]
+        add         rax, rcx
+
+
+    ; begin epilog
+    add rsp, 8
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;unsigned int vp9_get8x8var_mmx
+;(
+;    unsigned char *src_ptr,
+;    int  source_stride,
+;    unsigned char *ref_ptr,
+;    int  recon_stride,
+;    unsigned int *SSE,
+;    int *Sum
+;)
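+;
+; Rough C model, for reference only: stores the sum of differences and
+; the sum of squared differences for an 8x8 block, returning 0.
+;
+;   int sum = 0; unsigned int sse = 0;
+;   for (int r = 0; r < 8; r++)
+;     for (int c = 0; c < 8; c++) {
+;       int d = src_ptr[r * source_stride + c] - ref_ptr[r * recon_stride + c];
+;       sum += d; sse += d * d;
+;     }
+;   *SSE = sse; *Sum = sum; return 0;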
+global sym(vp9_get8x8var_mmx)
+sym(vp9_get8x8var_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push rsi
+    push rdi
+    push rbx
+    sub         rsp, 16
+    ; end prolog
+
+
+        pxor        mm5, mm5                    ; Blank mm5
+        pxor        mm6, mm6                    ; Blank mm6
+        pxor        mm7, mm7                    ; Blank mm7
+
+        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
+        mov         rbx, arg(2) ;[ref_ptr]
+        movsxd      rcx, dword ptr arg(1) ;[source_stride]
+        movsxd      rdx, dword ptr arg(3) ;[recon_stride]
+
+        ; Row 1
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        movq        mm2, mm0                    ; Take copies
+        movq        mm3, mm1                    ; Take copies
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        punpckhbw   mm2, mm6                    ; unpack to higher precision
+        punpckhbw   mm3, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        psubsw      mm2, mm3                    ; A-B (high order) to MM2
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        paddw       mm5, mm2                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        pmaddwd     mm2, mm2                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+        paddd       mm7, mm2                    ; accumulate in mm7
+
+
+        ; Row 2
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm2, mm0                    ; Take copies
+        movq        mm3, mm1                    ; Take copies
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        punpckhbw   mm2, mm6                    ; unpack to higher precision
+        punpckhbw   mm3, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        psubsw      mm2, mm3                    ; A-B (high order) to MM2
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        paddw       mm5, mm2                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        pmaddwd     mm2, mm2                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+        paddd       mm7, mm2                    ; accumulate in mm7
+
+        ; Row 3
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm2, mm0                    ; Take copies
+        movq        mm3, mm1                    ; Take copies
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        punpckhbw   mm2, mm6                    ; unpack to higher precision
+        punpckhbw   mm3, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        psubsw      mm2, mm3                    ; A-B (high order) to MM2
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        paddw       mm5, mm2                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        pmaddwd     mm2, mm2                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+        paddd       mm7, mm2                    ; accumulate in mm7
+
+        ; Row 4
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm2, mm0                    ; Take copies
+        movq        mm3, mm1                    ; Take copies
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        punpckhbw   mm2, mm6                    ; unpack to higher precision
+        punpckhbw   mm3, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        psubsw      mm2, mm3                    ; A-B (high order) to MM2
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        paddw       mm5, mm2                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        pmaddwd     mm2, mm2                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+        paddd       mm7, mm2                    ; accumulate in mm7
+
+        ; Row 5
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm2, mm0                    ; Take copies
+        movq        mm3, mm1                    ; Take copies
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        punpckhbw   mm2, mm6                    ; unpack to higher precision
+        punpckhbw   mm3, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        psubsw      mm2, mm3                    ; A-B (high order) to MM2
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        paddw       mm5, mm2                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        pmaddwd     mm2, mm2                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+        paddd       mm7, mm2                    ; accumulate in mm7
+
+        ; Row 6
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm2, mm0                    ; Take copies
+        movq        mm3, mm1                    ; Take copies
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        punpckhbw   mm2, mm6                    ; unpack to higher precision
+        punpckhbw   mm3, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        psubsw      mm2, mm3                    ; A-B (high order) to MM2
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        paddw       mm5, mm2                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        pmaddwd     mm2, mm2                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+        paddd       mm7, mm2                    ; accumulate in mm7
+
+        ; Row 7
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm2, mm0                    ; Take copies
+        movq        mm3, mm1                    ; Take copies
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        punpckhbw   mm2, mm6                    ; unpack to higher precision
+        punpckhbw   mm3, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        psubsw      mm2, mm3                    ; A-B (high order) to MM2
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        paddw       mm5, mm2                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        pmaddwd     mm2, mm2                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+        paddd       mm7, mm2                    ; accumulate in mm7
+
+        ; Row 8
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm2, mm0                    ; Take copies
+        movq        mm3, mm1                    ; Take copies
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        punpckhbw   mm2, mm6                    ; unpack to higher precision
+        punpckhbw   mm3, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        psubsw      mm2, mm3                    ; A-B (high order) to MM2
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        paddw       mm5, mm2                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        pmaddwd     mm2, mm2                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        paddd       mm7, mm0                    ; accumulate in mm7
+        paddd       mm7, mm2                    ; accumulate in mm7
+
+        ; Now accumulate the final results.
+        movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory
+        movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory
+        movsx       rdx, WORD PTR [rsp+8]
+        movsx       rcx, WORD PTR [rsp+10]
+        movsx       rbx, WORD PTR [rsp+12]
+        movsx       rax, WORD PTR [rsp+14]
+        add         rdx, rcx
+        add         rbx, rax
+        add         rdx, rbx    ;XSum
+        movsxd      rax, DWORD PTR [rsp]
+        movsxd      rcx, DWORD PTR [rsp+4]
+        add         rax, rcx    ;XXSum
+        mov         rsi, arg(4) ;SSE
+        mov         rdi, arg(5) ;Sum
+        mov         dword ptr [rsi], eax
+        mov         dword ptr [rdi], edx
+        xor         rax, rax    ; return 0
+
+
+    ; begin epilog
+    add rsp, 16
+    pop rbx
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+
+;unsigned int
+;vp9_get4x4var_mmx
+;(
+;    unsigned char *src_ptr,
+;    int  source_stride,
+;    unsigned char *ref_ptr,
+;    int  recon_stride,
+;    unsigned int *SSE,
+;    int *Sum
+;)
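+;
+; Same computation as vp9_get8x8var_mmx above, restricted to a 4x4 block.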
+global sym(vp9_get4x4var_mmx)
+sym(vp9_get4x4var_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push rsi
+    push rdi
+    push rbx
+    sub         rsp, 16
+    ; end prolog
+
+
+        pxor        mm5, mm5                    ; Blank mm5
+        pxor        mm6, mm6                    ; Blank mm6
+        pxor        mm7, mm7                    ; Blank mm7
+
+        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
+        mov         rbx, arg(2) ;[ref_ptr]
+        movsxd      rcx, dword ptr arg(1) ;[source_stride]
+        movsxd      rdx, dword ptr arg(3) ;[recon_stride]
+
+        ; Row 1
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+
+
+        ; Row 2
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+
+        ; Row 3
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+
+        ; Row 4
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        paddd       mm7, mm0                    ; accumulate in mm7
+
+
+        ; Now accumulate the final results.
+        movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory
+        movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory
+        movsx       rdx, WORD PTR [rsp+8]
+        movsx       rcx, WORD PTR [rsp+10]
+        movsx       rbx, WORD PTR [rsp+12]
+        movsx       rax, WORD PTR [rsp+14]
+        add         rdx, rcx
+        add         rbx, rax
+        add         rdx, rbx    ;XSum
+        movsxd      rax, DWORD PTR [rsp]
+        movsxd      rcx, DWORD PTR [rsp+4]
+        add         rax, rcx    ;XXSum
+        mov         rsi, arg(4) ;SSE
+        mov         rdi, arg(5) ;Sum
+        mov         dword ptr [rsi], eax
+        mov         dword ptr [rdi], edx
+        xor         rax, rax    ; return 0
+
+
+    ; begin epilog
+    add rsp, 16
+    pop rbx
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+
+;unsigned int
+;vp9_get4x4sse_cs_mmx
+;(
+;    unsigned char *src_ptr,
+;    int  source_stride,
+;    unsigned char *ref_ptr,
+;    int  recon_stride
+;)
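+;
+; Rough C model, for reference only: returns the sum of squared
+; differences over a 4x4 block.
+;
+;   unsigned int sse = 0;
+;   for (int r = 0; r < 4; r++)
+;     for (int c = 0; c < 4; c++) {
+;       int d = src_ptr[r * source_stride + c] - ref_ptr[r * recon_stride + c];
+;       sse += d * d;
+;     }
+;   return sse;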
+global sym(vp9_get4x4sse_cs_mmx)
+sym(vp9_get4x4sse_cs_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push rsi
+    push rdi
+    push rbx
+    ; end prolog
+
+
+        pxor        mm6, mm6                    ; Blank mm6
+        pxor        mm7, mm7                    ; Blank mm7
+
+        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
+        mov         rbx, arg(2) ;[ref_ptr]
+        movsxd      rcx, dword ptr arg(1) ;[source_stride]
+        movsxd      rdx, dword ptr arg(3) ;[recon_stride]
+        ; Row 1
+        movd        mm0, [rax]                  ; Copy four bytes to mm0
+        movd        mm1, [rbx]                  ; Copy four bytes to mm1
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movd        mm1, [rbx]                  ; Copy four bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+
+        ; Row 2
+        movd        mm0, [rax]                  ; Copy four bytes to mm0
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movd        mm1, [rbx]                  ; Copy four bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+
+        ; Row 3
+        movd        mm0, [rax]                  ; Copy four bytes to mm0
+        punpcklbw   mm1, mm6
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movd        mm1, [rbx]                  ; Copy four bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+
+        ; Row 4
+        movd        mm0, [rax]                  ; Copy four bytes to mm0
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        paddd       mm7, mm0                    ; accumulate in mm7
+
+        movq        mm0,    mm7                 ; copy the two dword partial SSEs
+        psrlq       mm7,    32                  ; move the high dword down
+
+        paddd       mm0,    mm7                 ; fold them together
+        movq        rax,    mm0                 ; total SSE returned in eax
+
+
+    ; begin epilog
+    pop rbx
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+%define mmx_filter_shift            7
+
+;void vp9_filter_block2d_bil4x4_var_mmx
+;(
+;    unsigned char *ref_ptr,
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,
+;    int src_pixels_per_line,
+;    unsigned short *HFilter,
+;    unsigned short *VFilter,
+;    int *sum,
+;    unsigned int *sumsquared
+;)
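+;
+; Rough C model, for reference only (fdata is a hypothetical temporary;
+; each filter is a two-tap pair whose words the caller duplicates across
+; the vector, so only taps [0] and [1] appear here):
+;
+;   unsigned short fdata[5][4];
+;   for (int r = 0; r < 5; r++)                    /* horizontal pass */
+;     for (int c = 0; c < 4; c++)
+;       fdata[r][c] = (ref_ptr[r * ref_pixels_per_line + c]     * HFilter[0] +
+;                      ref_ptr[r * ref_pixels_per_line + c + 1] * HFilter[1] +
+;                      64) >> 7;
+;   int s = 0; unsigned int sse = 0;
+;   for (int r = 0; r < 4; r++)                    /* vertical pass */
+;     for (int c = 0; c < 4; c++) {
+;       int f = (fdata[r][c] * VFilter[0] + fdata[r + 1][c] * VFilter[1] + 64) >> 7;
+;       int d = f - src_ptr[r * src_pixels_per_line + c];
+;       s += d; sse += d * d;
+;     }
+;   *sum = s; *sumsquared = sse;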
+global sym(vp9_filter_block2d_bil4x4_var_mmx)
+sym(vp9_filter_block2d_bil4x4_var_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    sub         rsp, 16
+    ; end prolog
+
+
+        pxor            mm6,            mm6                 ;
+        pxor            mm7,            mm7                 ;
+
+        mov             rax,            arg(4) ;HFilter             ;
+        mov             rdx,            arg(5) ;VFilter             ;
+
+        mov             rsi,            arg(0) ;ref_ptr              ;
+        mov             rdi,            arg(2) ;src_ptr              ;
+
+        mov             rcx,            4                   ;
+        pxor            mm0,            mm0                 ;
+
+        movd            mm1,            [rsi]               ;
+        movd            mm3,            [rsi+1]             ;
+
+        punpcklbw       mm1,            mm0                 ;
+        pmullw          mm1,            [rax]               ;
+
+        punpcklbw       mm3,            mm0                 ;
+        pmullw          mm3,            [rax+8]             ;
+
+        paddw           mm1,            mm3                 ;
+        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
+
+        psraw           mm1,            mmx_filter_shift    ;
+        movq            mm5,            mm1
+
+%if ABI_IS_32BIT
+        add             rsi, dword ptr  arg(1) ;ref_pixels_per_line    ;
+%else
+        movsxd          r8, dword ptr  arg(1) ;ref_pixels_per_line    ;
+        add             rsi, r8
+%endif
+
+.filter_block2d_bil4x4_var_mmx_loop:
+
+        movd            mm1,            [rsi]               ;
+        movd            mm3,            [rsi+1]             ;
+
+        punpcklbw       mm1,            mm0                 ;
+        pmullw          mm1,            [rax]               ;
+
+        punpcklbw       mm3,            mm0                 ;
+        pmullw          mm3,            [rax+8]             ;
+
+        paddw           mm1,            mm3                 ;
+        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
+
+        psraw           mm1,            mmx_filter_shift    ;
+        movq            mm3,            mm5                 ;
+
+        movq            mm5,            mm1                 ;
+        pmullw          mm3,            [rdx]               ;
+
+        pmullw          mm1,            [rdx+8]             ;
+        paddw           mm1,            mm3                 ;
+
+
+        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
+        psraw           mm1,            mmx_filter_shift    ;
+
+        movd            mm3,            [rdi]               ;
+        punpcklbw       mm3,            mm0                 ;
+
+        psubw           mm1,            mm3                 ;
+        paddw           mm6,            mm1                 ;
+
+        pmaddwd         mm1,            mm1                 ;
+        paddd           mm7,            mm1                 ;
+
+%if ABI_IS_32BIT
+        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
+        add             rdi,            dword ptr arg(3) ;src_pixels_per_line    ;
+%else
+        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
+        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
+        add             rsi,            r8
+        add             rdi,            r9
+%endif
+        sub             rcx,            1                   ;
+        jnz             .filter_block2d_bil4x4_var_mmx_loop       ;
+
+
+        pxor            mm3,            mm3                 ;
+        pxor            mm2,            mm2                 ;
+
+        punpcklwd       mm2,            mm6                 ;
+        punpckhwd       mm3,            mm6                 ;
+
+        paddd           mm2,            mm3                 ;
+        movq            mm6,            mm2                 ;
+
+        psrlq           mm6,            32                  ;
+        paddd           mm2,            mm6                 ;
+
+        psrad           mm2,            16                  ;
+        movq            mm4,            mm7                 ;
+
+        psrlq           mm4,            32                  ;
+        paddd           mm4,            mm7                 ;
+
+        mov             rdi,            arg(6) ;sum
+        mov             rsi,            arg(7) ;sumsquared
+
+        movd            dword ptr [rdi],          mm2                 ;
+        movd            dword ptr [rsi],          mm4                 ;
+
+
+
+    ; begin epilog
+    add rsp, 16
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+
+
+;void vp9_filter_block2d_bil_var_mmx
+;(
+;    unsigned char *ref_ptr,
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,
+;    int src_pixels_per_line,
+;    unsigned int Height,
+;    unsigned short *HFilter,
+;    unsigned short *VFilter,
+;    int *sum,
+;    unsigned int *sumsquared
+;)
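+;
+; Eight-pixel-wide generalization of vp9_filter_block2d_bil4x4_var_mmx
+; above: the same two-tap horizontal + vertical bilinear filter followed
+; by sum/SSE accumulation, over Height rows of 8 pixels.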
+global sym(vp9_filter_block2d_bil_var_mmx)
+sym(vp9_filter_block2d_bil_var_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 9
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    sub         rsp, 16
+    ; end prolog
+
+        pxor            mm6,            mm6                 ;
+        pxor            mm7,            mm7                 ;
+        mov             rax,            arg(5) ;HFilter             ;
+
+        mov             rdx,            arg(6) ;VFilter             ;
+        mov             rsi,            arg(0) ;ref_ptr              ;
+
+        mov             rdi,            arg(2) ;src_ptr              ;
+        movsxd          rcx,            dword ptr arg(4) ;Height              ;
+
+        pxor            mm0,            mm0                 ;
+        movq            mm1,            [rsi]               ;
+
+        movq            mm3,            [rsi+1]             ;
+        movq            mm2,            mm1                 ;
+
+        movq            mm4,            mm3                 ;
+        punpcklbw       mm1,            mm0                 ;
+
+        punpckhbw       mm2,            mm0                 ;
+        pmullw          mm1,            [rax]               ;
+
+        pmullw          mm2,            [rax]               ;
+        punpcklbw       mm3,            mm0                 ;
+
+        punpckhbw       mm4,            mm0                 ;
+        pmullw          mm3,            [rax+8]             ;
+
+        pmullw          mm4,            [rax+8]             ;
+        paddw           mm1,            mm3                 ;
+
+        paddw           mm2,            mm4                 ;
+        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
+
+        psraw           mm1,            mmx_filter_shift    ;
+        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
+
+        psraw           mm2,            mmx_filter_shift    ;
+        movq            mm5,            mm1
+
+        packuswb        mm5,            mm2                 ;
+%if ABI_IS_32BIT
+        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
+%else
+        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
+        add             rsi,            r8
+%endif
+
+.filter_block2d_bil_var_mmx_loop:
+
+        movq            mm1,            [rsi]               ;
+        movq            mm3,            [rsi+1]             ;
+
+        movq            mm2,            mm1                 ;
+        movq            mm4,            mm3                 ;
+
+        punpcklbw       mm1,            mm0                 ;
+        punpckhbw       mm2,            mm0                 ;
+
+        pmullw          mm1,            [rax]               ;
+        pmullw          mm2,            [rax]               ;
+
+        punpcklbw       mm3,            mm0                 ;
+        punpckhbw       mm4,            mm0                 ;
+
+        pmullw          mm3,            [rax+8]             ;
+        pmullw          mm4,            [rax+8]             ;
+
+        paddw           mm1,            mm3                 ;
+        paddw           mm2,            mm4                 ;
+
+        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
+        psraw           mm1,            mmx_filter_shift    ;
+
+        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
+        psraw           mm2,            mmx_filter_shift    ;
+
+        movq            mm3,            mm5                 ;
+        movq            mm4,            mm5                 ;
+
+        punpcklbw       mm3,            mm0                 ;
+        punpckhbw       mm4,            mm0                 ;
+
+        movq            mm5,            mm1                 ;
+        packuswb        mm5,            mm2                 ;
+
+        pmullw          mm3,            [rdx]               ;
+        pmullw          mm4,            [rdx]               ;
+
+        pmullw          mm1,            [rdx+8]             ;
+        pmullw          mm2,            [rdx+8]             ;
+
+        paddw           mm1,            mm3                 ;
+        paddw           mm2,            mm4                 ;
+
+        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
+        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
+
+        psraw           mm1,            mmx_filter_shift    ;
+        psraw           mm2,            mmx_filter_shift    ;
+
+        movq            mm3,            [rdi]               ;
+        movq            mm4,            mm3                 ;
+
+        punpcklbw       mm3,            mm0                 ;
+        punpckhbw       mm4,            mm0                 ;
+
+        psubw           mm1,            mm3                 ;
+        psubw           mm2,            mm4                 ;
+
+        paddw           mm6,            mm1                 ;
+        pmaddwd         mm1,            mm1                 ;
+
+        paddw           mm6,            mm2                 ;
+        pmaddwd         mm2,            mm2                 ;
+
+        paddd           mm7,            mm1                 ;
+        paddd           mm7,            mm2                 ;
+
+%if ABI_IS_32BIT
+        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
+        add             rdi,            dword ptr arg(3) ;src_pixels_per_line    ;
+%else
+        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line    ;
+        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line    ;
+        add             rsi,            r8
+        add             rdi,            r9
+%endif
+        sub             rcx,            1                   ;
+        jnz             .filter_block2d_bil_var_mmx_loop       ;
+
+
+        pxor            mm3,            mm3                 ;
+        pxor            mm2,            mm2                 ;
+
+        punpcklwd       mm2,            mm6                 ;
+        punpckhwd       mm3,            mm6                 ;
+
+        paddd           mm2,            mm3                 ;
+        movq            mm6,            mm2                 ;
+
+        psrlq           mm6,            32                  ;
+        paddd           mm2,            mm6                 ;
+
+        psrad           mm2,            16                  ;
+        movq            mm4,            mm7                 ;
+
+        psrlq           mm4,            32                  ;
+        paddd           mm4,            mm7                 ;
+
+        mov             rdi,            arg(7) ;sum
+        mov             rsi,            arg(8) ;sumsquared
+
+        movd            dword ptr [rdi],          mm2                 ;
+        movd            dword ptr [rsi],          mm4                 ;
+
+    ; begin epilog
+    add rsp, 16
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+SECTION_RODATA
+;short mmx_bi_rd[4] = { 64, 64, 64, 64};
+align 16
+mmx_bi_rd:
+    times 4 dw 64
--- /dev/null
+++ b/vp9/encoder/x86/variance_impl_sse2.asm
@@ -1,0 +1,1367 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define xmm_filter_shift            7
+
+;unsigned int vp9_get_mb_ss_sse2
+;(
+;    short *src_ptr
+;)
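+;
+; Rough C equivalent (an illustrative sketch, not part of the source):
+; the routine returns the sum of squares of the 256 shorts in a 16x16
+; block, consuming 32 values per pass through .NEXTROW.
+;
+;    unsigned int get_mb_ss_ref(const short *src_ptr) {
+;      unsigned int sum = 0;
+;      int i;
+;      for (i = 0; i < 256; i++)
+;        sum += src_ptr[i] * src_ptr[i];
+;      return sum;
+;    }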
+global sym(vp9_get_mb_ss_sse2)
+sym(vp9_get_mb_ss_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 1
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    sub         rsp, 16
+    ; end prolog
+
+
+        mov         rax, arg(0) ;[src_ptr]
+        mov         rcx, 8
+        pxor        xmm4, xmm4
+
+.NEXTROW:
+        movdqa      xmm0, [rax]
+        movdqa      xmm1, [rax+16]
+        movdqa      xmm2, [rax+32]
+        movdqa      xmm3, [rax+48]
+        pmaddwd     xmm0, xmm0
+        pmaddwd     xmm1, xmm1
+        pmaddwd     xmm2, xmm2
+        pmaddwd     xmm3, xmm3
+
+        paddd       xmm0, xmm1
+        paddd       xmm2, xmm3
+        paddd       xmm4, xmm0
+        paddd       xmm4, xmm2
+
+        add         rax, 0x40
+        dec         rcx
+        ja          .NEXTROW            ; acts as jnz here: dec sets ZF, and CF is clear from the add above
+
+        movdqa      xmm3,xmm4
+        psrldq      xmm4,8
+        paddd       xmm4,xmm3
+        movdqa      xmm3,xmm4
+        psrldq      xmm4,4
+        paddd       xmm4,xmm3
+        movq        rax,xmm4
+
+
+    ; begin epilog
+    add rsp, 16
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;unsigned int vp9_get16x16var_sse2
+;(
+;    unsigned char   *  src_ptr,
+;    int             source_stride,
+;    unsigned char   *  ref_ptr,
+;    int             recon_stride,
+;    unsigned int    *  SSE,
+;    int             *  Sum
+;)
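+;
+; Rough C equivalent (an illustrative sketch; the names are ours): the
+; routine fills *Sum with the sum of pixel differences and *SSE with the
+; sum of squared differences over a 16x16 block; callers typically derive
+; the variance as SSE - Sum*Sum/256.
+;
+;    void get16x16var_ref(const unsigned char *src, int src_stride,
+;                         const unsigned char *ref, int ref_stride,
+;                         unsigned int *sse, int *sum) {
+;      int i, j, s = 0;
+;      unsigned int ss = 0;
+;      for (i = 0; i < 16; i++) {
+;        for (j = 0; j < 16; j++) {
+;          const int d = src[j] - ref[j];
+;          s += d;
+;          ss += d * d;
+;        }
+;        src += src_stride;
+;        ref += ref_stride;
+;      }
+;      *sum = s;
+;      *sse = ss;
+;    }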
+global sym(vp9_get16x16var_sse2)
+sym(vp9_get16x16var_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov         rsi,            arg(0) ;[src_ptr]
+        mov         rdi,            arg(2) ;[ref_ptr]
+
+        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
+        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
+
+        ; Prefetch data
+        lea             rcx,    [rax+rax*2]
+        prefetcht0      [rsi]
+        prefetcht0      [rsi+rax]
+        prefetcht0      [rsi+rax*2]
+        prefetcht0      [rsi+rcx]
+        lea             rbx,    [rsi+rax*4]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+rax]
+        prefetcht0      [rbx+rax*2]
+        prefetcht0      [rbx+rcx]
+
+        lea             rcx,    [rdx+rdx*2]
+        prefetcht0      [rdi]
+        prefetcht0      [rdi+rdx]
+        prefetcht0      [rdi+rdx*2]
+        prefetcht0      [rdi+rcx]
+        lea             rbx,    [rdi+rdx*4]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+rdx]
+        prefetcht0      [rbx+rdx*2]
+        prefetcht0      [rbx+rcx]
+
+        pxor        xmm0,           xmm0                        ; clear xmm0 for unpack
+        pxor        xmm7,           xmm7                        ; clear xmm7 for accumulating diffs
+
+        pxor        xmm6,           xmm6                        ; clear xmm6 for accumulating sse
+        mov         rcx,            16
+
+.var16loop:
+        movdqu      xmm1,           XMMWORD PTR [rsi]
+        movdqu      xmm2,           XMMWORD PTR [rdi]
+
+        prefetcht0      [rsi+rax*8]
+        prefetcht0      [rdi+rdx*8]
+
+        movdqa      xmm3,           xmm1
+        movdqa      xmm4,           xmm2
+
+
+        punpcklbw   xmm1,           xmm0
+        punpckhbw   xmm3,           xmm0
+
+        punpcklbw   xmm2,           xmm0
+        punpckhbw   xmm4,           xmm0
+
+
+        psubw       xmm1,           xmm2
+        psubw       xmm3,           xmm4
+
+        paddw       xmm7,           xmm1
+        pmaddwd     xmm1,           xmm1
+
+        paddw       xmm7,           xmm3
+        pmaddwd     xmm3,           xmm3
+
+        paddd       xmm6,           xmm1
+        paddd       xmm6,           xmm3
+
+        add         rsi,            rax
+        add         rdi,            rdx
+
+        sub         rcx,            1
+        jnz         .var16loop
+
+
+        movdqa      xmm1,           xmm6
+        pxor        xmm6,           xmm6
+
+        pxor        xmm5,           xmm5
+        punpcklwd   xmm6,           xmm7
+
+        punpckhwd   xmm5,           xmm7
+        psrad       xmm5,           16
+
+        psrad       xmm6,           16
+        paddd       xmm6,           xmm5
+
+        movdqa      xmm2,           xmm1
+        punpckldq   xmm1,           xmm0
+
+        punpckhdq   xmm2,           xmm0
+        movdqa      xmm7,           xmm6
+
+        paddd       xmm1,           xmm2
+        punpckldq   xmm6,           xmm0
+
+        punpckhdq   xmm7,           xmm0
+        paddd       xmm6,           xmm7
+
+        movdqa      xmm2,           xmm1
+        movdqa      xmm7,           xmm6
+
+        psrldq      xmm1,           8
+        psrldq      xmm6,           8
+
+        paddd       xmm7,           xmm6
+        paddd       xmm1,           xmm2
+
+        mov         rax,            arg(5) ;[Sum]
+        mov         rdi,            arg(4) ;[SSE]
+
+        movd DWORD PTR [rax],       xmm7
+        movd DWORD PTR [rdi],       xmm1
+
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    pop rbx
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+
+
+;unsigned int vp9_get8x8var_sse2
+;(
+;    unsigned char   *  src_ptr,
+;    int             source_stride,
+;    unsigned char   *  ref_ptr,
+;    int             recon_stride,
+;    unsigned int    *  SSE,
+;    int             *  Sum
+;)
+global sym(vp9_get8x8var_sse2)
+sym(vp9_get8x8var_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    sub         rsp, 16
+    ; end prolog
+
+        mov         rsi,            arg(0) ;[src_ptr]
+        mov         rdi,            arg(2) ;[ref_ptr]
+
+        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
+        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
+
+        pxor        xmm0,           xmm0                        ; clear xmm0 for unpack
+        pxor        xmm7,           xmm7                        ; clear xmm7 for accumulating diffs
+
+        movq        xmm1,           QWORD PTR [rsi]
+        movq        xmm2,           QWORD PTR [rdi]
+
+        punpcklbw   xmm1,           xmm0
+        punpcklbw   xmm2,           xmm0
+
+        psubsw      xmm1,           xmm2
+        paddw       xmm7,           xmm1
+
+        pmaddwd     xmm1,           xmm1
+
+        movq        xmm2,           QWORD PTR[rsi + rax]
+        movq        xmm3,           QWORD PTR[rdi + rdx]
+
+        punpcklbw   xmm2,           xmm0
+        punpcklbw   xmm3,           xmm0
+
+        psubsw      xmm2,           xmm3
+        paddw       xmm7,           xmm2
+
+        pmaddwd     xmm2,           xmm2
+        paddd       xmm1,           xmm2
+
+
+        movq        xmm2,           QWORD PTR[rsi + rax * 2]
+        movq        xmm3,           QWORD PTR[rdi + rdx * 2]
+
+        punpcklbw   xmm2,           xmm0
+        punpcklbw   xmm3,           xmm0
+
+        psubsw      xmm2,           xmm3
+        paddw       xmm7,           xmm2
+
+        pmaddwd     xmm2,           xmm2
+        paddd       xmm1,           xmm2
+
+
+        lea         rsi,            [rsi + rax * 2]
+        lea         rdi,            [rdi + rdx * 2]
+        movq        xmm2,           QWORD PTR[rsi + rax]
+        movq        xmm3,           QWORD PTR[rdi + rdx]
+
+        punpcklbw   xmm2,           xmm0
+        punpcklbw   xmm3,           xmm0
+
+        psubsw      xmm2,           xmm3
+        paddw       xmm7,           xmm2
+
+        pmaddwd     xmm2,           xmm2
+        paddd       xmm1,           xmm2
+
+        movq        xmm2,           QWORD PTR[rsi + rax *2]
+        movq        xmm3,           QWORD PTR[rdi + rdx *2]
+
+        punpcklbw   xmm2,           xmm0
+        punpcklbw   xmm3,           xmm0
+
+        psubsw      xmm2,           xmm3
+        paddw       xmm7,           xmm2
+
+        pmaddwd     xmm2,           xmm2
+        paddd       xmm1,           xmm2
+
+
+        lea         rsi,            [rsi + rax * 2]
+        lea         rdi,            [rdi + rdx * 2]
+
+
+        movq        xmm2,           QWORD PTR[rsi + rax]
+        movq        xmm3,           QWORD PTR[rdi + rdx]
+
+        punpcklbw   xmm2,           xmm0
+        punpcklbw   xmm3,           xmm0
+
+        psubsw      xmm2,           xmm3
+        paddw       xmm7,           xmm2
+
+        pmaddwd     xmm2,           xmm2
+        paddd       xmm1,           xmm2
+
+        movq        xmm2,           QWORD PTR[rsi + rax *2]
+        movq        xmm3,           QWORD PTR[rdi + rdx *2]
+
+        punpcklbw   xmm2,           xmm0
+        punpcklbw   xmm3,           xmm0
+
+        psubsw      xmm2,           xmm3
+        paddw       xmm7,           xmm2
+
+        pmaddwd     xmm2,           xmm2
+        paddd       xmm1,           xmm2
+
+
+        lea         rsi,            [rsi + rax * 2]
+        lea         rdi,            [rdi + rdx * 2]
+
+        movq        xmm2,           QWORD PTR[rsi + rax]
+        movq        xmm3,           QWORD PTR[rdi + rdx]
+
+        punpcklbw   xmm2,           xmm0
+        punpcklbw   xmm3,           xmm0
+
+        psubsw      xmm2,           xmm3
+        paddw       xmm7,           xmm2
+
+        pmaddwd     xmm2,           xmm2
+        paddd       xmm1,           xmm2
+
+
+        movdqa      xmm6,           xmm7
+        punpcklwd   xmm6,           xmm0
+
+        punpckhwd   xmm7,           xmm0
+        movdqa      xmm2,           xmm1
+
+        paddw       xmm6,           xmm7
+        punpckldq   xmm1,           xmm0
+
+        punpckhdq   xmm2,           xmm0
+        movdqa      xmm7,           xmm6
+
+        paddd       xmm1,           xmm2
+        punpckldq   xmm6,           xmm0
+
+        punpckhdq   xmm7,           xmm0
+        paddw       xmm6,           xmm7
+
+        movdqa      xmm2,           xmm1
+        movdqa      xmm7,           xmm6
+
+        psrldq      xmm1,           8
+        psrldq      xmm6,           8
+
+        paddw       xmm7,           xmm6
+        paddd       xmm1,           xmm2
+
+        mov         rax,            arg(5) ;[Sum]
+        mov         rdi,            arg(4) ;[SSE]
+
+        movq        rdx,            xmm7
+        movsx       rcx,            dx
+
+        mov  dword ptr [rax],       ecx
+        movd DWORD PTR [rdi],       xmm1
+
+    ; begin epilog
+    add rsp, 16
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_filter_block2d_bil_var_sse2
+;(
+;    unsigned char *ref_ptr,
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,
+;    int src_pixels_per_line,
+;    unsigned int Height,
+;    int  xoffset,
+;    int  yoffset,
+;    int *sum,
+;    unsigned int *sumsquared
+;)
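+;
+; Per-pixel arithmetic of the two passes in C terms (a hedged sketch;
+; "first"/"above" are illustrative names): each pass blends two taps
+; from the 128-scale table below, rounds by 64 (xmm_bi_rd) and shifts
+; by xmm_filter_shift (7).
+;
+;    first[c]  = (ref[c] * h0 + ref[c + 1] * h1 + 64) >> 7;   /* horizontal */
+;    second[c] = (above[c] * v0 + first[c] * v1 + 64) >> 7;   /* vertical   */
+;    diff      = second[c] - src[c];
+;    *sum        += diff;
+;    *sumsquared += diff * diff;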
+global sym(vp9_filter_block2d_bil_var_sse2)
+sym(vp9_filter_block2d_bil_var_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 9
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    push rbx
+    ; end prolog
+
+        pxor            xmm6,           xmm6                 ;
+        pxor            xmm7,           xmm7                 ;
+
+        lea             rsi,            [GLOBAL(xmm_bi_rd)]  ; rounding
+        movdqa          xmm4,           XMMWORD PTR [rsi]
+
+        lea             rcx,            [GLOBAL(bilinear_filters_sse2)]
+        movsxd          rax,            dword ptr arg(5)     ; xoffset
+
+        cmp             rax,            0                    ; skip first_pass filter if xoffset=0
+        je              filter_block2d_bil_var_sse2_sp_only
+
+        shl             rax,            5                    ; point to filter coeff with xoffset
+        lea             rax,            [rax + rcx]          ; HFilter
+
+        movsxd          rdx,            dword ptr arg(6)     ; yoffset
+
+        cmp             rdx,            0                    ; skip second_pass filter if yoffset=0
+        je              filter_block2d_bil_var_sse2_fp_only
+
+        shl             rdx,            5
+        lea             rdx,            [rdx + rcx]          ; VFilter
+
+        mov             rsi,            arg(0)               ;ref_ptr
+        mov             rdi,            arg(2)               ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)     ;Height
+
+        pxor            xmm0,           xmm0                 ;
+        movq            xmm1,           QWORD PTR [rsi]      ;
+        movq            xmm3,           QWORD PTR [rsi+1]    ;
+
+        punpcklbw       xmm1,           xmm0                 ;
+        pmullw          xmm1,           [rax]                ;
+        punpcklbw       xmm3,           xmm0
+        pmullw          xmm3,           [rax+16]             ;
+
+        paddw           xmm1,           xmm3                 ;
+        paddw           xmm1,           xmm4                 ;
+        psraw           xmm1,           xmm_filter_shift     ;
+        movdqa          xmm5,           xmm1
+
+        movsxd          rbx,            dword ptr arg(1) ;ref_pixels_per_line
+        lea             rsi,            [rsi + rbx]
+%if ABI_IS_32BIT=0
+        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+filter_block2d_bil_var_sse2_loop:
+        movq            xmm1,           QWORD PTR [rsi]               ;
+        movq            xmm3,           QWORD PTR [rsi+1]             ;
+
+        punpcklbw       xmm1,           xmm0                 ;
+        pmullw          xmm1,           [rax]               ;
+        punpcklbw       xmm3,           xmm0                 ;
+        pmullw          xmm3,           [rax+16]             ;
+
+        paddw           xmm1,           xmm3                 ;
+        paddw           xmm1,           xmm4               ;
+        psraw           xmm1,           xmm_filter_shift    ;
+
+        movdqa          xmm3,           xmm5                 ;
+        movdqa          xmm5,           xmm1                 ;
+
+        pmullw          xmm3,           [rdx]               ;
+        pmullw          xmm1,           [rdx+16]             ;
+        paddw           xmm1,           xmm3                 ;
+        paddw           xmm1,           xmm4                 ;
+        psraw           xmm1,           xmm_filter_shift    ;
+
+        movq            xmm3,           QWORD PTR [rdi]               ;
+        punpcklbw       xmm3,           xmm0                 ;
+
+        psubw           xmm1,           xmm3                 ;
+        paddw           xmm6,           xmm1                 ;
+
+        pmaddwd         xmm1,           xmm1                 ;
+        paddd           xmm7,           xmm1                 ;
+
+        lea             rsi,            [rsi + rbx]          ;ref_pixels_per_line
+%if ABI_IS_32BIT
+        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
+%else
+        lea             rdi,            [rdi + r9]
+%endif
+
+        sub             rcx,            1                   ;
+        jnz             filter_block2d_bil_var_sse2_loop       ;
+
+        jmp             filter_block2d_bil_variance
+
+filter_block2d_bil_var_sse2_sp_only:
+        movsxd          rdx,            dword ptr arg(6)     ; yoffset
+
+        cmp             rdx,            0                    ; skip all if both xoffset=0 and yoffset=0
+        je              filter_block2d_bil_var_sse2_full_pixel
+
+        shl             rdx,            5
+        lea             rdx,            [rdx + rcx]          ; VFilter
+
+        mov             rsi,            arg(0)               ;ref_ptr
+        mov             rdi,            arg(2)               ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)     ;Height
+        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
+
+        pxor            xmm0,           xmm0                 ;
+        movq            xmm1,           QWORD PTR [rsi]      ;
+        punpcklbw       xmm1,           xmm0                 ;
+
+        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
+        lea             rsi,            [rsi + rax]
+
+filter_block2d_bil_sp_only_loop:
+        movq            xmm3,           QWORD PTR [rsi]             ;
+        punpcklbw       xmm3,           xmm0                 ;
+        movdqa          xmm5,           xmm3
+
+        pmullw          xmm1,           [rdx]               ;
+        pmullw          xmm3,           [rdx+16]             ;
+        paddw           xmm1,           xmm3                 ;
+        paddw           xmm1,           xmm4                 ;
+        psraw           xmm1,           xmm_filter_shift    ;
+
+        movq            xmm3,           QWORD PTR [rdi]               ;
+        punpcklbw       xmm3,           xmm0                 ;
+
+        psubw           xmm1,           xmm3                 ;
+        paddw           xmm6,           xmm1                 ;
+
+        pmaddwd         xmm1,           xmm1                 ;
+        paddd           xmm7,           xmm1                 ;
+
+        movdqa          xmm1,           xmm5                 ;
+        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
+        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
+
+        sub             rcx,            1                   ;
+        jnz             filter_block2d_bil_sp_only_loop       ;
+
+        jmp             filter_block2d_bil_variance
+
+filter_block2d_bil_var_sse2_full_pixel:
+        mov             rsi,            arg(0)               ;ref_ptr
+        mov             rdi,            arg(2)               ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)     ;Height
+        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
+        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
+        pxor            xmm0,           xmm0                 ;
+
+filter_block2d_bil_full_pixel_loop:
+        movq            xmm1,           QWORD PTR [rsi]               ;
+        punpcklbw       xmm1,           xmm0                 ;
+
+        movq            xmm2,           QWORD PTR [rdi]               ;
+        punpcklbw       xmm2,           xmm0                 ;
+
+        psubw           xmm1,           xmm2                 ;
+        paddw           xmm6,           xmm1                 ;
+
+        pmaddwd         xmm1,           xmm1                 ;
+        paddd           xmm7,           xmm1                 ;
+
+        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
+        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
+
+        sub             rcx,            1                   ;
+        jnz             filter_block2d_bil_full_pixel_loop       ;
+
+        jmp             filter_block2d_bil_variance
+
+filter_block2d_bil_var_sse2_fp_only:
+        mov             rsi,            arg(0)               ;ref_ptr
+        mov             rdi,            arg(2)               ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)     ;Height
+        movsxd          rdx,            dword ptr arg(1)     ;ref_pixels_per_line
+
+        pxor            xmm0,           xmm0                 ;
+        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
+
+filter_block2d_bil_fp_only_loop:
+        movq            xmm1,           QWORD PTR [rsi]       ;
+        movq            xmm3,           QWORD PTR [rsi+1]     ;
+
+        punpcklbw       xmm1,           xmm0                 ;
+        pmullw          xmm1,           [rax]               ;
+        punpcklbw       xmm3,           xmm0                 ;
+        pmullw          xmm3,           [rax+16]             ;
+
+        paddw           xmm1,           xmm3                 ;
+        paddw           xmm1,           xmm4  ;
+        psraw           xmm1,           xmm_filter_shift    ;
+
+        movq            xmm3,           QWORD PTR [rdi]     ;
+        punpcklbw       xmm3,           xmm0                 ;
+
+        psubw           xmm1,           xmm3                 ;
+        paddw           xmm6,           xmm1                 ;
+
+        pmaddwd         xmm1,           xmm1                 ;
+        paddd           xmm7,           xmm1                 ;
+        lea             rsi,            [rsi + rdx]
+        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
+
+        sub             rcx,            1                   ;
+        jnz             filter_block2d_bil_fp_only_loop       ;
+
+        jmp             filter_block2d_bil_variance
+
+filter_block2d_bil_variance:
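+        ; Reduction note (added): xmm6 holds signed 16-bit difference sums
+        ; and xmm7 32-bit squared sums.  They are folded into MMX registers,
+        ; the word unpacks against zero place each sum in the top half of a
+        ; dword, and psrad 16 restores the sign before the final stores.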
+        movdq2q         mm6,            xmm6                ;
+        movdq2q         mm7,            xmm7                ;
+
+        psrldq          xmm6,           8
+        psrldq          xmm7,           8
+
+        movdq2q         mm2,            xmm6
+        movdq2q         mm3,            xmm7
+
+        paddw           mm6,            mm2
+        paddd           mm7,            mm3
+
+        pxor            mm3,            mm3                 ;
+        pxor            mm2,            mm2                 ;
+
+        punpcklwd       mm2,            mm6                 ;
+        punpckhwd       mm3,            mm6                 ;
+
+        paddd           mm2,            mm3                 ;
+        movq            mm6,            mm2                 ;
+
+        psrlq           mm6,            32                  ;
+        paddd           mm2,            mm6                 ;
+
+        psrad           mm2,            16                  ;
+        movq            mm4,            mm7                 ;
+
+        psrlq           mm4,            32                  ;
+        paddd           mm4,            mm7                 ;
+
+        mov             rsi,            arg(7) ; sum
+        mov             rdi,            arg(8) ; sumsquared
+
+        movd            [rsi],          mm2    ; xsum
+        movd            [rdi],          mm4    ; xxsum
+
+    ; begin epilog
+    pop rbx
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_half_horiz_vert_variance8x_h_sse2
+;(
+;    unsigned char *ref_ptr,
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,
+;    int src_pixels_per_line,
+;    unsigned int Height,
+;    int *sum,
+;    unsigned int *sumsquared
+;)
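+;
+; Per-pixel arithmetic (a sketch): the half-pel predictor is built from
+; two rounded 2-tap averages (pavgb), approximating the 4-tap average
+; (a + b + c + d + 2) >> 2:
+;
+;    h0 = (row0[c] + row0[c + 1] + 1) >> 1;   /* pavgb, line i   */
+;    h1 = (row1[c] + row1[c + 1] + 1) >> 1;   /* pavgb, line i+1 */
+;    p  = (h0 + h1 + 1) >> 1;                 /* pavgb, vertical */
+;    d  = p - src[c];
+;    sum += d;
+;    sse += d * d;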
+global sym(vp9_half_horiz_vert_variance8x_h_sse2)
+sym(vp9_half_horiz_vert_variance8x_h_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+%if ABI_IS_32BIT=0
+    movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
+    movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+        pxor            xmm6,           xmm6                ;  error accumulator
+        pxor            xmm7,           xmm7                ;  sse accumulator
+        mov             rsi,            arg(0) ;ref_ptr              ;
+
+        mov             rdi,            arg(2) ;src_ptr              ;
+        movsxd          rcx,            dword ptr arg(4) ;Height              ;
+        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
+
+        pxor            xmm0,           xmm0                ;
+
+        movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s7
+        movq            xmm3,           QWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s8
+        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3) horizontal line 1
+
+%if ABI_IS_32BIT
+        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
+%else
+        add             rsi, r8
+%endif
+
+.half_horiz_vert_variance8x_h_1:
+
+        movq            xmm1,           QWORD PTR [rsi]     ;
+        movq            xmm2,           QWORD PTR [rsi+1]   ;
+        pavgb           xmm1,           xmm2                ;  xmm1 = avg(xmm1,xmm2) horizontal line i+1
+
+        pavgb           xmm5,           xmm1                ;  xmm5 = vertical average of the above
+        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
+
+        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
+        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
+
+        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
+        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
+        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
+        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
+
+        movdqa          xmm5,           xmm1                ;  save xmm1 for use on the next row
+
+%if ABI_IS_32BIT
+        add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
+        add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next destination
+%else
+        add             rsi, r8
+        add             rdi, r9
+%endif
+
+        sub             rcx,            1                   ;
+        jnz             .half_horiz_vert_variance8x_h_1     ;
+
+        movdq2q         mm6,            xmm6                ;
+        movdq2q         mm7,            xmm7                ;
+
+        psrldq          xmm6,           8
+        psrldq          xmm7,           8
+
+        movdq2q         mm2,            xmm6
+        movdq2q         mm3,            xmm7
+
+        paddw           mm6,            mm2
+        paddd           mm7,            mm3
+
+        pxor            mm3,            mm3                 ;
+        pxor            mm2,            mm2                 ;
+
+        punpcklwd       mm2,            mm6                 ;
+        punpckhwd       mm3,            mm6                 ;
+
+        paddd           mm2,            mm3                 ;
+        movq            mm6,            mm2                 ;
+
+        psrlq           mm6,            32                  ;
+        paddd           mm2,            mm6                 ;
+
+        psrad           mm2,            16                  ;
+        movq            mm4,            mm7                 ;
+
+        psrlq           mm4,            32                  ;
+        paddd           mm4,            mm7                 ;
+
+        mov             rsi,            arg(5) ; sum
+        mov             rdi,            arg(6) ; sumsquared
+
+        movd            [rsi],          mm2                 ;
+        movd            [rdi],          mm4                 ;
+
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_half_horiz_vert_variance16x_h_sse2
+;(
+;    unsigned char *ref_ptr,
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,
+;    int src_pixels_per_line,
+;    unsigned int Height,
+;    int *sum,
+;    unsigned int *sumsquared
+;)
+global sym(vp9_half_horiz_vert_variance16x_h_sse2)
+sym(vp9_half_horiz_vert_variance16x_h_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+        pxor            xmm6,           xmm6                ;  error accumulator
+        pxor            xmm7,           xmm7                ;  sse accumulator
+        mov             rsi,            arg(0) ;ref_ptr              ;
+
+        mov             rdi,            arg(2) ;src_ptr              ;
+        movsxd          rcx,            dword ptr arg(4) ;Height              ;
+        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
+        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
+
+        pxor            xmm0,           xmm0                ;
+
+        movdqu          xmm5,           XMMWORD PTR [rsi]
+        movdqu          xmm3,           XMMWORD PTR [rsi+1]
+        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3) horizontal line 1
+
+        lea             rsi,            [rsi + rax]
+
+.half_horiz_vert_variance16x_h_1:
+        movdqu          xmm1,           XMMWORD PTR [rsi]     ;
+        movdqu          xmm2,           XMMWORD PTR [rsi+1]   ;
+        pavgb           xmm1,           xmm2                ;  xmm1 = avg(xmm1,xmm2) horizontal line i+1
+
+        pavgb           xmm5,           xmm1                ;  xmm5 = vertical average of the above
+
+        movdqa          xmm4,           xmm5
+        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
+        punpckhbw       xmm4,           xmm0
+
+        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
+        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
+        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
+
+        movq            xmm3,           QWORD PTR [rdi+8]
+        punpcklbw       xmm3,           xmm0
+        psubw           xmm4,           xmm3
+
+        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
+        paddw           xmm6,           xmm4
+        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
+        pmaddwd         xmm4,           xmm4
+        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
+        paddd           xmm7,           xmm4
+
+        movdqa          xmm5,           xmm1                ;  save xmm1 for use on the next row
+
+        lea             rsi,            [rsi + rax]
+        lea             rdi,            [rdi + rdx]
+
+        sub             rcx,            1                   ;
+        jnz             .half_horiz_vert_variance16x_h_1    ;
+
+        pxor        xmm1,           xmm1
+        pxor        xmm5,           xmm5
+
+        punpcklwd   xmm0,           xmm6
+        punpckhwd   xmm1,           xmm6
+        psrad       xmm0,           16
+        psrad       xmm1,           16
+        paddd       xmm0,           xmm1
+        movdqa      xmm1,           xmm0
+
+        movdqa      xmm6,           xmm7
+        punpckldq   xmm6,           xmm5
+        punpckhdq   xmm7,           xmm5
+        paddd       xmm6,           xmm7
+
+        punpckldq   xmm0,           xmm5
+        punpckhdq   xmm1,           xmm5
+        paddd       xmm0,           xmm1
+
+        movdqa      xmm7,           xmm6
+        movdqa      xmm1,           xmm0
+
+        psrldq      xmm7,           8
+        psrldq      xmm1,           8
+
+        paddd       xmm6,           xmm7
+        paddd       xmm0,           xmm1
+
+        mov         rsi,            arg(5) ;[Sum]
+        mov         rdi,            arg(6) ;[SSE]
+
+        movd        [rsi],       xmm0
+        movd        [rdi],       xmm6
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_half_vert_variance8x_h_sse2
+;(
+;    unsigned char *ref_ptr,
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,
+;    int src_pixels_per_line,
+;    unsigned int Height,
+;    int *sum,
+;    unsigned int *sumsquared
+;)
+global sym(vp9_half_vert_variance8x_h_sse2)
+sym(vp9_half_vert_variance8x_h_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+%if ABI_IS_32BIT=0
+    movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
+    movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+        pxor            xmm6,           xmm6                ;  error accumulator
+        pxor            xmm7,           xmm7                ;  sse accumulator
+        mov             rsi,            arg(0) ;ref_ptr              ;
+
+        mov             rdi,            arg(2) ;src_ptr              ;
+        movsxd          rcx,            dword ptr arg(4) ;Height              ;
+        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
+
+        pxor            xmm0,           xmm0                ;
+.half_vert_variance8x_h_1:
+        movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = row i:   s0,s1,s2..s7
+        movq            xmm3,           QWORD PTR [rsi+rax] ;  xmm3 = row i+1: t0,t1,t2..t7
+
+        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3)
+        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
+
+        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
+        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
+
+        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
+        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
+        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
+        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
+
+%if ABI_IS_32BIT
+        add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
+        add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next destination
+%else
+        add             rsi, r8
+        add             rdi, r9
+%endif
+
+        sub             rcx,            1                   ;
+        jnz             .half_vert_variance8x_h_1          ;
+
+        movdq2q         mm6,            xmm6                ;
+        movdq2q         mm7,            xmm7                ;
+
+        psrldq          xmm6,           8
+        psrldq          xmm7,           8
+
+        movdq2q         mm2,            xmm6
+        movdq2q         mm3,            xmm7
+
+        paddw           mm6,            mm2
+        paddd           mm7,            mm3
+
+        pxor            mm3,            mm3                 ;
+        pxor            mm2,            mm2                 ;
+
+        punpcklwd       mm2,            mm6                 ;
+        punpckhwd       mm3,            mm6                 ;
+
+        paddd           mm2,            mm3                 ;
+        movq            mm6,            mm2                 ;
+
+        psrlq           mm6,            32                  ;
+        paddd           mm2,            mm6                 ;
+
+        psrad           mm2,            16                  ;
+        movq            mm4,            mm7                 ;
+
+        psrlq           mm4,            32                  ;
+        paddd           mm4,            mm7                 ;
+
+        mov             rsi,            arg(5) ; sum
+        mov             rdi,            arg(6) ; sumsquared
+
+        movd            [rsi],          mm2                 ;
+        movd            [rdi],          mm4                 ;
+
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_half_vert_variance16x_h_sse2
+;(
+;    unsigned char *ref_ptr,
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,
+;    int src_pixels_per_line,
+;    unsigned int Height,
+;    int *sum,
+;    unsigned int *sumsquared
+;)
+global sym(vp9_half_vert_variance16x_h_sse2)
+sym(vp9_half_vert_variance16x_h_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+        pxor            xmm6,           xmm6                ;  error accumulator
+        pxor            xmm7,           xmm7                ;  sse eaccumulator
+        mov             rsi,            arg(0)              ;ref_ptr
+
+        mov             rdi,            arg(2)              ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)    ;Height
+        movsxd          rax,            dword ptr arg(1)    ;ref_pixels_per_line
+        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
+
+        movdqu          xmm5,           XMMWORD PTR [rsi]
+        lea             rsi,            [rsi + rax]
+        pxor            xmm0,           xmm0
+
+.half_vert_variance16x_h_1:
+        movdqu          xmm3,           XMMWORD PTR [rsi]
+
+        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3)
+        movdqa          xmm4,           xmm5
+        punpcklbw       xmm5,           xmm0
+        punpckhbw       xmm4,           xmm0
+
+        movq            xmm2,           QWORD PTR [rdi]
+        punpcklbw       xmm2,           xmm0
+        psubw           xmm5,           xmm2
+        movq            xmm2,           QWORD PTR [rdi+8]
+        punpcklbw       xmm2,           xmm0
+        psubw           xmm4,           xmm2
+
+        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
+        paddw           xmm6,           xmm4
+        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
+        pmaddwd         xmm4,           xmm4
+        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
+        paddd           xmm7,           xmm4
+
+        movdqa          xmm5,           xmm3
+
+        lea             rsi,            [rsi + rax]
+        lea             rdi,            [rdi + rdx]
+
+        sub             rcx,            1
+        jnz             .half_vert_variance16x_h_1
+
+        pxor        xmm1,           xmm1
+        pxor        xmm5,           xmm5
+
+        punpcklwd   xmm0,           xmm6
+        punpckhwd   xmm1,           xmm6
+        psrad       xmm0,           16
+        psrad       xmm1,           16
+        paddd       xmm0,           xmm1
+        movdqa      xmm1,           xmm0
+
+        movdqa      xmm6,           xmm7
+        punpckldq   xmm6,           xmm5
+        punpckhdq   xmm7,           xmm5
+        paddd       xmm6,           xmm7
+
+        punpckldq   xmm0,           xmm5
+        punpckhdq   xmm1,           xmm5
+        paddd       xmm0,           xmm1
+
+        movdqa      xmm7,           xmm6
+        movdqa      xmm1,           xmm0
+
+        psrldq      xmm7,           8
+        psrldq      xmm1,           8
+
+        paddd       xmm6,           xmm7
+        paddd       xmm0,           xmm1
+
+        mov         rsi,            arg(5) ;[Sum]
+        mov         rdi,            arg(6) ;[SSE]
+
+        movd        [rsi],       xmm0
+        movd        [rdi],       xmm6
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_half_horiz_variance8x_h_sse2
+;(
+;    unsigned char *ref_ptr,
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,
+;    int src_pixels_per_line,
+;    unsigned int Height,
+;    int *sum,
+;    unsigned int *sumsquared
+;)
+global sym(vp9_half_horiz_variance8x_h_sse2)
+sym(vp9_half_horiz_variance8x_h_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+%if ABI_IS_32BIT=0
+    movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
+    movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+        pxor            xmm6,           xmm6                ;  error accumulator
+        pxor            xmm7,           xmm7                ;  sse accumulator
+        mov             rsi,            arg(0) ;ref_ptr              ;
+
+        mov             rdi,            arg(2) ;src_ptr              ;
+        movsxd          rcx,            dword ptr arg(4) ;Height              ;
+
+        pxor            xmm0,           xmm0                ;
+.half_horiz_variance8x_h_1:
+        movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s7
+        movq            xmm3,           QWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s8
+
+        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3)
+        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
+
+        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
+        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
+
+        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
+        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
+        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
+        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
+
+%if ABI_IS_32BIT
+        add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
+        add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next destination
+%else
+        add             rsi, r8
+        add             rdi, r9
+%endif
+        sub             rcx,            1                   ;
+        jnz             .half_horiz_variance8x_h_1          ;
+
+        movdq2q         mm6,            xmm6                ;
+        movdq2q         mm7,            xmm7                ;
+
+        psrldq          xmm6,           8
+        psrldq          xmm7,           8
+
+        movdq2q         mm2,            xmm6
+        movdq2q         mm3,            xmm7
+
+        paddw           mm6,            mm2
+        paddd           mm7,            mm3
+
+        pxor            mm3,            mm3                 ;
+        pxor            mm2,            mm2                 ;
+
+        punpcklwd       mm2,            mm6                 ;
+        punpckhwd       mm3,            mm6                 ;
+
+        paddd           mm2,            mm3                 ;
+        movq            mm6,            mm2                 ;
+
+        psrlq           mm6,            32                  ;
+        paddd           mm2,            mm6                 ;
+
+        psrad           mm2,            16                  ;
+        movq            mm4,            mm7                 ;
+
+        psrlq           mm4,            32                  ;
+        paddd           mm4,            mm7                 ;
+
+        mov             rsi,            arg(5) ; sum
+        mov             rdi,            arg(6) ; sumsquared
+
+        movd            [rsi],          mm2                 ;
+        movd            [rdi],          mm4                 ;
+
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_half_horiz_variance16x_h_sse2
+;(
+;    unsigned char *ref_ptr,
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,
+;    int src_pixels_per_line,
+;    unsigned int Height,
+;    int *sum,
+;    unsigned int *sumsquared
+;)
+global sym(vp9_half_horiz_variance16x_h_sse2)
+sym(vp9_half_horiz_variance16x_h_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+        pxor            xmm6,           xmm6                ;  error accumulator
+        pxor            xmm7,           xmm7                ;  sse accumulator
+        mov             rsi,            arg(0) ;ref_ptr              ;
+
+        mov             rdi,            arg(2) ;src_ptr              ;
+        movsxd          rcx,            dword ptr arg(4) ;Height              ;
+        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
+        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
+
+        pxor            xmm0,           xmm0                ;
+
+.half_horiz_variance16x_h_1:
+        movdqu          xmm5,           XMMWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s15
+        movdqu          xmm3,           XMMWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s16
+
+        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3)
+        movdqa          xmm1,           xmm5
+        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
+        punpckhbw       xmm1,           xmm0
+
+        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
+        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
+        movq            xmm2,           QWORD PTR [rdi+8]
+        punpcklbw       xmm2,           xmm0
+
+        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
+        psubw           xmm1,           xmm2
+        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
+        paddw           xmm6,           xmm1
+        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
+        pmaddwd         xmm1,           xmm1
+        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
+        paddd           xmm7,           xmm1
+
+        lea             rsi,            [rsi + rax]
+        lea             rdi,            [rdi + rdx]
+
+        sub             rcx,            1                   ;
+        jnz             .half_horiz_variance16x_h_1         ;
+
+        pxor        xmm1,           xmm1
+        pxor        xmm5,           xmm5
+
+        punpcklwd   xmm0,           xmm6
+        punpckhwd   xmm1,           xmm6
+        psrad       xmm0,           16
+        psrad       xmm1,           16
+        paddd       xmm0,           xmm1
+        movdqa      xmm1,           xmm0
+
+        movdqa      xmm6,           xmm7
+        punpckldq   xmm6,           xmm5
+        punpckhdq   xmm7,           xmm5
+        paddd       xmm6,           xmm7
+
+        punpckldq   xmm0,           xmm5
+        punpckhdq   xmm1,           xmm5
+        paddd       xmm0,           xmm1
+
+        movdqa      xmm7,           xmm6
+        movdqa      xmm1,           xmm0
+
+        psrldq      xmm7,           8
+        psrldq      xmm1,           8
+
+        paddd       xmm6,           xmm7
+        paddd       xmm0,           xmm1
+
+        mov         rsi,            arg(5) ;[Sum]
+        mov         rdi,            arg(6) ;[SSE]
+
+        movd        [rsi],       xmm0
+        movd        [rdi],       xmm6
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+;    short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
+align 16
+xmm_bi_rd:
+    times 8 dw 64
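+; Each 32-byte row below holds the two taps for one of the 16 subpel
+; offsets: eight copies of (128 - 8*offset) followed by eight copies of
+; 8*offset, so the taps of a row always sum to 128 (unity gain after
+; the shift by xmm_filter_shift).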
+align 16
+bilinear_filters_sse2:
+    dw 128, 128, 128, 128, 128, 128, 128, 128,  0,  0,  0,  0,  0,  0,  0,  0
+    dw 120, 120, 120, 120, 120, 120, 120, 120,  8,  8,  8,  8,  8,  8,  8,  8
+    dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
+    dw 104, 104, 104, 104, 104, 104, 104, 104, 24, 24, 24, 24, 24, 24, 24, 24
+    dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
+    dw 88, 88, 88, 88, 88, 88, 88, 88, 40, 40, 40, 40, 40, 40, 40, 40
+    dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
+    dw 72, 72, 72, 72, 72, 72, 72, 72, 56, 56, 56, 56, 56, 56, 56, 56
+    dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+    dw 56, 56, 56, 56, 56, 56, 56, 56, 72, 72, 72, 72, 72, 72, 72, 72
+    dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
+    dw 40, 40, 40, 40, 40, 40, 40, 40, 88, 88, 88, 88, 88, 88, 88, 88
+    dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
+    dw 24, 24, 24, 24, 24, 24, 24, 24, 104, 104, 104, 104, 104, 104, 104, 104
+    dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
+    dw 8, 8, 8, 8, 8, 8, 8, 8, 120, 120, 120, 120, 120, 120, 120, 120
--- /dev/null
+++ b/vp9/encoder/x86/variance_impl_ssse3.asm
@@ -1,0 +1,372 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define xmm_filter_shift            7
+
+
+;void vp9_filter_block2d_bil_var_ssse3
+;(
+;    unsigned char *ref_ptr,
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,
+;    int src_pixels_per_line,
+;    unsigned int Height,
+;    int  xoffset,
+;    int  yoffset,
+;    int *sum,
+;    unsigned int *sumsquared
+;)
+;Note: the filter coefficient at offset=0 is 128, which does not fit in
+;the signed-byte operand that pmaddubsw expects for its second source, so
+;the zero-offset (full-pixel) case must be calculated separately.
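+;
+; What pmaddubsw computes per output word here (a sketch): adjacent
+; pixels are interleaved so each word lane evaluates
+;
+;    out[c] = ref[c] * tap0 + ref[c + 1] * tap1;   /* ref unsigned, taps signed */
+;
+; which is the horizontal 2-tap blend in a single instruction; the
+; rounding add of xmm_bi_rd and the shift by 7 follow as usual.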
+global sym(vp9_filter_block2d_bil_var_ssse3)
+sym(vp9_filter_block2d_bil_var_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 9
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+        pxor            xmm6,           xmm6
+        pxor            xmm7,           xmm7
+
+        lea             rcx,            [GLOBAL(bilinear_filters_ssse3)]
+        movsxd          rax,            dword ptr arg(5)     ; xoffset
+
+        cmp             rax,            0                    ; skip first_pass filter if xoffset=0
+        je              .filter_block2d_bil_var_ssse3_sp_only
+
+        shl             rax,            4                    ; point to filter coeff with xoffset
+        lea             rax,            [rax + rcx]          ; HFilter
+
+        movsxd          rdx,            dword ptr arg(6)     ; yoffset
+
+        cmp             rdx,            0                    ; skip second_pass filter if yoffset=0
+        je              .filter_block2d_bil_var_ssse3_fp_only
+
+        shl             rdx,            4
+        lea             rdx,            [rdx + rcx]          ; VFilter
+
+        mov             rsi,            arg(0)               ;ref_ptr
+        mov             rdi,            arg(2)               ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)     ;Height
+
+        movdqu          xmm0,           XMMWORD PTR [rsi]
+        movdqu          xmm1,           XMMWORD PTR [rsi+1]
+        movdqa          xmm2,           xmm0
+
+        punpcklbw       xmm0,           xmm1
+        punpckhbw       xmm2,           xmm1
+        pmaddubsw       xmm0,           [rax]
+        pmaddubsw       xmm2,           [rax]
+
+        paddw           xmm0,           [GLOBAL(xmm_bi_rd)]
+        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
+        psraw           xmm0,           xmm_filter_shift
+        psraw           xmm2,           xmm_filter_shift
+
+        packuswb        xmm0,           xmm2
+
+%if ABI_IS_32BIT
+        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
+%else
+        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
+        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
+        lea             rsi,            [rsi + r8]
+%endif
+
+.filter_block2d_bil_var_ssse3_loop:
+        movdqu          xmm1,           XMMWORD PTR [rsi]
+        movdqu          xmm2,           XMMWORD PTR [rsi+1]
+        movdqa          xmm3,           xmm1
+
+        punpcklbw       xmm1,           xmm2
+        punpckhbw       xmm3,           xmm2
+        pmaddubsw       xmm1,           [rax]
+        pmaddubsw       xmm3,           [rax]
+
+        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
+        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
+        psraw           xmm1,           xmm_filter_shift
+        psraw           xmm3,           xmm_filter_shift
+        packuswb        xmm1,           xmm3
+
+        movdqa          xmm2,           xmm0
+        movdqa          xmm0,           xmm1
+        movdqa          xmm3,           xmm2
+
+        punpcklbw       xmm2,           xmm1
+        punpckhbw       xmm3,           xmm1
+        pmaddubsw       xmm2,           [rdx]
+        pmaddubsw       xmm3,           [rdx]
+
+        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
+        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
+        psraw           xmm2,           xmm_filter_shift
+        psraw           xmm3,           xmm_filter_shift
+
+        movq            xmm1,           QWORD PTR [rdi]
+        pxor            xmm4,           xmm4
+        punpcklbw       xmm1,           xmm4
+        movq            xmm5,           QWORD PTR [rdi+8]
+        punpcklbw       xmm5,           xmm4
+
+        psubw           xmm2,           xmm1
+        psubw           xmm3,           xmm5
+        paddw           xmm6,           xmm2
+        paddw           xmm6,           xmm3
+        pmaddwd         xmm2,           xmm2
+        pmaddwd         xmm3,           xmm3
+        paddd           xmm7,           xmm2
+        paddd           xmm7,           xmm3
+
+%if ABI_IS_32BIT
+        add             rsi,            dword ptr arg(1)     ;ref_pixels_per_line
+        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
+%else
+        lea             rsi,            [rsi + r8]
+        lea             rdi,            [rdi + r9]
+%endif
+
+        sub             rcx,            1
+        jnz             .filter_block2d_bil_var_ssse3_loop
+
+        jmp             .filter_block2d_bil_variance
+
+.filter_block2d_bil_var_ssse3_sp_only:
+        movsxd          rdx,            dword ptr arg(6)     ; yoffset
+
+        cmp             rdx,            0                    ; skip all if both xoffset=0 and yoffset=0
+        je              .filter_block2d_bil_var_ssse3_full_pixel
+
+        shl             rdx,            4
+        lea             rdx,            [rdx + rcx]          ; VFilter
+
+        mov             rsi,            arg(0)               ;ref_ptr
+        mov             rdi,            arg(2)               ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)     ;Height
+        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
+
+        movdqu          xmm1,           XMMWORD PTR [rsi]
+        movdqa          xmm0,           xmm1
+
+%if ABI_IS_32BIT=0
+        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+        lea             rsi,            [rsi + rax]
+
+.filter_block2d_bil_sp_only_loop:
+        movdqu          xmm3,           XMMWORD PTR [rsi]
+        movdqa          xmm2,           xmm1
+        movdqa          xmm0,           xmm3
+
+        punpcklbw       xmm1,           xmm3
+        punpckhbw       xmm2,           xmm3
+        pmaddubsw       xmm1,           [rdx]
+        pmaddubsw       xmm2,           [rdx]
+
+        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
+        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
+        psraw           xmm1,           xmm_filter_shift
+        psraw           xmm2,           xmm_filter_shift
+
+        movq            xmm3,           QWORD PTR [rdi]
+        pxor            xmm4,           xmm4
+        punpcklbw       xmm3,           xmm4
+        movq            xmm5,           QWORD PTR [rdi+8]
+        punpcklbw       xmm5,           xmm4
+
+        psubw           xmm1,           xmm3
+        psubw           xmm2,           xmm5
+        paddw           xmm6,           xmm1
+        paddw           xmm6,           xmm2
+        pmaddwd         xmm1,           xmm1
+        pmaddwd         xmm2,           xmm2
+        paddd           xmm7,           xmm1
+        paddd           xmm7,           xmm2
+
+        movdqa          xmm1,           xmm0
+        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
+
+%if ABI_IS_32BIT
+        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
+%else
+        lea             rdi,            [rdi + r9]
+%endif
+
+        sub             rcx,            1
+        jnz             .filter_block2d_bil_sp_only_loop
+
+        jmp             .filter_block2d_bil_variance
+
+.filter_block2d_bil_var_ssse3_full_pixel:
+        mov             rsi,            arg(0)               ;ref_ptr
+        mov             rdi,            arg(2)               ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)     ;Height
+        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
+        movsxd          rdx,            dword ptr arg(3)     ;src_pixels_per_line
+        pxor            xmm0,           xmm0
+
+.filter_block2d_bil_full_pixel_loop:
+        movq            xmm1,           QWORD PTR [rsi]
+        punpcklbw       xmm1,           xmm0
+        movq            xmm2,           QWORD PTR [rsi+8]
+        punpcklbw       xmm2,           xmm0
+
+        movq            xmm3,           QWORD PTR [rdi]
+        punpcklbw       xmm3,           xmm0
+        movq            xmm4,           QWORD PTR [rdi+8]
+        punpcklbw       xmm4,           xmm0
+
+        psubw           xmm1,           xmm3
+        psubw           xmm2,           xmm4
+        paddw           xmm6,           xmm1
+        paddw           xmm6,           xmm2
+        pmaddwd         xmm1,           xmm1
+        pmaddwd         xmm2,           xmm2
+        paddd           xmm7,           xmm1
+        paddd           xmm7,           xmm2
+
+        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
+        lea             rdi,            [rdi + rdx]          ;src_pixels_per_line
+        sub             rcx,            1
+        jnz             .filter_block2d_bil_full_pixel_loop
+
+        jmp             .filter_block2d_bil_variance
+
+.filter_block2d_bil_var_ssse3_fp_only:
+        mov             rsi,            arg(0)               ;ref_ptr
+        mov             rdi,            arg(2)               ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)     ;Height
+        movsxd          rdx,            dword ptr arg(1)     ;ref_pixels_per_line
+
+        pxor            xmm0,           xmm0
+
+%if ABI_IS_32BIT=0
+        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+.filter_block2d_bil_fp_only_loop:
+        movdqu          xmm1,           XMMWORD PTR [rsi]
+        movdqu          xmm2,           XMMWORD PTR [rsi+1]
+        movdqa          xmm3,           xmm1
+
+        punpcklbw       xmm1,           xmm2
+        punpckhbw       xmm3,           xmm2
+        pmaddubsw       xmm1,           [rax]
+        pmaddubsw       xmm3,           [rax]
+
+        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
+        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
+        psraw           xmm1,           xmm_filter_shift
+        psraw           xmm3,           xmm_filter_shift
+
+        movq            xmm2,           QWORD PTR [rdi]
+        pxor            xmm4,           xmm4
+        punpcklbw       xmm2,           xmm4
+        movq            xmm5,           QWORD PTR [rdi+8]
+        punpcklbw       xmm5,           xmm4
+
+        psubw           xmm1,           xmm2
+        psubw           xmm3,           xmm5
+        paddw           xmm6,           xmm1
+        paddw           xmm6,           xmm3
+        pmaddwd         xmm1,           xmm1
+        pmaddwd         xmm3,           xmm3
+        paddd           xmm7,           xmm1
+        paddd           xmm7,           xmm3
+
+        lea             rsi,            [rsi + rdx]
+%if ABI_IS_32BIT
+        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
+%else
+        lea             rdi,            [rdi + r9]
+%endif
+
+        sub             rcx,            1
+        jnz             .filter_block2d_bil_fp_only_loop
+
+        jmp             .filter_block2d_bil_variance
+
+.filter_block2d_bil_variance:
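+        ; xmm6 holds the running sum of the signed pixel differences as
+        ; eight words and xmm7 the running sum of squares as four
+        ; dwords; sign-extend and reduce both horizontally, then store
+        ; the results through the Sum and SSE pointers.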
+        pxor        xmm0,           xmm0
+        pxor        xmm1,           xmm1
+        pxor        xmm5,           xmm5
+
+        punpcklwd   xmm0,           xmm6
+        punpckhwd   xmm1,           xmm6
+        psrad       xmm0,           16
+        psrad       xmm1,           16
+        paddd       xmm0,           xmm1
+        movdqa      xmm1,           xmm0
+
+        movdqa      xmm6,           xmm7
+        punpckldq   xmm6,           xmm5
+        punpckhdq   xmm7,           xmm5
+        paddd       xmm6,           xmm7
+
+        punpckldq   xmm0,           xmm5
+        punpckhdq   xmm1,           xmm5
+        paddd       xmm0,           xmm1
+
+        movdqa      xmm7,           xmm6
+        movdqa      xmm1,           xmm0
+
+        psrldq      xmm7,           8
+        psrldq      xmm1,           8
+
+        paddd       xmm6,           xmm7
+        paddd       xmm0,           xmm1
+
+        mov         rsi,            arg(7) ;[Sum]
+        mov         rdi,            arg(8) ;[SSE]
+
+        movd        [rsi],       xmm0
+        movd        [rdi],       xmm6
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+SECTION_RODATA
+align 16
+xmm_bi_rd:
+    times 8 dw 64
+align 16
+bilinear_filters_ssse3:
+    times 8 db 128, 0
+    times 8 db 120, 8
+    times 8 db 112, 16
+    times 8 db 104, 24
+    times 8 db  96, 32
+    times 8 db  88, 40
+    times 8 db  80, 48
+    times 8 db  72, 56
+    times 8 db  64, 64
+    times 8 db  56, 72
+    times 8 db  48, 80
+    times 8 db  40, 88
+    times 8 db  32, 96
+    times 8 db  24, 104
+    times 8 db  16, 112
+    times 8 db   8, 120
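+; Layout note: row k of bilinear_filters_ssse3 packs eight
+; (128 - 8k, 8k) byte pairs, matching pmaddubsw's pairwise
+; unsigned-by-signed multiply-add. The two taps in each pair sum to
+; 128, so the filter has unity gain at 7-bit precision; xmm_bi_rd (64)
+; is the round-to-nearest bias for the arithmetic shift that follows,
+; assuming xmm_filter_shift is the usual 7 used by libvpx's bilinear
+; filters.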
--- /dev/null
+++ b/vp9/encoder/x86/variance_mmx.c
@@ -1,0 +1,406 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp9/encoder/variance.h"
+#include "vp9/common/pragmas.h"
+#include "vpx_ports/mem.h"
+
+extern void filter_block1d_h6_mmx
+(
+  const unsigned char *src_ptr,
+  unsigned short *output_ptr,
+  unsigned int src_pixels_per_line,
+  unsigned int pixel_step,
+  unsigned int output_height,
+  unsigned int output_width,
+  short *vp7_filter
+);
+extern void filter_block1d_v6_mmx
+(
+  const short *src_ptr,
+  unsigned char *output_ptr,
+  unsigned int pixels_per_line,
+  unsigned int pixel_step,
+  unsigned int output_height,
+  unsigned int output_width,
+  short *vp7_filter
+);
+
+extern unsigned int vp9_get_mb_ss_mmx(const short *src_ptr);
+extern unsigned int vp9_get8x8var_mmx
+(
+  const unsigned char *src_ptr,
+  int  source_stride,
+  const unsigned char *ref_ptr,
+  int  recon_stride,
+  unsigned int *SSE,
+  int *Sum
+);
+extern unsigned int vp9_get4x4var_mmx
+(
+  const unsigned char *src_ptr,
+  int  source_stride,
+  const unsigned char *ref_ptr,
+  int  recon_stride,
+  unsigned int *SSE,
+  int *Sum
+);
+extern void vp9_filter_block2d_bil4x4_var_mmx
+(
+  const unsigned char *ref_ptr,
+  int ref_pixels_per_line,
+  const unsigned char *src_ptr,
+  int src_pixels_per_line,
+  const short *HFilter,
+  const short *VFilter,
+  int *sum,
+  unsigned int *sumsquared
+);
+extern void vp9_filter_block2d_bil_var_mmx
+(
+  const unsigned char *ref_ptr,
+  int ref_pixels_per_line,
+  const unsigned char *src_ptr,
+  int src_pixels_per_line,
+  unsigned int Height,
+  const short *HFilter,
+  const short *VFilter,
+  int *sum,
+  unsigned int *sumsquared
+);
+
+
+unsigned int vp9_variance4x4_mmx(
+  const unsigned char *src_ptr,
+  int  source_stride,
+  const unsigned char *ref_ptr,
+  int  recon_stride,
+  unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  vp9_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
+  *sse = var;
+  return (var - ((avg * avg) >> 4));
+
+}
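+/* The get*var helpers return the sum and the sum of squares of the
+ * pixel differences; the variance of an N-pixel block is then
+ * SSE - Sum*Sum/N, with the division by N approximated by a right
+ * shift of log2(N): >> 4 above for the 16 pixels of a 4x4 block, and
+ * >> 6, >> 7 and >> 8 for the 8x8, 16x8/8x16 and 16x16 sizes below. */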
+
+unsigned int vp9_variance8x8_mmx(
+  const unsigned char *src_ptr,
+  int  source_stride,
+  const unsigned char *ref_ptr,
+  int  recon_stride,
+  unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
+  *sse = var;
+
+  return (var - ((avg * avg) >> 6));
+
+}
+
+unsigned int vp9_mse16x16_mmx(
+  const unsigned char *src_ptr,
+  int  source_stride,
+  const unsigned char *ref_ptr,
+  int  recon_stride,
+  unsigned int *sse) {
+  unsigned int sse0, sse1, sse2, sse3, var;
+  int sum0, sum1, sum2, sum3;
+
+
+  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
+  vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+  vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
+  vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
+
+  var = sse0 + sse1 + sse2 + sse3;
+  *sse = var;
+  return var;
+}
+
+
+unsigned int vp9_variance16x16_mmx(
+  const unsigned char *src_ptr,
+  int  source_stride,
+  const unsigned char *ref_ptr,
+  int  recon_stride,
+  unsigned int *sse) {
+  unsigned int sse0, sse1, sse2, sse3, var;
+  int sum0, sum1, sum2, sum3, avg;
+
+
+  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
+  vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+  vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
+  vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
+
+  var = sse0 + sse1 + sse2 + sse3;
+  avg = sum0 + sum1 + sum2 + sum3;
+  *sse = var;
+  return (var - ((avg * avg) >> 8));
+}
+
+unsigned int vp9_variance16x8_mmx(
+  const unsigned char *src_ptr,
+  int  source_stride,
+  const unsigned char *ref_ptr,
+  int  recon_stride,
+  unsigned int *sse) {
+  unsigned int sse0, sse1, var;
+  int sum0, sum1, avg;
+
+  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
+  vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+
+  var = sse0 + sse1;
+  avg = sum0 + sum1;
+  *sse = var;
+  return (var - ((avg * avg) >> 7));
+
+}
+
+
+unsigned int vp9_variance8x16_mmx(
+  const unsigned char *src_ptr,
+  int  source_stride,
+  const unsigned char *ref_ptr,
+  int  recon_stride,
+  unsigned int *sse) {
+  unsigned int sse0, sse1, var;
+  int sum0, sum1, avg;
+
+  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
+  vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1);
+
+  var = sse0 + sse1;
+  avg = sum0 + sum1;
+  *sse = var;
+
+  return (var - ((avg * avg) >> 7));
+
+}
+
+
+///////////////////////////////////////////////////////////////////////////
+// the mmx function that does the bilinear filtering and var calculation //
+// in one pass                                                           //
+///////////////////////////////////////////////////////////////////////////
+DECLARE_ALIGNED(16, const short, vp9_bilinear_filters_mmx[16][8]) = {
+  { 128, 128, 128, 128,  0,  0,  0,  0 },
+  { 120, 120, 120, 120,  8,  8,  8,  8 },
+  { 112, 112, 112, 112, 16, 16, 16, 16 },
+  { 104, 104, 104, 104, 24, 24, 24, 24 },
+  {  96, 96, 96, 96, 32, 32, 32, 32 },
+  {  88, 88, 88, 88, 40, 40, 40, 40 },
+  {  80, 80, 80, 80, 48, 48, 48, 48 },
+  {  72, 72, 72, 72, 56, 56, 56, 56 },
+  {  64, 64, 64, 64, 64, 64, 64, 64 },
+  {  56, 56, 56, 56, 72, 72, 72, 72 },
+  {  48, 48, 48, 48, 80, 80, 80, 80 },
+  {  40, 40, 40, 40, 88, 88, 88, 88 },
+  {  32, 32, 32, 32, 96, 96, 96, 96 },
+  {  24, 24, 24, 24, 104, 104, 104, 104 },
+  {  16, 16, 16, 16, 112, 112, 112, 112 },
+  {   8,  8,  8,  8, 120, 120, 120, 120 }
+};
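+/* Row i holds the two bilinear taps (128 - 8*i, 8*i), each replicated
+ * four times so a single 64-bit load fills an MMX register with one
+ * coefficient. In scalar terms (variable names illustrative only),
+ * each filtered pixel is computed as
+ *
+ *   out = (a * tap0 + b * tap1 + 64) >> 7;
+ *
+ * where a and b are the two neighbouring source pixels; the taps sum
+ * to 128, and 64 rounds the 7-bit fixed-point product to nearest. */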
+
+unsigned int vp9_sub_pixel_variance4x4_mmx
+(
+  const unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  const unsigned char *dst_ptr,
+  int dst_pixels_per_line,
+  unsigned int *sse)
+
+{
+  int xsum;
+  unsigned int xxsum;
+  vp9_filter_block2d_bil4x4_var_mmx(
+    src_ptr, src_pixels_per_line,
+    dst_ptr, dst_pixels_per_line,
+    vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
+    &xsum, &xxsum
+  );
+  *sse = xxsum;
+  return (xxsum - ((xsum * xsum) >> 4));
+}
+
+
+unsigned int vp9_sub_pixel_variance8x8_mmx
+(
+  const unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  const unsigned char *dst_ptr,
+  int dst_pixels_per_line,
+  unsigned int *sse
+) {
+
+  int xsum;
+  unsigned int xxsum;
+  vp9_filter_block2d_bil_var_mmx(
+    src_ptr, src_pixels_per_line,
+    dst_ptr, dst_pixels_per_line, 8,
+    vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
+    &xsum, &xxsum
+  );
+  *sse = xxsum;
+  return (xxsum - ((xsum * xsum) >> 6));
+}
+
+unsigned int vp9_sub_pixel_variance16x16_mmx
+(
+  const unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  const unsigned char *dst_ptr,
+  int dst_pixels_per_line,
+  unsigned int *sse
+) {
+
+  int xsum0, xsum1;
+  unsigned int xxsum0, xxsum1;
+
+  vp9_filter_block2d_bil_var_mmx(
+    src_ptr, src_pixels_per_line,
+    dst_ptr, dst_pixels_per_line, 16,
+    vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
+    &xsum0, &xxsum0
+  );
+
+  vp9_filter_block2d_bil_var_mmx(
+    src_ptr + 8, src_pixels_per_line,
+    dst_ptr + 8, dst_pixels_per_line, 16,
+    vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
+    &xsum1, &xxsum1
+  );
+
+  xsum0 += xsum1;
+  xxsum0 += xxsum1;
+
+  *sse = xxsum0;
+  return (xxsum0 - ((xsum0 * xsum0) >> 8));
+}
+
+unsigned int vp9_sub_pixel_mse16x16_mmx(
+  const unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  const unsigned char *dst_ptr,
+  int dst_pixels_per_line,
+  unsigned int *sse
+) {
+  vp9_sub_pixel_variance16x16_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
+  return *sse;
+}
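+/* The sub-pixel MSE reuses the sub-pixel variance kernel above but
+ * returns the raw SSE, i.e. without subtracting the squared-mean
+ * term. */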
+
+unsigned int vp9_sub_pixel_variance16x8_mmx
+(
+  const unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  const unsigned char *dst_ptr,
+  int dst_pixels_per_line,
+  unsigned int *sse
+) {
+  int xsum0, xsum1;
+  unsigned int xxsum0, xxsum1;
+
+
+  vp9_filter_block2d_bil_var_mmx(
+    src_ptr, src_pixels_per_line,
+    dst_ptr, dst_pixels_per_line, 8,
+    vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
+    &xsum0, &xxsum0
+  );
+
+
+  vp9_filter_block2d_bil_var_mmx(
+    src_ptr + 8, src_pixels_per_line,
+    dst_ptr + 8, dst_pixels_per_line, 8,
+    vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
+    &xsum1, &xxsum1
+  );
+
+  xsum0 += xsum1;
+  xxsum0 += xxsum1;
+
+  *sse = xxsum0;
+  return (xxsum0 - ((xsum0 * xsum0) >> 7));
+}
+
+unsigned int vp9_sub_pixel_variance8x16_mmx
+(
+  const unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  const unsigned char *dst_ptr,
+  int dst_pixels_per_line,
+  unsigned int *sse
+) {
+  int xsum;
+  unsigned int xxsum;
+  vp9_filter_block2d_bil_var_mmx(
+    src_ptr, src_pixels_per_line,
+    dst_ptr, dst_pixels_per_line, 16,
+    vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
+    &xsum, &xxsum
+  );
+  *sse = xxsum;
+  return (xxsum - ((xsum * xsum) >> 7));
+}
+
+
+unsigned int vp9_variance_halfpixvar16x16_h_mmx(
+  const unsigned char *src_ptr,
+  int  source_stride,
+  const unsigned char *ref_ptr,
+  int  recon_stride,
+  unsigned int *sse) {
+  return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 8, 0,
+                                         ref_ptr, recon_stride, sse);
+}
+
+
+unsigned int vp9_variance_halfpixvar16x16_v_mmx(
+  const unsigned char *src_ptr,
+  int  source_stride,
+  const unsigned char *ref_ptr,
+  int  recon_stride,
+  unsigned int *sse) {
+  return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 0, 8,
+                                         ref_ptr, recon_stride, sse);
+}
+
+
+unsigned int vp9_variance_halfpixvar16x16_hv_mmx(
+  const unsigned char *src_ptr,
+  int  source_stride,
+  const unsigned char *ref_ptr,
+  int  recon_stride,
+  unsigned int *sse) {
+  return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 8, 8,
+                                         ref_ptr, recon_stride, sse);
+}
--- /dev/null
+++ b/vp9/encoder/x86/variance_sse2.c
@@ -1,0 +1,517 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp9/encoder/variance.h"
+#include "vp9/common/pragmas.h"
+#include "vpx_ports/mem.h"
+
+#define HALFNDX 8
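+/* Offset index 8 of 16 is the exact half-pixel position, where the
+ * bilinear taps degenerate to (64, 64) -- a plain two-pixel average --
+ * so the functions below dispatch to the specialized half-pel kernels
+ * instead of the general bilinear filter. */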
+
+extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+
+extern void vp9_filter_block2d_bil4x4_var_mmx
+(
+  const unsigned char *ref_ptr,
+  int ref_pixels_per_line,
+  const unsigned char *src_ptr,
+  int src_pixels_per_line,
+  const short *HFilter,
+  const short *VFilter,
+  int *sum,
+  unsigned int *sumsquared
+);
+
+extern unsigned int vp9_get4x4var_mmx
+(
+  const unsigned char *src_ptr,
+  int  source_stride,
+  const unsigned char *ref_ptr,
+  int  recon_stride,
+  unsigned int *SSE,
+  int *Sum
+);
+
+unsigned int vp9_get_mb_ss_sse2
+(
+  const short *src_ptr
+);
+unsigned int vp9_get16x16var_sse2
+(
+  const unsigned char *src_ptr,
+  int source_stride,
+  const unsigned char *ref_ptr,
+  int recon_stride,
+  unsigned int *SSE,
+  int *Sum
+);
+unsigned int vp9_get8x8var_sse2
+(
+  const unsigned char *src_ptr,
+  int source_stride,
+  const unsigned char *ref_ptr,
+  int recon_stride,
+  unsigned int *SSE,
+  int *Sum
+);
+void vp9_filter_block2d_bil_var_sse2
+(
+  const unsigned char *ref_ptr,
+  int ref_pixels_per_line,
+  const unsigned char *src_ptr,
+  int src_pixels_per_line,
+  unsigned int Height,
+  int  xoffset,
+  int  yoffset,
+  int *sum,
+  unsigned int *sumsquared
+);
+void vp9_half_horiz_vert_variance8x_h_sse2
+(
+  const unsigned char *ref_ptr,
+  int ref_pixels_per_line,
+  const unsigned char *src_ptr,
+  int src_pixels_per_line,
+  unsigned int Height,
+  int *sum,
+  unsigned int *sumsquared
+);
+void vp9_half_horiz_vert_variance16x_h_sse2
+(
+  const unsigned char *ref_ptr,
+  int ref_pixels_per_line,
+  const unsigned char *src_ptr,
+  int src_pixels_per_line,
+  unsigned int Height,
+  int *sum,
+  unsigned int *sumsquared
+);
+void vp9_half_horiz_variance8x_h_sse2
+(
+  const unsigned char *ref_ptr,
+  int ref_pixels_per_line,
+  const unsigned char *src_ptr,
+  int src_pixels_per_line,
+  unsigned int Height,
+  int *sum,
+  unsigned int *sumsquared
+);
+void vp9_half_horiz_variance16x_h_sse2
+(
+  const unsigned char *ref_ptr,
+  int ref_pixels_per_line,
+  const unsigned char *src_ptr,
+  int src_pixels_per_line,
+  unsigned int Height,
+  int *sum,
+  unsigned int *sumsquared
+);
+void vp9_half_vert_variance8x_h_sse2
+(
+  const unsigned char *ref_ptr,
+  int ref_pixels_per_line,
+  const unsigned char *src_ptr,
+  int src_pixels_per_line,
+  unsigned int Height,
+  int *sum,
+  unsigned int *sumsquared
+);
+void vp9_half_vert_variance16x_h_sse2
+(
+  const unsigned char *ref_ptr,
+  int ref_pixels_per_line,
+  const unsigned char *src_ptr,
+  int src_pixels_per_line,
+  unsigned int Height,
+  int *sum,
+  unsigned int *sumsquared
+);
+
+DECLARE_ALIGNED(16, extern short, vp9_bilinear_filters_mmx[16][8]);
+
+unsigned int vp9_variance4x4_wmt(
+  const unsigned char *src_ptr,
+  int  source_stride,
+  const unsigned char *ref_ptr,
+  int  recon_stride,
+  unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  vp9_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
+  *sse = var;
+  return (var - ((avg * avg) >> 4));
+
+}
+
+unsigned int vp9_variance8x8_wmt
+(
+  const unsigned char *src_ptr,
+  int  source_stride,
+  const unsigned char *ref_ptr,
+  int  recon_stride,
+  unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  vp9_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
+  *sse = var;
+  return (var - ((avg * avg) >> 6));
+
+}
+
+
+unsigned int vp9_variance16x16_wmt
+(
+  const unsigned char *src_ptr,
+  int  source_stride,
+  const unsigned char *ref_ptr,
+  int  recon_stride,
+  unsigned int *sse) {
+  unsigned int sse0;
+  int sum0;
+
+
+  vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
+  *sse = sse0;
+  return (sse0 - ((sum0 * sum0) >> 8));
+}
+unsigned int vp9_mse16x16_wmt(
+  const unsigned char *src_ptr,
+  int  source_stride,
+  const unsigned char *ref_ptr,
+  int  recon_stride,
+  unsigned int *sse) {
+
+  unsigned int sse0;
+  int sum0;
+  vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
+  *sse = sse0;
+  return sse0;
+
+}
+
+
+unsigned int vp9_variance16x8_wmt
+(
+  const unsigned char *src_ptr,
+  int  source_stride,
+  const unsigned char *ref_ptr,
+  int  recon_stride,
+  unsigned int *sse) {
+  unsigned int sse0, sse1, var;
+  int sum0, sum1, avg;
+
+  vp9_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
+  vp9_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+
+  var = sse0 + sse1;
+  avg = sum0 + sum1;
+  *sse = var;
+  return (var - ((avg * avg) >> 7));
+
+}
+
+unsigned int vp9_variance8x16_wmt
+(
+  const unsigned char *src_ptr,
+  int  source_stride,
+  const unsigned char *ref_ptr,
+  int  recon_stride,
+  unsigned int *sse) {
+  unsigned int sse0, sse1, var;
+  int sum0, sum1, avg;
+
+  vp9_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
+  vp9_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1);
+
+  var = sse0 + sse1;
+  avg = sum0 + sum1;
+  *sse = var;
+  return (var - ((avg * avg) >> 7));
+
+}
+
+unsigned int vp9_sub_pixel_variance4x4_wmt
+(
+  const unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  const unsigned char *dst_ptr,
+  int dst_pixels_per_line,
+  unsigned int *sse
+) {
+  int xsum;
+  unsigned int xxsum;
+  vp9_filter_block2d_bil4x4_var_mmx(
+    src_ptr, src_pixels_per_line,
+    dst_ptr, dst_pixels_per_line,
+    vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
+    &xsum, &xxsum
+  );
+  *sse = xxsum;
+  return (xxsum - ((xsum * xsum) >> 4));
+}
+
+
+unsigned int vp9_sub_pixel_variance8x8_wmt
+(
+  const unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  const unsigned char *dst_ptr,
+  int dst_pixels_per_line,
+  unsigned int *sse
+) {
+  int xsum;
+  unsigned int xxsum;
+
+  if (xoffset == HALFNDX && yoffset == 0) {
+    vp9_half_horiz_variance8x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 8,
+      &xsum, &xxsum);
+  } else if (xoffset == 0 && yoffset == HALFNDX) {
+    vp9_half_vert_variance8x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 8,
+      &xsum, &xxsum);
+  } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
+    vp9_half_horiz_vert_variance8x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 8,
+      &xsum, &xxsum);
+  } else {
+    vp9_filter_block2d_bil_var_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 8,
+      xoffset, yoffset,
+      &xsum, &xxsum);
+  }
+
+  *sse = xxsum;
+  return (xxsum - ((xsum * xsum) >> 6));
+}
+
+unsigned int vp9_sub_pixel_variance16x16_wmt
+(
+  const unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  const unsigned char *dst_ptr,
+  int dst_pixels_per_line,
+  unsigned int *sse
+) {
+  int xsum0, xsum1;
+  unsigned int xxsum0, xxsum1;
+
+
+  // Note: these if statements could be avoided if the calling function
+  // dispatched directly to the appropriate specialized kernel.
+  if (xoffset == HALFNDX && yoffset == 0) {
+    vp9_half_horiz_variance16x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 16,
+      &xsum0, &xxsum0);
+  } else if (xoffset == 0 && yoffset == HALFNDX) {
+    vp9_half_vert_variance16x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 16,
+      &xsum0, &xxsum0);
+  } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
+    vp9_half_horiz_vert_variance16x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 16,
+      &xsum0, &xxsum0);
+  } else {
+    vp9_filter_block2d_bil_var_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 16,
+      xoffset, yoffset,
+      &xsum0, &xxsum0
+    );
+
+    vp9_filter_block2d_bil_var_sse2(
+      src_ptr + 8, src_pixels_per_line,
+      dst_ptr + 8, dst_pixels_per_line, 16,
+      xoffset, yoffset,
+      &xsum1, &xxsum1
+    );
+    xsum0 += xsum1;
+    xxsum0 += xxsum1;
+  }
+
+  *sse = xxsum0;
+  return (xxsum0 - ((xsum0 * xsum0) >> 8));
+}
+
+unsigned int vp9_sub_pixel_mse16x16_wmt(
+  const unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  const unsigned char *dst_ptr,
+  int dst_pixels_per_line,
+  unsigned int *sse
+) {
+  vp9_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
+  return *sse;
+}
+
+unsigned int vp9_sub_pixel_variance16x8_wmt
+(
+  const unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  const unsigned char *dst_ptr,
+  int dst_pixels_per_line,
+  unsigned int *sse
+) {
+  int xsum0, xsum1;
+  unsigned int xxsum0, xxsum1;
+
+  if (xoffset == HALFNDX && yoffset == 0) {
+    vp9_half_horiz_variance16x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 8,
+      &xsum0, &xxsum0);
+  } else if (xoffset == 0 && yoffset == HALFNDX) {
+    vp9_half_vert_variance16x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 8,
+      &xsum0, &xxsum0);
+  } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
+    vp9_half_horiz_vert_variance16x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 8,
+      &xsum0, &xxsum0);
+  } else {
+    vp9_filter_block2d_bil_var_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 8,
+      xoffset, yoffset,
+      &xsum0, &xxsum0);
+
+    vp9_filter_block2d_bil_var_sse2(
+      src_ptr + 8, src_pixels_per_line,
+      dst_ptr + 8, dst_pixels_per_line, 8,
+      xoffset, yoffset,
+      &xsum1, &xxsum1);
+    xsum0 += xsum1;
+    xxsum0 += xxsum1;
+  }
+
+  *sse = xxsum0;
+  return (xxsum0 - ((xsum0 * xsum0) >> 7));
+}
+
+unsigned int vp9_sub_pixel_variance8x16_wmt
+(
+  const unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  const unsigned char *dst_ptr,
+  int dst_pixels_per_line,
+  unsigned int *sse
+) {
+  int xsum;
+  unsigned int xxsum;
+
+  if (xoffset == HALFNDX && yoffset == 0) {
+    vp9_half_horiz_variance8x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 16,
+      &xsum, &xxsum);
+  } else if (xoffset == 0 && yoffset == HALFNDX) {
+    vp9_half_vert_variance8x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 16,
+      &xsum, &xxsum);
+  } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
+    vp9_half_horiz_vert_variance8x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 16,
+      &xsum, &xxsum);
+  } else {
+    vp9_filter_block2d_bil_var_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 16,
+      xoffset, yoffset,
+      &xsum, &xxsum);
+  }
+
+  *sse = xxsum;
+  return (xxsum - ((xsum * xsum) >> 7));
+}
+
+
+unsigned int vp9_variance_halfpixvar16x16_h_wmt(
+  const unsigned char *src_ptr,
+  int  src_pixels_per_line,
+  const unsigned char *dst_ptr,
+  int  dst_pixels_per_line,
+  unsigned int *sse) {
+  int xsum0;
+  unsigned int xxsum0;
+
+  vp9_half_horiz_variance16x_h_sse2(
+    src_ptr, src_pixels_per_line,
+    dst_ptr, dst_pixels_per_line, 16,
+    &xsum0, &xxsum0);
+
+  *sse = xxsum0;
+  return (xxsum0 - ((xsum0 * xsum0) >> 8));
+}
+
+
+unsigned int vp9_variance_halfpixvar16x16_v_wmt(
+  const unsigned char *src_ptr,
+  int  src_pixels_per_line,
+  const unsigned char *dst_ptr,
+  int  dst_pixels_per_line,
+  unsigned int *sse) {
+  int xsum0;
+  unsigned int xxsum0;
+  vp9_half_vert_variance16x_h_sse2(
+    src_ptr, src_pixels_per_line,
+    dst_ptr, dst_pixels_per_line, 16,
+    &xsum0, &xxsum0);
+
+  *sse = xxsum0;
+  return (xxsum0 - ((xsum0 * xsum0) >> 8));
+}
+
+
+unsigned int vp9_variance_halfpixvar16x16_hv_wmt(
+  const unsigned char *src_ptr,
+  int  src_pixels_per_line,
+  const unsigned char *dst_ptr,
+  int  dst_pixels_per_line,
+  unsigned int *sse) {
+  int xsum0;
+  unsigned int xxsum0;
+
+  vp9_half_horiz_vert_variance16x_h_sse2(
+    src_ptr, src_pixels_per_line,
+    dst_ptr, dst_pixels_per_line, 16,
+    &xsum0, &xxsum0);
+
+  *sse = xxsum0;
+  return (xxsum0 - ((xsum0 * xsum0) >> 8));
+}
--- /dev/null
+++ b/vp9/encoder/x86/variance_ssse3.c
@@ -1,0 +1,151 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp9/encoder/variance.h"
+#include "vp9/common/pragmas.h"
+#include "vpx_ports/mem.h"
+
+#define HALFNDX 8
+
+extern unsigned int vp9_get16x16var_sse2
+(
+  const unsigned char *src_ptr,
+  int source_stride,
+  const unsigned char *ref_ptr,
+  int recon_stride,
+  unsigned int *SSE,
+  int *Sum
+);
+extern void vp9_half_horiz_vert_variance16x_h_sse2
+(
+  const unsigned char *ref_ptr,
+  int ref_pixels_per_line,
+  const unsigned char *src_ptr,
+  int src_pixels_per_line,
+  unsigned int Height,
+  int *sum,
+  unsigned int *sumsquared
+);
+extern void vp9_half_horiz_variance16x_h_sse2
+(
+  const unsigned char *ref_ptr,
+  int ref_pixels_per_line,
+  const unsigned char *src_ptr,
+  int src_pixels_per_line,
+  unsigned int Height,
+  int *sum,
+  unsigned int *sumsquared
+);
+extern void vp9_half_vert_variance16x_h_sse2
+(
+  const unsigned char *ref_ptr,
+  int ref_pixels_per_line,
+  const unsigned char *src_ptr,
+  int src_pixels_per_line,
+  unsigned int Height,
+  int *sum,
+  unsigned int *sumsquared
+);
+extern void vp9_filter_block2d_bil_var_ssse3
+(
+  const unsigned char *ref_ptr,
+  int ref_pixels_per_line,
+  const unsigned char *src_ptr,
+  int src_pixels_per_line,
+  unsigned int Height,
+  int  xoffset,
+  int  yoffset,
+  int *sum,
+  unsigned int *sumsquared
+);
+
+unsigned int vp9_sub_pixel_variance16x16_ssse3
+(
+  const unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  const unsigned char *dst_ptr,
+  int dst_pixels_per_line,
+  unsigned int *sse
+) {
+  int xsum0;
+  unsigned int xxsum0;
+
+  // Note: these if statements could be avoided if the calling function
+  // dispatched directly to the appropriate specialized kernel.
+  if (xoffset == HALFNDX && yoffset == 0) {
+    vp9_half_horiz_variance16x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 16,
+      &xsum0, &xxsum0);
+  } else if (xoffset == 0 && yoffset == HALFNDX) {
+    vp9_half_vert_variance16x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 16,
+      &xsum0, &xxsum0);
+  } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
+    vp9_half_horiz_vert_variance16x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 16,
+      &xsum0, &xxsum0);
+  } else {
+    vp9_filter_block2d_bil_var_ssse3(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 16,
+      xoffset, yoffset,
+      &xsum0, &xxsum0);
+  }
+
+  *sse = xxsum0;
+  return (xxsum0 - ((xsum0 * xsum0) >> 8));
+}
+
+unsigned int vp9_sub_pixel_variance16x8_ssse3
+(
+  const unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  const unsigned char *dst_ptr,
+  int dst_pixels_per_line,
+  unsigned int *sse
+) {
+  int xsum0;
+  unsigned int xxsum0;
+
+  if (xoffset == HALFNDX && yoffset == 0) {
+    vp9_half_horiz_variance16x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 8,
+      &xsum0, &xxsum0);
+  } else if (xoffset == 0 && yoffset == HALFNDX) {
+    vp9_half_vert_variance16x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 8,
+      &xsum0, &xxsum0);
+  } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
+    vp9_half_horiz_vert_variance16x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 8,
+      &xsum0, &xxsum0);
+  } else {
+    vp9_filter_block2d_bil_var_ssse3(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 8,
+      xoffset, yoffset,
+      &xsum0, &xxsum0);
+  }
+
+  *sse = xxsum0;
+  return (xxsum0 - ((xsum0 * xsum0) >> 7));
+}
--- /dev/null
+++ b/vp9/encoder/x86/x86_csystemdependent.c
@@ -1,0 +1,114 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_ports/x86.h"
+#include "vp9/encoder/variance.h"
+#include "vp9/encoder/onyx_int.h"
+
+
+#if HAVE_MMX
+void vp9_short_fdct8x4_mmx(short *input, short *output, int pitch) {
+  vp9_short_fdct4x4_mmx(input,   output,    pitch);
+  vp9_short_fdct4x4_mmx(input + 4, output + 16, pitch);
+}
+
+int vp9_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+int vp9_mbblock_error_mmx(MACROBLOCK *mb, int dc) {
+  short *coeff_ptr =  mb->block[0].coeff;
+  short *dcoef_ptr =  mb->e_mbd.block[0].dqcoeff;
+  return vp9_mbblock_error_mmx_impl(coeff_ptr, dcoef_ptr, dc);
+}
+
+int vp9_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
+int vp9_mbuverror_mmx(MACROBLOCK *mb) {
+  short *s_ptr = &mb->coeff[256];
+  short *d_ptr = &mb->e_mbd.dqcoeff[256];
+  return vp9_mbuverror_mmx_impl(s_ptr, d_ptr);
+}
+
+void vp9_subtract_b_mmx_impl(unsigned char *z,  int src_stride,
+                             short *diff, unsigned char *predictor,
+                             int pitch);
+void vp9_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch) {
+  unsigned char *z = *(be->base_src) + be->src;
+  unsigned int  src_stride = be->src_stride;
+  short *diff = &be->src_diff[0];
+  unsigned char *predictor = &bd->predictor[0];
+  vp9_subtract_b_mmx_impl(z, src_stride, diff, predictor, pitch);
+}
+
+#endif
+
+#if HAVE_SSE2
+int vp9_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+int vp9_mbblock_error_xmm(MACROBLOCK *mb, int dc) {
+  short *coeff_ptr =  mb->block[0].coeff;
+  short *dcoef_ptr =  mb->e_mbd.block[0].dqcoeff;
+  return vp9_mbblock_error_xmm_impl(coeff_ptr, dcoef_ptr, dc);
+}
+
+int vp9_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
+int vp9_mbuverror_xmm(MACROBLOCK *mb) {
+  short *s_ptr = &mb->coeff[256];
+  short *d_ptr = &mb->e_mbd.dqcoeff[256];
+  return vp9_mbuverror_xmm_impl(s_ptr, d_ptr);
+}
+
+void vp9_subtract_b_sse2_impl(unsigned char *z,  int src_stride,
+                              short *diff, unsigned char *predictor,
+                              int pitch);
+void vp9_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch) {
+  unsigned char *z = *(be->base_src) + be->src;
+  unsigned int  src_stride = be->src_stride;
+  short *diff = &be->src_diff[0];
+  unsigned char *predictor = &bd->predictor[0];
+  vp9_subtract_b_sse2_impl(z, src_stride, diff, predictor, pitch);
+}
+
+#endif
+
+void vp9_arch_x86_encoder_init(VP9_COMP *cpi) {
+#if CONFIG_RUNTIME_CPU_DETECT
+  int flags = x86_simd_caps();
+
+  /* Note:
+   *
+   * This platform can be built without runtime CPU detection as well. If
+   * you modify any of the function mappings present in this file, be sure
+   * to also update them in the static mappings (<arch>/filename_<arch>.h)
+   */
+
+  /* Override default functions with fastest ones for this CPU. */
+#if HAVE_SSE2
+  if (flags & HAS_SSE2) {
+    cpi->rtcd.temporal.apply                 = vp9_temporal_filter_apply_sse2;
+
+  }
+#endif
+
+#if HAVE_SSE3
+  if (flags & HAS_SSE3) {
+    cpi->rtcd.search.full_search             = vp9_full_search_sadx3;
+    cpi->rtcd.search.diamond_search          = vp9_diamond_search_sadx4;
+    cpi->rtcd.search.refining_search         = vp9_refining_search_sadx4;
+  }
+#endif
+
+
+#if HAVE_SSE4_1
+  if (flags & HAS_SSE4_1) {
+    cpi->rtcd.search.full_search             = vp9_full_search_sadx8;
+  }
+#endif
+
+#endif
+}
--- /dev/null
+++ b/vp9/exports_dec
@@ -1,0 +1,2 @@
+data vpx_codec_vp8_dx_algo
+text vpx_codec_vp8_dx
--- /dev/null
+++ b/vp9/exports_enc
@@ -1,0 +1,4 @@
+data vpx_codec_vp8_cx_algo
+text vpx_codec_vp8_cx
+data vpx_codec_vp8x_cx_algo
+text vpx_codec_vp8x_cx
--- /dev/null
+++ b/vp9/vp9_common.mk
@@ -1,0 +1,179 @@
+##
+##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+VP9_COMMON_SRCS-yes += vp9_common.mk
+VP9_COMMON_SRCS-yes += common/type_aliases.h
+VP9_COMMON_SRCS-yes += common/pragmas.h
+VP9_COMMON_SRCS-yes += common/ppflags.h
+VP9_COMMON_SRCS-yes += common/onyx.h
+VP9_COMMON_SRCS-yes += common/onyxd.h
+VP9_COMMON_SRCS-yes += common/alloccommon.c
+VP9_COMMON_SRCS-yes += common/asm_com_offsets.c
+VP9_COMMON_SRCS-yes += common/blockd.c
+VP9_COMMON_SRCS-yes += common/coefupdateprobs.h
+VP9_COMMON_SRCS-yes += common/debugmodes.c
+VP9_COMMON_SRCS-yes += common/entropy.c
+VP9_COMMON_SRCS-yes += common/entropymode.c
+VP9_COMMON_SRCS-yes += common/entropymv.c
+VP9_COMMON_SRCS-yes += common/extend.c
+VP9_COMMON_SRCS-yes += common/filter.c
+VP9_COMMON_SRCS-yes += common/filter.h
+VP9_COMMON_SRCS-yes += common/findnearmv.c
+VP9_COMMON_SRCS-yes += common/generic/systemdependent.c
+VP9_COMMON_SRCS-yes += common/idctllm.c
+VP9_COMMON_SRCS-yes += common/alloccommon.h
+VP9_COMMON_SRCS-yes += common/blockd.h
+VP9_COMMON_SRCS-yes += common/common.h
+VP9_COMMON_SRCS-yes += common/common_types.h
+VP9_COMMON_SRCS-yes += common/entropy.h
+VP9_COMMON_SRCS-yes += common/entropymode.h
+VP9_COMMON_SRCS-yes += common/entropymv.h
+VP9_COMMON_SRCS-yes += common/extend.h
+VP9_COMMON_SRCS-yes += common/findnearmv.h
+VP9_COMMON_SRCS-yes += common/header.h
+VP9_COMMON_SRCS-yes += common/idct.h
+VP9_COMMON_SRCS-yes += common/invtrans.h
+VP9_COMMON_SRCS-yes += common/loopfilter.h
+VP9_COMMON_SRCS-yes += common/modecont.h
+VP9_COMMON_SRCS-yes += common/mv.h
+VP9_COMMON_SRCS-yes += common/onyxc_int.h
+VP9_COMMON_SRCS-yes += common/pred_common.h
+VP9_COMMON_SRCS-yes += common/pred_common.c
+VP9_COMMON_SRCS-yes += common/quant_common.h
+VP9_COMMON_SRCS-yes += common/reconinter.h
+VP9_COMMON_SRCS-yes += common/reconintra.h
+VP9_COMMON_SRCS-yes += common/reconintra4x4.h
+VP9_COMMON_SRCS-yes += common/rtcd.c
+VP9_COMMON_SRCS-yes += common/rtcd_defs.sh
+VP9_COMMON_SRCS-yes += common/sadmxn.h
+VP9_COMMON_SRCS-yes += common/seg_common.h
+VP9_COMMON_SRCS-yes += common/seg_common.c
+VP9_COMMON_SRCS-yes += common/setupintrarecon.h
+VP9_COMMON_SRCS-yes += common/subpixel.h
+VP9_COMMON_SRCS-yes += common/swapyv12buffer.h
+VP9_COMMON_SRCS-yes += common/systemdependent.h
+VP9_COMMON_SRCS-yes += common/treecoder.h
+VP9_COMMON_SRCS-yes += common/invtrans.c
+VP9_COMMON_SRCS-yes += common/loopfilter.c
+VP9_COMMON_SRCS-yes += common/loopfilter_filters.c
+VP9_COMMON_SRCS-yes += common/mbpitch.c
+VP9_COMMON_SRCS-yes += common/modecont.c
+VP9_COMMON_SRCS-yes += common/modecontext.c
+VP9_COMMON_SRCS-yes += common/mvref_common.c
+VP9_COMMON_SRCS-yes += common/mvref_common.h
+VP9_COMMON_SRCS-yes += common/quant_common.c
+VP9_COMMON_SRCS-yes += common/recon.c
+VP9_COMMON_SRCS-yes += common/reconinter.c
+VP9_COMMON_SRCS-yes += common/reconintra.c
+VP9_COMMON_SRCS-yes += common/reconintra4x4.c
+VP9_COMMON_SRCS-yes += common/setupintrarecon.c
+VP9_COMMON_SRCS-yes += common/swapyv12buffer.c
+VP9_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/textblit.c
+VP9_COMMON_SRCS-yes += common/treecoder.c
+VP9_COMMON_SRCS-$(CONFIG_IMPLICIT_SEGMENTATION) += common/implicit_segmentation.c
+
+VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/idct_x86.h
+VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/subpixel_x86.h
+VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/loopfilter_x86.h
+VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/postproc_x86.h
+VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/x86_systemdependent.c
+VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp8_asm_stubs.c
+VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/loopfilter_x86.c
+VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.h
+VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.c
+VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/idctllm_mmx.asm
+VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/iwalsh_mmx.asm
+VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm
+VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm
+VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/loopfilter_mmx.asm
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_sse2.asm
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_wrapper_sse2.c
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/iwalsh_sse2.asm
+VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_8t_ssse3.asm
+VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_ssse3.asm
+ifeq ($(CONFIG_POSTPROC),yes)
+VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/postproc_mmx.asm
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/postproc_sse2.asm
+endif
+
+# common (c)
+ifeq ($(CONFIG_CSM),yes)
+VP9_COMMON_SRCS-yes += common/maskingmv.c
+VP9_COMMON_SRCS-$(HAVE_SSE3) += common/x86/mask_sse3.asm
+endif
+
+VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/filter_sse4.c
+ifeq ($(HAVE_SSE4_1),yes)
+vp9/common/x86/filter_sse4.c.o: CFLAGS += -msse4
+endif
+
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/filter_sse2.c
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/sadmxn_x86.c
+ifeq ($(HAVE_SSE2),yes)
+vp9/common/x86/filter_sse2.c.o: CFLAGS += -msse2
+vp9/common/x86/loopfilter_x86.c.o: CFLAGS += -msse2
+vp9/common/x86/sadmxn_x86.c.o: CFLAGS += -msse2
+endif
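+# The per-object CFLAGS appends above let the intrinsics files build
+# with -msse2/-msse4 while the rest of the library keeps the baseline
+# ISA; whether that code actually runs is decided at run time (see
+# vp9_arch_x86_encoder_init).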
+
+VP9_COMMON_SRCS-$(ARCH_ARM)  += common/arm/arm_systemdependent.c
+VP9_COMMON_SRCS-$(ARCH_ARM)  += common/arm/bilinearfilter_arm.c
+VP9_COMMON_SRCS-$(ARCH_ARM)  += common/arm/bilinearfilter_arm.h
+VP9_COMMON_SRCS-$(ARCH_ARM)  += common/arm/filter_arm.c
+VP9_COMMON_SRCS-$(ARCH_ARM)  += common/arm/idct_arm.h
+VP9_COMMON_SRCS-$(ARCH_ARM)  += common/arm/loopfilter_arm.c
+VP9_COMMON_SRCS-$(ARCH_ARM)  += common/arm/loopfilter_arm.h
+VP9_COMMON_SRCS-$(ARCH_ARM)  += common/arm/recon_arm.h
+VP9_COMMON_SRCS-$(ARCH_ARM)  += common/arm/reconintra_arm.c
+VP9_COMMON_SRCS-$(ARCH_ARM)  += common/arm/subpixel_arm.h
+
+# common (armv6)
+VP9_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/bilinearfilter_v6$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/copymem8x4_v6$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/copymem8x8_v6$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/copymem16x16_v6$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/dc_only_idct_add_v6$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/iwalsh_v6$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/filter_v6$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/idct_v6$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/loopfilter_v6$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/recon_v6$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/simpleloopfilter_v6$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/sixtappredict8x4_v6$(ASM)
+
+# common (neon)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/bilinearpredict4x4_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/bilinearpredict8x4_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/bilinearpredict8x8_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/bilinearpredict16x16_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/copymem8x4_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/copymem8x8_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/copymem16x16_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/dc_only_idct_add_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/iwalsh_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/loopfilter_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/loopfiltersimplehorizontaledge_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/loopfiltersimpleverticaledge_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/mbloopfilter_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/recon2b_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/recon4b_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/reconb_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/shortidct4x4llm_1_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/shortidct4x4llm_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/sixtappredict4x4_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/sixtappredict8x4_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/sixtappredict8x8_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/sixtappredict16x16_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/recon16x16mb_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/buildintrapredictorsmby_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/save_neon_reg$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/recon_neon.c
--- /dev/null
+++ b/vp9/vp9_cx_iface.c
@@ -1,0 +1,1169 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx/vpx_codec.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx_version.h"
+#include "vp9/encoder/onyx_int.h"
+#include "vpx/vp8e.h"
+#include "vp9/encoder/firstpass.h"
+#include "vp9/common/onyx.h"
+#include <stdlib.h>
+#include <string.h>
+
+/* This value is a sentinel for determining whether the user has set a mode
+ * directly through the deprecated VP8E_SET_ENCODING_MODE control.
+ */
+#define NO_MODE_SET 255
+
+struct vp8_extracfg {
+  struct vpx_codec_pkt_list *pkt_list;
+  vp8e_encoding_mode          encoding_mode;               /** best, good, realtime */
+  int                         cpu_used;                    /** available cpu percentage in 1/16 */
+  unsigned int                enable_auto_alt_ref;         /** whether the encoder may use an alternate reference frame */
+  unsigned int                noise_sensitivity;
+  unsigned int                Sharpness;
+  unsigned int                static_thresh;
+  unsigned int                token_partitions;
+  unsigned int                arnr_max_frames;    /* alt_ref Noise Reduction Max Frame Count */
+  unsigned int                arnr_strength;    /* alt_ref Noise Reduction Strength */
+  unsigned int                arnr_type;        /* alt_ref filter type */
+  unsigned int                experimental;
+  vp8e_tuning                 tuning;
+  unsigned int                cq_level;         /* constrained quality level */
+  unsigned int                rc_max_intra_bitrate_pct;
+
+};
+
+struct extraconfig_map {
+  int                 usage;
+  struct vp8_extracfg cfg;
+};
+
+static const struct extraconfig_map extracfg_map[] = {
+  {
+    0,
+    {
+      NULL,
+      VP8_BEST_QUALITY_ENCODING,  /* Encoding Mode */
+      0,                          /* cpu_used      */
+      0,                          /* enable_auto_alt_ref */
+      0,                          /* noise_sensitivity */
+      0,                          /* Sharpness */
+      0,                          /* static_thresh */
+      VP8_ONE_TOKENPARTITION,     /* token_partitions */
+      0,                          /* arnr_max_frames */
+      3,                          /* arnr_strength */
+      3,                          /* arnr_type*/
+      0,                          /* experimental mode */
+      0,                          /* tuning*/
+      10,                         /* cq_level */
+      0,                          /* rc_max_intra_bitrate_pct */
+    }
+  }
+};
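+/* The defaults above are keyed on the usage value requested at init
+ * time; presumably the init path scans this map for a matching entry,
+ * as the VP8 wrapper this file derives from does. */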
+
+struct vpx_codec_alg_priv {
+  vpx_codec_priv_t        base;
+  vpx_codec_enc_cfg_t     cfg;
+  struct vp8_extracfg     vp8_cfg;
+  VP9_CONFIG              oxcf;
+  VP9_PTR             cpi;
+  unsigned char          *cx_data;
+  unsigned int            cx_data_sz;
+  vpx_image_t             preview_img;
+  unsigned int            next_frame_flag;
+  vp8_postproc_cfg_t      preview_ppcfg;
+  vpx_codec_pkt_list_decl(64) pkt_list;              // sized to accommodate the maximum number of lagged frames allowed
+  int                         deprecated_mode;
+  unsigned int                fixed_kf_cntr;
+};
+
+
+static vpx_codec_err_t
+update_error_state(vpx_codec_alg_priv_t                 *ctx,
+                   const struct vpx_internal_error_info *error) {
+  vpx_codec_err_t res;
+
+  if ((res = error->error_code))
+    ctx->base.err_detail = error->has_detail
+                           ? error->detail
+                           : NULL;
+
+  return res;
+}
+
+
+#undef ERROR
+#define ERROR(str) do {\
+    ctx->base.err_detail = str;\
+    return VPX_CODEC_INVALID_PARAM;\
+  } while(0)
+
+#define RANGE_CHECK(p,memb,lo,hi) do {\
+    if(!((p)->memb >= (lo) && (p)->memb <= (hi))) \
+      ERROR(#memb " out of range ["#lo".."#hi"]");\
+  } while(0)
+
+#define RANGE_CHECK_HI(p,memb,hi) do {\
+    if(!((p)->memb <= (hi))) \
+      ERROR(#memb " out of range [.."#hi"]");\
+  } while(0)
+
+#define RANGE_CHECK_LO(p,memb,lo) do {\
+    if(!((p)->memb >= (lo))) \
+      ERROR(#memb " out of range ["#lo"..]");\
+  } while(0)
+
+#define RANGE_CHECK_BOOL(p,memb) do {\
+    if(!!((p)->memb) != (p)->memb) ERROR(#memb " expected boolean");\
+  } while(0)
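+/* Example: RANGE_CHECK_HI(cfg, rc_max_quantizer, 63) tests
+ * cfg->rc_max_quantizer <= 63 and, on failure, sets err_detail to
+ * "rc_max_quantizer out of range [..63]" and returns
+ * VPX_CODEC_INVALID_PARAM via ERROR(). */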
+
+static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t      *ctx,
+                                       const vpx_codec_enc_cfg_t *cfg,
+                                       const struct vp8_extracfg *vp8_cfg) {
+  RANGE_CHECK(cfg, g_w,                   1, 16383); /* 14 bits available */
+  RANGE_CHECK(cfg, g_h,                   1, 16383); /* 14 bits available */
+  RANGE_CHECK(cfg, g_timebase.den,        1, 1000000000);
+  RANGE_CHECK(cfg, g_timebase.num,        1, cfg->g_timebase.den);
+  RANGE_CHECK_HI(cfg, g_profile,          3);
+  RANGE_CHECK_HI(cfg, rc_max_quantizer,   63);
+  RANGE_CHECK_HI(cfg, rc_min_quantizer,   cfg->rc_max_quantizer);
+  RANGE_CHECK_HI(cfg, g_threads,          64);
+  RANGE_CHECK_HI(cfg, g_lag_in_frames,    MAX_LAG_BUFFERS);
+  RANGE_CHECK(cfg, rc_end_usage,          VPX_VBR, VPX_CQ);
+  RANGE_CHECK_HI(cfg, rc_undershoot_pct,  1000);
+  RANGE_CHECK_HI(cfg, rc_overshoot_pct,   1000);
+  RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100);
+  RANGE_CHECK(cfg, kf_mode,               VPX_KF_DISABLED, VPX_KF_AUTO);
+  // RANGE_CHECK_BOOL(cfg,                 g_delete_firstpassfile);
+  RANGE_CHECK_BOOL(cfg,                   rc_resize_allowed);
+  RANGE_CHECK_HI(cfg, rc_dropframe_thresh,   100);
+  RANGE_CHECK_HI(cfg, rc_resize_up_thresh,   100);
+  RANGE_CHECK_HI(cfg, rc_resize_down_thresh, 100);
+  RANGE_CHECK(cfg,        g_pass,         VPX_RC_ONE_PASS, VPX_RC_LAST_PASS);
+
+  /* VP8 does not support a lower bound on the keyframe interval in
+   * automatic keyframe placement mode.
+   */
+  if (cfg->kf_mode != VPX_KF_DISABLED && cfg->kf_min_dist != cfg->kf_max_dist
+      && cfg->kf_min_dist > 0)
+    ERROR("kf_min_dist not supported in auto mode, use 0 "
+          "or kf_max_dist instead.");
+
+  RANGE_CHECK_BOOL(vp8_cfg,               enable_auto_alt_ref);
+  RANGE_CHECK(vp8_cfg, cpu_used,           -16, 16);
+
+  RANGE_CHECK(vp8_cfg, encoding_mode,      VP8_BEST_QUALITY_ENCODING, VP8_REAL_TIME_ENCODING);
+  RANGE_CHECK_HI(vp8_cfg, noise_sensitivity,  6);
+
+  RANGE_CHECK(vp8_cfg, token_partitions,   VP8_ONE_TOKENPARTITION, VP8_EIGHT_TOKENPARTITION);
+  RANGE_CHECK_HI(vp8_cfg, Sharpness,       7);
+  RANGE_CHECK(vp8_cfg, arnr_max_frames, 0, 15);
+  RANGE_CHECK_HI(vp8_cfg, arnr_strength,   6);
+  RANGE_CHECK(vp8_cfg, arnr_type,       1, 3);
+  RANGE_CHECK(vp8_cfg, cq_level, 0, 63);
+
+  if (cfg->g_pass == VPX_RC_LAST_PASS) {
+    size_t           packet_sz = sizeof(FIRSTPASS_STATS);
+    int              n_packets = cfg->rc_twopass_stats_in.sz / packet_sz;
+    FIRSTPASS_STATS *stats;
+
+    if (!cfg->rc_twopass_stats_in.buf)
+      ERROR("rc_twopass_stats_in.buf not set.");
+
+    if (cfg->rc_twopass_stats_in.sz % packet_sz)
+      ERROR("rc_twopass_stats_in.sz indicates truncated packet.");
+
+    if (cfg->rc_twopass_stats_in.sz < 2 * packet_sz)
+      ERROR("rc_twopass_stats_in requires at least two packets.");
+
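+    /* The final packet in the stats buffer is the end-of-stream
+     * summary; its count field records the number of preceding frame
+     * packets and is cross-checked against the buffer size to detect
+     * truncation. */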
+    stats = (void *)((char *)cfg->rc_twopass_stats_in.buf
+                     + (n_packets - 1) * packet_sz);
+
+    if ((int)(stats->count + 0.5) != n_packets - 1)
+      ERROR("rc_twopass_stats_in missing EOS stats packet");
+  }
+
+  return VPX_CODEC_OK;
+}
+
+
+static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx,
+                                    const vpx_image_t    *img) {
+  switch (img->fmt) {
+    case VPX_IMG_FMT_YV12:
+    case VPX_IMG_FMT_I420:
+    case VPX_IMG_FMT_VPXI420:
+    case VPX_IMG_FMT_VPXYV12:
+      break;
+    default:
+      ERROR("Invalid image format. Only YV12 and I420 images are supported");
+  }
+
+  if ((img->d_w != ctx->cfg.g_w) || (img->d_h != ctx->cfg.g_h))
+    ERROR("Image size must match encoder init configuration size");
+
+  return VPX_CODEC_OK;
+}
+
+
+static vpx_codec_err_t set_vp8e_config(VP9_CONFIG *oxcf,
+                                       vpx_codec_enc_cfg_t cfg,
+                                       struct vp8_extracfg vp8_cfg) {
+  oxcf->Version               = cfg.g_profile;
+  oxcf->Version              |= vp8_cfg.experimental ? 0x4 : 0;
+
+  oxcf->Width                 = cfg.g_w;
+  oxcf->Height                = cfg.g_h;
+  /* Derive the frame rate from the timebase; if the result is
+   * implausibly high (> 180 fps), fall back to 30. */
+  oxcf->frame_rate             = (double)(cfg.g_timebase.den) / (double)(cfg.g_timebase.num);
+
+  if (oxcf->frame_rate > 180) {
+    oxcf->frame_rate = 30;
+  }
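+  /* e.g. an NTSC {1001, 30000} timebase yields 30000 / 1001 = 29.97 fps,
+   * while a microsecond {1, 1000000} timebase would trip the clamp above
+   * and fall back to 30 fps.
+   */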
+
+  switch (cfg.g_pass) {
+    case VPX_RC_ONE_PASS:
+      oxcf->Mode = MODE_BESTQUALITY;
+      break;
+    case VPX_RC_FIRST_PASS:
+      oxcf->Mode = MODE_FIRSTPASS;
+      break;
+    case VPX_RC_LAST_PASS:
+      oxcf->Mode = MODE_SECONDPASS_BEST;
+      break;
+  }
+
+  if (cfg.g_pass == VPX_RC_FIRST_PASS) {
+    oxcf->allow_lag              = 0;
+    oxcf->lag_in_frames           = 0;
+  } else {
+    oxcf->allow_lag              = (cfg.g_lag_in_frames) > 0;
+    oxcf->lag_in_frames           = cfg.g_lag_in_frames;
+  }
+
+  // Only VBR is supported for now.
+  // The CBR code has been deprecated for the experimental phase.
+  // CQ mode is not yet tested.
+  oxcf->end_usage          = USAGE_LOCAL_FILE_PLAYBACK;
+  /*if (cfg.rc_end_usage == VPX_CQ)
+      oxcf->end_usage      = USAGE_CONSTRAINED_QUALITY;
+  else
+      oxcf->end_usage      = USAGE_LOCAL_FILE_PLAYBACK;*/
+
+  oxcf->target_bandwidth       = cfg.rc_target_bitrate;
+  oxcf->rc_max_intra_bitrate_pct = vp8_cfg.rc_max_intra_bitrate_pct;
+
+  oxcf->best_allowed_q          = cfg.rc_min_quantizer;
+  oxcf->worst_allowed_q         = cfg.rc_max_quantizer;
+  oxcf->cq_level                = vp8_cfg.cq_level;
+  oxcf->fixed_q = -1;
+
+  oxcf->under_shoot_pct         = cfg.rc_undershoot_pct;
+  oxcf->over_shoot_pct          = cfg.rc_overshoot_pct;
+
+  oxcf->maximum_buffer_size     = cfg.rc_buf_sz;
+  oxcf->starting_buffer_level   = cfg.rc_buf_initial_sz;
+  oxcf->optimal_buffer_level    = cfg.rc_buf_optimal_sz;
+
+  oxcf->two_pass_vbrbias        = cfg.rc_2pass_vbr_bias_pct;
+  oxcf->two_pass_vbrmin_section  = cfg.rc_2pass_vbr_minsection_pct;
+  oxcf->two_pass_vbrmax_section  = cfg.rc_2pass_vbr_maxsection_pct;
+
+  oxcf->auto_key               = cfg.kf_mode == VPX_KF_AUTO
+                                 && cfg.kf_min_dist != cfg.kf_max_dist;
+  // oxcf->kf_min_dist         = cfg.kf_min_dis;
+  oxcf->key_freq               = cfg.kf_max_dist;
+
+  // oxcf->delete_first_pass_file = cfg.g_delete_firstpassfile;
+  // strcpy(oxcf->first_pass_file, cfg.g_firstpass_file);
+
+  oxcf->cpu_used               =  vp8_cfg.cpu_used;
+  oxcf->encode_breakout        =  vp8_cfg.static_thresh;
+  oxcf->play_alternate         =  vp8_cfg.enable_auto_alt_ref;
+  oxcf->noise_sensitivity      =  vp8_cfg.noise_sensitivity;
+  oxcf->Sharpness             =  vp8_cfg.Sharpness;
+
+  oxcf->two_pass_stats_in        =  cfg.rc_twopass_stats_in;
+  oxcf->output_pkt_list         =  vp8_cfg.pkt_list;
+
+  oxcf->arnr_max_frames        =  vp8_cfg.arnr_max_frames;
+  oxcf->arnr_strength          =  vp8_cfg.arnr_strength;
+  oxcf->arnr_type              =  vp8_cfg.arnr_type;
+
+  oxcf->tuning = vp8_cfg.tuning;
+
+#if CONFIG_LOSSLESS
+  oxcf->lossless = cfg.lossless;
+#endif
+
+  /*
+      printf("Current VP8 Settings: \n");
+      printf("target_bandwidth: %d\n", oxcf->target_bandwidth);
+      printf("noise_sensitivity: %d\n", oxcf->noise_sensitivity);
+      printf("Sharpness: %d\n",    oxcf->Sharpness);
+      printf("cpu_used: %d\n",  oxcf->cpu_used);
+      printf("Mode: %d\n",     oxcf->Mode);
+      printf("delete_first_pass_file: %d\n",  oxcf->delete_first_pass_file);
+      printf("auto_key: %d\n",  oxcf->auto_key);
+      printf("key_freq: %d\n", oxcf->key_freq);
+      printf("end_usage: %d\n", oxcf->end_usage);
+      printf("under_shoot_pct: %d\n", oxcf->under_shoot_pct);
+      printf("over_shoot_pct: %d\n", oxcf->over_shoot_pct);
+      printf("starting_buffer_level: %d\n", oxcf->starting_buffer_level);
+      printf("optimal_buffer_level: %d\n",  oxcf->optimal_buffer_level);
+      printf("maximum_buffer_size: %d\n", oxcf->maximum_buffer_size);
+      printf("fixed_q: %d\n",  oxcf->fixed_q);
+      printf("worst_allowed_q: %d\n", oxcf->worst_allowed_q);
+      printf("best_allowed_q: %d\n", oxcf->best_allowed_q);
+      printf("two_pass_vbrbias: %d\n",  oxcf->two_pass_vbrbias);
+      printf("two_pass_vbrmin_section: %d\n", oxcf->two_pass_vbrmin_section);
+      printf("two_pass_vbrmax_section: %d\n", oxcf->two_pass_vbrmax_section);
+      printf("allow_lag: %d\n", oxcf->allow_lag);
+      printf("lag_in_frames: %d\n", oxcf->lag_in_frames);
+      printf("play_alternate: %d\n", oxcf->play_alternate);
+      printf("Version: %d\n", oxcf->Version);
+      printf("encode_breakout: %d\n", oxcf->encode_breakout);
+  */
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t       *ctx,
+                                       const vpx_codec_enc_cfg_t  *cfg) {
+  vpx_codec_err_t res;
+
+  if ((cfg->g_w != ctx->cfg.g_w) || (cfg->g_h != ctx->cfg.g_h))
+    ERROR("Cannot change width or height after initialization");
+
+  /* Prevent increasing lag_in_frames. This check is stricter than it
+   * needs to be -- the real limit is the initial lag_in_frames value,
+   * but we only track the last successful config, not the initial one.
+   */
+  if ((cfg->g_lag_in_frames > ctx->cfg.g_lag_in_frames))
+    ERROR("Cannot increase lag_in_frames");
+
+  res = validate_config(ctx, cfg, &ctx->vp8_cfg);
+
+  if (!res) {
+    ctx->cfg = *cfg;
+    set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg);
+    vp9_change_config(ctx->cpi, &ctx->oxcf);
+  }
+
+  return res;
+}
+
+
+int vp9_reverse_trans(int q);
+
+
+static vpx_codec_err_t get_param(vpx_codec_alg_priv_t *ctx,
+                                 int                   ctrl_id,
+                                 va_list               args) {
+  void *arg = va_arg(args, void *);
+
+#define MAP(id, var) case id: *(RECAST(id, arg)) = var; break
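+  /* Each MAP(id, var) below expands to "case id: *(RECAST(id, arg)) = var;
+   * break;", copying one codec-side value out through the caller's pointer
+   * (RECAST, defined earlier in this file, supplies the control's expected
+   * pointer type).
+   */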
+
+  if (!arg)
+    return VPX_CODEC_INVALID_PARAM;
+
+  switch (ctrl_id) {
+      MAP(VP8E_GET_LAST_QUANTIZER, vp9_get_quantizer(ctx->cpi));
+      MAP(VP8E_GET_LAST_QUANTIZER_64,
+          vp9_reverse_trans(vp9_get_quantizer(ctx->cpi)));
+  }
+
+  return VPX_CODEC_OK;
+#undef MAP
+}
+
+
+static vpx_codec_err_t set_param(vpx_codec_alg_priv_t *ctx,
+                                 int                   ctrl_id,
+                                 va_list               args) {
+  vpx_codec_err_t     res  = VPX_CODEC_OK;
+  struct vp8_extracfg xcfg = ctx->vp8_cfg;
+
+#define MAP(id, var) case id: var = CAST(id, args); break;
+
+  switch (ctrl_id) {
+      MAP(VP8E_SET_ENCODING_MODE,         ctx->deprecated_mode);
+      MAP(VP8E_SET_CPUUSED,               xcfg.cpu_used);
+      MAP(VP8E_SET_ENABLEAUTOALTREF,      xcfg.enable_auto_alt_ref);
+      MAP(VP8E_SET_NOISE_SENSITIVITY,     xcfg.noise_sensitivity);
+      MAP(VP8E_SET_SHARPNESS,             xcfg.Sharpness);
+      MAP(VP8E_SET_STATIC_THRESHOLD,      xcfg.static_thresh);
+      MAP(VP8E_SET_TOKEN_PARTITIONS,      xcfg.token_partitions);
+
+      MAP(VP8E_SET_ARNR_MAXFRAMES,        xcfg.arnr_max_frames);
+      MAP(VP8E_SET_ARNR_STRENGTH,         xcfg.arnr_strength);
+      MAP(VP8E_SET_ARNR_TYPE,             xcfg.arnr_type);
+      MAP(VP8E_SET_TUNING,                xcfg.tuning);
+      MAP(VP8E_SET_CQ_LEVEL,              xcfg.cq_level);
+      MAP(VP8E_SET_MAX_INTRA_BITRATE_PCT, xcfg.rc_max_intra_bitrate_pct);
+
+  }
+
+  res = validate_config(ctx, &ctx->cfg, &xcfg);
+
+  if (!res) {
+    ctx->vp8_cfg = xcfg;
+    set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg);
+    vp9_change_config(ctx->cpi, &ctx->oxcf);
+  }
+
+  return res;
+#undef MAP
+}
+
+
+static vpx_codec_err_t vp8e_common_init(vpx_codec_ctx_t *ctx,
+                                        int              experimental) {
+  vpx_codec_err_t        res = VPX_CODEC_OK;
+  struct vpx_codec_alg_priv *priv;
+  vpx_codec_enc_cfg_t       *cfg;
+  unsigned int               i;
+
+  VP9_PTR optr;
+
+  if (!ctx->priv) {
+    priv = calloc(1, sizeof(struct vpx_codec_alg_priv));
+
+    if (!priv) {
+      return VPX_CODEC_MEM_ERROR;
+    }
+
+    ctx->priv = &priv->base;
+    ctx->priv->sz = sizeof(*ctx->priv);
+    ctx->priv->iface = ctx->iface;
+    ctx->priv->alg_priv = priv;
+    ctx->priv->init_flags = ctx->init_flags;
+
+    if (ctx->config.enc) {
+      /* Update the reference to the config structure to an
+       * internal copy.
+       */
+      ctx->priv->alg_priv->cfg = *ctx->config.enc;
+      ctx->config.enc = &ctx->priv->alg_priv->cfg;
+    }
+
+    cfg =  &ctx->priv->alg_priv->cfg;
+
+    /* Select the extra vp8 configuration table based on the current
+     * usage value. If the current usage value isn't found, use the
+     * values for usage case 0.
+     */
+    for (i = 0;
+         extracfg_map[i].usage && extracfg_map[i].usage != cfg->g_usage;
+         i++);
+
+    priv->vp8_cfg = extracfg_map[i].cfg;
+    priv->vp8_cfg.pkt_list = &priv->pkt_list.head;
+    priv->vp8_cfg.experimental = experimental;
+
+    priv->cx_data_sz = priv->cfg.g_w * priv->cfg.g_h * 3 / 2 * 2;
+
+    if (priv->cx_data_sz < 4096) priv->cx_data_sz = 4096;
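+    /* cx_data_sz is sized as two uncompressed I420 frames (w * h * 3/2,
+     * doubled): e.g. 320x240 gives 230400 bytes, with a 4096-byte floor
+     * for very small frames.
+     */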
+
+    priv->cx_data = malloc(priv->cx_data_sz);
+
+    if (!priv->cx_data) {
+      return VPX_CODEC_MEM_ERROR;
+    }
+
+    priv->deprecated_mode = NO_MODE_SET;
+
+    vp9_initialize_enc();
+
+    res = validate_config(priv, &priv->cfg, &priv->vp8_cfg);
+
+    if (!res) {
+      set_vp8e_config(&ctx->priv->alg_priv->oxcf,
+                      ctx->priv->alg_priv->cfg,
+                      ctx->priv->alg_priv->vp8_cfg);
+      optr = vp9_create_compressor(&ctx->priv->alg_priv->oxcf);
+
+      if (!optr)
+        res = VPX_CODEC_MEM_ERROR;
+      else
+        ctx->priv->alg_priv->cpi = optr;
+    }
+  }
+
+  return res;
+}
+
+
+static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx) {
+  return vp8e_common_init(ctx, 0);
+}
+
+
+#if CONFIG_EXPERIMENTAL
+static vpx_codec_err_t vp8e_exp_init(vpx_codec_ctx_t *ctx) {
+  return vp8e_common_init(ctx, 1);
+}
+#endif
+
+
+static vpx_codec_err_t vp8e_destroy(vpx_codec_alg_priv_t *ctx) {
+
+  free(ctx->cx_data);
+  vp9_remove_compressor(&ctx->cpi);
+  free(ctx);
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t image2yuvconfig(const vpx_image_t   *img,
+                                       YV12_BUFFER_CONFIG  *yv12) {
+  vpx_codec_err_t        res = VPX_CODEC_OK;
+  yv12->y_buffer = img->planes[VPX_PLANE_Y];
+  yv12->u_buffer = img->planes[VPX_PLANE_U];
+  yv12->v_buffer = img->planes[VPX_PLANE_V];
+
+  yv12->y_width  = img->d_w;
+  yv12->y_height = img->d_h;
+  yv12->uv_width = (1 + yv12->y_width) / 2;
+  yv12->uv_height = (1 + yv12->y_height) / 2;
+
+  yv12->y_stride = img->stride[VPX_PLANE_Y];
+  yv12->uv_stride = img->stride[VPX_PLANE_U];
+
+  yv12->border  = (img->stride[VPX_PLANE_Y] - img->w) / 2;
+  yv12->clrtype = (img->fmt == VPX_IMG_FMT_VPXI420 ||
+                   img->fmt == VPX_IMG_FMT_VPXYV12); /* REG_YUV == 0 */
+  return res;
+}
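+/* The (1 + y_width) / 2 chroma sizing above rounds up, so odd luma
+ * dimensions still get a full chroma sample: e.g. a 5-pixel-wide luma
+ * plane maps to a 3-pixel-wide chroma plane.
+ */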
+
+static void pick_quickcompress_mode(vpx_codec_alg_priv_t  *ctx,
+                                    unsigned long          duration,
+                                    unsigned long          deadline) {
+  unsigned int new_qc;
+
+  /* Use best quality mode if no deadline is given. */
+  if (deadline)
+    new_qc = MODE_GOODQUALITY;
+  else
+    new_qc = MODE_BESTQUALITY;
+
+  if (ctx->cfg.g_pass == VPX_RC_FIRST_PASS)
+    new_qc = MODE_FIRSTPASS;
+  else if (ctx->cfg.g_pass == VPX_RC_LAST_PASS)
+    new_qc = (new_qc == MODE_BESTQUALITY)
+             ? MODE_SECONDPASS_BEST
+             : MODE_SECONDPASS;
+
+  if (ctx->oxcf.Mode != new_qc) {
+    ctx->oxcf.Mode = new_qc;
+    vp9_change_config(ctx->cpi, &ctx->oxcf);
+  }
+}
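+/* The mapping above, summarized (pass / deadline -> internal mode):
+ *   one pass,   deadline == 0  -> MODE_BESTQUALITY
+ *   one pass,   deadline != 0  -> MODE_GOODQUALITY
+ *   first pass, any deadline   -> MODE_FIRSTPASS
+ *   last pass,  deadline == 0  -> MODE_SECONDPASS_BEST
+ *   last pass,  deadline != 0  -> MODE_SECONDPASS
+ */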
+
+
+static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t  *ctx,
+                                   const vpx_image_t     *img,
+                                   vpx_codec_pts_t        pts,
+                                   unsigned long          duration,
+                                   vpx_enc_frame_flags_t  flags,
+                                   unsigned long          deadline) {
+  vpx_codec_err_t res = VPX_CODEC_OK;
+
+  if (img)
+    res = validate_img(ctx, img);
+
+  pick_quickcompress_mode(ctx, duration, deadline);
+  vpx_codec_pkt_list_init(&ctx->pkt_list);
+
+  /* Handle Flags */
+  if (((flags & VP8_EFLAG_NO_UPD_GF) && (flags & VP8_EFLAG_FORCE_GF))
+      || ((flags & VP8_EFLAG_NO_UPD_ARF) && (flags & VP8_EFLAG_FORCE_ARF))) {
+    ctx->base.err_detail = "Conflicting flags.";
+    return VPX_CODEC_INVALID_PARAM;
+  }
+
+  if (flags & (VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_REF_GF
+               | VP8_EFLAG_NO_REF_ARF)) {
+    int ref = 7;
+
+    if (flags & VP8_EFLAG_NO_REF_LAST)
+      ref ^= VP9_LAST_FLAG;
+
+    if (flags & VP8_EFLAG_NO_REF_GF)
+      ref ^= VP9_GOLD_FLAG;
+
+    if (flags & VP8_EFLAG_NO_REF_ARF)
+      ref ^= VP9_ALT_FLAG;
+
+    vp9_use_as_reference(ctx->cpi, ref);
+  }
+
+  if (flags & (VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF
+               | VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_FORCE_GF
+               | VP8_EFLAG_FORCE_ARF)) {
+    int upd = 7;
+
+    if (flags & VP8_EFLAG_NO_UPD_LAST)
+      upd ^= VP9_LAST_FLAG;
+
+    if (flags & VP8_EFLAG_NO_UPD_GF)
+      upd ^= VP9_GOLD_FLAG;
+
+    if (flags & VP8_EFLAG_NO_UPD_ARF)
+      upd ^= VP9_ALT_FLAG;
+
+    vp9_update_reference(ctx->cpi, upd);
+  }
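+  /* Both masks above start at 7 -- VP9_LAST_FLAG | VP9_GOLD_FLAG |
+   * VP9_ALT_FLAG, assuming the three flags occupy bits 0..2 -- and each
+   * NO_* request XORs its bit away: e.g. VP8_EFLAG_NO_REF_GF alone
+   * leaves ref == VP9_LAST_FLAG | VP9_ALT_FLAG.
+   */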
+
+  if (flags & VP8_EFLAG_NO_UPD_ENTROPY) {
+    vp9_update_entropy(ctx->cpi, 0);
+  }
+
+  /* Handle fixed keyframe intervals */
+  if (ctx->cfg.kf_mode == VPX_KF_AUTO
+      && ctx->cfg.kf_min_dist == ctx->cfg.kf_max_dist) {
+    if (++ctx->fixed_kf_cntr > ctx->cfg.kf_min_dist) {
+      flags |= VPX_EFLAG_FORCE_KF;
+      ctx->fixed_kf_cntr = 1;
+    }
+  }
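+  /* e.g. kf_min_dist == kf_max_dist == 30 makes the counter above force
+   * VPX_EFLAG_FORCE_KF roughly every 30 frames.
+   */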
+
+  /* Feed the frame (or a flush request, when img is NULL) to the
+   * encoder and drain any compressed packets it produces. */
+  if (!res && ctx->cpi) {
+    unsigned int lib_flags;
+    YV12_BUFFER_CONFIG sd;
+    int64_t dst_time_stamp, dst_end_time_stamp;
+    unsigned long size, cx_data_sz;
+    unsigned char *cx_data;
+
+    /* Set up internal flags */
+    if (ctx->base.init_flags & VPX_CODEC_USE_PSNR)
+      ((VP9_COMP *)ctx->cpi)->b_calculate_psnr = 1;
+
+    // if (ctx->base.init_flags & VPX_CODEC_USE_OUTPUT_PARTITION)
+    //    ((VP9_COMP *)ctx->cpi)->output_partition = 1;
+
+    /* Convert API flags to internal codec lib flags */
+    lib_flags = (flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
+
+    /* vp8 uses 10,000,000 ticks/second as its time stamp resolution */
+    dst_time_stamp    = pts * 10000000 * ctx->cfg.g_timebase.num / ctx->cfg.g_timebase.den;
+    dst_end_time_stamp = (pts + duration) * 10000000 * ctx->cfg.g_timebase.num / ctx->cfg.g_timebase.den;
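+    /* e.g. with a {1, 30} timebase, pts == 30 converts to
+     * 30 * 10000000 * 1 / 30 = 10000000 ticks, i.e. exactly one second.
+     */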
+
+    if (img != NULL) {
+      res = image2yuvconfig(img, &sd);
+
+      if (vp9_receive_raw_frame(ctx->cpi, ctx->next_frame_flag | lib_flags,
+                                &sd, dst_time_stamp, dst_end_time_stamp)) {
+        VP9_COMP *cpi = (VP9_COMP *)ctx->cpi;
+        res = update_error_state(ctx, &cpi->common.error);
+      }
+
+      /* reset for next frame */
+      ctx->next_frame_flag = 0;
+    }
+
+    cx_data = ctx->cx_data;
+    cx_data_sz = ctx->cx_data_sz;
+    lib_flags = 0;
+
+    while (cx_data_sz >= ctx->cx_data_sz / 2 &&
+           -1 != vp9_get_compressed_data(ctx->cpi, &lib_flags, &size,
+                                         cx_data, &dst_time_stamp,
+                                         &dst_end_time_stamp, !img)) {
+      if (size) {
+        vpx_codec_pts_t    round, delta;
+        vpx_codec_cx_pkt_t pkt;
+        VP9_COMP *cpi = (VP9_COMP *)ctx->cpi;
+
+        /* Add the frame packet to the list of returned packets. */
+        round = 1000000 * ctx->cfg.g_timebase.num / 2 - 1;
+        delta = (dst_end_time_stamp - dst_time_stamp);
+        pkt.kind = VPX_CODEC_CX_FRAME_PKT;
+        pkt.data.frame.pts =
+          (dst_time_stamp * ctx->cfg.g_timebase.den + round)
+          / ctx->cfg.g_timebase.num / 10000000;
+        pkt.data.frame.duration =
+          (delta * ctx->cfg.g_timebase.den + round)
+          / ctx->cfg.g_timebase.num / 10000000;
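+        /* This inverts the 10MHz conversion done on input: e.g. with a
+         * {1, 30} timebase and dst_time_stamp == 10000000, round is
+         * 499999 and pts = (10000000 * 30 + 499999) / 1 / 10000000 == 30,
+         * recovering the original timebase-unit timestamp.
+         */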
+        pkt.data.frame.flags = lib_flags << 16;
+
+        if (lib_flags & FRAMEFLAGS_KEY)
+          pkt.data.frame.flags |= VPX_FRAME_IS_KEY;
+
+        if (!cpi->common.show_frame) {
+          pkt.data.frame.flags |= VPX_FRAME_IS_INVISIBLE;
+
+          // This timestamp should be as close as possible to the prior
+          // frame's PTS, so a decoder that schedules by PTS starts work
+          // right after the previous frame was decoded.
+          // Invisible frames have no duration.
+          pkt.data.frame.pts = ((cpi->last_time_stamp_seen
+                                 * ctx->cfg.g_timebase.den + round)
+                                / ctx->cfg.g_timebase.num / 10000000) + 1;
+          pkt.data.frame.duration = 0;
+        }
+
+        if (cpi->droppable)
+          pkt.data.frame.flags |= VPX_FRAME_IS_DROPPABLE;
+
+        /*if (cpi->output_partition)
+        {
+            int i;
+            const int num_partitions = 1;
+
+            pkt.data.frame.flags |= VPX_FRAME_IS_FRAGMENT;
+
+            for (i = 0; i < num_partitions; ++i)
+            {
+                pkt.data.frame.buf = cx_data;
+                pkt.data.frame.sz = cpi->partition_sz[i];
+                pkt.data.frame.partition_id = i;
+                // don't set the fragment bit for the last partition
+                if (i == (num_partitions - 1))
+                    pkt.data.frame.flags &= ~VPX_FRAME_IS_FRAGMENT;
+                vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
+                cx_data += cpi->partition_sz[i];
+                cx_data_sz -= cpi->partition_sz[i];
+            }
+        }
+        else*/
+        {
+          pkt.data.frame.buf = cx_data;
+          pkt.data.frame.sz  = size;
+          pkt.data.frame.partition_id = -1;
+          vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
+          cx_data += size;
+          cx_data_sz -= size;
+        }
+
+        // printf("timestamp: %lld, duration: %d\n", pkt->data.frame.pts, pkt->data.frame.duration);
+      }
+    }
+  }
+
+  return res;
+}
+
+
+static const vpx_codec_cx_pkt_t *vp8e_get_cxdata(vpx_codec_alg_priv_t  *ctx,
+                                                 vpx_codec_iter_t      *iter) {
+  return vpx_codec_pkt_list_get(&ctx->pkt_list.head, iter);
+}
+
+static vpx_codec_err_t vp8e_set_reference(vpx_codec_alg_priv_t *ctx,
+                                          int ctr_id,
+                                          va_list args) {
+  vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
+
+  if (data) {
+    vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
+    YV12_BUFFER_CONFIG sd;
+
+    image2yuvconfig(&frame->img, &sd);
+    vp9_set_reference_enc(ctx->cpi, frame->frame_type, &sd);
+    return VPX_CODEC_OK;
+  } else
+    return VPX_CODEC_INVALID_PARAM;
+
+}
+
+static vpx_codec_err_t vp8e_get_reference(vpx_codec_alg_priv_t *ctx,
+                                          int ctr_id,
+                                          va_list args) {
+
+  vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
+
+  if (data) {
+    vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
+    YV12_BUFFER_CONFIG sd;
+
+    image2yuvconfig(&frame->img, &sd);
+    vp9_get_reference_enc(ctx->cpi, frame->frame_type, &sd);
+    return VPX_CODEC_OK;
+  } else
+    return VPX_CODEC_INVALID_PARAM;
+}
+
+static vpx_codec_err_t vp8e_set_previewpp(vpx_codec_alg_priv_t *ctx,
+                                          int ctr_id,
+                                          va_list args) {
+#if CONFIG_POSTPROC
+  vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *);
+  (void)ctr_id;
+
+  if (data) {
+    ctx->preview_ppcfg = *((vp8_postproc_cfg_t *)data);
+    return VPX_CODEC_OK;
+  } else
+    return VPX_CODEC_INVALID_PARAM;
+#else
+  (void)ctx;
+  (void)ctr_id;
+  (void)args;
+  return VPX_CODEC_INCAPABLE;
+#endif
+}
+
+
+static vpx_image_t *vp8e_get_preview(vpx_codec_alg_priv_t *ctx) {
+
+  YV12_BUFFER_CONFIG sd;
+  vp9_ppflags_t flags = {0};
+
+  if (ctx->preview_ppcfg.post_proc_flag) {
+    flags.post_proc_flag        = ctx->preview_ppcfg.post_proc_flag;
+    flags.deblocking_level      = ctx->preview_ppcfg.deblocking_level;
+    flags.noise_level           = ctx->preview_ppcfg.noise_level;
+  }
+
+  if (0 == vp9_get_preview_raw_frame(ctx->cpi, &sd, &flags)) {
+
+    /*
+    vpx_img_wrap(&ctx->preview_img, VPX_IMG_FMT_YV12,
+        sd.y_width + 2*VP8BORDERINPIXELS,
+        sd.y_height + 2*VP8BORDERINPIXELS,
+        1,
+        sd.buffer_alloc);
+    vpx_img_set_rect(&ctx->preview_img,
+        VP8BORDERINPIXELS, VP8BORDERINPIXELS,
+        sd.y_width, sd.y_height);
+        */
+
+    ctx->preview_img.bps = 12;
+    ctx->preview_img.planes[VPX_PLANE_Y] = sd.y_buffer;
+    ctx->preview_img.planes[VPX_PLANE_U] = sd.u_buffer;
+    ctx->preview_img.planes[VPX_PLANE_V] = sd.v_buffer;
+
+    if (sd.clrtype == REG_YUV)
+      ctx->preview_img.fmt = VPX_IMG_FMT_I420;
+    else
+      ctx->preview_img.fmt = VPX_IMG_FMT_VPXI420;
+
+    ctx->preview_img.x_chroma_shift = 1;
+    ctx->preview_img.y_chroma_shift = 1;
+
+    ctx->preview_img.d_w = sd.y_width;
+    ctx->preview_img.d_h = sd.y_height;
+    ctx->preview_img.stride[VPX_PLANE_Y] = sd.y_stride;
+    ctx->preview_img.stride[VPX_PLANE_U] = sd.uv_stride;
+    ctx->preview_img.stride[VPX_PLANE_V] = sd.uv_stride;
+    ctx->preview_img.w   = sd.y_width;
+    ctx->preview_img.h   = sd.y_height;
+
+    return &ctx->preview_img;
+  } else
+    return NULL;
+}
+
+static vpx_codec_err_t vp8e_update_entropy(vpx_codec_alg_priv_t *ctx,
+                                           int ctr_id,
+                                           va_list args) {
+  int update = va_arg(args, int);
+  vp9_update_entropy(ctx->cpi, update);
+  return VPX_CODEC_OK;
+
+}
+
+static vpx_codec_err_t vp8e_update_reference(vpx_codec_alg_priv_t *ctx,
+                                             int ctr_id,
+                                             va_list args) {
+  int update = va_arg(args, int);
+  vp9_update_reference(ctx->cpi, update);
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t vp8e_use_reference(vpx_codec_alg_priv_t *ctx,
+                                          int ctr_id,
+                                          va_list args) {
+  int reference_flag = va_arg(args, int);
+  vp9_use_as_reference(ctx->cpi, reference_flag);
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t vp8e_set_roi_map(vpx_codec_alg_priv_t *ctx,
+                                        int ctr_id,
+                                        va_list args) {
+  vpx_roi_map_t *data = va_arg(args, vpx_roi_map_t *);
+
+  if (data) {
+    vpx_roi_map_t *roi = (vpx_roi_map_t *)data;
+
+    if (!vp9_set_roimap(ctx->cpi, roi->roi_map, roi->rows, roi->cols,
+                        roi->delta_q, roi->delta_lf, roi->static_threshold))
+      return VPX_CODEC_OK;
+    else
+      return VPX_CODEC_INVALID_PARAM;
+  } else
+    return VPX_CODEC_INVALID_PARAM;
+}
+
+
+static vpx_codec_err_t vp8e_set_activemap(vpx_codec_alg_priv_t *ctx,
+                                          int ctr_id,
+                                          va_list args) {
+  vpx_active_map_t *data = va_arg(args, vpx_active_map_t *);
+
+  if (data) {
+
+    vpx_active_map_t *map = (vpx_active_map_t *)data;
+
+    if (!vp9_set_active_map(ctx->cpi, map->active_map, map->rows, map->cols))
+      return VPX_CODEC_OK;
+    else
+      return VPX_CODEC_INVALID_PARAM;
+  } else
+    return VPX_CODEC_INVALID_PARAM;
+}
+
+static vpx_codec_err_t vp8e_set_scalemode(vpx_codec_alg_priv_t *ctx,
+                                          int ctr_id,
+                                          va_list args) {
+
+  vpx_scaling_mode_t *data =  va_arg(args, vpx_scaling_mode_t *);
+
+  if (data) {
+    int res;
+    vpx_scaling_mode_t scalemode = *(vpx_scaling_mode_t *)data;
+    res = vp9_set_internal_size(ctx->cpi, scalemode.h_scaling_mode,
+                                scalemode.v_scaling_mode);
+
+    if (!res) {
+      /* Force the next frame to be a key frame so the new scaling
+       * mode takes effect. */
+      ctx->next_frame_flag |= FRAMEFLAGS_KEY;
+      return VPX_CODEC_OK;
+    } else
+      return VPX_CODEC_INVALID_PARAM;
+  } else
+    return VPX_CODEC_INVALID_PARAM;
+}
+
+
+static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] = {
+  {VP8_SET_REFERENCE,                 vp8e_set_reference},
+  {VP8_COPY_REFERENCE,                vp8e_get_reference},
+  {VP8_SET_POSTPROC,                  vp8e_set_previewpp},
+  {VP8E_UPD_ENTROPY,                  vp8e_update_entropy},
+  {VP8E_UPD_REFERENCE,                vp8e_update_reference},
+  {VP8E_USE_REFERENCE,                vp8e_use_reference},
+  {VP8E_SET_ROI_MAP,                  vp8e_set_roi_map},
+  {VP8E_SET_ACTIVEMAP,                vp8e_set_activemap},
+  {VP8E_SET_SCALEMODE,                vp8e_set_scalemode},
+  {VP8E_SET_ENCODING_MODE,            set_param},
+  {VP8E_SET_CPUUSED,                  set_param},
+  {VP8E_SET_NOISE_SENSITIVITY,        set_param},
+  {VP8E_SET_ENABLEAUTOALTREF,         set_param},
+  {VP8E_SET_SHARPNESS,                set_param},
+  {VP8E_SET_STATIC_THRESHOLD,         set_param},
+  {VP8E_SET_TOKEN_PARTITIONS,         set_param},
+  {VP8E_GET_LAST_QUANTIZER,           get_param},
+  {VP8E_GET_LAST_QUANTIZER_64,        get_param},
+  {VP8E_SET_ARNR_MAXFRAMES,           set_param},
+  {VP8E_SET_ARNR_STRENGTH,            set_param},
+  {VP8E_SET_ARNR_TYPE,                set_param},
+  {VP8E_SET_TUNING,                   set_param},
+  {VP8E_SET_CQ_LEVEL,                 set_param},
+  {VP8E_SET_MAX_INTRA_BITRATE_PCT,    set_param},
+  { -1, NULL},
+};
+
+static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = {
+  {
+    0,
+    {
+      0,                  /* g_usage */
+      0,                  /* g_threads */
+      0,                  /* g_profile */
+
+      320,                /* g_width */
+      240,                /* g_height */
+      {1, 30},            /* g_timebase */
+
+      0,                  /* g_error_resilient */
+
+      VPX_RC_ONE_PASS,    /* g_pass */
+
+      0,                  /* g_lag_in_frames */
+
+      0,                  /* rc_dropframe_thresh */
+      0,                  /* rc_resize_allowed */
+      60,                 /* rc_resize_down_thresh */
+      30,                 /* rc_resize_up_thresh */
+
+      VPX_VBR,            /* rc_end_usage */
+#if VPX_ENCODER_ABI_VERSION > (1 + VPX_CODEC_ABI_VERSION)
+      {0},                /* rc_twopass_stats_in */
+#endif
+      256,                /* rc_target_bitrate */
+      4,                  /* rc_min_quantizer */
+      63,                 /* rc_max_quantizer */
+      100,                /* rc_undershoot_pct */
+      100,                /* rc_overshoot_pct */
+
+      6000,               /* rc_buf_sz */
+      4000,               /* rc_buf_initial_sz */
+      5000,               /* rc_buf_optimal_sz */
+
+      50,                 /* rc_2pass_vbr_bias_pct */
+      0,                  /* rc_2pass_vbr_minsection_pct */
+      400,                /* rc_2pass_vbr_maxsection_pct */
+
+      /* keyframing settings (kf) */
+      VPX_KF_AUTO,        /* kf_mode */
+      0,                  /* kf_min_dist */
+      9999,               /* kf_max_dist */
+
+#if VPX_ENCODER_ABI_VERSION == (1 + VPX_CODEC_ABI_VERSION)
+      1,                  /* g_delete_first_pass_file */
+      "vp8.fpf"           /* first pass filename */
+#endif
+    }
+  },
+  { -1, {NOT_IMPLEMENTED}}
+};
+
+
+#ifndef VERSION_STRING
+#define VERSION_STRING
+#endif
+CODEC_INTERFACE(vpx_codec_vp8_cx) = {
+  "WebM Project VP8 Encoder" VERSION_STRING,
+  VPX_CODEC_INTERNAL_ABI_VERSION,
+  VPX_CODEC_CAP_ENCODER | VPX_CODEC_CAP_PSNR |
+  VPX_CODEC_CAP_OUTPUT_PARTITION,
+  /* vpx_codec_caps_t          caps; */
+  vp8e_init,          /* vpx_codec_init_fn_t       init; */
+  vp8e_destroy,       /* vpx_codec_destroy_fn_t    destroy; */
+  vp8e_ctf_maps,      /* vpx_codec_ctrl_fn_map_t  *ctrl_maps; */
+  NOT_IMPLEMENTED,    /* vpx_codec_get_mmap_fn_t   get_mmap; */
+  NOT_IMPLEMENTED,    /* vpx_codec_set_mmap_fn_t   set_mmap; */
+  {
+    NOT_IMPLEMENTED,    /* vpx_codec_peek_si_fn_t    peek_si; */
+    NOT_IMPLEMENTED,    /* vpx_codec_get_si_fn_t     get_si; */
+    NOT_IMPLEMENTED,    /* vpx_codec_decode_fn_t     decode; */
+    NOT_IMPLEMENTED,    /* vpx_codec_frame_get_fn_t  frame_get; */
+  },
+  {
+    vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t    cfg_maps; */
+    vp8e_encode,        /* vpx_codec_encode_fn_t      encode; */
+    vp8e_get_cxdata,    /* vpx_codec_get_cx_data_fn_t get_cx_data; */
+    vp8e_set_config,
+    NOT_IMPLEMENTED,
+    vp8e_get_preview,
+  } /* encoder functions */
+};
+
+
+#if CONFIG_EXPERIMENTAL
+
+CODEC_INTERFACE(vpx_codec_vp8x_cx) = {
+  "VP8 Experimental Encoder" VERSION_STRING,
+  VPX_CODEC_INTERNAL_ABI_VERSION,
+  VPX_CODEC_CAP_ENCODER | VPX_CODEC_CAP_PSNR,
+  /* vpx_codec_caps_t          caps; */
+  vp8e_exp_init,      /* vpx_codec_init_fn_t       init; */
+  vp8e_destroy,       /* vpx_codec_destroy_fn_t    destroy; */
+  vp8e_ctf_maps,      /* vpx_codec_ctrl_fn_map_t  *ctrl_maps; */
+  NOT_IMPLEMENTED,    /* vpx_codec_get_mmap_fn_t   get_mmap; */
+  NOT_IMPLEMENTED,    /* vpx_codec_set_mmap_fn_t   set_mmap; */
+  {
+    NOT_IMPLEMENTED,    /* vpx_codec_peek_si_fn_t    peek_si; */
+    NOT_IMPLEMENTED,    /* vpx_codec_get_si_fn_t     get_si; */
+    NOT_IMPLEMENTED,    /* vpx_codec_decode_fn_t     decode; */
+    NOT_IMPLEMENTED,    /* vpx_codec_frame_get_fn_t  frame_get; */
+  },
+  {
+    vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t    cfg_maps; */
+    vp8e_encode,        /* vpx_codec_encode_fn_t      encode; */
+    vp8e_get_cxdata,    /* vpx_codec_get_cx_data_fn_t get_cx_data; */
+    vp8e_set_config,
+    NOT_IMPLEMENTED,
+    vp8e_get_preview,
+  } /* encoder functions */
+};
+#endif
+
+
+/*
+ * BEGIN BACKWARDS COMPATIBILITY SHIM.
+ */
+#define FORCE_KEY   2
+static vpx_codec_err_t api1_control(vpx_codec_alg_priv_t *ctx,
+                                    int                   ctrl_id,
+                                    va_list               args) {
+  vpx_codec_ctrl_fn_map_t *entry;
+
+  switch (ctrl_id) {
+    case VP8E_SET_FLUSHFLAG:
+      /* VP8 sample code did VP8E_SET_FLUSHFLAG followed by
+       * vpx_codec_get_cx_data() rather than vpx_codec_encode().
+       */
+      return vp8e_encode(ctx, NULL, 0, 0, 0, 0);
+    case VP8E_SET_FRAMETYPE:
+      ctx->base.enc.tbd |= FORCE_KEY;
+      return VPX_CODEC_OK;
+  }
+
+  for (entry = vp8e_ctf_maps; entry && entry->fn; entry++) {
+    if (!entry->ctrl_id || entry->ctrl_id == ctrl_id) {
+      return entry->fn(ctx, ctrl_id, args);
+    }
+  }
+
+  return VPX_CODEC_ERROR;
+}
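+/* Under the deprecated API, callers drove the encoder roughly like this
+ * (illustrative sketch; write_packet is a hypothetical sink):
+ *
+ *   vpx_codec_control(&ctx, VP8E_SET_FLUSHFLAG, 1);
+ *   while ((pkt = vpx_codec_get_cx_data(&ctx, &iter)) != NULL)
+ *     write_packet(pkt);
+ *
+ * which is why the shim turns the control call itself into an encode of
+ * a NULL frame.
+ */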
+
+
+static vpx_codec_ctrl_fn_map_t api1_ctrl_maps[] = {
+  {0, api1_control},
+  { -1, NULL}
+};
+
+
+static vpx_codec_err_t api1_encode(vpx_codec_alg_priv_t  *ctx,
+                                   const vpx_image_t     *img,
+                                   vpx_codec_pts_t        pts,
+                                   unsigned long          duration,
+                                   vpx_enc_frame_flags_t  flags,
+                                   unsigned long          deadline) {
+  int force = ctx->base.enc.tbd;
+
+  ctx->base.enc.tbd = 0;
+  return vp8e_encode
+         (ctx,
+          img,
+          pts,
+          duration,
+          flags | ((force & FORCE_KEY) ? VPX_EFLAG_FORCE_KF : 0),
+          deadline);
+}
+
+
+vpx_codec_iface_t vpx_enc_vp8_algo = {
+  "WebM Project VP8 Encoder (Deprecated API)" VERSION_STRING,
+  VPX_CODEC_INTERNAL_ABI_VERSION,
+  VPX_CODEC_CAP_ENCODER,
+  /* vpx_codec_caps_t          caps; */
+  vp8e_init,          /* vpx_codec_init_fn_t       init; */
+  vp8e_destroy,       /* vpx_codec_destroy_fn_t    destroy; */
+  api1_ctrl_maps,     /* vpx_codec_ctrl_fn_map_t  *ctrl_maps; */
+  NOT_IMPLEMENTED,    /* vpx_codec_get_mmap_fn_t   get_mmap; */
+  NOT_IMPLEMENTED,    /* vpx_codec_set_mmap_fn_t   set_mmap; */
+  {NOT_IMPLEMENTED},  /* decoder functions */
+  {
+    vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t    cfg_maps; */
+    api1_encode,        /* vpx_codec_encode_fn_t      encode; */
+    vp8e_get_cxdata,    /* vpx_codec_get_cx_data_fn_t get_cx_data; */
+    vp8e_set_config,
+    NOT_IMPLEMENTED,
+    vp8e_get_preview,
+  } /* encoder functions */
+};
--- /dev/null
+++ b/vp9/vp9_dx_iface.c
@@ -1,0 +1,717 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include <string.h>
+#include "vpx/vpx_decoder.h"
+#include "vpx/vp8dx.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx_version.h"
+#include "common/onyxd.h"
+#include "decoder/onyxd_int.h"
+
+#define VP8_CAP_POSTPROC (CONFIG_POSTPROC ? VPX_CODEC_CAP_POSTPROC : 0)
+typedef vpx_codec_stream_info_t  vp8_stream_info_t;
+
+/* Structures for handling memory allocations */
+typedef enum {
+  VP8_SEG_ALG_PRIV     = 256,
+  VP8_SEG_MAX
+} mem_seg_id_t;
+#define NELEMENTS(x) ((int)(sizeof(x)/sizeof(x[0])))
+
+static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, vpx_codec_flags_t);
+
+typedef struct {
+  unsigned int   id;
+  unsigned long  sz;
+  unsigned int   align;
+  unsigned int   flags;
+  unsigned long(*calc_sz)(const vpx_codec_dec_cfg_t *, vpx_codec_flags_t);
+} mem_req_t;
+
+static const mem_req_t vp8_mem_req_segs[] = {
+  {VP8_SEG_ALG_PRIV,    0, 8, VPX_CODEC_MEM_ZERO, vp8_priv_sz},
+  {VP8_SEG_MAX, 0, 0, 0, NULL}
+};
+
+struct vpx_codec_alg_priv {
+  vpx_codec_priv_t        base;
+  vpx_codec_mmap_t        mmaps[NELEMENTS(vp8_mem_req_segs) - 1];
+  vpx_codec_dec_cfg_t     cfg;
+  vp8_stream_info_t       si;
+  int                     defer_alloc;
+  int                     decoder_init;
+  VP9D_PTR                pbi;
+  int                     postproc_cfg_set;
+  vp8_postproc_cfg_t      postproc_cfg;
+#if CONFIG_POSTPROC_VISUALIZER
+  unsigned int            dbg_postproc_flag;
+  int                     dbg_color_ref_frame_flag;
+  int                     dbg_color_mb_modes_flag;
+  int                     dbg_color_b_modes_flag;
+  int                     dbg_display_mv_flag;
+#endif
+  vpx_image_t             img;
+  int                     img_setup;
+  int                     img_avail;
+};
+
+static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si,
+                                 vpx_codec_flags_t flags) {
+  /* Although this declaration is constant, we can't use it in the requested
+   * segments list because we want to define the requested segments list
+   * before defining the private type (so that the number of memory maps is
+   * known)
+   */
+  (void)si;
+  return sizeof(vpx_codec_alg_priv_t);
+}
+
+
+static void vp8_mmap_dtor(vpx_codec_mmap_t *mmap) {
+  free(mmap->priv);
+}
+
+static vpx_codec_err_t vp8_mmap_alloc(vpx_codec_mmap_t *mmap) {
+  vpx_codec_err_t  res;
+  unsigned int   align;
+
+  align = mmap->align ? mmap->align - 1 : 0;
+
+  if (mmap->flags & VPX_CODEC_MEM_ZERO)
+    mmap->priv = calloc(1, mmap->sz + align);
+  else
+    mmap->priv = malloc(mmap->sz + align);
+
+  res = (mmap->priv) ? VPX_CODEC_OK : VPX_CODEC_MEM_ERROR;
+  mmap->base = (void *)((((uintptr_t)mmap->priv) + align) & ~(uintptr_t)align);
+  mmap->dtor = vp8_mmap_dtor;
+  return res;
+}
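+/* e.g. for a requested align of 8 the mask is 7, so a raw allocation at
+ * 0x1003 is rounded up to base == (0x1003 + 7) & ~7 == 0x1008; the extra
+ * bytes added to the request above guarantee the rounded-up pointer
+ * still lies inside the allocation.
+ */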
+
+static vpx_codec_err_t vp8_validate_mmaps(const vp8_stream_info_t *si,
+                                          const vpx_codec_mmap_t *mmaps,
+                                          vpx_codec_flags_t init_flags) {
+  int i;
+  vpx_codec_err_t res = VPX_CODEC_OK;
+
+  for (i = 0; i < NELEMENTS(vp8_mem_req_segs) - 1; i++) {
+    /* Ensure the segment has been allocated */
+    if (!mmaps[i].base) {
+      res = VPX_CODEC_MEM_ERROR;
+      break;
+    }
+
+    /* Verify variable size segment is big enough for the current si. */
+    if (vp8_mem_req_segs[i].calc_sz) {
+      vpx_codec_dec_cfg_t cfg;
+
+      cfg.w = si->w;
+      cfg.h = si->h;
+
+      if (mmaps[i].sz < vp8_mem_req_segs[i].calc_sz(&cfg, init_flags)) {
+        res = VPX_CODEC_MEM_ERROR;
+        break;
+      }
+    }
+  }
+
+  return res;
+}
+
+static void vp8_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap) {
+  int i;
+
+  ctx->priv = mmap->base;
+  ctx->priv->sz = sizeof(*ctx->priv);
+  ctx->priv->iface = ctx->iface;
+  ctx->priv->alg_priv = mmap->base;
+
+  for (i = 0; i < NELEMENTS(ctx->priv->alg_priv->mmaps); i++)
+    ctx->priv->alg_priv->mmaps[i].id = vp8_mem_req_segs[i].id;
+
+  ctx->priv->alg_priv->mmaps[0] = *mmap;
+  ctx->priv->alg_priv->si.sz = sizeof(ctx->priv->alg_priv->si);
+  ctx->priv->init_flags = ctx->init_flags;
+
+  if (ctx->config.dec) {
+    /* Update the reference to the config structure to an internal copy. */
+    ctx->priv->alg_priv->cfg = *ctx->config.dec;
+    ctx->config.dec = &ctx->priv->alg_priv->cfg;
+  }
+}
+
+static void *mmap_lkup(vpx_codec_alg_priv_t *ctx, unsigned int id) {
+  int i;
+
+  for (i = 0; i < NELEMENTS(ctx->mmaps); i++)
+    if (ctx->mmaps[i].id == id)
+      return ctx->mmaps[i].base;
+
+  return NULL;
+}
+static void vp8_finalize_mmaps(vpx_codec_alg_priv_t *ctx) {
+  /* nothing to clean up */
+}
+
+static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx) {
+  vpx_codec_err_t        res = VPX_CODEC_OK;
+
+  /* This function only allocates space for the vpx_codec_alg_priv_t
+   * structure. More memory may be required at the time the stream
+   * information becomes known.
+   */
+  if (!ctx->priv) {
+    vpx_codec_mmap_t mmap;
+
+    mmap.id = vp8_mem_req_segs[0].id;
+    mmap.sz = sizeof(vpx_codec_alg_priv_t);
+    mmap.align = vp8_mem_req_segs[0].align;
+    mmap.flags = vp8_mem_req_segs[0].flags;
+
+    res = vp8_mmap_alloc(&mmap);
+
+    if (!res) {
+      vp8_init_ctx(ctx, &mmap);
+
+      ctx->priv->alg_priv->defer_alloc = 1;
+      /*post processing level initialized to do nothing */
+    }
+  }
+
+  return res;
+}
+
+static vpx_codec_err_t vp8_destroy(vpx_codec_alg_priv_t *ctx) {
+  int i;
+
+  vp9_remove_decompressor(ctx->pbi);
+
+  for (i = NELEMENTS(ctx->mmaps) - 1; i >= 0; i--) {
+    if (ctx->mmaps[i].dtor)
+      ctx->mmaps[i].dtor(&ctx->mmaps[i]);
+  }
+
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t vp8_peek_si(const uint8_t         *data,
+                                   unsigned int           data_sz,
+                                   vpx_codec_stream_info_t *si) {
+  vpx_codec_err_t res = VPX_CODEC_OK;
+
+  if (data + data_sz <= data)
+    res = VPX_CODEC_INVALID_PARAM;
+  else {
+    /* Parse the uncompressed part of the key frame header.
+     * 3 bytes: version, frame type and first-partition offset
+     * 3 bytes: sync code (0x9d, 0x01, 0x2a)
+     * 4 bytes: image width and height, in the lowest 14 bits
+     *          of each 2-byte value.
+     */
+    si->is_kf = 0;
+
+    if (data_sz >= 10 && !(data[0] & 0x01)) { /* I-Frame */
+      const uint8_t *c = data + 3;
+      si->is_kf = 1;
+
+      /* vet via sync code */
+      if (c[0] != 0x9d || c[1] != 0x01 || c[2] != 0x2a)
+        res = VPX_CODEC_UNSUP_BITSTREAM;
+
+      si->w = (c[3] | (c[4] << 8)) & 0x3fff;
+      si->h = (c[5] | (c[6] << 8)) & 0x3fff;
+
+      /*printf("w=%d, h=%d\n", si->w, si->h);*/
+      if (!(si->h | si->w))
+        res = VPX_CODEC_UNSUP_BITSTREAM;
+    } else
+      res = VPX_CODEC_UNSUP_BITSTREAM;
+  }
+
+  return res;
+
+}
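+/* e.g. a key frame whose bytes 3..9 read 9d 01 2a 40 01 f0 00 passes the
+ * sync check, then w = (0x40 | (0x01 << 8)) & 0x3fff = 320 and
+ * h = (0xf0 | (0x00 << 8)) & 0x3fff = 240.
+ */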
+
+static vpx_codec_err_t vp8_get_si(vpx_codec_alg_priv_t    *ctx,
+                                  vpx_codec_stream_info_t *si) {
+
+  unsigned int sz;
+
+  if (si->sz >= sizeof(vp8_stream_info_t))
+    sz = sizeof(vp8_stream_info_t);
+  else
+    sz = sizeof(vpx_codec_stream_info_t);
+
+  memcpy(si, &ctx->si, sz);
+  si->sz = sz;
+
+  return VPX_CODEC_OK;
+}
+
+
+static vpx_codec_err_t
+update_error_state(vpx_codec_alg_priv_t                 *ctx,
+                   const struct vpx_internal_error_info *error) {
+  vpx_codec_err_t res;
+
+  if ((res = error->error_code))
+    ctx->base.err_detail = error->has_detail
+                           ? error->detail
+                           : NULL;
+
+  return res;
+}
+
+static void yuvconfig2image(vpx_image_t               *img,
+                            const YV12_BUFFER_CONFIG  *yv12,
+                            void                      *user_priv) {
+  /** vpx_img_wrap() doesn't allow specifying independent strides for
+    * the Y, U, and V planes, nor other alignment adjustments that
+    * might be representable by a YV12_BUFFER_CONFIG, so we just
+    * initialize all the fields.*/
+  img->fmt = yv12->clrtype == REG_YUV ?
+             VPX_IMG_FMT_I420 : VPX_IMG_FMT_VPXI420;
+  img->w = yv12->y_stride;
+  img->h = (yv12->y_height + 2 * VP8BORDERINPIXELS + 15) & ~15;
+  img->d_w = yv12->y_width;
+  img->d_h = yv12->y_height;
+  img->x_chroma_shift = 1;
+  img->y_chroma_shift = 1;
+  img->planes[VPX_PLANE_Y] = yv12->y_buffer;
+  img->planes[VPX_PLANE_U] = yv12->u_buffer;
+  img->planes[VPX_PLANE_V] = yv12->v_buffer;
+  img->planes[VPX_PLANE_ALPHA] = NULL;
+  img->stride[VPX_PLANE_Y] = yv12->y_stride;
+  img->stride[VPX_PLANE_U] = yv12->uv_stride;
+  img->stride[VPX_PLANE_V] = yv12->uv_stride;
+  img->stride[VPX_PLANE_ALPHA] = yv12->y_stride;
+  img->bps = 12;
+  img->user_priv = user_priv;
+  img->img_data = yv12->buffer_alloc;
+  img->img_data_owner = 0;
+  img->self_allocd = 0;
+}
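+/* e.g. a 240-line decode with a 32-pixel border (assuming
+ * VP8BORDERINPIXELS == 32) reports img->h = (240 + 64 + 15) & ~15 = 304,
+ * the padded height rounded up to a multiple of 16, while d_h stays at
+ * the visible 240.
+ */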
+
+static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t  *ctx,
+                                  const uint8_t         *data,
+                                  unsigned int            data_sz,
+                                  void                    *user_priv,
+                                  long                    deadline) {
+  vpx_codec_err_t res = VPX_CODEC_OK;
+
+  ctx->img_avail = 0;
+
+  /* Determine the stream parameters. Note that we rely on peek_si to
+   * validate that we have a buffer that does not wrap around the top
+   * of the heap.
+   */
+  if (!ctx->si.h)
+    res = ctx->base.iface->dec.peek_si(data, data_sz, &ctx->si);
+
+
+  /* Perform deferred allocations, if required */
+  if (!res && ctx->defer_alloc) {
+    int i;
+
+    for (i = 1; !res && i < NELEMENTS(ctx->mmaps); i++) {
+      vpx_codec_dec_cfg_t cfg;
+
+      cfg.w = ctx->si.w;
+      cfg.h = ctx->si.h;
+      ctx->mmaps[i].id = vp8_mem_req_segs[i].id;
+      ctx->mmaps[i].sz = vp8_mem_req_segs[i].sz;
+      ctx->mmaps[i].align = vp8_mem_req_segs[i].align;
+      ctx->mmaps[i].flags = vp8_mem_req_segs[i].flags;
+
+      if (!ctx->mmaps[i].sz)
+        ctx->mmaps[i].sz = vp8_mem_req_segs[i].calc_sz(&cfg,
+                                                       ctx->base.init_flags);
+
+      res = vp8_mmap_alloc(&ctx->mmaps[i]);
+    }
+
+    if (!res)
+      vp8_finalize_mmaps(ctx);
+
+    ctx->defer_alloc = 0;
+  }
+
+  /* Initialize the decoder instance on the first frame*/
+  if (!res && !ctx->decoder_init) {
+    res = vp8_validate_mmaps(&ctx->si, ctx->mmaps, ctx->base.init_flags);
+
+    if (!res) {
+      VP9D_CONFIG oxcf;
+      VP9D_PTR optr;
+
+      vp9_initialize_dec();
+
+      oxcf.Width = ctx->si.w;
+      oxcf.Height = ctx->si.h;
+      oxcf.Version = 9;
+      oxcf.postprocess = 0;
+      oxcf.max_threads = ctx->cfg.threads;
+      optr = vp9_create_decompressor(&oxcf);
+
+      /* If postprocessing was enabled by the application and a
+       * configuration has not been provided, default it.
+       */
+      if (!ctx->postproc_cfg_set
+          && (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)) {
+        ctx->postproc_cfg.post_proc_flag =
+          VP8_DEBLOCK | VP8_DEMACROBLOCK;
+        ctx->postproc_cfg.deblocking_level = 4;
+        ctx->postproc_cfg.noise_level = 0;
+      }
+
+      if (!optr)
+        res = VPX_CODEC_ERROR;
+      else
+        ctx->pbi = optr;
+    }
+
+    ctx->decoder_init = 1;
+  }
+
+  if (!res && ctx->pbi) {
+    YV12_BUFFER_CONFIG sd;
+    int64_t time_stamp = 0, time_end_stamp = 0;
+    vp9_ppflags_t flags = {0};
+
+    if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC) {
+      flags.post_proc_flag = ctx->postproc_cfg.post_proc_flag
+#if CONFIG_POSTPROC_VISUALIZER
+                             | ((ctx->dbg_color_ref_frame_flag != 0) ? VP9D_DEBUG_CLR_FRM_REF_BLKS : 0)
+                             | ((ctx->dbg_color_mb_modes_flag != 0) ? VP9D_DEBUG_CLR_BLK_MODES : 0)
+                             | ((ctx->dbg_color_b_modes_flag != 0) ? VP9D_DEBUG_CLR_BLK_MODES : 0)
+                             | ((ctx->dbg_display_mv_flag != 0) ? VP9D_DEBUG_DRAW_MV : 0)
+#endif
+                             ;
+      flags.deblocking_level      = ctx->postproc_cfg.deblocking_level;
+      flags.noise_level           = ctx->postproc_cfg.noise_level;
+#if CONFIG_POSTPROC_VISUALIZER
+      flags.display_ref_frame_flag = ctx->dbg_color_ref_frame_flag;
+      flags.display_mb_modes_flag = ctx->dbg_color_mb_modes_flag;
+      flags.display_b_modes_flag  = ctx->dbg_color_b_modes_flag;
+      flags.display_mv_flag       = ctx->dbg_display_mv_flag;
+#endif
+    }
+
+    if (vp9_receive_compressed_data(ctx->pbi, data_sz, data, deadline)) {
+      VP9D_COMP *pbi = (VP9D_COMP *)ctx->pbi;
+      res = update_error_state(ctx, &pbi->common.error);
+    }
+
+    if (!res && 0 == vp9_get_raw_frame(ctx->pbi, &sd, &time_stamp,
+                                       &time_end_stamp, &flags)) {
+      yuvconfig2image(&ctx->img, &sd, user_priv);
+      ctx->img_avail = 1;
+    }
+  }
+
+  return res;
+}
+
+static vpx_image_t *vp8_get_frame(vpx_codec_alg_priv_t  *ctx,
+                                  vpx_codec_iter_t      *iter) {
+  vpx_image_t *img = NULL;
+
+  if (ctx->img_avail) {
+    /* iter acts as a flip-flop, so an image is only returned on the
+     * first call to get_frame.
+     */
+    if (!(*iter)) {
+      img = &ctx->img;
+      *iter = img;
+    }
+  }
+
+  return img;
+}
+
+
+static
+vpx_codec_err_t vp8_xma_get_mmap(const vpx_codec_ctx_t      *ctx,
+                                 vpx_codec_mmap_t           *mmap,
+                                 vpx_codec_iter_t           *iter) {
+  vpx_codec_err_t     res;
+  const mem_req_t  *seg_iter = *iter;
+
+  /* Get address of next segment request */
+  do {
+    if (!seg_iter)
+      seg_iter = vp8_mem_req_segs;
+    else if (seg_iter->id != VP8_SEG_MAX)
+      seg_iter++;
+
+    *iter = (vpx_codec_iter_t)seg_iter;
+
+    if (seg_iter->id != VP8_SEG_MAX) {
+      mmap->id = seg_iter->id;
+      mmap->sz = seg_iter->sz;
+      mmap->align = seg_iter->align;
+      mmap->flags = seg_iter->flags;
+
+      if (!seg_iter->sz)
+        mmap->sz = seg_iter->calc_sz(ctx->config.dec, ctx->init_flags);
+
+      res = VPX_CODEC_OK;
+    } else
+      res = VPX_CODEC_LIST_END;
+  } while (!mmap->sz && res != VPX_CODEC_LIST_END);
+
+  return res;
+}
+
+static vpx_codec_err_t vp8_xma_set_mmap(vpx_codec_ctx_t         *ctx,
+                                        const vpx_codec_mmap_t  *mmap) {
+  vpx_codec_err_t res = VPX_CODEC_MEM_ERROR;
+  int i, done;
+
+  if (!ctx->priv) {
+    if (mmap->id == VP8_SEG_ALG_PRIV) {
+      if (!ctx->priv) {
+        vp8_init_ctx(ctx, mmap);
+        res = VPX_CODEC_OK;
+      }
+    }
+  }
+
+  done = 1;
+
+  if (!res && ctx->priv->alg_priv) {
+    for (i = 0; i < NELEMENTS(ctx->priv->alg_priv->mmaps); i++) {
+      if (ctx->priv->alg_priv->mmaps[i].id == mmap->id)
+        if (!ctx->priv->alg_priv->mmaps[i].base) {
+          ctx->priv->alg_priv->mmaps[i] = *mmap;
+          res = VPX_CODEC_OK;
+        }
+
+      done &= (ctx->priv->alg_priv->mmaps[i].base != NULL);
+    }
+  }
+
+  if (done && !res) {
+    vp8_finalize_mmaps(ctx->priv->alg_priv);
+    res = ctx->iface->init(ctx);
+  }
+
+  return res;
+}
+
+static vpx_codec_err_t image2yuvconfig(const vpx_image_t   *img,
+                                       YV12_BUFFER_CONFIG  *yv12) {
+  vpx_codec_err_t        res = VPX_CODEC_OK;
+  yv12->y_buffer = img->planes[VPX_PLANE_Y];
+  yv12->u_buffer = img->planes[VPX_PLANE_U];
+  yv12->v_buffer = img->planes[VPX_PLANE_V];
+
+  yv12->y_width  = img->d_w;
+  yv12->y_height = img->d_h;
+  yv12->uv_width = yv12->y_width / 2;
+  yv12->uv_height = yv12->y_height / 2;
+
+  yv12->y_stride = img->stride[VPX_PLANE_Y];
+  yv12->uv_stride = img->stride[VPX_PLANE_U];
+
+  yv12->border  = (img->stride[VPX_PLANE_Y] - img->d_w) / 2;
+  yv12->clrtype = (img->fmt == VPX_IMG_FMT_VPXI420 ||
+                   img->fmt == VPX_IMG_FMT_VPXYV12);
+
+  return res;
+}
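+/* Note this decoder-side variant halves the chroma dimensions with
+ * truncation (y_width / 2), unlike the encoder-side image2yuvconfig,
+ * which rounds up with (1 + y_width) / 2; the two agree whenever the
+ * luma dimensions are even.
+ */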
+
+
+static vpx_codec_err_t vp9_set_reference(vpx_codec_alg_priv_t *ctx,
+                                         int ctr_id,
+                                         va_list args) {
+
+  vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
+
+  if (data) {
+    vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
+    YV12_BUFFER_CONFIG sd;
+
+    image2yuvconfig(&frame->img, &sd);
+
+    return vp9_set_reference_dec(ctx->pbi, frame->frame_type, &sd);
+  } else
+    return VPX_CODEC_INVALID_PARAM;
+
+}
+
+static vpx_codec_err_t vp9_get_reference(vpx_codec_alg_priv_t *ctx,
+                                         int ctr_id,
+                                         va_list args) {
+
+  vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
+
+  if (data) {
+    vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
+    YV12_BUFFER_CONFIG sd;
+
+    image2yuvconfig(&frame->img, &sd);
+
+    return vp9_get_reference_dec(ctx->pbi, frame->frame_type, &sd);
+  } else
+    return VPX_CODEC_INVALID_PARAM;
+
+}
+
+static vpx_codec_err_t vp8_set_postproc(vpx_codec_alg_priv_t *ctx,
+                                        int ctr_id,
+                                        va_list args) {
+#if CONFIG_POSTPROC
+  vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *);
+
+  if (data) {
+    ctx->postproc_cfg_set = 1;
+    ctx->postproc_cfg = *((vp8_postproc_cfg_t *)data);
+    return VPX_CODEC_OK;
+  } else
+    return VPX_CODEC_INVALID_PARAM;
+
+#else
+  return VPX_CODEC_INCAPABLE;
+#endif
+}
+
+static vpx_codec_err_t vp8_set_dbg_options(vpx_codec_alg_priv_t *ctx,
+                                           int ctrl_id,
+                                           va_list args) {
+#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC
+  int data = va_arg(args, int);
+
+#define MAP(id, var) case id: var = data; break;
+
+  switch (ctrl_id) {
+      MAP(VP8_SET_DBG_COLOR_REF_FRAME,   ctx->dbg_color_ref_frame_flag);
+      MAP(VP8_SET_DBG_COLOR_MB_MODES,    ctx->dbg_color_mb_modes_flag);
+      MAP(VP8_SET_DBG_COLOR_B_MODES,     ctx->dbg_color_b_modes_flag);
+      MAP(VP8_SET_DBG_DISPLAY_MV,        ctx->dbg_display_mv_flag);
+  }
+
+  return VPX_CODEC_OK;
+#else
+  return VPX_CODEC_INCAPABLE;
+#endif
+}
+
+static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
+                                                int ctrl_id,
+                                                va_list args) {
+  int *update_info = va_arg(args, int *);
+  VP9D_COMP *pbi = (VP9D_COMP *)ctx->pbi;
+
+  if (update_info) {
+    *update_info = pbi->common.refresh_alt_ref_frame * (int) VP8_ALTR_FRAME
+                   + pbi->common.refresh_golden_frame * (int) VP8_GOLD_FRAME
+                   + pbi->common.refresh_last_frame * (int) VP8_LAST_FRAME;
+
+    return VPX_CODEC_OK;
+  } else
+    return VPX_CODEC_INVALID_PARAM;
+}
+
+
+static vpx_codec_err_t vp8_get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
+                                               int ctrl_id,
+                                               va_list args) {
+
+  int *corrupted = va_arg(args, int *);
+
+  if (corrupted) {
+    VP9D_COMP *pbi = (VP9D_COMP *)ctx->pbi;
+    *corrupted = pbi->common.frame_to_show->corrupted;
+
+    return VPX_CODEC_OK;
+  } else
+    return VPX_CODEC_INVALID_PARAM;
+
+}
+
+static vpx_codec_ctrl_fn_map_t ctf_maps[] = {
+  {VP8_SET_REFERENCE,             vp9_set_reference},
+  {VP8_COPY_REFERENCE,            vp9_get_reference},
+  {VP8_SET_POSTPROC,              vp8_set_postproc},
+  {VP8_SET_DBG_COLOR_REF_FRAME,   vp8_set_dbg_options},
+  {VP8_SET_DBG_COLOR_MB_MODES,    vp8_set_dbg_options},
+  {VP8_SET_DBG_COLOR_B_MODES,     vp8_set_dbg_options},
+  {VP8_SET_DBG_DISPLAY_MV,        vp8_set_dbg_options},
+  {VP8D_GET_LAST_REF_UPDATES,     vp8_get_last_ref_updates},
+  {VP8D_GET_FRAME_CORRUPTED,      vp8_get_frame_corrupted},
+  { -1, NULL},
+};
+
+
+#ifndef VERSION_STRING
+#define VERSION_STRING
+#endif
+CODEC_INTERFACE(vpx_codec_vp8_dx) = {
+  "WebM Project VP8 Decoder" VERSION_STRING,
+  VPX_CODEC_INTERNAL_ABI_VERSION,
+  VPX_CODEC_CAP_DECODER | VP8_CAP_POSTPROC |
+  VPX_CODEC_CAP_INPUT_PARTITION,
+  /* vpx_codec_caps_t          caps; */
+  vp8_init,         /* vpx_codec_init_fn_t       init; */
+  vp8_destroy,      /* vpx_codec_destroy_fn_t    destroy; */
+  ctf_maps,         /* vpx_codec_ctrl_fn_map_t  *ctrl_maps; */
+  vp8_xma_get_mmap, /* vpx_codec_get_mmap_fn_t   get_mmap; */
+  vp8_xma_set_mmap, /* vpx_codec_set_mmap_fn_t   set_mmap; */
+  {
+    vp8_peek_si,      /* vpx_codec_peek_si_fn_t    peek_si; */
+    vp8_get_si,       /* vpx_codec_get_si_fn_t     get_si; */
+    vp8_decode,       /* vpx_codec_decode_fn_t     decode; */
+    vp8_get_frame,    /* vpx_codec_frame_get_fn_t  frame_get; */
+  },
+  {
+    /* encoder functions */
+    NOT_IMPLEMENTED,
+    NOT_IMPLEMENTED,
+    NOT_IMPLEMENTED,
+    NOT_IMPLEMENTED,
+    NOT_IMPLEMENTED,
+    NOT_IMPLEMENTED
+  }
+};
+
+/*
+ * BEGIN BACKWARDS COMPATIBILITY SHIM.
+ */
+vpx_codec_iface_t vpx_codec_vp8_algo = {
+  "WebM Project VP8 Decoder (Deprecated API)" VERSION_STRING,
+  VPX_CODEC_INTERNAL_ABI_VERSION,
+  VPX_CODEC_CAP_DECODER | VP8_CAP_POSTPROC,
+  /* vpx_codec_caps_t          caps; */
+  vp8_init,         /* vpx_codec_init_fn_t       init; */
+  vp8_destroy,      /* vpx_codec_destroy_fn_t    destroy; */
+  ctf_maps,         /* vpx_codec_ctrl_fn_map_t  *ctrl_maps; */
+  vp8_xma_get_mmap, /* vpx_codec_get_mmap_fn_t   get_mmap; */
+  vp8_xma_set_mmap, /* vpx_codec_set_mmap_fn_t   set_mmap; */
+  {
+    vp8_peek_si,      /* vpx_codec_peek_si_fn_t    peek_si; */
+    vp8_get_si,       /* vpx_codec_get_si_fn_t     get_si; */
+    vp8_decode,       /* vpx_codec_decode_fn_t     decode; */
+    vp8_get_frame,    /* vpx_codec_frame_get_fn_t  frame_get; */
+  },
+  {
+    /* encoder functions */
+    NOT_IMPLEMENTED,
+    NOT_IMPLEMENTED,
+    NOT_IMPLEMENTED,
+    NOT_IMPLEMENTED,
+    NOT_IMPLEMENTED,
+    NOT_IMPLEMENTED
+  }
+};
--- /dev/null
+++ b/vp9/vp9cx.mk
@@ -1,0 +1,120 @@
+##
+##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+
+include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9_common.mk
+
+VP9_CX_EXPORTS += exports_enc
+
+VP9_CX_SRCS-yes += $(VP9_COMMON_SRCS-yes)
+VP9_CX_SRCS-no  += $(VP9_COMMON_SRCS-no)
+VP9_CX_SRCS_REMOVE-yes += $(VP9_COMMON_SRCS_REMOVE-yes)
+VP9_CX_SRCS_REMOVE-no  += $(VP9_COMMON_SRCS_REMOVE-no)
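+# Build-switch idiom: variables like $(HAVE_MMX) below expand to "yes" or
+# "no", so each source lands on either VP9_CX_SRCS-yes or VP9_CX_SRCS-no;
+# only the -yes list is ever used, and the filter-out at the end of this
+# file strips any entries collected in VP9_CX_SRCS_REMOVE-yes.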
+
+ifeq ($(ARCH_ARM),yes)
+  include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9cx_arm.mk
+endif
+
+VP9_CX_SRCS-yes += vp9_cx_iface.c
+
+# encoder
+#INCLUDES += algo/vpx_common/vpx_mem/include
+#INCLUDES += common
+#INCLUDES += common
+#INCLUDES += common
+#INCLUDES += algo/vpx_ref/cpu_id/include
+#INCLUDES += common
+#INCLUDES += encoder
+
+VP9_CX_SRCS-yes += encoder/asm_enc_offsets.c
+VP9_CX_SRCS-yes += encoder/bitstream.c
+VP9_CX_SRCS-yes += encoder/boolhuff.c
+VP9_CX_SRCS-yes += encoder/dct.c
+VP9_CX_SRCS-yes += encoder/encodeframe.c
+VP9_CX_SRCS-yes += encoder/encodeintra.c
+VP9_CX_SRCS-yes += encoder/encodemb.c
+VP9_CX_SRCS-yes += encoder/encodemv.c
+VP9_CX_SRCS-yes += encoder/firstpass.c
+VP9_CX_SRCS-yes += encoder/generic/csystemdependent.c
+VP9_CX_SRCS-yes += encoder/block.h
+VP9_CX_SRCS-yes += encoder/boolhuff.h
+VP9_CX_SRCS-yes += encoder/bitstream.h
+VP9_CX_SRCS-yes += encoder/encodeintra.h
+VP9_CX_SRCS-yes += encoder/encodemb.h
+VP9_CX_SRCS-yes += encoder/encodemv.h
+VP9_CX_SRCS-yes += encoder/firstpass.h
+VP9_CX_SRCS-yes += encoder/lookahead.c
+VP9_CX_SRCS-yes += encoder/lookahead.h
+VP9_CX_SRCS-yes += encoder/mcomp.h
+VP9_CX_SRCS-yes += encoder/modecosts.h
+VP9_CX_SRCS-yes += encoder/onyx_int.h
+VP9_CX_SRCS-yes += encoder/psnr.h
+VP9_CX_SRCS-yes += encoder/quantize.h
+VP9_CX_SRCS-yes += encoder/ratectrl.h
+VP9_CX_SRCS-yes += encoder/rdopt.h
+VP9_CX_SRCS-yes += encoder/tokenize.h
+VP9_CX_SRCS-yes += encoder/treewriter.h
+VP9_CX_SRCS-yes += encoder/variance.h
+VP9_CX_SRCS-yes += encoder/mcomp.c
+VP9_CX_SRCS-yes += encoder/modecosts.c
+VP9_CX_SRCS-yes += encoder/onyx_if.c
+VP9_CX_SRCS-yes += encoder/picklpf.c
+VP9_CX_SRCS-yes += encoder/psnr.c
+VP9_CX_SRCS-yes += encoder/quantize.c
+VP9_CX_SRCS-yes += encoder/ratectrl.c
+VP9_CX_SRCS-yes += encoder/rdopt.c
+VP9_CX_SRCS-yes += encoder/sad_c.c
+VP9_CX_SRCS-yes += encoder/satd_c.c
+VP9_CX_SRCS-yes += encoder/segmentation.c
+VP9_CX_SRCS-yes += encoder/segmentation.h
+VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/ssim.c
+VP9_CX_SRCS-yes += encoder/tokenize.c
+VP9_CX_SRCS-yes += encoder/treewriter.c
+VP9_CX_SRCS-yes += encoder/variance_c.c
+ifeq ($(CONFIG_POSTPROC),yes)
+VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.h
+VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.c
+endif
+VP9_CX_SRCS-yes += encoder/temporal_filter.c
+VP9_CX_SRCS-yes += encoder/temporal_filter.h
+VP9_CX_SRCS-yes += encoder/mbgraph.c
+VP9_CX_SRCS-yes += encoder/mbgraph.h
+
+
+VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/mcomp_x86.h
+VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_x86.h
+VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/temporal_filter_x86.h
+VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/x86_csystemdependent.c
+VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/variance_mmx.c
+VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/variance_impl_mmx.asm
+VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/sad_mmx.asm
+VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/dct_mmx.asm
+VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/subtract_mmx.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/variance_sse2.c
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/variance_impl_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/sad_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/sad_sse3.asm
+VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/sad_ssse3.asm
+VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/variance_ssse3.c
+VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/variance_impl_ssse3.asm
+VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm
+VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/sad_sse4.asm
+VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.asm
+VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
+VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm
+VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/ssim_opt.asm
+
+
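+# Strip out any generic sources that platform-specific makefiles (e.g.
+# vp9cx_arm.mk) scheduled for replacement via VP9_CX_SRCS_REMOVE.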
+VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))
--- /dev/null
+++ b/vp9/vp9cx_arm.mk
@@ -1,0 +1,63 @@
+##
+##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+
+# The VP9_CX_SRCS list is extended with platform-specific sources below.
+
+#File list for arm
+# encoder
+VP9_CX_SRCS-$(ARCH_ARM)  += encoder/arm/arm_csystemdependent.c
+
+VP9_CX_SRCS-$(ARCH_ARM)  += encoder/arm/dct_arm.c
+VP9_CX_SRCS-$(ARCH_ARM)  += encoder/arm/dct_arm.h
+VP9_CX_SRCS-$(ARCH_ARM)  += encoder/arm/encodemb_arm.h
+VP9_CX_SRCS-$(ARCH_ARM)  += encoder/arm/quantize_arm.c
+VP9_CX_SRCS-$(ARCH_ARM)  += encoder/arm/quantize_arm.h
+VP9_CX_SRCS-$(ARCH_ARM)  += encoder/arm/variance_arm.c
+VP9_CX_SRCS-$(ARCH_ARM)  += encoder/arm/variance_arm.h
+
+#File list for armv5te
+# encoder
+VP9_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/boolhuff_arm.c
+VP9_CX_SRCS_REMOVE-$(HAVE_ARMV5TE)  += encoder/boolhuff.c
+VP9_CX_SRCS-$(HAVE_ARMV5TE)  += encoder/arm/armv5te/boolhuff_armv5te$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV5TE)  += encoder/arm/armv5te/vp8_packtokens_armv5$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV5TE)  += encoder/arm/armv5te/vp8_packtokens_mbrow_armv5$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV5TE)  += encoder/arm/armv5te/vp8_packtokens_partitions_armv5$(ASM)
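+# The generic encoder/boolhuff.c is scheduled for removal above because the
+# ARMv5TE build supplies boolhuff_arm.c plus the assembly implementation;
+# the filter-out step in vp9cx.mk applies the removal.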
+
+#File list for armv6
+# encoder
+VP9_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_subtract_armv6$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_short_fdct4x4_armv6$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_fast_quantize_b_armv6$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_sad16x16_armv6$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_variance16x16_armv6$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_mse16x16_armv6$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_variance8x8_armv6$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/walsh_v6$(ASM)
+
+#File list for neon
+# encoder
+VP9_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/fastquantizeb_neon$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/picklpf_arm.c
+VP9_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/sad8_neon$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/sad16_neon$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/shortfdct_neon$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/subtract_neon$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/variance_neon$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/vp8_mse16x16_neon$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/vp8_subpixelvariance8x8_neon$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/vp8_subpixelvariance16x16_neon$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/vp8_subpixelvariance16x16s_neon$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/vp8_memcpy_neon$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/vp8_shortwalsh4x4_neon$(ASM)
--- /dev/null
+++ b/vp9/vp9dx.mk
@@ -1,0 +1,71 @@
+##
+##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+
+include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9_common.mk
+
+VP9_DX_EXPORTS += exports_dec
+
+VP9_DX_SRCS-yes += $(VP9_COMMON_SRCS-yes)
+VP9_DX_SRCS-no  += $(VP9_COMMON_SRCS-no)
+VP9_DX_SRCS_REMOVE-yes += $(VP9_COMMON_SRCS_REMOVE-yes)
+VP9_DX_SRCS_REMOVE-no  += $(VP9_COMMON_SRCS_REMOVE-no)
+
+ifeq ($(ARCH_ARM),yes)
+  include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9dx_arm.mk
+endif
+
+VP9_DX_SRCS-yes += vp9_dx_iface.c
+
+# decoder
+
+VP9_DX_SRCS-yes += decoder/asm_dec_offsets.c
+VP9_DX_SRCS-yes += decoder/dboolhuff.c
+VP9_DX_SRCS-yes += decoder/decodemv.c
+VP9_DX_SRCS-yes += decoder/decodframe.c
+VP9_DX_SRCS-yes += decoder/dequantize.c
+VP9_DX_SRCS-yes += decoder/detokenize.c
+VP9_DX_SRCS-yes += decoder/dboolhuff.h
+VP9_DX_SRCS-yes += decoder/decodemv.h
+VP9_DX_SRCS-yes += decoder/dequantize.h
+VP9_DX_SRCS-yes += decoder/detokenize.h
+VP9_DX_SRCS-yes += decoder/onyxd_int.h
+VP9_DX_SRCS-yes += decoder/treereader.h
+VP9_DX_SRCS-yes += decoder/onyxd_if.c
+VP9_DX_SRCS-yes += decoder/idct_blk.c
+
+VP9_DX_SRCS-yes := $(filter-out $(VP9_DX_SRCS_REMOVE-yes),$(VP9_DX_SRCS-yes))
+
+VP9_DX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += decoder/x86/x86_dsystemdependent.c
+VP9_DX_SRCS-$(HAVE_MMX) += decoder/x86/dequantize_mmx.asm
+VP9_DX_SRCS-$(HAVE_MMX) += decoder/x86/idct_blk_mmx.c
+VP9_DX_SRCS-$(HAVE_SSE2) += decoder/x86/idct_blk_sse2.c
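+# Note: the x86 sources above are appended after the filter-out, so they
+# are never subject to VP9_DX_SRCS_REMOVE.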
--- /dev/null
+++ b/vp9/vp9dx_arm.mk
@@ -1,0 +1,29 @@
+##
+##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+
+# The VP9_DX_SRCS list is extended with platform-specific sources below.
+
+VP9_DX_SRCS-$(ARCH_ARM)  += decoder/arm/dequantize_arm.c
+
+#File list for armv6
+VP9_DX_SRCS-$(HAVE_ARMV6)  += decoder/arm/armv6/dequant_dc_idct_v6$(ASM)
+VP9_DX_SRCS-$(HAVE_ARMV6)  += decoder/arm/armv6/dequant_idct_v6$(ASM)
+VP9_DX_SRCS-$(HAVE_ARMV6)  += decoder/arm/armv6/dequantize_v6$(ASM)
+VP9_DX_SRCS-$(HAVE_ARMV6)  += decoder/arm/armv6/idct_blk_v6.c
+
+#File list for neon
+VP9_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_dequant_dc_full_2x_neon$(ASM)
+VP9_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_dequant_dc_0_2x_neon$(ASM)
+VP9_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/dequant_idct_neon$(ASM)
+VP9_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_dequant_full_2x_neon$(ASM)
+VP9_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_dequant_0_2x_neon$(ASM)
+VP9_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/dequantizeb_neon$(ASM)
+VP9_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_blk_neon.c
--- a/vpx/vp8.h
+++ b/vpx/vp8.h
@@ -28,8 +28,8 @@
 /*!\file
  * \brief Provides controls common to both the VP8 encoder and decoder.
  */
-#ifndef VP8_H
-#define VP8_H
+#ifndef VP9_H
+#define VP9_H
 #include "vpx_codec_impl_top.h"
 
 /*!\brief Control functions
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -20,8 +20,8 @@
  * \brief Provides definitions for using the VP8 encoder algorithm within the
  *        vpx Codec Interface.
  */
-#ifndef VP8CX_H
-#define VP8CX_H
+#ifndef VP9CX_H
+#define VP9CX_H
 #include "vpx_config.h"
 #include "vpx_codec_impl_top.h"
 
--- a/vpx/vp8dx.h
+++ b/vpx/vp8dx.h
@@ -20,8 +20,8 @@
  * \brief Provides definitions for using the VP8 algorithm within the vpx Decoder
  *        interface.
  */
-#ifndef VP8DX_H
-#define VP8DX_H
+#ifndef VP9DX_H
+#define VP9DX_H
 #include "vpx_codec_impl_top.h"
 
 /*!\name Algorithm interface for VP8
--- a/vpx/vp8e.h
+++ b/vpx/vp8e.h
@@ -12,8 +12,8 @@
 /* This file contains backwards compatibility stubs for applications using
  * the VP8 version 1.0 API.
  */
-#ifndef VP8E_H
-#define VP8E_H
+#ifndef VP9E_H
+#define VP9E_H
 #include "vpx_codec_impl_top.h"
 
 #if defined(VPX_CODEC_DISABLE_COMPAT) && VPX_CODEC_DISABLE_COMPAT
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -22,7 +22,7 @@
 #include "vpx_config.h"
 #include "vpx/vpx_decoder.h"
 #include "vpx_ports/vpx_timer.h"
-#if CONFIG_VP8_DECODER
+#if CONFIG_VP9_DECODER
 #include "vpx/vp8dx.h"
 #endif
 #if CONFIG_MD5
@@ -56,8 +56,8 @@
    unsigned int             fourcc;
    unsigned int             fourcc_mask;
 } ifaces[] = {
-#if CONFIG_VP8_DECODER
-  {"vp8",  vpx_codec_vp8_dx,   VP8_FOURCC, 0x00FFFFFF},
+#if CONFIG_VP9_DECODER
+  {"vp9",  vpx_codec_vp8_dx,   VP8_FOURCC, 0x00FFFFFF},
 #endif
 };
 
@@ -104,7 +104,7 @@
   NULL
 };
 
-#if CONFIG_VP8_DECODER
+#if CONFIG_VP9_DECODER
 static const arg_def_t addnoise_level = ARG_DEF(NULL, "noise-level", 1,
                                                 "Enable VP8 postproc add noise");
 static const arg_def_t deblock = ARG_DEF(NULL, "deblock", 0,
@@ -135,7 +135,7 @@
   fprintf(stderr, "Usage: %s <options> filename\n\n"
           "Options:\n", exec_name);
   arg_show_usage(stderr, all_args);
-#if CONFIG_VP8_DECODER
+#if CONFIG_VP9_DECODER
   fprintf(stderr, "\nVP8 Postprocessing Options:\n");
   arg_show_usage(stderr, vp8_pp_args);
 #endif
@@ -684,7 +684,7 @@
   unsigned int            fps_num;
   void                   *out = NULL;
   vpx_codec_dec_cfg_t     cfg = {0};
-#if CONFIG_VP8_DECODER
+#if CONFIG_VP9_DECODER
   vp8_postproc_cfg_t      vp8_pp_cfg = {0};
   int                     vp8_dbg_color_ref_frame = 0;
   int                     vp8_dbg_color_mb_modes = 0;
@@ -744,7 +744,7 @@
     else if (arg_match(&arg, &verbosearg, argi))
       quiet = 0;
 
-#if CONFIG_VP8_DECODER
+#if CONFIG_VP9_DECODER
     else if (arg_match(&arg, &addnoise_level, argi)) {
       postproc = 1;
       vp8_pp_cfg.post_proc_flag |= VP8_ADDNOISE;
@@ -909,7 +909,7 @@
   if (!quiet)
     fprintf(stderr, "%s\n", decoder.name);
 
-#if CONFIG_VP8_DECODER
+#if CONFIG_VP9_DECODER
 
   if (vp8_pp_cfg.post_proc_flag
       && vpx_codec_control(&decoder, VP8_SET_POSTPROC, &vp8_pp_cfg)) {
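
Stepping back from the hunks above: entries in the renamed ifaces[] table are
selected by masked fourcc comparison. A rough sketch of that selection logic
follows; the helper name is an assumption for illustration, not code from
this patch.

/* Hypothetical helper: find the ifaces[] entry whose masked fourcc
 * matches the stream's fourcc.  A fourcc_mask of 0x00FFFFFF limits
 * the comparison to the low three bytes of the fourcc. */
static int find_iface_index(unsigned int raw_fourcc) {
  int i;
  for (i = 0; i < (int)(sizeof(ifaces) / sizeof(ifaces[0])); i++)
    if ((raw_fourcc & ifaces[i].fourcc_mask) == ifaces[i].fourcc)
      return i;
  return -1;  /* no matching decoder compiled in */
}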
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -82,8 +82,8 @@
   unsigned int             fourcc;
   unsigned int             fourcc_mask;
 } ifaces[] = {
-#if CONFIG_VP8_DECODER
-  {"vp8",  &vpx_codec_vp8_dx,   VP8_FOURCC, 0x00FFFFFF},
+#if CONFIG_VP9_DECODER
+  {"vp9",  &vpx_codec_vp8_dx,   VP8_FOURCC, 0x00FFFFFF},
 #endif
 };
 
@@ -93,8 +93,8 @@
   unsigned int             fourcc;
   unsigned int             fourcc_mask;
 } codecs[] = {
-#if CONFIG_VP8_ENCODER
-  {"vp8",  vpx_codec_vp8x_cx,   VP8_FOURCC, 0x00FFFFFF},
+#if CONFIG_VP9_ENCODER
+  {"vp9",  vpx_codec_vp8x_cx,   VP8_FOURCC, 0x00FFFFFF},
 #endif
 };
 
@@ -1011,7 +1011,7 @@
 };
 
 
-#if CONFIG_VP8_ENCODER
+#if CONFIG_VP9_ENCODER
 static const arg_def_t noise_sens = ARG_DEF(NULL, "noise-sensitivity", 1,
                                             "Noise sensitivity (frames to blur)");
 static const arg_def_t sharpness = ARG_DEF(NULL, "sharpness", 1,
@@ -1020,13 +1020,13 @@
                                                "Motion detection threshold");
 #endif
 
-#if CONFIG_VP8_ENCODER
+#if CONFIG_VP9_ENCODER
 static const arg_def_t cpu_used = ARG_DEF(NULL, "cpu-used", 1,
                                           "CPU Used (-16..16)");
 #endif
 
 
-#if CONFIG_VP8_ENCODER
+#if CONFIG_VP9_ENCODER
 static const arg_def_t token_parts = ARG_DEF(NULL, "token-parts", 1,
                                              "Number of token partitions to use, log2");
 static const arg_def_t auto_altref = ARG_DEF(NULL, "auto-alt-ref", 1,
@@ -1081,7 +1081,7 @@
   arg_show_usage(stdout, rc_twopass_args);
   fprintf(stderr, "\nKeyframe Placement Options:\n");
   arg_show_usage(stdout, kf_args);
-#if CONFIG_VP8_ENCODER
+#if CONFIG_VP9_ENCODER
   fprintf(stderr, "\nVP8 Specific Options:\n");
   arg_show_usage(stdout, vp8_args);
 #endif
@@ -1659,7 +1659,7 @@
 #endif
 
   /* Handle codec specific options */
-#if CONFIG_VP8_ENCODER
+#if CONFIG_VP9_ENCODER
 
   if (codec->fourcc == VP8_FOURCC) {
     ctrl_args = vp8_args;